# **Sentiment Analysis Model - TI2**
The goal of this project is to build a sentiment analysis model using supervised learning with vanilla Recurrent Neural Networks and LSTM.


## **Preprocess:**

Neceesary imports:

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import pandas as pd
# Necessary resources
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Cristian
[nltk_data]     Perafan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Cristian
[nltk_data]     Perafan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

English stopwords: 

In [2]:
stop_words = set(stopwords.words('english'))

Read the data into a pandas dataframe, where each row is a sentence and each column is a label (0 for negative, 1 for positive) and the text itself.

In [3]:
amazon_df = pd.read_csv('./sentiment labelled sentences/amazon_cells_labelled.txt', sep='\t',names=['sentence', 'tag'])

imdb_df = pd.read_csv('./sentiment labelled sentences/imdb_labelled.txt', sep='\t',names=['sentence', 'tag'])

yelp_df = pd.read_csv('./sentiment labelled sentences/yelp_labelled.txt', sep='\t',names=['sentence', 'tag'])

Tokenize and delete the stop words from text data  using NLTK:

- *word.isalnum()* ensures that only words containing alphabetic or numeric characters are included and excludes punctuation marks or other special characters.

In [4]:
stop_words = set(stopwords.words('english'))

# Tokenize and delete stop words from the Amazon sentences

amazon_df['tokens'] = amazon_df['sentence'].apply(lambda x: [word.lower() for word in word_tokenize(x) if word.isalnum() and word.lower() not in stop_words])


# Tokenize and delete stop words from the IMDB sentences
imdb_df['tokens'] = imdb_df['sentence'].apply(lambda x: [word.lower() for word in word_tokenize(x) if word.isalnum() and word.lower() not in stop_words])

# Tokenize and delete stop words from the Yelp sentences
yelp_df['tokens'] = yelp_df['sentence'].apply(lambda x: [word.lower() for word in word_tokenize(x) if word.isalnum() and word.lower() not in stop_words])


combined_sentiments_df = pd.concat([amazon_df, imdb_df, yelp_df], ignore_index=True)


Split data into training and test sets:

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(combined_sentiments_df['tokens'], combined_sentiments_df['tag'], test_size=0.3, random_state=42)


## **DummyClassifier**

Neceesary imports:

In [6]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


Build the DummyClassifier model:

In [7]:
clf_dummy = DummyClassifier(random_state=43,strategy='prior')
clf_dummy.fit(X_train, Y_train)

y_pred = clf_dummy.predict(X_test)

**Model performance**

*Accuracy*: is the fraction of predictions our model got right.

In [8]:
accuracy_score(Y_test, y_pred)

0.4727272727272727

*Presicion*: is the fraction of positive predictions that are correct.

In [9]:
precision_score(Y_test, y_pred, average=None)

  _warn_prf(average, modifier, msg_start, len(result))


array([0.        , 0.47272727])

*Recaal*: is the fraction of the truly positive instances that the classifier recognizes.

In [10]:
recall_score(Y_test, y_pred, average=None)

array([0., 1.])

*f1-score*: is the harmonic mean of precision and recall.

In [11]:
f1_score(Y_test, y_pred, average=None)

array([0.        , 0.64197531])

## **RNN Model**	

Neceesary imports:

In [12]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers




Building the RNN model and adding layers:

In [13]:
model = keras.Sequential()
model.add(layers.SimpleRNN(64, input_shape=(None, 28)))
model.add(layers.BatchNormalization())
model.add(layers.Dense(10))
print(model.summary())



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 64)                5952      
                                                                 
 batch_normalization (BatchN  (None, 64)               256       
 ormalization)                                                   
                                                                 
 dense (Dense)               (None, 10)                650       
                                                                 
Total params: 6,858
Trainable params: 6,730
Non-trainable params: 128
_________________________________________________________________
None


Compiling the model:

In [14]:
model.compile(
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits = True),
    optimizer="sgd",
    metrics=["accuracy"],
)





Refactoring the data:


Training the model:

- Creating a Tokenizer object. Tokenizer is a Keras class used to convert text into sequences of tokens (words). This object will contain information about the vocabulary of the data set and will allow text to be transformed into sequences of integers.

In [15]:
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=1000,lower=True)



This method examines the text and builds the tokenizer's internal vocabulary, assigning a unique index to each word in the training set.

In [16]:
tokenizer.fit_on_texts(X_train)

Assigning each word in the vocabulary will be assigned a unique integer.



In [17]:
X_train_sequence_of_integers = tokenizer.texts_to_sequences(X_train)
X_test_sequence_of_integers = tokenizer.texts_to_sequences(X_test)

# Adding 1 because of reserved 0 index
vocab_size = len(tokenizer.word_index) + 1

In [30]:
from keras.utils import pad_sequences


maxlen = 100

X_train_sequence_of_integers_padded = pad_sequences(X_train_sequence_of_integers, padding='post', maxlen=maxlen)


X_test_sequence_of_integers_padded = pad_sequences(X_test_sequence_of_integers, padding='post', maxlen=maxlen)



Building the model:

In [32]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, SimpleRNN,Activation
from keras import optimizers
from keras.utils import pad_sequences
import numpy as np


X_train_np = np.array(X_train_sequence_of_integers_padded).reshape(X_train_sequence_of_integers_padded.shape[0],X_train_sequence_of_integers_padded.shape[1],1)


X_test_np = np.array(X_test_sequence_of_integers_padded).reshape(X_test_sequence_of_integers_padded.shape[0],X_test_sequence_of_integers_padded.shape[1],1)


Vainilla RNN:

In [37]:
num_classes = 2

def vainilla_rnn():
    model = Sequential()
    model.add(SimpleRNN(50, input_shape=(maxlen,1),return_sequences=False))
    model.add(Dense(num_classes))
    model.add(Activation('softmax'))
    model.summary()

    adam= optimizers.Adam(lr=0.001)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    return model

In [38]:
# Training

from keras.wrappers.scikit_learn import KerasClassifier

model = KerasClassifier(build_fn=vainilla_rnn, epochs=10, batch_size=32, verbose=1)



  model = KerasClassifier(build_fn=vainilla_rnn, epochs=10, batch_size=32, verbose=1)


In [39]:

model.fit(X_train_np, Y_train)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn_2 (SimpleRNN)    (None, 50)                2600      
                                                                 
 dense_2 (Dense)             (None, 2)                 102       
                                                                 
 activation_1 (Activation)   (None, 2)                 0         
                                                                 
Total params: 2,702
Trainable params: 2,702
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1bc590db210>

In [41]:
y_pred = model.predict(X_test_np)

print(accuracy_score(Y_test, y_pred))   

0.5078787878787879
