<a href="https://colab.research.google.com/github/Chrakimnas6/NLP-Practice/blob/master/NN_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis with Neural Network



## Data Analysis

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import math

In [0]:
data_source_url = "https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv"
airline_tweets = pd.read_csv(data_source_url, delimiter='\t', header=None)
airline_tweets = airline_tweets[:2000]

In [0]:
# Change graphs' size
plot_size = plt.rcParams["figure.figsize"] 
plot_size[0] = 8
plot_size[1] = 6
plt.rcParams["figure.figsize"] = plot_size 

In [0]:
airline_tweets.head()

Unnamed: 0,0,1
0,"a stirring , funny and finally transporting re...",1
1,apparently reassembled from the cutting room f...,0
2,they presume their audience wo n't sit still f...,0
3,this is a visually stunning rumination on love...,1
4,jonathan parker 's bartleby should have been t...,1


In [0]:
# Number of rows and columns
airline_tweets.shape

(2000, 2)

In [0]:
# text as features and sentiment as labels
airline_tweets[['text', 'airline_sentiment']].head()

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,@VirginAmerica plus you've added commercials t...,positive
2,@VirginAmerica I didn't today... Must mean I n...,neutral
3,@VirginAmerica it's really aggressive to blast...,negative
4,@VirginAmerica and it's a really big bad thing...,negative


In [0]:
# Convert to integers values
airline_tweets['airline_sentiment'] = airline_tweets['airline_sentiment'].replace('neutral', 1)
airline_tweets['airline_sentiment'] = airline_tweets['airline_sentiment'].replace('negative', 0)
airline_tweets['airline_sentiment'] = airline_tweets['airline_sentiment'].replace('positive', 2)

In [0]:
X = airline_tweets.iloc[:, 0].values # features
y = airline_tweets.iloc[:, 1].values# labels
y

array([1, 0, 0, ..., 0, 0, 0])

## Preprocessing

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence

In [0]:
# punctuation is removed, turning the texts into space-separated sequences of words
t = Tokenizer()
t.fit_on_texts(X)
sequences = t.texts_to_sequences(X)

In [0]:
# configuration
t.get_config()

{'char_level': False,
 'document_count': 2000,
 'filters': '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
 'index_docs': '{"3": 917, "2983": 1, "774": 5, "544": 7, "386": 10, "2": 965, "96": 40, "4": 798, "2982": 1, "1": 1080, "70": 50, "2981": 1, "773": 5, "385": 10, "1805": 2, "31": 120, "2984": 1, "483": 8, "1268": 3, "545": 7, "1807": 2, "1806": 2, "1808": 2, "546": 7, "74": 50, "14": 236, "2986": 1, "21": 167, "2988": 1, "2989": 1, "71": 48, "977": 4, "1811": 2, "1809": 2, "2990": 1, "776": 5, "628": 5, "978": 4, "2985": 1, "1269": 3, "189": 20, "44": 87, "41": 85, "1812": 2, "2987": 1, "10": 425, "354": 11, "88": 40, "124": 29, "246": 16, "775": 5, "209": 19, "627": 6, "1810": 2, "2991": 1, "6": 579, "171": 23, "15": 251, "547": 7, "235": 17, "777": 5, "105": 36, "214": 18, "24": 146, "215": 18, "2992": 1, "629": 6, "8": 499, "22": 152, "548": 6, "1270": 3, "36": 109, "1271": 3, "153": 25, "294": 13, "1813": 2, "107": 35, "101": 38, "2993": 1, "37": 109, "388": 8, "1814": 2, "190": 19, 

In [0]:
# Find number of unique words in our tweets
vocab_size = len(t.word_index) + 1
vocab_size

7245

In [0]:
# Find longest tweet in sequences
def max_tweet():
    for i in range(1, len(sequences)):
        max_length = len(sequences[0])
        if len(sequences[i]) > max_length:
            max_length = len(sequences[i])
    return max_length

In [0]:
tweet_num = max_tweet()
tweet_num

25

In [0]:
# each review has a different number of words, so pad sequence with 0's
# https://realpython.com/python-keras-text-classification/
from tensorflow.keras.preprocessing.sequence import pad_sequences
maxlen = tweet_num
padded_X = pad_sequences(sequences, padding='post', maxlen=maxlen)

In [0]:
# Convert labels 
labels = to_categorical(np.asarray(y))
labels

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [0]:
# Train test split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(padded_X, labels, test_size = 0.2, random_state = 0)

In [0]:
# Size of train and test datasets
print('X_train size:', X_train.shape)
print('y_train size:', y_train.shape)
print('X_test size:', X_test.shape)
print('y_test size:', y_test.shape)

X_train size: (1600, 25)
y_train size: (1600, 2)
X_test size: (400, 25)
y_test size: (400, 2)


## Pre-trained word embedding

In [0]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2020-01-19 09:43:55--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-01-19 09:43:56--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-01-19 09:43:56--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [0]:
!unzip glove*.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [0]:
!ls

glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip
glove.6B.200d.txt  glove.6B.50d.txt   sample_data


In [0]:
embeddings_index = dict()
f = open('glove.6B.100d.txt')
# f = open('/content/drive/My Drive/glove.6B.100d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [0]:
embeddings_index['and']

array([-0.071953,  0.23127 ,  0.023731, -0.50638 ,  0.33923 ,  0.1959  ,
       -0.32943 ,  0.18364 , -0.18057 ,  0.28963 ,  0.20448 , -0.5496  ,
        0.27399 ,  0.58327 ,  0.20468 , -0.49228 ,  0.19974 , -0.070237,
       -0.88049 ,  0.29485 ,  0.14071 , -0.1009  ,  0.99449 ,  0.36973 ,
        0.44554 ,  0.28998 , -0.1376  , -0.56365 , -0.029365, -0.4122  ,
       -0.25269 ,  0.63181 , -0.44767 ,  0.24363 , -0.10813 ,  0.25164 ,
        0.46967 ,  0.3755  , -0.23613 , -0.14129 , -0.44537 , -0.65737 ,
       -0.042421, -0.28636 , -0.28811 ,  0.063766,  0.20281 , -0.53542 ,
        0.41307 , -0.59722 , -0.38614 ,  0.19389 , -0.17809 ,  1.6618  ,
       -0.011819, -2.3737  ,  0.058427, -0.2698  ,  1.2823  ,  0.81925 ,
       -0.22322 ,  0.72932 , -0.053211,  0.43507 ,  0.85011 , -0.42935 ,
        0.92664 ,  0.39051 ,  1.0585  , -0.24561 , -0.18265 , -0.5328  ,
        0.059518, -0.66019 ,  0.18991 ,  0.28836 , -0.2434  ,  0.52784 ,
       -0.65762 , -0.14081 ,  1.0491  ,  0.5134  , 

In [0]:
# Create a matrix of weights for words in the training set
# One embedding for each word in the training set

embedding_matrix = np.zeros((vocab_size, 100))

# fill in matrix
for word, i in t.word_index.items():  # dictionary
    embedding_vector = embeddings_index.get(word) # gets embedded vector of word from GloVe
    if embedding_vector is not None:
        # add to matrix
        embedding_matrix[i] = embedding_vector # each row of matrix

In [0]:
embedding_matrix.shape

(7245, 100)

In [0]:
# Create embedding layer using embedding matrix
from tensorflow.keras.layers import Embedding

# input is vocab_size, output is 100
# weights from embedding matrix, set trainable = False
embedding_layer = Embedding(input_dim=vocab_size, output_dim=100, weights=[embedding_matrix],
                           input_length = tweet_num, trainable=False)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


## Build Neural Network - LSTM

In [0]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

In [0]:
lstm_model = Sequential()
lstm_model.add(embedding_layer)
lstm_model.add(LSTM(256, 
               dropout = 0.2, 
               recurrent_dropout = 0.5))
lstm_model.add(Dense(2, activation='softmax'))
lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
lstm_model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 25, 100)           724500    
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               365568    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 514       
Total params: 1,090,582
Trainable params: 366,082
Non-trainable params: 724,500
_________________________________________________________________


In [0]:
hist_1 = lstm_model.fit(X_train, y_train,
                    validation_split = 0.2,
                    epochs=100, batch_size=256)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 1280 samples, validate on 320 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/1

In [0]:
# Find train and test accuracy
loss, accuracy = lstm_model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = lstm_model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9312
Testing Accuracy:  0.7050


## Build Neural Network - Bidirectional RNN



In [0]:
from tensorflow.keras.layers import Bidirectional

In [0]:
# Bidirectional RNNs
biRnn_model = Sequential()
biRnn_model.add(embedding_layer)
biRnn_model.add(Bidirectional(LSTM(64,
                              dropout=0.2,
                              recurrent_dropout=0.5)))
biRnn_model.add(Dense(3,activation='softmax'))
biRnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
biRnn_model.summary()

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 100)           1576900   
_________________________________________________________________
bidirectional (Bidirectional (None, 128)               84480     
_________________________________________________________________
dense_4 (Dense)              (None, 3)                 387       
Total params: 1,661,767
Trainable params: 84,867
Non-trainable params: 1,576,900
_________________________________________________________________


In [0]:
hist_2 = biRnn_model.fit(X_train, y_train,
                    validation_split = 0.2,
                    epochs=100, batch_size=256)

Train on 9369 samples, validate on 2343 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

In [0]:
loss, accuracy = biRnn_model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = biRnn_model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.9338
Testing Accuracy:  0.7961
