# Sentiment Analysis on Movie Reviews
## This NLP project uses an LSTM model, with GloVe embeddings, to rate the movie reviews that may consist of up to 50 words

### Note that some helper functions in this notebook are from Coursera's Sequence Models Course

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Some Helper Functions from Coursera's Sequence Models Course

In [2]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def read_glove_vecs(glove_file):
    with open('/content/drive/My Drive/' + glove_file, 'r') as f:
        words = set()
        word_to_vec_map = {}
        for line in f:
            line = line.strip().split()
            curr_word = line[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(line[1:], dtype=np.float64)
        
        i = 1
        words_to_index = {}
        index_to_words = {}
        for w in sorted(words):
            words_to_index[w] = i
            index_to_words[i] = w
            i = i + 1
    return words_to_index, index_to_words, word_to_vec_map

def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum()

def convert_to_one_hot(Y, C):
    Y = np.eye(C)[Y.reshape(-1)]
    return Y


emoji_dictionary = {"0": "\u2764\uFE0F",    # :heart: prints a black instead of red heart depending on the font
                    "1": ":baseball:",
                    "2": ":smile:",
                    "3": ":disappointed:",
                    "4": ":fork_and_knife:"}
              
    
def print_predictions(X, pred):
    print()
    for i in range(X.shape[0]):
        print(X[i], label_to_emoji(int(pred[i])))
        
        
def plot_confusion_matrix(y_actu, y_pred, title='Confusion matrix', cmap=plt.cm.gray_r):
    
    df_confusion = pd.crosstab(y_actu, y_pred.reshape(y_pred.shape[0],), rownames=['Actual'], colnames=['Predicted'], margins=True)
    
    df_conf_norm = df_confusion / df_confusion.sum(axis=1)
    
    plt.matshow(df_confusion, cmap=cmap) # imshow
    #plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(df_confusion.columns))
    plt.xticks(tick_marks, df_confusion.columns, rotation=45)
    plt.yticks(tick_marks, df_confusion.index)
    #plt.tight_layout()
    plt.ylabel(df_confusion.index.name)
    plt.xlabel(df_confusion.columns.name)
    
    
def predict(X, Y, W, b, word_to_vec_map):
    """
    Given X (sentences) and Y (emoji indices), predict emojis and compute the accuracy of your model over the given set.
    
    Arguments:
    X -- input data containing sentences, numpy array of shape (m, None)
    Y -- labels, containing index of the label emoji, numpy array of shape (m, 1)
    
    Returns:
    pred -- numpy array of shape (m, 1) with your predictions
    """
    m = X.shape[0]
    pred = np.zeros((m, 1))
    
    for j in range(m):                       # Loop over training examples
        
        # Split jth test example (sentence) into list of lower case words
        words = X[j].lower().split()
        
        # Average words' vectors
        avg = np.zeros((50,))
        for w in words:
            avg += word_to_vec_map[w]
        avg = avg/len(words)

        # Forward propagation
        Z = np.dot(W, avg) + b
        A = softmax(Z)
        pred[j] = np.argmax(A)
        
    print("Accuracy: "  + str(np.mean((pred[:] == Y.reshape(Y.shape[0],1)[:]))))
    
    return pred

# Functions to read in training and test datasets

In [3]:
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

def read_data_csv(filename):
    phrase = []
    sentiment = []

    with open ('/content/drive/My Drive/' + filename) as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        for row in csvReader:
            phrase.append(row[0])
            sentiment.append(row[1])

    X = np.asarray(phrase)
    Y = np.asarray(sentiment, dtype=int)

    return X, Y

def read_train_csv(filename):
    phrase = []
    sentiment = []

    with open ('/content/drive/My Drive/' + filename) as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        for index, row in enumerate(csvReader):
            if index % 100 != 0:
                phrase.append(row[0])
                sentiment.append(row[1])   

    X = np.asarray(phrase)
    Y = np.asarray(sentiment, dtype=int)

    return X, Y

def read_test_csv(filename):
    phrase = []
    sentiment = []

    with open ('/content/drive/My Drive/' + filename) as csvDataFile:
        csvReader = csv.reader(csvDataFile)
        for index, row in enumerate(csvReader):
            if index % 100 == 0:
                phrase.append(row[0])
                sentiment.append(row[1])
                
    X = np.asarray(phrase)
    Y = np.asarray(sentiment, dtype=int)

    return X, Y

# Split the data to training, validation, and test datasets

In [5]:
from sklearn.model_selection import train_test_split

X, Y = read_train_csv('dataset.csv')
X_test, Y_test = read_test_csv('dataset.csv')
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size = 0.01)

# Preprocess the datasets

In [6]:
maxLen = len(max(X_train, key=len).split()) + 1

In [7]:
Y_oh_train = convert_to_one_hot(Y_train, C = 5)
Y_oh_validation = convert_to_one_hot(Y_validation, C = 5)
Y_oh_test = convert_to_one_hot(Y_test, C = 5)

# Load the GloVe embeddings

In [16]:
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B.300d.txt')
total_word_number = len(word_to_index)
print(total_word_number)

400001


Those following are loaded:
- `word_to_index`: dictionary mapping from words to their indices in the vocabulary 
    - (400,001 words, with the valid indices ranging from 0 to 400,000)
- `index_to_word`: dictionary mapping from indices to their corresponding words in the vocabulary
- `word_to_vec_map`: dictionary mapping words to their GloVe vector representation.

In [9]:
import numpy as np
np.random.seed(0)
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform
np.random.seed(1)

# More Helper Functions from Coursera's Sequence Models Course

In [10]:
def sentences_to_indices(X, word_to_index, max_len):
    """
    Converts an array of sentences (strings) into an array of indices corresponding to words in the sentences.
    The output shape should be such that it can be given to `Embedding()` (described in Figure 4). 
    
    Arguments:
    X -- array of sentences (strings), of shape (m, 1)
    word_to_index -- a dictionary containing the each word mapped to its index
    max_len -- maximum number of words in a sentence. You can assume every sentence in X is no longer than this. 
    
    Returns:
    X_indices -- array of indices corresponding to words in the sentences from X, of shape (m, max_len)
    """
    
    m = X.shape[0]                                   # number of training examples
    
    ### START CODE HERE ###
    # Initialize X_indices as a numpy matrix of zeros and the correct shape (≈ 1 line)
    X_indices = np.zeros((m, max_len))
    
    for i in range(m):                               # loop over training examples
        
        # Convert the ith training sentence in lower case and split is into words. You should get a list of words.
        sentence_words = X[i].lower().split()
        
        # Initialize j to 0
        j = 0
        
        # Loop over the words of sentence_words
        for w in sentence_words:
            # Set the (i,j)th entry of X_indices to the index of the correct word.
            if w in word_to_index:
                X_indices[i, j] = word_to_index[w]
            else:
                X_indices[i, j] = np.random.randint(total_word_number)
            # Increment j to j + 1
            j = j + 1
            
    ### END CODE HERE ###
    
    return X_indices

In [11]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    
    vocab_len = len(word_to_index) + 1                  # adding 1 to fit Keras embedding (requirement)
    emb_dim = word_to_vec_map["cucumber"].shape[0]      # define dimensionality of your GloVe word vectors (= 50)
    
    ### START CODE HERE ###
    # Step 1
    # Initialize the embedding matrix as a numpy array of zeros.
    # See instructions above to choose the correct shape.
    emb_matrix = np.zeros((vocab_len, emb_dim))
    
    # Step 2
    # Set each row "idx" of the embedding matrix to be 
    # the word vector representation of the idx'th word of the vocabulary
    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]

    # Step 3
    # Define Keras embedding layer with the correct input and output sizes
    # Make it non-trainable.
    embedding_layer = Embedding(vocab_len, emb_dim, trainable = False)
    ### END CODE HERE ###

    # Step 4 (already done for you; please do not modify)
    # Build the embedding layer, it is required before setting the weights of the embedding layer. 
    embedding_layer.build((None,)) # Do not modify the "None".  This line of code is complete as-is.
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [12]:
def LSTM_model(input_shape, word_to_vec_map, word_to_index):
    """
    Function creating model's graph.
    
    Arguments:
    input_shape -- shape of the input, usually (max_len,)
    word_to_vec_map -- dictionary mapping every word in a vocabulary into its 50-dimensional vector representation
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    model -- a model instance in Keras
    """
    
    ### START CODE HERE ###
    # Define sentence_indices as the input of the graph.
    # It should be of shape input_shape and dtype 'int32' (as it contains indices, which are integers).
    sentence_indices = Input(shape=input_shape, dtype='int32')
    
    # Create the embedding layer pretrained with GloVe Vectors (≈1 line)
    embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    
    # Propagate sentence_indices through your embedding layer
    # (See additional hints in the instructions).
    embeddings = embedding_layer(sentence_indices)
    
    # Propagate the embeddings through an LSTM layer with 128-dimensional hidden state
    # The returned output should be a batch of sequences.
    X = LSTM(128, return_sequences=True)(embeddings)
    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Propagate X trough another LSTM layer with 128-dimensional hidden state
    # The returned output should be a single hidden state, not a batch of sequences.
    X = LSTM(128, return_sequences=False)(X)
    # Add dropout with a probability of 0.5
    X = Dropout(0.5)(X)
    # Propagate X through a Dense layer with 5 units
    X = Dense(5)(X)
    # Add a softmax activation
    X = Activation('softmax')(X)
    
    # Create Model instance which converts sentence_indices into X.
    model = Model(inputs=sentence_indices, outputs=X)
    
    ### END CODE HERE ###
    
    return model

# Create the LSTM Model

In [13]:
model = LSTM_model((maxLen,), word_to_vec_map, word_to_index)
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 52)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 52, 300)           120000600 
_________________________________________________________________
lstm (LSTM)                  (None, 52, 128)           219648    
_________________________________________________________________
dropout (Dropout)            (None, 52, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 5)                

In [14]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Start training the model

In [17]:
X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)
Y_train_oh = convert_to_one_hot(Y_train, C = 5)

In [18]:
model.fit(X_train_indices, Y_train_oh, epochs = 50, batch_size = 1000, shuffle=True)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f8e88c145f8>

## The relaxing version to evaluate the accuracy of model, allowing plus or minus 1 error

In [19]:
def relax_evaluate(X_indices, Y):
  correct = 0
  total = len(Y)
  pred = np.argmax(model.predict(X_indices), axis=1)
  for i in range(total):
    if abs(pred[i] - Y[i]) <= 1:
      correct += 1

  return str(correct/total)

# Accuracy for the Validation dataset

In [20]:
X_validation_indices = sentences_to_indices(X_validation, word_to_index, max_len = maxLen)
Y_validation_oh = convert_to_one_hot(Y_validation, C = 5)
loss, acc = model.evaluate(X_validation_indices, Y_validation_oh)
print("Validation accuracy = ", acc)

Validation accuracy =  0.6640776991844177


# The relaxing accuracy for the training and the validation datasets

In [21]:
print('relaxed training accuracy = ' + relax_evaluate(X_train_indices, Y_train))
print('relaxed validation accuracy = ' + relax_evaluate(X_validation_indices, Y_validation))

relaxed training accuracy = 0.9951684820272761
relaxed validation accuracy = 0.9799352750809062


# The accuracy and the relaxing accuracy for the test dataset

In [22]:
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = maxLen)
Y_test_oh = convert_to_one_hot(Y_test, C = 5)
loss, acc = model.evaluate(X_test_indices, Y_test_oh)
print("Test accuracy = ", acc)
print('relaxed test accuracy = ' + relax_evaluate(X_test_indices, Y_test))

Test accuracy =  0.6713644862174988
relaxed test accuracy = 0.9814221652786675


# Test a sarcasm example

In [23]:
# Change the sentence below to see your prediction. Make sure all the words are in the Glove embeddings.
x_sarcasm = np.array(['This confusing movie makes me excited'])
X_sarcasm_indices = sentences_to_indices(x_sarcasm, word_to_index, maxLen)
print(x_sarcasm[0] +' '+  str(np.argmax(model.predict(X_sarcasm_indices))))

This confusing movie makes me excited 0


# Test a normal example

In [24]:
x_test = np.array(["Kinnear does n't aim for our sympathy , but rather delivers a performance of striking skill and depth ."])
X_test_indices = sentences_to_indices(x_test, word_to_index, maxLen)
print(x_test[0] +' '+  str(np.argmax(model.predict(X_test_indices))))

Kinnear does n't aim for our sympathy , but rather delivers a performance of striking skill and depth . 3


# Test a negation example

In [33]:
x_negation = np.array(['This movie does not makes me happy'])
X_negation_indices = sentences_to_indices(x_negation, word_to_index, maxLen)
print(x_negation[0] +' '+  str(np.argmax(model.predict(X_negation_indices))))

This movie does not makes me happy 1


# Save the model

In [34]:
import tensorflow as tf
model.save('drive/My Drive/my_model.h5')
new_model = tf.keras.models.load_model('drive/My Drive/my_model.h5')