In [65]:
#import the required libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from numpy import array

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing.text import Tokenizer

#CNN
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D


from sklearn.model_selection import train_test_split

In [35]:
#load data
df = pd.read_csv("/home/bukya/snap/firefox/common/Downloads/IMDB Dataset.csv")
df.shape

(50000, 2)

In [36]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [37]:
df['review'][3]# one fo the review from df

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

# Data Preprocessing

In [38]:
# helper function to remove htmal tags, punctuations

def preprocess_text(sen):
    sentence = remove_tags(sen) # html tags
    
    sentence = re.sub('[^a-zA-Z]', ' ', sentence) # punctuations and numbers 
                                                # except capital and small English letters 
    
    sentence = re.sub(r"\s+[a-zA-Z]\s+", ' ', sentence) # remove single characters
    
    sentence = re.sub(r'\s+', ' ', sentence) #remove multiple spaces
    
    return sentence

In [39]:
TAG_RE = re.compile(r'<[^>]+>') # simply replaces anything between opening and closing <>

def remove_tags(text):
    return TAG_RE.sub('', text) # replaces with empty space

In [40]:
# we will process our reviews and will store them in a new list 
X = []

sentences = list(df['review']) # df to list 

for sen in sentences:
    X.append(preprocess_text(sen)) # append cleaned sentences to new list

In [41]:
X[3]

'Basically there a family where little boy Jake thinks there a zombie in his closet his parents are fighting all the time This movie is slower than soap opera and suddenly Jake decides to become Rambo and kill the zombie OK first of all when you re going to make film you must Decide if its thriller or drama As drama the movie is watchable Parents are divorcing arguing like in real life And then we have Jake with his closet which totally ruins all the film expected to see BOOGEYMAN similar movie and instead watched drama with some meaningless thriller spots out of just for the well playing parents descent dialogs As for the shots with Jake just ignore them '

In [42]:
# convert labels into digits
y = df['sentiment']

y = np.array(list(map(lambda x: 1 if x=='positive' else 0, y)))

In [43]:
y

array([1, 1, 1, ..., 0, 0, 0])

In [44]:
# divide dataset into train and test sets
x_train, x_test,  y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [45]:
len(x_test), len(x_train)

(10000, 40000)

# Preparing the Embedding Layer

In [46]:
# we will use Tokenizer class from keras to create a word to index dictionary
# each word in the corpus used as a key
# and corresponding unique index is used as the value for the key

tokenizer = Tokenizer(num_words = 5000)
tokenizer.fit_on_texts(x_train)

x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)

In [47]:
# padding
# adding 1 becuase of reserved 0 index

vocab_size = len(tokenizer.word_index) + 1

maxlen = 100

x_train = pad_sequences(x_train, padding='post', maxlen=maxlen)
x_test = pad_sequences(x_test, padding='post', maxlen=maxlen)

In [48]:
vocab_size # 92547 unique words

92547

In [49]:
# use Glove embeddings to create our feature matrix

from numpy import array
from numpy import asarray
from numpy import zeros

embeddings_dictionary = dict()
glove_file = open('/home/bukya/snap/firefox/common/Downloads/glove.6B/glove.6B.100d.txt', encoding='utf8')

for line in glove_file:
    records = line.split()
    word = records[0]
    vector_dimensions = asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions
    
glove_file.close()

In [50]:
# create an embedding matrix where each row number will correspond to the index of 
#the words in the corpus
# the matrix will have 100 columns where each column will contain the Golve embeddings 
# for the words in th corpus

embedding_matrix = zeros((vocab_size, 100))

for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

In [51]:
len(embedding_matrix)

92547

# Text Classification with Simple Neural Network

In [52]:
# simple deep neural network.

model = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model.add(embedding_layer)

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [53]:
# To compile our model, we will use the adam optimizer, 
# binary_crossentropy as our loss function and accuracy as metrics 

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

print(model.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 100, 100)          9254700   
_________________________________________________________________
flatten_2 (Flatten)          (None, 10000)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 10001     
Total params: 9,264,701
Trainable params: 10,001
Non-trainable params: 9,254,700
_________________________________________________________________
None


In [54]:
history = model.fit(x_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

Train on 32000 samples, validate on 8000 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [55]:
score = model.evaluate(x_test, y_test, verbose=1)



In [56]:
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.5400524091720581
Test Accuracy: 0.739799976348877


test accuracy is 73.97% and training accuracy is 81.54%, the difference is large
This means that our model is overfitting on the training set. 
Overfitting occurs when model performs better on the training set than the test set. 
Ideally, the performance difference between training and test sets should be minimum

# Text Classification with a Convolutional Neural Network

In [66]:
# 1D convolutional neural networks to extract features from our data

model1 = Sequential() # sequential model

embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model1.add(embedding_layer)

# next create a one-dimensional convolutional layer with 128 features,
# The kernel size is 5 and the activation function used is sigmoid
model1.add(Conv1D(128, 5, activation='relu'))

model1.add(GlobalMaxPooling1D()) # max pooling layer to reduce feature size

model1.add(Dense(1, activation='sigmoid'))#  finally a dense layer with sigmoid activation

# To compile our model, we will use the adam optimizer, 
# binary_crossentropy as our loss function and accuracy as metrics 

model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [67]:
print(model1.summary())

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 100, 100)          9254700   
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 96, 128)           64128     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 129       
Total params: 9,318,957
Trainable params: 64,257
Non-trainable params: 9,254,700
_________________________________________________________________
None


In [68]:
# now train our model and evaluate it on the training set

history = model1.fit(x_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

score = model1.evaluate(x_test, y_test, verbose=1)

Train on 32000 samples, validate on 8000 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [70]:
score = model1.evaluate(x_test, y_test, verbose=1)

print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.3392755681991577
Test Accuracy: 0.8528000116348267


# Text Classification with Recurrent Neural Network (LSTM)

In [72]:
#we will use an LSTM (Long Short Term Memory network) which is a variant of RNN, 
#to solve sentiment classification problem.

In [74]:
# import lstm from keras
from keras.layers.recurrent import LSTM

model2 = Sequential()
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=maxlen , trainable=False)
model2.add(embedding_layer)
model2.add(LSTM(128))

model2.add(Dense(1, activation='sigmoid'))
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])

In [75]:
print(model2.summary())

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 100, 100)          9254700   
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               117248    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129       
Total params: 9,372,077
Trainable params: 117,377
Non-trainable params: 9,254,700
_________________________________________________________________
None


In [79]:
#train the model on the training set and evaluate its performance on the test set.

history = model2.fit(x_train, y_train, batch_size=128, epochs=6, verbose=1, validation_split=0.2)

score = model2.evaluate(x_test, y_test, verbose=1)

Train on 32000 samples, validate on 8000 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [80]:
print("Test Score:", score[0])
print("Test Accuracy:", score[1])

Test Score: 0.3381689423561096
Test Accuracy: 0.8518999814987183


# Preidctions

In [101]:
instance = X[57]
print(instance)

I laughed all the way through this rotten movie It so unbelievable woman leaves her husband after many years of marriage has breakdown in front of real estate office What happens The office manager comes outside and offers her job Hilarious Next thing you know the two women are going at it Yep they re lesbians Nothing rings true in this Lifetime for Women with nothing better to do movie Clunky dialogue like don want to spend the rest of my life feeling like had chance to be happy and didn take it doesn help There a wealthy distant mother who disapproves of her daughter new relationship sassy black maid unbelievable that in the year film gets made in which there a sassy black maid Hattie McDaniel must be turning in her grave The woman has husband who freaks out and wants custody of the snotty teenage kids Sheesh No cliche is left unturned 


To predict the sentiment of this review, we have to convert this review into numeric form. We can do so using the tokenizer that we created in word embedding section. The text_to_sequences method will convert the sentence into its numeric counter part.

Next, we need to pad our input sequence as we did for our corpus. Finally, we can use the predict method of our model 

In [102]:
instance = tokenizer.texts_to_sequences(instance)

flat_list = []
for sublist in instance:
    for item in sublist:
        flat_list.append(item)

flat_list = [flat_list]

instance = pad_sequences(flat_list, padding='post', maxlen=maxlen)

model2.predict(instance)

array([[0.5604919]], dtype=float32)

we mapped the positive outputs to 1 and the negative outputs to 0
If the value is less than 0.5, the sentiment is considered negative where as if the value is greater than 0.5, the sentiment is considered as positive. 