## Sentiment Analysis using LSTM ( Long Short Term Memory)

In [68]:
#Import Libraries

In [69]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
#Rest of the libraries will be imported wherver necessary

In [70]:
df=pd.read_csv('D:\\M. Tech in Data Science & Machine Learning\\Natural language processing (NLP)\\Datasets\\IMDB Dataset.csv')
df.head()#orignal dataframe

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [71]:
# Keeping only the neccessary columns
df = df[['review','sentiment']]
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [130]:
df1 = df[df['sentiment'] != 'Neutral'].sample(1000)#onyl considering postive and neagtive reocrds and sample of 15000 reocrds for the process ahead
df1.head()

Unnamed: 0,review,sentiment
41987,"""Thieves and Liars"" presents us with a very na...",positive
20383,A great movie. Lansbury and Tomlinson are perf...,positive
6227,<br /><br />This movie sucked! The first one w...,negative
19348,It never ceases to amaze me how you can take a...,negative
22486,The movie shows many feelings and emotions tha...,positive


In [131]:
df1.shape

(1000, 2)

In [125]:
#1. Lowercase

In [126]:
##remove everything other than ^a-z\s
#^a-z:lowe case alphabets
#\s:space

In [133]:
df1['review_clean_text']=df1['review'].str.lower().str.replace('<br />','').str.replace('[^a-z\s]','')
#str.lower() : converting to lowercase (to bring all the same words in same format)
## str.replace: replace everything other than alphabets and space

In [134]:
df1.head()

Unnamed: 0,review,sentiment,review_clean_text
41987,"""Thieves and Liars"" presents us with a very na...",positive,thieves and liars presents us with a very natu...
20383,A great movie. Lansbury and Tomlinson are perf...,positive,a great movie lansbury and tomlinson are perfe...
6227,<br /><br />This movie sucked! The first one w...,negative,this movie sucked the first one was way better...
19348,It never ceases to amaze me how you can take a...,negative,it never ceases to amaze me how you can take a...
22486,The movie shows many feelings and emotions tha...,positive,the movie shows many feelings and emotions tha...


In [135]:
#2.Tokenize

In [136]:
#In Python tokenization basically refers to splitting up a larger body of text into smaller lines, 
#words or even creating words for a non-English language

In [137]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [138]:
vocabSize=2000
tokenizer = Tokenizer(num_words=vocabSize,split=' ')#split on space
tokenizer.fit_on_texts(df1['review_clean_text'].values)

#fit_on_texts :Updates internal vocabulary based on a list of texts. 
    #This method creates the vocabulary index based on word frequency. 
    #So if you give it something like, "The cat sat on the mat."
    #It will create a dictionary s.t. word_index["the"] = 1; word_index["cat"] = 2 it is word -> index dictionary so every word gets a unique integer value. 
    #0 is reserved for padding. So lower integer means more frequent word (often the first few are stop words because they appear a lot).

In [139]:
vocab_size=tokenizer.word_index#word_index
vocab_size

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'is': 6,
 'in': 7,
 'it': 8,
 'i': 9,
 'this': 10,
 'that': 11,
 'was': 12,
 'with': 13,
 'for': 14,
 'as': 15,
 'movie': 16,
 'but': 17,
 'film': 18,
 'on': 19,
 'his': 20,
 'are': 21,
 'not': 22,
 'have': 23,
 'you': 24,
 'be': 25,
 'he': 26,
 'one': 27,
 'at': 28,
 'its': 29,
 'by': 30,
 'all': 31,
 'they': 32,
 'an': 33,
 'like': 34,
 'from': 35,
 'who': 36,
 'so': 37,
 'or': 38,
 'out': 39,
 'just': 40,
 'about': 41,
 'has': 42,
 'if': 43,
 'good': 44,
 'what': 45,
 'some': 46,
 'more': 47,
 'there': 48,
 'when': 49,
 'her': 50,
 'my': 51,
 'had': 52,
 'very': 53,
 'even': 54,
 'would': 55,
 'their': 56,
 'which': 57,
 'only': 58,
 'were': 59,
 'can': 60,
 'up': 61,
 'no': 62,
 'time': 63,
 'see': 64,
 'really': 65,
 'story': 66,
 'than': 67,
 'me': 68,
 'she': 69,
 'other': 70,
 'much': 71,
 'well': 72,
 'get': 73,
 'most': 74,
 'do': 75,
 'how': 76,
 'been': 77,
 'great': 78,
 'into': 79,
 'also': 80,
 'people': 81,
 'will': 82

In [140]:
X = tokenizer.texts_to_sequences(df1['review_clean_text'].values)
#texts_to_sequences: Transforms each text in texts to a sequence of integers. 
    #So it basically takes each word in the text and replaces it with its corresponding integer value from the word_index dictionary. 
    #Nothing more, nothing less, certainly no magic involved.

In [141]:
X

[[2,
  1664,
  164,
  13,
  3,
  53,
  1665,
  4,
  1,
  1953,
  4,
  11,
  109,
  2,
  1121,
  92,
  5,
  98,
  865,
  43,
  22,
  938,
  41,
  56,
  2,
  56,
  434,
  453,
  473,
  1,
  165,
  6,
  287,
  1005,
  11,
  46,
  21,
  140,
  33,
  287,
  1048,
  11,
  1,
  631,
  4,
  632,
  7,
  1,
  18,
  1,
  1375,
  1,
  66,
  202,
  15,
  8,
  129,
  25,
  7,
  10,
  528,
  4,
  18,
  1,
  201,
  1,
  2,
  4,
  1,
  7,
  3,
  53,
  777,
  93,
  535,
  8,
  1182,
  34,
  297,
  141,
  3,
  691,
  141,
  10,
  18,
  24,
  252,
  15,
  43,
  1006,
  1,
  137,
  1248,
  2,
  831,
  141,
  8,
  9,
  434,
  8],
 [3,
  78,
  16,
  2,
  21,
  310,
  1,
  559,
  21,
  439,
  1,
  13,
  3,
  1122,
  808,
  14,
  1,
  21,
  1123,
  15,
  14,
  1,
  1124,
  1,
  778,
  212,
  1460,
  42,
  536,
  33,
  408,
  1,
  13,
  1,
  1183,
  4,
  1,
  6,
  2,
  1461,
  9,
  435,
  5,
  64,
  1,
  195,
  302,
  38,
  28,
  226,
  1,
  195,
  418,
  135,
  7,
  87,
  23,
  58,
  1,
  195,
  302,
  280,
  

In [142]:
X = pad_sequences(X)#pad_sequences is used to ensure that all sequences in a list have the same length.

In [143]:
#Examples of padding

In [144]:
X[1].shape#shape of the first text

(818,)

In [145]:
df1['review_clean_text'].values[1]#value of the first text

'a great movie lansbury and tomlinson are perfect the songs are wonderful the dances with a particular mention for the portobello ballet are gorgeous as for the animated section the match between animals has become an instant classic the climax with the attack of the armatures is chilling and fascinating i recommend to see the restored  minutes version or at least the  minutes video here in italy we have only the  minutes version although the film was presented in its original release at the running of  minutes if possible watch also the german videocassette it was generated from the  minutes running but its missing of every refer to world war ii and of all the scenes between english people and their nazi invaders'

In [146]:
X[1]#pad_sequences of the first text

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [147]:
#Similarly,

In [148]:
X[2].shape#shape of the second text

(818,)

In [149]:
df1['review_clean_text'].values[2]#value of the second text

'this movie sucked the first one was way better no one from the first has returned in this dumb sequel and in some way that is a good thing because of the bad acting but the characters in this film are not even better killjoy in the woods come on give me a break im suprised killjoys friend the blair witch didnt show up to make a cameo bad acting bad story and just plain out silly and boring dont waste your time'

In [150]:
X[2]#pad_sequences of the second text

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [151]:
X.shape#overall shape of x

(1000, 818)

In [152]:
#More explanation of .fit_on_texts and .texts_to_sequences on belwo link

#https://stackoverflow.com/questions/51956000/what-does-keras-tokenizer-method-exactly-do

In [153]:
#MODEL building

In [154]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

In [155]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(vocabSize, embed_dim,input_length = X.shape[1]))
#vocabSize:input_dim (Integer. Size of the vocabulary, i.e. maximum integer index + 1.)
#embed_dim:output_dim (Integer. Dimension of the dense embedding.)
#input_length: Length of input sequences
model.add(SpatialDropout1D(0.4))#explained below
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
#lstm_out : Positive integer, dimensionality of the output space.
model.add(Dense(2,activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam',metrics = ['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 818, 128)          256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 818, 128)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________
None


In [156]:
#What is SpatialDropout1D?

In [157]:
#Dropout(): Let's define 2D input: [[1, 1, 1], [2, 2, 2]]. 
            #Dropout will consider every element independently, and may result in something like [[1, 0, 1], [0, 2, 2]]
    
#SpatialDropout1D(): In this case result will look like [[1, 0, 1], [2, 0, 2]]. 
                    #Notice that 2nd element was zeroed along all channels.

In [158]:
from sklearn.model_selection import train_test_split

Y = pd.get_dummies(df1['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.15, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(850, 818) (850, 2)
(150, 818) (150, 2)


In [159]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 5, batch_size=batch_size, verbose = 1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2f0524f75f8>

In [161]:
score,acc = model.evaluate(X_test, Y_test, verbose = 1, batch_size = batch_size)
print("score: %.2f" % (score))
print("accuracy: %.2f" % (acc))

score: 0.69
accuracy: 0.65


In [162]:
#Generate Text

In [163]:
#Great, our model architecture is now ready and we can train it using our data. 
#Next lets write the function to predict the next word based on the input words (or seed text). 
#We will first tokenize the seed text, pad the sequences and pass into the trained model to get predicted word. The multiple predicted words can be appended together to get predicted sequence.

In [180]:
def generate_text(seed_text):
    for i in range(2):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]#tokenize 
        #print(token_list)
        token_list = pad_sequences([token_list], maxlen=818-1, padding='pre')#pad the sequences
        #maxlen: Optional Int, maximum length of all sequences. If not provided,sequences will be padded to the length of the longest individual sequence.
        #pad either before or after each sequence.
        #print(token_list)
        predicted = model.predict_classes(token_list, verbose=0)
        #print(predicted)
        output_word = ""
        for word,index in tokenizer.word_index.items():
            #print(word)
            #print(index)
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [183]:
df1['review_clean_text'].values[15]

'i neglected this film when i used to go to the movie store but then the curiosity got to me and i decided to check it out i loved it the movie starts off with judy and jay heading for a halloween party at the abandoned funeral parlor hull house then we meet a few more characters angela and suzzanne  the hosts frannie max rodger sal and helen then of course they start to party and when theyre really in the mood they decide to have a sance which awakens a demon the demon possesses angela and she starts her gruesome slaughtering will they survive the night of the demonsthe movie was overall great the gore was fine but the nudity provided by linnea quigley trash from rotld once again screws it i never was a fan of hers and never will be'

In [184]:
print (generate_text(df1['review_clean_text'].values[15]))

I Neglected This Film When I Used To Go To The Movie Store But Then The Curiosity Got To Me And I Decided To Check It Out I Loved It The Movie Starts Off With Judy And Jay Heading For A Halloween Party At The Abandoned Funeral Parlor Hull House Then We Meet A Few More Characters Angela And Suzzanne  The Hosts Frannie Max Rodger Sal And Helen Then Of Course They Start To Party And When Theyre Really In The Mood They Decide To Have A Sance Which Awakens A Demon The Demon Possesses Angela And She Starts Her Gruesome Slaughtering Will They Survive The Night Of The Demonsthe Movie Was Overall Great The Gore Was Fine But The Nudity Provided By Linnea Quigley Trash From Rotld Once Again Screws It I Never Was A Fan Of Hers And Never Will Be The The


In [None]:
#-----------------------------------------------------------END-----------------------------------------------------#