<a href="https://colab.research.google.com/github/ArshT/Mini-Projects/blob/master/Movie_Reviews_LSTMs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
###Importing Important Libraries

import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers.core import Activation, Dropout, Dense
from keras.layers import Flatten,LSTM,Bidirectional
from keras.layers import GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
###Mounting the Drive
# Mounts Google Drive at /content/drive inside the Colab runtime so that
# files kept on Drive can be opened through the regular filesystem.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd '/content/drive/My Drive'

/content/drive/My Drive


In [4]:
### Load the data

# The CSV is read relative to the current working directory, decoded as ISO-8859-1.
movie_reviews = pd.read_csv(
    'imdb_master.csv',
    encoding="ISO-8859-1",
)
# Preview the first rows.
movie_reviews.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [5]:
### Check for missing values

# True if at least one cell anywhere in the frame is null.
movie_reviews.isna().values.any()

False

In [6]:
###Checking for Shape of DataFrame
# Displays (number of rows, number of columns) of the loaded frame.

movie_reviews.shape

(100000, 5)

In [7]:
###Unique Values of Label
# Lists the distinct values in the 'label' column so we know which classes
# are present before filtering.

movie_reviews['label'].unique()

array(['neg', 'pos', 'unsup'], dtype=object)

In [8]:
### Drop the rows labelled 'unsup'

# Keep every row whose label is anything other than 'unsup'.
is_labelled = movie_reviews['label'].ne('unsup')
train = movie_reviews.loc[is_labelled]
train.shape

(50000, 5)

In [9]:
### Example of a review

# Row with index label 5, 'review' column.
train.loc[5, 'review']

'A funny thing happened to me while watching "Mosquito": on the one hand, the hero is a deaf-mute and the director is totally unable to make us understand why he does what he does (mutilating mannequins...er, excuse me, corpses) through his images. On the other hand, the English version at least is very badly dubbed. So I found myself wishing there had been both more AND less dialogue at the same time! This film is stupid (funny how this guy has access to every graveyard and mortuary in his town) and lurid (where would we be in a 70s exploitationer without our gratuitous lesbian scene?). Not to mention the "romantic" aspect (oh, how sweet!)...Miss it. (*)'

In [0]:
### Pre-processing the reviews

# Matches one HTML-like tag: '<', one or more characters that are not '>', then '>'.
TAG_RE = re.compile(r'<[^>]+>')

def remove_tags(text):
    """Return `text` with every substring matched by TAG_RE deleted."""
    stripped = re.sub(TAG_RE, '', text)
    return stripped

In [0]:
##Function for Pre-Processing of the Reviews

def preprocess_text(sen):
  """Clean a single raw review string.

  Steps, in order:
    1. Strip HTML tags with `remove_tags`.
    2. Drop every standalone single ASCII letter, i.e. a letter that has
       whitespace on both sides.
    3. Collapse each run of whitespace into one space.

  Args:
    sen: the raw review text.

  Returns:
    The cleaned text. A single letter at the very start or very end of the
    string is kept, because it is not surrounded by whitespace on both sides.
  """

  #Removing the HTML Tags
  sentence = remove_tags(sen)

  #Removing Single Characters
  # The trailing whitespace is checked with a lookahead instead of being
  # consumed. re.sub only replaces non-overlapping matches, so consuming it
  # would leave the second of two adjacent single letters ("x a b y") without
  # a leading space to match on, and it would survive.
  sentence = re.sub(r"\s+[a-zA-Z](?=\s)", ' ', sentence)

  #Removing multiple spaces
  sentence = re.sub(r'\s+', ' ', sentence)

  return sentence

In [0]:
### Build the dataset from the pre-processed reviews

## X: one cleaned string per review, in the same order as the rows of `train`
sentences = train['review'].tolist()
X = [preprocess_text(sen) for sen in sentences]

In [0]:
## Y: binary target, 1 where the label is "pos" and 0 for every other label

Y = np.array([int(label == "pos") for label in train['label']])

In [0]:
### Split into training and held-out test sets

# 20% of the examples go to the test set; random_state pins the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [0]:
### Build the vocabulary from the training texts, then turn every review into a list of word indices

tokenizer = Tokenizer(num_words=10000)
# Fit on the training split only.
tokenizer.fit_on_texts(X_train)

# Replace the raw strings in both splits by their index sequences.
X_train, X_test = map(tokenizer.texts_to_sequences, (X_train, X_test))

In [0]:
### Vocabulary size and fixed review length

# One more than the number of entries in the tokenizer's word index.
vocab_size = 1 + len(tokenizer.word_index)

# Every review is padded or cut to exactly this many tokens.
maxlen = 100


## Pad at the end ('post') so all sequences share the same length
X_train, X_test = (
    pad_sequences(sequences, padding='post', maxlen=maxlen)
    for sequences in (X_train, X_test)
)

In [0]:

###Creating the Embedding matrix using GloVe

# Maps each token in the GloVe file to its vector (float32 numpy array).
embeddings_dictionary = dict()

# `with` closes the file even if a line fails to parse; the original
# open()/close() pair leaked the handle on any exception inside the loop.
with open('glove.6B.100d.txt', encoding="utf8") as glove_file:
  for line in glove_file:
    records = line.split()
    if not records:
      # Skip blank lines instead of failing on records[0].
      continue
    # First field is the token, the remaining fields are its vector components.
    word = records[0]
    vector_dimensions = np.asarray(records[1:], dtype='float32')
    embeddings_dictionary[word] = vector_dimensions

In [0]:
# One row per tokenizer index and 100 columns; rows of words that have no
# entry in embeddings_dictionary stay all-zero.
embedding_matrix = np.zeros((vocab_size, 100))
for word, index in tokenizer.word_index.items():
    if word in embeddings_dictionary:
        embedding_matrix[index] = embeddings_dictionary[word]

In [0]:
### Model

# Embedding layer initialised from embedding_matrix and frozen (trainable=False).
embedding_layer = Embedding(
    vocab_size,
    100,
    weights=[embedding_matrix],
    input_length=maxlen,
    trainable=False,
)

# Layers are listed in the order the data flows through them.
model = Sequential([
    embedding_layer,
    # Bidirectional LSTM that returns the full sequence for the next LSTM.
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.25),
    LSTM(128),
    Dropout(0.25),
    Dense(64, activation='relu'),
    Dropout(0.4),
    # Single sigmoid unit for the binary pos/neg output.
    Dense(1, activation='sigmoid'),
])

In [0]:
### Compile the model
# Adam optimizer, binary cross-entropy loss, accuracy reported as 'acc'.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [26]:
### Train the model

# 10 epochs with batches of 256; 20% of the training data is held out for validation.
history = model.fit(
    X_train,
    Y_train,
    batch_size=256,
    epochs=10,
    verbose=1,
    validation_split=0.2,
)

Train on 32000 samples, validate on 8000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
### Evaluate on the held-out test set

# Evaluate the trained model on the test split and display the result.
score = model.evaluate(x=X_test, y=Y_test, verbose=1)
score



[0.3158712195396423, 0.8631]