In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the IMDB review dataset from a Google Drive folder.
# NOTE(review): this absolute Colab path only resolves once Drive is mounted at
# /content/drive — presumably done outside this notebook; confirm.
df = pd.read_csv(filepath_or_buffer='/content/drive/MyDrive/Colab Datasets/IMDB_Dataset.csv')

In [3]:
# Preview the first five rows to check the review text and sentiment labels.
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# Summarise column names, non-null counts and dtypes to confirm there are no
# missing reviews or labels before tokenising.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [5]:
# Turn each review into a fixed-length sequence of integer word ids.
tokenize = Tokenizer()
tokenize.fit_on_texts(df['review'])  # learn the word -> id vocabulary from all reviews

X_seq = tokenize.texts_to_sequences(df['review'])

# Every review becomes exactly 500 ids. 'pre' is pad_sequences' default for both
# options, spelled out here: short reviews are zero-padded at the front and long
# reviews keep only their last 500 tokens.
X_pad = pad_sequences(X_seq, maxlen=500, padding='pre', truncating='pre')

In [6]:
# Size of the embedding table: one row per known word, plus one because the
# Tokenizer numbers words from 1 and id 0 is left for padding.
vocab_size = 1 + len(tokenize.word_index)

In [7]:
# Tokenise the sentiment column as if it were text.
# NOTE(review): none of the training cells in this notebook use Y_seq or Y_pad;
# they are fitted on Y, which is built in the next cell. This looks like a
# leftover experiment — confirm before removing.
tokenize2 = Tokenizer()
tokenize2.fit_on_texts(df['sentiment'])

Y_seq = tokenize2.texts_to_sequences(df['sentiment'])
Y_pad = pad_sequences(Y_seq, maxlen=1, padding='pre', truncating='pre')

In [8]:
# Encode the sentiment labels as integers for binary cross-entropy:
# positive -> 1, negative -> 0.
# `.map` plus an explicit integer cast is used instead of `.replace` because
# `.replace` relies on silent object -> int downcasting, which newer pandas
# deprecates; without it the labels would stay dtype=object. The cast also fails
# loudly if the column holds any label other than these two (it would map to NaN).
Y = df['sentiment'].map({'positive': 1, 'negative': 0}).astype('int64')

In [10]:
# LSTM sentiment classifier:
#   word ids -> 40-d embeddings -> dropout -> LSTM(100) -> dropout -> sigmoid
model_lstm = keras.models.Sequential()
model_lstm.add(keras.layers.Embedding(vocab_size, 40, input_length=500))  # 500 = padded review length
model_lstm.add(keras.layers.Dropout(0.5))
model_lstm.add(keras.layers.LSTM(100))
model_lstm.add(keras.layers.Dropout(0.2))
model_lstm.add(keras.layers.Dense(1, activation='sigmoid'))  # probability of a positive review

# Binary cross-entropy pairs with the single sigmoid output and the 0/1 labels.
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 40)           4970120   
_________________________________________________________________
dropout (Dropout)            (None, 500, 40)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               56400     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 5,026,621
Trainable params: 5,026,621
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Train the LSTM for 10 epochs; validation_split holds out the last 20% of the
# samples for validation.
model_lstm.fit(
    x=X_pad,
    y=Y,
    batch_size=16,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f4e001c21d0>

In [14]:
# Persist the trained LSTM model in three forms.

# 1) Whole model in a single HDF5 file.
model_lstm.save("movie_sentiment_analysis_lstm.h5")

# 2) Architecture only, serialised to JSON.
model_json = model_lstm.to_json()
with open("movie_sentiment_analysis_lstm.json", "w") as json_file:
    json_file.write(model_json)

# 3) Weights only, in a separate HDF5 file.
model_lstm.save_weights("movie_sentiment_analysis_lstm_weights.h5")

In [15]:
# GRU sentiment classifier:
#   word ids -> 40-d embeddings -> dropout -> GRU(100) -> dropout -> sigmoid
model_gru = keras.models.Sequential()
model_gru.add(keras.layers.Embedding(vocab_size, 40, input_length=500))  # 500 = padded review length
model_gru.add(keras.layers.Dropout(0.5))
model_gru.add(keras.layers.GRU(100))
model_gru.add(keras.layers.Dropout(0.2))
model_gru.add(keras.layers.Dense(1, activation='sigmoid'))  # probability of a positive review

# Binary cross-entropy pairs with the single sigmoid output and the 0/1 labels.
model_gru.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_gru.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 40)           4970120   
_________________________________________________________________
dropout_4 (Dropout)          (None, 500, 40)           0         
_________________________________________________________________
gru_1 (GRU)                  (None, 100)               42600     
_________________________________________________________________
dropout_5 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 5,012,821
Trainable params: 5,012,821
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Train the GRU for 5 epochs; validation_split holds out the last 20% of the
# samples for validation.
model_gru.fit(
    x=X_pad,
    y=Y,
    batch_size=16,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f4d91f3b550>

In [17]:
# Persist the trained GRU model in three forms.

# 1) Whole model in a single HDF5 file.
model_gru.save("movie_sentiment_analysis_gru.h5")

# 2) Architecture only, serialised to JSON.
model_gru_json = model_gru.to_json()
with open("movie_sentiment_analysis_gru.json", "w") as json_file:
    json_file.write(model_gru_json)

# 3) Weights only, in a separate HDF5 file.
model_gru.save_weights("movie_sentiment_analysis_gru_weights.h5")

In [19]:
# Stacked bidirectional-LSTM sentiment classifier:
#   word ids -> 40-d embeddings -> dropout -> BiLSTM(100) -> BiLSTM(100)
#            -> dropout -> sigmoid
model_bilstm = keras.models.Sequential()
model_bilstm.add(keras.layers.Embedding(vocab_size, 40, input_length=500))  # 500 = padded review length
model_bilstm.add(keras.layers.Dropout(0.5))
# The first BiLSTM returns its output at every timestep so the second BiLSTM
# receives a sequence rather than a single vector.
model_bilstm.add(keras.layers.Bidirectional(keras.layers.LSTM(100, return_sequences=True)))
model_bilstm.add(keras.layers.Bidirectional(keras.layers.LSTM(100)))
model_bilstm.add(keras.layers.Dropout(0.2))
model_bilstm.add(keras.layers.Dense(1, activation='sigmoid'))  # probability of a positive review

# Binary cross-entropy pairs with the single sigmoid output and the 0/1 labels.
model_bilstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_bilstm.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 40)           4970120   
_________________________________________________________________
dropout_7 (Dropout)          (None, 500, 40)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 500, 200)          112800    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200)               240800    
_________________________________________________________________
dropout_8 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 201       
Total params: 5,323,921
Trainable params: 5,323,921
Non-trainable params: 0
____________________________________________

In [20]:
# Train the stacked BiLSTM for 5 epochs; validation_split holds out the last
# 20% of the samples for validation.
model_bilstm.fit(
    x=X_pad,
    y=Y,
    batch_size=16,
    epochs=5,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f4d8ef637b8>

In [21]:
# Persist the trained BiLSTM model in three forms.

# 1) Whole model in a single HDF5 file.
model_bilstm.save("movie_sentiment_analysis_bilstm.h5")

# 2) Architecture only, serialised to JSON. It is held in its own variable
#    (previously this reused `model_gru_json`, a copy-paste slip that overwrote
#    the GRU architecture string left by the GRU save cell).
model_bilstm_json = model_bilstm.to_json()
with open("movie_sentiment_analysis_bilstm.json", "w") as json_file:
    json_file.write(model_bilstm_json)

# 3) Weights only, in a separate HDF5 file.
model_bilstm.save_weights("movie_sentiment_analysis_bilstm_weights.h5")

In [None]:
# Without dropout
# NOTE(review): no cell in this notebook defines `model`, so this cell raised
# NameError on a fresh kernel (Restart & Run All). The definition below is a
# reconstruction — presumably the LSTM classifier above with its Dropout layers
# removed (it is trained with the same 10 epochs / batch size of 16) —
# TODO confirm against the original experiment.
model = keras.models.Sequential([
    keras.layers.Embedding(vocab_size, 40, input_length=500),
    keras.layers.LSTM(100),
    keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_pad, Y, validation_split = 0.2, batch_size = 16, epochs = 10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f69b5436518>