# Sentiment Analysis on IMDB Reviews using Ensemble Model. (LTSM, CNN, GRU)
<hr>

### Import libraries

In [107]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
from nltk.corpus import stopwords   # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential, Model     # the model
from tensorflow.keras.layers import Embedding, LSTM, Conv1D, GlobalMaxPooling1D, GRU, Dense, Dropout, Input, Average # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping   # save model
from tensorflow.keras.models import load_model   # load saved model
import re


### Preview dataset

In [108]:
from google.colab import drive
drive.mount('/content/drive')

data = pd.read_csv('/content/drive/MyDrive/IMDB_Dataset.csv')

print(data)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


### Declaring the english stop words

In [109]:
import nltk
nltk.download('stopwords')

english_stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Load and Clean Dataset

In [110]:
def load_dataset():
    data = pd.read_csv('/content/drive/MyDrive/IMDB_Dataset.csv')
    x_data = data['review']       # Reviews/Input
    y_data = data['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


  y_data = y_data.replace('negative', 0)


### Split Dataset

In [111]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
47867    [i, agree, capital, city, dvd, i, watched, sho...
8905     [cheap, manipulative, this, film, heart, it, a...
14277    [i, surprised, film, well, hamptons, film, fes...
4073     [though, series, ran, season, stayed, years, i...
29863    [i, get, diehl, character, posed, microcosm, a...
                               ...                        
3180     [i, seen, hundreds, silent, movies, some, alwa...
6715     [what, makes, best, picture, material, the, os...
38047    [the, way, story, played, interaction, lead, c...
14996    [and, since, days, clarissa, explains, it, all...
36682    [one, biggest, hits, brown, harvard, exciting,...
Name: review, Length: 40000, dtype: object 

43044    [like, earliest, films, movie, short, lasting,...
27025    [no, one, going, mistake, the, squall, good, m...
26560    [the, idea, making, miniseries, berlin, airlif...
15168    [landscape, battle, opens, escaping, prisoners...
21718    [todd, rohal, mad, genius, knuckleface, jones,...
 

### Function for getting the maximum review length, by calculating the mean of all the reviews length (using <b>numpy.mean</b>)

In [112]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

### Tokenize and Pad/Truncate Reviews

In [113]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[   1  936 5367 ...  342   45 4801]
 [ 596 4726    8 ...    0    0    0]
 [   1  653    4 ...    0    0    0]
 ...
 [   2   26   13 ...    0    0    0]
 [  32  141  389 ...    0    0    0]
 [   5 1044 1807 ... 1771  320    7]] 

Encoded X Test
 [[   6 7997   34 ...    0    0    0]
 [ 249    5   80 ...   70  602 5893]
 [   2  228  137 ...   81 6294    0]
 ...
 [   1  331 1275 ...    0    0    0]
 [   1   14  415 ...  110 1789  824]
 [ 285   23    3 ...    0    0    0]] 

Maximum review length:  130


### Build Architecture/Model

In [114]:
def build_ensemble_model():
    inputs = Input(shape=(max_length,))
    embed = Embedding(total_words, 32)(inputs)

    # Three branches
    lstm = Dense(1, activation='sigmoid')(Dropout(0.2)(LSTM(64)(embed)))
    cnn = Dense(1, activation='sigmoid')(Dropout(0.3)(GlobalMaxPooling1D()(Conv1D(32, 3, activation='relu')(embed))))
    gru = Dense(1, activation='sigmoid')(Dropout(0.2)(GRU(64)(embed)))

    # Combine them
    output = Average()([lstm, cnn, gru])

    model = Model(inputs, output)
    model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
    return model

### Training


In [115]:
import os

# Create model directory
os.makedirs('/content/drive/MyDrive/models', exist_ok=True)

# Save model
model = build_ensemble_model()
checkpoint = ModelCheckpoint('/content/drive/MyDrive/models/Ensemble.keras', monitor='accuracy', save_best_only=True, verbose=1)

earlyStopping = EarlyStopping(
    monitor='accuracy',
    patience=1,
    verbose=1,
)

In [116]:
model.fit(x_train, y_train, batch_size=32, epochs=100, validation_split=0.1, callbacks=[checkpoint, earlyStopping], verbose=1)

Epoch 1/100
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 143ms/step - accuracy: 0.6720 - loss: 0.6137
Epoch 1: accuracy improved from -inf to 0.74539, saving model to /content/drive/MyDrive/models/Ensemble.keras
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 146ms/step - accuracy: 0.6721 - loss: 0.6136 - val_accuracy: 0.8087 - val_loss: 0.4667
Epoch 2/100
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step - accuracy: 0.8202 - loss: 0.4404
Epoch 2: accuracy improved from 0.74539 to 0.81561, saving model to /content/drive/MyDrive/models/Ensemble.keras
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 148ms/step - accuracy: 0.8202 - loss: 0.4404 - val_accuracy: 0.8245 - val_loss: 0.4389
Epoch 3/100
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 145ms/step - accuracy: 0.8586 - loss: 0.3896
Epoch 3: accuracy improved from 0.81561 to 0.86800, saving model to /content/drive/MyDri

<keras.src.callbacks.history.History at 0x79f366803980>

### Testing

In [117]:
y_pred = model.predict(x_test, verbose=0)
accuracy = np.mean((y_pred > 0.5).flatten() == y_test) * 100
print(f"\nAccuracy: {accuracy:.2f}%")


Accuracy: 84.03%


### Load Saved Model

In [118]:
Ensemble_model = load_model('/content/drive/MyDrive/models/Ensemble.keras')

Receives a review as an input to be predicted

In [119]:
review = str(input('Movie Review: '))

Movie Review: good


The input must be pre processed before it is passed to the model to be predicted

In [120]:
# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  good
Filtered:  ['good']


We need to tokenize and encode the words. I use the tokenizer which was previously declared because we want to encode the words based on words that are known by the model.

In [121]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


This is the result of the prediction which shows the **confidence score** of the review statement.

In [122]:
# Get predictions from model
result = model.predict(tokenize_words)[0][0]
print(result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
0.81275547


Check whether the review is negative or postive.

In [123]:
if result >= 0.7:
    print('positive')
else:
    print('negative')

positive
