# Import the Libraries

In [46]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras

# Import the Dataset

In [56]:
# Load the training and test review tables from the CSV files next to the notebook.
imdb_reviews, test_reviews = map(pd.read_csv, ('imdb_reviews.csv', 'test_reviews.csv'))

In [57]:
# Preview the first rows of the training table to check the columns that were loaded.
imdb_reviews.head()

Unnamed: 0,Reviews,Sentiment
0,<START this film was just brilliant casting lo...,positive
1,<START big hair big boobs bad music and a gian...,negative
2,<START this has to be one of the worst films o...,negative
3,<START the <UNK> <UNK> at storytelling the tra...,positive
4,<START worst mistake of my life br br i picked...,negative


# Import the Word Index File

This file is used in this project to convert words to numbers.

In [58]:
# Load the word -> index table.
# keep_default_na=False: every entry in the Words column is a literal vocabulary
# word. With pandas' default NA parsing, words such as "nan" or "null" would be
# read as missing values (float NaN) and could never be looked up as strings.
word_index = pd.read_csv('word_indexes.csv', keep_default_na=False)

In [59]:
# Preview the word index table (columns: Words, Indexes).
word_index.head()

Unnamed: 0,Words,Indexes
0,tsukino,52009
1,nunnery,52010
2,sonja,16819
3,vani,63954
4,woods,1411


## Convert the word index to dictionary

In [60]:
# Rebind word_index from the loaded table to a plain {word: index} dictionary.
# If a word appears more than once, the last row wins.
word_index = {
    word: index
    for word, index in zip(word_index['Words'], word_index['Indexes'])
}

## Add some words in the Word index

In [92]:
# Register the special tokens under the lowest ids.
# "<START" is intentionally written without a closing ">": that is exactly how
# the token appears at the beginning of every review in the data.
word_index.update({
    "<PAD>": 0,
    "<START": 1,
    "<UNK>": 2,
    "<UNUSED>": 3,
})

## Function for converting words to the numbers defined in the Word Index file

In [93]:
def review_encoder(text, vocab=None):
    """Convert a tokenized review into a list of integer word ids.

    Parameters
    ----------
    text : iterable of str
        The words of one review, in order.
    vocab : dict, optional
        Mapping from word to integer id. Defaults to the notebook-level
        ``word_index`` dictionary.

    Returns
    -------
    list of int
        One id per word. A word that is missing from ``vocab`` is encoded
        with the id of ``"<UNK>"`` when the vocabulary defines that token.

    Raises
    ------
    TypeError
        If a token is not a string (for example when the review has already
        been encoded and this function is applied a second time).
    KeyError
        If a word is missing and the vocabulary has no ``"<UNK>"`` entry.
    """
    if vocab is None:
        vocab = word_index
    unknown_id = vocab.get("<UNK>")

    encoded = []
    for word in text:
        # Fail loudly on non-string tokens; otherwise re-running the encoding
        # cell would silently turn every id into <UNK>.
        if not isinstance(word, str):
            raise TypeError(
                "review_encoder expects string tokens, got %r" % (word,)
            )
        if word in vocab:
            encoded.append(vocab[word])
        elif unknown_id is not None:
            encoded.append(unknown_id)
        else:
            raise KeyError(word)
    return encoded

In [94]:
# Separate the review text (features) from the sentiment column (labels)
# for both the training and the test table.
train_data = imdb_reviews['Reviews']
train_labels = imdb_reviews['Sentiment']
test_data = test_reviews['Reviews']
test_labels = test_reviews['Sentiment']

In [95]:
# Preview the raw review strings before they are tokenized.
train_data.head()

0    <START this film was just brilliant casting lo...
1    <START big hair big boobs bad music and a gian...
2    <START this has to be one of the worst films o...
3    <START the <UNK> <UNK> at storytelling the tra...
4    <START worst mistake of my life br br i picked...
Name: Reviews, dtype: object

## Split the sentences into the words

In [96]:
# Tokenize each review on whitespace, turning every string into a list of words.
train_data = train_data.apply(str.split)
test_data = test_data.apply(str.split)

## Convert words in the train data to the numbers

In [97]:
# Replace every word list with the matching list of integer ids
# (applied to both the training and the test reviews).
train_data = train_data.map(review_encoder)
test_data = test_data.map(review_encoder)

In [83]:
# Inspect the first review after encoding: a list of integer word ids.
train_data[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 5535,
 18,

## Transform the Sentiment column to the numeric

In [84]:
def encode_sentiments(sentiment):
    """Map a sentiment label to a binary class id.

    Returns 1 when ``sentiment`` equals the string 'positive' and 0 for
    every other value.
    """
    return 1 if sentiment == 'positive' else 0

In [103]:
# Encode the labels from the raw Sentiment columns rather than from
# train_labels/test_labels themselves. This keeps the cell idempotent:
# re-applying encode_sentiments to already-numeric labels would silently turn
# every label into 0, because 1 != 'positive'.
train_labels = imdb_reviews['Sentiment'].apply(encode_sentiments)
test_labels = test_reviews['Sentiment'].apply(encode_sentiments)

In [98]:
# Bring every encoded review to a fixed length of 500 ids, filling the end of
# shorter reviews with the <PAD> id. Both splits share the same options.
pad_options = {'value': word_index["<PAD>"], 'padding': 'post', 'maxlen': 500}
train_data = keras.preprocessing.sequence.pad_sequences(train_data, **pad_options)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, **pad_options)

# Build the Model

I added one hidden layer. I used ReLu activation function in the hidden layer and Sigmoid activation function in the output layer. I also used Adam optimizer function and Binary crossentropy for loss and accuracy metrics.

In [99]:
# Baseline classifier: embedding -> average over the sequence -> one hidden
# ReLU layer -> single sigmoid output for the binary sentiment.
# input_length=500 matches the maxlen used when padding the reviews.
# NOTE(review): Embedding(10000, ...) presumably requires all token ids to be
# below 10000 — confirm the encoded reviews respect that limit.
model = keras.Sequential([keras.layers.Embedding(10000,16, input_length=500),
                        keras.layers.GlobalAveragePooling1D(),
                        keras.layers.Dense(16, activation='relu'),
                        keras.layers.Dense(1,activation='sigmoid')])

In [100]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric.
model.compile(optimizer='adam', loss= 'binary_crossentropy', metrics= ['accuracy'])

In [104]:
# Train the baseline model for 30 epochs in batches of 512 reviews.
# validation_data is passed as a tuple (x_val, y_val), the form documented by
# Keras; a list is not unpacked the same way in every version.
# NOTE: the test split doubles as the validation set here, so the validation
# accuracy is not an independent estimate once it is used to pick a model.
history_model = model.fit(train_data, train_labels, epochs=30, batch_size=512,
                          validation_data=(test_data, test_labels))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


# Interpretation of the training results

My model's training accuracy is .95 and its validation accuracy is .88, so I can say that the model predicts the sentiment correctly for most reviews.

# Build a new model

I want to try a new model. In this model, I used Batch Normalization and Dropout and I added a new hidden layer.

In [122]:
# Second variant: wider embedding (32 instead of 16 dimensions), plus
# BatchNormalization and Dropout (rate 0.5) after the first hidden layer and
# an extra 8-unit hidden layer before the sigmoid output.
# input_length=500 matches the maxlen used when padding the reviews.
model2 = keras.Sequential([keras.layers.Embedding(10000,32, input_length=500),
                        keras.layers.GlobalAveragePooling1D(),
                        keras.layers.Dense(16, activation='relu'),
                        keras.layers.BatchNormalization(),
                        keras.layers.Dropout(0.5),
                        keras.layers.Dense(8, activation='relu'),
                        keras.layers.Dense(1,activation='sigmoid')])

In [123]:
# Same training configuration as the first model: Adam, binary cross-entropy, accuracy metric.
model2.compile(optimizer='adam', loss= 'binary_crossentropy', metrics= ['accuracy'])

In [124]:
# Train the second model with the same schedule (30 epochs, batch size 512).
# validation_data is passed as a tuple (x_val, y_val), the form documented by
# Keras; a list is not unpacked the same way in every version.
# NOTE: history_model is rebound here, so the previous model's training
# history is no longer reachable through this name.
history_model = model2.fit(train_data, train_labels, epochs=30, batch_size=512,
                           validation_data=(test_data, test_labels))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


# Interpretation of the new model

My new model has .99 accuracy but .83 validation accuracy so I should say that my model was over-fitting. So, this is not good.

# Build a new one

This time, I removed Batch Normalization and Dropout and just used hidden layers with more units (32 and 16).

In [141]:
# Third variant: 32-dimensional embedding followed by two hidden ReLU layers
# (32 and 16 units) and a sigmoid output; no BatchNormalization or Dropout.
# input_length=500 matches the maxlen used when padding the reviews.
model3 = keras.Sequential([keras.layers.Embedding(10000,32, input_length=500),
                        keras.layers.GlobalAveragePooling1D(),
                        keras.layers.Dense(32, activation='relu'),
                        keras.layers.Dense(16, activation='relu'),
                        keras.layers.Dense(1,activation='sigmoid')])

In [144]:
# Same training configuration as the earlier models: Adam, binary cross-entropy, accuracy metric.
model3.compile(optimizer='adam', loss= 'binary_crossentropy', metrics= ['accuracy'])

In [145]:
# Train the third model with the same schedule (30 epochs, batch size 512).
# validation_data is passed as a tuple (x_val, y_val), the form documented by
# Keras; a list is not unpacked the same way in every version.
# NOTE: history_model is rebound here, so the previous model's training
# history is no longer reachable through this name.
history_model = model3.fit(train_data, train_labels, epochs=30, batch_size=512,
                           validation_data=(test_data, test_labels))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


# Interpretation of the result:

The result is the same as with the previous model: training accuracy is .99, but validation accuracy is only .85, so the model is over-fitting again. All in all, the first model is still the best one :)