In [1]:
# To silence the TensorFlow warnings, you can use the following code before you import the TensorFlow library.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow.keras.layers as layers
import re
import string
print("Imports successful!")

Imports successful!


Dataset : IMDB dataset of 50K movie reviews

In [2]:
dataset = pd.read_csv('IMDB_Dataset.csv')
dataset

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [3]:
dataset['review'] = dataset['review'].str.replace(f"[{string.punctuation}]", "", regex=True)
dataset.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production br br The filmin...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically theres a family where a little boy J...,negative
4,Petter Matteis Love in the Time of Money is a ...,positive


In [4]:
dataset['review'] = dataset['review'].apply(lambda row: row.lower())
dataset.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production br br the filmin...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically theres a family where a little boy j...,negative
4,petter matteis love in the time of money is a ...,positive


Preprocess the input

In [5]:
word_index = tf.keras.datasets.imdb.get_word_index(
    path='imdb_word_index.json'
)


In [6]:
print(len(word_index))
word_index['<PAD>'] = 0
word_index['<START>'] =1 
word_index['UNK>'] = 2
word_index['<UNUSED>'] = 3 
word_index['<br />'] = 3 

88584


Map each word to an integer

In [7]:
next_available_index = max(word_index.values()) + 1  # Start from the next available index

def review_encoder(text):
    global next_available_index  # Use the global variable to keep track of the index
    arr = []
    for word in text:
        if word in word_index:
            arr.append(word_index[word])  # Use the existing index
        else:
            # If the word is missing, assign a new index and update word_index
            word_index[word] = next_available_index
            arr.append(next_available_index)
            next_available_index += 1  # Increment the available index for future missing words
    return arr

SPLIT TRAIN AND TEST DATA FROM RAW DATASET

In [8]:
test_imdb = dataset.loc[0:9999, :]
print('20 % is test data', test_imdb.shape)
train_imdb = dataset.loc[10000:, :]
print('80 % is train data', train_imdb.shape)

train_data, train_labels = train_imdb['review'], train_imdb['sentiment']
test_data, test_labels = test_imdb['review'], test_imdb['sentiment']

print('train_data shape:', train_data.shape)
print('train_labels shape:', train_labels.shape)

20 % is test data (10000, 2)
80 % is train data (40000, 2)
train_data shape: (40000,)
train_labels shape: (40000,)


In [9]:
len(train_data[10000])
train_data[10000]
 

' while sporadically engrossing including a few effectively tender moments and humorous the sledgehammerobvious satire homecoming hinges on comes off as forced and ultimately unfulfilling with material like this timing is everything michael moore knew to release fahrenheit 911 before the 2004 elections and the real tragedy of dantes film is that it didnt come out 2 years ago when its message would have carried an energy that would have energized the dissidents further in 2006 mockery of the wellsettled bush administration hardly seems as controversially compelling or imperiled as it did thenbr br frankly anyone that could be convinced of anything by a ham fisted zombie flick has questionable intelligence br br and if you didnt notice michael moore didnt exactly help to defeat bushbr br there was nothing engrossing about this film i just felt disgust at how blatant and frankly stupid the film was it was painful to watch if you are going to do something like this you need a bit of wit sa

In [10]:
train_data = train_data.apply(lambda row: row.split())
test_data = test_data.apply(lambda row: row.split())
[]

[]

CONVERT TEXT DATA TO INTEGER

In [11]:
train_data = train_data.apply(review_encoder)
test_data = test_data.apply(review_encoder)
test_data

0       [28, 4, 1, 82, 1986, 44, 1043, 12, 100, 146, 4...
1       [3, 386, 114, 362, 7, 7, 1, 1420, 3117, 6, 52,...
2       [10, 194, 11, 13, 3, 386, 93, 5, 1139, 55, 20,...
3       [688, 21137, 3, 220, 118, 3, 114, 427, 3270, 1...
4       [53758, 105608, 116, 8, 1, 55, 4, 275, 6, 3, 2...
                              ...                        
9995    [250, 438, 17, 41, 3015, 1121, 2542, 2803, 342...
9996    [199, 69, 3, 986, 86, 67, 256, 132, 12, 11, 6,...
9997    [11, 17, 6, 3, 75, 17, 18, 100, 146, 32, 2200,...
9998    [11, 6, 3, 17, 12, 13, 239, 90, 5, 2833, 1, 65...
9999    [11210, 19, 41, 6477, 284, 1, 1593, 2, 677, 15...
Name: review, Length: 10000, dtype: object

Encode sentiments

In [12]:
def encode_sentiments(label):
    if label =='positive':
        return 1
    else:
        return 0
train_labels = train_labels.apply(encode_sentiments)
test_labels = test_labels.apply(encode_sentiments)

In [13]:
train_data = tf.keras.preprocessing.sequence.pad_sequences(train_data, value=word_index['<PAD>'], padding='post', maxlen=500)
test_data = tf.keras.preprocessing.sequence.pad_sequences(test_data, value=word_index['<PAD>'], padding='post', maxlen=500)

Till now texts are converted to integers but semantics is not applied yet. Eg: beautiful and pretty are similar in unknown till now.

The movie was awesome:After word embeddings:
The : [1.2,1.3,1,5,...]
movie: [3.1,2.2,3.1,...]
was:[1.1,2.2,3.1,...]
awesome:[1.1,1.2,2.1,...]


In [18]:
model = keras.Sequential([tf.keras.layers.Embedding(len(word_index), 16, input_length = 500),
                tf.keras.layers.GlobalAveragePooling1D(),
                tf.keras.layers.Dense(16, activation='relu'),
                tf.keras.layers.Dense(1, activation='sigmoid')])

In [19]:
model.compile(optimizer='adam',
             loss='binary_crossentropy', metrics=['accuracy'])

In [20]:
history = model.fit(train_data, train_labels, epochs=30, batch_size=512, validation_data=(test_data, test_labels))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [21]:
loss, accuracy = model.evaluate(test_data, test_labels)



Now, test with a known label whether the model predicts correct sentiment or not

In [38]:
rand_index = np.random.randint(1, 1000)
user_review = test_imdb.loc[rand_index]
user_review

review       this film cant make up its mind whether its me...
sentiment                                             negative
Name: 765, dtype: object

In [39]:
user_review = test_data[rand_index]
user_review = np.array([user_review])
if (model.predict(user_review) > 0.5).astype("int32"):
    print('Positive sentiment')
else:
    print('Negative sentiment')

Negative sentiment
