In [1]:
from keras.datasets import imdb
from keras import Sequential
from keras.layers import Dense, SimpleRNN, Embedding, Flatten, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

In [3]:
(x_train, y_train), (x_test, y_test) = imdb.load_data()

In [4]:
data = pd.read_csv('IMDB Dataset.csv')

In [5]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
data.shape

(50000, 2)

In [7]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [8]:
data.replace({'sentiment': {'positive': 1, 'negative': 0}}, inplace = True)

  data.replace({'sentiment': {'positive': 1, 'negative': 0}}, inplace = True)


In [9]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [10]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size = 0.2, random_state =1)

Data preprocessing

In [11]:
tokenizer = Tokenizer(num_words = 5000)

In [12]:
tokenizer.fit_on_texts(train_data['review'])

In [13]:
x_train = pad_sequences(tokenizer.texts_to_sequences(train_data['review']), maxlen = 200)
x_test = pad_sequences(tokenizer.texts_to_sequences(test_data['review']), maxlen = 200)

In [14]:
print(x_train)

[[   0    0    0 ... 2467 4903  375]
 [   0    0    0 ...    4    1  954]
 [   0    0    0 ...   29    1   57]
 ...
 [   0    0    0 ... 3833  457  155]
 [  73   74    3 ...   10  211   11]
 [   0    0    0 ...   69  541 1150]]


In [15]:
y_train = train_data['sentiment']
y_test = test_data['sentiment']

In [16]:
print(y_train)

18165    0
36059    1
13242    1
32985    1
41133    1
        ..
43723    1
32511    0
5192     0
12172    0
33003    1
Name: sentiment, Length: 40000, dtype: int64


LSTM

In [17]:
model = Sequential()
model.add(Embedding(input_dim = 5000, output_dim = 128, input_length = 200))
model.add(LSTM(128, dropout = 0.2, recurrent_dropout = 0.2))
model.add(Dense(1, activation = 'sigmoid'))



In [18]:
model.summary()

In [19]:
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

Training the model

In [23]:
model.fit(x_train, y_train, epochs = 5, validation_split = 0.3)

Epoch 1/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 83ms/step - accuracy: 0.9243 - loss: 0.1975 - val_accuracy: 0.8852 - val_loss: 0.2854
Epoch 2/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 83ms/step - accuracy: 0.9422 - loss: 0.1522 - val_accuracy: 0.8804 - val_loss: 0.3088
Epoch 3/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 97ms/step - accuracy: 0.9591 - loss: 0.1154 - val_accuracy: 0.8827 - val_loss: 0.3261
Epoch 4/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 85ms/step - accuracy: 0.9680 - loss: 0.0930 - val_accuracy: 0.8845 - val_loss: 0.3891
Epoch 5/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 89ms/step - accuracy: 0.9757 - loss: 0.0693 - val_accuracy: 0.8850 - val_loss: 0.4057
Epoch 6/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 91ms/step - accuracy: 0.9817 - loss: 0.0585 - val_accuracy: 0.8830 - val_loss: 0.4538
Epoc

<keras.src.callbacks.history.History at 0x305aeeff0>

In [25]:
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Test loss: {loss}")
print(f"Test accuracy: {accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 41ms/step - accuracy: 0.8783 - loss: 0.5928
Test loss: 0.61236572265625
Test accuracy: 0.8736000061035156


Building a predictive system

In [None]:
def predict_sentiment(review):
    sequence = tokenizer.texts_to_sequences(review)
    padded_sequences = pad_sequences(sequences, maxlen = 200)
    prediction = model.predict(padded_sequnces)
    sentiment = 'positive' if prediction[0][0] > 0.5 else 'negative'
    return sentiment