In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split


In [2]:
df=pd.read_csv('IMDBDataset.csv')
print(df)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [3]:
texts = df['review'].tolist()
labels = df['sentiment'].tolist()
labels = [1 if label == 'positive' else 0 for label in labels]
num_words = 10000
max_len = 200

In [4]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
x_data = pad_sequences(sequences, maxlen=max_len)
y_data = np.array(labels)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=42)

In [5]:
x_train = pad_sequences(x_train, maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)

In [10]:
embedding_dim = 128

model = Sequential([
    Embedding(input_dim=num_words, output_dim=embedding_dim),
    LSTM(units=64, dropout=0.2, recurrent_dropout=0.2),
    Dense(units=64, activation='relu'),
    Dense(units=1, activation='sigmoid')
])

model.summary()

In [7]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [8]:
batch_size = 128
epochs = 5

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.2)


Epoch 1/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 418ms/step - accuracy: 0.6940 - loss: 0.5650 - val_accuracy: 0.8530 - val_loss: 0.3442
Epoch 2/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 457ms/step - accuracy: 0.8699 - loss: 0.3218 - val_accuracy: 0.8510 - val_loss: 0.3542
Epoch 3/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 432ms/step - accuracy: 0.8886 - loss: 0.2886 - val_accuracy: 0.8320 - val_loss: 0.3950
Epoch 4/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 414ms/step - accuracy: 0.8639 - loss: 0.3334 - val_accuracy: 0.7345 - val_loss: 0.5346
Epoch 5/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 455ms/step - accuracy: 0.8506 - loss: 0.3299 - val_accuracy: 0.8660 - val_loss: 0.3683


In [9]:
loss, accuracy = model.evaluate(x_test, y_test)

print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 53ms/step - accuracy: 0.8740 - loss: 0.3385
Test Loss: 0.3422
Test Accuracy: 0.8741
