In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [2]:
# Read the IMDB reviews CSV (path is relative to the notebook's working
# directory) and preview the first rows.
df = pd.read_csv(filepath_or_buffer='IMDB Dataset.csv')
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Encode the sentiment labels as integers: positive -> 1, negative -> 0.
sentiment_codes = {'positive' : 1, 'negative' : 0}

# Guard against re-running the cell: once the column already holds 0/1,
# mapping it again would match no key and turn every label into NaN.
if not df['sentiment'].isin(list(sentiment_codes.values())).all():
    encoded = df['sentiment'].map(sentiment_codes)

    # Series.map yields NaN for values missing from the dict; fail loudly
    # here rather than passing NaN labels on to training.
    unknown = df.loc[encoded.isna(), 'sentiment'].unique().tolist()
    if unknown:
        raise ValueError(f"Unexpected sentiment labels: {unknown}")

    df['sentiment'] = encoded.astype(int)

In [4]:
# Preview the frame again to check that the sentiment column is now numeric.
df.head(n=5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [5]:
# Turn each review into a fixed-length sequence of word indices.
# NOTE(review): the tokenizer is fitted on every review before the train/test
# split happens in a later cell, so the vocabulary also sees test reviews.
# NOTE(review): the raw reviews contain "<br />" markup; presumably it
# survives tokenisation as a frequent junk token - confirm and strip if so.
max_features = 1000  # passed to Tokenizer(num_words=...) and as the Embedding input size
maxlen = 500         # every sequence is padded/truncated to this length

tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(df['review'])
X = pad_sequences(
    tokenizer.texts_to_sequences(df['review']),
    maxlen=maxlen,
)

In [6]:
# Hold out 20% of the samples for the final evaluation; the fixed
# random_state keeps the split reproducible between runs.
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Binary sentiment classifier: embedding -> dropout -> LSTM -> sigmoid unit.
model = Sequential([
    Embedding(max_features, 64, input_length=maxlen),
    Dropout(0.2),
    LSTM(50, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [8]:
# Train the model; validation_split=0.2 is passed so Keras reports
# validation metrics alongside the training metrics.
batch_size = 64
epochs = 3
model.fit(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x7d56e6c45b70>

In [9]:
# Evaluate on the held-out test split; with the compile settings above,
# evaluate() returns the loss followed by the accuracy metric.
score, acc = model.evaluate(
    x=X_test,
    y=y_test,
    batch_size=batch_size,
)
print("Test Accuracy: ", acc)

Test Accuracy:  0.8709999918937683


In [12]:
def predict_sentiment(tokenizer, model, review, maxlen, threshold=0.5):
    """Print whether ``model`` scores ``review`` as positive or negative.

    The review is converted with ``tokenizer.texts_to_sequences``, padded to
    ``maxlen`` with ``pad_sequences`` and passed to ``model.predict``. The
    first value of the first prediction row is then compared to ``threshold``.

    Args:
        tokenizer: Object exposing ``texts_to_sequences``; it should be the
            tokenizer that produced the model's training sequences.
        model: Object exposing ``predict`` that returns one row per input,
            with the score as the row's first element.
        review: A single review as a string.
        maxlen: Length passed to ``pad_sequences``; it should match the
            length used for the training sequences.
        threshold: Scores strictly greater than this are reported as
            positive. Defaults to 0.5.

    Returns:
        None. The verdict is printed to stdout.
    """
    # texts_to_sequences expects a list of texts, so wrap the single review.
    sequence = tokenizer.texts_to_sequences([review])
    padded_sequence = pad_sequences(sequence, maxlen=maxlen)

    # One input row and one output unit: take the scalar score.
    prediction = model.predict(padded_sequence)[0][0]

    if prediction > threshold:
        print("Positive Review.")
    else:
        print("Negative Review.")

# Quick check of the trained model on a hand-written review.
review = "This movie is really bad."
predict_sentiment(
    tokenizer=tokenizer,
    model=model,
    review=review,
    maxlen=maxlen,
)

Negative Review.
