In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Dropout

In [7]:
data = pd.read_csv("IMDB Dataset.csv")
print(data.head())


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


In [9]:
import re
def clean_text(text):
    text = re.sub('<.*?>', '', text)          
    text = re.sub('[^a-zA-Z]', ' ', text)      
    text = text.lower()                       
    text = text.split()                        
    return ' '.join(text)

data['review'] = data['review'].apply(clean_text)
print(data.head())


                                              review sentiment
0  one of the other reviewers has mentioned that ...  positive
1  a wonderful little production the filming tech...  positive
2  i thought this was a wonderful way to spend ti...  positive
3  basically there s a family where a little boy ...  negative
4  petter mattei s love in the time of money is a...  positive


In [10]:
le = LabelEncoder()
data['sentiment'] = le.fit_transform(data['sentiment'])


In [11]:
X_train, X_test, y_train, y_test = train_test_split(
    data['review'], data['sentiment'], test_size=0.2, random_state=42)


In [12]:
max_words = 10000    # top 10k words
max_len = 200        # truncate/pad reviews to 200 words

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')


In [14]:
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128),
    SimpleRNN(128, activation='tanh'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.build(input_shape=(None, max_len))
model.summary()


In [15]:
history = model.fit(
    X_train_pad, y_train,
    epochs=5,
    batch_size=128,
    validation_data=(X_test_pad, y_test))


[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 149ms/step - accuracy: 0.5037 - loss: 0.7040 - val_accuracy: 0.5013 - val_loss: 0.7092
Epoch 2/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 150ms/step - accuracy: 0.4997 - loss: 0.7053 - val_accuracy: 0.5027 - val_loss: 0.6998
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 147ms/step - accuracy: 0.5036 - loss: 0.7008 - val_accuracy: 0.5148 - val_loss: 0.6963
Epoch 4/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 149ms/step - accuracy: 0.5022 - loss: 0.6976 - val_accuracy: 0.4939 - val_loss: 0.6966
Epoch 5/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 150ms/step - accuracy: 0.5023 - loss: 0.6967 - val_accuracy: 0.5079 - val_loss: 0.6928


In [None]:
loss, accuracy = model.evaluate(X_test_pad, y_test)
print(f"Test Accuracy: {accuracy:.4f}")


In [None]:
plt.plot(history.history['accuracy'], label='Train Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

plt.plot(history.history['loss'], label='Train Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()


In [None]:
new_reviews = [
    "The movie was fantastic! I loved it.",
    "Worst film ever. Terrible acting."
]

seq = tokenizer.texts_to_sequences(new_reviews)
pad = pad_sequences(seq, maxlen=max_len, padding='post')
predictions = model.predict(pad)

for review, pred in zip(new_reviews, predictions):
    sentiment = "Positive" if pred > 0.5 else "Negative"
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {sentiment}\n")
