In [35]:
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Load the review dataset from a CSV file in the working directory.
# The columns used below are 'review' (text) and 'sentiment' (label).
data = pd.read_csv('a1_IMDB_Dataset.csv')  


# Quick sanity check: show the first rows and the label distribution.
print(data.head())
print(data['sentiment'].value_counts())


def preprocess_text(text):
    """Normalize a raw review string before tokenization.

    The text is lowercased, HTML tags (such as ``<br />``) are replaced
    by a space, and every remaining character that is not a lowercase
    ASCII letter, a digit or whitespace is removed.

    Args:
        text: Raw review text.

    Returns:
        The cleaned, lowercased text. Whitespace is not collapsed.
    """
    text = text.lower()
    # Remove markup before stripping punctuation; otherwise "<br />" loses
    # only its brackets and survives as a spurious "br" token. A space is
    # substituted so the words on either side of a tag do not fuse. The
    # pattern requires a letter after "<" so that plain comparisons such as
    # "3 < 5" are not mistaken for tags.
    text = re.sub(r'</?[a-z][^>]*>', ' ', text)
    # Raw string: "\s" in a normal string literal is an invalid escape.
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text

# Clean every review in place using preprocess_text.
data['review'] = data['review'].apply(preprocess_text)


# Convert the string sentiment labels to integer class ids.
# NOTE(review): LabelEncoder assigns ids in sorted label order, so with the
# 'negative'/'positive' labels printed above this should give negative=0,
# positive=1 -- confirm via label_encoder.classes_.
label_encoder = LabelEncoder()
data['sentiment'] = label_encoder.fit_transform(data['sentiment'])


# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(data['review'], data['sentiment'], test_size=0.2, random_state=42)


# Vocabulary size limit passed to the tokenizer; also used as the input
# dimension of the Embedding layer.
max_features = 2000
tokenizer = Tokenizer(num_words=max_features)
# Build the word index from the training reviews only, so the vocabulary is
# not influenced by the held-out reviews.
tokenizer.fit_on_texts(X_train)

# Map each review to a list of integer word indices.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)


# Force every sequence to exactly max_length indices.
# NOTE(review): pad_sequences is called with its defaults, which per the
# Keras docs pad and truncate at the start of the sequence -- confirm that
# keeping only the last 100 tokens of long reviews is intended.
max_length = 100  
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)


# Size of each word embedding vector and number of LSTM units.
embed_dim = 128
lstm_out = 196

# Binary sentiment classifier: embedding -> dropout -> LSTM -> sigmoid unit.
model = Sequential()
# `input_length` is deprecated in Keras 3 and only triggers a warning there;
# the sequence length is inferred from the padded input when the model is
# first called, so the argument is omitted.
model.add(Embedding(max_features, embed_dim))
# Drops whole embedding channels rather than individual elements.
model.add(SpatialDropout1D(0.2))
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
# One sigmoid output paired with binary cross-entropy for the 0/1 labels.
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 5 epochs with mini-batches of 32 reviews.
# NOTE(review): the held-out test split is passed as validation data, so the
# reported val_* metrics are the only evaluation of that split; no separate
# validation set exists in this script.
model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_data=(X_test_pad, y_test))

# Classify one raw review with the trained model
def predict_sentiment(review):
    """Return 'Positive' or 'Negative' for a single raw review string.

    The review is cleaned with preprocess_text, converted to word indices
    with the fitted tokenizer, padded to max_length and scored by the model.
    A score above 0.5 is reported as 'Positive'.
    """
    cleaned = preprocess_text(review)
    # The tokenizer and padder work on batches, hence the one-element list.
    padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen=max_length)
    score = model.predict(padded)[0][0]
    if score > 0.5:
        return 'Positive'
    return 'Negative'

# Smoke test: classify one hand-written review and print the predicted label.
input_review = "I loved this movie! It was fantastic." 
print(f'Review: "{input_review}" is {predict_sentiment(input_review)}')


                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
sentiment
positive    25000
negative    25000
Name: count, dtype: int64




Epoch 1/5
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 90s 70ms/step - accuracy: 0.7209 - loss: 0.5394 - val_accuracy: 0.8431 - val_loss: 0.3659
Epoch 2/5
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 83s 66ms/step - accuracy: 0.8392 - loss: 0.3743 - val_accuracy: 0.8481 - val_loss: 0.3434
Epoch 3/5
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 82s 65ms/step - accuracy: 0.8618 - loss: 0.3269 - val_accuracy: 0.8574 - val_loss: 0.3345
Epoch 4/5
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 84s 67ms/step - accuracy: 0.8728 - loss: 0.3052 - val_accuracy: 0.8519 - val_loss: 0.3503
Epoch 5/5
1250/1250 ━━━━━━━━━━━━━━━━━━━━ 83s 66ms/step - accuracy: 0.8860 - loss: 0.2762 - val_accuracy: 0.8643 - val_loss: 0.3127
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 161ms/step
Review: "I loved this movie! It was fantastic." is Positive
