# **Importing the Libraries**

In [None]:
#import libraries
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# **Importing the dataset for sentiment analysis**


In [None]:
# Read the IMDB reviews CSV into a DataFrame and preview the first rows.
# NOTE(review): this is an absolute, Colab-style path; adjust it when running elsewhere.
DATA_PATH = "/content/IMDB Dataset.csv"
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Count how many reviews carry each sentiment label (class-balance check)
df["sentiment"].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# **Changing the format of the labels**

In [None]:
# Encode the string sentiment labels as integers: positive -> 1, negative -> 0.
# The replacement is applied to `df` in place, then the frame is printed for inspection.
label_to_int = {"positive": 1, "negative": 0}
df.replace({"sentiment": label_to_int}, inplace=True)
print(df)

                                                  review  sentiment
0      One of the other reviewers has mentioned that ...          1
1      A wonderful little production. <br /><br />The...          1
2      I thought this was a wonderful way to spend ti...          1
3      Basically there's a family where a little boy ...          0
4      Petter Mattei's "Love in the Time of Money" is...          1
...                                                  ...        ...
49995  I thought this movie did a down right good job...          1
49996  Bad plot, bad dialogue, bad acting, idiotic di...          0
49997  I am a Catholic taught in parochial elementary...          0
49998  I'm going to have to disagree with the previou...          0
49999  No one expects the Star Trek movies to be high...          0

[50000 rows x 2 columns]


# **Splitting the data into train data and test data**

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible
train_data, test_data = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the (rows, columns) shape of each split
for label, frame in (
    ("Training_data_size : ", train_data),
    ("Test_data_size : ", test_data),
):
    print(label, frame.shape)

Training_data_size :  (40000, 2)
Test_data_size :  (10000, 2)


# **Data Pre-processing**

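I tried several data pre-processing techniques, but none of them significantly improved the models' accuracy, and some even reduced the accuracy of the LSTM model. I therefore decided to keep the pre-processing simple: tokenize the raw reviews and pad the resulting sequences to a uniform length.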


In [None]:
# Data pre-processing for model training and testing.
# The tokenizer is fitted on the training reviews only, so the test reviews do not
# influence the vocabulary; num_words=5000 caps the vocabulary used when encoding.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

# Encode each review as a sequence of integer token ids
train_sequences = tokenizer.texts_to_sequences(train_data["review"])
test_sequences = tokenizer.texts_to_sequences(test_data["review"])

# Bring every sequence to a uniform length of 200 so the reviews form a rectangular array
X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)

In [None]:
# Inspect the padded training sequences (integer token ids; shorter reviews are zero-filled)
print(X_train)

[[1935    1 1200 ...  205  351 3856]
 [   3 1651  595 ...   89  103    9]
 [   0    0    0 ...    2  710   62]
 ...
 [   0    0    0 ... 1641    2  603]
 [   0    0    0 ...  245  103  125]
 [   0    0    0 ...   70   73 2062]]


In [None]:
# Inspect the padded test sequences (integer token ids; shorter reviews are zero-filled)
print(X_test)

[[   0    0    0 ...  995  719  155]
 [  12  162   59 ...  380    7    7]
 [   0    0    0 ...   50 1088   96]
 ...
 [   0    0    0 ...  125  200 3241]
 [   0    0    0 ... 1066    1 2305]
 [   0    0    0 ...    1  332   27]]


In [None]:
# Target vectors: the sentiment column of the training and test splits
y_train, y_test = train_data["sentiment"], test_data["sentiment"]

# **Building the LSTM model**

In [None]:
# Define the LSTM sentiment classifier, one layer at a time
model_LSTM = Sequential()
# Map each of the 5000 token ids to a 128-dimensional vector; inputs are 200 tokens long
model_LSTM.add(Embedding(input_dim=5000, output_dim=128, input_length=200))
# Single LSTM layer with 128 units, with dropout on both inputs and recurrent connections
model_LSTM.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
# One sigmoid unit producing a score in (0, 1) for the binary sentiment label
model_LSTM.add(Dense(1, activation="sigmoid"))

In [None]:
# Show the layer stack, output shapes and parameter counts
model_LSTM.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          640000    
                                                                 
 lstm (LSTM)                 (None, 128)               131584    
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 771713 (2.94 MB)
Trainable params: 771713 (2.94 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy as the reported metric
model_LSTM.compile(
    optimizer='adam',
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Train for 2 epochs in mini-batches of 64, holding out 20% of the training data for validation
model_LSTM.fit(
    X_train,
    y_train,
    epochs=2,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7e21565f63e0>

# **Evaluating Different Models to Identify the Suitable One for the Task**

In [None]:
# Evaluate the LSTM model on the held-out test set
LSTM_pred = model_LSTM.predict(X_test)
# Threshold the sigmoid scores at 0.5 and flatten the single output column
# into a 1-D vector of 0/1 labels
LSTM_pred_binary = (LSTM_pred > 0.5).astype(int).ravel()
# scikit-learn metrics take their arguments as (y_true, y_pred), in that order
LSTM_acc = accuracy_score(y_test, LSTM_pred_binary)
print("Test accuracy: {:.2f}%".format(LSTM_acc*100))

Test accuracy: 87.44%


In [None]:
# Per-class precision, recall and F1 for the LSTM predictions on the test set
print(classification_report(y_test, LSTM_pred_binary))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87      4961
           1       0.87      0.89      0.88      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [None]:
# Evaluate a Logistic Regression baseline for sentiment analysis.
# NOTE(review): X_train / X_test are the padded token-id sequences built for the LSTM.
# A linear model treats those arbitrary ids as numeric feature values, which likely
# explains a near-chance score; bag-of-words or TF-IDF features would make this a
# fairer baseline -- confirm before drawing conclusions from the comparison.
model_LR = LogisticRegression()
model_LR.fit(X_train, y_train)
LR_pred = model_LR.predict(X_test)
# scikit-learn metrics take their arguments as (y_true, y_pred), in that order
LR_acc = accuracy_score(y_test, LR_pred)
print("Test accuracy: {:.2f}%".format(LR_acc*100))

Test accuracy: 50.55%


In [None]:
# Per-class precision, recall and F1 for the Logistic Regression predictions on the test set
print(classification_report(y_test, LR_pred))

              precision    recall  f1-score   support

           0       0.50      0.45      0.47      4961
           1       0.51      0.56      0.53      5039

    accuracy                           0.51     10000
   macro avg       0.51      0.51      0.50     10000
weighted avg       0.51      0.51      0.50     10000



In [None]:
# Evaluate a Multinomial Naive Bayes baseline for sentiment analysis.
# NOTE(review): X_train / X_test are the padded token-id sequences built for the LSTM.
# Multinomial NB is normally fitted on count-like features (e.g. word counts), so token
# ids at fixed positions are unlikely to be informative -- confirm before drawing
# conclusions from the comparison.
model_MNB = MultinomialNB()
model_MNB.fit(X_train, y_train)
MNB_pred = model_MNB.predict(X_test)
# scikit-learn metrics take their arguments as (y_true, y_pred), in that order
MNB_acc = accuracy_score(y_test, MNB_pred)
print("Test accuracy : {:.2f}%".format(MNB_acc*100))

Test accuracy : 50.05%


In [None]:
# Per-class precision, recall and F1 for the Multinomial Naive Bayes predictions on the test set
print(classification_report(y_test, MNB_pred))

              precision    recall  f1-score   support

           0       0.50      0.52      0.51      4961
           1       0.50      0.48      0.49      5039

    accuracy                           0.50     10000
   macro avg       0.50      0.50      0.50     10000
weighted avg       0.50      0.50      0.50     10000



In [None]:
# Evaluate a Random Forest classifier baseline for sentiment analysis.
# NOTE(review): X_train / X_test are the padded token-id sequences built for the LSTM.
# Splitting on the numeric value of a token id at a fixed position is unlikely to
# capture sentiment; bag-of-words or TF-IDF features would make this a fairer
# baseline -- confirm before drawing conclusions from the comparison.
model_RF = RandomForestClassifier(n_estimators=100, random_state=42)
model_RF.fit(X_train, y_train)
RF_pred = model_RF.predict(X_test)
# scikit-learn metrics take their arguments as (y_true, y_pred), in that order
RF_acc = accuracy_score(y_test, RF_pred)
print("Test accuracy of Random Forest: {:.2f}%".format(RF_acc*100))

Test accuracy of Random Forest: 54.26%


In [None]:
# Per-class precision, recall and F1 for the Random Forest predictions on the test set
print(classification_report(y_test, RF_pred))

              precision    recall  f1-score   support

           0       0.54      0.59      0.56      4961
           1       0.55      0.50      0.52      5039

    accuracy                           0.54     10000
   macro avg       0.54      0.54      0.54     10000
weighted avg       0.54      0.54      0.54     10000



In [None]:
# Compare the test accuracies of the different models side by side
model_accuracies = {
    "LSTM": LSTM_acc,
    "LogisticRegression": LR_acc,
    "MultiNomialNaiveBayes": MNB_acc,
    "RandomForest": RF_acc,
}
acc_table = pd.DataFrame({
    "Model": list(model_accuracies.keys()),
    "Accuracy": list(model_accuracies.values()),
})

print(acc_table)

                   Model  Accuracy
0                   LSTM    0.8744
1     LogisticRegression    0.5055
2  MultiNomialNaiveBayes    0.5005
3           RandomForest    0.5426


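Comparing the accuracies, it is evident that the LSTM model outperforms the other models in this sentiment-analysis experiment. This gap may stem partly from the complexity of the data, but it is also a consequence of the input representation: every model was given the same padded sequences of token ids. The LSTM passes those ids through an embedding layer and reads them in order, whereas logistic regression, naive Bayes and random forest treat the arbitrary integer ids as numeric feature values, which they struggle to handle effectively, so they perform only marginally better than random guessing. Feeding these classical models bag-of-words or TF-IDF features would give a fairer comparison.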

## **Checking the Results for LSTM model**

In [None]:
# Helper to check the performance of the LSTM model on a few hand-written reviews
def predict_sentiment(model, review, maxlen=200, threshold=0.5):
  """Classify a single review as "positive" or "negative".

  The review is encoded with the notebook-level ``tokenizer``, padded to
  ``maxlen`` tokens and scored by ``model``. The raw model output is printed
  before the label is returned.

  Args:
    model: Fitted model whose ``predict`` returns an array indexable as
      ``[0][0]`` for a single input row.
    review: Raw review text (a single string).
    maxlen: Sequence length passed to ``pad_sequences``. Defaults to 200,
      the length used when the training data was padded in this notebook.
    threshold: Score strictly above which the review is labelled positive.
      Defaults to 0.5.

  Returns:
    "positive" if the score is greater than ``threshold``, otherwise
    "negative".
  """
  # Wrap the review in a list: the tokenizer encodes a batch of texts
  sequences = tokenizer.texts_to_sequences([review])
  padded = pad_sequences(sequences, maxlen=maxlen)
  scores = model.predict(padded)
  # Show the raw score so the confidence behind the label is visible
  print(scores)
  return "positive" if scores[0][0] > threshold else "negative"

In [None]:
# Example 1: a short, lukewarm review scored by the LSTM model
new_review = "This movie was ok but not that good."
sentiment = predict_sentiment(model_LSTM, new_review)
print(f"The sentiment of the review is: {sentiment}")

[[0.13616392]]
The sentiment of the review is: negative


In [None]:
# Example 2: a short, clearly enthusiastic review scored by the LSTM model
new_review = "This movie was fantastic. I loved it."
sentiment = predict_sentiment(model_LSTM, new_review)
print(f"The sentiment of the review is: {sentiment}")

[[0.7596172]]
The sentiment of the review is: positive


In [None]:
# Example 3: a clearly critical review scored by the LSTM model
new_review = "Movie fell flat with a weak plot and uninspiring acting, not worth the time."
sentiment = predict_sentiment(model_LSTM, new_review)
print(f"The sentiment of the review is: {sentiment}")

[[0.02912826]]
The sentiment of the review is: negative


In [None]:
# Example 4: a longer, multi-sentence enthusiastic review scored by the LSTM model
new_review = "I recently watched the latest episode of 'Stranger Things' and absolutely loved it! The storyline was gripping, the characters were engaging, and the production quality was top-notch. Can't wait for the next season!"
sentiment = predict_sentiment(model_LSTM, new_review)
print(f"The sentiment of the review is: {sentiment}")

[[0.79210716]]
The sentiment of the review is: positive
