In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import re
import seaborn as sns
from sklearn.model_selection import train_test_split
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import multiprocessing as mp
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from textblob import Word
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Majid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Majid\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Majid\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
data =  pd.read_csv("IMDB Dataset.csv")

In [3]:
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
data.shape

(50000, 2)

In [5]:
data. describe()

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,positive
freq,5,25000


In [6]:
data['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [7]:
#spltting the dataset
train_reviews, test_reviews, train_sentiments, test_sentiments = train_test_split(data['review'], data['sentiment'], test_size=0.2, random_state=42 , stratify=data['sentiment'])

In [8]:
(train_reviews.shape, train_sentiments.shape), (test_reviews.shape, test_sentiments.shape)


(((40000,), (40000,)), ((10000,), (10000,)))

 In real-world applications, the model is trained on available data and then used to make predictions on new, unseen data. Setting aside a test set at the beginning helps replicate this scenario and gives a more accurate picture of how the model might perform post-deployment.

In [9]:
def clean_text(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    
    # Remove non-letters
    text = re.sub('[^a-zA-Z]', ' ', text)
    
    # Convert to lowercase and tokenize
    text = text.lower()
    words = nltk.word_tokenize(text)
    
    # Remove stopwords
    words = [w for w in words if w not in stopwords.words('english')]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(w) for w in words]
    
    # Join the words back into one string
    text = ' '.join(words)
    return text

# Apply the cleaning function to the reviews in the training set
train_reviews_clean = train_reviews.apply(clean_text)
test_reviews_clean = test_reviews.apply(clean_text)

  text = BeautifulSoup(text, "html.parser").get_text()


In [10]:
train_reviews_clean.head()

47808    caught little gem totally accident back reviva...
20154    believe let movie accomplish favor friend ask ...
43069    spoiler alert get nerve people remake use term...
19413    one thing learnt watching george romero creeps...
13673    remember theater review said horrible well thi...
Name: review, dtype: object

In [11]:
test_reviews_clean.head()

18870    yes mtv really way market daria started clever...
39791    story bride fair amusing engaging one filmmake...
30381    team varied scully mulder two scientist pilot ...
42294    popular movie probably humor fast moving story...
33480    movie made angry thinking new horror movie one...
Name: review, dtype: object

In [12]:
# Count vectorizer for bag of words
cv = CountVectorizer(min_df=1, max_df=1, binary=False, ngram_range=(1,3))
cv_train_reviews = cv.fit_transform(train_reviews_clean)
cv_test_reviews = cv.transform(test_reviews_clean)

print('BOW_cv_train:', cv_train_reviews.shape)
print('BOW_cv_test:', cv_test_reviews.shape)

BOW_cv_train: (40000, 6084975)
BOW_cv_test: (10000, 6084975)


In [13]:
# Tfidf vectorizer
tv = TfidfVectorizer(min_df=1, max_df=1, use_idf=True, ngram_range=(1,3))
tv_train_reviews = tv.fit_transform(train_reviews_clean)
tv_test_reviews = tv.transform(test_reviews_clean)

print('Tfidf_train:', tv_train_reviews.shape)
print('Tfidf_test:', tv_test_reviews.shape)

Tfidf_train: (40000, 6084975)
Tfidf_test: (10000, 6084975)


In [14]:
lb = LabelBinarizer()
train_sentiments = lb.fit_transform(train_sentiments)
test_sentiments = lb.transform(test_sentiments)


In [15]:
train_sentiments = np.array(train_sentiments).astype('float32')
test_sentiments = np.array(test_sentiments).astype('float32')
print(train_sentiments)
print(test_sentiments)

[[1.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]
[[0.]
 [0.]
 [1.]
 ...
 [0.]
 [1.]
 [0.]]


In [16]:
lr = LogisticRegression(penalty='l2', max_iter=500, C=1, random_state=42)

# Fitting the model for Bag of words
lr_bow = lr.fit(cv_train_reviews, train_sentiments.ravel())
print(lr_bow)

# Fitting the model for TF-IDF features
lr_tfidf = lr.fit(tv_train_reviews, train_sentiments.ravel())
print(lr_tfidf)

LogisticRegression(C=1, max_iter=500, random_state=42)
LogisticRegression(C=1, max_iter=500, random_state=42)


In [24]:
lr_bow_predictions = lr_bow.predict(cv_test_reviews)
lr_bow_score = accuracy_score(test_sentiments, lr_bow_predictions)



In [25]:
# Classification report for Logistic Regression with BOW
lr_bow_report = classification_report(test_sentiments, lr_bow_predictions, target_names=['Negative', 'Positive'])
print("LR with BOW Classification Report:\n", lr_bow_report)


LR with BOW Classification Report:
               precision    recall  f1-score   support

    Negative       0.74      0.74      0.74      5000
    Positive       0.74      0.74      0.74      5000

    accuracy                           0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000



In [26]:
# Evaluate Logistic Regression with TF-IDF
lr_tfidf_predictions = lr_tfidf.predict(tv_test_reviews)
lr_tfidf_score = accuracy_score(test_sentiments, lr_tfidf_predictions)

# Classification report for Logistic Regression with TF-IDF
lr_tfidf_report = classification_report(test_sentiments, lr_tfidf_predictions, target_names=['Negative', 'Positive'])
print("LR with TF-IDF Classification Report:\n", lr_tfidf_report)

LR with TF-IDF Classification Report:
               precision    recall  f1-score   support

    Negative       0.73      0.76      0.74      5000
    Positive       0.75      0.72      0.73      5000

    accuracy                           0.74     10000
   macro avg       0.74      0.74      0.74     10000
weighted avg       0.74      0.74      0.74     10000



In [21]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping



In [22]:
max_words = 5000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_reviews_clean)
X_train = tokenizer.texts_to_sequences(train_reviews_clean)
X_test = tokenizer.texts_to_sequences(test_reviews_clean)

# Pad the sequences to have the same length
max_sequence_length = 200
X_train = pad_sequences(X_train, maxlen=max_sequence_length)
X_test = pad_sequences(X_test, maxlen=max_sequence_length)

# Define LSTM model structure
model = Sequential()
embedding_dim = 128
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(LSTM(64))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the LSTM model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the LSTM model
batch_size = 64
epochs = 5
early_stop = EarlyStopping(monitor='val_loss', patience=2)
history = model.fit(X_train, train_sentiments, batch_size=batch_size, epochs=epochs, validation_split=0.2, callbacks=[early_stop])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


In [28]:
print("\nComparing Model Performance:")
print(f"Logistic Regression BOW Accuracy: {lr_bow_score:.4f}")
print(f"Logistic Regression TF-IDF Accuracy: {lr_tfidf_score:.4f}")
print(f"LSTM Model Accuracy: {accuracy:.4f}")


Comparing Model Performance:
Logistic Regression BOW Accuracy: 0.7388
Logistic Regression TF-IDF Accuracy: 0.7384
LSTM Model Accuracy: 0.8705


In [27]:
# Evaluate Logistic Regression with BOW
lr_bow_score = lr_bow.score(cv_test_reviews, test_sentiments.ravel())

# Evaluate Logistic Regression with TF-IDF
lr_tfidf_score = lr_tfidf.score(tv_test_reviews, test_sentiments.ravel())

# Evaluate LSTM model
loss, accuracy = model.evaluate(X_test, test_sentiments)

print("\nComparing Model Performance:")
print(f"Logistic Regression BOW Accuracy: {lr_bow_score:.4f}")
print(f"Logistic Regression TF-IDF Accuracy: {lr_tfidf_score:.4f}")
print(f"LSTM Model Accuracy: {accuracy:.4f}")



Comparing Model Performance:
Logistic Regression BOW Accuracy: 0.7388
Logistic Regression TF-IDF Accuracy: 0.7384
LSTM Model Accuracy: 0.8705


In [None]:
# Make predictions on new data using LSTM
new_reviews = ["This movie is fantastic!", "I didn't like this film at all."]
new_reviews_clean = [clean_text(review) for review in new_reviews]



['movie fantastic', 'like film']