In [None]:
import pandas as pd
import numpy as np

In [None]:
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer
import random
import re

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,f1_score
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Load the raw review dataset into a DataFrame.
# NOTE(review): the absolute "/content/..." path looks like a Colab upload location —
# confirm it, or make it configurable, before running this anywhere else.
df=pd.read_csv("/content/IMDB Dataset.csv")

In [None]:
# Preview the first rows to see which columns were loaded.
df.head()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Column dtypes, non-null counts and memory footprint.
df.info()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Summary statistics for each column.
df.describe()

In [None]:
# Number of fully duplicated rows. They are only counted here; nothing below drops them.
df.duplicated().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many reviews carry each sentiment label.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='sentiment', data=df, palette='viridis', ax=ax)
ax.set_title('Sentiment Distribution')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Concatenate all review text per sentiment class; these two strings feed the word clouds.
is_positive = df['sentiment'] == 'positive'
is_negative = df['sentiment'] == 'negative'

positive_reviews = " ".join(df.loc[is_positive, 'review'])
negative_reviews = " ".join(df.loc[is_negative, 'review'])

In [None]:
# Word cloud built from the concatenated positive-review text.
positive_wc = WordCloud(width=800, height=400, background_color='white')
positive_wc = positive_wc.generate(positive_reviews)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(positive_wc, interpolation='bilinear')
ax.set_title('Word Cloud for Positive Reviews')
ax.axis('off')
plt.show()

In [None]:
# Word cloud built from the concatenated negative-review text.
negative_wc = WordCloud(width=800, height=400, background_color='black')
negative_wc = negative_wc.generate(negative_reviews)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(negative_wc, interpolation='bilinear')
ax.set_title('Word Cloud for Negative Reviews')
ax.axis('off')
plt.show()

In [None]:
# Encode the sentiment labels as integers: 'positive' -> 1, anything else -> 0.
#
# The membership test also accepts an already-encoded 1, so re-running this cell
# leaves the column unchanged instead of silently turning every label into 0
# (which is what a plain `s == 'positive'` check does on the second run).
df['sentiment'] = df['sentiment'].isin(['positive', 1]).astype(int)
df.head()

# **Text Preprocessing**

In [None]:
# Fetch NLTK's stop-word corpus (the download must run before the corpus is read)
# and keep the English list as a set for fast membership tests during filtering.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

In [None]:
# Strip HTML tags, URLs and special characters from every review.
# First pick one random row and keep its raw text so the effect of cleaning can be inspected.
tag = random.randint(0, len(df)-1)
# Row `tag`, first column — addressed purely by position. The previous
# `df.iloc[tag][0]` form relied on integer-key positional lookup on a
# label-indexed Series, which pandas has deprecated.
before_process = df.iloc[tag, 0]

def process(a: str) -> str:
    """Normalise one raw review string.

    Steps, in order:
      1. delete common punctuation (``, . ! ? : ( ) "``) without inserting a space;
      2. replace HTML tags such as ``<br />`` with a space;
      3. replace anything starting with ``http`` up to the next whitespace with a space;
      4. replace every remaining non-alphanumeric character with a space;
      5. collapse whitespace runs, lower-case and trim.

    Args:
        a: Raw review text.

    Returns:
        The cleaned, lower-cased text with single spaces between tokens.
    """
    # Raw string literals keep the regex escapes (\. \S \s) out of Python's own
    # string-escape processing, which warns on unrecognised escapes.
    a = re.sub(r'[,\.!?:()"]', '', a)
    a = re.sub(r'<.*?>', ' ', a)
    a = re.sub(r'http\S+', ' ', a)
    a = re.sub(r'[^a-zA-Z0-9]', ' ', a)
    a = re.sub(r'\s+', ' ', a)
    return a.lower().strip()

# Clean every review in place, then re-read the sampled row for a before/after comparison.
df['review'] = df['review'].apply(process)
# Positional (row, column) lookup; avoids the deprecated integer-key access
# on a label-indexed Series that `df.iloc[tag][0]` performed.
after_process = df.iloc[tag, 0]

In [None]:
# Tokenizer data needed by word_tokenize.
nltk.download('punkt_tab')


def sw_remove(a):
    """Tokenize `a` and return it with every token found in `stop_words` removed."""
    kept = []
    for token in nltk.tokenize.word_tokenize(a):
        if token not in stop_words:
            kept.append(token)
    return ' '.join(kept)


df['review'] = df['review'].apply(sw_remove)
after_removal = sw_remove(after_process)

In [None]:
# One stemmer instance is enough for the whole corpus; building a new one for
# every review is wasted work. Uses the PorterStemmer class imported at the top
# of the notebook instead of reaching through the `nltk.porter` attribute.
_porter_stemmer = PorterStemmer()


def stemmer(a):
    """Return `a` with every whitespace-separated word replaced by its Porter stem.

    Args:
        a: Text whose words are separated by whitespace.

    Returns:
        The stemmed words joined by single spaces.
    """
    return ' '.join(_porter_stemmer.stem(word) for word in a.split())


df['review'] = df['review'].apply(stemmer)

In [None]:
# Inspect the first five rows after the cleaning, stop-word and stemming steps above.
df.head(5)

In [None]:
# Label vectors for the train/test partitions. Both names previously pointed at
# the full label column, so they were not a split at all. The boundary used here
# matches the 40000-row cut applied to the review text in this notebook.
Y_train = df['sentiment'].iloc[:40000]
Y_test = df['sentiment'].iloc[40000:]
Y_test

In [None]:
# First 40000 cleaned reviews are used for training, the remainder for testing.
review_text = df['review']
norm_train_reviews = review_text[:40000]
norm_train_reviews[0]
norm_test_reviews = review_text[40000:]
# Looked up by index label; only this last expression is displayed by the cell.
norm_test_reviews[45005]

In [None]:
# Bag-of-words counts over unigrams, bigrams and trigrams.
cv = CountVectorizer(
    min_df=0.0,
    max_df=1.0,
    binary=False,
    ngram_range=(1, 3),
)

# Learn the vocabulary on the training reviews only (already limited to 40000 rows,
# so re-slicing with [:40000] is a no-op), then reuse it for the test reviews.
cv_train_reviews = cv.fit_transform(norm_train_reviews)
cv_test_reviews = cv.transform(norm_test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF weights over unigrams, bigrams and trigrams.
tv = TfidfVectorizer(
    min_df=0.0,
    max_df=1.0,
    binary=False,
    ngram_range=(1, 3),
)

# Fit on the training reviews only (already limited to 40000 rows, so re-slicing
# with [:40000] is a no-op), then project the test reviews onto the same vocabulary.
tv_train_reviews = tv.fit_transform(norm_train_reviews)
tv_test_reviews = tv.transform(norm_test_reviews)

print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Training labels are read straight from the encoded column, one per row of the
# TF-IDF training matrix, so this cell does not rely on `train_sentiment`,
# which is only created in a later cell (NameError on Restart & Run All).
lda_train_labels = df['sentiment'].iloc[:tv_train_reviews.shape[0]].to_numpy()

# Linear Discriminant Analysis yields at most (n_classes - 1) components, so a
# request for 800 is rejected for a two-class target. Cap the request to what
# the labels allow.
requested_components = 800
num_components = min(requested_components, len(np.unique(lda_train_labels)) - 1)
lda = LDA(n_components=num_components)

# NOTE(review): .toarray() turns the sparse 1-3-gram TF-IDF matrices into dense
# arrays; with a large vocabulary this may not fit in memory — confirm the
# feature count (e.g. restrict the vectorizer) before running on the full data.
# Apply LDA to training data
lda_train_features = lda.fit_transform(tv_train_reviews.toarray(), lda_train_labels)
# Apply LDA to testing data
lda_test_features = lda.transform(tv_test_reviews.toarray())

In [None]:
# Binarize the sentiment column into a 2-D label array and report its shape.
lb = LabelBinarizer()
lb.fit(df['sentiment'])
sentimentdata = lb.transform(df['sentiment'])
print(sentimentdata.shape)

In [None]:
# Split the label array at the same 40000-row boundary used for the review text.
train_sentiment, test_sentiment = sentimentdata[:40000], sentimentdata[40000:]
print(train_sentiment)
print(test_sentiment)

# **Using ML Algorithm (Logistic Regression and Naive Bayes Algorithm)**

In [None]:
# Shared hyper-parameters for both logistic-regression models (max_iter raised to 1000).
lr_params = dict(penalty='l2', max_iter=1000, C=1, random_state=42)

# scikit-learn expects a 1-D target; the label array here is a single column.
lr_train_labels = np.ravel(train_sentiment)

# Each feature representation gets its own estimator. `fit` returns the estimator
# itself, so fitting one shared object twice leaves both names pointing at a
# model trained only on the second (TF-IDF) matrix.
lr_bow = LogisticRegression(**lr_params).fit(cv_train_reviews, lr_train_labels)

# `lr` stays bound to the TF-IDF model, as it was after the original two fits.
lr = LogisticRegression(**lr_params)
lr_tfidf = lr.fit(tv_train_reviews, lr_train_labels)

In [None]:
# Show the estimator bound to each name (prints the estimator repr, not a score).
print("Bag of Words :",lr_bow)
print("Tfidf :",lr_tfidf)

In [None]:
# Bag of words prediction: score the BoW test matrix with the estimator bound to `lr_bow`.
lr_bow_predict = lr_bow.predict(cv_test_reviews)
print(lr_bow_predict)
# Tfidf prediction: must be scored on the TF-IDF test matrix. Feeding the raw-count
# matrix to the TF-IDF-trained model reports BoW-scaled inputs under the Tfidf name.
lr_tfidf_predict = lr_tfidf.predict(tv_test_reviews)
print(lr_tfidf_predict)

In [None]:
# Fraction of test reviews classified correctly, per feature representation.
lr_bow_accuracy = accuracy_score(y_true=test_sentiment, y_pred=lr_bow_predict)
lr_tfidf_accuracy = accuracy_score(y_true=test_sentiment, y_pred=lr_tfidf_predict)

print("Accuracy of BOW: ",lr_bow_accuracy)
print("Accuracy of Tfidf: ",lr_tfidf_accuracy)

In [None]:
# Display names are matched to the labels in sorted order. The labels are
# 0 (negative) and 1 (positive), so 'Negative' has to come first; the reversed
# list attributed every metric to the wrong class.
# Classification Report of BOW
lr_bow_report=classification_report(test_sentiment,lr_bow_predict,target_names=['Negative','Positive'])
print(lr_bow_report)
# Classification Report of Tfidf
lr_tfidf_report=classification_report(test_sentiment,lr_tfidf_predict,target_names=['Negative','Positive'])
print(lr_tfidf_report)

In [None]:
# Confusion matrices for the logistic-regression predictions
# (rows are true labels, columns are predicted labels).
bow_conf = confusion_matrix(y_true=test_sentiment, y_pred=lr_bow_predict)
tfidf_conf = confusion_matrix(y_true=test_sentiment, y_pred=lr_tfidf_predict)

print("Confusion Matrix BOW: ",bow_conf)
print("Confusion Matrix Tfidf: ",tfidf_conf)

In [None]:
# scikit-learn expects a 1-D target; the label array here is a single column.
nb_train_labels = np.ravel(train_sentiment)

# Each feature representation gets its own classifier. `fit` returns the
# estimator itself, so fitting one shared object twice leaves both names
# pointing at a model trained only on the second (TF-IDF) matrix.
# For BOW
multinv_bow = MultinomialNB().fit(cv_train_reviews, nb_train_labels)
print(multinv_bow)
# For Tfidf — `multinv` stays bound to the TF-IDF model, as it was after the original two fits.
multinv = MultinomialNB()
multinv_tfidf = multinv.fit(tv_train_reviews, nb_train_labels)
print(multinv_tfidf)

In [None]:
# Bag of words prediction: score the BoW test matrix with the estimator bound to `multinv_bow`.
multinv_bow_predict = multinv_bow.predict(cv_test_reviews)
print(multinv_bow_predict)
# Tfidf prediction: must be scored on the TF-IDF test matrix, not the raw-count one.
multinv_tfidf_predict = multinv_tfidf.predict(tv_test_reviews)
print(multinv_tfidf_predict)

In [None]:
# Fraction of test reviews the Naive Bayes models classified correctly.
multinv_bow_accuracy = accuracy_score(y_true=test_sentiment, y_pred=multinv_bow_predict)
multinv_tfidf_accuracy = accuracy_score(y_true=test_sentiment, y_pred=multinv_tfidf_predict)

print("Accuracy of BOW: ",multinv_bow_accuracy)
print("Accuracy of Tfidf: ",multinv_tfidf_accuracy)

In [None]:
# Display names are matched to the labels in sorted order. The labels are
# 0 (negative) and 1 (positive), so 'Negative' has to come first; the reversed
# list attributed every metric to the wrong class.
# Classification Report of BOW
multinv_bow_report=classification_report(test_sentiment,multinv_bow_predict,target_names=['Negative','Positive'])
print(multinv_bow_report)
# Classification Report of Tfidf
multinv_tfidf_report=classification_report(test_sentiment,multinv_tfidf_predict,target_names=['Negative','Positive'])
print(multinv_tfidf_report)

In [None]:
# Confusion matrices for the Naive Bayes predictions
# (these reuse, and therefore overwrite, the `bow_conf` / `tfidf_conf` names).
bow_conf = confusion_matrix(y_true=test_sentiment, y_pred=multinv_bow_predict)
tfidf_conf = confusion_matrix(y_true=test_sentiment, y_pred=multinv_tfidf_predict)

print("Confusion Matrix BOW: ",bow_conf)
print("Confusion Matrix Tfidf: ",tfidf_conf)

# **Using Deep Learning LSTM Model**

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [None]:
# Split the dataset into training and testing sets
# 80/20 random split of the preprocessed review text and integer labels;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(df["review"], df['sentiment'], test_size=0.2, random_state=42)

In [None]:
# Text Vectorizer
# max_words: passed as the tokenizer's num_words and as the Embedding layer's input_dim.
# max_len: passed as maxlen when padding sequences and as the Embedding layer's input_length.
max_words=10000
max_len=150

In [None]:
# Build the word index from the training reviews only, so the test reviews
# contribute nothing to the vocabulary.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

In [None]:
# Convert each review into a list of integer word ids using the fitted tokenizer.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [None]:
# Bring every sequence to exactly max_len ids so the batches have a fixed width.
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len)

In [None]:
# Build the LSTM model: embedding -> two stacked LSTM layers with dropout -> sigmoid output.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    # Single sigmoid unit for the binary sentiment target.
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

In [None]:
# Train for 10 epochs in batches of 64, holding out 20% of the training data for validation.
model.fit(
    x=X_train_pad,
    y=y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.2,
)

In [None]:
# Predict probabilities on the padded test set, then threshold at 0.5 to get 0/1 labels.
y_pred_prob = model.predict(X_test_pad)
above_threshold = y_pred_prob > 0.5
y_pred = above_threshold.astype(int)

In [None]:
# Per-class precision / recall / F1 for the LSTM predictions.
print("LSTM Classifier Results:")
lstm_report = classification_report(y_true=y_test, y_pred=y_pred)
print(lstm_report)

In [None]:
# Headline metrics for the LSTM on the held-out test split.
f1_lstm = f1_score(y_true=y_test, y_pred=y_pred)
accuracy_lstm = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"LSTM F1 Score: {f1_lstm}")
print(f"LSTM Accuracy: {accuracy_lstm}")