## Import Libraries

In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Load Dataset

In [2]:
# Load the dataset
data = pd.read_csv("IMDB Dataset.csv")

In [3]:
# Display the first few rows of the dataset
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


## Data Preprocessing

In [4]:
# Encode sentiment labels
data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)

  data.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)


In [5]:
# Remove duplicates
print("Duplicates in Dataset: ",data.duplicated().sum())
data.drop_duplicates(inplace = True)

Duplicates in Dataset:  418


In [6]:
# Convert reviews to lowercase
data['review'] = data['review'].str.lower()

In [7]:
# Remove HTML tags from reviews
import re
def remove_html_tags(text):
    pattern = r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern,'',text)
    return text
data['review'] = data['review'].apply(remove_html_tags)

In [8]:
# Remove stopwords from reviews
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
# Intilize Stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arunendra\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [9]:
# Apply Stopwords
data['review'] = data['review'].apply(lambda x : ' '.join([word for word in x.split() if word not in (stop_words)]))

In [10]:
# Remove URLs from reviews
data['review'] = data['review'].str.replace('http\S+|www.\S+', '', case=False)

  data['review'] = data['review'].str.replace('http\S+|www.\S+', '', case=False)


## Split Data

In [11]:
# Split data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

## Tokenize and Pad Sequences

In [12]:
# Tokenize text data
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data["review"]), maxlen=200)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data["review"]), maxlen=200)

In [13]:
# Prepare target variables
Y_train = train_data["sentiment"]
Y_test = test_data["sentiment"]

## Logistic Regression

In [14]:
# Import libraries for Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [15]:
# Prepare data for Logistic Regression
A = data['review']
B = data['sentiment']

In [16]:
# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(A,B, test_size=0.2, random_state=42)


In [17]:
# Transform text data using TF-IDF
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')  # Limit to 5000 features for simplicity
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [18]:
# Train Logistic Regression model
log_reg = LogisticRegression(max_iter=500)  # Increase max_iter if convergence issues occur
log_reg.fit(X_train_tfidf, Y_train)

In [19]:
# Make Predictions
y_pred = log_reg.predict(X_test_tfidf)

In [20]:
# Evaluate Logistic Regression model
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.88


In [21]:
from sklearn.metrics import f1_score
f1 = f1_score(Y_test, y_pred, average='weighted')  # Use 'weighted' if it's multiclass
print("F1 Score:", f1)

F1 Score: 0.8826023457441845


In [79]:
# Import pickle to save models
import pickle

In [80]:
# Save tokenizer
pickle.dump(tokenizer ,open('tokenizer.pkl' , 'wb'))

In [81]:
# Save LSTM model
pickle.dump(model ,open('model.pkl' , 'wb'))