In [1]:
import pandas as pd
import numpy as np
print("Environment setup complete ✅")


Environment setup complete ✅


In [None]:
df = pd.read_csv("C:\Projects\sentiment_analysis_reviews\data\IMDB Dataset.csv")
df.head()


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
df.info()
df["sentiment"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


sentiment
positive    25000
negative    25000
Name: count, dtype: int64

# STEP 2: clean the raw text

In [14]:
# Import libraries
import re
import string
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")


[nltk_data] Downloading package stopwords to
[nltk_data]     c:\Users\shwey\anaconda3\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
# Create a cleaning function
stop_words = set(stopwords.words("english"))

def clean_text(text):
    # remove HTML tags
    text = re.sub(r"<.*?>", "", text)
    # lowercase
    text = text.lower()
    # remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # remove stopwords
    words = text.split()
    words = [w for w in words if w not in stop_words]
    return " ".join(words)


In [16]:
# Apply cleaning 
df["clean_review"] = df["review"].apply(clean_text)
df[["review", "clean_review"]].head()


Unnamed: 0,review,clean_review
0,One of the other reviewers has mentioned that ...,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter matteis love time money visually stunni...


In [17]:
# Quick checks
df.isnull().sum()
df["clean_review"].str.len().describe()



count    50000.000000
mean       838.412220
std        647.018549
min         17.000000
25%        439.000000
50%        616.000000
75%       1020.000000
max       9273.000000
Name: clean_review, dtype: float64

# STEP4: Convert text --> Numbers (TF-IDF)

In [18]:
# Prepare labels
df["label"] = df["sentiment"].map({"negative": 0, "positive": 1})
df[["sentiment", "label"]].head()


Unnamed: 0,sentiment,label
0,positive,1
1,positive,1
2,positive,1
3,negative,0
4,positive,1


In [19]:
# Train-test split
from sklearn.model_selection import train_test_split

X = df["clean_review"]
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [20]:
# TF-IDF vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


# Step5 : Train a sentiment model

In [21]:
# Train Logistic Regression
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
model.fit(X_train_tfidf, y_train)


In [22]:
# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8885
              precision    recall  f1-score   support

           0       0.90      0.87      0.89      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



# Step6 : Save the model

In [24]:
import joblib
import os

BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))

model_path = os.path.join(BASE_DIR, "sentiment_model.pkl")
vectorizer_path = os.path.join(BASE_DIR, "tfidf_vectorizer.pkl")

joblib.dump(model, model_path)
joblib.dump(tfidf, vectorizer_path)

print("Model saved to:", model_path)
print("Vectorizer saved to:", vectorizer_path)


Model saved to: c:\Projects\sentiment_analysis_reviews\sentiment_model.pkl
Vectorizer saved to: c:\Projects\sentiment_analysis_reviews\tfidf_vectorizer.pkl
