In [1]:
# Core libraries
import pandas as pd
import numpy as np
import re
import string

# Text processing
import nltk
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# ML utilities
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc

# Download required NLTK resources (runs once)
nltk.download('punkt')
nltk.download('stopwords')

print("âœ… Libraries imported successfully")

âœ… Libraries imported successfully


[nltk_data] Downloading package punkt to /Users/aaronrao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aaronrao/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Reading the csv file**

In [3]:
# Creating a DataFrame from the CSV file (portable & deployable)
df = pd.read_csv("/Users/aaronrao/Desktop/projects/sentiment-analyzer/sentiment_analysis.csv")

# Display first 5 rows
df.head()

  df = pd.read_csv("/Users/aaronrao/Desktop/projects/sentiment-analyzer/sentiment_analysis.csv")


Unnamed: 0,id,name,asins,brand,categories,keys,manufacturer,reviews.date,reviews.dateAdded,reviews.dateSeen,...,reviews.doRecommend,reviews.id,reviews.numHelpful,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username
0,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,This product so far has not disappointed. My c...,Kindle,,,Adapter
1,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,great for beginner or experienced person. Boug...,very fast,,,truman
2,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,Inexpensive tablet for him to use and learn on...,Beginner tablet for our 9 year old son.,,,DaveZ
3,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-13T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,4.0,http://reviews.bestbuy.com/3545/5620406/review...,I've had my Fire HD 8 two weeks now and I love...,Good!!!,,,Shacks
4,AVqkIhwDv8e3D1O-lebb,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",B01AHB9CN2,Amazon,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","841667104676,amazon/53004484,amazon/b01ahb9cn2...",Amazon,2017-01-12T00:00:00.000Z,2017-07-03T23:33:15Z,"2017-06-07T09:04:00.000Z,2017-04-30T00:45:00.000Z",...,True,,0.0,5.0,http://reviews.bestbuy.com/3545/5620406/review...,I bought this for my grand daughter when she c...,Fantastic Tablet for kids,,,explore42


In [4]:
# Dataset structure and basic information
df.info()

print("\nâœ… Dataset info displayed successfully")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34660 entries, 0 to 34659
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    34660 non-null  object 
 1   name                  27900 non-null  object 
 2   asins                 34658 non-null  object 
 3   brand                 34660 non-null  object 
 4   categories            34660 non-null  object 
 5   keys                  34660 non-null  object 
 6   manufacturer          34660 non-null  object 
 7   reviews.date          34621 non-null  object 
 8   reviews.dateAdded     24039 non-null  object 
 9   reviews.dateSeen      34660 non-null  object 
 10  reviews.didPurchase   1 non-null      object 
 11  reviews.doRecommend   34066 non-null  object 
 12  reviews.id            1 non-null      float64
 13  reviews.numHelpful    34131 non-null  float64
 14  reviews.rating        34627 non-null  float64
 15  reviews.sourceURLs 

In [5]:
# Distribution of review ratings
rating_counts = df["reviews.rating"].value_counts().sort_values(ascending=False)

print("ðŸ“Š Review Rating Distribution:\n")
print(rating_counts)

ðŸ“Š Review Rating Distribution:

reviews.rating
5.0    23775
4.0     8541
3.0     1499
1.0      410
2.0      402
Name: count, dtype: int64


**Converting the emojis to text**

In [6]:
# Converting common emoticons to emotion words
emoji_map = {
    r":\)": " happy ",
    r":-\)": " happy ",
    r";-\)": " happy ",
    r":D": " laugh ",
    r":-\(": " sad ",
    r":\(": " sad ",
    r":'\)": " tear_of_joy ",
}

df["reviews.text"] = df["reviews.text"].astype(str).replace(
    emoji_map, regex=True
)

print("âœ… Emoticons converted to emotion words")

âœ… Emoticons converted to emotion words


In [7]:
# Initialize stemmer and stopwords (NLTK resources already downloaded)
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words("english"))

print("âœ… Stemmer and stopwords initialized")

âœ… Stemmer and stopwords initialized


In [8]:
# Text preprocessing function (robust & deployment-safe)
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    
    # Lowercase
    text = text.lower()
    
    # Remove numbers
    text = re.sub(r"\d+", "", text)
    
    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    
    # Tokenize safely
    words = word_tokenize(text)
    
    # Remove stopwords and apply stemming
    words = [
        stemmer.stem(word)
        for word in words
        if word not in stop_words and len(word) > 2
    ]
    
    return " ".join(words)

print("âœ… Text preprocessing function ready")

âœ… Text preprocessing function ready


In [9]:
# Select required columns and rename them safely
data_set = df[["reviews.text", "reviews.rating"]].copy()
data_set.columns = ["reviews", "score"]

print("âœ… Dataset prepared with text and score columns")
data_set.head()

âœ… Dataset prepared with text and score columns


Unnamed: 0,reviews,score
0,This product so far has not disappointed. My c...,5.0
1,great for beginner or experienced person. Boug...,5.0
2,Inexpensive tablet for him to use and learn on...,5.0
3,I've had my Fire HD 8 two weeks now and I love...,4.0
4,I bought this for my grand daughter when she c...,5.0


In [10]:
# Keep only positive (4,5) and negative (1,2) reviews
data_set = data_set[data_set["score"] != 3]

# Map sentiment
data_set["sentiment"] = np.where(data_set["score"] >= 4, 1, 0)

print("âœ… Sentiment labels created (1 = Positive, 0 = Negative)")
data_set.head()

âœ… Sentiment labels created (1 = Positive, 0 = Negative)


Unnamed: 0,reviews,score,sentiment
0,This product so far has not disappointed. My c...,5.0,1
1,great for beginner or experienced person. Boug...,5.0,1
2,Inexpensive tablet for him to use and learn on...,5.0,1
3,I've had my Fire HD 8 two weeks now and I love...,4.0,1
4,I bought this for my grand daughter when she c...,5.0,1


In [11]:
# Separate majority and minority classes correctly
df_majority = data_set[data_set["sentiment"] == 1]
df_minority = data_set[data_set["sentiment"] == 0]

print("âœ… Class split done")
print("Positive samples:", len(df_majority))
print("Negative samples:", len(df_minority))

âœ… Class split done
Positive samples: 32316
Negative samples: 845


In [12]:
print("Positive samples:", len(df_majority))
print("Negative samples:", len(df_minority))

Positive samples: 32316
Negative samples: 845


In [13]:
# Balance the dataset (correct labels)

# Separate classes
df_majority = data_set[data_set["sentiment"] == 1]
df_minority = data_set[data_set["sentiment"] == 0]

# Downsample majority class
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42
)

# Combine balanced dataset
balanced_data = pd.concat([df_majority_downsampled, df_minority])

# Shuffle the dataset
balanced_data = balanced_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Preprocess all reviews
balanced_data["processed_reviews"] = balanced_data["reviews"].apply(preprocess_text)

print("âœ… Dataset balanced and preprocessed")
print(balanced_data["sentiment"].value_counts())
balanced_data.head()

âœ… Dataset balanced and preprocessed
sentiment
0    845
1    845
Name: count, dtype: int64


Unnamed: 0,reviews,score,sentiment,processed_reviews
0,I bought this for my daughter during the holid...,1.0,0,bought daughter holiday four month use would l...
1,"At first, the fire box was great. After two mo...",1.0,0,first fire box great two month remot stop work...
2,I have to say this was not that great a purcha...,2.0,0,say great purchas like take advantag greatest ...
3,I bought this tablet because of the price and ...,1.0,0,bought tablet price spec seem fine slow freez ...
4,I tried to set it up for over an hour and it w...,1.0,0,tri set hour wouldnt connect wifi orang ring d...


In [14]:
# Split into train and test sets (with stratification)
train, test = train_test_split(
    balanced_data,
    test_size=0.3,
    random_state=42,
    stratify=balanced_data["sentiment"]
)

print("âœ… Train-test split completed")
print("Train class distribution:\n", train["sentiment"].value_counts())
print("Test class distribution:\n", test["sentiment"].value_counts())

âœ… Train-test split completed
Train class distribution:
 sentiment
1    592
0    591
Name: count, dtype: int64
Test class distribution:
 sentiment
0    254
1    253
Name: count, dtype: int64


In [15]:
# Save train and test datasets to CSV (with headers)
train[["processed_reviews", "sentiment"]].to_csv(
    "train.csv", index=False
)

test[["processed_reviews", "sentiment"]].to_csv(
    "test.csv", index=False
)

print("âœ… Train and test CSV files saved")

âœ… Train and test CSV files saved


In [16]:
# Use training data directly from DataFrame
X_train = train["processed_reviews"]
y_train = train["sentiment"]

X_test = test["processed_reviews"]
y_test = test["sentiment"]

print("âœ… Training and test data prepared")
print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)

âœ… Training and test data prepared
X_train shape: (1183,)
y_train shape: (1183,)


In [17]:
# OPTIONAL: For visualization only (not model training)

positive_text = " ".join(
    train[train["sentiment"] == 1]["processed_reviews"]
)

negative_text = " ".join(
    train[train["sentiment"] == 0]["processed_reviews"]
)

print("âœ… Positive and negative text corpora created")

âœ… Positive and negative text corpora created


In [18]:
# OPTIONAL: Word frequency analysis (not used in model training)
from collections import Counter

positive_words = Counter(
    " ".join(train[train["sentiment"] == 1]["processed_reviews"]).split()
)

negative_words = Counter(
    " ".join(train[train["sentiment"] == 0]["processed_reviews"]).split()
)

print("Most common positive words:", positive_words.most_common(10))
print("Most common negative words:", negative_words.most_common(10))

Most common positive words: [('great', 215), ('use', 196), ('love', 176), ('tablet', 148), ('kindl', 115), ('amazon', 113), ('easi', 110), ('read', 98), ('good', 95), ('one', 87)]
Most common negative words: [('amazon', 240), ('tablet', 225), ('use', 178), ('kindl', 174), ('work', 173), ('get', 154), ('app', 148), ('would', 145), ('one', 141), ('fire', 133)]


In [19]:
# Optional: check class distribution in training data
print("Training class distribution:")
print(y_train.value_counts(normalize=True))

Training class distribution:
sentiment
1    0.500423
0    0.499577
Name: proportion, dtype: float64


In [20]:
# Correct way to inspect class balance
print("Training class distribution:")
print(y_train.value_counts(normalize=True))

Training class distribution:
sentiment
1    0.500423
0    0.499577
Name: proportion, dtype: float64


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Vectorization
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2)
)

X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train Naive Bayes model
model = MultinomialNB()
model.fit(X_train_vec, y_train)

print("âœ… Model trained successfully")

âœ… Model trained successfully


In [22]:
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Predict on test data
y_pred = model.predict(X_test_vec)
y_proba = model.predict_proba(X_test_vec)[:, 1]

print("âœ… Model Evaluation Results\n")

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=["Negative", "Positive"]))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

auc_score = roc_auc_score(y_test, y_proba)
print(f"AUC Score: {auc_score:.3f}")

âœ… Model Evaluation Results

Classification Report:
              precision    recall  f1-score   support

    Negative       0.84      0.85      0.85       254
    Positive       0.85      0.84      0.84       253

    accuracy                           0.85       507
   macro avg       0.85      0.85      0.85       507
weighted avg       0.85      0.85      0.85       507

Confusion Matrix:
[[217  37]
 [ 41 212]]
AUC Score: 0.925


In [23]:
# Already prepared earlier â€” this is correct
X_test = test["processed_reviews"]
y_test = test["sentiment"]

In [24]:
# Evaluate model using sklearn (correct & deployable)

y_pred = model.predict(X_test_vec)
y_proba = model.predict_proba(X_test_vec)[:, 1]

print("Classification Report:\n")
print(classification_report(y_test, y_pred, target_names=["Negative", "Positive"]))

print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

auc_score = roc_auc_score(y_test, y_proba)
print(f"\nAUC Score: {auc_score:.3f}")

Classification Report:

              precision    recall  f1-score   support

    Negative       0.84      0.85      0.85       254
    Positive       0.85      0.84      0.84       253

    accuracy                           0.85       507
   macro avg       0.85      0.85      0.85       507
weighted avg       0.85      0.85      0.85       507


Confusion Matrix:

[[217  37]
 [ 41 212]]

AUC Score: 0.925


In [25]:
# Correct interactive prediction using sklearn model

def predict_sentiment_sklearn(review_text):
    processed = preprocess_text(review_text)
    vectorized = vectorizer.transform([processed])
    prediction = model.predict(vectorized)[0]
    probability = model.predict_proba(vectorized)[0][1]

    return prediction, probability


# Test prediction
sample_review = "This product is amazing and worth the price"
pred, prob = predict_sentiment_sklearn(sample_review)

print("Review:", sample_review)
print("Predicted Sentiment:", "POSITIVE" if pred == 1 else "NEGATIVE")
print(f"Confidence: {prob:.2f}")

Review: This product is amazing and worth the price
Predicted Sentiment: POSITIVE
Confidence: 0.69


In [26]:
import pickle

# Save model
with open("sentiment_model.pkl", "wb") as f:
    pickle.dump(model, f)

# Save vectorizer
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("âœ… Model and vectorizer saved successfully")

âœ… Model and vectorizer saved successfully
