Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer
)

import joblib
import os

# Ensure output dirs exist
os.makedirs("../Models", exist_ok=True)
os.makedirs("../Dashboards", exist_ok=True)


Load data and create target

In [2]:
# Load cleaned reviews
df = pd.read_csv("../Data Processed/Cleaned Reviews.csv")

# Drop neutral reviews
df = df[df["Score"] != 3]

# Binary sentiment: 1 = positive (4–5), 0 = negative (1–2)
df["Sentiment"] = df["Score"].apply(lambda x: 1 if x >= 4 else 0)

X = df["Text"]
y = df["Sentiment"]

df[["Score", "Sentiment"]].head()


Unnamed: 0,Score,Sentiment
0,5,1
1,1,0
2,4,1
3,2,0
4,5,1


TF-IDF with unigrams + bigrams

In [3]:
tfidf = TfidfVectorizer(
    stop_words="english",
    max_features=30000,
    ngram_range=(1, 2)  # unigrams + bigrams
)

X_tfidf = tfidf.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train.shape, X_test.shape


((420651, 30000), (105163, 30000))

Model comparison (LogReg, NB, LinearSVC)

In [4]:
models = {
    "LogisticRegression": LogisticRegression(max_iter=500),
    "MultinomialNB": MultinomialNB(),
    "LinearSVC": LinearSVC()
}

results = []

for name, clf in models.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    
    # Focus on negative class (0) for precision/recall/F1
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test,
        y_pred,
        average="binary",
        pos_label=0
    )
    
    results.append({
        "Model": name,
        "Accuracy": acc,
        "Precision (neg)": precision,
        "Recall (neg)": recall,
        "F1 (neg)": f1
    })

results_df = pd.DataFrame(results)
results_df.sort_values("F1 (neg)", ascending=False)




Unnamed: 0,Model,Accuracy,Precision (neg),Recall (neg),F1 (neg)
2,LinearSVC,0.950058,0.86964,0.799781,0.833249
0,LogisticRegression,0.942708,0.883439,0.728957,0.798798
1,MultinomialNB,0.910339,0.894594,0.482111,0.626559


In [5]:
results_df.to_csv("../Models/Model Comparison Results.csv", index=False)
results_df


Unnamed: 0,Model,Accuracy,Precision (neg),Recall (neg),F1 (neg)
0,LogisticRegression,0.942708,0.883439,0.728957,0.798798
1,MultinomialNB,0.910339,0.894594,0.482111,0.626559
2,LinearSVC,0.950058,0.86964,0.799781,0.833249


Hyperparameter tuning for LinearSVC (optimize F1 on negative class)

In [6]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import make_scorer, f1_score
from sklearn.svm import LinearSVC

# smaller sample for tuning
X_tune, _, y_tune, _ = train_test_split(
    X_train, y_train,
    train_size=50000,   # e.g. 50k samples
    stratify=y_train,
    random_state=42
)

neg_f1_scorer = make_scorer(f1_score, pos_label=0)

param_grid = {
    "C": [0.1, 1, 10],
    "loss": ["squared_hinge"]  # fix this to reduce combos
}

svc = LinearSVC(max_iter=5000, dual=False)

grid = GridSearchCV(
    svc,
    param_grid,
    scoring=neg_f1_scorer,
    cv=2,        # lighter CV
    n_jobs=1,
    verbose=1
)

grid.fit(X_tune, y_tune)
print("Best params:", grid.best_params_)
print("Best CV F1 (neg):", grid.best_score_)


Fitting 2 folds for each of 3 candidates, totalling 6 fits
Best params: {'C': 1, 'loss': 'squared_hinge'}
Best CV F1 (neg): 0.7358271088978684


Evaluate tuned LinearSVC on test set

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

# Train best model on full training data
best_svc = LinearSVC(
    C=grid.best_params_["C"],
    loss=grid.best_params_["loss"],
    max_iter=5000,
    dual=False
)

best_svc.fit(X_train, y_train)

# Evaluate on test set
y_pred_svc = best_svc.predict(X_test)

print("Test Accuracy:", accuracy_score(y_test, y_pred_svc))

print("\nNegative class metrics:")
precision_neg = precision_score(y_test, y_pred_svc, pos_label=0)
recall_neg = recall_score(y_test, y_pred_svc, pos_label=0)
f1_neg = f1_score(y_test, y_pred_svc, pos_label=0)
print("Precision (neg):", precision_neg)
print("Recall (neg):", recall_neg)
print("F1 (neg):", f1_neg)

print("\nClassification report (all classes):")
print(classification_report(y_test, y_pred_svc))

print("\nConfusion matrix:")
print(confusion_matrix(y_test, y_pred_svc))


Test Accuracy: 0.9500584806443331

Negative class metrics:
Precision (neg): 0.8696891363425465
Recall (neg): 0.799719631864448
F1 (neg): 0.8332380770940496

Classification report (all classes):
              precision    recall  f1-score   support

           0       0.87      0.80      0.83     16407
           1       0.96      0.98      0.97     88756

    accuracy                           0.95    105163
   macro avg       0.92      0.89      0.90    105163
weighted avg       0.95      0.95      0.95    105163


Confusion matrix:
[[13121  3286]
 [ 1966 86790]]


Train Logistic Regression for explainability (coefficients)

In [8]:
log_reg = LogisticRegression(max_iter=500)
log_reg.fit(X_train, y_train)

feature_names = tfidf.get_feature_names_out()
coefs = log_reg.coef_[0]  # coef > 0 -> pushes toward class 1 (positive)


In [9]:
# Strong negative indicators: most negative coefficients (toward class 0)
top_neg_idx = np.argsort(coefs)[:50]
top_pos_idx = np.argsort(coefs)[-50:]

top_neg_words = [(feature_names[i], coefs[i]) for i in top_neg_idx]
top_pos_words = [(feature_names[i], coefs[i]) for i in top_pos_idx]

# Turn into DataFrames for export
neg_words_df = pd.DataFrame(top_neg_words, columns=["word", "coef"])
pos_words_df = pd.DataFrame(top_pos_words, columns=["word", "coef"])

neg_words_df.to_csv("../Dashboards/Top negative words model coeffs.csv", index=False)
pos_words_df.to_csv("../Dashboards/Top positive words model coeffs.csv", index=False)

neg_words_df.head(), pos_words_df.head()


(            word       coef
 0          worst -12.243729
 1  disappointing -10.907983
 2       terrible  -9.969264
 3   disappointed  -9.531933
 4          awful  -9.126645,
         word      coef
 0     easier  4.443439
 1     unique  4.512170
 2  complaint  4.568517
 3  beautiful  4.660276
 4   terrific  4.855010)

Frequency-based complaint keywords (CountVectorizer)

In [10]:
neg_df = df[df["Sentiment"] == 0]

cv = CountVectorizer(
    stop_words="english",
    max_features=1000
)

neg_matrix = cv.fit_transform(neg_df["Text"])

sum_words = neg_matrix.sum(axis=0)
word_freq = [
    (word, int(sum_words[0, idx]))
    for word, idx in cv.vocabulary_.items()
]

sorted_words = sorted(word_freq, key=lambda x: x[1], reverse=True)

complaint_words_df = pd.DataFrame(sorted_words, columns=["word", "count"])
complaint_words_df.head(20)


Unnamed: 0,word,count
0,br,104606
1,like,44561
2,product,35007
3,taste,32120
4,just,28111
5,food,22825
6,coffee,22675
7,good,21665
8,flavor,20543
9,amazon,16708


In [11]:
complaint_words_df.to_csv("../Dashboards/Top negative words frequency.csv", index=False)


Save best model + vectorizer for deployment

In [12]:
joblib.dump(best_svc, "../Models/best_sentiment_model.pkl")
joblib.dump(tfidf, "../Models/tfidf_vectorizer.pkl")

print("Saved best model and TF-IDF vectorizer.")


Saved best model and TF-IDF vectorizer.


Quick sanity check with a custom review

In [13]:
sample_reviews = [
    "This product was terrible, completely stale and I want a refund.",
    "Absolutely loved it! Fresh, tasty and arrived on time.",
]

X_sample = tfidf.transform(sample_reviews)
preds = best_svc.predict(X_sample)

for text, label in zip(sample_reviews, preds):
    print("Review:", text)
    print("Predicted sentiment:", "Positive" if label == 1 else "Negative")
    print("-" * 60)


Review: This product was terrible, completely stale and I want a refund.
Predicted sentiment: Negative
------------------------------------------------------------
Review: Absolutely loved it! Fresh, tasty and arrived on time.
Predicted sentiment: Positive
------------------------------------------------------------
