In [1]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

import joblib


In [15]:
import pandas as pd

# Read the raw review dump. lines=True tells pandas the file holds one JSON
# record per line rather than a single JSON document.
# (presumably the Amazon Electronics 5-core reviews, judging by the file name — confirm)
df = pd.read_json("../data/raw/Electronics_5 2.json", lines=True)

# Preview the first rows to confirm the expected columns were parsed.
df.head()


Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AO94DHGC771SJ,528881469,amazdnu,"[0, 0]",We got this GPS for my husband who is an (OTR)...,5,Gotta have GPS!,1370131200,"06 2, 2013"
1,AMO214LNFCEI4,528881469,Amazon Customer,"[12, 15]","I'm a professional OTR truck driver, and I bou...",1,Very Disappointed,1290643200,"11 25, 2010"
2,A3N7T0DY83Y4IG,528881469,C. A. Freeman,"[43, 45]","Well, what can I say. I've had this unit in m...",3,1st impression,1283990400,"09 9, 2010"
3,A1H8PY3QHMQQA0,528881469,"Dave M. Shaw ""mack dave""","[9, 10]","Not going to write a long review, even thought...",2,"Great grafics, POOR GPS",1290556800,"11 24, 2010"
4,A24EV6RXELQZ63,528881469,Wayne Smith,"[0, 0]",I've had mine for a year and here's what we go...,1,"Major issues, only excuses for support",1317254400,"09 29, 2011"


In [16]:
# Only the review body and its star rating are used from here on; any row
# missing either value is discarded. The trailing copy() leaves df as an
# independent frame, so the columns added to it later are not written to a slice.
df = df[["reviewText", "overall"]].dropna().copy()

df.head()


Unnamed: 0,reviewText,overall
0,We got this GPS for my husband who is an (OTR)...,5
1,"I'm a professional OTR truck driver, and I bou...",1
2,"Well, what can I say. I've had this unit in m...",3
3,"Not going to write a long review, even thought...",2
4,I've had mine for a year and here's what we go...,1


In [17]:
def rating_to_sentiment(rating):
    """Translate a numeric rating into a three-way sentiment label.

    Args:
        rating: A number that supports ``>=`` / ``<=`` comparison with ints.

    Returns:
        ``"positive"`` when ``rating`` is 4 or higher, ``"negative"`` when it
        is 2 or lower, and ``"neutral"`` for everything in between.
    """
    # Guard clauses: settle the two extremes first, the middle is what is left.
    if rating >= 4:
        return "positive"
    if rating <= 2:
        return "negative"
    return "neutral"


In [18]:
# Derive a label for every review from its star rating, then display how many
# reviews fall into each class.
df["sentiment"] = df["overall"].map(rating_to_sentiment)
df["sentiment"].value_counts()


sentiment
positive    1356067
negative     190864
neutral      142257
Name: count, dtype: int64

In [19]:
import re

def clean_text(text):
    """Normalize a review string for vectorization.

    The text is lower-cased, every character that is not an ASCII lowercase
    letter, a digit or whitespace is replaced by a space, and runs of
    whitespace are collapsed to a single space with the ends trimmed.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string (possibly empty).
    """
    # Punctuation and non-ASCII characters become spaces rather than being
    # removed, so the words on either side of them stay separate tokens.
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", text.lower())
    # Squeeze the whitespace runs left behind and drop leading/trailing blanks.
    return re.sub(r"\s+", " ", alnum_only).strip()


# Normalize every review body, then preview the cleaned text next to its label.
df["clean_text"] = df["reviewText"].map(clean_text)
df[["clean_text", "sentiment"]].head()


Unnamed: 0,clean_text,sentiment
0,we got this gps for my husband who is an otr o...,positive
1,i m a professional otr truck driver and i boug...,negative
2,well what can i say i ve had this unit in my t...,neutral
3,not going to write a long review even thought ...,negative
4,i ve had mine for a year and here s what we go...,negative


In [20]:
from sklearn.model_selection import train_test_split

# Model input is the normalized review text; the target is the sentiment label.
X, y = df["clean_text"], df["sentiment"]

# Hold out 30% of the reviews for evaluation. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)


In [21]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Two-step text classifier: TF-IDF features feeding a logistic regression.
#   tfidf: unigrams and bigrams, vocabulary capped at 8000 terms, scikit-learn's
#          built-in English stop-word list removed.
#   clf:   class_weight="balanced" re-weights classes inversely to their
#          frequency, which matters here because positive reviews dominate.
# NOTE(review): the built-in "english" stop list is documented as having known
# issues and appears to include negations such as "not" — worth confirming it
# does not hurt sentiment predictions.
sentiment_pipeline = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=8000, stop_words="english")),
        ("clf", LogisticRegression(max_iter=1000, class_weight="balanced")),
    ]
)

# Fit the vectorizer and the classifier on the training split only.
sentiment_pipeline.fit(X_train, y_train)


In [22]:
from sklearn.metrics import classification_report

# Predict on the held-out split and print per-class precision, recall and F1.
y_pred = sentiment_pipeline.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

    negative       0.57      0.72      0.63     57259
     neutral       0.23      0.56      0.33     42677
    positive       0.96      0.78      0.86    406821

    accuracy                           0.76    506757
   macro avg       0.59      0.69      0.61    506757
weighted avg       0.86      0.76      0.79    506757



In [28]:
# Hand-written probes around the word "fast", which is good or bad depending on
# context. The trailing comment on each line is the label a human reader would
# presumably expect, not the model's output.
test_sentences = [
    "battery drains fast",          # expected: negative
    "battery charges fast",         # expected: positive
    "phone is fast",                # expected: positive
    "performance is fast",          # expected: positive
    "battery died fast",            # expected: negative
    "battery lasts long",           # expected: positive
]


# Predict a label for each probe; compare the result against the comments above.
sentiment_pipeline.predict(test_sentences)


array(['positive', 'positive', 'positive', 'positive', 'negative',
       'positive'], dtype=object)

In [27]:
# Number of rows in the working frame (rows with missing values were dropped earlier).
len(df)


1689188

In [30]:
# Second batch of battery-related probes: outright complaints, lukewarm
# statements and phrasings with no explicitly negative word.
test_sentencess = [
    "battery drains quickly",
    "battery lasts only two hours",
    "battery backup is poor",
    "battery is decent",
    "battery life is okay",
    "battery life is terrible"
]

# Predict a label for each probe sentence.
sentiment_pipeline.predict(test_sentencess)

array(['positive', 'positive', 'negative', 'neutral', 'neutral',
       'negative'], dtype=object)

In [None]:
import joblib

# Persist the fitted pipeline (vectorizer + classifier) to the backend folder.
# NOTE(review): the pipeline was fitted on clean_text() output and clean_text is
# not part of the saved object, so whoever loads this file should apply the same
# normalization before calling predict. joblib files are pickle-based — only
# load them from trusted sources.
joblib.dump(value=sentiment_pipeline, filename="../backend/sentiment_model.joblib")


['../backend/sentiment_model.joblib']

: 