**Import Basic Libraries**

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

**Load Dataset**

In [2]:
df = pd.read_csv(r"D:\My Items\Coding\SmartReviewFilter\data\electronics.csv")

Shorten the dataset

In [3]:
# df = df.sample(n=25000, random_state=42).reset_index(drop=True)
# df.head()

**Dropping Unnecessary Columns**

In [4]:
df = df.drop(columns=['_id','asin','reviewerName','reviewTime','category'])
df

Unnamed: 0,reviewerID,helpful,reviewText,overall,summary,unixReviewTime,class
0,ADKVDDYRW56UB,"[0, 0]",Can't really say much about this other than th...,5.0,Good CD's,1341446400,1.0
1,A2F8IHET2WY0XP,"[0, 0]",This case came and it is really awesome. But i...,5.0,Great case and a good deal.,1390176000,1.0
2,AYGHD53QKX480,"[0, 0]",I used these to change out the OEM fans in myN...,5.0,Quiet and powerful!,1344297600,1.0
3,AY16GWUDYFPPA,"[0, 0]","Using it on my new MacBook 13"" with Thunderbol...",1.0,"You get what you pay for, performance is terrible",1303776000,0.0
4,A3I8H7F0MW5BX0,"[0, 0]",I bought this case for my husband because the ...,1.0,color different than shown.,1361145600,0.0
...,...,...,...,...,...,...,...
99995,A1GWIONPEVAOOC,"[0, 0]",i used this headphone for monitoring tracking ...,5.0,pretty decent,1353196800,1.0
99996,ASXY7UL8PDMU4,"[0, 0]",It's super tough and awesome. It's easy to put...,5.0,Love it,1374451200,1.0
99997,A3LHZQMWBJBSLU,"[1, 1]",This band implies that one size will fit all. ...,2.0,Beware. Not a Universal Fit,1370390400,0.0
99998,A34Z7D2RD0LNQW,"[12, 12]","My 1st GPS unit, so my rating of ""5"" is admitt...",5.0,Garmin Quest Pocket-sized GPS Navigator,1142640000,1.0


**DATA PREPROCESSING**

Class Balance

In [5]:
df['class'].value_counts()

class
1.0    75914
0.0    24086
Name: count, dtype: int64

In [6]:
minority_df = df[df['class'] == 0.0]
majority_df = df[df['class'] == 1.0].sample(n=len(minority_df), random_state=42)

df = pd.concat([minority_df, majority_df]).sample(frac=1, random_state=42).reset_index(drop=True)

Converting unix review time to date time format

In [7]:
df['reviewTime'] = pd.to_datetime(df['unixReviewTime'], unit='s')
df.drop(columns=['unixReviewTime'], inplace=True)

Lowercase the text

In [8]:
df['reviewText'] = df['reviewText'].astype(str).str.lower()
df['summary'] = df['summary'].astype(str).str.lower()

Handle missing values

In [9]:
df.isna().sum()

reviewerID    0
helpful       0
reviewText    0
overall       0
summary       0
class         0
reviewTime    0
dtype: int64

Combining review text and summary 

In [10]:
df['text'] = df['summary'] + ". " + df['reviewText']
df = df.drop(columns=['reviewText','summary'], axis='columns')
df

Unnamed: 0,reviewerID,helpful,overall,class,reviewTime,text
0,A20QWZMNCFYCON,"[2, 2]",4.0,1.0,2011-02-27,"good quality, entry-level headphones. i purcha..."
1,AOBKR23R5VC0A,"[4, 4]",3.0,0.0,2011-10-20,be careful! previously used and sold as new. p...
2,A9TG1D8V9FUBE,"[0, 0]",5.0,1.0,2014-04-21,great speakers at a reasonable price. i never ...
3,A2HMQAWU66BQN,"[0, 2]",1.0,0.0,2012-05-18,never again. i have never had a bad purchase f...
4,A2Q4W76MM2HKMT,"[5, 9]",1.0,0.0,2014-03-06,"poor, incompetent support & does not work with..."
...,...,...,...,...,...,...
48167,A2Y6UWH2GPW8CZ,"[0, 0]",1.0,0.0,2014-01-14,this bulb will not last long at all. stopped w...
48168,A1FTF7K2ZH21PB,"[0, 0]",5.0,1.0,2014-01-11,perefct for my needs. they work as expected. t...
48169,AUX6CQ7FOTSHV,"[0, 0]",5.0,1.0,2014-02-15,worked as intended. the first unit had some co...
48170,A2WB4OWBUH2VQX,"[1, 1]",2.0,0.0,2013-01-28,i've tried several cases and this one is not g...


Handling duplicates

In [11]:
df.drop_duplicates(subset=['text'], inplace=True)
df.reset_index(drop=True, inplace=True)

Add reviewer frequency

In [12]:
review_count = df['reviewerID'].value_counts()
df['reviewFreq'] = df['reviewerID'].map(review_count)
df

Unnamed: 0,reviewerID,helpful,overall,class,reviewTime,text,reviewFreq
0,A20QWZMNCFYCON,"[2, 2]",4.0,1.0,2011-02-27,"good quality, entry-level headphones. i purcha...",1
1,AOBKR23R5VC0A,"[4, 4]",3.0,0.0,2011-10-20,be careful! previously used and sold as new. p...,1
2,A9TG1D8V9FUBE,"[0, 0]",5.0,1.0,2014-04-21,great speakers at a reasonable price. i never ...,1
3,A2HMQAWU66BQN,"[0, 2]",1.0,0.0,2012-05-18,never again. i have never had a bad purchase f...,1
4,A2Q4W76MM2HKMT,"[5, 9]",1.0,0.0,2014-03-06,"poor, incompetent support & does not work with...",1
...,...,...,...,...,...,...,...
48159,A2Y6UWH2GPW8CZ,"[0, 0]",1.0,0.0,2014-01-14,this bulb will not last long at all. stopped w...,1
48160,A1FTF7K2ZH21PB,"[0, 0]",5.0,1.0,2014-01-11,perefct for my needs. they work as expected. t...,2
48161,AUX6CQ7FOTSHV,"[0, 0]",5.0,1.0,2014-02-15,worked as intended. the first unit had some co...,1
48162,A2WB4OWBUH2VQX,"[1, 1]",2.0,0.0,2013-01-28,i've tried several cases and this one is not g...,2


In [13]:
def review_freq_binning(freq):
    if freq == 1:
        return "One time"
    elif freq <= 3:
        return "Low"
    elif freq <= 10:
        return "Medium"
    else:
        return "High"

In [14]:
df['reviewerType'] = df['reviewFreq'].apply(review_freq_binning)

In [15]:
crosstab = pd.crosstab(df['reviewerType'], df['class'], normalize='index')
crosstab


class,0.0,1.0
reviewerType,Unnamed: 1_level_1,Unnamed: 2_level_1
Low,0.456277,0.543723
Medium,0.388889,0.611111
One time,0.501181,0.498819


In [16]:
df['isFreqReviewer'] = df['reviewFreq'].apply(lambda x: 1 if x > 1 else 0).astype('int')

Add Review Length

In [17]:
df['reviewWordCount'] = df['text'].apply(lambda x: len(x.split()))
df['reviewCharCount'] = df['text'].apply(len)

In [18]:
df.groupby('class')['reviewWordCount'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,24085.0,99.967449,115.135808,2.0,37.0,65.0,120.0,4101.0
1.0,24079.0,86.02741,111.770383,2.0,30.0,48.0,96.0,2338.0


In [19]:
df.groupby('class')['reviewCharCount'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,24085.0,544.396222,645.415079,14.0,198.0,349.0,646.0,24210.0
1.0,24079.0,468.984592,621.82291,14.0,163.0,261.0,516.0,13896.0


**TEXT CLEANING**

In [20]:
import spacy
import re
from tqdm import tqdm

Load english tokenizer

In [21]:
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "tagger"])

In [22]:
def clean_texts(text_list):
    cleaned_reviews = []

    # Pre-clean the text (lowercase, remove HTML, URLs, extra spaces)
    preprocessed = []
    for text in text_list:
        text = text.lower()
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r'http\S+|www.\S+|https\S+', '', text)
        text = re.sub(r'\s+', ' ', text)
        preprocessed.append(text)

    # Use spaCy's nlp.pipe for fast tokenization & lemmatization
    for doc in tqdm(nlp.pipe(preprocessed, batch_size=100), total=len(preprocessed), desc="Cleaning Reviews"):
        tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        cleaned_reviews.append(" ".join(tokens))

    return cleaned_reviews


In [23]:
df['text'] = clean_texts(df['text'].tolist())

Cleaning Reviews: 100%|██████████| 48164/48164 [06:29<00:00, 123.61it/s]


In [24]:
df.head()

Unnamed: 0,reviewerID,helpful,overall,class,reviewTime,text,reviewFreq,reviewerType,isFreqReviewer,reviewWordCount,reviewCharCount
0,A20QWZMNCFYCON,"[2, 2]",4.0,1.0,2011-02-27,good quality entry level headphones purchased ...,1,One time,0,164,986
1,AOBKR23R5VC0A,"[4, 4]",3.0,0.0,2011-10-20,careful previously sold new purchased amazon s...,1,One time,0,203,1158
2,A9TG1D8V9FUBE,"[0, 0]",5.0,1.0,2014-04-21,great speakers reasonable price speakers lapto...,1,One time,0,55,289
3,A2HMQAWU66BQN,"[0, 2]",1.0,0.0,2012-05-18,bad purchase amazon sellers ordered product on...,1,One time,0,181,980
4,A2Q4W76MM2HKMT,"[5, 9]",1.0,0.0,2014-03-06,poor incompetent support work beam g air card ...,1,One time,0,188,1176


**TRAIN TEST SPLIT**

In [25]:
X = df[['text', 'isFreqReviewer', 'reviewWordCount', 'reviewCharCount']]
y = df['class']

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**TEXT VECTORIZATION**

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english')

In [28]:
X_train_text = vectorizer.fit_transform(X_train['text'])
X_test_text = vectorizer.transform(X_test['text'])

**SCALING FEATURES**

In [29]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [30]:
length_features_train = scaler.fit_transform(X_train[['reviewWordCount', 'reviewCharCount']])
length_features_test = scaler.transform(X_test[['reviewWordCount', 'reviewCharCount']])

**COMBINE TEXT & NUMERIC**

In [31]:
from scipy.sparse import hstack

In [32]:
X_train_final = hstack([X_train_text, X_train[['isFreqReviewer']], length_features_train])
X_test_final = hstack([X_test_text, X_test[['isFreqReviewer']], length_features_test])

**MODEL TRAINING**

In [33]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

In [34]:
models = [LogisticRegression(), MultinomialNB(), RandomForestClassifier(n_estimators=100, random_state=42), LinearSVC()]
model_names = ['Logistic Regression', 'Multinomial Naive Bayes', 'Random Forest', 'Linear SVC']

Training and comparing different models

In [35]:
for model, name in zip(models, model_names):
    start_time = time.time()
    model.fit(X_train_final, y_train)
    y_pred = model.predict(X_test_final)

    print(f"----- {name} -----")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
    print(f"Time taken: {time.time() - start_time:.2f} seconds\n")

----- Logistic Regression -----
Accuracy: 0.8667
Precision: 0.8738
Recall: 0.8601
F1 Score: 0.8669
Time taken: 0.55 seconds

----- Multinomial Naive Bayes -----
Accuracy: 0.8487
Precision: 0.8464
Recall: 0.8556
F1 Score: 0.8510
Time taken: 0.10 seconds

----- Random Forest -----
Accuracy: 0.8440
Precision: 0.8561
Recall: 0.8305
F1 Score: 0.8431
Time taken: 117.16 seconds

----- Linear SVC -----
Accuracy: 0.8587
Precision: 0.8643
Recall: 0.8542
F1 Score: 0.8592
Time taken: 0.89 seconds



Training a voting classifier

In [36]:
from sklearn.ensemble import VotingClassifier

In [37]:
log_clf = LogisticRegression(max_iter=1000)
nb_clf = MultinomialNB()
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

In [43]:
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('nb', nb_clf),
        ('rf', rf_clf)
    ],
    voting='soft'
)

In [44]:
voting_clf.fit(X_train_final, y_train)
y_pred = voting_clf.predict(X_test_final)

In [45]:
print("Voting Classifier Metrics:")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1 Score :", f1_score(y_test, y_pred))

Voting Classifier Metrics:
Accuracy : 0.8678500986193294
Precision: 0.8733097566049511
Recall   : 0.8634306869600987
F1 Score : 0.8683421243148206


**SAVING THE MODEL**

In [46]:
import joblib

In [47]:
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(voting_clf, 'model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']