**Import Basic Libraries**

In [4]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

**Load Dataset**

In [5]:
df = pd.read_csv(r"D:\My Items\Coding\SmartReviewFilter\data\electronics.csv")

Shorten the dataset

In [6]:
df = df.sample(n=25000, random_state=42).reset_index(drop=True)
df.head()

Unnamed: 0,_id,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,category,class
0,{'$oid': '5a13246c741a2384e8a12ef0'},AFR4XH8GTRPIM,B000MTXFXS,Karen M. Masset,"[29, 29]",So let me tell you what this thing is/does. It...,4.0,"Simple, just what is needed to find a satellit...",1331942400,"03 17, 2012",Electronics,1.0
1,{'$oid': '5a1325ae741a2384e80119fc'},A1FH6RHFF0NPI6,B00DTLVJB6,Nancy Lechner,"[0, 0]",It was a gift for my son. He LOVES it! He said...,5.0,Love it!,1381190400,"10 8, 2013",Electronics,1.0
2,{'$oid': '5a1325c3741a2384e8084f3e'},A1HCYVW6U42SI7,B00ID2HGK4,Joshua D.,"[3, 5]",This is what I was worried about. I didn't re...,2.0,Buy for 3D content only,1398729600,"04 29, 2014",Electronics,0.0
3,{'$oid': '5a1325bb741a2384e805135c'},A1KJN3JIU3HJ1Z,B00G6IY5HY,Danita Bell,"[2, 2]",I have been looking for a case with a handle s...,4.0,Nice Case,1390521600,"01 24, 2014",Electronics,1.0
4,{'$oid': '5a132480741a2384e8a721d9'},AO8NH0BV82XIE,B00113V748,J. Luu,"[3, 3]",I got this for my wife so she can listen to mo...,5.0,Good speakers worthy of all the praise,1214524800,"06 27, 2008",Electronics,1.0


**Dropping Unnecessary Columns**

In [7]:
df = df.drop(columns=['_id','asin','reviewerName','reviewTime','category'])
df

Unnamed: 0,reviewerID,helpful,reviewText,overall,summary,unixReviewTime,class
0,AFR4XH8GTRPIM,"[29, 29]",So let me tell you what this thing is/does. It...,4.0,"Simple, just what is needed to find a satellit...",1331942400,1.0
1,A1FH6RHFF0NPI6,"[0, 0]",It was a gift for my son. He LOVES it! He said...,5.0,Love it!,1381190400,1.0
2,A1HCYVW6U42SI7,"[3, 5]",This is what I was worried about. I didn't re...,2.0,Buy for 3D content only,1398729600,0.0
3,A1KJN3JIU3HJ1Z,"[2, 2]",I have been looking for a case with a handle s...,4.0,Nice Case,1390521600,1.0
4,AO8NH0BV82XIE,"[3, 3]",I got this for my wife so she can listen to mo...,5.0,Good speakers worthy of all the praise,1214524800,1.0
...,...,...,...,...,...,...,...
24995,AFZYJ31GTZAGP,"[13, 13]","I am a 67 year old woman, 5ft 6, 150 lbs. I've...",4.0,Kata 3n1 20 nit picks,1272326400,1.0
24996,A3KZPW3F12UI11,"[0, 0]",GOOD THINGS:* I activated this modem with Comc...,5.0,Works with COMCAST but NO 2-YEAR WARRANTY,1318550400,1.0
24997,A3R0RJM7FMZTV1,"[0, 0]",The IR extender works great. It's very reliab...,5.0,The IR extender works great. It's very reliable,1405900800,1.0
24998,A1LW0PJPYY36W2,"[0, 0]",We bought this for our 39&#34; TV and it's per...,5.0,Perfect!,1374796800,1.0


**DATA PREPROCESSING**

Converting unix review time to date time format

In [8]:
df['reviewTime'] = pd.to_datetime(df['unixReviewTime'], unit='s')
df.drop(columns=['unixReviewTime'], inplace=True)

Lowercase the text

In [9]:
df['reviewText'] = df['reviewText'].astype(str).str.lower()
df['summary'] = df['summary'].astype(str).str.lower()

Handle missing values

In [10]:
df.isna().sum()

reviewerID    0
helpful       0
reviewText    0
overall       0
summary       0
class         0
reviewTime    0
dtype: int64

Combining review text and summary 

In [11]:
df['text'] = df['summary'] + ". " + df['reviewText']
df = df.drop(columns=['reviewText','summary'], axis='columns')
df

Unnamed: 0,reviewerID,helpful,overall,class,reviewTime,text
0,AFR4XH8GTRPIM,"[29, 29]",4.0,1.0,2012-03-17,"simple, just what is needed to find a satellit..."
1,A1FH6RHFF0NPI6,"[0, 0]",5.0,1.0,2013-10-08,love it!. it was a gift for my son. he loves i...
2,A1HCYVW6U42SI7,"[3, 5]",2.0,0.0,2014-04-29,buy for 3d content only. this is what i was wo...
3,A1KJN3JIU3HJ1Z,"[2, 2]",4.0,1.0,2014-01-24,nice case. i have been looking for a case with...
4,AO8NH0BV82XIE,"[3, 3]",5.0,1.0,2008-06-27,good speakers worthy of all the praise. i got ...
...,...,...,...,...,...,...
24995,AFZYJ31GTZAGP,"[13, 13]",4.0,1.0,2010-04-27,kata 3n1 20 nit picks. i am a 67 year old woma...
24996,A3KZPW3F12UI11,"[0, 0]",5.0,1.0,2011-10-14,works with comcast but no 2-year warranty. goo...
24997,A3R0RJM7FMZTV1,"[0, 0]",5.0,1.0,2014-07-21,the ir extender works great. it's very reliabl...
24998,A1LW0PJPYY36W2,"[0, 0]",5.0,1.0,2013-07-26,perfect!. we bought this for our 39&#34; tv an...


Handling duplicates

In [12]:
df.drop_duplicates(subset=['text'], inplace=True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,reviewerID,helpful,overall,class,reviewTime,text
0,AFR4XH8GTRPIM,"[29, 29]",4.0,1.0,2012-03-17,"simple, just what is needed to find a satellit..."
1,A1FH6RHFF0NPI6,"[0, 0]",5.0,1.0,2013-10-08,love it!. it was a gift for my son. he loves i...
2,A1HCYVW6U42SI7,"[3, 5]",2.0,0.0,2014-04-29,buy for 3d content only. this is what i was wo...
3,A1KJN3JIU3HJ1Z,"[2, 2]",4.0,1.0,2014-01-24,nice case. i have been looking for a case with...
4,AO8NH0BV82XIE,"[3, 3]",5.0,1.0,2008-06-27,good speakers worthy of all the praise. i got ...
...,...,...,...,...,...,...
24991,AFZYJ31GTZAGP,"[13, 13]",4.0,1.0,2010-04-27,kata 3n1 20 nit picks. i am a 67 year old woma...
24992,A3KZPW3F12UI11,"[0, 0]",5.0,1.0,2011-10-14,works with comcast but no 2-year warranty. goo...
24993,A3R0RJM7FMZTV1,"[0, 0]",5.0,1.0,2014-07-21,the ir extender works great. it's very reliabl...
24994,A1LW0PJPYY36W2,"[0, 0]",5.0,1.0,2013-07-26,perfect!. we bought this for our 39&#34; tv an...


Add reviewer frequency

In [13]:
review_count = df['reviewerID'].value_counts()
df['reviewFreq'] = df['reviewerID'].map(review_count)

Class Balance

In [14]:
df['class'].value_counts()

class
1.0    19024
0.0     5972
Name: count, dtype: int64

In [15]:
minority_df = df[df['class'] == 0.0]
majority_df = df[df['class'] == 1.0].sample(n=len(minority_df), random_state=42)

df = pd.concat([minority_df, majority_df]).sample(frac=1, random_state=42).reset_index(drop=True)

**TEXT CLEANING**

In [16]:
import spacy
import re
from tqdm import tqdm

Load english tokenizer

In [17]:
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "tagger"])

In [18]:
def clean_texts(text_list):
    cleaned_reviews = []

    # Pre-clean the text (lowercase, remove HTML, URLs, extra spaces)
    preprocessed = []
    for text in text_list:
        text = text.lower()
        text = re.sub(r'<.*?>', '', text)
        text = re.sub(r'http\S+|www.\S+|https\S+', '', text)
        text = re.sub(r'\s+', ' ', text)
        preprocessed.append(text)

    # Use spaCy's nlp.pipe for fast tokenization & lemmatization
    for doc in tqdm(nlp.pipe(preprocessed, batch_size=100), total=len(preprocessed), desc="Cleaning Reviews"):
        tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        cleaned_reviews.append(" ".join(tokens))

    return cleaned_reviews


In [19]:
df['text'] = clean_texts(df['text'].tolist())

Cleaning Reviews: 100%|██████████| 11944/11944 [01:38<00:00, 120.69it/s]


In [20]:
df.head()

Unnamed: 0,reviewerID,helpful,overall,class,reviewTime,text,reviewFreq
0,A2XHOLOLKGV1FE,"[2, 4]",2.0,0.0,2012-12-22,eye piercing blue power led bright blue light ...,1
1,A1H1HWYHYHLZPS,"[1, 2]",1.0,0.0,2013-02-19,star generous sucky tinny cheap garbled sound ...,1
2,A30L5WRJM93TEF,"[0, 0]",5.0,1.0,2013-03-19,looking perfect mouse button love mouse fits s...,1
3,A156UC77S3FFTI,"[0, 0]",3.0,0.0,2013-10-23,job bought install car cd player chevy camaro ...,1
4,ANALOM35NWNNF,"[1, 1]",5.0,1.0,2013-01-12,cheap works canon black felt inside difference...,1


**TEXT VECTORIZATION**

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2), stop_words='english')

In [23]:
text_features = vectorizer.fit_transform(df['text'])

**SCALING DATA**

In [30]:
from sklearn.preprocessing import MinMaxScaler

In [31]:
scaler = MinMaxScaler()
review_freq_scaled = scaler.fit_transform(df[['reviewFreq']])

**SEPERATING FEATURE AND TARGET**

In [32]:
from scipy.sparse import hstack

In [33]:
X = hstack([text_features, review_freq_scaled])
y = df['class']

**TRAIN TEST SPLIT**

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**MODEL TRAINING**

In [36]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

In [37]:
models = [LogisticRegression(), MultinomialNB(), RandomForestClassifier(n_estimators=100, random_state=42), LinearSVC(), SVC()]
model_names = ['Logistic Regression', 'Multinomial Naive Bayes', 'Random Forest', 'Linear SVC', 'SVC']

In [38]:
for model, name in zip(models, model_names):
    start_time = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    print(f"----- {name} -----")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
    print(f"Time taken: {time.time() - start_time:.2f} seconds\n")

----- Logistic Regression -----
Accuracy: 0.8614
Precision: 0.8579
Recall: 0.8658
F1 Score: 0.8618
Time taken: 0.15 seconds

----- Multinomial Naive Bayes -----
Accuracy: 0.8518
Precision: 0.8374
Recall: 0.8725
F1 Score: 0.8546
Time taken: 0.01 seconds

----- Random Forest -----
Accuracy: 0.8359
Precision: 0.8367
Recall: 0.8339
F1 Score: 0.8353
Time taken: 16.23 seconds

----- Linear SVC -----
Accuracy: 0.8439
Precision: 0.8450
Recall: 0.8414
F1 Score: 0.8432
Time taken: 0.09 seconds

----- SVC -----
Accuracy: 0.8614
Precision: 0.8621
Recall: 0.8599
F1 Score: 0.8610
Time taken: 46.71 seconds



In [39]:
from sklearn.ensemble import VotingClassifier

# Create individual models
log_clf = LogisticRegression(max_iter=1000)
nb_clf = MultinomialNB()
rf_clf = RandomForestClassifier(n_estimators=100, random_state=42)

# Combine them in a VotingClassifier
voting_clf = VotingClassifier(
    estimators=[
        ('lr', log_clf),
        ('nb', nb_clf),
        ('rf', rf_clf)
    ],
    voting='soft'  # Use 'soft' if all models support predict_proba (better than hard voting)
)

# Fit and evaluate
voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)

print("Voting Classifier Metrics:")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall   :", recall_score(y_test, y_pred))
print("F1 Score :", f1_score(y_test, y_pred))


Voting Classifier Metrics:
Accuracy : 0.8635412306404353
Precision: 0.8572607260726073
Recall   : 0.8716442953020134
F1 Score : 0.8643926788685524


In [41]:
from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
gb_preds = gb_model.predict(X_test)

print("Gradient Boosting Classifier Metrics:")
print("Accuracy:", accuracy_score(y_test, gb_preds))
print("Precision:", precision_score(y_test, gb_preds))
print("Recall:", recall_score(y_test, gb_preds))
print("F1 Score:", f1_score(y_test, gb_preds))


Gradient Boosting Classifier Metrics:
Accuracy: 0.7944746755964839
Precision: 0.8236380424746076
Recall: 0.7483221476510067
F1 Score: 0.7841758241758242
