# fake reviews classifier

- logistic regression
- support vector classifier
- naive bayes
- random forest classifier

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [36]:
import warnings
warnings.filterwarnings("ignore")

In [37]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

In [38]:
# 1. import dataset
df = pd.read_csv("../data/fake_reviews.csv")
df

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...
...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,I had read some reviews saying that this bra r...
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,I wasn't sure exactly what it would be. It is ...
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,"You can wear the hood by itself, wear it with ..."
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,I liked nothing about this dress. The only rea...


- Clean the review text (lowercasing; removing punctuation, digits and stop words).
- Vectorize review text into numerical vectors (TfidfVectorizer).

In [39]:
# 2. Preprocessing

# a) encode label
df['encoded_label'] = df['label'].map({'OR': 0, 'CG': 1})


# b) Clean the review text
from utils.clean_text import clean_text

df['clean_text'] = df['text_'].apply(clean_text)


# c) Vectorize review text into numerical vectors
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_text'])
y = df["encoded_label"]

In [40]:
# 3. save cleaned dataset
df.to_csv("../data/cleaned_reviews.csv", index=False)
df = pd.read_csv("../data/cleaned_reviews.csv")
df

Unnamed: 0,category,rating,label,text_,encoded_label,clean_text
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor...",1,love well made sturdy comfortable love itvery ...
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I...",1,love great upgrade original ive mine couple years
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...,1,pillow saved back love look feel pillow
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i...",1,missing information use great product price
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...,1,nice set good quality set two months
...,...,...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4.0,OR,I had read some reviews saying that this bra r...,0,read reviews saying bra ran small ordered two ...
40428,Clothing_Shoes_and_Jewelry_5,5.0,CG,I wasn't sure exactly what it would be. It is ...,1,wasnt sure exactly would little large small si...
40429,Clothing_Shoes_and_Jewelry_5,2.0,OR,"You can wear the hood by itself, wear it with ...",0,wear hood wear hood wear jacket without hood s...
40430,Clothing_Shoes_and_Jewelry_5,1.0,CG,I liked nothing about this dress. The only rea...,1,liked nothing dress reason gave stars ordered ...


In [41]:
# 4. train test spilt
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [42]:
# 5. train model -> predict -> performance metrics
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report

models={
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": LinearSVC(),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier()
}

for i in range(len(list(models))):

    # a) train model
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    # b) predict
    y_pred = model.predict(X_test)

    # c) performance metrics 
    print(list(models.keys())[i])

    print("-> performance metrics for test data")
    # print("- Accuracy: {:.4f}".format(accuracy_score(y_test, y_pred)))
    # print("- F1 score: {:.4f}".format(f1_score(y_test, y_pred)))
    # print("- Precision: {:.4f}".format(precision_score(y_test, y_pred)))
    # print("- Recall: {:.4f}".format(recall_score(y_test, y_pred)))
    print("- classification report: \n", classification_report(y_test, y_pred))
    print('----------------------------------')

    print()

Logistic Regression
-> performance metrics for test data
- classification report: 
               precision    recall  f1-score   support

           0       0.88      0.89      0.88      4044
           1       0.89      0.88      0.88      4043

    accuracy                           0.88      8087
   macro avg       0.88      0.88      0.88      8087
weighted avg       0.88      0.88      0.88      8087

----------------------------------

SVM
-> performance metrics for test data
- classification report: 
               precision    recall  f1-score   support

           0       0.89      0.88      0.88      4044
           1       0.88      0.89      0.88      4043

    accuracy                           0.88      8087
   macro avg       0.88      0.88      0.88      8087
weighted avg       0.88      0.88      0.88      8087

----------------------------------

Naive Bayes
-> performance metrics for test data
- classification report: 
               precision    recall  f1-score   

In [43]:
#  6. cross validation score
from sklearn.model_selection import cross_val_score

for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
    print(f"{name} F1 Score (5-fold CV): {scores.mean():.4f}")


Logistic Regression F1 Score (5-fold CV): 0.8726
SVM F1 Score (5-fold CV): 0.8760
Naive Bayes F1 Score (5-fold CV): 0.8583
Random Forest F1 Score (5-fold CV): 0.8471


7. hyperparameter tunning and cross-validation

In [44]:
# parameters
lr_params = {
    'C': [0.01, 0.1, 1, 10, 100],           # Inverse of regularization
    'penalty': ['l2'],                      # 'l1' is slower and not always supported
    'solver': ['liblinear', 'lbfgs'],       # 'liblinear' supports small datasets and l1/l2
    'max_iter': [100, 500, 1000, 2000]
}
svm_params = {
    'C': [0.01, 0.1, 1, 10, 100],
    # 'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'loss': ['hinge', 'squared_hinge'],
    'max_iter': [1000, 2000]
}
nb_params = {
    'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0]
}
rf_params = {
    'n_estimators': [100, 200, 500, 1000],  # Number of trees
    'max_depth': [5, 10, 15, 20, None],     # Tree depth
    'min_samples_split': [2, 5, 10, 20],    # Minimum samples to split
    'min_samples_leaf': [1, 2, 4, 8],       # Minimum samples at leaf
    'bootstrap': [True, False]              # Bootstrap sampling
}

In [45]:
# models
randomcv_models = [
        ("LR", LogisticRegression(), lr_params),
        ("SVM", LinearSVC(), svm_params),
        # ("NB", MultinomialNB(), nb_params),
        # ("RF", RandomForestClassifier(), rf_params),
]

In [46]:
# random search cv
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
model_score = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        n_iter=100,
        cv=3,
        verbose=2,
        n_jobs=-1
    )
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_
    model_score[name] = random.best_score_

for model_name in model_param:
    print(model_name)
    print(model_param[model_name])
    print(model_score[model_name])

Fitting 3 folds for each of 40 candidates, totalling 120 fits
Fitting 3 folds for each of 20 candidates, totalling 60 fits
LR
{'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 100, 'C': 10}
0.8739526714308138
SVM
{'max_iter': 1000, 'loss': 'hinge', 'C': 1}
0.8732106739878404
