## Model Training for Fake Review Detection

### 1) Import Libraries and Load Data

Import the libraries needed for data manipulation, machine learning, and visualization, then load the fake reviews dataset from `data/fake_reviews_data.csv`.

In [2]:
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report,
    mean_squared_error,
)

warnings.filterwarnings('ignore')

# Import XGBoost and LightGBM
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

In [6]:
# Read the labelled review dataset (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='data/fake_reviews_data.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...


### 2) Data Preprocessing and Feature Engineering

We will engineer more sophisticated features from the `text_` column and other metadata to better capture patterns indicative of fake reviews.

In [7]:
# Basic Text Features
# Tokenise each review once on whitespace. Every value goes through str() first so that
# non-string entries (e.g. NaN from an empty CSV field) are treated the same way by all
# five features instead of raising TypeError inside a bare len().
review_words = df["text_"].apply(lambda x: str(x).split())

df["text_length"] = df["text_"].apply(lambda x: len(str(x)))  # characters, whitespace included
df["word_count"] = review_words.apply(len)
df["unique_word_count"] = review_words.apply(lambda words: len(set(words)))  # case-sensitive
# Mean token length; 0 when a review has no tokens so np.mean never receives an empty list
df["avg_word_length"] = review_words.apply(lambda words: np.mean([len(w) for w in words]) if words else 0)
# Occurrences of the characters . , ! ? ; :
df["punctuation_count"] = df["text_"].apply(lambda x: sum(char in ".,!?;:" for char in str(x)))

In [8]:
# Aggregate features based on category
# NOTE(review): these statistics are computed over the full dataset, i.e. before the
# train/test split performed later in the notebook, so test rows contribute to them.
category_stats = df.groupby("category").agg(
    avg_rating_cat=("rating", "mean"),
    std_rating_cat=("rating", "std"),
    avg_text_len_cat=("text_length", "mean"),
    std_text_len_cat=("text_length", "std"),
    avg_word_count_cat=("word_count", "mean"),
    std_word_count_cat=("word_count", "std")
).reset_index()

# Drop stat columns left over from a previous run of this cell. Without this, re-running
# the cell makes merge() emit suffixed duplicates (avg_rating_cat_x / avg_rating_cat_y, ...).
# On a first run none of the columns exist and errors="ignore" makes the drop a no-op.
stat_columns = [col for col in category_stats.columns if col != "category"]
df = df.drop(columns=stat_columns, errors="ignore").merge(category_stats, on="category", how="left")

In [9]:
# Impute remaining NaNs (e.g. the std of a category with a single review, or gaps from the
# merge) with the mean of the corresponding numeric column
numeric_column_means = df.mean(numeric_only=True)
df = df.fillna(value=numeric_column_means)

df.head()

Unnamed: 0,category,rating,label,text_,text_length,word_count,unique_word_count,avg_word_length,punctuation_count,avg_rating_cat,std_rating_cat,avg_text_len_cat,std_text_len_cat,avg_word_count_cat,std_word_count_cat
0,Home_and_Kitchen_5,5.0,CG,"Love this! Well made, sturdy, and very comfor...",75,12,12,5.166667,5,4.246795,1.209086,311.259369,341.822811,60.570513,65.733981
1,Home_and_Kitchen_5,5.0,CG,"love it, a great upgrade from the original. I...",80,16,15,4.0,2,4.246795,1.209086,311.259369,341.822811,60.570513,65.733981
2,Home_and_Kitchen_5,5.0,CG,This pillow saved my back. I love the look and...,67,14,14,3.857143,2,4.246795,1.209086,311.259369,341.822811,60.570513,65.733981
3,Home_and_Kitchen_5,1.0,CG,"Missing information on how to use it, but it i...",81,17,17,3.764706,2,4.246795,1.209086,311.259369,341.822811,60.570513,65.733981
4,Home_and_Kitchen_5,5.0,CG,Very nice set. Good quality. We have had the s...,85,18,17,3.777778,2,4.246795,1.209086,311.259369,341.822811,60.570513,65.733981


In [10]:
# Separate the target from the predictors.
# The label, the raw review text and the category name are removed from X, leaving the
# rating plus the engineered columns.
y = df["label"]
X = df.drop(columns=["label", "text_", "category"])

In [11]:
# Every column left in X is treated as a numerical feature
numerical_features = list(X.columns)

print(f"Numerical Features: {list(numerical_features)}")

Numerical Features: ['rating', 'text_length', 'word_count', 'unique_word_count', 'avg_word_length', 'punctuation_count', 'avg_rating_cat', 'std_rating_cat', 'avg_text_len_cat', 'std_text_len_cat', 'avg_word_count_cat', 'std_word_count_cat']


In [12]:
# Preprocessing: standardise the numerical columns (the only transformer registered)
numeric_scaling_step = ("num", StandardScaler(), numerical_features)
preprocessor = ColumnTransformer(transformers=[numeric_scaling_step])

In [13]:
# Hold out 20% of the rows for testing; stratifying on y keeps the label proportions
# the same in both parts, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")

X_train shape: (32345, 12)
X_test shape: (8087, 12)


### 3) Model Training and Evaluation for Best Accuracy

We will focus on robust models and comprehensive hyperparameter tuning. We will use a `VotingClassifier` as our custom model, combining several strong traditional classifiers, and perform `GridSearchCV` for optimal hyperparameter selection.

In [14]:
# Function to evaluate models
def evaluate_model(true, predicted, model_name, predicted_proba=None, pos_label='OR'):
    """Print and return standard binary-classification metrics for one set of predictions.

    Parameters
    ----------
    true : array-like
        Ground-truth labels.
    predicted : array-like
        Predicted labels, aligned with ``true``.
    model_name : str
        Name shown in the printed report header.
    predicted_proba : 2-D array or None, optional
        Class probabilities as returned by ``predict_proba``. ROC AUC is computed only
        when this is given and has more than one column; column 1 is used as the score.
        NOTE(review): this assumes column 1 belongs to the label that ``roc_auc_score``
        treats as positive - confirm against the estimator's ``classes_`` ordering.
    pos_label : optional
        Label treated as the positive class for precision, recall and F1.
        Defaults to ``'OR'``, the value that was previously hard-coded.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1, roc_auc)``; ``roc_auc`` is ``None`` when it
        could not be computed from ``predicted_proba``.
    """
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted, pos_label=pos_label)
    recall = recall_score(true, predicted, pos_label=pos_label)
    f1 = f1_score(true, predicted, pos_label=pos_label)

    # ROC AUC needs scores rather than hard predictions, so it is optional
    roc_auc = None
    if predicted_proba is not None and predicted_proba.shape[1] > 1:
        roc_auc = roc_auc_score(true, predicted_proba[:, 1])

    print(f"--- {model_name} ---")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Score: {f1:.4f}")
    if roc_auc is not None:
        print(f"ROC AUC Score: {roc_auc:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(true, predicted))
    print("Classification Report:")
    print(classification_report(true, predicted))
    print("\n")
    return accuracy, precision, recall, f1, roc_auc

In [15]:
# Define base models for the ensemble
# All five share the same seed so repeated runs of the notebook build the same models.
clf1 = LogisticRegression(random_state=42)
clf2 = RandomForestClassifier(random_state=42)
clf3 = GradientBoostingClassifier(random_state=42)
# `use_label_encoder` is no longer passed: it is deprecated in XGBoost and ignored or
# rejected by recent releases. NOTE(review): confirm against the pinned xgboost version.
clf4 = XGBClassifier(random_state=42, eval_metric='logloss')
clf5 = LGBMClassifier(random_state=42)

In [16]:
# Soft-voting ensemble over the five base models, each paired with its voting weight
ensemble_members = [('lr', clf1), ('rf', clf2), ('gb', clf3), ('xgb', clf4), ('lgbm', clf5)]
member_weights = [0.1, 0.25, 0.25, 0.2, 0.2]
eclf1 = VotingClassifier(estimators=ensemble_members, voting='soft', weights=member_weights)

In [17]:
# Chain preprocessing and the voting ensemble so both are fitted and applied together
ensemble_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', eclf1),
    ]
)

In [18]:
# Fit the preprocessing step and the ensemble on the training split
ensemble_pipeline.fit(X=X_train, y=y_train)

[LightGBM] [Info] Number of positive: 16173, number of negative: 16172
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000483 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 32345, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500015 -> initscore=0.000062
[LightGBM] [Info] Start training from score 0.000062


In [19]:
print("Training Performance of Ensemble Model:")

# Class probabilities and hard predictions for the rows the pipeline was fitted on
predicted_proba_train_ensemble = ensemble_pipeline.predict_proba(X_train)
y_pred_train_ensemble = ensemble_pipeline.predict(X_train)

# Print the report; the returned metrics tuple is the cell's displayed value
evaluate_model(
    true=y_train,
    predicted=y_pred_train_ensemble,
    model_name="VotingClassifier (Training)",
    predicted_proba=predicted_proba_train_ensemble,
)

Training Performance of Ensemble Model:
--- VotingClassifier (Training) ---
Accuracy: 0.9006
Precision: 0.9133
Recall: 0.8853
F1-Score: 0.8991
ROC AUC Score: 0.9730
Confusion Matrix:
[[14813  1359]
 [ 1855 14318]]
Classification Report:
              precision    recall  f1-score   support

          CG       0.89      0.92      0.90     16172
          OR       0.91      0.89      0.90     16173

    accuracy                           0.90     32345
   macro avg       0.90      0.90      0.90     32345
weighted avg       0.90      0.90      0.90     32345





(0.9006337919307467,
 0.9133124960132678,
 0.8853026649353861,
 0.8990894819466249,
 np.float64(0.9729994318939452))

In [20]:
# Show the training-set class probabilities. The array computed in the training
# evaluation cell is reused instead of running the whole ensemble over X_train again.
predicted_proba_train_ensemble

array([[0.6139057 , 0.3860943 ],
       [0.62086955, 0.37913045],
       [0.40998932, 0.59001068],
       ...,
       [0.06572316, 0.93427684],
       [0.83743847, 0.16256154],
       [0.63777858, 0.36222142]])

In [21]:
# Encode the string labels as integers: 'CG' -> 0, 'OR' -> 1
y_train_binary = y_train.map(dict(CG=0, OR=1))

In [22]:
# Probabilities for the positive class, taken from the training-set predict_proba result
# computed in the training evaluation cell rather than scoring X_train a second time.
# Column 1 is paired with the CG -> 0 / OR -> 1 encoding below.
y_train_probs = predicted_proba_train_ensemble[:, 1]

# Convert labels to binary
y_train_binary = y_train.map({'CG': 0, 'OR': 1})

# Mean squared error between the 0/1 labels and the predicted probabilities
# (this quantity is also known as the Brier score), and its square root
mse_train = mean_squared_error(y_train_binary, y_train_probs)
rmse_train = np.sqrt(mse_train)

print("Training MSE:", mse_train)
print("Training RMSE:", rmse_train)

Training MSE: 0.08634859048554269
Training RMSE: 0.29385130676167276


In [23]:
# Score the held-out test split: class probabilities and hard class predictions
predicted_proba_ensemble = ensemble_pipeline.predict_proba(X_test)
y_pred_ensemble = ensemble_pipeline.predict(X_test)

In [24]:
print("Ensemble Model Performance:")
# Print the test-set report; the returned metrics tuple is the cell's displayed value
evaluate_model(
    true=y_test,
    predicted=y_pred_ensemble,
    model_name="VotingClassifier (Ensemble)",
    predicted_proba=predicted_proba_ensemble,
)

Ensemble Model Performance:
--- VotingClassifier (Ensemble) ---
Accuracy: 0.8151
Precision: 0.8262
Recall: 0.7982
F1-Score: 0.8119
ROC AUC Score: 0.9014
Confusion Matrix:
[[3365  679]
 [ 816 3227]]
Classification Report:
              precision    recall  f1-score   support

          CG       0.80      0.83      0.82      4044
          OR       0.83      0.80      0.81      4043

    accuracy                           0.82      8087
   macro avg       0.82      0.82      0.82      8087
weighted avg       0.82      0.82      0.82      8087





(0.815135402497836,
 0.8261648745519713,
 0.7981696759831808,
 0.8119260284312492,
 np.float64(0.9014406333693215))

In [25]:
# Encode the string test labels as integers: 'CG' -> 0, 'OR' -> 1
y_test_binary = y_test.map(dict(CG=0, OR=1))

In [26]:
# Probabilities for the positive class on the test set, taken from the predict_proba
# result computed in the test prediction cell rather than scoring X_test a second time.
# Column 1 is paired with the CG -> 0 / OR -> 1 encoding below.
y_test_probs = predicted_proba_ensemble[:, 1]

# Convert test labels to binary
y_test_binary = y_test.map({'CG': 0, 'OR': 1})

# Mean squared error between the 0/1 labels and the predicted probabilities
# (this quantity is also known as the Brier score), and its square root
mse_test = mean_squared_error(y_test_binary, y_test_probs)
rmse_test = np.sqrt(mse_test)

print("Testing MSE:", mse_test)
print("Testing RMSE:", rmse_test)


Testing MSE: 0.1303648198693475
Testing RMSE: 0.36106068723879026
