As a sanity check of both the model and the data, I first train a Naive Bayes model on the Toys and Games category only. This also gives me some insight into which hyperparameter values work well.

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
import joblib
import time

In [2]:
# Locate the processed Toys_and_Games parquet file relative to the parent of
# the current working directory.
base_path = os.path.dirname(os.getcwd())
data_dir = os.path.join(base_path, "data", "processed", "Toys_and_Games")
DATA_PATH = os.path.join(data_dir, "toys_and_games.parquet")

data = pd.read_parquet(DATA_PATH)

# 'class' is the target column; every other column stays available as a feature.
y = data['class']
X = data.drop(columns='class')

# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=y,
)

In [None]:
# Numeric columns only need their missing values replaced by 0.
numeric_transformer = Pipeline(
    steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))]
)

# TF-IDF features from the review text plus the two numeric columns;
# every other column is dropped.
preprocessor = ColumnTransformer(
    remainder='drop',
    transformers=[
        ('text', TfidfVectorizer(), 'cleaned_text'),
        ('numeric', numeric_transformer, ['overall', 'helpfulness_ratio']),
    ],
)

# imblearn's Pipeline is used so that a resampling step can sit between
# preprocessing and the classifier.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("sampler", RandomUnderSampler(random_state=42)),
        ("model", MultinomialNB()),
    ]
)

param_grid = {
    # How many of the most frequent terms should be kept?
    'preprocessor__text__max_features': [7500, 15000],
    # Unigrams only, or unigrams together with bigrams?
    'preprocessor__text__ngram_range': [(1, 1), (1, 2)],
    # Smoothing parameter of Naive Bayes.
    'model__alpha': [0.1, 0.5, 1.0],
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=4,
    verbose=3,
)

grid_search.fit(X_train, y_train)

In [None]:
# Summary of the hyperparameter search, one item per output line.
print(
    "\nOptimization completed",
    "-" * 40,
    f"Best F1 Score: {grid_search.best_score_:.4f}",
    "Best Hyperparameters:",
    grid_search.best_params_,
    "-" * 40,
    sep="\n",
)

# Take the best model found by the search.
best_model = grid_search.best_estimator_

# Predict on the held-out test set with the best model.
y_pred = best_model.predict(X_test)

# Display names for the two classes, shared by the report and the plot.
class_labels = ['Real (0)', 'Spam (1)']

# Classification report.
print("final test set report")
print(classification_report(y_test, y_pred, target_names=class_labels))

# Confusion matrix, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_ylabel('Real Value')
ax.set_xlabel('Estimation')
ax.set_title('Confusion Matrix')
plt.show()

Best F1 Score: 0.9160
Best Hyperparameters:
{'model__alpha': 1.0, 'preprocessor__text__max_features': 15000, 'preprocessor__text__ngram_range': (1, 2)}
----------------------------------------
FİNAL TEST SETİ PERFORMANS RAPORU
                  precision    recall  f1-score   support

Gerçek Yorum (0)       0.68      0.88      0.77     66877
  Spam Yorum (1)       0.97      0.92      0.95    332551

        accuracy                           0.91    399428
       macro avg       0.83      0.90      0.86    399428
    weighted avg       0.93      0.91      0.92    399428

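The model reached an overall accuracy of 91% on the held-out test set and a cross-validated weighted F1 score of 0.916. The test-set weighted F1 (0.92) is very close to the cross-validated value, which indicates that the model generalizes well.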

The model performs very well on detecting spam reviews, with a precision of 0.97 and recall of 0.92.

The performance on genuine reviews is weaker in comparison, with precision = 0.68 and recall = 0.88.

Overall, the model is biased toward the majority class (spam), which is expected given the strong class imbalance in the dataset. The imbalance likely causes the model to favor predicting spam more confidently while misclassifying some real reviews.

To improve performance on the minority class (real comments), I will apply oversampling with SMOTE.

In [3]:
# Product categories to train on; each one is used both as a directory name
# and, lower-cased, as a file-name stem.
CATEGORIES = [
    "Cell_Phones_and_Accessories",
    "Clothing_Shoes_and_Jewelry",
    "Electronics",
    "Home_and_Kitchen",
    "Sports_and_Outdoors",
    "Toys_and_Games",
]

TEST_SIZE = 0.20    # fraction of rows held out for testing
RANDOM_STATE = 42   # seed shared by sampling, splitting and resampling
CV_FOLDS = 3        # cross-validation folds used by the grid search
N_JOBS = 4          # number of CPU workers for the grid search

In [7]:
# Train, evaluate and persist one Naive Bayes pipeline per product category.
#
# For every category this cell:
#   1. skips the category if its model file already exists on disk,
#   2. loads the processed parquet file (skipping the category if it is missing),
#   3. runs a small grid search over an undersampled TF-IDF + MultinomialNB pipeline,
#   4. appends the test-set metrics to `results_file`, and
#   5. saves the best estimator with joblib.
results_file = os.path.join(os.path.dirname(os.getcwd()), "reports", "model_result.csv")
param_grid = {
  'preprocessor__text__max_features': [15000],
  'preprocessor__text__ngram_range': [(1, 2)],
  'model__alpha': [1.0, 1.5]
}

# Electronics is down-sampled before training once it has more rows than the
# threshold. The sample size is roughly half of the 7574169 rows reported in
# this notebook's saved output.
ELECTRONICS_SAMPLE_THRESHOLD = 1_000_000
ELECTRONICS_SAMPLE_SIZE = 3_787_084

# Resolved once, before the loop, so this cell does not depend on an earlier
# cell having defined `base_path`.
base_path = os.path.dirname(os.getcwd())

# Create the reports directory up front: otherwise the CSV append below could
# fail only after a long grid search has already finished.
os.makedirs(os.path.dirname(results_file), exist_ok=True)

all_results = []

for category in CATEGORIES:
  print(f"Processing {category}")

  model_dir = os.path.join(base_path, "models", category)
  model_filename = f"naive_bayes_{category.lower()}.joblib"
  model_path = os.path.join(model_dir, model_filename)

  # A saved model marks the category as done, which makes the cell resumable.
  if os.path.exists(model_path):
    print(f"'{category}' model has already created.")
    continue

  start_time = time.time()

  data_dir = os.path.join(base_path, "data", "processed", category)
  data_path = os.path.join(data_dir, f"{category.lower()}.parquet")

  if not os.path.exists(data_path):
    print(f"No data found for category {category} at {data_path}")
    print("Skipping")
    continue

  data = pd.read_parquet(data_path)

  if category == "Electronics" and len(data) > ELECTRONICS_SAMPLE_THRESHOLD:
    print(f"original size {len(data)}")
    # Never ask for more rows than exist: DataFrame.sample without replacement
    # raises ValueError when n exceeds the number of rows.
    data = data.sample(n=min(len(data), ELECTRONICS_SAMPLE_SIZE), random_state=RANDOM_STATE)

  X = data.drop("class", axis=1)
  y = data["class"]

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)

  # Numeric columns only need their missing values replaced by 0.
  numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value=0))])

  preprocessor = ColumnTransformer(transformers=[
    ('text', TfidfVectorizer(), 'cleaned_text'),
    ('numeric', numeric_transformer, ['overall', 'helpfulness_ratio'])
  ])

  pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('sampler', RandomUnderSampler(random_state=RANDOM_STATE)),
    ('model', MultinomialNB())
  ])

  grid_search = GridSearchCV(pipeline, param_grid, cv=CV_FOLDS, scoring='f1_weighted', n_jobs=N_JOBS, verbose=1)

  print(f"Grid Search starts for category {category}")
  grid_search.fit(X_train, y_train)

  print(f"Best model found for category {category}")
  best_model = grid_search.best_estimator_
  y_pred = best_model.predict(X_test)

  report = classification_report(y_test, y_pred, output_dict=True)

  # Key order defines the CSV column order. Rows are appended to an existing
  # file without a header, so the order must stay stable.
  # The keys name class '0' "real" and class '1' "spam".
  result_data = {
    'category': category,
    'best_cv_f1_score': grid_search.best_score_,
    'test_accuracy': report['accuracy'],
    'test_f1_real_review': report['0']['f1-score'],
    'test_precision_real_review': report['0']['precision'],
    'test_recall_real_review': report['0']['recall'],
    'test_f1_spam_review': report['1']['f1-score'],
    'test_precision_spam_review': report['1']['precision'],
    'test_recall_spam_review': report['1']['recall'],
    'best_params': str(grid_search.best_params_),
    'training_time_minutes': (time.time() - start_time) / 60
  }
  all_results.append(result_data)

  # Append this category's row; the header is written only when the file is new.
  temp_df = pd.DataFrame([result_data])
  header = not os.path.exists(results_file)
  temp_df.to_csv(results_file, mode='a', header=header, index=False)

  print(f"Results saved")

  os.makedirs(model_dir, exist_ok=True)
  joblib.dump(best_model, model_path)
  print(f"Model for {category} saved to {model_path}")
print("Done")


Processing Cell_Phones_and_Accessories
'Cell_Phones_and_Accessories' model has already created.
Processing Clothing_Shoes_and_Jewelry
'Clothing_Shoes_and_Jewelry' model has already created.
Processing Electronics
original size 7574169
Grid Search starts for category Electronics
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Best model found for category Electronics
Results saved
Model for Electronics saved to c:\work environment\Projects\amazon-spam-review\models\Electronics\naive_bayes_electronics.joblib
Processing Home_and_Kitchen
'Home_and_Kitchen' model has already created.
Processing Sports_and_Outdoors
'Sports_and_Outdoors' model has already created.
Processing Toys_and_Games
'Toys_and_Games' model has already created.
Done


In [None]:
# Reload the per-category metrics accumulated in the results CSV and print them.
final_results_df = pd.read_csv(filepath_or_buffer=results_file)
print(final_results_df)