In [1]:
import pandas as pd
import numpy as np
import os
import joblib
import time

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit 

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB

In [8]:
# Path fix: the notebook is run from inside notebooks/deceptency/,
# i.e. os.getcwd() -> .../notebooks/deceptency/
DATA_SUBDIR = ("data", "processed", "diplomacy")
base_path, _ = os.path.split(os.getcwd())  # parent of the working dir: the notebooks/ folder
data_path = os.path.join(base_path, *DATA_SUBDIR)
# Echo the resolved locations so a wrong working directory is spotted immediately.
print(f"Base path: {base_path}")
print(f"Data path: {data_path}")
print(f"Files exist: {os.path.exists(data_path)}")

Base path: c:\Users\aadil\Desktop\YAP470\amazon-fake-review-detector\notebooks
Data path: c:\Users\aadil\Desktop\YAP470\amazon-fake-review-detector\notebooks\data\processed\diplomacy
Files exist: True


In [9]:
# Columns handled as categorical downstream; they are cast to plain object dtype on load.
categorical_features = ["speaker", "receiver", "season"]
col_types = dict.fromkeys(categorical_features, 'object')


def _read_processed(file_name):
    """Read one parquet file from `data_path` and cast the categorical columns to object."""
    frame = pd.read_parquet(os.path.join(data_path, file_name))
    return frame.astype(col_types)


data = _read_processed("diplomacy_processed.parquet")
train_df = _read_processed("train_processed.parquet")
val_df = _read_processed("val_processed.parquet")
test_df = _read_processed("test_processed.parquet")

In [10]:
# Separate the feature columns from the 'target' label for each of the three splits.
X_train, y_train = train_df.drop(columns='target'), train_df['target']
X_val, y_val = val_df.drop(columns='target'), val_df['target']
X_test, y_test = test_df.drop(columns='target'), test_df['target']

In [11]:
# Numeric columns: fill gaps with the most frequent value, then rescale with MinMaxScaler.
numeric_features = ["game_score", "game_score_delta", "year", "message_length"]
numeric_transformer = Pipeline(steps=[
  ("imputer", SimpleImputer(strategy="most_frequent")),
  ("scaler", MinMaxScaler())
])

# Categorical columns: fill gaps with the column's most frequent category, then one-hot encode.
# The imputation mode is selected through `strategy`; passing "most_frequent" as `fill_value`
# together with strategy="constant" would instead insert the literal string "most_frequent"
# as if it were a category.
categorical_features = ["speaker", "receiver", "season"]
categorical_transformer = Pipeline(steps=[
  ("imputer", SimpleImputer(strategy="most_frequent")),
  # Categories not seen during fit are encoded as all-zero rows instead of raising.
  ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Text column: TF-IDF features. The column is referenced by a plain string (not a list)
# so that ColumnTransformer hands the vectorizer a 1-D sequence of documents.
text_feature = "cleaned_text"
text_transformer = TfidfVectorizer()

# Combine the three branches; any column not listed above is discarded.
preprocessor = ColumnTransformer(transformers=[
  ('text', text_transformer, text_feature),
  ('num', numeric_transformer, numeric_features),
  ('cat', categorical_transformer, categorical_features)
], remainder="drop")

In [12]:
# GridSearchCV takes a single X / y pair, so the train and validation sets are stacked
# together and the way GridSearchCV splits them is fixed through PredefinedSplit.
X_train_val = pd.concat([X_train, X_val], axis=0)
y_train_val = pd.concat([y_train, y_val], axis=0)
# test_fold marker per row: -1 for every training row, 0 for every validation row.
split_index = [
    -1 if row < len(X_train) else 0
    for row in range(len(X_train) + len(X_val))
]
ps = PredefinedSplit(test_fold=split_index)

In [13]:
# Address the class imbalance by generating synthetic "lie" samples with SMOTE
# before fitting a Multinomial Naive Bayes classifier.
smote_steps = [
    ('preprocessor', preprocessor),  # the existing preprocessing step
    ('sampler', SMOTE(random_state=42)),
    ('model', MultinomialNB()),
]
pipeline_nb_smote = Pipeline(steps=smote_steps)

In [14]:
# Hyper-parameter grid for the MultinomialNB pipeline (keys follow the step__param convention).
param_grid_nb = dict(
    # TF-IDF vectorizer parameters
    preprocessor__text__max_features=[1000, 5000, 15000],
    # Candidate values for the classifier's alpha
    model__alpha=[0.001, 0.005, 0.01, 0.1, 1.0, 10.0],
)

In [15]:
# Grid search over the MultinomialNB pipeline; `ps` supplies the predefined validation split.
# NOTE(review): 'f1_weighted' is dominated by the majority class on imbalanced labels —
# confirm it is the intended selection metric.
search_settings = dict(cv=ps, scoring='f1_weighted', n_jobs=4, verbose=2)
grid_search = GridSearchCV(
    estimator=pipeline_nb_smote,
    param_grid=param_grid_nb,
    **search_settings,
)

In [16]:
# Run the search, keep the best pipeline it found and report its performance on the test set.
best_model = grid_search.fit(X_train_val, y_train_val).best_estimator_
y_pred_test = best_model.predict(X_test)
test_report = classification_report(y_test, y_pred_test, target_names=['Truth', 'Deception'])
print(test_report)

Fitting 1 folds for each of 18 candidates, totalling 18 fits
              precision    recall  f1-score   support

       Truth       0.91      0.75      0.83      2501
   Deception       0.09      0.26      0.14       240

    accuracy                           0.71      2741
   macro avg       0.50      0.51      0.48      2741
weighted avg       0.84      0.71      0.77      2741

              precision    recall  f1-score   support

       Truth       0.91      0.75      0.83      2501
   Deception       0.09      0.26      0.14       240

    accuracy                           0.71      2741
   macro avg       0.50      0.51      0.48      2741
weighted avg       0.84      0.71      0.77      2741





In [17]:
# Show the hyper-parameter combination selected by the grid search.
best_params = grid_search.best_params_
print(best_params)

{'model__alpha': 0.005, 'preprocessor__text__max_features': 15000}


In [18]:
# Pipeline: preprocessing -> SMOTE oversampling -> Complement Naive Bayes classifier.
balanced_steps = [
    ('preprocessor', preprocessor),
    # Reminder: the preprocessor is the one that uses MinMaxScaler
    ('sampler', SMOTE(random_state=42, k_neighbors=5)),
    # ComplementNB has no class_weight parameter; it handles that itself
    ('model', ComplementNB()),
]
pipeline_nb_balanced = Pipeline(steps=balanced_steps)

In [19]:
# Hyper-parameter grid for the ComplementNB pipeline (keys follow the step__param convention).
param_grid_cnb = dict(
    preprocessor__text__max_features=[1000, 5000],
    model__alpha=[0.1, 1.0, 10.0],
    # An extra setting specific to ComplementNB (worth trying)
    model__norm=[False, True],
)

In [20]:
# Grid search over the ComplementNB pipeline; `ps` supplies the predefined validation split.
# NOTE(review): 'f1_weighted' is dominated by the majority class on imbalanced labels —
# confirm it is the intended selection metric.
cnb_search_settings = {
    'cv': ps,
    'scoring': 'f1_weighted',
    'n_jobs': 4,
    'verbose': 2,
}
grid_search = GridSearchCV(
    estimator=pipeline_nb_balanced,
    param_grid=param_grid_cnb,
    **cnb_search_settings,
)

In [21]:
# Fit the ComplementNB search, take its best pipeline and evaluate it on the test set.
fitted_search = grid_search.fit(X_train_val, y_train_val)
best_model = fitted_search.best_estimator_
y_pred_test = best_model.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_test, target_names=['Truth', 'Deception']))

Fitting 1 folds for each of 12 candidates, totalling 12 fits
              precision    recall  f1-score   support

       Truth       0.92      0.73      0.81      2501
   Deception       0.10      0.30      0.15       240

    accuracy                           0.69      2741
   macro avg       0.51      0.52      0.48      2741
weighted avg       0.84      0.69      0.75      2741



