In [42]:
import pandas as pd
import numpy as np
import os
import joblib
import time

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit 

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB

In [23]:
# The project root is taken to be two directory levels above the current working directory.
working_dir = os.getcwd()
base_path = os.path.dirname(os.path.dirname(working_dir))
# Processed diplomacy files are read from <root>/data/processed/diplomacy.
data_path = os.path.join(base_path, "data", "processed", "diplomacy")
print(base_path)

c:\work environment\Projects\amazon-spam-review


In [24]:
# Columns that are cast to object dtype in every frame loaded below.
categorical_features = ["speaker", "receiver", "season"]
col_types = dict.fromkeys(categorical_features, 'object')


def _read_processed(file_name):
    """Read one parquet file from ``data_path`` and cast the categorical columns to object dtype."""
    frame = pd.read_parquet(os.path.join(data_path, file_name))
    return frame.astype(col_types)


# `data` is presumably the full processed dataset; the other three are the train/val/test splits.
data = _read_processed("diplomacy_processed.parquet")
train_df = _read_processed("train_processed.parquet")
val_df = _read_processed("val_processed.parquet")
test_df = _read_processed("test_processed.parquet")

In [25]:
def _split_features_target(frame, target_col='target'):
    """Return ``(features, target)`` where features is ``frame`` without ``target_col``."""
    return frame.drop(columns=target_col), frame[target_col]


# Separate the feature columns from the 'target' label for each split.
X_train, y_train = _split_features_target(train_df)
X_val, y_val = _split_features_target(val_df)
X_test, y_test = _split_features_target(test_df)

In [26]:
# Numeric columns: fill gaps with the most frequent value, then scale to [0, 1].
# MinMaxScaler keeps the features non-negative, which the Naive Bayes models used later require.
numeric_features = ["game_score", "game_score_delta", "year", "message_length"]
numeric_transformer = Pipeline(steps=[
  ("imputer", SimpleImputer(strategy="most_frequent")),
  ("scaler", MinMaxScaler())
])

# Categorical columns: fill gaps with the most frequent category, then one-hot encode.
# Fix: the original used strategy="constant" with fill_value="most_frequent", which replaced
# missing values with the literal string "most_frequent" (a bogus extra category) instead of
# imputing the mode.
categorical_features = ["speaker", "receiver", "season"]
categorical_transformer = Pipeline(steps=[
  ("imputer", SimpleImputer(strategy="most_frequent")),
  # Categories unseen during fit are encoded as all-zeros rather than raising.
  ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Text column: TF-IDF with default settings; max_features is tuned by the grid search.
text_feature = "cleaned_text"
text_transformer = TfidfVectorizer()

# Combine the three branches; any column not listed above is dropped.
preprocessor = ColumnTransformer(transformers=[
  ('text', text_transformer, text_feature),
  ('num', numeric_transformer, numeric_features),
  ('cat', categorical_transformer, categorical_features)
], remainder="drop")

In [27]:
# GridSearchCV expects a single X and y, so the train and validation sets are stacked
# (train rows first) and the fold assignment below controls how the search splits them.
X_train_val = pd.concat((X_train, X_val), axis=0)
y_train_val = pd.concat((y_train, y_val), axis=0)

# PredefinedSplit convention: -1 = row is never in a test fold (always used for fitting),
# 0 = row belongs to test fold 0. This yields exactly one train/validation split.
n_train_rows = len(X_train)
n_val_rows = len(X_val)
split_index = [-1] * n_train_rows + [0] * n_val_rows
ps = PredefinedSplit(test_fold=split_index)

In [28]:
# Address the class imbalance by generating synthetic minority-class ("lie") samples with SMOTE.
# The sampler sits inside the imblearn Pipeline, after preprocessing and before the classifier.
smote_nb_steps = [
    ('preprocessor', preprocessor),  # the preprocessing step defined earlier
    ('sampler', SMOTE(random_state=42)),
    ('model', MultinomialNB()),
]
pipeline_nb_smote = Pipeline(steps=smote_nb_steps)

In [38]:
# Hyper-parameter grid for the MultinomialNB pipeline (3 x 6 = 18 candidates).
param_grid_nb = dict(
    # TF-IDF vectorizer parameter: vocabulary size cap
    preprocessor__text__max_features=[1000, 5000, 15000],
    # Naive Bayes smoothing strength
    model__alpha=[0.001, 0.005, 0.01, 0.1, 1.0, 10.0],
)

In [39]:
# Grid search over the MultinomialNB + SMOTE pipeline. `cv=ps` makes every candidate be
# fitted on the training rows and scored (weighted F1) on the single validation fold.
grid_search = GridSearchCV(
    estimator=pipeline_nb_smote,
    param_grid=param_grid_nb,
    scoring='f1_weighted',
    cv=ps,
    n_jobs=4,
    verbose=2,
)

In [40]:
# Time the hyper-parameter search only (evaluation and saving are excluded).
start_time = time.time()
grid_search.fit(X_train_val, y_train_val)
end_time = time.time()
training_time_minutes = (end_time - start_time) / 60

# best_estimator_ is the pipeline refitted by GridSearchCV (default refit=True) on the
# data passed to fit(), i.e. train + validation.
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)
print(classification_report(y_test, y_pred_test, target_names=['Truth', 'Deception']))

# --- STEP 5: SAVE MODEL AND RESULTS ---
# Save Model
model_dir = os.path.join(base_path, "models", "deceptency")
os.makedirs(model_dir, exist_ok=True)
joblib.dump(best_model, os.path.join(model_dir, "naive_bayes_deceptency.pkl"))
print("Model saved.")

# Save Results to CSV
# Without target_names the report is keyed by the string form of each label.
# NOTE(review): the '0' lookups below assume the target labels are the integers 0/1 — confirm.
report = classification_report(y_test, y_pred_test, output_dict=True)

result_data = {
    'category': 'Deceptency_NaiveBayes',
    'best_cv_f1_score': grid_search.best_score_,
    'test_accuracy': report['accuracy'],
    'test_f1_truth': report['0']['f1-score'],
    'test_precision_truth': report['0']['precision'],
    'best_params': str(grid_search.best_params_),
    'training_time_minutes': training_time_minutes
}

results_file = os.path.join(base_path, "reports", "model_results_deceptency_naive_bayes.csv")
# Fix: make sure the reports directory exists; previously to_csv failed on a fresh checkout
# because only the model directory was created.
os.makedirs(os.path.dirname(results_file), exist_ok=True)
result_df = pd.DataFrame([result_data])
# Append one row per run; write the header only when the file is being created.
header = not os.path.exists(results_file)
result_df.to_csv(results_file, mode='a', header=header, index=False)

print(f"Results saved to {results_file}")

Fitting 1 folds for each of 18 candidates, totalling 18 fits
              precision    recall  f1-score   support

       Truth       0.91      0.75      0.83      2501
   Deception       0.09      0.26      0.14       240

    accuracy                           0.71      2741
   macro avg       0.50      0.51      0.48      2741
weighted avg       0.84      0.71      0.77      2741



In [41]:
# Show the hyper-parameter combination that scored best in the grid search.
selected_params = grid_search.best_params_
print(selected_params)

{'model__alpha': 0.005, 'preprocessor__text__max_features': 15000}


In [None]:
# Second experiment: same preprocessing and SMOTE oversampling, but with ComplementNB as the classifier.
balanced_nb_steps = [
    ('preprocessor', preprocessor),
    # Reminder: the preprocessor scales numeric features with MinMaxScaler.
    ('sampler', SMOTE(random_state=42, k_neighbors=5)),
    # ComplementNB has no class_weight parameter; the author expects it to cope with the imbalance itself.
    ('model', ComplementNB()),
]
pipeline_nb_balanced = Pipeline(steps=balanced_nb_steps)

In [44]:
# Hyper-parameter grid for the ComplementNB pipeline (2 x 3 x 2 = 12 candidates).
param_grid_cnb = dict(
    preprocessor__text__max_features=[1000, 5000],
    model__alpha=[0.1, 1.0, 10.0],
    # Extra setting specific to ComplementNB (worth trying)
    model__norm=[False, True],
)

In [45]:
# Grid search over the ComplementNB pipeline, using the same predefined validation fold
# and weighted-F1 scoring. Note: this rebinds `grid_search` from the MultinomialNB run.
grid_search = GridSearchCV(
    estimator=pipeline_nb_balanced,
    param_grid=param_grid_cnb,
    scoring='f1_weighted',
    cv=ps,
    n_jobs=4,
    verbose=2,
)

In [46]:
# Run the ComplementNB search, then evaluate the refitted best pipeline on the held-out test set.
# fit() returns the search object itself, so the best estimator can be read off directly.
best_model = grid_search.fit(X_train_val, y_train_val).best_estimator_
y_pred_test = best_model.predict(X_test)
test_report_text = classification_report(
    y_test,
    y_pred_test,
    target_names=['Truth', 'Deception'],
)
print(test_report_text)

Fitting 1 folds for each of 12 candidates, totalling 12 fits
              precision    recall  f1-score   support

       Truth       0.91      1.00      0.95      2501
   Deception       0.00      0.00      0.00       240

    accuracy                           0.91      2741
   macro avg       0.46      0.50      0.48      2741
weighted avg       0.83      0.91      0.87      2741



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
