In [2]:
import pandas as pd
import numpy as np
import os
import joblib
import time
import lightgbm as lgb

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit 

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [3]:
# Path düzeltmesi: notebooks/deceptency/ içindeyiz
base_path = os.path.dirname(os.getcwd())  # notebooks/ klasörü
data_path = os.path.join(base_path, "data", "processed", "diplomacy")
print(f"Base path: {base_path}")
print(f"Data path: {data_path}")

Base path: c:\Users\aadil\Desktop\YAP470\amazon-fake-review-detector\notebooks
Data path: c:\Users\aadil\Desktop\YAP470\amazon-fake-review-detector\notebooks\data\processed\diplomacy


In [4]:
categorical_features = ["speaker", "receiver", "season"]
col_types = {col: 'object' for col in categorical_features}

data = pd.read_parquet(os.path.join(data_path, "diplomacy_processed.parquet"))
data = data.astype(col_types)
train_df = pd.read_parquet(os.path.join(data_path, "train_processed.parquet"))
train_df = train_df.astype(col_types)
val_df = pd.read_parquet(os.path.join(data_path, "val_processed.parquet"))
val_df = val_df.astype(col_types)
test_df = pd.read_parquet(os.path.join(data_path, "test_processed.parquet"))
test_df = test_df.astype(col_types)

In [5]:
print(data.columns)

Index(['game_id', 'speaker', 'receiver', 'message_text', 'sender_intention',
       'game_score', 'game_score_delta', 'year', 'season', 'original_fold'],
      dtype='object')


In [12]:
X_train = train_df.drop('target', axis=1)
y_train = train_df['target']
X_val = val_df.drop('target', axis=1)
y_val = val_df['target']
X_test = test_df.drop('target', axis=1)
y_test = test_df['target']

In [8]:
print(data.columns)

Index(['game_id', 'speaker', 'receiver', 'message_text', 'sender_intention',
       'game_score', 'game_score_delta', 'year', 'season', 'original_fold'],
      dtype='object')


In [13]:
print(f"train set size: {len(X_train)}")

train set size: 13132


In [10]:
numeric_features = ["game_score", "game_score_delta", "year", "message_length"]
numeric_transformer = Pipeline(steps=[
  ("imputer", SimpleImputer(strategy="most_frequent")),
  ("scaler", StandardScaler())
])

categorical_features = ["speaker", "receiver", "season"]
categorical_transformer = Pipeline(steps=[
  ("imputer", SimpleImputer(strategy="constant", fill_value="most_frequent")),
  ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

text_feature = "cleaned_text"
text_transformer = TfidfVectorizer()

preprocessor = ColumnTransformer(transformers=[
  ('text', text_transformer, text_feature),
  ('num', numeric_transformer, numeric_features),
  ('cat', categorical_transformer, categorical_features)
], remainder="drop")

In [14]:
X_train_val = pd.concat([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])
# because gridsearch expects the formal as x and y, we concat train and validation set; but modify how gridsearch splits data
split_index = [-1] * len(X_train) + [0] * len(X_val)
ps = PredefinedSplit(test_fold=split_index)

In [15]:
# try to cover data imbalance with generating artificial lies with smote
pipeline_lgbm_smote = Pipeline([
    ('preprocessor', preprocessor),
    ('sampler', SMOTE(random_state=42)),
    ('model', lgb.LGBMClassifier(random_state=42))
])

In [16]:
param_grid_lgbm = {
    'preprocessor__text__max_features': [5000, 15000],
    'model__n_estimators': [100, 200, 500],
    'model__learning_rate': [0.1, 0.5, 1.0]
}

In [17]:
grid_search = GridSearchCV(
    pipeline_lgbm_smote, 
    param_grid_lgbm, 
    cv=ps,
    scoring='f1_weighted', 
    n_jobs=4, 
    verbose=2
)

In [18]:
grid_search.fit(X_train_val, y_train_val)
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)
print(classification_report(y_test, y_pred_test, target_names=['Truth', 'Deception']))

Fitting 1 folds for each of 18 candidates, totalling 18 fits




[LightGBM] [Info] Number of positive: 13901, number of negative: 13901
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.084763 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115728
[LightGBM] [Info] Number of data points in the train set: 27802, number of used features: 1990
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
              precision    recall  f1-score   support

       Truth       0.91      1.00      0.95      2501
   Deception       0.39      0.03      0.05       240

    accuracy                           0.91      2741
   macro avg       0.65      0.51      0.50      2741
weighted avg       0.87      0.91      0.87      2741

              precision    recall  f1-score   support

       Truth       0.91      1.00      0.95      2501
   Deception       0.39      0.03      0.05       240

    accuracy                           0.91      2741
   macro avg



Not good at all. LightGBM model with smote calls everything truth so deceptency detection is miserable.

In [61]:
print(grid_search.best_params_)

{'model__learning_rate': 0.1, 'model__n_estimators': 500, 'preprocessor__text__max_features': 15000}


In [62]:
# try to punish missing lies more
pipeline_lgbm_weighted = Pipeline([
    ('preprocessor', preprocessor),
    ('model', lgb.LGBMClassifier(
        random_state=42,
        class_weight='balanced' 
    ))
])

In [None]:
param_grid_lgbm = {
    'preprocessor__text__max_features': [500, 1000, 5000],
    'model__n_estimators': [2, 5, 10, 100, 500],
    'model__learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0]
}

In [72]:
grid_search = GridSearchCV(
    pipeline_lgbm_weighted, 
    param_grid_lgbm, 
    cv=ps, 
    scoring='f1',
    n_jobs=4, 
    verbose=2
)

In [73]:
grid_search.fit(X_train_val, y_train_val)
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)
print(classification_report(y_test, y_pred_test, target_names=['Truth', 'Deception']))

Fitting 1 folds for each of 27 candidates, totalling 27 fits
[LightGBM] [Info] Number of positive: 647, number of negative: 13901
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.015178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37845
[LightGBM] [Info] Number of data points in the train set: 14548, number of used features: 999
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
              precision    recall  f1-score   support

       Truth       0.93      0.70      0.80      2501
   Deception       0.13      0.45      0.20       240

    accuracy                           0.68      2741
   macro avg       0.53      0.58      0.50      2741
weighted avg       0.86      0.68      0.75      2741





Although results are significantly better than last time, deceptency detection is still not good enough.

Fitting 1 folds for each of 27 candidates, totalling 27 fits
[LightGBM] [Info] Number of positive: 647, number of negative: 13901
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016604 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37845
[LightGBM] [Info] Number of data points in the train set: 14548, number of used features: 999
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
              precision    recall  f1-score   support

       Truth       0.92      0.90      0.91      2501
   Deception       0.17      0.23      0.19       240

    accuracy                           0.84      2741
   macro avg       0.55      0.56      0.55      2741
weighted avg       0.86      0.84      0.85      2741

In [74]:
print(grid_search.best_params_)

{'model__learning_rate': 0.1, 'model__n_estimators': 10, 'preprocessor__text__max_features': 5000}


In [5]:
train_df = pd.read_parquet(os.path.join(data_path, "train_final.parquet"))
val_df = pd.read_parquet(os.path.join(data_path, "val_final.parquet"))
test_df = pd.read_parquet(os.path.join(data_path, "test_final.parquet"))

In [6]:
categorical_features_list = ['speaker', 'receiver', 'season']
col_types = {col: 'object' for col in categorical_features_list}
train_df = train_df.astype(col_types)
val_df = val_df.astype(col_types)
test_df = test_df.astype(col_types)

In [5]:
X_train = train_df.drop('target', axis=1)
y_train = train_df['target']
X_val = val_df.drop('target', axis=1)
y_val = val_df['target']
X_test = test_df.drop('target', axis=1)
y_test = test_df['target']

In [6]:
numeric_features = [
    'game_score', 'game_score_delta', 'year', 'message_length',
    'action_support_count', 'action_move_count', 'action_hold_count'
]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ['speaker', 'receiver', 'season']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

text_feature = 'cleaned_text'
text_transformer = TfidfVectorizer() 

preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, text_feature),
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

In [7]:
X_train_val = pd.concat([X_train, X_val], ignore_index=True)
y_train_val = pd.concat([y_train, y_val], ignore_index=True)
split_index = [-1] * len(X_train) + [0] * len(X_val)
ps = PredefinedSplit(test_fold=split_index)

In [8]:
pipeline_lgbm_enriched = Pipeline([
    ('preprocessor', preprocessor),
    ('model', lgb.LGBMClassifier(
        random_state=42,
        class_weight='balanced'
    ))
])

In [9]:
param_grid_lgbm = {
    'preprocessor__text__max_features': [5000],
    'model__n_estimators': [100],
    'model__learning_rate': [0.1]
}

In [10]:
grid_search = GridSearchCV(
    pipeline_lgbm_enriched, 
    param_grid_lgbm, 
    cv=ps, 
    scoring='f1_macro', 
    n_jobs=4, 
    verbose=2
)

In [13]:
grid_search.fit(X_train_val, y_train_val)
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)
print("\n--- FİNAL TEST SETİ PERFORMANS RAPORU (Zenginleştirilmiş Model) ---")
print(classification_report(y_test, y_pred_test, target_names=['Doğru Mesaj (0)', 'Yalan Mesaj (1)']))

Fitting 1 folds for each of 1 candidates, totalling 1 fits
[LightGBM] [Info] Number of positive: 647, number of negative: 13901
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019555 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37845
[LightGBM] [Info] Number of data points in the train set: 14548, number of used features: 999
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000

--- FİNAL TEST SETİ PERFORMANS RAPORU (Zenginleştirilmiş Model) ---
                 precision    recall  f1-score   support

Doğru Mesaj (0)       0.92      0.90      0.91      2501
Yalan Mesaj (1)       0.17      0.23      0.19       240

       accuracy                           0.84      2741
      macro avg       0.55      0.56      0.55      2741
   weighted avg       0.86   



In [6]:
categorical_features_list = ['speaker', 'receiver', 'season']
col_types = {col: 'object' for col in categorical_features_list}
train_final_df = train_df.astype(col_types)
val_final_df = val_df.astype(col_types)
test_final_df = test_df.astype(col_types)

# Şimdi veriyi X ve y olarak ayıralım
X_train = train_final_df.drop('target', axis=1)
y_train = train_final_df['target']
X_val = val_final_df.drop('target', axis=1)
y_val = val_final_df['target']
X_test = test_final_df.drop('target', axis=1)
y_test = test_final_df['target']

print("Nihai zenginleştirilmiş setler (train/val/test) yüklendi.")
print(f"Eğitim seti özellikleri: {list(X_train.columns)}")
print("-" * 40)


# --- 2. ADIM: PREPROCESSOR'Ü SON KEZ GÜNCELLE ---
# (Bu bölüm senin kodunla AYNI, mükemmel)

# Yeni 'mismatch' ve 'promise' özelliklerimizi sayısal listeye ekliyoruz
numeric_features = [
    'game_score', 'game_score_delta', 'year', 'message_length',
    'action_support_count', 'action_move_count', 'action_hold_count',
    'promise_support', 'promise_attack', 'promise_hold',
    'mismatch_support_vs_attack', 'mismatch_attack_vs_no_attack', 'mismatch_support_vs_no_support'
]

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
categorical_features = ['speaker', 'receiver', 'season']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), 
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
text_feature = 'cleaned_text'
text_transformer = TfidfVectorizer(max_features=5000)

preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, text_feature),
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

# --- 3. ADIM: PREDEFINEDSPLIT HAZIRLIĞI ---
# (Bu bölüm senin kodunla AYNI, mükemmel)
X_train_val = pd.concat([X_train, X_val], ignore_index=True)
y_train_val = pd.concat([y_train, y_val], ignore_index=True)
split_index = [-1] * len(X_train) + [0] * len(X_val)
ps = PredefinedSplit(test_fold=split_index)

# --- 4. ADIM: AĞIRLIKLANDIRILMIŞ LİGHTGBM PİPELİNE ---
# (Bu bölüm senin kodunla AYNI, mükemmel)
pipeline_lgbm_final = Pipeline([
    ('preprocessor', preprocessor),
    ('model', lgb.LGBMClassifier(
        random_state=42,
        class_weight='balanced' # Dengesizlikle başa çıkmak için
    ))
])

# Parametre ızgarası (Basit tutuyoruz)
param_grid_lgbm = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.1]
}

grid_search = GridSearchCV(
    pipeline_lgbm_final, 
    param_grid_lgbm, 
    cv=ps, 
    scoring='f1_macro', 
    n_jobs=4, 
    verbose=2
)

print("LightGBM (NİHAİ ZENGİNLEŞTİRİLMİŞ VERİ ile) için GridSearchCV başlıyor...")
start_time = time.time()
grid_search.fit(X_train_val, y_train_val)
end_time = time.time()
duration_minutes = (end_time - start_time) / 60

# --- 5. ADIM: NİHAİ SONUÇLAR ---
# (Bu bölüm senin kodunla AYNI, mükemmel)
print("\nOptimizasyon tamamlandı.")
print(f"En iyi CV (Validation Seti) Skoru (F1 Macro): {grid_search.best_score_:.4f}")
print(f"En İyi Parametreler: {grid_search.best_params_}")

best_model = grid_search.best_estimator_

y_pred_test = best_model.predict(X_test)
print("\n--- FİNAL TEST SETİ PERFORMANS RAPORU (NİHAİ MODEL) ---")
print(classification_report(y_test, y_pred_test, target_names=['Doğru Mesaj (0)', 'Yalan Mesaj (1)']))

Nihai zenginleştirilmiş setler (train/val/test) yüklendi.
Eğitim seti özellikleri: ['game_id', 'speaker', 'receiver', 'message_text', 'sender_intention', 'game_score', 'game_score_delta', 'year', 'season', 'original_fold', 'cleaned_text', 'message_length', 'action_support_count', 'action_move_count', 'action_hold_count', 'promise_support', 'promise_attack', 'promise_hold', 'mismatch_support_vs_attack', 'mismatch_attack_vs_no_attack', 'mismatch_support_vs_no_support']
----------------------------------------
LightGBM (NİHAİ ZENGİNLEŞTİRİLMİŞ VERİ ile) için GridSearchCV başlıyor...
Fitting 1 folds for each of 2 candidates, totalling 2 fits
[LightGBM] [Info] Number of positive: 647, number of negative: 13901
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024055 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37860
[LightGBM] [Info] Number



In [14]:
# --- 1. ADIM: YENİ NİHAİ VERİLERİ YÜKLE ---
BASE_PROJECT_PATH = os.path.dirname(os.getcwd())
PROCESSED_DIR = os.path.join(BASE_PROJECT_PATH, "data", "processed", "diplomacy")

train_df = pd.read_parquet(os.path.join(PROCESSED_DIR, "train_final.parquet"))
val_df = pd.read_parquet(os.path.join(PROCESSED_DIR, "val_final.parquet"))
test_df = pd.read_parquet(os.path.join(PROCESSED_DIR, "test_final.parquet"))

# Tipleri zorla
categorical_features_list = ['speaker', 'receiver', 'season']
col_types = {col: 'object' for col in categorical_features_list}
train_df = train_df.astype(col_types); val_df = val_df.astype(col_types); test_df = test_df.astype(col_types)

X_train = train_df.drop('target', axis=1)
y_train = train_df['target']
X_val = val_df.drop('target', axis=1)
y_val = val_df['target']
X_test = test_df.drop('target', axis=1)
y_test = test_df['target']

print("Nihai zenginleştirilmiş setler (v2) yüklendi.")

# --- 2. ADIM: PREPROCESSOR (YENİ ÖZELLİKLE) ---
numeric_features = [
    'game_score', 'game_score_delta', 'year', 'message_length',
    'mismatch_score' # <-- YENİ GÜÇLÜ ÖZELLİĞİMİZ
]
numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])

categorical_features = ['speaker', 'receiver', 'season']
categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')), ('onehot', OneHotEncoder(handle_unknown='ignore'))])

text_feature = 'cleaned_text'
text_transformer = TfidfVectorizer(max_features=5000)

preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, text_feature),
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='drop'
)

# --- 3. ADIM: MODEL EĞİTİMİ (AYNI KALIYOR) ---
X_train_val = pd.concat([X_train, X_val], ignore_index=True)
y_train_val = pd.concat([y_train, y_val], ignore_index=True)
split_index = [-1] * len(X_train) + [0] * len(X_val)
ps = PredefinedSplit(test_fold=split_index)

pipeline_lgbm_final_v2 = Pipeline([
    ('preprocessor', preprocessor),
    ('model', lgb.LGBMClassifier(random_state=42, class_weight='balanced'))
])

param_grid_lgbm = {
    'model__n_estimators': [100, 200],
    'model__learning_rate': [0.1]
}

grid_search = GridSearchCV(pipeline_lgbm_final_v2, param_grid_lgbm, cv=ps, scoring='f1_macro', n_jobs=4, verbose=2)

print("LightGBM (NİHAİ V2 VERİSİ ile) GridSearchCV başlıyor...")
grid_search.fit(X_train_val, y_train_val)

# --- 4. ADIM: SONUÇLAR ---
print("\nOptimizasyon tamamlandı.")
print(f"En iyi CV (Validation Seti) Skoru (F1 Macro): {grid_search.best_score_:.4f}")
print(f"En İyi Parametreler: {grid_search.best_params_}")

best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)

print("\n--- FİNAL TEST SETİ PERFORMANS RAPORU (NİHAİ MODEL V2) ---")
print(classification_report(y_test, y_pred_test, target_names=['Doğru Mesaj (0)', 'Yalan Mesaj (1)']))

Nihai zenginleştirilmiş setler (v2) yüklendi.
LightGBM (NİHAİ V2 VERİSİ ile) GridSearchCV başlıyor...
Fitting 1 folds for each of 2 candidates, totalling 2 fits
[LightGBM] [Info] Number of positive: 647, number of negative: 13901
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.020164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37845
[LightGBM] [Info] Number of data points in the train set: 14548, number of used features: 999
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000

Optimizasyon tamamlandı.
En iyi CV (Validation Seti) Skoru (F1 Macro): 0.5569
En İyi Parametreler: {'model__learning_rate': 0.1, 'model__n_estimators': 100}

--- FİNAL TEST SETİ PERFORMANS RAPORU (NİHAİ MODEL V2) ---
                 precision    recall  f1-score   support

Doğru M

