In [1]:
import pandas as pd
import numpy as np
import os
import joblib
import time
import lightgbm as lgb

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit 

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [2]:
# The project root is two directory levels above the notebook's working directory.
notebook_dir = os.getcwd()
base_path = os.path.dirname(os.path.dirname(notebook_dir))
# Processed diplomacy files are read from <root>/data/processed/diplomacy.
data_path = os.path.join(base_path, "data", "processed", "diplomacy")
print(base_path)

c:\work environment\Projects\amazon-spam-review


In [3]:
# Columns that are cast to plain object dtype right after loading.
categorical_features = ["speaker", "receiver", "season"]
col_types = dict.fromkeys(categorical_features, 'object')


def _read_processed(file_name):
    """Read one parquet file from ``data_path`` and cast the categorical columns to object."""
    frame = pd.read_parquet(os.path.join(data_path, file_name))
    return frame.astype(col_types)


data = _read_processed("diplomacy_processed.parquet")
train_df = _read_processed("train_processed.parquet")
val_df = _read_processed("val_processed.parquet")
test_df = _read_processed("test_processed.parquet")

In [4]:
# Show the column names of the frame loaded from diplomacy_processed.parquet.
print(data.columns)

Index(['game_id', 'speaker', 'receiver', 'message_text', 'sender_intention',
       'game_score', 'game_score_delta', 'year', 'season', 'original_fold',
       'target', 'cleaned_text', 'message_length'],
      dtype='object')


In [None]:
# CHECK FOR EXISTING MODEL
# Build the model directory here so this cell runs on a fresh kernel; it uses the same
# <root>/models/deceptency location that the save cell writes to.
model_dir = os.path.join(base_path, "models", "deceptency")
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, 'lightgbm_diplomacy.pkl')
try:
    # NOTE(review): joblib.load unpickles arbitrary objects -- only load model files
    # that were produced by this project.
    best_model = joblib.load(model_path)
except FileNotFoundError:
    print('No existing model found. Will train a new model.')
    skip_training = False
else:
    # Only reached when the load succeeded, so the flag and messages stay consistent.
    print(f'Model loaded from {model_path}')
    print('Skipping training and proceeding to evaluation...')
    skip_training = True

In [6]:
# Separate every fold into its feature frame (X_*) and its 'target' label series (y_*).
X_train, y_train = train_df.drop(columns='target'), train_df['target']
X_val, y_val = val_df.drop(columns='target'), val_df['target']
X_test, y_test = test_df.drop(columns='target'), test_df['target']

In [7]:
# Report how many rows the training feature frame contains.
print(f"train set size: {len(X_train)}")

train set size: 13132


In [8]:
# Numeric columns: fill missing values with the most frequent value, then standardise.
numeric_features = ["game_score", "game_score_delta", "year", "message_length"]
numeric_transformer = Pipeline(steps=[
  ("imputer", SimpleImputer(strategy="most_frequent")),
  ("scaler", StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category, then one-hot
# encode. The previous strategy="constant" with fill_value="most_frequent" inserted the
# literal string "most_frequent" as a category instead of imputing the mode.
categorical_features = ["speaker", "receiver", "season"]
categorical_transformer = Pipeline(steps=[
  ("imputer", SimpleImputer(strategy="most_frequent")),
  # Categories unseen during fit are encoded as all-zero rows instead of raising.
  ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# The cleaned message text is vectorised with TF-IDF; max_features is tuned by the grid searches.
text_feature = "cleaned_text"
text_transformer = TfidfVectorizer()

# Combine the three branches; any column not listed above is dropped.
preprocessor = ColumnTransformer(transformers=[
  ('text', text_transformer, text_feature),
  ('num', numeric_transformer, numeric_features),
  ('cat', categorical_transformer, categorical_features)
], remainder="drop")

In [9]:
# GridSearchCV takes a single X / y pair, so the train and validation folds are stacked
# and the split is pinned explicitly: -1 marks rows that are only ever used for training,
# 0 marks rows that form the single validation fold.
X_train_val = pd.concat([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])
split_index = [-1 for _ in range(len(X_train))] + [0 for _ in range(len(X_val))]
ps = PredefinedSplit(test_fold=split_index)

In [10]:
# Attempt to counter the class imbalance by generating synthetic "lie" samples with SMOTE
# between preprocessing and the classifier.
smote_steps = [
    ('preprocessor', preprocessor),
    ('sampler', SMOTE(random_state=42)),
    ('model', lgb.LGBMClassifier(random_state=42)),
]
pipeline_lgbm_smote = Pipeline(steps=smote_steps)

In [11]:
# Search space for the SMOTE pipeline: TF-IDF vocabulary cap, number of boosting rounds,
# and learning rate (2 x 3 x 3 = 18 candidates).
param_grid_lgbm = dict(
    preprocessor__text__max_features=[5000, 15000],
    model__n_estimators=[100, 200, 500],
    model__learning_rate=[0.1, 0.5, 1.0],
)

In [12]:
# Grid search over the SMOTE pipeline, validated on the predefined train/validation split
# and ranked by weighted F1.
search_options = dict(cv=ps, scoring='f1_weighted', n_jobs=4, verbose=2)
grid_search = GridSearchCV(pipeline_lgbm_smote, param_grid_lgbm, **search_options)

In [None]:
# Train the SMOTE grid search unless a saved model was loaded, then evaluate on the test set.
if not skip_training:
    start_time = time.time()
    grid_search.fit(X_train_val, y_train_val)
    end_time = time.time()
    training_time_minutes = (end_time - start_time) / 60
    print(f"Training time: {training_time_minutes:.2f} minutes")
    # Only a fitted search exposes best_estimator_, so read it inside this branch.
    best_model = grid_search.best_estimator_
else:
    # Keep the previously loaded best_model and define the timing variable so later
    # cells that read it do not fail with a NameError.
    training_time_minutes = None
y_pred_test = best_model.predict(X_test)
print(classification_report(y_test, y_pred_test, target_names=['Truth', 'Deception']))


Fitting 1 folds for each of 18 candidates, totalling 18 fits
[LightGBM] [Info] Number of positive: 13901, number of negative: 13901
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099371 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 115719
[LightGBM] [Info] Number of data points in the train set: 27802, number of used features: 1990
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training time: 1.36 minutes
              precision    recall  f1-score   support

       Truth       0.91      0.99      0.95      2501
   Deception       0.33      0.03      0.05       240

    accuracy                           0.91      2741
   macro avg       0.62      0.51      0.50      2741
weighted avg       0.86      0.91      0.87      2741





Not good at all. The LightGBM model with SMOTE labels almost everything as truth, so deception detection is very poor (recall of 0.03 on the Deception class).

In [14]:
# # --- SAVE MODEL AND REPORT ---
# model_dir = os.path.join(base_path, "models", "deceptency")
# os.makedirs(model_dir, exist_ok=True)
# print("Model saved.")

# # Metrics
# from sklearn.metrics import classification_report
# report = classification_report(y_test, y_pred_test, output_dict=True)
# result_data = {
#     'category': 'Deceptency_lightgbm_diplomacy',
#     'best_cv_f1_score': grid_search.best_score_,
#     'best_params': str(grid_search.best_params_),
#     'test_accuracy': report['accuracy'],
#     'test_f1_truth': report['0']['f1-score'],
#     'test_precision_truth': report['0']['precision'],
#     'training_time_minutes': training_time_minutes
# }
# results_file = os.path.join(base_path, "reports", "model_results_deceptency_lightgbm_diplomacy.csv")
# result_df = pd.DataFrame([result_data])
# header = not os.path.exists(results_file)
# result_df.to_csv(results_file, mode='a', header=header, index=False)
# print(f"Results saved to {results_file}")

In [None]:
# Persist the best pipeline found by the grid search above.
# model_dir is (re)built here so the cell does not depend on another cell having defined it.
model_dir = os.path.join(base_path, "models", "deceptency")
os.makedirs(model_dir, exist_ok=True)
if hasattr(grid_search, "best_estimator_"):
    print(grid_search.best_params_)
    best_model = grid_search.best_estimator_
    joblib.dump(best_model, os.path.join(model_dir, "lightgbm_diplomacy.pkl"))
else:
    # An unfitted search has no best_params_ / best_estimator_ (e.g. training was skipped).
    print("Grid search was not fitted; nothing new to save.")

In [16]:
# Second attempt at the imbalance: no resampling; instead LightGBM is asked to re-weight
# the classes via class_weight='balanced' so that missed lies cost more.
weighted_classifier = lgb.LGBMClassifier(random_state=42, class_weight='balanced')
pipeline_lgbm_weighted = Pipeline([
    ('preprocessor', preprocessor),
    ('model', weighted_classifier),
])

In [17]:
# Wider search space for the class-weighted pipeline (3 x 5 x 5 = 75 candidates).
param_grid_lgbm = dict(
    preprocessor__text__max_features=[500, 1000, 5000],
    model__n_estimators=[2, 5, 10, 100, 500],
    model__learning_rate=[0.01, 0.05, 0.1, 0.5, 1.0],
)

In [18]:
# Grid search over the class-weighted pipeline, ranked by F1 of the positive class.
search_options = dict(cv=ps, scoring='f1', n_jobs=4, verbose=2)
grid_search = GridSearchCV(pipeline_lgbm_weighted, param_grid_lgbm, **search_options)

In [19]:
# Run the class-weighted search, then score its best pipeline on the held-out test set.
grid_search.fit(X_train_val, y_train_val)
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)
weighted_report = classification_report(
    y_test,
    y_pred_test,
    target_names=['Truth', 'Deception'],
)
print(weighted_report)

Fitting 1 folds for each of 75 candidates, totalling 75 fits
[LightGBM] [Info] Number of positive: 647, number of negative: 13901
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.021908 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 37845
[LightGBM] [Info] Number of data points in the train set: 14548, number of used features: 999
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
              precision    recall  f1-score   support

       Truth       0.93      0.70      0.80      2501
   Deception       0.13      0.45      0.20       240

    accuracy                           0.68      2741
   macro avg       0.53      0.58      0.50      2741
weighted avg       0.86      0.68      0.75      2741





Although the results are significantly better than in the previous run, deception detection is still not good enough.

In [20]:
# Show the hyper-parameter combination selected by the most recently fitted grid search.
print(grid_search.best_params_)

{'model__learning_rate': 0.1, 'model__n_estimators': 10, 'preprocessor__text__max_features': 5000}


In [21]:
# Class-weighted LightGBM pipeline on the base preprocessor; it is built from the same
# steps as pipeline_lgbm_weighted.
balanced_classifier = lgb.LGBMClassifier(random_state=42, class_weight='balanced')
pipeline_lgbm_enriched = Pipeline([
    ('preprocessor', preprocessor),
    ('model', balanced_classifier),
])

In [22]:
# Single-candidate grid: one fixed configuration is evaluated through the same search API.
param_grid_lgbm = dict(
    preprocessor__text__max_features=[5000],
    model__n_estimators=[100],
    model__learning_rate=[0.1],
)

In [23]:
# Evaluate the fixed configuration on the predefined split, scored by macro-averaged F1.
search_options = dict(cv=ps, scoring='f1_macro', n_jobs=4, verbose=2)
grid_search = GridSearchCV(pipeline_lgbm_enriched, param_grid_lgbm, **search_options)

In [24]:
# Fit the fixed configuration and report test-set performance.
# The printed header and class labels are in Turkish: "final test set performance report",
# "true message (0)" and "lie message (1)".
grid_search.fit(X_train_val, y_train_val)
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)
print("\n--- FİNAL TEST SETİ PERFORMANS RAPORU ---")
final_report = classification_report(
    y_test,
    y_pred_test,
    target_names=['Doğru Mesaj (0)', 'Yalan Mesaj (1)'],
)
print(final_report)

Fitting 1 folds for each of 1 candidates, totalling 1 fits
[LightGBM] [Info] Number of positive: 647, number of negative: 13901
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.018968 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 37845
[LightGBM] [Info] Number of data points in the train set: 14548, number of used features: 999
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000

--- FİNAL TEST SETİ PERFORMANS RAPORU ---
                 precision    recall  f1-score   support

Doğru Mesaj (0)       0.92      0.90      0.91      2501
Yalan Mesaj (1)       0.17      0.23      0.19       240

       accuracy                           0.84      2741
      macro avg       0.55      0.56      0.55      2741
   weighted avg       0.86      0.84      0.85      2741



In [25]:
# Project root: two directory levels above the current working directory.
working_dir = os.getcwd()
BASE_PROJECT_PATH = os.path.dirname(os.path.dirname(working_dir))
print(BASE_PROJECT_PATH)

c:\work environment\Projects\amazon-spam-review


In [26]:
# Save Model
BASE_PROJECT_PATH = os.path.dirname(os.path.dirname(os.getcwd()))
model_dir = os.path.join(BASE_PROJECT_PATH, "models", "deceptency")
os.makedirs(model_dir, exist_ok=True)
joblib.dump(best_model, os.path.join(model_dir, "lightgbm_diplomacy.pkl"))
print("Model saved.")

# Save Results to CSV
report = classification_report(y_test, y_pred_test, output_dict=True)

# training_time_minutes is only assigned by the timed SMOTE training cell, so it may be
# missing (or None) when that cell was skipped. `name or 0` cannot guard against an
# undefined name, hence the explicit lookup.
# NOTE(review): when present, this value is the duration of that earlier search, not of
# the model saved above -- confirm this is what the report should contain.
elapsed_minutes = globals().get('training_time_minutes') or 0

result_data = {
    'category': 'Deceptency_LightGBM',
    'best_cv_f1_score': grid_search.best_score_,
    'test_accuracy': report['accuracy'],
    # assumes the truth class is encoded as label 0, which the report keys as '0' -- TODO confirm
    'test_f1_truth': report['0']['f1-score'],
    'test_precision_truth': report['0']['precision'],
    'best_params': str(grid_search.best_params_),
    'training_time_minutes': elapsed_minutes
}

# Make sure the reports directory exists; to_csv does not create missing directories.
reports_dir = os.path.join(BASE_PROJECT_PATH, "reports")
os.makedirs(reports_dir, exist_ok=True)
results_file = os.path.join(reports_dir, "model_results_deceptency_lightgbm.csv")
result_df = pd.DataFrame([result_data])
# Write the header only when the file is being created; later runs append rows.
header = not os.path.exists(results_file)
result_df.to_csv(results_file, mode='a', header=header, index=False)


Model saved.


## Training on Enriched Data (Additional Experiment)
Retraining LightGBM using the enriched dataset (Linguistic + Moves features).

In [27]:
print("Loading Enriched Data...")
train_df_en = pd.read_parquet(os.path.join(data_path, "train_enriched.parquet"))
val_df_en = pd.read_parquet(os.path.join(data_path, "val_enriched.parquet"))
test_df_en = pd.read_parquet(os.path.join(data_path, "test_enriched.parquet"))

# Cast the categorical columns of every enriched fold to object dtype, in place.
for frame in (train_df_en, val_df_en, test_df_en):
    for col in categorical_features:
        frame[col] = frame[col].astype('object')

# Feature frame / label series per fold.
X_train_en, y_train_en = train_df_en.drop(columns='target'), train_df_en['target']
X_val_en, y_val_en = val_df_en.drop(columns='target'), val_df_en['target']
X_test_en, y_test_en = test_df_en.drop(columns='target'), test_df_en['target']

# Stack train + validation and pin the split: -1 = training only, 0 = the validation fold.
X_train_val_en = pd.concat([X_train_en, X_val_en])
y_train_val_en = pd.concat([y_train_en, y_val_en])
split_index_en = [-1 for _ in range(len(X_train_en))] + [0 for _ in range(len(X_val_en))]
ps_en = PredefinedSplit(test_fold=split_index_en)

print(f"Enriched Train/Val Shape: {X_train_val_en.shape}")

Loading Enriched Data...
Enriched Train/Val Shape: (14548, 27)


In [28]:
# Extra numeric columns available in the enriched frames, grouped by theme.
sentiment_columns = ['sentiment_polarity', 'sentiment_subjectivity']
punctuation_columns = ['n_question_marks', 'n_exclamation_marks', 'n_ellipses']
pronoun_columns = ['n_i_pronouns', 'n_we_pronouns']
surface_columns = ['avg_word_length', 'uppercase_ratio']
move_columns = ['n_orders', 'n_support', 'n_hold', 'n_move_fails']
new_numeric_features = (
    sentiment_columns
    + punctuation_columns
    + pronoun_columns
    + surface_columns
    + move_columns
)
all_numeric_features = numeric_features + new_numeric_features

# Same three branches as the base preprocessor, with the numeric branch widened.
enriched_branches = [
  ('text', text_transformer, text_feature),
  ('num', numeric_transformer, all_numeric_features),
  ('cat', categorical_transformer, categorical_features),
]
preprocessor_en = ColumnTransformer(transformers=enriched_branches, remainder="drop")

# Class-weighted LightGBM on top of the enriched preprocessor.
enriched_classifier = lgb.LGBMClassifier(random_state=42, class_weight='balanced')
pipeline_lgbm_en = Pipeline([
    ('preprocessor', preprocessor_en),
    ('model', enriched_classifier),
])

In [29]:
# Fit the enriched pipeline with a small grid, time the search, and report test performance.
print("Training LightGBM on Enriched Data...")
start_time = time.time()

# Deliberately small grid (two candidates) to keep the run short.
param_grid_lgbm_en = dict(
    preprocessor__text__max_features=[5000],
    model__n_estimators=[100, 200],
    model__learning_rate=[0.1],
)

search_options_en = dict(cv=ps_en, scoring='f1_macro', n_jobs=4, verbose=2)
grid_search_en = GridSearchCV(pipeline_lgbm_en, param_grid_lgbm_en, **search_options_en)
grid_search_en.fit(X_train_val_en, y_train_val_en)

end_time = time.time()
train_time_en = (end_time - start_time) / 60
print(f"Enriched Training Time: {train_time_en:.2f} min")

# Score the best pipeline from the search on the enriched test fold.
best_model_en = grid_search_en.best_estimator_
y_pred_test_en = best_model_en.predict(X_test_en)

print("\n--- ENRICHED TEST SET PERFORMANCE ---")
enriched_report = classification_report(
    y_test_en,
    y_pred_test_en,
    target_names=['Truth', 'Deception'],
)
print(enriched_report)

Training LightGBM on Enriched Data...
Fitting 1 folds for each of 2 candidates, totalling 2 fits
[LightGBM] [Info] Number of positive: 647, number of negative: 13901
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019156 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 38958
[LightGBM] [Info] Number of data points in the train set: 14548, number of used features: 1012
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Enriched Training Time: 0.05 min

--- ENRICHED TEST SET PERFORMANCE ---
              precision    recall  f1-score   support

       Truth       0.92      0.91      0.91      2501
   Deception       0.18      0.21      0.19       240

    accuracy                           0.85      2741
   macro avg       0.55      0.56      0.55      2741
we



The enriched features do not add enough information to justify the extra complexity they bring to the model: test performance is essentially unchanged (the Deception F1 score stays at 0.19).