In [162]:
import pandas as pd
import numpy as np
import os
import joblib
import time
import lightgbm as lgb

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import PredefinedSplit 

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [163]:
# The data directory is resolved relative to the parent of the current working
# directory, so the notebook must be launched from a direct subfolder of the
# project root.
base_path = os.path.dirname(os.getcwd())
data_path = os.path.join(base_path, "data", "processed", "diplomacy")

# Categorical columns are cast to plain `object` dtype right after loading.
categorical_features = ["speaker", "receiver", "season"]
col_types = dict.fromkeys(categorical_features, 'object')

# Full dataset plus the three pre-made splits, all with the same dtype cast.
data = pd.read_parquet(
    os.path.join(data_path, "diplomacy_processed.parquet")
).astype(col_types)
train_df = pd.read_parquet(
    os.path.join(data_path, "train_processed.parquet")
).astype(col_types)
val_df = pd.read_parquet(
    os.path.join(data_path, "val_processed.parquet")
).astype(col_types)
test_df = pd.read_parquet(
    os.path.join(data_path, "test_processed.parquet")
).astype(col_types)

In [164]:
# Show the column names of `data` (loaded from diplomacy_processed.parquet).
print(data.columns)

Index(['game_id', 'speaker', 'receiver', 'message_text', 'sender_intention',
       'game_score', 'game_score_delta', 'year', 'season', 'original_fold',
       'target', 'cleaned_text', 'message_length'],
      dtype='object')


In [165]:
# Separate the label column ('target') from the remaining columns in each split.
X_train, y_train = train_df.drop(columns='target'), train_df['target']
X_val, y_val = val_df.drop(columns='target'), val_df['target']
X_test, y_test = test_df.drop(columns='target'), test_df['target']

In [166]:
# Number of rows available for training.
print(f"train set size: {X_train.shape[0]}")

train set size: 13132


In [167]:
# Numeric columns: fill gaps with the most frequent value, then standardise.
numeric_features = ["game_score", "game_score_delta", "year", "message_length"]
numeric_transformer = Pipeline(steps=[
  ("imputer", SimpleImputer(strategy="most_frequent")),
  ("scaler", StandardScaler())
])

# Categorical columns: fill gaps with the most frequent category, then one-hot
# encode. The imputer previously used strategy="constant" with
# fill_value="most_frequent", which replaced missing values with the literal
# string "most_frequent" (creating a bogus category) instead of the mode.
categorical_features = ["speaker", "receiver", "season"]
categorical_transformer = Pipeline(steps=[
  ("imputer", SimpleImputer(strategy="most_frequent")),
  # Categories not seen during fit are encoded as all zeros rather than raising.
  ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# The text column is referenced by a scalar name (not a list) so that the
# vectorizer receives a 1-D column of documents.
text_feature = "cleaned_text"
text_transformer = TfidfVectorizer()

# Columns not listed in any transformer are discarded (remainder="drop").
preprocessor = ColumnTransformer(transformers=[
  ('text', text_transformer, text_feature),
  ('num', numeric_transformer, numeric_features),
  ('cat', categorical_transformer, categorical_features)
], remainder="drop")

In [168]:
# GridSearchCV expects a single (X, y) pair, so the train and validation sets
# are stacked and a predefined split tells the search how to separate them:
# -1 marks rows that are only ever used for fitting, 0 marks the single
# validation fold.
X_train_val = pd.concat([X_train, X_val], axis=0)
y_train_val = pd.concat([y_train, y_val], axis=0)
split_index = [-1] * X_train.shape[0] + [0] * X_val.shape[0]
ps = PredefinedSplit(test_fold=split_index)

In [169]:
# Imbalance strategy 1: oversample with SMOTE after preprocessing, then fit
# a LightGBM classifier on the resampled data.
pipeline_lgbm_smote = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('sampler', SMOTE(random_state=42)),
        ('model', lgb.LGBMClassifier(random_state=42)),
    ]
)

In [170]:
# Search space; keys follow the `<step>__<param>` convention of the pipeline
# (TF-IDF vocabulary cap, number of boosting rounds, learning rate).
param_grid_lgbm = dict(
    preprocessor__text__max_features=[5000],
    model__n_estimators=[100, 200],
    model__learning_rate=[0.1, 0.5, 1.0],
)

In [171]:
# Hyper-parameter search for the SMOTE pipeline on the fixed train/validation
# split. Candidates are ranked by macro-averaged F1 so that both classes count
# equally; the previous 'f1_weighted' is weighted by class support and so is
# dominated by the majority class, and it differed from the 'f1_macro' used by
# the class-weight search later in this notebook, making the two strategies
# non-comparable.
grid_search = GridSearchCV(
    pipeline_lgbm_smote,
    param_grid_lgbm,
    cv=ps,
    scoring='f1_macro',
    n_jobs=4,
    verbose=2
)

In [172]:
# Run the search on the stacked train+validation data; the predefined split
# ensures only the validation rows are used for scoring each candidate.
grid_search.fit(X_train_val, y_train_val)

# Evaluate the winning configuration on the held-out test split.
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)
print(
    classification_report(
        y_test,
        y_pred_test,
        target_names=['Truth', 'Deception'],
    )
)

Fitting 1 folds for each of 6 candidates, totalling 6 fits
[LightGBM] [Info] Number of positive: 13901, number of negative: 13901
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.089093 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 113245
[LightGBM] [Info] Number of data points in the train set: 27802, number of used features: 1821
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 13901, number of negative: 13901
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.089093 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 113245
[LightGBM] [Info] Number of data points in the train set: 27802, number of used features: 1821
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
              precision    recall  f1-score   support





In [None]:
# Imbalance strategy 2: no resampling; LightGBM reweights the classes itself
# via class_weight='balanced'.
pipeline_lgbm_weighted = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', lgb.LGBMClassifier(random_state=42, class_weight='balanced')),
    ]
)

In [None]:
# Search space for the class-weight pipeline; keys follow the
# `<step>__<param>` convention of the pipeline.
param_grid_lgbm = dict(
    preprocessor__text__max_features=[5000],
    model__n_estimators=[100, 200],
    model__learning_rate=[0.1, 0.5, 1.0],
)

In [None]:
# Hyper-parameter search for the class-weight pipeline on the fixed
# train/validation split, ranked by macro-averaged F1.
grid_search = GridSearchCV(
    estimator=pipeline_lgbm_weighted,
    param_grid=param_grid_lgbm,
    scoring='f1_macro',
    cv=ps,
    n_jobs=4,
    verbose=2,
)