In [None]:
import pandas as pd
import numpy as np
import os
import joblib
import time
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline

In [None]:
# Project layout: every location below is derived from the directory two
# levels above the notebook's current working directory.
current_dir = os.getcwd()
base_path = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir))

# Input data location.
data_path = os.path.join(base_path, *("data", "processed", "diplomacy"))

# Output locations for models and reports.
model_dir, reports_dir = (
    os.path.join(base_path, "models", "deceptency"),
    os.path.join(base_path, "reports"),
)

In [None]:
# Create the output directories up front; exist_ok makes re-running this cell a no-op.
for _output_dir in (model_dir, reports_dir):
    os.makedirs(_output_dir, exist_ok=True)

In [None]:
# Columns treated as categorical; they are cast to object dtype after loading.
categorical_features = ["speaker", "receiver", "season"]
col_types = dict.fromkeys(categorical_features, 'object')


def _load_split(file_name):
    """Read one processed parquet file from `data_path` and cast the categorical columns."""
    frame = pd.read_parquet(os.path.join(data_path, file_name))
    return frame.astype(col_types)


train_df = _load_split("train_processed.parquet")
val_df = _load_split("val_processed.parquet")
test_df = _load_split("test_processed.parquet")

# Separate the features from the 'target' label column for each split.
X_train, y_train = train_df.drop(columns='target'), train_df['target']
X_val, y_val = val_df.drop(columns='target'), val_df['target']
X_test, y_test = test_df.drop(columns='target'), test_df['target']

In [None]:
# Report the number of rows in each split, one line per split.
print(
    f"Train set size: {len(X_train)}",
    f"Validation set size: {len(X_val)}",
    f"Test set size: {len(X_test)}",
    sep="\n",
)

In [None]:
# Numeric columns: fill missing entries with each column's most frequent
# value, then standardize.
numeric_features = ["game_score", "game_score_delta", "year", "message_length"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill missing entries with the column mode, then one-hot
# encode. The strategy must be "most_frequent" itself; combining
# strategy="constant" with fill_value="most_frequent" would insert the literal
# string "most_frequent" as a made-up category instead of the actual mode.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    # Categories unseen during fit are encoded as all zeros rather than raising.
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Text column: TF-IDF features. The column is selected by a plain string (not
# a list) so the vectorizer receives a 1-D sequence of documents.
text_feature = "cleaned_text"
text_transformer = TfidfVectorizer()

# Combine the three branches; columns not listed here are discarded.
preprocessor = ColumnTransformer(transformers=[
    ('text', text_transformer, text_feature),
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
], remainder="drop")

In [None]:
# Stack the training and validation rows (train first, then validation) so a
# single fixed hold-out split can be described by position.
X_train_val = pd.concat([X_train, X_val], axis=0)
y_train_val = pd.concat([y_train, y_val], axis=0)

# One fold label per stacked row: -1 for every training row, 0 for every
# validation row. PredefinedSplit never places rows labelled -1 in a test
# fold, so the only split is "train rows -> fit, validation rows -> score".
split_index = [
    fold_label
    for fold_label, part in ((-1, X_train), (0, X_val))
    for _ in range(len(part))
]
ps = PredefinedSplit(test_fold=split_index)

In [None]:
# Full model: shared preprocessing followed by a support vector classifier.
pipeline_svm = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVC(
        kernel='linear',
        # Reweight classes inversely to their frequency in the training data.
        class_weight='balanced',
        # Enables predict_proba; SVC fits it with an internal cross-validation
        # that shuffles the data, so the seed below is needed for
        # run-to-run reproducibility.
        probability=True,
        random_state=42
    ))
])

In [None]:
# Hyper-parameter grid for the SVM pipeline. Keys follow the
# "<step>__<param>" convention for addressing nested pipeline parameters.
param_grid_svm = dict(
    preprocessor__text__max_features=[5000],  # TF-IDF vocabulary cap
    model__C=[0.1, 1, 10],                    # regularization strengths to try
    model__kernel=['linear', 'rbf'],          # kernels to try
)