In [1]:
import pandas as pd
import numpy as np
import os
import joblib
import time
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline

In [2]:
# Resolve the project root (two levels above the working directory) and the
# data / model / report folders that hang off it.
current_dir = os.getcwd()
two_levels_up = os.path.join(current_dir, '..', '..')
base_path = os.path.abspath(two_levels_up)

data_path, model_dir, reports_dir = (
    os.path.join(base_path, *parts)
    for parts in (
        ("data", "processed", "diplomacy"),
        ("models", "deceptency"),
        ("reports",),
    )
)

In [3]:
# Create the model and report folders up front; exist_ok makes re-runs a no-op.
for output_dir in (model_dir, reports_dir):
    os.makedirs(output_dir, exist_ok=True)

In [4]:
# Columns that are cast to plain Python objects after loading so they are
# handled as categories by the downstream encoder.
categorical_features = ["speaker", "receiver", "season"]
col_types = dict.fromkeys(categorical_features, 'object')


def _read_split(file_name):
    """Read one processed parquet file from data_path and cast the categorical columns."""
    return pd.read_parquet(os.path.join(data_path, file_name)).astype(col_types)


def _split_xy(frame):
    """Return (features, labels): all columns except 'target', and the 'target' column."""
    return frame.drop(columns='target'), frame['target']


train_df = _read_split("train_processed.parquet")
val_df = _read_split("val_processed.parquet")
test_df = _read_split("test_processed.parquet")

X_train, y_train = _split_xy(train_df)
X_val, y_val = _split_xy(val_df)
X_test, y_test = _split_xy(test_df)

In [5]:
# CHECK FOR EXISTING MODEL
# When a fitted pipeline is already on disk, report its test-set metrics and
# halt this cell with SystemExit instead of falling through to retraining.
model_path = os.path.join(model_dir, 'svm_diplomacy.pkl')
model_already_trained = os.path.exists(model_path)

if model_already_trained:
    print(f'Model found at {model_path}. Loading...')
    # NOTE: joblib.load unpickles the file - only load artifacts you trust.
    best_model = joblib.load(model_path)

    print('Evaluating existing model...')
    y_pred_test = best_model.predict(X_test)
    existing_report = classification_report(
        y_test,
        y_pred_test,
        target_names=['Truth', 'Deception'],
    )
    print(existing_report)

    print('Model already exists. Stopping execution to prevent retraining.')
    raise SystemExit('Model already exists.')


Model found at c:\work environment\Projects\amazon-spam-review\models\deceptency\svm_diplomacy.pkl. Loading...
Evaluating existing model...
              precision    recall  f1-score   support

       Truth       0.93      0.92      0.92      2501
   Deception       0.23      0.25      0.24       240

    accuracy                           0.86      2741
   macro avg       0.58      0.58      0.58      2741
weighted avg       0.87      0.86      0.86      2741

Model already exists. Stopping execution to prevent retraining.


SystemExit: Model already exists.

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [None]:
# Row counts of each split, one line per split.
for split_label, split_frame in (
    ("Train", X_train),
    ("Validation", X_val),
    ("Test", X_test),
):
    print(f"{split_label} set size: {len(split_frame)}")

In [None]:
# Numeric columns: fill gaps with the most frequent value, then standardize.
numeric_features = ["game_score", "game_score_delta", "year", "message_length"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", StandardScaler())
])

# Categorical columns: fill gaps with the column mode, then one-hot encode.
# strategy="most_frequent" is required for mode imputation; the earlier
# strategy="constant" with fill_value="most_frequent" inserted the literal
# string "most_frequent" as if it were a real category.
# handle_unknown="ignore" keeps categories unseen during fit from raising
# at predict time.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Free-text column: TF-IDF with default settings (max_features is set by the
# hyperparameter grid).
text_feature = "cleaned_text"
text_transformer = TfidfVectorizer()

# Combine the three branches; columns not listed here are dropped.
preprocessor = ColumnTransformer(transformers=[
    ('text', text_transformer, text_feature),
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
], remainder="drop")

In [None]:
# Stack train and validation rows into one dataset for GridSearchCV.
X_train_val = pd.concat((X_train, X_val))
y_train_val = pd.concat((y_train, y_val))

# Fold markers for PredefinedSplit: -1 keeps a row out of every validation
# fold (train only), 0 places it in the single validation fold.
n_train, n_val = len(X_train), len(X_val)
split_index = [-1 if row < n_train else 0 for row in range(n_train + n_val)]
ps = PredefinedSplit(test_fold=split_index)

In [None]:
# Full pipeline: preprocessing followed by a class-weighted SVM.
# The kernel given here is only a starting value; the hyperparameter grid
# overrides it during the search.
pipeline_svm = Pipeline([
    ('preprocessor', preprocessor),
    ('model', SVC(
        kernel='linear',
        class_weight='balanced',
        probability=True,
        # probability=True fits its calibration with an internal shuffled
        # cross-validation; fixing the seed makes predict_proba reproducible.
        random_state=42
    ))
])

In [None]:
# Hyperparameter grid: TF-IDF vocabulary cap, SVM regularisation strength,
# and kernel type. Keys follow the "<step>__<param>" pipeline convention.
param_grid_svm = dict(
    preprocessor__text__max_features=[5000],
    model__C=[0.1, 1, 10],
    model__kernel=['linear', 'rbf'],
)

In [None]:
# Train
# Wall-clock timing brackets both the search construction and the fit.
start_time = time.time()

search_options = {
    'cv': ps,                # single predefined train/validation split
    'scoring': 'f1_macro',
    'n_jobs': 4,
    'verbose': 2,
}
grid_search = GridSearchCV(pipeline_svm, param_grid_svm, **search_options)

grid_search.fit(X_train_val, y_train_val)

end_time = time.time()

In [None]:
# Convert the measured wall-clock seconds to minutes and show the winning
# hyperparameter combination.
elapsed_seconds = end_time - start_time
training_time_minutes = elapsed_seconds / 60
print(f"Training time: {training_time_minutes:.2f} minutes")
print(f"Best params: {grid_search.best_params_}")

In [None]:
# Score the best pipeline found by the grid search on the held-out test split.
best_model = grid_search.best_estimator_
y_pred_test = best_model.predict(X_test)

class_labels = ['Truth', 'Deception']
report_str = classification_report(y_test, y_pred_test, target_names=class_labels)
print("\n--- FINAL TEST SET PERFORMANCE REPORT ---", report_str, sep="\n")

In [None]:
# Save Model
# `skip_training` is not defined in any visible cell of this notebook, so
# referencing it directly raised NameError and the model was never written.
# Treat a missing flag as False (i.e. save), while still honouring the flag
# if some other cell defines it.
if not globals().get("skip_training", False):
    model_filename = "svm_diplomacy.pkl"
    saved_model_path = os.path.join(model_dir, model_filename)
    joblib.dump(best_model, saved_model_path)
    print(f"Model saved to {saved_model_path}")

In [None]:
# Save Results
# Pass the same target_names used for the printed reports so the per-class
# entries are keyed by name. Indexing with '0' / '1' only works when the
# labels happen to stringify to exactly those values (e.g. boolean labels
# would produce 'False' / 'True' and raise KeyError).
class_names = ['Truth', 'Deception']
report_dict = classification_report(
    y_test,
    y_pred_test,
    target_names=class_names,
    output_dict=True,
)
truth_metrics = report_dict['Truth']
deception_metrics = report_dict['Deception']

result_data = {
    'category': 'Deceptency_SVM_Diplomacy',
    'best_cv_f1_score': grid_search.best_score_,
    'best_params': str(grid_search.best_params_),
    'test_accuracy': report_dict['accuracy'],
    'test_f1_truth': truth_metrics['f1-score'],
    'test_precision_truth': truth_metrics['precision'],
    'test_f1_deception': deception_metrics['f1-score'],
    'test_precision_deception': deception_metrics['precision'],
    'training_time_minutes': training_time_minutes
}

# Append one row per run; write the header only when the file is first created.
results_file = os.path.join(reports_dir, "model_results_deceptency_svm_diplomacy.csv")
result_df = pd.DataFrame([result_data])
header = not os.path.exists(results_file)
result_df.to_csv(results_file, mode='a', header=header, index=False)
print(f"Results saved to {results_file}")