In [1]:
import pandas as pd
import numpy as np
import os
import joblib
import time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from imblearn.pipeline import Pipeline

In [2]:
# Project root is taken to be two directory levels above the working directory.
current_dir = os.getcwd()
base_path = os.path.abspath(os.path.join(current_dir, os.pardir, os.pardir))

# Input data location and the two output locations used by later cells.
data_path = os.path.join(base_path, "data", "processed", "diplomacy")
model_dir = os.path.join(base_path, "models", "deceptency")
reports_dir = os.path.join(base_path, "reports")

# Create the output directories up front; existing directories are left alone.
for output_dir in (model_dir, reports_dir):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# CHECK FOR EXISTING MODEL
# Try to reuse a previously saved model; `skip_training` records the outcome
# so the training cell knows whether a grid search is still needed.
# NOTE: joblib.load unpickles the file, so only load model files you trust.
model_path = os.path.join(model_dir, 'logistic_regression_diplomacy.pkl')
try:
    best_model = joblib.load(model_path)
except FileNotFoundError:
    print('No existing model found. Will train a new model.')
    skip_training = False
else:
    print(f'Model loaded from {model_path}')
    print('Skipping training and proceeding to evaluation...')
    skip_training = True

In [3]:
categorical_features = ["speaker", "receiver", "season"]
col_types = dict.fromkeys(categorical_features, 'object')


def _read_split(file_name):
    """Read one processed parquet split and cast the categorical columns to object dtype."""
    frame = pd.read_parquet(os.path.join(data_path, file_name))
    return frame.astype(col_types)


train_df = _read_split("train_processed.parquet")
val_df = _read_split("val_processed.parquet")
test_df = _read_split("test_processed.parquet")

# Separate the label column ('target') from the feature columns for each split.
X_train, y_train = train_df.drop(columns='target'), train_df['target']
X_val, y_val = val_df.drop(columns='target'), val_df['target']
X_test, y_test = test_df.drop(columns='target'), test_df['target']

print(f"Train set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

Train set size: 13132
Validation set size: 1416
Test set size: 2741


In [4]:
# Numeric columns: fill gaps with the most frequent value, then standardise.
numeric_features = ["game_score", "game_score_delta", "year", "message_length"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("scaler", StandardScaler())
])

# Categorical columns: impute with the column mode, then one-hot encode.
# strategy="most_frequent" is what selects the mode. The previous
# strategy="constant" with fill_value="most_frequent" filled missing values
# with the literal string "most_frequent", creating a spurious category.
# Categories unseen at fit time are encoded as all zeros (handle_unknown="ignore").
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# The text column is referenced by a plain string (not a list) so that the
# ColumnTransformer hands TfidfVectorizer a 1-D sequence of documents.
text_feature = "cleaned_text"
text_transformer = TfidfVectorizer()

# Combine the three feature groups; any column not listed is discarded.
preprocessor = ColumnTransformer(transformers=[
    ('text', text_transformer, text_feature),
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
], remainder="drop")

In [5]:
# Stack train and validation rows so a single fit call can see both.
X_train_val = pd.concat([X_train, X_val], axis=0)
y_train_val = pd.concat([y_train, y_val], axis=0)

# PredefinedSplit convention: -1 keeps a row in training for every split,
# 0 assigns it to the (single) validation fold.
n_train_rows, n_val_rows = len(X_train), len(X_val)
split_index = [
    -1 if row < n_train_rows else 0
    for row in range(n_train_rows + n_val_rows)
]
ps = PredefinedSplit(test_fold=split_index)

In [None]:
# Class-weighted logistic regression sitting behind the shared preprocessor.
lr_estimator = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
pipeline_lr = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', lr_estimator),
])

# Search space: TF-IDF vocabulary size, regularisation strength and solver.
# Keys follow the `<step>__<param>` convention for nested estimators.
param_grid_lr = dict([
    ('preprocessor__text__max_features', [5000, 10000]),
    ('model__C', [0.1, 1, 10]),
    ('model__solver', ['lbfgs', 'liblinear']),
])

In [None]:
if not skip_training:
    # Grid search over param_grid_lr, scored by macro F1 on the single
    # predefined validation fold (ps).
    start_time = time.time()
    grid_search = GridSearchCV(
        pipeline_lr,
        param_grid_lr,
        cv=ps,
        scoring='f1_macro',
        n_jobs=4,
        verbose=2
    )
    grid_search.fit(X_train_val, y_train_val)
    end_time = time.time()

    training_time_minutes = (end_time - start_time) / 60
    print(f"Training time: {training_time_minutes:.2f} minutes")
    print(f"Best params: {grid_search.best_params_}")

    # Get best model
    best_model = grid_search.best_estimator_
else:
    print("Using loaded model, training skipped")
    training_time_minutes = 0.0
    # For reporting purposes when model is loaded: a lightweight stand-in that
    # exposes the attributes later cells read from a fitted GridSearchCV.
    # `best_estimator_` points at the loaded model so that code reading
    # grid_search.best_estimator_ keeps working instead of raising AttributeError.
    grid_search = type('obj', (object,), {
        'best_score_': 'N/A (model loaded)',
        'best_params_': 'N/A (model loaded)',
        'best_estimator_': best_model
    })()

Fitting 1 folds for each of 12 candidates, totalling 12 fits
Training time: 0.11 minutes
Best params: {'model__C': 10, 'model__solver': 'lbfgs', 'preprocessor__text__max_features': 10000}


In [8]:
# Evaluate on the held-out test set.
# Use `best_model` directly rather than re-reading grid_search.best_estimator_:
# it is already bound by either the model-loading cell or the training cell,
# and when a saved model was loaded no real GridSearchCV was ever fitted, so
# the loaded model is the only reliable handle.
y_pred_test = best_model.predict(X_test)

print("\n--- FINAL TEST SET PERFORMANCE REPORT ---")
report_str = classification_report(y_test, y_pred_test, target_names=['Truth', 'Deception'])
print(report_str)


--- FINAL TEST SET PERFORMANCE REPORT ---
              precision    recall  f1-score   support

       Truth       0.92      0.82      0.87      2501
   Deception       0.13      0.28      0.18       240

    accuracy                           0.77      2741
   macro avg       0.53      0.55      0.52      2741
weighted avg       0.85      0.77      0.81      2741



In [None]:
# Save Model
# Persist only a freshly trained model. When the model was loaded from disk
# earlier in the notebook, writing it back to the same file is redundant.
model_filename = "logistic_regression_diplomacy.pkl"
if skip_training:
    print("Model was loaded from disk; skipping save.")
else:
    joblib.dump(best_model, os.path.join(model_dir, model_filename))
    print(f"Model saved to {os.path.join(model_dir, model_filename)}")

Model saved to c:\work environment\Projects\amazon-spam-review\models\deceptency\logistic_regression_diplomacy.pkl


In [10]:
# Save Results with extended metrics
# Request the report with explicit class names so the per-class entries are
# keyed by 'Truth' / 'Deception' regardless of how the labels are typed
# (int, bool, str). Names are assigned to the labels in sorted order, the same
# mapping used for the printed test report.
class_names = ['Truth', 'Deception']
report_dict = classification_report(
    y_test, y_pred_test, target_names=class_names, output_dict=True
)
truth_metrics = report_dict['Truth']
deception_metrics = report_dict['Deception']

# Column names are kept stable so rows can be appended to an existing file.
result_data = {
    'category': 'Deceptency_LogisticRegression_Diplomacy',
    'best_cv_f1_score': grid_search.best_score_,
    'best_params': str(grid_search.best_params_),
    'test_accuracy': report_dict['accuracy'],
    'test_f1_truth': truth_metrics['f1-score'],
    'test_precision_truth': truth_metrics['precision'],
    'test_f1_deception': deception_metrics['f1-score'],
    'test_precision_deception': deception_metrics['precision'],
    'training_time_minutes': training_time_minutes
}

results_file = os.path.join(reports_dir, "model_results_deceptency_logistic_regression_diplomacy.csv")
result_df = pd.DataFrame([result_data])
# Write the header when the file is new or exists but is empty; otherwise
# append the row below the existing header.
header = not os.path.exists(results_file) or os.path.getsize(results_file) == 0
result_df.to_csv(results_file, mode='a', header=header, index=False)
print(f"Results saved to {results_file}")

Results saved to c:\work environment\Projects\amazon-spam-review\reports\model_results_deceptency_logistic_regression_diplomacy.csv
