In [None]:
import pandas as pd
import numpy as np
import os
import joblib
import time
from sklearn.metrics import classification_report

In [None]:
# Resolve the project folders relative to the directory the notebook runs in.
# The project root is taken to be two levels above the working directory.
current_dir = os.getcwd()
base_path = os.path.abspath(
    os.path.join(current_dir, '..', '..')
)

# Inputs (processed data) and outputs (models, reports) all hang off the root.
reports_dir = os.path.join(base_path, "reports")
model_dir = os.path.join(base_path, "models", "deceptency")
data_path = os.path.join(base_path, "data", "processed", "diplomacy")

# Make sure the output folders exist; already-existing folders are left as-is.
for _out_dir in (model_dir, reports_dir):
    os.makedirs(_out_dir, exist_ok=True)

In [None]:
# Load the test split.
# Only the test data is required: the ensemble is built from pre-trained models.
categorical_features = ["speaker", "receiver", "season"]

# Map every categorical column to the plain `object` dtype
# (presumably the dtype the saved pipelines were fitted on — verify).
col_types = dict.fromkeys(categorical_features, 'object')

test_df = pd.read_parquet(
    os.path.join(data_path, "test_processed.parquet")
).astype(col_types)

# Separate the label column from the model inputs.
y_test = test_df['target']
X_test = test_df.drop(columns='target')

print(f"Test set size: {len(X_test)}")

In [None]:
# Load the pre-trained models that take part in the ensemble.
# Maps a display name to the serialized file expected inside `model_dir`.
# NOTE(review): the NaiveBayes file ends in `_deceptency` while every other
# file ends in `_diplomacy` — confirm this matches the file actually on disk.
models_to_load = {
    'RandomForest': 'random_forest_diplomacy.pkl',
    'SVM': 'svm_diplomacy.pkl',
    'LightGBM': 'lightgbm_diplomacy.pkl',
    'MLP': 'neural_network_diplomacy.pkl',
    'NaiveBayes': 'naive_bayes_deceptency.pkl',
    'LogisticRegression': 'logistic_regression_diplomacy.pkl'
}

# Loading is best-effort: a missing or unreadable file is reported and skipped
# so the ensemble can still be built from whatever models are available.
loaded_models = {}
for name, filename in models_to_load.items():
    path = os.path.join(model_dir, filename)

    if not os.path.exists(path):
        print(f"WARNING: {filename} not found. Skipping {name}. Please run its notebook to generate the model.")
        continue

    try:
        print(f"Loading {name} from {filename}...")
        # joblib.load unpickles the file, which can execute arbitrary code:
        # only load model files produced by this project.
        model = joblib.load(path)
    except Exception as e:
        print(f"Failed to load {name}: {e}")
    else:
        loaded_models[name] = model

print(f"\nSuccessfully loaded {len(loaded_models)} models: {list(loaded_models.keys())}")

In [None]:
# Soft-voting ensemble: average the class probabilities of every loaded model
# and predict, for each test row, the class with the highest mean probability.
if not loaded_models:
    raise ValueError("No models loaded! Cannot proceed with Ensemble.")

print("Calculating Soft Voting Probabilities...")
# perf_counter is a monotonic clock, so the measured duration cannot be skewed
# by wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Collect one (n_samples, n_classes) probability matrix per model.
all_probs = []
ensemble_classes = None  # class labels, in predict_proba column order
for name, model in loaded_models.items():
    print(f"Predicting with {name}...")
    # The saved models are expected to be full pipelines (preprocessing
    # included), so they are fed the raw DataFrame.
    probs = np.asarray(model.predict_proba(X_test))

    # predict_proba columns follow the estimator's `classes_` order. Averaging
    # is only meaningful when every model uses the same order, so verify it
    # instead of silently mixing columns. Models that do not expose `classes_`
    # are accepted as-is.
    model_classes = getattr(model, "classes_", None)
    if model_classes is not None:
        model_classes = np.asarray(model_classes)
        if ensemble_classes is None:
            ensemble_classes = model_classes
        elif not np.array_equal(model_classes, ensemble_classes):
            raise ValueError(
                f"Model '{name}' has classes {model_classes.tolist()} but the ensemble "
                f"expects {ensemble_classes.tolist()}; probabilities cannot be averaged."
            )

    # Fail with a message naming the offending model rather than letting a
    # ragged stack surface as an obscure numpy error below.
    if all_probs and probs.shape != all_probs[0].shape:
        raise ValueError(
            f"Model '{name}' returned probabilities of shape {probs.shape}, "
            f"expected {all_probs[0].shape}."
        )

    all_probs.append(probs)

# Stack for averaging (Shape: [n_models, n_samples, n_classes])
all_probs_array = np.array(all_probs)

# Average probabilities over the model axis (Soft Voting)
avg_probs = np.mean(all_probs_array, axis=0)

# argmax yields a column index; translate it back to the actual class label
# when the labels are known. For labels 0..k-1 this equals the index itself.
best_columns = np.argmax(avg_probs, axis=1)
if ensemble_classes is not None:
    y_pred_test = ensemble_classes[best_columns]
else:
    y_pred_test = best_columns

end_time = time.perf_counter()
inference_time_minutes = (end_time - start_time) / 60
print(f"Inference time: {inference_time_minutes:.2f} minutes")

In [None]:
# Print the per-class precision/recall/F1 table for the ensemble predictions.
print("\n--- FINAL ENSEMBLE TEST SET PERFORMANCE REPORT ---")
# Display names are applied to the labels in sorted order.
report_str = classification_report(
    y_true=y_test,
    y_pred=y_pred_test,
    target_names=['Truth', 'Deception'],
)
print(report_str)

In [None]:
# Save Results
# Append one summary row for this ensemble run to the shared results CSV.
# Passing target_names makes the report keys independent of the label dtype:
# keying on '0'/'1' raises KeyError as soon as the target is stored as float
# or bool. The names are applied to the labels in sorted order, the same
# mapping the printed report uses.
class_names = ['Truth', 'Deception']
report_dict = classification_report(
    y_test, y_pred_test, target_names=class_names, output_dict=True
)
truth_metrics = report_dict['Truth']
deception_metrics = report_dict['Deception']
model_names_str = "+".join(loaded_models.keys())

result_data = {
    'category': 'Deceptency_Ensemble_VotingSoft_PreTrained',
    'best_cv_f1_score': 'N/A',  # no cross-validation: the models are pre-trained
    'best_params': f'Soft Voting ({model_names_str})',
    'test_accuracy': report_dict['accuracy'],
    'test_f1_truth': truth_metrics['f1-score'],
    'test_precision_truth': truth_metrics['precision'],
    'test_f1_deception': deception_metrics['f1-score'],
    'test_precision_deception': deception_metrics['precision'],
    'training_time_minutes': inference_time_minutes # Note: This is inference time here
}

results_file = os.path.join(reports_dir, "model_results_deceptency_ensemble_diplomacy.csv")
result_df = pd.DataFrame([result_data])
# Write the header when the file is new or exists but is still empty, so the
# CSV never ends up as header-less data rows.
header = not os.path.exists(results_file) or os.path.getsize(results_file) == 0
# Append mode: every run of this cell adds another row to the file.
result_df.to_csv(results_file, mode='a', header=header, index=False)
print(f"Results saved to {results_file}")