In [12]:
# This script performs Soft Voting on the saved probabilities from the SVM and CNN-RNN models.
# It assumes two files have been generated by Stages 1 and 2:
# 1. 'svm_predictions_test.csv'
# 2. 'cnn_rnn_predictions_test.csv'
import pandas as pd
import numpy as np
import os
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

In [13]:
print("--- 1. Loading Predictions for Ensemble Voting ---")

--- 1. Loading Predictions for Ensemble Voting ---


In [14]:
# Define the prediction file names
svm_file = 'svm_predictions_test.csv'
cnn_rnn_file = 'cnn_rnn_predictions_test.csv'

In [15]:
# Initialize DataFrames outside the try block to avoid NameError if loading fails
svm_df = pd.DataFrame()
cnn_rnn_df = pd.DataFrame()

In [18]:
try:
    # Load SVM predictions (includes 'true_label' and 'svm_prob')
    svm_df = pd.read_csv(svm_file)
    print(f"Loaded SVM predictions from {svm_file}")

    # Load CNN-RNN predictions (includes 'true_label' and 'cnn_rnn_prob')
    cnn_rnn_df = pd.read_csv(cnn_rnn_file)
    print(f"Loaded CNN-RNN predictions from {cnn_rnn_file}")

except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Cannot proceed. Please ensure the SVM and CNN-RNN scripts ran successfully and saved their prediction files.")
    exit()


Loaded SVM predictions from svm_predictions_test.csv
Loaded CNN-RNN predictions from cnn_rnn_predictions_test.csv


In [19]:
def validate_predictions(df, required_cols, filename):
    """Checks if the necessary columns exist in the loaded DataFrame."""
    missing_cols = [col for col in required_cols if col not in df.columns]
    if missing_cols:
        print(f"\n--- ERROR: Column Mismatch in {filename} ---")
        print(f"File Columns Found: {df.columns.tolist()}")
        print(f"Missing Required Columns: {missing_cols}")
        print("Please rerun the corresponding model script (SVM or CNN-RNN) to generate the correct file.")
        exit()

In [20]:
# If the program reached this point, the DataFrames should be successfully loaded.
if svm_df.empty or cnn_rnn_df.empty:
    print("\nError: One or both prediction files were loaded but appear empty. Check the previous model steps.")
    exit()


In [21]:
# Perform validation check before merging
validate_predictions(svm_df, ['true_label', 'svm_prob'], svm_file)
validate_predictions(cnn_rnn_df, ['true_label', 'cnn_rnn_prob'], cnn_rnn_file)

In [22]:
# 2. Combine Predictions and Align Data

# We merge the two prediction files using the shared 'true_label' column
# This step ensures we are comparing predictions from the same test samples.
ensemble_df = pd.merge(
    svm_df[['true_label', 'svm_prob']],
    cnn_rnn_df[['true_label', 'cnn_rnn_prob']],
    on='true_label',
    how='inner' # Only include samples present in both dataframes
)
Y_true = ensemble_df['true_label'].values
print(f"\nCombined {len(ensemble_df)} test sample predictions.")



Combined 32699785 test sample predictions.


In [23]:
# 3. Soft Voting (Weighted Averaging) Ensemble
print("\n--- 3. Performing Soft Voting Ensemble ---")


--- 3. Performing Soft Voting Ensemble ---


In [24]:
# Determine optimal weights.
# we will use weights based on their individual performance:
# If CNN-RNN had 95% accuracy and SVM had 85% accuracy, the ratio of their weights
# should be 95:85, or roughly 55% for CNN-RNN and 45% for SVM.

# Example Weights (adjust these based on the actual accuracy results you got):
W_SVM = 0.45
W_CNN_RNN = 0.55
print(f"Ensemble Weights: SVM ({W_SVM:.2f}), CNN-RNN ({W_CNN_RNN:.2f})")

Ensemble Weights: SVM (0.45), CNN-RNN (0.55)


In [25]:
# Calculate the final ensemble probability (Soft Voting)
Y_ensemble_prob = (
    (ensemble_df['svm_prob'] * W_SVM) +
    (ensemble_df['cnn_rnn_prob'] * W_CNN_RNN)
)

In [26]:
# Convert the ensemble probability to a class prediction (0 or 1) using a 0.5 threshold
Y_ensemble_class = (Y_ensemble_prob > 0.5).astype(int)

In [27]:
# 4. Final Evaluation of Ensemble Model

print("\n--- 4. Ensemble Model Evaluation ---")

# Calculate standard metrics
ensemble_accuracy = accuracy_score(Y_true, Y_ensemble_class)
ensemble_auc = roc_auc_score(Y_true, Y_ensemble_prob)
print(f"Overall Ensemble Accuracy: {ensemble_accuracy:.4f}")
print(f"Overall Ensemble AUC Score: {ensemble_auc:.4f}")


--- 4. Ensemble Model Evaluation ---
Overall Ensemble Accuracy: 0.9709
Overall Ensemble AUC Score: 0.9965


In [28]:
# Display the final Classification Report
print("\n--- Results: Ensemble Voting Classification Report ---")
print(classification_report(Y_true, Y_ensemble_class, target_names=['Real (0)', 'Fake (1)']))


--- Results: Ensemble Voting Classification Report ---
              precision    recall  f1-score   support

    Real (0)       0.97      0.97      0.97  16353936
    Fake (1)       0.97      0.97      0.97  16345849

    accuracy                           0.97  32699785
   macro avg       0.97      0.97      0.97  32699785
weighted avg       0.97      0.97      0.97  32699785



In [None]:
# You can now compare this report to the individual SVM and CNN-RNN reports
# to show the improvement from the ensemble method.