## 1. Setup: Imports and Configuration

In [None]:
import numpy as np
import pandas as pd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import os
import time
import warnings
from joblib import dump, load

# Use tqdm.auto for notebook/script compatibility
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, classification_report, roc_auc_score,
    precision_recall_fscore_support, confusion_matrix
)
import xgboost as xgb

import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import (
    Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization, Input
)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam


# Ignore librosa warnings about audioread potentially failing on MP3s
warnings.filterwarnings('ignore', module='librosa')

In [None]:
# --- Configuration ---
# All tunable settings for the notebook live in this cell.
# IMPORTANT: Change this path to the root directory of your extracted dataset.
# The directory must contain a 'real' and a 'fake' subfolder (see REAL_DIR / FAKE_DIR).
DATASET_PATH = 'C:/Users/USER/Downloads/Authentica---DeepFake-detection/audio_final_new/archive/for-2sec/for-2seconds/training'
REAL_DIR = os.path.join(DATASET_PATH, 'real')
FAKE_DIR = os.path.join(DATASET_PATH, 'fake')

SAMPLE_RATE = 16000  # Sample rate (Hz) passed to librosa.load for every file
SEED = 42          # Random seed used for the shuffle, the train/val split and XGBoost

# Feature Extraction Params
N_MFCC = 20         # Number of MFCC coefficients (each contributes a mean and a std feature)

# Spectrogram Params
N_MELS = 128        # Number of Mel bands (spectrogram height)
HOP_LENGTH = 512    # Hop length (samples) for the Mel spectrogram
N_FFT = 2048        # FFT window size (samples)
# Fixed number of time frames; shorter spectrograms are padded, longer ones truncated.
# NOTE(review): 63 frames is roughly 2 s at SAMPLE_RATE / HOP_LENGTH - adjust for other clip lengths.
SPEC_MAX_LEN = 63 # Fixed time dimension for spectrograms (ADJUST BASED ON YOUR DATA ANALYSIS)

# Model Training Params
TEST_SIZE = 0.2     # Proportion of the data held out for validation
BATCH_SIZE = 32     # Batch size for CNN training and CNN test-set prediction
EPOCHS = 50         # Max epochs for CNN training
XGB_PARAMS = {      # Example XGBoost parameters (tune these); unpacked into xgb.XGBClassifier
    'objective': 'binary:logistic',
    'early_stopping_rounds': 20,  # Stop once the eval metric has not improved for this many rounds
    'eval_metric': 'logloss', # Metric computed on the eval_set ('auc' or 'error' are alternatives)
    'eta': 0.1,          # Learning rate
    'max_depth': 4,
    'subsample': 0.8,    # Fraction of samples used per tree
    'colsample_bytree': 0.8, # Fraction of features used per tree
    'min_child_weight': 1,
    'gamma': 0.1,        # Minimum loss reduction to make a split
    'lambda': 1,         # L2 regularization
    'alpha': 0,          # L1 regularization
    'seed': SEED
}
CNN_LEARNING_RATE = 0.0001  # Learning rate for the Adam optimizer used by the CNN
EARLY_STOPPING_PATIENCE = 10  # Patience (epochs) for the Keras EarlyStopping callback

## 2. Data Loading and Preparation

Load audio file paths, assign labels (0=Real, 1=Fake), shuffle, and split into training and validation sets.

In [None]:
# Build a shuffled DataFrame of (filepath, label) pairs from REAL_DIR / FAKE_DIR and
# split it into stratified training and validation sets.
# Labels: 0 = real, 1 = fake.
print("Scanning dataset directories...")
filepaths = []
labels = []

# Ensure directories exist
if not os.path.isdir(REAL_DIR):
    raise ValueError(f"Real directory not found: {REAL_DIR}\nPlease check DATASET_PATH.")
if not os.path.isdir(FAKE_DIR):
    raise ValueError(f"Fake directory not found: {FAKE_DIR}\nPlease check DATASET_PATH.")

# Load real files
# os.listdir() returns entries in arbitrary, platform-dependent order. Sorting the
# names makes the seeded shuffle and split below reproducible across machines/runs.
print(f"Looking for audio files in: {REAL_DIR}")
real_files = sorted(f for f in os.listdir(REAL_DIR) if f.lower().endswith(('.wav', '.mp3', '.flac')))
print(f"Found {len(real_files)} potential real files.")
for filename in tqdm(real_files, desc="Loading real file paths"):
    filepaths.append(os.path.join(REAL_DIR, filename))
    labels.append(0) # 0 for real

# Load fake files
print(f"\nLooking for audio files in: {FAKE_DIR}")
fake_files = sorted(f for f in os.listdir(FAKE_DIR) if f.lower().endswith(('.wav', '.mp3', '.flac')))
print(f"Found {len(fake_files)} potential fake files.")
for filename in tqdm(fake_files, desc="Loading fake file paths"):
    filepaths.append(os.path.join(FAKE_DIR, filename))
    labels.append(1) # 1 for fake

if not filepaths:
    raise ValueError(f"No audio files found in {REAL_DIR} or {FAKE_DIR}. Check paths and file extensions.")

# Create DataFrame and shuffle it deterministically (SEED) so real/fake rows are interleaved
df = pd.DataFrame({'filepath': filepaths, 'label': labels})
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True) # Shuffle

print(f"\nFound {len(df)} total audio files.")
print("Label distribution:")
print(df['label'].value_counts())

# Split data (using indices so both model paths see exactly the same rows)
train_indices, val_indices = train_test_split(
    df.index,
    test_size=TEST_SIZE,
    random_state=SEED,
    stratify=df['label'] # Keep the real/fake ratio equal in both splits
)

train_df = df.loc[train_indices].reset_index(drop=True)
val_df = df.loc[val_indices].reset_index(drop=True)

# Label arrays shared by the XGBoost and CNN paths
y_train = train_df['label'].values
y_val = val_df['label'].values

print(f"\nTraining set size: {len(train_df)} ({len(y_train)})")
print(f"Validation set size: {len(val_df)} ({len(y_val)})")
print("Training label distribution:")
print(train_df['label'].value_counts())
print("Validation label distribution:")
print(val_df['label'].value_counts())

## 3. Helper Functions

Functions for loading audio, extracting aggregated features, and generating spectrograms.

In [None]:
def load_audio(filepath, sr=SAMPLE_RATE):
    """Read one audio file with librosa, resampled to `sr`.

    Returns the sample array, or None when the file decodes to zero samples
    or librosa raises while loading. Both failure cases print a message
    instead of propagating, so a bad file does not abort a long batch run.
    """
    try:
        # duration=None reads the whole file
        samples, _ = librosa.load(filepath, sr=sr, duration=None, res_type='kaiser_fast')
        if len(samples) > 0:
            return samples
        print(f"Warning: Empty audio loaded from {filepath}")
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
    return None

def extract_features(audio, sr=SAMPLE_RATE, n_mfcc=N_MFCC):
    """Summarise an audio signal as a fixed-length feature vector.

    Layout of the returned 1-D array (length ``2 * n_mfcc + 7``):
    per-coefficient MFCC means, per-coefficient MFCC stds, then the global
    mean and std of chroma, spectral contrast and zero-crossing rate, and
    finally the global mean of the RMS energy.

    When `audio` is None or empty, an all-zero vector of the same length is
    returned so callers can still stack the results into one matrix.
    """
    if audio is None or len(audio) == 0:
        # 2 * n_mfcc MFCC stats + 3 * (mean, std) + RMS mean
        return np.zeros(2 * n_mfcc + 7)

    # MFCCs are summarised per coefficient (along the time axis)
    mfcc_matrix = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
    stats = [*np.mean(mfcc_matrix, axis=1), *np.std(mfcc_matrix, axis=1)]

    # These three are collapsed to a single mean and std over all bins and frames
    pooled_sources = (
        librosa.feature.chroma_stft(y=audio, sr=sr),
        librosa.feature.spectral_contrast(y=audio, sr=sr),
        librosa.feature.zero_crossing_rate(y=audio),
    )
    for frames in pooled_sources:
        stats.append(np.mean(frames))
        stats.append(np.std(frames))

    # RMS energy contributes only its mean (its std is deliberately left out)
    stats.append(np.mean(librosa.feature.rms(y=audio)))

    return np.array(stats)

def generate_spectrogram(audio, sr=SAMPLE_RATE, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH, max_len=SPEC_MAX_LEN):
    """Compute a dB-scaled Mel spectrogram with exactly `max_len` time frames.

    Spectrograms shorter than `max_len` are right-padded with their own
    minimum value; longer ones are cut off after `max_len` frames.
    Returns an all-zero ``(n_mels, max_len)`` array when `audio` is None or
    when any step raises (the error is printed).
    """
    if audio is None:
        return np.zeros((n_mels, max_len))

    try:
        power_spec = librosa.feature.melspectrogram(y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels)
        # Log scale, relative to the loudest bin of this spectrogram
        spec_db = librosa.power_to_db(power_spec, ref=np.max)

        # Force the time axis (axis=1) to max_len frames
        missing = max_len - spec_db.shape[1]
        if missing > 0:
            floor_db = np.min(spec_db)
            spec_db = np.pad(spec_db, ((0, 0), (0, missing)), mode='constant', constant_values=floor_db)
        elif missing < 0:
            spec_db = spec_db[:, :max_len]

        return spec_db
    except Exception as e:
        print(f"Error generating spectrogram: {e}")
        return np.zeros((n_mels, max_len))

def normalize_spectrogram(spec):
    """Min-max scale one spectrogram into the range [0, 1].

    A constant spectrogram (max equals min, e.g. the all-zero placeholder)
    maps to all zeros. If neither comparison holds - which happens when the
    min/max are NaN - the input is returned unchanged.
    """
    lowest = np.min(spec)
    highest = np.max(spec)

    if highest > lowest:
        value_range = highest - lowest
        return (spec - lowest) / value_range

    if highest == lowest:
        # Nothing to scale: avoid dividing by zero
        return np.zeros_like(spec)

    return spec

## 4. XGBoost Model Path

### 4.1 Feature Extraction

In [None]:
# Turn every training / validation file into one aggregated feature vector for XGBoost.
print("Extracting features for XGBoost...")


def _feature_rows(paths, desc):
    """Return one feature vector per path, in order, showing a progress bar."""
    return [
        extract_features(load_audio(path, sr=SAMPLE_RATE), sr=SAMPLE_RATE, n_mfcc=N_MFCC)
        for path in tqdm(paths, desc=desc)
    ]


# Process training data
print("Processing Training Data:")
X_train_features_list = _feature_rows(train_df['filepath'], "Extracting Train Features")

# Process validation data
print("\nProcessing Validation Data:")
X_val_features_list = _feature_rows(val_df['filepath'], "Extracting Val Features")

X_train_features = np.array(X_train_features_list)
X_val_features = np.array(X_val_features_list)

# Report, then zero out, any NaN/Inf values (e.g. produced by silent audio)
print(f"\nFeatures before NaN/Inf handling - Train: {np.isnan(X_train_features).sum()} NaNs, {np.isinf(X_train_features).sum()} Infs")
print(f"Features before NaN/Inf handling - Val: {np.isnan(X_val_features).sum()} NaNs, {np.isinf(X_val_features).sum()} Infs")
X_train_features, X_val_features = (
    np.nan_to_num(matrix, nan=0.0, posinf=0.0, neginf=0.0)
    for matrix in (X_train_features, X_val_features)
)

print(f"\nFeature array shape (Train): {X_train_features.shape}")
print(f"Feature array shape (Val): {X_val_features.shape}")

### 4.2 Feature Scaling

In [None]:
# Standardise the XGBoost features. The scaler is fitted on the training split only
# and then applied unchanged to the validation split.
print("Scaling features...")
scaler = StandardScaler()
scaler.fit(X_train_features)
X_train_scaled, X_val_scaled = (
    scaler.transform(matrix) for matrix in (X_train_features, X_val_features)
)

print(f"Scaled feature shape (Train): {X_train_scaled.shape}")
print(f"Scaled feature shape (Val): {X_val_scaled.shape}")

### 4.3 XGBoost Model Training

In [None]:
# Train the XGBoost classifier on the scaled features, time the run, and persist the model.
print("Training XGBoost Model...")
start_time = time.time()

# All hyper-parameters, including 'early_stopping_rounds' and 'eval_metric',
# come from XGB_PARAMS.
xgb_model = xgb.XGBClassifier(**XGB_PARAMS)

# The validation split is the eval_set that early stopping monitors.
xgb_model.fit(
    X_train_scaled, y_train,
    eval_set=[(X_val_scaled, y_val)],
    verbose=True  # Print the eval metric for every boosting round
)

xgb_train_time = time.time() - start_time

# Save the trained model to disk. Create the target directory first so a
# finished training run is not lost to a FileNotFoundError on a fresh checkout.
# NOTE: the fitted `scaler` is not saved by this cell; the model expects inputs
# scaled the same way.
xgb_model_path = 'C:/Users/USER/Downloads/Authentica---DeepFake-detection/audio_final_new/checkpoints_xgb/xgb_model.pkl'
os.makedirs(os.path.dirname(xgb_model_path), exist_ok=True)
dump(xgb_model, xgb_model_path)

print(f"XGBoost training completed in {xgb_train_time:.2f} seconds.")
print(f"Best iteration: {xgb_model.best_iteration}")
print(f"Best score ({XGB_PARAMS.get('eval_metric', 'logloss')}): {xgb_model.best_score:.4f}")


### 4.4 XGBoost Model Evaluation

In [None]:
# Score the XGBoost model on the validation split.
print("--- XGBoost Evaluation ---")
# Column 1 of predict_proba is the probability of the positive class (1 = Fake)
xgb_pred_proba = xgb_model.predict_proba(X_val_scaled)[:, 1]
# Hard labels: strictly above 0.5 counts as Fake
xgb_pred_labels = (xgb_pred_proba > 0.5).astype(int)

# Calculate metrics
xgb_accuracy = accuracy_score(y_val, xgb_pred_labels)
xgb_precision, xgb_recall, xgb_f1, _ = precision_recall_fscore_support(
    y_val, xgb_pred_labels, average='binary', zero_division=0
)
try:
    xgb_auc = roc_auc_score(y_val, xgb_pred_proba)
except ValueError as e:
    print(f"AUC calculation warning: {e}. Probabilities might be non-finite or constant.")
    # Second attempt with probabilities clipped away from exactly 0 and 1
    xgb_auc = roc_auc_score(y_val, np.clip(xgb_pred_proba, 1e-7, 1 - 1e-7))

print("\n".join([
    f"Accuracy: {xgb_accuracy:.4f}",
    f"Precision: {xgb_precision:.4f}",
    f"Recall: {xgb_recall:.4f}",
    f"F1-Score: {xgb_f1:.4f}",
    f"AUC: {xgb_auc:.4f}",
    "\nClassification Report:",
]))
print(classification_report(y_val, xgb_pred_labels, target_names=['Real (0)', 'Fake (1)'], zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_val, xgb_pred_labels))

## 5. CNN Model Path

### 5.1 Spectrogram Generation

In [None]:
# Turn every training / validation file into a fixed-size Mel spectrogram for the CNN.
print("Generating Spectrograms for CNN...")


def _spectrograms_for(paths, desc):
    """Return one fixed-size Mel spectrogram per path, in order, showing a progress bar."""
    return [
        generate_spectrogram(
            load_audio(path, sr=SAMPLE_RATE),
            sr=SAMPLE_RATE, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH, max_len=SPEC_MAX_LEN,
        )
        for path in tqdm(paths, desc=desc)
    ]


# Process training data
print("Processing Training Data:")
X_train_spec_list = _spectrograms_for(train_df['filepath'], "Generating Train Spectrograms")

# Process validation data
print("\nProcessing Validation Data:")
X_val_spec_list = _spectrograms_for(val_df['filepath'], "Generating Val Spectrograms")

# Stack into arrays and append a trailing channel axis: (samples, height, width, 1)
X_train_spec = np.expand_dims(np.array(X_train_spec_list), axis=-1)
X_val_spec = np.expand_dims(np.array(X_val_spec_list), axis=-1)

print(f"\nSpectrogram array shape (Train): {X_train_spec.shape}")
print(f"Spectrogram array shape (Val): {X_val_spec.shape}")

### 5.2 Spectrogram Normalization

Normalize spectrogram pixel values (dB scale) to the range [0, 1] for better CNN training.

In [None]:
print("Normalizing spectrograms...")

# Each spectrogram is scaled on its own min/max, independently of the others
X_train_spec_norm = np.array(
    list(map(normalize_spectrogram, tqdm(X_train_spec, desc="Normalizing Train Specs")))
)
X_val_spec_norm = np.array(
    list(map(normalize_spectrogram, tqdm(X_val_spec, desc="Normalizing Val Specs")))
)

# Sanity check: the overall min/max should come out close to 0 and 1
print(f"\nTrain spec norm min/max: {np.min(X_train_spec_norm):.2f} / {np.max(X_train_spec_norm):.2f}")
print(f"Val spec norm min/max: {np.min(X_val_spec_norm):.2f} / {np.max(X_val_spec_norm):.2f}")

### 5.3 CNN Model Definition

In [None]:
def build_cnn_model(input_shape):
    """Assemble the (uncompiled) 2D CNN used on the spectrograms.

    Architecture: three Conv2D(3x3, relu, same) -> BatchNormalization ->
    MaxPooling2D(2x2) stages with 32, 64 and 128 filters, followed by
    Flatten -> Dense(128, relu) -> Dropout(0.5) -> Dense(1, sigmoid).

    `input_shape` is the per-sample shape handed to the Keras Input layer.
    """
    net = Sequential(name="Spectrogram_CNN")
    net.add(Input(shape=input_shape))

    # Convolutional stages: identical structure, doubling filter count
    for n_filters in (32, 64, 128):
        net.add(Conv2D(n_filters, kernel_size=(3, 3), activation='relu', padding='same'))
        net.add(BatchNormalization())
        net.add(MaxPooling2D(pool_size=(2, 2)))

    # Classifier head: dropout for regularization, single sigmoid unit for binary output
    classifier_head = (
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    )
    for layer in classifier_head:
        net.add(layer)

    return net

# The per-sample input shape is everything after the leading sample axis
input_shape_cnn = X_train_spec_norm.shape[1:]
print(f"CNN Input Shape: {input_shape_cnn}")

cnn_model = build_cnn_model(input_shape_cnn)

# Binary classification: Adam optimizer, cross-entropy loss, accuracy as the tracked metric
cnn_model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=Adam(learning_rate=CNN_LEARNING_RATE),
)

cnn_model.summary()

### 5.4 CNN Model Training

In [None]:
print("Training CNN Model...")

# Callbacks
# NOTE: early_stopping is constructed here but is NOT passed to fit() below,
# so training always runs for the full EPOCHS.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=EARLY_STOPPING_PATIENCE,
    restore_best_weights=True,
    verbose=1,
)

# Writes the full model after every epoch; the file name embeds the epoch
# number and that epoch's training accuracy.
checkpoint_cb = ModelCheckpoint(
    filepath='C:/Users/USER/Downloads/Authentica---DeepFake-detection/audio_final_new/checkpoints_cnn/cnn_epoch_{epoch:02d}_{accuracy:02f}.h5',
    save_freq='epoch',
    save_weights_only=False,
    save_best_only=False,
    verbose=1,
)

start_time = time.time()
history = cnn_model.fit(
    X_train_spec_norm,
    y_train,
    validation_data=(X_val_spec_norm, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[checkpoint_cb],
    verbose=1,  # 2 = one line per epoch, 0 = silent
)
cnn_train_time = time.time() - start_time
print(f"\nCNN training completed in {cnn_train_time:.2f} seconds.")

### 5.5 CNN Model Evaluation

In [None]:
# Score the CNN on the validation split.
print("--- CNN Evaluation ---")
from tensorflow.keras.models import load_model

# NOTE: this replaces the in-memory model with one specific, hard-coded checkpoint
# file (epoch 16, training accuracy 1.000000). The file name must match a
# checkpoint actually written by the training cell.
cnn_model = load_model(
    'C:/Users/USER/Downloads/Authentica---DeepFake-detection/audio_final_new/checkpoints_cnn/cnn_epoch_16_1.000000.h5'
)
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_val_spec_norm, y_val, verbose=0)

# Probabilities flattened to shape (n_samples,), then thresholded at 0.5
cnn_pred_proba = cnn_model.predict(X_val_spec_norm).flatten()
cnn_pred_labels = (cnn_pred_proba > 0.5).astype(int)

# Calculate metrics
cnn_precision, cnn_recall, cnn_f1, _ = precision_recall_fscore_support(
    y_val, cnn_pred_labels, average='binary', zero_division=0
)
try:
    cnn_auc = roc_auc_score(y_val, cnn_pred_proba)
except ValueError as e:
    print(f"AUC calculation warning: {e}. Probabilities might be non-finite or constant.")
    cnn_auc = roc_auc_score(y_val, np.clip(cnn_pred_proba, 1e-7, 1 - 1e-7))

print("\n".join([
    f"Validation Loss: {cnn_loss:.4f}",
    f"Accuracy: {cnn_accuracy:.4f}",
    f"Precision: {cnn_precision:.4f}",
    f"Recall: {cnn_recall:.4f}",
    f"F1-Score: {cnn_f1:.4f}",
    f"AUC: {cnn_auc:.4f}",
    "\nClassification Report:",
]))
print(classification_report(y_val, cnn_pred_labels, target_names=['Real (0)', 'Fake (1)'], zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_val, cnn_pred_labels))

### 5.6 Plot CNN Training History

In [None]:
# Plot training vs. validation loss and accuracy per epoch.
# `history` is only bound once cnn_model.fit() has returned, so look it up
# defensively: if training never ran or was interrupted, report a skip instead
# of failing with a NameError.
cnn_history_data = getattr(globals().get('history'), 'history', None)

if cnn_history_data:
    plt.figure(figsize=(12, 5))

    # (subplot position, history key, axis label, legend location)
    panels = (
        (1, 'loss', 'Loss', 'upper right'),
        (2, 'accuracy', 'Accuracy', 'lower right'),
    )
    for position, key, label, legend_loc in panels:
        plt.subplot(1, 2, position)
        plt.plot(cnn_history_data[key], label=f'Train {label}')
        plt.plot(cnn_history_data[f'val_{key}'], label=f'Val {label}')
        plt.title(f'CNN Model {label}')
        plt.ylabel(label)
        plt.xlabel('Epoch')
        plt.legend(loc=legend_loc)

    plt.tight_layout()
    plt.show()
else:
    print("Skipping history plots as CNN training did not run or complete.")

## 6. Ensemble Model

Combine the predictions from XGBoost and the CNN using simple averaging.

In [None]:
# Average the XGBoost and CNN validation probabilities and score the result.
print("Evaluating Ensemble Model (Simple Averaging)...")

# --- Sanity Checks ---
# Both evaluation cells must have run and produced probabilities
if not ('xgb_pred_proba' in locals() and 'cnn_pred_proba' in locals()):
    raise RuntimeError("One or both models did not produce predictions. Cannot create ensemble.")

# Each probability vector must line up one-to-one with the validation labels
if len(xgb_pred_proba) != len(y_val):
    raise ValueError(f"XGBoost prediction length ({len(xgb_pred_proba)}) doesn't match validation labels ({len(y_val)}).")
if len(cnn_pred_proba) != len(y_val):
    raise ValueError(f"CNN prediction length ({len(cnn_pred_proba)}) doesn't match validation labels ({len(y_val)}).")

# --- Simple Averaging Ensemble ---
# Equal weights for both models; strictly above 0.5 counts as Fake
ensemble_pred_proba = (xgb_pred_proba + cnn_pred_proba) * 0.5
ensemble_pred_labels = (ensemble_pred_proba > 0.5).astype(int)

# --- Evaluate Ensemble ---
print("\n--- Ensemble Evaluation ---")
ens_accuracy = accuracy_score(y_val, ensemble_pred_labels)
ens_precision, ens_recall, ens_f1, _ = precision_recall_fscore_support(
    y_val, ensemble_pred_labels, average='binary', zero_division=0
)

try:
    ens_auc = roc_auc_score(y_val, ensemble_pred_proba)
except ValueError as e:
    print(f"AUC calculation warning: {e}. Clipping probabilities for calculation.")
    # Second attempt with probabilities clipped slightly away from 0 and 1
    ens_auc = roc_auc_score(y_val, np.clip(ensemble_pred_proba, 1e-7, 1 - 1e-7))

print("\n".join([
    f"Accuracy: {ens_accuracy:.4f}",
    f"Precision: {ens_precision:.4f}",
    f"Recall: {ens_recall:.4f}",
    f"F1-Score: {ens_f1:.4f}",
    f"AUC: {ens_auc:.4f}",
    "\nClassification Report:",
]))
print(classification_report(y_val, ensemble_pred_labels, target_names=['Real (0)', 'Fake (1)'], zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_val, ensemble_pred_labels))

## 7. Comparison Summary

In [None]:
# Side-by-side validation metrics for the three models, printed as one text table.
comparison_lines = [
    "--- Performance Comparison ---",
    "                                         ",
    "Metric      | XGBoost |   CNN   | Ensemble",
    "------------|---------|---------|----------",
    f"Accuracy    | {xgb_accuracy:7.4f} | {cnn_accuracy:7.4f} | {ens_accuracy:8.4f}",
    f"Precision   | {xgb_precision:7.4f} | {cnn_precision:7.4f} | {ens_precision:8.4f}",
    f"Recall      | {xgb_recall:7.4f} | {cnn_recall:7.4f} | {ens_recall:8.4f}",
    f"F1-Score    | {xgb_f1:7.4f} | {cnn_f1:7.4f} | {ens_f1:8.4f}",
    f"AUC         | {xgb_auc:7.4f} | {cnn_auc:7.4f} | {ens_auc:8.4f}",
]
print("\n".join(comparison_lines))

print("\nDone.")

In [None]:
# Location of the separate test set; it must contain 'real' and 'fake' subfolders.
TEST_DATASET_PATH = 'C:/Users/USER/Downloads/Authentica---DeepFake-detection/audio_final_new/archive/for-2sec/for-2seconds/validation'
TEST_REAL_DIR = os.path.join(TEST_DATASET_PATH, 'real')
TEST_FAKE_DIR = os.path.join(TEST_DATASET_PATH, 'fake')

# A missing root folder only disables the test evaluation (run_test_evaluation = False);
# a root folder without one of its class subfolders is treated as an error.
if os.path.isdir(TEST_DATASET_PATH):
    if not os.path.isdir(TEST_REAL_DIR):
        raise ValueError(f"Test 'real' directory not found: {TEST_REAL_DIR}")
    if not os.path.isdir(TEST_FAKE_DIR):
        raise ValueError(f"Test 'fake' directory not found: {TEST_FAKE_DIR}")
    run_test_evaluation = True
else:
    print(f"WARNING: Test dataset path not found: {TEST_DATASET_PATH}")
    print("Skipping evaluation on the separate test set.")
    run_test_evaluation = False

# %% [code]
# --- Load Test Data Filepaths and Labels ---
if run_test_evaluation:
    # Collect (filepath, label) pairs for the test set: 0 = real, 1 = fake
    print(f"Loading test data from: {TEST_DATASET_PATH}")
    test_filepaths = []
    test_labels = []

    # Real files
    print(f"Looking for audio files in: {TEST_REAL_DIR}")
    test_real_files = [f for f in os.listdir(TEST_REAL_DIR) if f.lower().endswith(('.wav', '.mp3', '.flac'))]
    print(f"Found {len(test_real_files)} potential real test files.")
    test_filepaths += [
        os.path.join(TEST_REAL_DIR, name)
        for name in tqdm(test_real_files, desc="Loading real test file paths")
    ]
    test_labels += [0] * len(test_real_files)

    # Fake files
    print(f"\nLooking for audio files in: {TEST_FAKE_DIR}")
    test_fake_files = [f for f in os.listdir(TEST_FAKE_DIR) if f.lower().endswith(('.wav', '.mp3', '.flac'))]
    print(f"Found {len(test_fake_files)} potential fake test files.")
    test_filepaths += [
        os.path.join(TEST_FAKE_DIR, name)
        for name in tqdm(test_fake_files, desc="Loading fake test file paths")
    ]
    test_labels += [1] * len(test_fake_files)

    if test_filepaths:
        # The test set is used as listed (no shuffle); y_test holds the true labels
        test_df = pd.DataFrame({'filepath': test_filepaths, 'label': test_labels})
        y_test = test_df['label'].values
        print(f"\nLoaded {len(test_df)} test files.")
        print("Test set label distribution:")
        print(test_df['label'].value_counts())
    else:
        # Nothing to evaluate: switch the remaining test cells off
        print("WARNING: No audio files found in the test dataset directories. Skipping evaluation.")
        run_test_evaluation = False


# %% [code]
# --- Preprocess Test Data for XGBoost ---
if run_test_evaluation:
    print("\nPreprocessing test data for XGBoost...")

    # One aggregated feature vector per test file, same settings as for training
    X_test_features_list = [
        extract_features(load_audio(path, sr=SAMPLE_RATE), sr=SAMPLE_RATE, n_mfcc=N_MFCC)
        for path in tqdm(test_df['filepath'], desc="Extracting Test Features")
    ]

    # Stack and replace NaN/Inf values with zeros
    X_test_features = np.nan_to_num(
        np.array(X_test_features_list), nan=0.0, posinf=0.0, neginf=0.0
    )
    print(f"Test feature array shape: {X_test_features.shape}")

    # Reuse the scaler fitted on the training data: transform only, never re-fit
    print("Scaling test features using the scaler fitted on training data...")
    X_test_scaled = scaler.transform(X_test_features)
    print(f"Scaled test feature shape: {X_test_scaled.shape}")

# %% [code]
# --- Preprocess Test Data for CNN ---
if run_test_evaluation:
    print("\nPreprocessing test data for CNN...")

    # One fixed-size Mel spectrogram per test file, with the SAME parameters as in training
    X_test_spec_list = [
        generate_spectrogram(
            load_audio(path, sr=SAMPLE_RATE),
            sr=SAMPLE_RATE, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH, max_len=SPEC_MAX_LEN,
        )
        for path in tqdm(test_df['filepath'], desc="Generating Test Spectrograms")
    ]

    # Stack and append the trailing channel axis expected by the CNN
    X_test_spec = np.expand_dims(np.array(X_test_spec_list), axis=-1)
    print(f"Test spectrogram array shape: {X_test_spec.shape}")

    # Per-spectrogram min-max scaling, as for the training data
    print("Normalizing test spectrograms...")
    X_test_spec_norm = np.array(
        list(map(normalize_spectrogram, tqdm(X_test_spec, desc="Normalizing Test Specs")))
    )
    print(f"Normalized test spectrogram shape: {X_test_spec_norm.shape}")
    print(f"Test spec norm min/max: {np.min(X_test_spec_norm):.2f} / {np.max(X_test_spec_norm):.2f}")


# %% [code]
# --- Make Predictions on Test Data ---
if run_test_evaluation:
    print("\nMaking predictions on the test set...")

    # XGBoost: probability of the positive class (1 = Fake)
    print("Predicting with XGBoost...")
    test_xgb_pred_proba = xgb_model.predict_proba(X_test_scaled)[:, 1]
    print(f"XGBoost test prediction probabilities shape: {test_xgb_pred_proba.shape}")

    # CNN: sigmoid outputs flattened to one probability per sample
    print("Predicting with CNN...")
    test_cnn_pred_proba = cnn_model.predict(X_test_spec_norm, batch_size=BATCH_SIZE).flatten()
    print(f"CNN test prediction probabilities shape: {test_cnn_pred_proba.shape}")

    # Both probability vectors must line up one-to-one with the test labels
    if not (len(test_xgb_pred_proba) == len(y_test) == len(test_cnn_pred_proba)):
        raise ValueError("Prediction length mismatch with test labels!")

# %% [code]
# --- Ensemble Predictions and Evaluation on Test Set ---
if run_test_evaluation:
    print("\nEvaluating Ensemble Model on the Test Set...")

    # Equal-weight average of both models; strictly above 0.5 counts as Fake
    test_ensemble_pred_proba = (test_xgb_pred_proba + test_cnn_pred_proba) * 0.5
    test_ensemble_pred_labels = (test_ensemble_pred_proba > 0.5).astype(int)

    # --- Evaluate Ensemble on Test Set ---
    print("\n--- Test Set Ensemble Evaluation ---")
    test_ens_accuracy = accuracy_score(y_test, test_ensemble_pred_labels)
    test_ens_precision, test_ens_recall, test_ens_f1, _ = precision_recall_fscore_support(
        y_test, test_ensemble_pred_labels, average='binary', zero_division=0
    )

    try:
        test_ens_auc = roc_auc_score(y_test, test_ensemble_pred_proba)
    except ValueError as e:
        print(f"AUC calculation warning: {e}. Clipping probabilities for calculation.")
        test_ens_auc = roc_auc_score(y_test, np.clip(test_ensemble_pred_proba, 1e-7, 1 - 1e-7))

    print("\n".join([
        f"Accuracy: {test_ens_accuracy:.4f}",
        f"Precision: {test_ens_precision:.4f}",
        f"Recall: {test_ens_recall:.4f}",
        f"F1-Score: {test_ens_f1:.4f}",
        f"AUC: {test_ens_auc:.4f}",
        "\nClassification Report (Test Set):",
    ]))
    print(classification_report(y_test, test_ensemble_pred_labels, target_names=['Real (0)', 'Fake (1)'], zero_division=0))
    print("Confusion Matrix (Test Set):")
    print(confusion_matrix(y_test, test_ensemble_pred_labels))

# %% [code]
# Optional: Evaluate individual models on the test set for comparison

if run_test_evaluation:
    # Accuracy and AUC of each model on its own, next to the ensemble, on the test set.
    print("\n--- Individual Model Evaluation on Test Set ---")

    # XGBoost Only
    test_xgb_pred_labels = (test_xgb_pred_proba > 0.5).astype(int)
    xgb_test_acc = accuracy_score(y_test, test_xgb_pred_labels)
    try:
        xgb_test_auc = roc_auc_score(y_test, test_xgb_pred_proba)
    except ValueError:
        # Second attempt with probabilities clipped away from exactly 0 and 1
        xgb_test_auc = roc_auc_score(y_test, np.clip(test_xgb_pred_proba, 1e-7, 1 - 1e-7))
    # Report the XGBoost numbers (previously computed but never shown)
    print(f"XGBoost Test Accuracy: {xgb_test_acc:.4f}, AUC: {xgb_test_auc:.4f}")

    # CNN Only
    test_cnn_pred_labels = (test_cnn_pred_proba > 0.5).astype(int)
    cnn_test_acc = accuracy_score(y_test, test_cnn_pred_labels)
    try:
        cnn_test_auc = roc_auc_score(y_test, test_cnn_pred_proba)
    except ValueError:
        cnn_test_auc = roc_auc_score(y_test, np.clip(test_cnn_pred_proba, 1e-7, 1 - 1e-7))
    # Report the CNN numbers (previously computed but never shown)
    print(f"CNN Test Accuracy: {cnn_test_acc:.4f}, AUC: {cnn_test_auc:.4f}")

    # Ensemble figures come from the ensemble evaluation step above
    print(f"Ensemble Test Accuracy: {test_ens_accuracy:.4f}, AUC: {test_ens_auc:.4f}")