
# Step 1: Audio Preprocessing and Denoising
# Install required packages first:
# pip install librosa pandas numpy noisereduce soundfile

In [12]:
# =========================
# BASIC SYSTEM & UTILITIES
# =========================
import os
import warnings
warnings.filterwarnings("ignore")

# =========================
# NUMERICAL & DATA HANDLING
# =========================
import numpy as np
import pandas as pd
import pickle
from collections import Counter

# =========================
# AUDIO PROCESSING
# =========================
import librosa
import librosa.display
import soundfile as sf
import noisereduce as nr

# =========================
# VISUALIZATION
# =========================
import matplotlib.pyplot as plt
from tqdm import tqdm

# =========================
# SCIKIT-LEARN (CLASSICAL ML)
# =========================
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
    f1_score
)

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import SVC

# =========================
# XGBOOST
# =========================
import xgboost as xgb

# =========================
# PYTORCH CORE
# =========================
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader

# =========================
# TORCHVISION & MODELS
# =========================
from torchvision import transforms

import torchvision.models as tv_models

import timm

# =========================
# OTHER
# =========================
import copy


In [2]:

# =========================
# CONFIGURATION
# =========================
# Where the raw recordings live and where the denoised copies are written.
DATASET_DIR = "datasets"
OUTPUT_DIR = "datasets_cleaned"

# Every clip is resampled to SR Hz and forced to DURATION seconds,
# i.e. exactly SAMPLES samples.
SR = 16000
DURATION = 5
SAMPLES = DURATION * SR
N_MFCC = 40

# Class folder name -> integer label (labels follow the listed order).
CLASSES = {
    folder: index
    for index, folder in enumerate(
        ("modified", "unmodified", "synthetic", "spliced")
    )
}

os.makedirs(OUTPUT_DIR, exist_ok=True)

# =========================
# AUDIO FUNCTIONS
# =========================
def load_audio(path):
    """Load *path* as mono audio resampled to SR and return it noise-reduced."""
    waveform, _ = librosa.load(path, sr=SR, mono=True)
    return nr.reduce_noise(y=waveform, sr=SR)

def fix_length(audio):
    """Return *audio* with exactly SAMPLES entries.

    Short signals are zero-padded at the end; long signals are truncated.
    """
    shortfall = SAMPLES - len(audio)
    if shortfall > 0:
        return np.pad(audio, (0, shortfall))
    return audio[:SAMPLES]

def extract_mfcc(audio):
    """Compute N_MFCC MFCCs for *audio* and return them as (time, features)."""
    coefficients = librosa.feature.mfcc(y=audio, sr=SR, n_mfcc=N_MFCC)
    # librosa yields (features, time); transpose so frames run along axis 0.
    return coefficients.T

# =========================
# PROCESS DATASET
# =========================
# One MFCC matrix, one integer label and one source filename per clip that
# was processed successfully; the three lists stay index-aligned.
X, y, filenames = [], [], []

for class_name, label in CLASSES.items():
    class_path = os.path.join(DATASET_DIR, class_name)

    # Report and skip a missing class folder (same policy as the feature
    # extraction loop) instead of crashing inside os.listdir().
    if not os.path.isdir(class_path):
        print(f"Skipping missing folder: {class_path}")
        continue

    output_class = os.path.join(OUTPUT_DIR, class_name)
    os.makedirs(output_class, exist_ok=True)

    print(f"\nProcessing {class_name}...")

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # order of X / y / metadata reproducible. The extension check is
    # case-insensitive so "clip.WAV" is not silently dropped.
    wav_files = sorted(
        entry for entry in os.listdir(class_path)
        if entry.lower().endswith(".wav")
    )

    for file in tqdm(wav_files):
        path = os.path.join(class_path, file)

        try:
            audio = load_audio(path)
            audio = fix_length(audio)

            # Save clean audio
            sf.write(os.path.join(output_class, file), audio, SR)

            mfcc = extract_mfcc(audio)

            X.append(mfcc)
            y.append(label)
            filenames.append(file)

        except Exception as e:
            # Deliberately best-effort: one unreadable file must not abort
            # the whole run, but it is reported so it can be inspected.
            print(f"Error: {file} - {e}")

# =========================
# SAVE DATA
# =========================
# Freeze the accumulated lists into arrays before persisting them.
X, y = np.array(X), np.array(y)

np.save("X_mfcc.npy", X)
np.save("y_labels.npy", y)

# One row per processed clip: source filename and its integer label.
metadata = pd.DataFrame(dict(filename=filenames, label=y))
metadata.to_csv("metadata.csv", index=False)

label_counts = metadata["label"].value_counts()

print("\n‚úÖ Preprocessing Complete")
print("MFCC shape:", X.shape)
print("Class distribution:")
print(label_counts)



Processing modified...


100%|██████████| 300/300 [00:16<00:00, 18.29it/s]



Processing unmodified...


100%|██████████| 300/300 [00:17<00:00, 16.70it/s]



Processing synthetic...


100%|██████████| 300/300 [00:16<00:00, 18.64it/s]



Processing spliced...


100%|██████████| 300/300 [00:20<00:00, 14.85it/s]


✅ Preprocessing Complete
MFCC shape: (1200, 157, 40)
Class distribution:
label
0    300
1    300
2    300
3    300
Name: count, dtype: int64





# Step 2: Feature Extraction - MFCCs and Mel-Spectrograms
# This creates both MFCC features and Mel-Spectrogram images for ResNet

In [4]:
# =========================
# STEP 2: FEATURE EXTRACTION
# MFCCs + Mel-Spectrograms
# =========================



# =========================
# CONFIGURATION (ALIGNED WITH STEP 1)
# =========================
# Input: denoised clips. Outputs: one .npy per clip in each feature folder.
CLEAN_AUDIO_FOLDER = "datasets_cleaned"
MFCC_OUTPUT_FOLDER = "mfcc_features"
MELSPEC_OUTPUT_FOLDER = "melspec_features"

# Audio framing.
SR = 16000
DURATION = 5
SAMPLES = DURATION * SR

# Spectral analysis parameters shared by the MFCC and mel extractors.
N_MFCC = 13
N_MELS = 128
N_FFT = 1024
HOP_LENGTH = 512

# Every feature matrix is padded/cropped to this many frames.
# NOTE(review): SAMPLES // HOP_LENGTH + 1 is 157, so frames 157..215 look
# like pure padding at this sample rate — confirm 216 is intended.
FIXED_FRAMES = 216

# Class folder name -> integer label.
CLASSES = dict(
    zip(("modified", "unmodified", "synthetic", "spliced"), range(4))
)

for _folder in (MFCC_OUTPUT_FOLDER, MELSPEC_OUTPUT_FOLDER):
    os.makedirs(_folder, exist_ok=True)

# =========================
# FUNCTIONS
# =========================
def load_audio_fixed(path):
    """Load *path* as mono audio at SR and pad/trim it to SAMPLES samples."""
    signal, _ = librosa.load(path, sr=SR, mono=True)
    return librosa.util.fix_length(signal, size=SAMPLES)

def extract_mfcc(audio):
    """Return MFCCs plus first and second deltas, fixed to FIXED_FRAMES.

    The three (N_MFCC, frames) matrices are stacked along the feature axis,
    giving 3 * N_MFCC rows, then padded/cropped along time.
    """
    static = librosa.feature.mfcc(
        y=audio, sr=SR, n_mfcc=N_MFCC, n_fft=N_FFT, hop_length=HOP_LENGTH
    )
    velocity = librosa.feature.delta(static)
    acceleration = librosa.feature.delta(static, order=2)

    stacked = np.concatenate((static, velocity, acceleration), axis=0)

    # (39, 216) with the module's default settings
    return librosa.util.fix_length(stacked, size=FIXED_FRAMES, axis=1)

def extract_melspec(audio):
    """Return a 3-channel log-mel spectrogram of *audio*, FIXED_FRAMES wide.

    The dB-scaled spectrogram (relative to its own maximum) is copied into
    three identical channels so it can be fed to image backbones.
    """
    power = librosa.feature.melspectrogram(
        y=audio, sr=SR, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH
    )
    decibels = librosa.power_to_db(power, ref=np.max)

    # NOTE(review): any padding added here is presumably zero-valued, i.e.
    # 0 dB, which is the peak level under ref=np.max — confirm this is the
    # intended fill value for the padded frames.
    decibels = librosa.util.fix_length(decibels, size=FIXED_FRAMES, axis=1)

    # (128, 216, 3) with the module's default settings
    return np.stack((decibels, decibels, decibels), axis=-1)

# =========================
# PROCESS DATASET
# =========================
# files/labels become the rows of the label CSV; processed/failed are
# counters reported in the summary.
files, labels = [], []
processed, failed = 0, 0

print("\nExtracting MFCCs and Mel-Spectrograms...\n")

for class_name, label in CLASSES.items():
    class_path = os.path.join(CLEAN_AUDIO_FOLDER, class_name)

    if not os.path.exists(class_path):
        print(f"Skipping missing folder: {class_path}")
        continue

    print(f"Processing {class_name}...")

    # os.listdir() order is arbitrary. The row order of the label CSV
    # decides which clips land in which split of a seeded positional
    # train/val/test split, so sort to keep those splits reproducible
    # across machines. The extension check is case-insensitive so
    # "clip.WAV" is not silently dropped.
    wav_files = sorted(
        entry for entry in os.listdir(class_path)
        if entry.lower().endswith(".wav")
    )

    for file in tqdm(wav_files):
        file_path = os.path.join(class_path, file)
        # Prefix with the class name so equal filenames in different class
        # folders do not overwrite each other's feature files.
        file_id = f"{class_name}_{os.path.splitext(file)[0]}"

        try:
            audio = load_audio_fixed(file_path)

            mfcc = extract_mfcc(audio)
            mel = extract_melspec(audio)

            np.save(os.path.join(MFCC_OUTPUT_FOLDER, file_id + ".npy"), mfcc)
            np.save(os.path.join(MELSPEC_OUTPUT_FOLDER, file_id + ".npy"), mel)

            files.append(file_id)
            labels.append(label)
            processed += 1

        except Exception as e:
            # Best-effort per file: report and count the failure, keep going.
            print(f"Failed: {file} -> {e}")
            failed += 1

# =========================
# SAVE LABEL FILE
# =========================
# Persist the feature-id -> label table consumed by the later steps.
df = pd.DataFrame({
    "file": files,
    "label": labels
})
df.to_csv("labels_step2.csv", index=False)

# =========================
# SUMMARY
# =========================
print("\n‚úì STEP 2 COMPLETE")
print(f"Processed: {processed}")
print(f"Failed: {failed}")
if processed:
    # mfcc / mel still hold the features of the last extracted clip.
    print("MFCC shape:", mfcc.shape)
    print("Mel-Spectrogram shape:", mel.shape)
else:
    # Nothing was extracted, so mfcc / mel are either undefined or left
    # over from an earlier cell; do not report misleading shapes.
    print("No files were processed; no feature shapes to report")
print("Labels saved to labels_step2.csv")



Extracting MFCCs and Mel-Spectrograms...

Processing modified...


100%|██████████| 300/300 [00:02<00:00, 124.46it/s]


Processing unmodified...


100%|██████████| 300/300 [00:02<00:00, 120.43it/s]


Processing synthetic...


100%|██████████| 300/300 [00:02<00:00, 116.03it/s]


Processing spliced...


100%|██████████| 300/300 [00:02<00:00, 112.25it/s]


✓ STEP 2 COMPLETE
Processed: 1200
Failed: 0
MFCC shape: (39, 216)
Mel-Spectrogram shape: (128, 216, 3)
Labels saved to labels_step2.csv






# Step 3: Dataset Preparation for 4-Class Classification
# This prepares data for both traditional ML models and ResNet

In [5]:
# =========================
# STEP 3: DATASET PREPARATION
# 4-CLASS CLASSIFICATION
# =========================



# =========================
# CONFIGURATION (ALIGNED)
# =========================
# Feature folders (one .npy array per clip) and the id -> label table.
MFCC_FOLDER = "mfcc_features"
MELSPEC_FOLDER = "melspec_features"
LABEL_FILE = "labels_step2.csv"

# Hold-out fractions of the full dataset and the seed used for splitting.
TEST_SIZE, VAL_SIZE = 0.2, 0.1
RANDOM_STATE = 42

# =========================
# LOAD LABELS
# =========================
# Read the feature-id / label table.
labels_df = pd.read_csv(LABEL_FILE)
files = labels_df["file"].to_numpy()
labels = labels_df["label"].to_numpy()

print(f"Total samples: {len(labels)}")

# =========================
# LOAD MFCC FEATURES
# =========================
print("\nLoading MFCC features...")

# Only ids whose MFCC file exists are kept; the three lists stay aligned.
X_mfcc, valid_files, valid_labels = [], [], []

for file_id, label in zip(files, labels):
    path = os.path.join(MFCC_FOLDER, file_id + ".npy")
    if not os.path.exists(path):
        continue
    X_mfcc.append(np.load(path))
    valid_files.append(file_id)
    valid_labels.append(label)

X_mfcc = np.array(X_mfcc)
y = np.array(valid_labels)

print("MFCC shape:", X_mfcc.shape)  # (samples, 39, 216)

# =========================
# CLASS DISTRIBUTION
# =========================
print("\nClass Distribution:")
for class_id, count in Counter(y).items():
    print(f"Class {class_id}: {count}")

# =========================
# PREPARE MFCC FOR ML
# =========================
# Flatten each (features, frames) MFCC matrix into one row for classical ML.
samples = X_mfcc.shape[0]
X_mfcc_flat = X_mfcc.reshape(samples, -1)

# =========================
# SPLIT DATA (STRATIFIED)
# =========================
# Split ROW INDICES instead of the feature matrix itself. The same index
# arrays are reused for the mel-spectrograms further down, so both feature
# sets end up with identical train/val/test membership and match the saved
# label arrays. Splitting indices yields the same partition as splitting
# the rows directly, because the shuffle depends only on the sample count,
# the stratification labels and random_state.
sample_idx = np.arange(samples)

idx_train_temp, idx_test, y_train_temp, y_test = train_test_split(
    sample_idx, y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y
)

# VAL_SIZE is a fraction of the full dataset, so rescale it to a fraction
# of what remains after removing the test set.
idx_train, idx_val, y_train, y_val = train_test_split(
    idx_train_temp, y_train_temp,
    test_size=VAL_SIZE / (1 - TEST_SIZE),
    random_state=RANDOM_STATE,
    stratify=y_train_temp
)

X_train_temp = X_mfcc_flat[idx_train_temp]
X_train = X_mfcc_flat[idx_train]
X_val = X_mfcc_flat[idx_val]
X_test = X_mfcc_flat[idx_test]

print("\nDataset Split:")
print(f"Train: {len(X_train)}")
print(f"Val:   {len(X_val)}")
print(f"Test:  {len(X_test)}")

# =========================
# SCALE FEATURES
# =========================
# Fit on the training rows only so no validation/test statistics leak in.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# =========================
# SAVE ML DATA
# =========================
np.save("X_train_mfcc.npy", X_train_scaled)
np.save("X_val_mfcc.npy", X_val_scaled)
np.save("X_test_mfcc.npy", X_test_scaled)
np.save("y_train.npy", y_train)
np.save("y_val.npy", y_val)
np.save("y_test.npy", y_test)

with open("mfcc_scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print("\n‚úì MFCC DATA READY FOR ML MODELS")

# ======================================================
# PREPARE MEL-SPECTROGRAM DATA FOR RESNET
# ======================================================
print("\nPreparing Mel-Spectrogram data for ResNet...")

X_mel = []

for file_id in valid_files:
    path = os.path.join(MELSPEC_FOLDER, file_id + ".npy")
    if not os.path.exists(path):
        # Skipping the file would shift every later row and silently
        # misalign the spectrograms with the labels, so fail loudly.
        raise FileNotFoundError(
            f"Mel-spectrogram missing for '{file_id}' ({path}); "
            "re-run the feature extraction step"
        )
    X_mel.append(np.load(path))

X_mel = np.array(X_mel)
print("Mel-Spectrogram shape:", X_mel.shape)  # (samples, 128, 216, 3)

# SAME SPLITS (important!): index with the arrays produced by the
# stratified split above. Contiguous slicing would ignore the shuffle and
# pair spectrograms with the wrong labels.
X_mel_train = X_mel[idx_train]
X_mel_val = X_mel[idx_val]
X_mel_test = X_mel[idx_test]

np.save("X_train_mel.npy", X_mel_train)
np.save("X_val_mel.npy", X_mel_val)
np.save("X_test_mel.npy", X_mel_test)

print("\n‚úì STEP 3 COMPLETE")
print("‚úì MFCC ‚Üí Traditional ML")
print("‚úì Mel-Spectrogram ‚Üí ResNet")


Total samples: 1200

Loading MFCC features...
MFCC shape: (1200, 39, 216)

Class Distribution:
Class 0: 300
Class 1: 300
Class 2: 300
Class 3: 300

Dataset Split:
Train: 840
Val:   120
Test:  240

✓ MFCC DATA READY FOR ML MODELS

Preparing Mel-Spectrogram data for ResNet...
Mel-Spectrogram shape: (1200, 128, 216, 3)

✓ STEP 3 COMPLETE
✓ MFCC → Traditional ML
✓ Mel-Spectrogram → ResNet


# Step 4: Training Traditional ML Models for 4-Class Classification
# pip install xgboost scikit-learn shap

In [6]:
# =========================
# STEP 4: TRAINING TRADITIONAL ML MODELS
# 4-CLASS AUDIO CLASSIFICATION
# =========================



# =========================
# LOAD DATA (ALIGNED WITH STEP 3)
# =========================
print("Loading preprocessed MFCC data...")

# Scaled, flattened MFCC matrices and their label vectors, one per split.
X_train, X_val, X_test = (
    np.load(source)
    for source in ("X_train_mfcc.npy", "X_val_mfcc.npy", "X_test_mfcc.npy")
)
y_train, y_val, y_test = (
    np.load(source)
    for source in ("y_train.npy", "y_val.npy", "y_test.npy")
)

# Display names, listed in label order.
CLASS_NAMES = ["modified", "unmodified", "synthetic", "spliced"]

print("Data loaded successfully")
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)

# =========================
# DEFINE MODELS
# =========================
# Candidate classifiers, registered in the order they are trained and
# reported. All stochastic models share the same seed (42).
models = {}

models["XGBoost"] = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=4,
    max_depth=6,
    learning_rate=0.1,
    n_estimators=200,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="mlogloss",
)

models["RandomForest"] = RandomForestClassifier(
    n_estimators=200, max_depth=20, random_state=42, n_jobs=-1
)

models["KNN"] = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)

models["LDA"] = LinearDiscriminantAnalysis()

models["LogisticRegression"] = LogisticRegression(
    max_iter=1000, solver="lbfgs", random_state=42
)

# probability=True so the saved SVM can also return class probabilities.
models["SVM"] = SVC(kernel="rbf", C=1.0, probability=True, random_state=42)

# =========================
# TRAIN & EVALUATE
# =========================
# Per-model metrics: {model name: {"train_acc", "val_acc", "test_acc",
# "train_f1", "val_f1", "test_f1"}}.
results = {}
rule = "=" * 60

for name, model in models.items():
    print(f"\n{rule}")
    print(f"Training {name}")
    print(rule)

    model.fit(X_train, y_train)

    # Predict each split once; every metric below reuses these arrays.
    y_train_pred, y_val_pred, y_test_pred = (
        model.predict(split) for split in (X_train, X_val, X_test)
    )

    truth_and_pred = {
        "train": (y_train, y_train_pred),
        "val": (y_val, y_val_pred),
        "test": (y_test, y_test_pred),
    }

    # Accuracies first, then weighted F1 scores.
    scores = {
        f"{split}_acc": accuracy_score(truth, pred)
        for split, (truth, pred) in truth_and_pred.items()
    }
    scores.update(
        (f"{split}_f1", f1_score(truth, pred, average="weighted"))
        for split, (truth, pred) in truth_and_pred.items()
    )
    results[name] = scores

    print(f"Train Acc: {scores['train_acc']:.4f}")
    print(f"Val   Acc: {scores['val_acc']:.4f}")
    print(f"Test  Acc: {scores['test_acc']:.4f}")

    print("\nClassification Report (Test):")
    report = classification_report(
        y_test, y_test_pred, target_names=CLASS_NAMES, digits=4
    )
    print(report)

    # =========================
    # CONFUSION MATRIX
    # =========================
    cm = confusion_matrix(y_test, y_test_pred)
    tick_positions = range(4)

    plt.figure(figsize=(6, 5))
    plt.imshow(cm)
    plt.title(f"{name} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.xticks(tick_positions, CLASS_NAMES, rotation=45)
    plt.yticks(tick_positions, CLASS_NAMES)

    # Write each count into its cell (row = true class, column = predicted).
    for i, j in np.ndindex(4, 4):
        plt.text(j, i, cm[i, j], ha="center", va="center")

    plt.tight_layout()
    plt.savefig(f"{name}_confusion_matrix.png", dpi=300)
    plt.close()

    # =========================
    # SAVE MODEL
    # =========================
    with open(f"{name}_model.pkl", "wb") as f:
        pickle.dump(model, f)

    print(f"Model saved: {name}_model.pkl")

# =========================
# MODEL COMPARISON SUMMARY
# =========================
# Print one row per trained model, then name the winner.
print(f"\n{'='*60}")
print("MODEL COMPARISON SUMMARY")
print(f"{'='*60}")

print(f"{'Model':<20} {'Val Acc':<10} {'Test Acc':<10} {'Test F1':<10}")
print("-" * 56)

for name, m in results.items():
    print(
        f"{name:<20} {m['val_acc']:<10.4f} "
        f"{m['test_acc']:<10.4f} {m['test_f1']:<10.4f}"
    )

# Select on VALIDATION accuracy. Choosing the winner by its test score would
# turn the test set into a tuning set and make the reported test metrics
# optimistically biased.
best_model = max(results, key=lambda x: results[x]["val_acc"])
print(f"\nüèÜ Best Model: {best_model}")
print(f"Val Accuracy: {results[best_model]['val_acc']:.4f}")
print(f"Test Accuracy: {results[best_model]['test_acc']:.4f}")
print(f"Test F1-Score: {results[best_model]['test_f1']:.4f}")

print("\n‚úì STEP 4 COMPLETE")
print("‚úì Traditional ML models trained and evaluated")


Loading preprocessed MFCC data...
Data loaded successfully
Train shape: (840, 8424)
Test shape: (240, 8424)

Training XGBoost
Train Acc: 1.0000
Val   Acc: 0.8167
Test  Acc: 0.8000

Classification Report (Test):
              precision    recall  f1-score   support

    modified     0.7258    0.7500    0.7377        60
  unmodified     0.7119    0.7000    0.7059        60
   synthetic     0.8983    0.8833    0.8908        60
     spliced     0.8667    0.8667    0.8667        60

    accuracy                         0.8000       240
   macro avg     0.8007    0.8000    0.8003       240
weighted avg     0.8007    0.8000    0.8003       240

Model saved: XGBoost_model.pkl

Training RandomForest
Train Acc: 1.0000
Val   Acc: 0.7917
Test  Acc: 0.7125

Classification Report (Test):
              precision    recall  f1-score   support

    modified     0.6508    0.6833    0.6667        60
  unmodified     0.6316    0.6000    0.6154        60
   synthetic     0.8750    0.7000    0.7778        6


# Step 5: Prepare Dataset for ResNet with Mel-Spectrograms
# pip install torch torchvision pillow

In [7]:
# =========================
# STEP 5: PREPARE DATASET FOR RESNET
# (ALIGNED WITH STEPS 1–4)
# =========================



# =========================
# CONFIGURATION (ALIGNED)
# =========================
# Mel-spectrogram arrays (.npy, one per clip) and the id -> label table.
MELSPEC_FOLDER = "melspec_features"
LABEL_FILE = "labels_step2.csv"

# Loader / image geometry.
BATCH_SIZE = 32
IMG_SIZE = 224

# Hold-out fractions of the full dataset and the split seed.
TEST_SIZE, VAL_SIZE = 0.2, 0.1
RANDOM_STATE = 42

# =========================
# LOAD LABELS
# =========================
# Feature ids and integer labels, row-aligned.
labels_df = pd.read_csv(LABEL_FILE)
files = labels_df["file"].to_numpy()
labels = labels_df["label"].to_numpy()

print(f"Total samples: {len(labels)}")

print("\nLabel Distribution:")
for class_id, count in Counter(labels).items():
    print(f"Class {class_id}: {count}")

# =========================
# TRAIN / VAL / TEST SPLIT
# =========================
# Carve off the test set first, then take the validation set out of the
# remainder. VAL_SIZE is a fraction of the full dataset, hence the rescale.
val_fraction_of_rest = VAL_SIZE / (1 - TEST_SIZE)

X_temp, X_test, y_temp, y_test = train_test_split(
    files, labels,
    test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=labels,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=val_fraction_of_rest, random_state=RANDOM_STATE, stratify=y_temp,
)

print("\nDataset Split:")
print(f"Train: {len(X_train)}")
print(f"Val:   {len(X_val)}")
print(f"Test:  {len(X_test)}")

# =========================
# DATASET CLASS
# =========================
class MelSpectrogramDataset(Dataset):
    """Dataset that lazily loads mel-spectrogram arrays from MELSPEC_FOLDER.

    file_ids and labels are parallel sequences; transform, when given, is
    applied to the channel-first float tensor of each item.
    """

    def __init__(self, file_ids, labels, transform=None):
        self.file_ids, self.labels, self.transform = file_ids, labels, transform

    def __len__(self):
        return len(self.file_ids)

    def __getitem__(self, idx):
        """Return (tensor, label) for the sample at position *idx*."""
        file_id, label = self.file_ids[idx], self.labels[idx]

        npy_path = os.path.join(MELSPEC_FOLDER, file_id + ".npy")
        # Stored arrays are channel-last; permute to (C, H, W).
        image = torch.tensor(np.load(npy_path), dtype=torch.float32)
        image = image.permute(2, 0, 1)

        if self.transform:
            image = self.transform(image)

        return image, label

# =========================
# TRANSFORMS (RESNET)
# =========================
# Steps shared by both pipelines: resize to the backbone's input size, then
# apply per-channel ImageNet mean/std normalisation.
# NOTE(review): the inputs appear to be dB-scaled spectrograms rather than
# [0, 1] images, so these ImageNet statistics may not match the data —
# confirm before changing, and keep training and inference identical.
_resize = transforms.Resize((IMG_SIZE, IMG_SIZE))
_normalize = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Training adds light augmentation between resize and normalisation.
train_transform = transforms.Compose([
    _resize,
    transforms.RandomHorizontalFlip(p=0.3),
    transforms.RandomRotation(5),
    _normalize,
])

# Validation and test use the deterministic steps only.
val_test_transform = transforms.Compose([_resize, _normalize])

# =========================
# DATA LOADERS
# =========================
# Training split: augmented transform, reshuffled every epoch.
train_dataset = MelSpectrogramDataset(X_train, y_train, train_transform)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Validation / test splits: deterministic transform, fixed order.
val_dataset = MelSpectrogramDataset(X_val, y_val, val_test_transform)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

test_dataset = MelSpectrogramDataset(X_test, y_test, val_test_transform)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# =========================
# SANITY CHECK
# =========================
# Pull one batch to confirm tensors come out with the expected layout.
first_batch = next(iter(train_loader))
sample_x, sample_y = first_batch

print("\nSample Batch Check:")
print("Input shape:", sample_x.shape)   # (B, 3, 224, 224)
print("Labels:", sample_y[:5])

# =========================
# SAVE DATASET INFO
# =========================
# Record which feature ids belong to which split so training can rebuild
# exactly the same datasets later.
dataset_info = dict(
    train_files=X_train.tolist(),
    val_files=X_val.tolist(),
    test_files=X_test.tolist(),
    num_classes=4,
    class_names=["modified", "unmodified", "synthetic", "spliced"],
)

with open("resnet_dataset_info.pkl", "wb") as handle:
    pickle.dump(dataset_info, handle)

print("\n‚úì STEP 5 COMPLETE")
print("‚úì Mel-Spectrogram dataset ready for ResNet")
print("‚úì Fully aligned with Steps 1‚Äì4")


Total samples: 1200

Label Distribution:
Class 0: 300
Class 1: 300
Class 2: 300
Class 3: 300

Dataset Split:
Train: 840
Val:   120
Test:  240

Sample Batch Check:
Input shape: torch.Size([32, 3, 224, 224])
Labels: tensor([0, 3, 0, 3, 2])

✓ STEP 5 COMPLETE
✓ Mel-Spectrogram dataset ready for ResNet
✓ Fully aligned with Steps 1–4


# Step 6: Train ResNet and EfficientNet Models
# pip install torch torchvision timm

In [None]:
# =========================
# STEP 6: TRAIN RESNET & EFFICIENTNET (GPU ENABLED)
# =========================



# =========================
# GPU SETTINGS
# =========================
# Let cuDNN benchmark convolution algorithms; inputs have a fixed size.
torch.backends.cudnn.benchmark = True

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# =========================
# LOAD DATA INFO
# =========================
# Split membership written by the dataset-preparation step.
with open("resnet_dataset_info.pkl", "rb") as handle:
    dataset_info = pickle.load(handle)

# Feature id -> integer label lookup.
labels_df = pd.read_csv("labels_step2.csv")
label_map = {
    file_id: label
    for file_id, label in zip(labels_df["file"], labels_df["label"])
}

CLASS_NAMES = ["modified", "unmodified", "synthetic", "spliced"]
NUM_CLASSES = len(CLASS_NAMES)

# =========================
# DATASET CLASS
# =========================
class MelDataset(Dataset):
    """Dataset reading mel-spectrogram arrays from ``melspec_features/``.

    file_ids and labels are parallel sequences; transform, when given, is
    applied to the channel-first float tensor of each item.
    """

    def __init__(self, file_ids, labels, transform=None):
        self.file_ids, self.labels, self.transform = file_ids, labels, transform

    def __len__(self):
        return len(self.file_ids)

    def __getitem__(self, idx):
        """Return (tensor, label) for the sample at position *idx*."""
        array = np.load(f"melspec_features/{self.file_ids[idx]}.npy")
        # Stored arrays are channel-last; permute to (C, H, W).
        image = torch.tensor(array, dtype=torch.float32).permute(2, 0, 1)

        if self.transform:
            image = self.transform(image)

        return image, self.labels[idx]

# =========================
# TRANSFORMS
# =========================
from torchvision import transforms

IMG_SIZE = 224
BATCH_SIZE = 64  # reduce to 32 if GPU memory is low

# One deterministic pipeline for every split: resize to the backbone input
# size, then per-channel ImageNet mean/std normalisation.
_pipeline_steps = [
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(_pipeline_steps)

# =========================
# DATASETS & LOADERS
# =========================
def _dataset_for(split_key):
    """Build the MelDataset for one split recorded in dataset_info."""
    ids = dataset_info[split_key]
    return MelDataset(ids, [label_map[i] for i in ids], transform)


train_ds = _dataset_for("train_files")
val_ds = _dataset_for("val_files")
test_ds = _dataset_for("test_files")

# Same batch size and pinned memory everywhere; only training is shuffled.
_loader_options = dict(batch_size=BATCH_SIZE, pin_memory=True)

train_loader = DataLoader(train_ds, shuffle=True, **_loader_options)
val_loader = DataLoader(val_ds, shuffle=False, **_loader_options)
test_loader = DataLoader(test_ds, shuffle=False, **_loader_options)

# =========================
# MODEL WRAPPER
# =========================
class AudioClassifier(nn.Module):
    """ImageNet-pretrained image backbone with a NUM_CLASSES output head.

    Supported backbones: "resnet50" (torchvision) and "efficientnet_b0"
    (timm). Any other name raises ValueError.
    """

    def __init__(self, backbone="resnet50"):
        super().__init__()

        if backbone not in ("resnet50", "efficientnet_b0"):
            raise ValueError(f"Unknown backbone: {backbone}")

        if backbone == "resnet50":
            network = tv_models.resnet50(
                weights=tv_models.ResNet50_Weights.IMAGENET1K_V1
            )
            # Swap the ImageNet head for one sized to this task.
            network.fc = nn.Linear(network.fc.in_features, NUM_CLASSES)
        else:
            # timm builds the classifier head itself from num_classes.
            network = timm.create_model(
                "efficientnet_b0", pretrained=True, num_classes=NUM_CLASSES
            )

        self.model = network

    def forward(self, x):
        """Return the class logits for batch *x*."""
        return self.model(x)


# =========================
# TRAIN / VALIDATE LOOP
# =========================
def run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over *loader* and return its metrics.

    When *optimizer* is given the model is put in train mode and updated
    after every batch. Otherwise the model is put in eval mode and the pass
    runs with autograd disabled, so no graph is built during validation or
    testing.

    Returns:
        (accuracy in percent, weighted F1, predictions, true labels), the
        last two as flat lists in loader order.
    """
    training = optimizer is not None
    model.train(training)

    correct, total = 0, 0
    preds, labels_all = [], []

    # Gradients are only needed while training; disabling them otherwise
    # avoids holding activations for a backward pass that never happens.
    with torch.set_grad_enabled(training):
        for x, y in tqdm(loader, leave=False):
            x = x.to(device, non_blocking=True)
            y = y.to(device, non_blocking=True)

            if training:
                optimizer.zero_grad()

            outputs = model(x)
            loss = criterion(outputs, y)

            if training:
                loss.backward()
                optimizer.step()

            # Predicted class = index of the largest logit.
            _, predicted = outputs.max(1)
            correct += predicted.eq(y).sum().item()
            total += y.size(0)

            preds.extend(predicted.cpu().numpy())
            labels_all.extend(y.cpu().numpy())

    acc = 100 * correct / total
    f1 = f1_score(labels_all, preds, average="weighted")

    return acc, f1, preds, labels_all

# =========================
# TRAIN MODEL
# =========================
def train_model(backbone, epochs=25, lr=1e-3):
    """Train an AudioClassifier and return it with its best weights loaded.

    "Best" means the highest validation accuracy seen over all epochs. The
    learning rate is reduced when validation accuracy stops improving.
    """
    print(f"\nTraining {backbone}")

    net = AudioClassifier(backbone).to(device)
    loss_fn = nn.CrossEntropyLoss()
    opt = optim.Adam(net.parameters(), lr=lr, weight_decay=1e-4)
    # mode="max" because the monitored quantity is an accuracy.
    plateau = ReduceLROnPlateau(opt, mode="max", patience=3)

    top_val_acc = 0
    top_state = copy.deepcopy(net.state_dict())

    for epoch_no in range(1, epochs + 1):
        print(f"\nEpoch {epoch_no}/{epochs}")

        train_acc, *_ = run_epoch(net, train_loader, loss_fn, opt)
        val_acc, val_f1, *_ = run_epoch(net, val_loader, loss_fn)

        plateau.step(val_acc)

        print(f"Train Acc: {train_acc:.2f}%")
        print(f"Val   Acc: {val_acc:.2f}% | Val F1: {val_f1:.4f}")

        # Snapshot the weights whenever validation accuracy improves.
        if val_acc > top_val_acc:
            top_val_acc = val_acc
            top_state = copy.deepcopy(net.state_dict())

    net.load_state_dict(top_state)
    return net

# =========================
# EVALUATION
# =========================
def evaluate(model, name):
    """Score *model* on the test loader, print a report and save a plot.

    The confusion matrix is written to "<name>_confusion_matrix.png".
    Returns (accuracy in percent, weighted F1).
    """
    # No optimizer is passed, so run_epoch only evaluates.
    acc, f1, preds, labels = run_epoch(model, test_loader, nn.CrossEntropyLoss())

    print(f"\n{name} TEST RESULTS")
    print(f"Accuracy: {acc:.2f}%")
    print(f"F1-score: {f1:.4f}")
    print(classification_report(labels, preds, target_names=CLASS_NAMES))

    cm = confusion_matrix(labels, preds)
    ticks = range(4)

    plt.imshow(cm)
    plt.title(f"{name} Confusion Matrix")
    plt.xticks(ticks, CLASS_NAMES, rotation=45)
    plt.yticks(ticks, CLASS_NAMES)

    # Annotate each cell (row = true class, column = predicted class).
    for i, j in np.ndindex(4, 4):
        plt.text(j, i, cm[i, j], ha="center", va="center")

    plt.tight_layout()
    plt.savefig(f"{name}_confusion_matrix.png", dpi=300)
    plt.close()

    return acc, f1

# =========================
# RUN EXPERIMENTS
# =========================


# Backbone name -> keyword arguments forwarded to train_model.
experiments = {
    "resnet50": dict(epochs=25, lr=1e-3),
    "efficientnet_b0": dict(epochs=25, lr=1e-3),
}

results = {}

for model_name, cfg in experiments.items():
    # Train, score on the test set, then persist the best weights.
    model = train_model(model_name, **cfg)
    acc, f1 = evaluate(model, model_name)

    torch.save(model.state_dict(), f"{model_name}.pth")
    results[model_name] = dict(acc=acc, f1=f1)

# =========================
# SUMMARY
# =========================
print("\nFINAL RESULTS")
for name, res in results.items():
    print(f"{name}: Accuracy={res['acc']:.2f}%, F1={res['f1']:.4f}")

# Highest test accuracy wins.
best_model = max(results, key=lambda candidate: results[candidate]["acc"])
print(f"\nüèÜ Best Model: {best_model}")


Using device: cuda

Training resnet50


AttributeError: 'dict' object has no attribute 'resnet50'


# Step 7: Complete Inference Pipeline for New Audio Files
# This allows you to predict on new audio files

In [None]:
# =========================
# STEP 7: INFERENCE PIPELINE (ALIGNED)
# =========================



# =========================
# CONFIGURATION (SAME AS TRAINING)
# =========================
# Audio framing and image size; these must match the values used when the
# training features were extracted.
SR = 16000
DURATION = 5
SAMPLES = DURATION * SR
IMG_SIZE = 224

# Display names, listed in label order.
CLASS_NAMES = ["modified", "unmodified", "synthetic", "spliced"]

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# =========================
# MODEL DEFINITION
# =========================
class AudioClassifier(nn.Module):
    """Inference-time classifier wrapper; weights come from a saved state dict.

    Args:
        backbone: "resnet50" (torchvision) or "efficientnet_b0" (timm).
        num_classes: size of the output layer.

    Raises:
        ValueError: if *backbone* is not one of the supported names.
    """

    def __init__(self, backbone="resnet50", num_classes=4):
        super().__init__()

        if backbone == "resnet50":
            # torchvision's model zoo is imported as tv_models; the bare
            # name `models` is the dict of scikit-learn estimators built in
            # the classical-ML step, which has no `resnet50` attribute.
            self.model = tv_models.resnet50(weights=None)
            self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)

        elif backbone == "efficientnet_b0":
            self.model = timm.create_model(
                "efficientnet_b0",
                pretrained=False,
                num_classes=num_classes
            )

        else:
            # Fail here rather than later with a confusing missing-attribute
            # error on self.model.
            raise ValueError(f"Unknown backbone: {backbone}")

    def forward(self, x):
        """Return the class logits for batch *x*."""
        return self.model(x)

# =========================
# AUDIO PREPROCESSING
# =========================
def preprocess_audio(audio_path):
    """Load, denoise and length-normalise one recording for inference.

    The order mirrors the training pre-processing: noise reduction runs on
    the signal as loaded, and only afterwards is it padded/trimmed to
    SAMPLES. Padding first would feed artificial silence into the noise
    reduction step and make inference inputs differ from training inputs.
    """
    y, _ = librosa.load(audio_path, sr=SR, mono=True)
    y = nr.reduce_noise(y=y, sr=SR)
    y = librosa.util.fix_length(y, size=SAMPLES)
    return y

def create_melspectrogram(y):
    """Return a 3-channel, 216-frame log-mel spectrogram of signal *y*.

    Uses 128 mel bands, a 1024-point FFT and a hop of 512 samples. The
    dB-scaled result is replicated into three identical channels, laid out
    as (H, W, 3).
    """
    power = librosa.feature.melspectrogram(
        y=y, sr=SR, n_mels=128, n_fft=1024, hop_length=512
    )
    decibels = librosa.power_to_db(power, ref=np.max)
    decibels = librosa.util.fix_length(decibels, size=216, axis=1)
    return np.stack((decibels, decibels, decibels), axis=-1)

# =========================
# TRANSFORM (SAME AS TRAINING)
# =========================
# Inference pipeline: array -> tensor, resize to the backbone input size,
# then per-channel ImageNet mean/std normalisation.
# NOTE(review): ToTensor is presumably used here only to move the (H, W, 3)
# float array to channel-first layout; confirm it applies no value scaling
# for this input dtype.
_inference_steps = [
    transforms.ToTensor(),
    transforms.Resize((IMG_SIZE, IMG_SIZE)),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
]
transform = transforms.Compose(_inference_steps)

# =========================
# PREDICTOR CLASS
# =========================
class AudioPredictor:
    """Load trained weights once and classify individual audio files."""

    def __init__(self, model_name="resnet50"):
        """Build the *model_name* backbone and load "<model_name>.pth"."""
        self.model_name = model_name
        self.model = AudioClassifier(model_name).to(device)
        self.model.load_state_dict(
            torch.load(f"{model_name}.pth", map_location=device)
        )
        # Inference only: switch the model to eval mode.
        self.model.eval()

    def predict(self, audio_path):
        """Classify the recording at *audio_path*.

        Returns:
            dict with "predicted_class" (str), "confidence" (float) and
            "all_probabilities" (class name -> float). All numbers are
            plain Python floats.
        """
        y = preprocess_audio(audio_path)
        mel = create_melspectrogram(y)
        # .float() guarantees float32 input whatever dtype the audio stack
        # produced; it is a no-op when the tensor is already float32.
        x = transform(mel).unsqueeze(0).float().to(device)

        with torch.no_grad():
            logits = self.model(x)
            probs = torch.softmax(logits, dim=1)[0].cpu().numpy()
            pred = int(np.argmax(probs))

        return {
            "predicted_class": CLASS_NAMES[pred],
            "confidence": float(probs[pred]),
            # Convert numpy scalars so every value has the same type as
            # "confidence" and prints/serialises cleanly.
            "all_probabilities": {
                class_name: float(prob)
                for class_name, prob in zip(CLASS_NAMES, probs)
            }
        }

# =========================
# VISUALIZATION (OPTIONAL)
# =========================
def visualize(audio_path, result):
    """Show the log-mel spectrogram of *audio_path* titled with *result*.

    *result* must provide "predicted_class" and "confidence". The plot uses
    librosa's default mel-spectrogram settings.
    """
    signal = preprocess_audio(audio_path)
    decibels = librosa.power_to_db(
        librosa.feature.melspectrogram(y=signal, sr=SR), ref=np.max
    )
    headline = (
        f"Prediction: {result['predicted_class']} "
        f"({result['confidence']*100:.2f}%)"
    )

    plt.figure(figsize=(10, 4))
    librosa.display.specshow(decibels, sr=SR, x_axis="time", y_axis="mel")
    plt.title(headline)
    plt.colorbar(format="%+2.0f dB")
    plt.tight_layout()
    plt.show()

# =========================
# EXAMPLE USAGE
# =========================
if __name__ == "__main__":
    # Demo: classify one file if it is present, then plot it.
    predictor = AudioPredictor("resnet50")  # or "efficientnet_b0"
    audio_file = "your_audio.wav"

    if os.path.exists(audio_file):
        result = predictor.predict(audio_file)

        print("\nPrediction Result:")
        for field, value in result.items():
            print(field, ":", value)

        visualize(audio_file, result)
