# Step 1: Install Required Libraries
!pip install datasets librosa scikit-learn matplotlib seaborn tensorflow

# Step 2: Imports

In [42]:
import numpy as np
import pandas as pd
import librosa
import os
from datasets import load_dataset
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.metrics import classification_report, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping

import joblib


# Step 3: Load Urdu Dataset

In [25]:
ds = load_dataset("CSALT/deepfake_detection_dataset_urdu", split="train")

Repo card metadata block was not found. Setting CardData to empty.
Downloading data: 100%|██████████| 6794/6794 [00:44<00:00, 153.93files/s] 
Generating train split: 100%|██████████| 6794/6794 [00:16<00:00, 419.29 examples/s]


# Step 4: Feature Extraction (MFCCs)

In [38]:
def extract_features(batch, max_len=100):
    features, labels = [], []
    for example in batch:
        audio_path = example['audio']['path']  # Correct path reference
        # Load audio using librosa
        y, sr = librosa.load(audio_path, sr=None)
        
        # Extract MFCC features
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        if mfcc.shape[1] < max_len:
            mfcc = np.pad(mfcc, ((0, 0), (0, max_len - mfcc.shape[1])))
        else:
            mfcc = mfcc[:, :max_len]
        
        features.append(mfcc.flatten())  # Flatten for classical models

        # Infer the label from the folder name (e.g., 'Bonafide' -> 0, 'Spoof' -> 1)
        label = 1 if 'Spoof' in audio_path else 0  # Adjust based on folder structure
        labels.append(label)

    return np.array(features), np.array(labels)

X, y = extract_features(ds)


# Step 5: Train-Test Split

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Classical ML Models

In [40]:
models = {
    "SVM": SVC(probability=True, kernel='rbf'),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Perceptron": Perceptron()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    probas = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else model.decision_function(X_test)
    print(f"\n{name} Report:")
    print(classification_report(y_test, preds))
    print("ROC AUC:", roc_auc_score(y_test, probas))
    joblib.dump(model, f"{name.lower().replace(' ', '_')}_audio_model.pkl")



SVM Report:
              precision    recall  f1-score   support

           0       0.95      0.94      0.94       681
           1       0.94      0.95      0.94       678

    accuracy                           0.94      1359
   macro avg       0.94      0.94      0.94      1359
weighted avg       0.94      0.94      0.94      1359

ROC AUC: 0.9891427234805661


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Logistic Regression Report:
              precision    recall  f1-score   support

           0       0.79      0.82      0.80       681
           1       0.81      0.78      0.79       678

    accuracy                           0.80      1359
   macro avg       0.80      0.80      0.80      1359
weighted avg       0.80      0.80      0.80      1359

ROC AUC: 0.8825516873936038

Perceptron Report:
              precision    recall  f1-score   support

           0       0.73      0.93      0.82       681
           1       0.90      0.66      0.76       678

    accuracy                           0.79      1359
   macro avg       0.82      0.79      0.79      1359
weighted avg       0.81      0.79      0.79      1359

ROC AUC: 0.9156324856297567


# Step 7: Deep Neural Network

In [48]:

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define your model
model_dnn = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# Compile the model
model_dnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model_dnn.fit(
    X_train, y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[EarlyStopping(patience=3, restore_best_weights=True)]
)


Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.8008 - loss: 0.4181 - val_accuracy: 0.7579 - val_loss: 0.7281
Epoch 2/20
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9231 - loss: 0.1905 - val_accuracy: 0.8698 - val_loss: 0.3246
Epoch 3/20
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.9492 - loss: 0.1388 - val_accuracy: 0.9735 - val_loss: 0.0863
Epoch 4/20
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.9627 - loss: 0.0986 - val_accuracy: 0.9676 - val_loss: 0.0895
Epoch 5/20
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.9562 - loss: 0.1165 - val_accuracy: 0.9632 - val_loss: 0.0969
Epoch 6/20
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9562 - loss: 0.1158 - val_accuracy: 0.9441 - val_loss: 0.1553


# Evaluate DNN

In [50]:
dnn_preds = (model_dnn.predict(X_test) > 0.5).astype(int)
print("\nDNN Report:")
print(classification_report(y_test, dnn_preds))
print("ROC AUC:", roc_auc_score(y_test, model_dnn.predict(X_test)))
model_dnn.save("deepfake_dnn.h5")

[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step

DNN Report:
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       681
           1       0.98      0.96      0.97       678

    accuracy                           0.97      1359
   macro avg       0.97      0.97      0.97      1359
weighted avg       0.97      0.97      0.97      1359

[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step




ROC AUC: 0.9962466267288691
