In [8]:
import os
import numpy as np
import librosa
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tqdm import tqdm
import pickle
import json

# Set random seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


### Configuration and Paths

In [9]:
AUDIO_DIR = r"D:\BE Project\Self\MAIN DATASET"
SPECTROGRAM_DIR = r"D:\BE Project\Self\APPROACH3-CNN-BiLSTM\spectrograms"
EMBEDDING_DIR = r"D:\BE Project\Self\APPROACH3-CNN-BiLSTM\embeddings"
MODEL_DIR = r"D:\BE Project\Self\APPROACH3-CNN-BiLSTM\models"

# Create directories
for dir_path in [SPECTROGRAM_DIR, EMBEDDING_DIR, MODEL_DIR]:
    os.makedirs(dir_path, exist_ok=True)

# Audio processing parameters
SAMPLE_RATE = 16000
DURATION = 3  # seconds
SEGMENT_SAMPLES = SAMPLE_RATE * DURATION
N_MELS = 128
N_FFT = 1024
HOP_LENGTH = 512

### Data Preprocessing - Generate Spectrograms


In [3]:
def generate_spectrograms():
    """Generate mel-spectrograms from audio files"""
    print("Generating spectrograms...")
    
    for file in tqdm(os.listdir(AUDIO_DIR)):
        if not file.endswith(".wav"):
            continue
            
        file_path = os.path.join(AUDIO_DIR, file)
        
        # Determine label from filename
        # Assumes 'fake' in filename = fake (1), otherwise real (0)
        label = 1 if "fake" in file.lower() else 0
        
        try:
            # Load and preprocess audio
            y, sr = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
            y = librosa.util.fix_length(y, size=SEGMENT_SAMPLES)
            
            # Generate mel-spectrogram
            mel_spec = librosa.feature.melspectrogram(
                y=y, sr=sr, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH
            )
            mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
            
            # Save spectrogram
            out_name = os.path.splitext(file)[0] + ".npy"
            np.save(
                os.path.join(SPECTROGRAM_DIR, out_name), 
                {"spec": mel_spec_db, "label": label, "filename": file}
            )
            
        except Exception as e:
            print(f"Error processing {file}: {e}")
    
    print("Spectrogram generation complete!")

# Generate spectrograms
generate_spectrograms()

Generating spectrograms...


100%|██████████| 800/800 [00:29<00:00, 27.07it/s]

Spectrogram generation complete!





### Data Preprocessing - Generate wav2vec Embeddings

In [18]:
# Load wav2vec 2.0 model
bundle = torchaudio.pipelines.WAV2VEC2_BASE
model_wav2vec = bundle.get_model().to(device).eval()

In [None]:


def extract_wav2vec_embedding(audio_path):
    """Extract wav2vec embedding from audio file"""
    waveform, sr = torchaudio.load(audio_path)
    
    # Convert to mono if stereo
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    
    # Resample if necessary
    if sr != bundle.sample_rate:
        waveform = torchaudio.transforms.Resample(sr, bundle.sample_rate)(waveform)
    
    waveform = waveform.to(device)
    
    with torch.inference_mode():
        features, _ = model_wav2vec(waveform)
        # Average across time dimension
        embedding = features.mean(dim=1).squeeze().cpu().numpy()
    
    return embedding

def generate_embeddings():
    """Generate wav2vec embeddings from audio files"""
    print("Generating wav2vec embeddings...")
    
    for file in tqdm(os.listdir(AUDIO_DIR)):
        if not file.endswith(".wav"):
            continue
            
        file_path = os.path.join(AUDIO_DIR, file)
        label = 1 if "fake" in file.lower() else 0
        
        try:
            embedding = extract_wav2vec_embedding(file_path)
            
            # Save embedding
            out_name = os.path.splitext(file)[0] + ".npy"
            np.save(
                os.path.join(EMBEDDING_DIR, out_name),
                {"embedding": embedding, "label": label, "filename": file}
            )
            
        except Exception as e:
            print(f"Error processing {file}: {e}")
    
    print("Embedding generation complete!")

# Generate embeddings
generate_embeddings()

Generating wav2vec embeddings...


100%|██████████| 800/800 [13:37<00:00,  1.02s/it]

Embedding generation complete!





### Dataset Classes

In [10]:
class SpectrogramDataset(Dataset):
    def __init__(self, spec_dir):
        self.paths = [
            os.path.join(spec_dir, fname) 
            for fname in os.listdir(spec_dir) 
            if fname.endswith(".npy")
        ]

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        data = np.load(self.paths[idx], allow_pickle=True).item()
        spec = data["spec"]
        label = data["label"]
        
        # Normalize spectrogram
        spec = (spec - spec.mean()) / (spec.std() + 1e-8)
        spec_tensor = torch.tensor(spec, dtype=torch.float32).unsqueeze(0)  # [1, Mel, Time]
        
        return spec_tensor, torch.tensor(label, dtype=torch.float32)

class EmbeddingDataset(Dataset):
    def __init__(self, embedding_dir):
        self.paths = [
            os.path.join(embedding_dir, fname)
            for fname in os.listdir(embedding_dir)
            if fname.endswith(".npy")
        ]

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        data = np.load(self.paths[idx], allow_pickle=True).item()
        embedding = data["embedding"]
        label = data["label"]
        
        return torch.tensor(embedding, dtype=torch.float32), torch.tensor(label, dtype=torch.float32)


### Model Definitions


In [11]:
class CNN_BiLSTM(nn.Module):
    def __init__(self, n_mels=128):
        super().__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d((2, 2)),
        )
        
        # Calculate LSTM input size
        lstm_input_size = 64 * (n_mels // 8)  # After 3 maxpool operations
        
        self.lstm = nn.LSTM(
            input_size=lstm_input_size, 
            hidden_size=128, 
            batch_first=True, 
            bidirectional=True,
            dropout=0.2
        )
        
        self.fc = nn.Sequential(
            nn.Linear(256, 128),  # 256 because bidirectional
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        # CNN feature extraction
        x = self.cnn(x)  # [B, 64, Mel/8, Time/8]
        
        # Prepare for LSTM: [B, Time, Features]
        x = x.permute(0, 3, 1, 2)  # [B, Time, Channels, Mels]
        x = x.flatten(2)           # [B, Time, Channels * Mels]
        
        # LSTM processing
        lstm_out, _ = self.lstm(x)  # [B, Time, 256]
        
        # Use last output
        x = lstm_out[:, -1, :]  # [B, 256]
        
        # Classification
        return self.fc(x).squeeze()

class MLPClassifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.model(x).squeeze()

print("Models defined successfully!")

Models defined successfully!


### Training Functions

In [12]:
def train_model(model, train_loader, val_loader, model_name, epochs=50, patience=7):
    """Generic training function"""
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3)
    criterion = nn.BCEWithLogitsLoss()
    
    best_val_loss = float('inf')
    patience_counter = 0
    train_losses = []
    val_losses = []
    val_accuracies = []
    
    print(f"\nTraining {model_name}...")
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        correct_train = 0
        total_train = 0
        
        for batch_x, batch_y in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item()
            
            preds = torch.sigmoid(outputs) > 0.5
            correct_train += (preds == batch_y.bool()).sum().item()
            total_train += batch_y.numel()

        avg_train_loss = train_loss / len(train_loader)
        train_accuracy = correct_train / total_train * 100

        # Validation phase
        model.eval()
        val_loss = 0.0
        correct_val = 0
        total_val = 0

        with torch.no_grad():
            for batch_x, batch_y in val_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                val_loss += loss.item()

                preds = torch.sigmoid(outputs) > 0.5
                correct_val += (preds == batch_y.bool()).sum().item()
                total_val += batch_y.numel()

        avg_val_loss = val_loss / len(val_loader)
        val_accuracy = correct_val / total_val * 100

        train_losses.append(avg_train_loss)
        val_losses.append(avg_val_loss)
        val_accuracies.append(val_accuracy)
        
        print(f"Epoch {epoch+1}: Train Loss = {avg_train_loss:.4f}, "
              f"Train Acc = {train_accuracy:.2f}%, "
              f"Val Loss = {avg_val_loss:.4f}, Val Acc = {val_accuracy:.2f}%")

        scheduler.step(avg_val_loss)

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            patience_counter = 0
            torch.save(model.state_dict(), os.path.join(MODEL_DIR, f"{model_name}_best.pth"))
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered.")
                break

    return train_losses, val_losses, val_accuracies


def evaluate_model(model, test_loader, model_name):
    """Evaluate model performance"""
    model.eval()
    test_preds, test_labels = [], []
    
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x = batch_x.to(device)
            outputs = model(batch_x)
            probs = torch.sigmoid(outputs).cpu().numpy()
            test_preds.extend(probs)
            test_labels.extend(batch_y.numpy())
    
    test_preds = np.array(test_preds)
    test_labels = np.array(test_labels)
    pred_binary = test_preds > 0.5
    
    metrics = {
        'accuracy': accuracy_score(test_labels, pred_binary),
        'precision': precision_score(test_labels, pred_binary),
        'recall': recall_score(test_labels, pred_binary),
        'f1': f1_score(test_labels, pred_binary),
        'auc': roc_auc_score(test_labels, test_preds)
    }
    
    print(f"\n{model_name} Test Results:")
    for metric, value in metrics.items():
        print(f"{metric.capitalize()}: {value:.4f}")
    
    return test_preds, test_labels, metrics


### Train CNN+BiLSTM Model

In [13]:
# Create datasets and data loaders
spec_dataset = SpectrogramDataset(SPECTROGRAM_DIR)
embed_dataset = EmbeddingDataset(EMBEDDING_DIR)

print(f"Spectrogram dataset size: {len(spec_dataset)}")
print(f"Embedding dataset size: {len(embed_dataset)}")

# New 90% train, 10% test split
torch.manual_seed(42)
spec_lengths = [int(0.9 * len(spec_dataset))]
spec_lengths.append(len(spec_dataset) - sum(spec_lengths))  # 10%
spec_train, spec_test = random_split(spec_dataset, spec_lengths)

torch.manual_seed(42)
embed_lengths = [int(0.9 * len(embed_dataset))]
embed_lengths.append(len(embed_dataset) - sum(embed_lengths))  # 10%
embed_train, embed_test = random_split(embed_dataset, embed_lengths)

# Create data loaders
spec_train_loader = DataLoader(spec_train, batch_size=8, shuffle=True)
spec_val_loader = DataLoader(spec_test, batch_size=8)  # test set used as validation
spec_test_loader = DataLoader(spec_test, batch_size=8)

# Train CNN+BiLSTM model
cnn_bilstm = CNN_BiLSTM(n_mels=N_MELS)
cnn_losses = train_model(cnn_bilstm, spec_train_loader, spec_val_loader, "CNN_BiLSTM", epochs=30)

# Load best model and evaluate
cnn_bilstm.load_state_dict(torch.load(os.path.join(MODEL_DIR, "CNN_BiLSTM_best.pth")))
cnn_bilstm = cnn_bilstm.to(device)
cnn_preds, cnn_labels, cnn_metrics = evaluate_model(cnn_bilstm, spec_test_loader, "CNN+BiLSTM")


Spectrogram dataset size: 800
Embedding dataset size: 800





Training CNN_BiLSTM...


Epoch 1/30: 100%|██████████| 90/90 [00:23<00:00,  3.83it/s]


Epoch 1: Train Loss = 0.6926, Train Acc = 51.67%, Val Loss = 0.7020, Val Acc = 46.25%


Epoch 2/30: 100%|██████████| 90/90 [00:09<00:00,  9.74it/s]


Epoch 2: Train Loss = 0.6548, Train Acc = 62.92%, Val Loss = 0.5882, Val Acc = 68.75%


Epoch 3/30: 100%|██████████| 90/90 [00:09<00:00,  9.90it/s]


Epoch 3: Train Loss = 0.5813, Train Acc = 69.17%, Val Loss = 0.6943, Val Acc = 66.25%


Epoch 4/30: 100%|██████████| 90/90 [00:08<00:00, 10.78it/s]


Epoch 4: Train Loss = 0.4972, Train Acc = 75.56%, Val Loss = 0.3407, Val Acc = 86.25%


Epoch 5/30: 100%|██████████| 90/90 [00:08<00:00, 10.59it/s]


Epoch 5: Train Loss = 0.3045, Train Acc = 87.08%, Val Loss = 0.0846, Val Acc = 98.75%


Epoch 6/30: 100%|██████████| 90/90 [00:09<00:00,  9.56it/s]


Epoch 6: Train Loss = 0.1538, Train Acc = 92.92%, Val Loss = 0.8401, Val Acc = 72.50%


Epoch 7/30: 100%|██████████| 90/90 [00:08<00:00, 10.09it/s]


Epoch 7: Train Loss = 0.1622, Train Acc = 94.17%, Val Loss = 0.1828, Val Acc = 93.75%


Epoch 8/30: 100%|██████████| 90/90 [00:10<00:00,  8.96it/s]


Epoch 8: Train Loss = 0.0456, Train Acc = 98.75%, Val Loss = 0.2488, Val Acc = 91.25%


Epoch 9/30: 100%|██████████| 90/90 [00:09<00:00,  9.51it/s]


Epoch 9: Train Loss = 0.1024, Train Acc = 97.08%, Val Loss = 0.0099, Val Acc = 100.00%


Epoch 10/30: 100%|██████████| 90/90 [00:07<00:00, 11.42it/s]


Epoch 10: Train Loss = 0.0191, Train Acc = 99.58%, Val Loss = 0.0024, Val Acc = 100.00%


Epoch 11/30: 100%|██████████| 90/90 [00:07<00:00, 12.28it/s]


Epoch 11: Train Loss = 0.0029, Train Acc = 100.00%, Val Loss = 0.0005, Val Acc = 100.00%


Epoch 12/30: 100%|██████████| 90/90 [00:08<00:00, 10.41it/s]


Epoch 12: Train Loss = 0.0008, Train Acc = 100.00%, Val Loss = 0.0002, Val Acc = 100.00%


Epoch 13/30: 100%|██████████| 90/90 [00:09<00:00,  9.59it/s]


Epoch 13: Train Loss = 0.0014, Train Acc = 100.00%, Val Loss = 0.0001, Val Acc = 100.00%


Epoch 14/30: 100%|██████████| 90/90 [00:10<00:00,  8.44it/s]


Epoch 14: Train Loss = 0.0005, Train Acc = 100.00%, Val Loss = 0.0000, Val Acc = 100.00%


Epoch 15/30: 100%|██████████| 90/90 [00:11<00:00,  8.03it/s]


Epoch 15: Train Loss = 0.0003, Train Acc = 100.00%, Val Loss = 0.0000, Val Acc = 100.00%


Epoch 16/30: 100%|██████████| 90/90 [00:11<00:00,  7.55it/s]


Epoch 16: Train Loss = 0.0001, Train Acc = 100.00%, Val Loss = 0.0000, Val Acc = 100.00%


Epoch 17/30: 100%|██████████| 90/90 [00:13<00:00,  6.65it/s]


Epoch 17: Train Loss = 0.0001, Train Acc = 100.00%, Val Loss = 0.0000, Val Acc = 100.00%


Epoch 18/30: 100%|██████████| 90/90 [00:15<00:00,  5.82it/s]


Epoch 18: Train Loss = 0.0001, Train Acc = 100.00%, Val Loss = 0.0000, Val Acc = 100.00%


Epoch 19/30: 100%|██████████| 90/90 [00:16<00:00,  5.44it/s]


Epoch 19: Train Loss = 0.0219, Train Acc = 99.72%, Val Loss = 0.6945, Val Acc = 82.50%


Epoch 20/30: 100%|██████████| 90/90 [00:16<00:00,  5.55it/s]


Epoch 20: Train Loss = 0.1249, Train Acc = 96.25%, Val Loss = 0.1666, Val Acc = 92.50%


Epoch 21/30: 100%|██████████| 90/90 [00:14<00:00,  6.01it/s]


Epoch 21: Train Loss = 0.1003, Train Acc = 96.53%, Val Loss = 0.0193, Val Acc = 100.00%


Epoch 22/30: 100%|██████████| 90/90 [00:14<00:00,  6.14it/s]


Epoch 22: Train Loss = 0.0523, Train Acc = 98.61%, Val Loss = 0.0075, Val Acc = 100.00%


Epoch 23/30: 100%|██████████| 90/90 [00:15<00:00,  5.93it/s]


Epoch 23: Train Loss = 0.0077, Train Acc = 99.86%, Val Loss = 0.0036, Val Acc = 100.00%


Epoch 24/30: 100%|██████████| 90/90 [00:15<00:00,  5.93it/s]


Epoch 24: Train Loss = 0.0051, Train Acc = 100.00%, Val Loss = 0.0026, Val Acc = 100.00%


Epoch 25/30: 100%|██████████| 90/90 [00:15<00:00,  5.73it/s]


Epoch 25: Train Loss = 0.0033, Train Acc = 100.00%, Val Loss = 0.0022, Val Acc = 100.00%
Early stopping triggered.

CNN+BiLSTM Test Results:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1: 1.0000
Auc: 1.0000


### Train wav2vec+MLP Model

In [None]:
# New 90% train / 10% test split for embeddings
torch.manual_seed(42)
embed_lengths = [int(0.9 * len(embed_dataset))]
embed_lengths.append(len(embed_dataset) - sum(embed_lengths))  # 10%
embed_train, embed_test = random_split(embed_dataset, embed_lengths)

# Create data loaders
embed_train_loader = DataLoader(embed_train, batch_size=32, shuffle=True)
embed_val_loader = DataLoader(embed_test, batch_size=32)  # using test as val
embed_test_loader = DataLoader(embed_test, batch_size=32)

# Get embedding dimension
sample_embedding = embed_dataset[0][0]
embedding_dim = int(sample_embedding.shape[0])
print(f"Embedding dimension: {embedding_dim}")

# Train MLP model
mlp_model = MLPClassifier(embedding_dim)
mlp_losses = train_model(mlp_model, embed_train_loader, embed_val_loader, "wav2vec_MLP", epochs=30)

# Load best model and evaluate
mlp_model.load_state_dict(torch.load(os.path.join(MODEL_DIR, "wav2vec_MLP_best.pth")))
mlp_model = mlp_model.to(device)
mlp_preds, mlp_labels, mlp_metrics = evaluate_model(mlp_model, embed_test_loader, "wav2vec+MLP")


Embedding dimension: 768

Training wav2vec_MLP...


Epoch 1/30: 100%|██████████| 23/23 [00:12<00:00,  1.91it/s]


Epoch 1: Train Loss = 0.6910, Train Acc = 52.64%, Val Loss = 0.6964, Val Acc = 42.50%


Epoch 2/30: 100%|██████████| 23/23 [00:00<00:00, 43.19it/s]


Epoch 2: Train Loss = 0.6564, Train Acc = 61.94%, Val Loss = 0.5944, Val Acc = 68.75%


Epoch 3/30: 100%|██████████| 23/23 [00:00<00:00, 33.68it/s]


Epoch 3: Train Loss = 0.5052, Train Acc = 76.53%, Val Loss = 0.2849, Val Acc = 96.25%


Epoch 4/30: 100%|██████████| 23/23 [00:00<00:00, 30.75it/s]


Epoch 4: Train Loss = 0.2447, Train Acc = 91.67%, Val Loss = 0.1469, Val Acc = 95.00%


Epoch 5/30: 100%|██████████| 23/23 [00:00<00:00, 33.96it/s]


Epoch 5: Train Loss = 0.1292, Train Acc = 94.72%, Val Loss = 0.0407, Val Acc = 100.00%


Epoch 6/30: 100%|██████████| 23/23 [00:00<00:00, 30.77it/s]


Epoch 6: Train Loss = 0.0630, Train Acc = 97.64%, Val Loss = 0.1297, Val Acc = 96.25%


Epoch 7/30: 100%|██████████| 23/23 [00:00<00:00, 37.22it/s]


Epoch 7: Train Loss = 0.0853, Train Acc = 96.53%, Val Loss = 0.4877, Val Acc = 81.25%


Epoch 8/30: 100%|██████████| 23/23 [00:00<00:00, 36.06it/s]


Epoch 8: Train Loss = 0.1445, Train Acc = 93.47%, Val Loss = 0.0851, Val Acc = 97.50%


Epoch 9/30: 100%|██████████| 23/23 [00:00<00:00, 34.30it/s]


Epoch 9: Train Loss = 0.0369, Train Acc = 98.61%, Val Loss = 0.0346, Val Acc = 98.75%


Epoch 10/30: 100%|██████████| 23/23 [00:00<00:00, 31.03it/s]


Epoch 10: Train Loss = 0.0231, Train Acc = 99.03%, Val Loss = 0.0450, Val Acc = 98.75%


Epoch 11/30: 100%|██████████| 23/23 [00:00<00:00, 43.76it/s]


Epoch 11: Train Loss = 0.0167, Train Acc = 99.44%, Val Loss = 0.0080, Val Acc = 100.00%


Epoch 12/30: 100%|██████████| 23/23 [00:00<00:00, 38.69it/s]


Epoch 12: Train Loss = 0.0116, Train Acc = 99.72%, Val Loss = 0.0029, Val Acc = 100.00%


Epoch 13/30: 100%|██████████| 23/23 [00:00<00:00, 45.98it/s]


Epoch 13: Train Loss = 0.0188, Train Acc = 99.31%, Val Loss = 0.0084, Val Acc = 100.00%


Epoch 14/30: 100%|██████████| 23/23 [00:00<00:00, 47.64it/s]


Epoch 14: Train Loss = 0.0290, Train Acc = 98.61%, Val Loss = 0.0457, Val Acc = 97.50%


Epoch 15/30: 100%|██████████| 23/23 [00:00<00:00, 57.09it/s]


Epoch 15: Train Loss = 0.0654, Train Acc = 97.92%, Val Loss = 0.0247, Val Acc = 98.75%


Epoch 16/30: 100%|██████████| 23/23 [00:00<00:00, 48.46it/s]


Epoch 16: Train Loss = 0.0301, Train Acc = 99.31%, Val Loss = 0.0103, Val Acc = 100.00%


Epoch 17/30: 100%|██████████| 23/23 [00:00<00:00, 58.72it/s]


Epoch 17: Train Loss = 0.0122, Train Acc = 99.58%, Val Loss = 0.0147, Val Acc = 100.00%


Epoch 18/30: 100%|██████████| 23/23 [00:00<00:00, 44.47it/s]


Epoch 18: Train Loss = 0.0083, Train Acc = 100.00%, Val Loss = 0.0233, Val Acc = 98.75%


Epoch 19/30: 100%|██████████| 23/23 [00:00<00:00, 43.12it/s]


Epoch 19: Train Loss = 0.0116, Train Acc = 99.72%, Val Loss = 0.0064, Val Acc = 100.00%
Early stopping triggered.

wav2vec+MLP Test Results:
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1: 1.0000
Auc: 1.0000


### Create Ensemble Model

In [15]:
def find_best_ensemble_weight(val_preds_1, val_preds_2, val_labels):
    """Find optimal ensemble weight using validation set"""
    best_f1 = 0
    best_alpha = 0.5
    
    for alpha in np.arange(0.0, 1.05, 0.05):
        ensemble_preds = alpha * val_preds_1 + (1 - alpha) * val_preds_2
        ensemble_binary = ensemble_preds > 0.5
        f1 = f1_score(val_labels, ensemble_binary)
        
        if f1 > best_f1:
            best_f1 = f1
            best_alpha = alpha
    
    return best_alpha, best_f1

# Get predictions on validation set for ensemble tuning
cnn_bilstm.eval()
mlp_model.eval()

val_preds_cnn, val_preds_mlp = [], []
val_labels_ensemble = []

with torch.no_grad():
    # CNN predictions on validation set
    for batch_x, batch_y in spec_val_loader:
        batch_x = batch_x.to(device)
        outputs = torch.sigmoid(cnn_bilstm(batch_x)).cpu().numpy()
        val_preds_cnn.extend(outputs)
        val_labels_ensemble.extend(batch_y.numpy())
    
    # MLP predictions on validation set
    val_labels_mlp = []
    for batch_x, batch_y in embed_val_loader:
        batch_x = batch_x.to(device)
        outputs = torch.sigmoid(mlp_model(batch_x)).cpu().numpy()
        val_preds_mlp.extend(outputs)
        val_labels_mlp.extend(batch_y.numpy())

val_preds_cnn = np.array(val_preds_cnn)
val_preds_mlp = np.array(val_preds_mlp)

# Find best ensemble weight
best_alpha, best_val_f1 = find_best_ensemble_weight(val_preds_cnn, val_preds_mlp, val_labels_ensemble)
print(f"\nBest ensemble weight (α): {best_alpha:.3f}")
print(f"Best validation F1: {best_val_f1:.4f}")



Best ensemble weight (α): 0.000
Best validation F1: 1.0000


### Evaluate Ensemble on Test Set

In [16]:
# Get test predictions from both models
test_preds_cnn, test_preds_mlp = [], []
test_labels_ensemble = []

with torch.no_grad():
    # CNN predictions on test set
    for batch_x, batch_y in spec_test_loader:
        batch_x = batch_x.to(device)
        outputs = torch.sigmoid(cnn_bilstm(batch_x)).cpu().numpy()
        test_preds_cnn.extend(outputs)
        test_labels_ensemble.extend(batch_y.numpy())
    
    # MLP predictions on test set
    for batch_x, batch_y in embed_test_loader:
        batch_x = batch_x.to(device)
        outputs = torch.sigmoid(mlp_model(batch_x)).cpu().numpy()
        test_preds_mlp.extend(outputs)

test_preds_cnn = np.array(test_preds_cnn)
test_preds_mlp = np.array(test_preds_mlp)

# Create ensemble predictions
ensemble_preds = best_alpha * test_preds_cnn + (1 - best_alpha) * test_preds_mlp
ensemble_binary = ensemble_preds > 0.5

# Calculate ensemble metrics
ensemble_metrics = {
    'accuracy': accuracy_score(test_labels_ensemble, ensemble_binary),
    'precision': precision_score(test_labels_ensemble, ensemble_binary),
    'recall': recall_score(test_labels_ensemble, ensemble_binary),
    'f1': f1_score(test_labels_ensemble, ensemble_binary),
    'auc': roc_auc_score(test_labels_ensemble, ensemble_preds)
}

print("\n" + "="*50)
print("ENSEMBLE MODEL TEST RESULTS")
print("="*50)
for metric, value in ensemble_metrics.items():
    print(f"{metric.capitalize()}: {value:.4f}")


ENSEMBLE MODEL TEST RESULTS
Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1: 1.0000
Auc: 1.0000


### Save Complete Ensemble Model

In [22]:
class EnsembleModel:
    def __init__(self, cnn_model, mlp_model, wav2vec_model, alpha, device):
        self.cnn_model = cnn_model
        self.mlp_model = mlp_model
        self.wav2vec_model = wav2vec_model
        self.alpha = alpha
        self.device = device
        
        # Set models to eval mode
        self.cnn_model.eval()
        self.mlp_model.eval()
        self.wav2vec_model.eval()
    
    def predict_audio_file(self, audio_path):
        """Predict single audio file using ensemble"""
        # Generate spectrogram
        y, sr = librosa.load(audio_path, sr=SAMPLE_RATE, mono=True)
        y = librosa.util.fix_length(y, size=SEGMENT_SAMPLES)
        mel_spec = librosa.feature.melspectrogram(
            y=y, sr=sr, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH
        )
        mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
        mel_spec_norm = (mel_spec_db - mel_spec_db.mean()) / (mel_spec_db.std() + 1e-8)
        spec_tensor = torch.tensor(mel_spec_norm, dtype=torch.float32).unsqueeze(0).unsqueeze(0)
        
        # Generate wav2vec embedding
        waveform, sr = torchaudio.load(audio_path)
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        if sr != 16000:
            waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
        
        waveform = waveform.to(self.device)
        with torch.inference_mode():
            features, _ = self.wav2vec_model(waveform)
            embedding = features.mean(dim=1).squeeze().unsqueeze(0)
        
        # Get predictions from both models
        with torch.no_grad():
            spec_tensor = spec_tensor.to(self.device)
            cnn_pred = torch.sigmoid(self.cnn_model(spec_tensor)).cpu().item()
            mlp_pred = torch.sigmoid(self.mlp_model(embedding)).cpu().item()
        
        # Ensemble prediction
        ensemble_pred = self.alpha * cnn_pred + (1 - self.alpha) * mlp_pred
        
        return {
            'ensemble_score': ensemble_pred,
            'cnn_score': cnn_pred,
            'mlp_score': mlp_pred,
            'prediction': 'FAKE' if ensemble_pred > 0.5 else 'REAL'
        }

# Create ensemble model
ensemble_model = EnsembleModel(cnn_bilstm, mlp_model, model_wav2vec, best_alpha, device)

def cast_all(obj):
    """Recursively convert NumPy types to standard Python types."""
    import numbers

    if isinstance(obj, dict):
        return {k: cast_all(v) for k, v in obj.items()}
    elif isinstance(obj, (list, tuple)):
        return type(obj)(cast_all(v) for v in obj)
    elif isinstance(obj, (np.integer,)):
        return int(obj)
    elif isinstance(obj, (numbers.Real,)):
        return float(obj)
    elif isinstance(obj, np.ndarray):
        return obj.tolist()
    else:
        return obj



# Save ensemble model
ensemble_save_dict = {
    'cnn_state_dict': cnn_bilstm.state_dict(),
    'mlp_state_dict': mlp_model.state_dict(),
    'alpha': float(best_alpha),
    'embedding_dim': int(embedding_dim),
    'model_config': {
        'n_mels': int(N_MELS),
        'sample_rate': int(SAMPLE_RATE),
        'duration': int(DURATION),
        'n_fft': int(N_FFT),
        'hop_length': int(HOP_LENGTH)
    },
    'metrics': cast_all({
    'cnn_metrics': cnn_metrics,
    'mlp_metrics': mlp_metrics,
    'ensemble_metrics': ensemble_metrics
})

}

torch.save(ensemble_save_dict, os.path.join(MODEL_DIR, 'complete_ensemble_model.pth'))
print(f"\nComplete ensemble model saved to: {os.path.join(MODEL_DIR, 'complete_ensemble_model.pth')}")



Complete ensemble model saved to: D:\BE Project\Self\APPROACH3-CNN-BiLSTM\models\complete_ensemble_model.pth


### Test Ensemble on Sample Files

In [23]:
# sample_files = [
#     "1_original_shree.wav",
#     "1_fake_aaditya.wav", 
#     "1_fake_jui.wav",
#     "1_original_omkar.wav"
# ]

sample_files = [
    "D:\BE Project\Self\APPROACH3-CNN-BiLSTM\AI Voice Aaditya.wav",
    "D:\BE Project\Self\APPROACH3-CNN-BiLSTM\omkar_new_test_real.wav", 
    "D:\BE Project\Self\APPROACH3-CNN-BiLSTM\omkar_real.wav",
    "D:\BE Project\Self\APPROACH3-CNN-BiLSTM\Real Voice Aaditya.wav",
    
]

print("\n" + "="*60)
print("TESTING ENSEMBLE ON SAMPLE FILES")
print("="*60)

for filename in sample_files:
    file_path = os.path.join(AUDIO_DIR, filename)
    if os.path.exists(file_path):
        try:
            result = ensemble_model.predict_audio_file(file_path)
            print(f"\nFile: {filename}")
            print(f"Prediction: {result['prediction']}")
            print(f"Ensemble Score: {result['ensemble_score']:.4f}")
            print(f"CNN Score: {result['cnn_score']:.4f}")
            print(f"MLP Score: {result['mlp_score']:.4f}")
        except Exception as e:
            print(f"Error processing {filename}: {e}")
    else:
        print(f"File not found: {filename}")

print("\n✅ Training complete! Your ensemble model is ready for use.")



TESTING ENSEMBLE ON SAMPLE FILES

File: D:\BE Project\Self\APPROACH3-CNN-BiLSTM\AI Voice Aaditya.wav
Prediction: FAKE
Ensemble Score: 1.0000
CNN Score: 1.0000
MLP Score: 1.0000

File: D:\BE Project\Self\APPROACH3-CNN-BiLSTM\omkar_new_test_real.wav
Prediction: REAL
Ensemble Score: 0.0000
CNN Score: 0.0000
MLP Score: 0.0000

File: D:\BE Project\Self\APPROACH3-CNN-BiLSTM\omkar_real.wav
Prediction: REAL
Ensemble Score: 0.0001
CNN Score: 0.0000
MLP Score: 0.0001

File: D:\BE Project\Self\APPROACH3-CNN-BiLSTM\Real Voice Aaditya.wav
Prediction: REAL
Ensemble Score: 0.0095
CNN Score: 0.0000
MLP Score: 0.0095

✅ Training complete! Your ensemble model is ready for use.
