In [46]:
import os
import torch
import torchaudio
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn.functional as F


In [47]:
# Configuration: every tunable value for the notebook lives in this cell
SEED = 42                       # seed for PyTorch's global random number generator

# Seed once, up front, so weight initialisation and the shuffled batch order
# are repeatable after "Restart Kernel -> Run All".
torch.manual_seed(SEED)

SAMPLE_RATE = 16000             # target sampling rate in Hz; other rates are resampled to this
NUM_EPOCHS = 50                 # full passes over the training set
BATCH_SIZE = 8                  # clips per optimisation step
LEARNING_RATE = 1e-3            # step size handed to the Adam optimiser
FIXED_LENGTH = SAMPLE_RATE * 2  # 2 seconds, expressed in samples

In [48]:
# Pick the compute device: the GPU when CUDA is usable, the CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [49]:
# Custom Dataset class
class AudioDeepfakeDataset(Dataset):
    """Map-style dataset yielding ``(waveform, label)`` pairs read from audio files.

    Each item is loaded with ``torchaudio.load``, mixed down to a single
    channel, resampled to ``SAMPLE_RATE`` when the file was recorded at a
    different rate, and finally passed through the optional ``transform``.

    Args:
        files: Sequence of audio file paths.
        labels: Sequence of labels, index-aligned with ``files``.
        transform: Optional callable applied to the waveform tensor after
            downmixing and resampling.
    """

    def __init__(self, files, labels, transform=None):
        self.files = files
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.files)

    @staticmethod
    def _to_mono(waveform):
        """Average a ``[channels, samples]`` tensor down to ``[1, samples]``.

        A tensor that already has a single channel is returned untouched.
        """
        if waveform.size(0) > 1:
            return waveform.mean(dim=0, keepdim=True)
        return waveform

    def __getitem__(self, idx):
        """Load, normalise and return the ``idx``-th ``(waveform, label)`` pair."""
        path = self.files[idx]
        label = self.labels[idx]

        waveform, sr = torchaudio.load(path)

        # Stereo or multi-channel files would otherwise reach the network as
        # [C, samples] with C > 1, which a one-input-channel first layer
        # rejects and which cannot be batched together with mono clips.
        waveform = self._to_mono(waveform)

        if sr != SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(sr, SAMPLE_RATE)
            waveform = resampler(waveform)

        if self.transform:
            waveform = self.transform(waveform)

        return waveform, label

In [50]:
# Force every waveform to exactly `length` samples
def pad_waveform(waveform, length=FIXED_LENGTH):
    """Return ``waveform`` truncated, or zero-padded on the right, to ``length``.

    The current size is read from dimension 1 of ``waveform``.

    Args:
        waveform: Tensor with at least two dimensions.
        length: Target size of dimension 1; defaults to ``FIXED_LENGTH``.

    Returns:
        A tensor whose dimension 1 has size ``length``.
    """
    current = waveform.shape[1]
    if current <= length:
        # Too short (or already exact): append zeros at the end
        return F.pad(waveform, (0, length - current))
    # Too long: keep only the leading `length` samples
    return waveform[:, :length]


def transform(x):
    """Dataset transform: apply ``pad_waveform`` with its default length."""
    return pad_waveform(x)

In [51]:
# Load dataset paths
def load_dataset(data_dir="/Users/fenilvadher/Documents/Collage Data/SEM - 6/AI/AI Project/fake_audio"):
    """Collect ``.wav`` file paths and their integer labels from ``data_dir``.

    ``data_dir`` must contain a ``real`` and a ``fake`` sub-folder. Files found
    in ``real`` are labelled 0 and files found in ``fake`` are labelled 1.
    Within each folder the files are visited in sorted name order.

    Args:
        data_dir: Root folder holding the ``real`` and ``fake`` sub-folders.

    Returns:
        A ``(paths, labels)`` tuple of two equally long lists.

    Raises:
        OSError: Propagated from ``os.listdir`` when a class folder cannot be
            listed (for example ``FileNotFoundError`` if it does not exist).
    """
    paths, labels = [], []
    label_map = {"real": 0, "fake": 1}
    for label_str, label_int in label_map.items():
        folder = os.path.join(data_dir, label_str)
        # os.listdir yields entries in arbitrary order; sorting keeps the file
        # list -- and any seeded split derived from it -- stable across runs.
        for file in sorted(os.listdir(folder)):
            if file.endswith(".wav"):
                paths.append(os.path.join(folder, file))
                labels.append(label_int)
    return paths, labels

In [52]:
# CNN Model
class AudioCNN(nn.Module):
    """Two-block 1-D CNN that scores a raw waveform against two classes.

    Each block is Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d. The flattened
    features then go through a 64-unit hidden layer and a 2-unit output layer.

    Args:
        input_length: Number of samples in every input waveform. It fixes the
            input size of the first fully connected layer, so inputs of any
            other length are rejected by ``forward``.
    """

    def __init__(self, input_length):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=5, stride=2)
        self.bn1 = nn.BatchNorm1d(16)
        self.pool1 = nn.MaxPool1d(2)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=5, stride=2)
        self.bn2 = nn.BatchNorm1d(32)
        self.pool2 = nn.MaxPool1d(2)

        # Work out the flattened feature size with a dummy pass through the
        # conv and pool layers only. BatchNorm and ReLU do not change shapes,
        # and leaving BatchNorm out matters: a training-mode pass would fold
        # the all-zero dummy into its running statistics before training.
        with torch.no_grad():
            probe = torch.zeros(1, 1, input_length)
            probe = self.pool1(self.conv1(probe))
            probe = self.pool2(self.conv2(probe))
        self.flattened_size = probe.numel()

        self.fc1 = nn.Linear(self.flattened_size, 64)
        self.fc2 = nn.Linear(64, 2)

    def forward(self, x):
        """Map ``[batch, 1, input_length]`` waveforms to ``[batch, 2]`` scores."""
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = x.flatten(start_dim=1)  # [batch, channels * length]
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [53]:
# Gather every labelled clip, then hold out 20% of them for evaluation
file_paths, file_labels = load_dataset(
    data_dir="/Users/fenilvadher/Documents/Collage Data/SEM - 6/AI/AI Project/fake_audio"
)
(train_paths, test_paths,
 train_labels, test_labels) = train_test_split(
    file_paths,
    file_labels,
    test_size=0.2,
    random_state=42,  # fixed seed: the same input order yields the same split
)

In [54]:
# Optional transform: fix length
def pad_waveform(waveform, length=None):
    """Return ``waveform`` truncated, or zero-padded on the right, to ``length``.

    Args:
        waveform: Tensor with at least two dimensions; dimension 1 is the one
            that gets resized.
        length: Target size of dimension 1. ``None`` (the default) means
            ``FIXED_LENGTH`` from the configuration cell, looked up at call
            time. That keeps the transform in step with the length the model
            is built with, rather than a separately hard-coded ``16000*2``.

    Returns:
        A tensor whose dimension 1 has size ``length``.
    """
    if length is None:
        length = FIXED_LENGTH
    if waveform.shape[1] > length:
        return waveform[:, :length]
    else:
        return F.pad(waveform, (0, length - waveform.shape[1]))


def transform(x):
    """Dataset transform: apply ``pad_waveform`` with its default length."""
    return pad_waveform(x)

In [55]:
# Wrap both splits in datasets that run every clip through `transform`
train_dataset = AudioDeepfakeDataset(files=train_paths, labels=train_labels, transform=transform)
test_dataset = AudioDeepfakeDataset(files=test_paths, labels=test_labels, transform=transform)

# Only the training batches are shuffled; the test loader keeps dataset order
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [56]:
# Build the loss, the network (moved to the selected device) and its optimiser
criterion = nn.CrossEntropyLoss()
model = AudioCNN(input_length=FIXED_LENGTH)
model = model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [57]:
# Training loop: one pass over train_loader per epoch, reporting the summed batch loss
for epoch in range(NUM_EPOCHS):
    model.train()  # training mode, which the BatchNorm layers depend on
    total_loss = 0
    for waveforms, labels in train_loader:
        waveforms, labels = waveforms.to(device), labels.to(device)

        # The model wants [batch, channels, length]; add a channel axis when absent
        if waveforms.ndim == 2:
            waveforms = waveforms.unsqueeze(1)

        # Clear stale gradients, then forward -> loss -> backward -> update
        optimizer.zero_grad()
        outputs = model(waveforms)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{NUM_EPOCHS} - Loss: {total_loss:.4f}")

Epoch 1/50 - Loss: 25.9870
Epoch 2/50 - Loss: 17.7850
Epoch 3/50 - Loss: 10.9809
Epoch 4/50 - Loss: 8.9897
Epoch 5/50 - Loss: 5.5005
Epoch 6/50 - Loss: 4.3007
Epoch 7/50 - Loss: 3.7679
Epoch 8/50 - Loss: 3.5930
Epoch 9/50 - Loss: 3.2931
Epoch 10/50 - Loss: 5.9810
Epoch 11/50 - Loss: 3.6334
Epoch 12/50 - Loss: 3.3944
Epoch 13/50 - Loss: 3.8008
Epoch 14/50 - Loss: 5.9878
Epoch 15/50 - Loss: 3.4583
Epoch 16/50 - Loss: 2.5799
Epoch 17/50 - Loss: 3.5423
Epoch 18/50 - Loss: 2.9140
Epoch 19/50 - Loss: 2.4229
Epoch 20/50 - Loss: 2.2211
Epoch 21/50 - Loss: 2.3235
Epoch 22/50 - Loss: 4.3096
Epoch 23/50 - Loss: 3.5523
Epoch 24/50 - Loss: 3.8224
Epoch 25/50 - Loss: 4.6887
Epoch 26/50 - Loss: 2.7464
Epoch 27/50 - Loss: 3.8396
Epoch 28/50 - Loss: 2.6017
Epoch 29/50 - Loss: 2.6682
Epoch 30/50 - Loss: 2.7672
Epoch 31/50 - Loss: 2.0667
Epoch 32/50 - Loss: 1.8288
Epoch 33/50 - Loss: 1.9619
Epoch 34/50 - Loss: 3.4900
Epoch 35/50 - Loss: 3.0830
Epoch 36/50 - Loss: 1.9323
Epoch 37/50 - Loss: 2.2664
Epoch 3

In [58]:
# Write the model's state_dict (not the whole module object) to disk
torch.save(obj=model.state_dict(), f="audio_deepfake_model.pth")
print("✅ Model trained and saved successfully.")

✅ Model trained and saved successfully.
