In [2]:
!pip install datasets[audio] torchaudio librosa pyannote.audio kaggle

Collecting datasets[audio]
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting pyannote.audio
  Downloading pyannote.audio-3.1.1-py2.py3-none-any.whl (208 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m208.7/208.7 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets[audio])
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets[audio])
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Collecting asteroid-filterbanks>=0.4 (from pyannote.audio)
  Downloading asteroid_filterbanks-0.4.0-py3-none-any.whl (29 kB)

## Loading dataset from kaggle

Ensure that you have [kaggle key](https://www.kaggle.com/docs/api) in your directory

In [3]:
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/ # path to your kaggle key
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle datasets download -d birdy654/deep-voice-deepfake-voice-recognition

Downloading deep-voice-deepfake-voice-recognition.zip to /content
 33% 1.23G/3.69G [00:16<00:41, 63.4MB/s]

In [None]:
!unzip /content/deep-voice-deepfake-voice-recognition.zip

## Creating directories for load_data() from HuggingFace
We need input files to be arranged in the following format:

Dataset
  - test
    - fake
    - real
  - train
    - fake
    - real

In [None]:
!mkdir dataset && cd dataset && mkdir train && mkdir test
!cd dataset/train && mkdir fake && mkdir real
!cd dataset/test && mkdir fake && mkdir real

## Moving files from kaggle directory to dataset training directory

In [None]:
!mv /content/KAGGLE/AUDIO/FAKE/* /content/dataset/train/fake
!mv /content/KAGGLE/AUDIO/REAL/* /content/dataset/train/real

## Augmenting real data
since we have less files in real data, we augment the real training dataset

In [None]:
import librosa
import numpy as np
import os
import random
import soundfile as sf

# Function to load audio file using Librosa
def load_audio(file_path, target_sr=16000):
    audio, _ = librosa.load(file_path, sr=target_sr)
    return audio

# Function to add random noise to audio
def add_noise(audio, noise_level=0.005):
    noise = np.random.normal(0, noise_level, len(audio))
    augmented_audio = audio + noise
    return augmented_audio

# Function to perform time stretching on audio
def time_stretch(audio, rate=1.2):
    augmented_audio = librosa.effects.time_stretch(audio, rate=rate)
    return augmented_audio

# Function to perform pitch shifting on audio
def pitch_shift(audio, semitone_steps=2):
    augmented_audio = librosa.effects.pitch_shift(audio, sr=16000, n_steps=semitone_steps)
    return augmented_audio

# Function to save augmented audio
def save_audio(audio, output_path, sr=16000):
  """Saves augmented audio using soundfile."""
  sf.write(output_path, audio, sr, subtype='PCM_16')


# Function to augment audio and save the augmented samples
def augment_and_save(input_folder, output_folder, num_augmentations=5):
    # Ensure output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Iterate through audio files in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith(".wav"):
            file_path = os.path.join(input_folder, filename)
            audio = load_audio(file_path)

            # Augment and save multiple times
            for i in range(num_augmentations):
                augmented_audio = audio

                # Apply random augmentation
                augmentation_type = random.choice(['noise', 'time_stretch', 'pitch_shift'])
                if augmentation_type == 'noise':
                    augmented_audio = add_noise(augmented_audio)
                elif augmentation_type == 'time_stretch':
                    augmented_audio = time_stretch(augmented_audio)
                elif augmentation_type == 'pitch_shift':
                    augmented_audio = pitch_shift(augmented_audio)

                # Save augmented audio
                output_filename = f"{os.path.splitext(filename)[0]}_aug_{i+1}.wav"
                output_path = os.path.join(output_folder, output_filename)
                save_audio(augmented_audio, output_path)

# Example usage
input_folder = "/content/dataset/test/fake/"
output_folder = "/content/dataset/train/fake/"
augment_and_save(input_folder, output_folder, num_augmentations=3)


## Loading audio files
Next step is to load your audio files into their respective directory. After that, we use load_dataset() from HuggingFace to access those audio files, convert them to 16kHz sampling rate on mono channel.

In [None]:
from datasets import load_dataset, Audio

# used load_dataset from huggingface to read data
dataset = load_dataset("audiofolder", data_dir="/content/dataset/")
# convert audio to a sample rate of 16000
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000, mono=True))

Resolving data files:   0%|          | 0/97 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'label'],
        num_rows: 97
    })
    test: Dataset({
        features: ['audio', 'label'],
        num_rows: 8
    })
})

The following code creates dictionaries to map between textual labels and numerical IDs for a machine learning task.

In [None]:
# Retrieve label names from the training dataset's "label" feature
labels = dataset["train"].features["label"].names

# Create empty dictionaries to store label-to-ID and ID-to-label mappings
label2id, id2label = dict(), dict()

# Iterate through each label and assign a unique integer ID (as a string)
for i, label in enumerate(labels):
    # Map label to its ID
    label2id[label] = str(i)
    # Map ID to its corresponding label
    id2label[str(i)] = label

# Access and print the labels corresponding to IDs "0" and "1"
print(id2label["0"], id2label["1"])

fake real


### Log in to HuggingFace
This is necessary to download the pre-trained model

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Download the pre-trained model
For this example, we are using the WeSpeaker model, pre-trained on voxceleb dataset, wrapped by pyannote-audio library. Other models can be used too in the same way.

In [None]:
from pyannote.audio import Model

base_model = Model.from_pretrained("pyannote/wespeaker-voxceleb-resnet34-LM")

In [None]:
import torch
# setting device to GPU if available, othervise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

## Prepare PyTorch Dataloaders

The followign code-block loads the dataset into PyTorch dataloaders, and pads them in accorance with the max audio in the dataset.

In [1]:
import torch
from torch.utils.data import DataLoader


BATCH_SIZE = 1

# Collate function to handle audio and label tensors during batching
def collate_fn(batch):
    """Collate function to handle audio and label tensors."""
    # Extract audio arrays and labels from the batch
    audios = [item["audio"]["array"] for item in batch]
    labels = torch.tensor([item["label"] for item in batch]).repeat(BATCH_SIZE)

    # Pad audio tensors to the maximum length in the batch
    max_len = max(audio.shape[0] for audio in audios)
    padded_audios = torch.zeros(len(BATCH_SIZE), max_len)
    for i, audio in enumerate(audios):
        # Copy the audio data to the padded tensor, ensuring proper padding
        padded_audios[i, :audio.shape[0]] = torch.from_numpy(audio)

    # Return the padded audio tensors and corresponding labels
    return padded_audios, labels

# Create DataLoaders for training and test sets
# DataLoader for the training set
train_dataloader = DataLoader(
    dataset["train"],
    batch_size=BATCH_SIZE,
    shuffle=True,        # Shuffle the training data
    collate_fn=collate_fn  # Use the defined collate function for handling audio and labels
)

# DataLoader for the test set
test_dataloader = DataLoader(
    dataset["test"],
    batch_size=BATCH_SIZE,
    shuffle=False,       # Do not shuffle the test data
    collate_fn=collate_fn  # Use the defined collate function for handling audio and labels
)


NameError: name 'dataset' is not defined

## Preparing for fine-tuning
Freeze all the layers of the base model.
Add a classifier head for Binary Classification.

In [None]:
from torch import nn

class WeSpeakerResNet34WithClassifier(nn.Module):
    def __init__(self, model: nn.Module, num_classes=2):
        super().__init__()

        # Move the model to GPU if available
        self.base_model = model.to(device)

        # Freeze the parameters of the base model
        for param in self.base_model.parameters():
            param.requires_grad = False

        self.classifier = nn.Sequential(
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes)
        ).to(device)

    def forward(self, x):
        x = self.base_model(x)
        x = x.view(x.size(0), -1)
        x = self.classifier(x)
        return x

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the custom network with the base model and specify the number of classes
model = WeSpeakerResNet34WithClassifier(num_classes=2, model=base_model)


In [None]:
import torch
from torch import nn

class WeSpeakerResNet34WithLSTMClassifier(nn.Module):
    def __init__(self, model: nn.Module, hidden_size=128, num_classes=2):
        super().__init__()

        # Move the model to GPU if available
        self.base_model = model.to(device)

        # Freeze the parameters of the base model
        for param in self.base_model.parameters():
            param.requires_grad = False

        # Add an LSTM layer before the linear classifier
        self.lstm = nn.LSTM(256, hidden_size, batch_first=True)

        self.classifier = nn.Sequential(
            nn.Linear(hidden_size, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes)
        ).to(device)

    def forward(self, x):
        x = self.base_model(x)
        x = x.view(x.size(0), -1)

        # LSTM layer
        lstm_out, _ = self.lstm(x.unsqueeze(1))  # Adding an additional dimension for time steps

        # Get the output of the last time step from the LSTM
        lstm_last_output = lstm_out[:, -1, :]

        # Linear classifier
        x = self.classifier(lstm_last_output)

        return x

# Move the model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define hidden size for the LSTM
hidden_size = 256  # Adjust according to your desired hidden size

# Instantiate the custom network with the base model and LSTM classifier
model_with_lstm = WeSpeakerResNet34WithLSTMClassifier(model=base_model, hidden_size=hidden_size, num_classes=2).to(device)


# Run the model through a training loop

In [None]:
import torch
import torch.nn as nn

def train_model(model, train_dataloader, num_epochs=5, learning_rate=0.001):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.classifier.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        print(f"Epoch number: {epoch + 1}")

        running_loss = 0.0
        correct = 0
        total = 0

        # Training loop using the DataLoader
        for i, (audio, label) in enumerate(train_dataloader):
            # Move data to GPU if available
            audio, label = audio.to(device), label.to(device)

            # Ensure audio tensor has the correct shape
            audio = audio.unsqueeze(0)  # Add a batch dimension

            # Zero the gradients to prevent accumulation
            optimizer.zero_grad()

            # Forward pass through the model
            output = model(audio)

            # Compute the loss between the model's output and the ground truth labels
            loss = criterion(output, label)

            # Backward pass to compute gradients
            loss.backward()

            # Update model parameters using the optimizer
            optimizer.step()

            # Track running loss for monitoring training progress
            running_loss += loss.item()

            # Compute accuracy metrics
            _, predicted = torch.max(output.data, 1)
            total += label.size(0)  # Accumulate the total number of samples processed
            correct += (predicted == label).sum().item()  # Accumulate the number of correct predictions

        # Calculate average loss and accuracy at the end of each epoch
        average_loss = running_loss / len(train_dataloader)
        accuracy = correct / total * 100 if total != 0 else 0

        print(f"Epoch [{epoch + 1}/{num_epochs}], Average Loss: {average_loss:.4f}, Accuracy: {accuracy:.2f}%")


In [None]:
# train_model(model_with_lstm, train_dataloader, num_epochs=10)

In [None]:
train_model(model, train_dataloader, num_epochs=2)

Epoch number: 1


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


# Function to plot a confusion matrix
def plot_confusion_matrix(y_true, y_pred, labels):
    # Compute the confusion matrix using scikit-learn's confusion_matrix function
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Create a figure for the plot with specified size
    plt.figure(figsize=(8, 6))

    # Plot the confusion matrix as a heatmap with annotations
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels)

    # Set labels for the x and y axes
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')

    # Set the title of the plot
    plt.title('Confusion Matrix')

    # Display the plot
    plt.show()


def process_audio_batch(model, dataloader, labels):
    model.eval()  # Set model to evaluation mode

    # initializing empty lists to store prediction and ground truth
    predictions = []
    ground_truths = []

    # Pass each batch through the model without gradient calculation
    for audio, label in dataloader:
        # Move data to GPU if available
        audio, label = audio.to(device), label.to(device)
        audio = audio.unsqueeze(0)
        # run the model in inference mode
        with torch.no_grad():
            output = model(audio)
        # add the predictions and ground truth to the list
        predictions.extend(output.argmax(dim=1).tolist())
        ground_truths.extend(label.tolist())

    # Check if all labels are present in the ground truth
    unique_labels = set(ground_truths + predictions)
    if not set(labels).issubset(unique_labels):
        # print("Warning: Not all specified labels are present in the ground truth.")
        # print(f"Present labels in ground truth: {unique_labels}")
        labels = list(unique_labels)

    # Calculate metrics after processing all batches
    accuracy = accuracy_score(ground_truths, predictions)
    precision = precision_score(ground_truths, predictions, average='weighted')
    recall = recall_score(ground_truths, predictions, average='weighted')
    f1 = f1_score(ground_truths, predictions, average='weighted')

    # Create confusion matrix
    plot_confusion_matrix(ground_truths, predictions, labels)

    return predictions, ground_truths, accuracy, precision, recall, f1

In [None]:
labels = ["fake", "real"]
_, _, train_accuracy, train_precision, train_recall, train_f1 = process_audio_batch(model, train_dataloader, labels)
print(f"Accuracy: {train_accuracy}\nPrecision: {train_precision}\nRecall: {train_recall}\nF1 Score: {train_f1}")

In [None]:
prediction, ground_truth, test_accuracy, test_precision, test_recall, test_f1 = process_audio_batch(model, test_dataloader, labels)
print(f"Accuracy: {test_accuracy}\nPrecision: {test_precision}\nRecall: {test_recall}\nF1 Score: {test_f1}")

In [None]:
prediction_list = [id2label[str(id)] for id in prediction]
ground_truth_list = [id2label[str(id)] for id in ground_truth]

In [None]:
prediction_list, ground_truth_list

## Saving the model

In [None]:
torch.save(model, '/content/model.pt')

In [None]:
from google.colab import files
files.download('/content/model.pt')