In [None]:
# Install the notebook's dependencies. %pip (rather than !pip) installs into the
# environment of the running kernel.
# librosa and soundfile are listed explicitly because the augmentation cell uses them.
%pip install datasets evaluate torch pydub librosa soundfile
%pip install -U accelerate
%pip install -U transformers

## Connecting to Hugging Face (required to access the pre-trained models)

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably this prompts for an access token - confirm against the huggingface_hub docs.
from huggingface_hub import notebook_login

notebook_login()

## Loading the dataset from Kaggle

Ensure that your [Kaggle API key](https://www.kaggle.com/docs/api) (`kaggle.json`) is available at `/content/kaggle.json`, or adjust the path in the cell below.

In [None]:
# Install the Kaggle API key, then download and unpack the dataset under /content,
# which is where the later cells look for KAGGLE/AUDIO/REAL and KAGGLE/AUDIO/FAKE.
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/ # path to your kaggle key
!chmod 600 ~/.kaggle/kaggle.json
# -p pins the download location so the absolute unzip path below is valid
# regardless of the notebook's current working directory.
!kaggle datasets download -d birdy654/deep-voice-deepfake-voice-recognition -p /content
# -o overwrites without prompting, so re-running this cell does not block on an
# interactive "replace?" question; -d extracts next to the archive.
!unzip -q -o /content/deep-voice-deepfake-voice-recognition.zip -d /content

Importing the required libraries

In [2]:
import os
import random

import evaluate
import librosa
import numpy as np
import pandas as pd
import soundfile as sf
from datasets import load_dataset, Audio, Dataset, concatenate_datasets, ClassLabel, Features, Value
from pydub import AudioSegment
from pydub.utils import make_chunks
from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer, Wav2Vec2Processor

## Augmenting real data
Since we have fewer files with human audio, we generate augmented copies of them so that the dataset used for fine-tuning is more balanced.

In [None]:
# Load one audio file through librosa
def load_audio(file_path, target_sr=16000):
    """Read ``file_path`` with ``librosa.load`` at ``target_sr`` and return only the samples.

    The sampling rate that ``librosa.load`` returns alongside the samples is discarded.
    """
    samples = librosa.load(file_path, sr=target_sr)[0]
    return samples

# Additive Gaussian-noise augmentation
def add_noise(audio, noise_level=0.005):
    """Return ``audio`` plus zero-mean Gaussian noise with standard deviation ``noise_level``.

    One noise value is drawn per element of ``audio`` from NumPy's global random state.
    """
    return audio + np.random.normal(loc=0, scale=noise_level, size=len(audio))

# Time-stretch augmentation
def time_stretch(audio, rate=1.2):
    """Pass ``audio`` through ``librosa.effects.time_stretch`` with ``rate`` and return the result."""
    return librosa.effects.time_stretch(audio, rate=rate)

# Function to perform pitch shifting on audio
def pitch_shift(audio, semitone_steps=2, sr=16000):
    """Shift the pitch of ``audio`` with ``librosa.effects.pitch_shift``.

    Args:
        audio: Waveform to transform.
        semitone_steps: Value forwarded as ``n_steps``.
        sr: Sampling rate of ``audio``. Defaults to 16000, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The pitch-shifted waveform returned by librosa.
    """
    return librosa.effects.pitch_shift(audio, sr=sr, n_steps=semitone_steps)

# Write a waveform to disk as 16-bit PCM
def save_audio(audio, output_path, sr=16000):
    """Write ``audio`` to ``output_path`` via ``soundfile.write`` at rate ``sr`` using subtype ``PCM_16``."""
    sf.write(file=output_path, data=audio, samplerate=sr, subtype='PCM_16')


def _pick_augmentations(names, count):
    """Return ``count`` augmentation names, using every name once before any repeats."""
    picked = []
    while len(picked) < count:
        # random.sample over the full list yields a random permutation of the names.
        picked.extend(random.sample(names, len(names)))
    return picked[:count]


# Function to augment audio and save the augmented samples
def augment_and_save(input_folder, output_folder, num_augmentations=5):
    """Write ``num_augmentations`` augmented copies of every ``.wav`` file in ``input_folder``.

    Each copy gets exactly one transform (noise, time stretch or pitch shift) and is
    saved to ``output_folder`` as ``<stem>_aug_<k>.wav`` with ``k`` starting at 1.

    Files whose stem already ends in ``_aug_<digits>`` are skipped. This keeps the
    function safe to re-run when ``input_folder`` and ``output_folder`` are the same
    directory: earlier output is not augmented again.

    Args:
        input_folder: Directory scanned (non-recursively) for ``.wav`` files.
        output_folder: Directory for the augmented files; created if missing.
        num_augmentations: Number of augmented copies per source file.
    """
    # exist_ok replaces the separate exists() check and avoids its race window.
    os.makedirs(output_folder, exist_ok=True)

    augmenters = {
        'noise': add_noise,
        'time_stretch': time_stretch,
        'pitch_shift': pitch_shift,
    }

    # The listing is taken once, before any file is written; sorting makes the
    # processing order independent of the file system.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".wav"):
            continue

        stem = os.path.splitext(filename)[0]
        _, marker, suffix = stem.rpartition("_aug_")
        if marker and suffix.isdigit():
            # Output of a previous run - do not augment it again.
            continue

        audio = load_audio(os.path.join(input_folder, filename))

        # The time-stretch and pitch-shift helpers use fixed parameters, so
        # applying the same one twice would write two identical files. Drawing
        # without replacement avoids that while enough distinct transforms remain.
        chosen = _pick_augmentations(list(augmenters), num_augmentations)
        for index, augmentation_type in enumerate(chosen, start=1):
            augmented_audio = augmenters[augmentation_type](audio)
            output_path = os.path.join(output_folder, f"{stem}_aug_{index}.wav")
            save_audio(augmented_audio, output_path)

# Seed both random sources used by the augmentation helpers (`random` picks the
# transform, NumPy draws the noise) so a fresh run produces the same files.
random.seed(42)
np.random.seed(42)

# Example usage: augmented copies are written next to the originals in the REAL folder
input_folder = "/content/KAGGLE/AUDIO/REAL"
output_folder = "/content/KAGGLE/AUDIO/REAL"
augment_and_save(input_folder, output_folder, num_augmentations=3)

## Converting to 15 sec chunks
Here, we convert the audio files to 15-sec chunks. The smaller chunks will be passed to the model for training.

In [None]:
# function to create smaller audio chunks
def create_shorter_chunks(directory, chunk_length_ms=15000):
    """Split every audio file in ``directory`` into chunks of ``chunk_length_ms`` milliseconds.

    Each chunk is exported as ``<stem>_chunk<i>.wav`` into the same directory.
    Source files are left in place, so a later scan of the directory sees both
    the original files and their chunks.

    Files whose stem already ends in ``_chunk<digits>`` are skipped, so running
    the function again does not split earlier output a second time.

    Args:
        directory: Folder scanned (non-recursively) for .mp3/.wav/.ogg/.flac files.
        chunk_length_ms: Chunk length in milliseconds, passed to ``make_chunks``.
    """
    # The listing is taken once, before any chunk is written; sorting gives a
    # processing order that does not depend on the file system.
    for filename in sorted(os.listdir(directory)):
        # Check if the file is an audio file
        if not filename.endswith(('.mp3', '.wav', '.ogg', '.flac')):
            continue

        stem = os.path.splitext(filename)[0]
        _, marker, suffix = stem.rpartition("_chunk")
        if marker and suffix.isdigit():
            # Already a chunk produced by a previous run.
            continue

        audio_segment = AudioSegment.from_file(os.path.join(directory, filename))

        # Divide the audio into chunks and save each one next to its source
        for i, chunk in enumerate(make_chunks(audio_segment, chunk_length_ms)):
            chunk_filename = f"{stem}_chunk{i}.wav"
            chunk.export(os.path.join(directory, chunk_filename), format="wav")

    # Report the length that was actually used instead of a hard-coded "15".
    print(f"Audio files in {directory} have been divided into {chunk_length_ms / 1000:g}-second chunks.")

# Chunk the real recordings first, then the fake ones, using the default chunk length
for audio_dir in ('/content/KAGGLE/AUDIO/REAL', '/content/KAGGLE/AUDIO/FAKE'):
    create_shorter_chunks(audio_dir)

## Creating and formatting dataset
The following code block creates a dataset from the given directory, and adds associated labels to it.

In [None]:
# collects every .wav / .mp3 file below a directory
def get_file_list(directory):
    """Return the paths of all ``.wav`` and ``.mp3`` files under ``directory``, subdirectories included.

    Paths are built by joining each walked folder with the file name and are
    returned in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith((".wav", ".mp3"))
    ]

# builds a labelled audio dataset out of one directory
def create_dataset_from_directory(directory, label, max_files=None):
    """Build a dataset with an ``audio`` and a ``label`` column from the files under ``directory``.

    Args:
        directory: Folder handed to ``get_file_list`` to collect .wav/.mp3 paths.
        label: Value written to the ``label`` column of every row; the column is
            typed as a ClassLabel with names ``['fake', 'real']``.
        max_files: If given, only the first ``max_files`` collected paths are used.

    Returns:
        The dataset created from the labelled rows.
    """
    file_paths = get_file_list(directory)
    if max_files is not None:
        file_paths = file_paths[:max_files]

    # Start from the bare paths, then cast the column to the Audio feature
    # (16 kHz, mono) so that feature can be reused in the final schema.
    path_dataset = Dataset.from_dict({"audio": file_paths}).cast_column(
        "audio", Audio(sampling_rate=16000, mono=True)
    )

    # Go through pandas to attach the same label to every row.
    frame = path_dataset.to_pandas()
    frame['label'] = label

    class_names = ['fake', 'real']
    schema = Features({
        'audio': path_dataset.features['audio'],
        'label': ClassLabel(num_classes=len(class_names), names=class_names),
    })

    # Rebuild the dataset from the frame with the explicit audio/label schema.
    return Dataset.from_pandas(frame, features=schema)

In [None]:
# create fake and real dataset, pass in the appropriate labels (0 = fake, 1 = real)
fake_dataset = create_dataset_from_directory("/content/KAGGLE/AUDIO/FAKE", 0)
real_dataset = create_dataset_from_directory("/content/KAGGLE/AUDIO/REAL", 1)

# Combine the datasets
combined_dataset = concatenate_datasets([real_dataset, fake_dataset])

# Split the dataset into train and test set.
# A fixed seed makes the split reproducible across runs; stratifying on the
# ClassLabel column keeps the fake/real ratio the same in both parts.
# NOTE(review): every file is treated as an independent example. If the folders
# hold several files derived from one recording (augmented copies, chunks), those
# can land on both sides of this split and inflate the test accuracy.
combined_dataset = combined_dataset.train_test_split(
    test_size=0.2,
    seed=42,
    stratify_by_column="label",
)

The following code reads the label names from the dataset and builds two dictionaries that map each label to its ID and each ID back to its label.

In [None]:
# Class names as stored on the training split's "label" feature
labels = combined_dataset['train'].features["label"].names

# Two-way lookup between class names and their positions; ids are kept as strings
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

num_labels = len(id2label)

The following code loads the processor for the Wav2Vec2 model and uses it to pre-process the dataset.

In [None]:
# loading the processor stored with the "facebook/wav2vec2-base" checkpoint;
# preprocess_function below calls it on the raw audio arrays
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")

# Modify the preprocess_function to use the processor
def preprocess_function(examples, max_length=16000):
    """Run a batch of audio examples through the processor and attach the labels.

    Args:
        examples: Batch with an ``"audio"`` list (each item exposes an ``"array"``
            entry) and a ``"label"`` list.
        max_length: Truncation length forwarded to the processor. The default
            keeps the value that was previously hard-coded.
            NOTE(review): 16000 samples is one second at the 16 kHz rate the audio
            column is cast to, far shorter than the 15-second chunks created
            earlier in the notebook - confirm this is intended.

    Returns:
        The processor output with an added ``"labels"`` entry copied from
        ``examples["label"]``.
    """
    audio_arrays = [item["array"] for item in examples["audio"]]
    inputs = processor(
        audio_arrays,
        sampling_rate=processor.feature_extractor.sampling_rate,
        padding=True,
        max_length=max_length,
        truncation=True,
        return_tensors="pt"
    )
    inputs["labels"] = examples["label"]
    return inputs

# Run preprocess_function over every split in batches and drop the "audio" column afterwards
combined_dataset = combined_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns="audio",
)

Next, the evaluation metric (accuracy) is defined.

In [None]:
# Load the "accuracy" metric through the evaluate library; compute_metrics below calls its compute()
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Score a batch of predictions with the ``accuracy`` metric.

    The predicted class of each row is the index of the largest value along
    axis 1 of ``eval_pred.predictions``; it is compared with ``eval_pred.label_ids``.
    """
    scores, references = eval_pred.predictions, eval_pred.label_ids
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

## Importing the pre-trained base model

In [None]:
# Load the pre-trained checkpoint from the Hugging Face Hub for audio classification.
# The number of labels and both label maps come from the dataset's label names.
model = AutoModelForAudioClassification.from_pretrained(
    "facebook/wav2vec2-base",
    id2label=id2label,
    label2id=label2id,
    num_labels=num_labels,
)

## Preparing the training job

In [None]:
# Set up the training arguments
training_args = TrainingArguments(
    output_dir="trained_model",  # Directory to save the trained model
    # `evaluation_strategy` was renamed to `eval_strategy`; the install cell pulls the
    # latest transformers release, where the old keyword is no longer accepted.
    eval_strategy="epoch",  # Evaluate model at the end of each epoch
    save_strategy="epoch",  # Save the model at the end of each epoch (must match the eval strategy for load_best_model_at_end)
    # NOTE(review): 1e-3 is a high learning rate for fine-tuning a pre-trained model;
    # confirm it is intentional (the Hugging Face audio-classification guide uses 3e-5).
    learning_rate=1e-3,  # Learning rate for the optimizer
    per_device_train_batch_size=32,  # Batch size per device for training
    gradient_accumulation_steps=4,  # Number of steps for gradient accumulation
    per_device_eval_batch_size=32,  # Batch size per device for evaluation
    num_train_epochs=10,  # Number of training epochs
    warmup_ratio=0.1,  # Ratio of warmup steps for the learning rate scheduler
    logging_steps=10,  # Log training metrics every specified number of steps
    load_best_model_at_end=True,  # Load the best model at the end of training
    metric_for_best_model="accuracy",  # Metric to use for tracking the best model
)

# Initialize the trainer
trainer = Trainer(
    model=model,  # The model to be trained
    args=training_args,  # Training arguments
    train_dataset=combined_dataset["train"],  # Training dataset
    eval_dataset=combined_dataset["test"],  # Evaluation dataset
    # `processing_class` replaces the deprecated `tokenizer` argument in current
    # transformers releases; the feature extractor is passed here as before.
    processing_class=processor.feature_extractor,
    compute_metrics=compute_metrics,  # Function to compute evaluation metrics
)

## Running the training loop

In [None]:
# Start fine-tuning; evaluation and checkpoint saving follow the strategies configured in training_args
trainer.train()