In [3]:
# Remove the preinstalled torch stack before pinning versions (the recorded output
# shows torch 2.3.0 installed next to torchaudio 2.2.2).
# %pip, unlike !pip, always runs pip for the environment of the running kernel.
%pip uninstall -y torch torchaudio torchvision

Found existing installation: torch 2.3.0
Uninstalling torch-2.3.0:
  Successfully uninstalled torch-2.3.0
Found existing installation: torchaudio 2.2.2
Uninstalling torchaudio-2.2.2:
  Successfully uninstalled torchaudio-2.2.2
Found existing installation: torchvision 0.18.0
Uninstalling torchvision-0.18.0:
  Successfully uninstalled torchvision-0.18.0
[0m

In [4]:
# Pin torch and torchaudio to the same release (2.2.2).
# %pip, unlike !pip, installs into the environment of the running kernel.
%pip install torch==2.2.2 torchaudio==2.2.2

Collecting torch==2.2.2
  Using cached torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl (755.5 MB)
Collecting torchaudio==2.2.2
  Using cached torchaudio-2.2.2-cp310-cp310-manylinux1_x86_64.whl (3.3 MB)
Collecting nvidia-nccl-cu12==2.19.3
  Using cached nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)
Collecting triton==2.2.0
  Using cached triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (167.9 MB)
Installing collected packages: triton, nvidia-nccl-cu12, torch, torchaudio
  Attempting uninstall: triton
    Found existing installation: triton 2.3.0
    Uninstalling triton-2.3.0:
      Successfully uninstalled triton-2.3.0
  Attempting uninstall: nvidia-nccl-cu12
    Found existing installation: nvidia-nccl-cu12 2.20.5
    Uninstalling nvidia-nccl-cu12-2.20.5:
      Successfully uninstalled nvidia-nccl-cu12-2.20.5
Successfully installed nvidia-nccl-cu12-2.19.3 torch-2.2.2 torchaudio-2.2.2 triton-2.2.0
[0m

In [None]:
# !pip install torch torchaudio torchvision datasets -q

In [1]:
import random
import IPython

import datasets

from datasets.utils import DownloadManager

import numpy as np

import torch
import torchaudio
import torchaudio.transforms as T

from torch.utils.data import DataLoader, Dataset

import os
import sys

from tqdm import tqdm
# import torchvision
import soundfile as sf
import torch.nn as nn
# import torchvision.models as smodels
from librosa import util
import librosa.feature

In [2]:
# Make modules under ./src importable. The guard keeps the cell idempotent:
# re-running it must not pile duplicate entries onto sys.path.
if 'src' not in sys.path:
    sys.path.append('src')

In [16]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load Dataset .zip from link

Logical access (LA):

- ```speaker_id:``` LA_****, a 4-digit speaker ID
- ```audio_file_name:``` name of the audio file
- ```audio:``` path to the downloaded audio file ('****.flac'), stored in FLAC format (https://xiph.org/flac/).
- ```system_id:``` ID of the speech spoofing system (A01 - A19), or, for bonafide speech SYSTEM-ID is left blank ('-')
- ```key:``` 'bonafide' for genuine speech, or, 'spoof' for spoofing speech

In [3]:
from load_avsspoof19 import ASVspoof2019

# Build the dataset object, then ask it for its split generators.
# The cells below index the result as [train, eval, test].
speech_dataset = ASVspoof2019()
dl_manager = DownloadManager()
asv_datasets = speech_dataset._split_generators(dl_manager)

Computing checksums: 100%|##########| 1/1 [00:23<00:00, 23.54s/it]

In [4]:
# Order inside asv_datasets: 0 - train, 1 - eval, 2 - test.
# For each split, read the metadata file path and the audio directory
# from the gen_kwargs of its split generator.
(
    (train_metadata_filepath, train_audios_dir),
    (val_metadata_filepath, val_audios_dir),
    (test_metadata_filepath, test_audios_dir),
) = [
    (
        asv_datasets[position].gen_kwargs["metadata_filepath"],
        asv_datasets[position].gen_kwargs["audios_dir"],
    )
    for position in (0, 1, 2)
]

In [5]:
# Collect the examples of every split from its metadata file and audio directory.
train_samples, val_samples, test_samples = [
    speech_dataset._generate_examples(metadata_path, audio_dir)
    for metadata_path, audio_dir in (
        (train_metadata_filepath, train_audios_dir),
        (val_metadata_filepath, val_audios_dir),
        (test_metadata_filepath, test_audios_dir),
    )
]

## Listen to random sample

Display some audio from train/validation set.

In [6]:
def get_sample(samples):
    """
        Draw one element uniformly at random from the sequence ``samples``.
    """
    # random.sample returns a one-element list here; unpack its only item.
    (picked,) = random.sample(samples, k=1)
    return picked

sample = get_sample(train_samples)
sample

{'speaker_id': 'LA_0082',
 'audio_file_name': 'LA_T_9952125',
 'system_id': 'A02',
 'key': 'spoof',
 'audio': '/root/.cache/huggingface/datasets/downloads/extracted/911103f86670b6f7e96211444d0f39fc5ffab511156a395f67b098c2f45dce18/LA/ASVspoof2019_LA_train/flac/LA_T_9952125.flac'}

In [7]:
# LA_0095 bonafide
print(sample['key'])
IPython.display.Audio(sample['audio'])

spoof


In [11]:
sample = get_sample(val_samples)
sample

{'speaker_id': 'LA_0108',
 'audio_file_name': 'LA_D_4274975',
 'system_id': '-',
 'key': 'bonafide',
 'audio': '/root/.cache/huggingface/datasets/downloads/extracted/911103f86670b6f7e96211444d0f39fc5ffab511156a395f67b098c2f45dce18/LA/ASVspoof2019_LA_dev/flac/LA_D_4274975.flac'}

In [12]:
# LA_0089 spoof
print(sample['key'])
IPython.display.Audio(sample['audio'])

bonafide


## Audio preprocessing


In [6]:
def audio_preprocess(waveform, sample_rate, resample_rate, desired_duration):
    """
        Resample audio to the target rate and force a fixed duration.

        Args:
            waveform: audio tensor whose last axis is time (the dataset in this
                notebook passes the ``(channels, samples)`` tensor returned by
                ``torchaudio.load``).
            sample_rate: sampling rate of ``waveform`` in Hz.
            resample_rate: target sampling rate in Hz (e.g. 16 kHz or 22.05 kHz).
            desired_duration: output length in seconds.

        Returns:
            Tensor with ``int(desired_duration * resample_rate)`` samples on its
            last axis: short clips are repeated end-to-end, long clips are cut.

        Raises:
            ValueError: if the clip has no samples but a non-zero length is requested.
    """
    # Resampling to an identical rate leaves the signal untouched, so the
    # resampler is only built when the rates actually differ.
    if sample_rate != resample_rate:
        waveform = T.Resample(sample_rate, resample_rate)(waveform)

    desired_length = int(desired_duration * resample_rate)
    # Measure the time axis; len() of a (channels, samples) tensor would be
    # the channel count, not the clip length.
    num_samples = waveform.shape[-1]
    if num_samples < desired_length:
        if num_samples == 0:
            raise ValueError("cannot extend an empty waveform to the desired duration")
        repeats = -(-desired_length // num_samples)  # ceiling division
        # A one-element repeat spec tiles only the last (time) axis.
        waveform = waveform.tile((repeats,))

    return waveform[..., :desired_length]


def peak_normalize(waveform):
    """
        Scale audio in place so that its largest absolute sample becomes 1.

        A silent (all-zero) clip is returned unchanged: dividing it by its
        zero peak would turn every sample into NaN.

        Args:
            waveform: floating-point audio tensor; it is modified in place.

        Returns:
            The same tensor object after normalization.
    """
    peak = torch.max(torch.abs(waveform))
    if peak > 0:
        waveform /= peak
    return waveform

In [7]:
class AudioDataset(Dataset):
    """
        Dataset that loads audio records from disk on access.

        Every record is a mapping with an 'audio' file path and a 'key' label.
        Items are returned as ``(waveform, label)``, where label is 1 when the
        key is 'spoof' and 0 otherwise.

        ``transform``, when truthy, is indexed as a pair: element 0 is called
        with ``(waveform, source_rate, target_rate, duration)`` and element 1
        with the resulting waveform.
    """

    def __init__(self, raw_dataset, desired_duration, resample_rate, transform=None):
        self.raw_data, self.transform = raw_dataset, transform
        # Despite its name, sample_rate holds the target (resample) rate.
        self.sample_rate, self.duration = resample_rate, desired_duration

    def __len__(self):
        return len(self.raw_data)

    def __getitem__(self, idx):
        record = self.raw_data[idx]
        waveform, source_rate = torchaudio.load(record['audio'])
        if self.transform:
            waveform = self.transform[0](waveform, source_rate, self.sample_rate, self.duration)
            waveform = self.transform[1](waveform)

        return waveform, int(record['key'] == 'spoof')

In [8]:
# Every clip is repeated or cut to _DURATION seconds at _SAMPLE_RATE Hz.
_DURATION = 6
_SAMPLE_RATE = 16_000

train_dataset = AudioDataset(train_samples, _DURATION, _SAMPLE_RATE, transform=[audio_preprocess, peak_normalize])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = AudioDataset(test_samples, _DURATION, _SAMPLE_RATE, transform=[audio_preprocess, peak_normalize])
# Evaluation gains nothing from shuffling; a fixed order keeps runs reproducible.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [16]:
train_dataset[4][0].shape

torch.Size([1, 96000])

# Feature Extraction Definition
Supported features: log power spectrum (LPS), Mel-frequency cepstral coefficients (MFCC), and constant-Q cepstral coefficients (CQCC).

In [22]:
import feature_extr

# feature_extr.MFCC()

# Model definition

In [None]:
# model = torchvision.models.resnet18(pretrained=True)
# n_ftrs = model.fc.in_features
# model.fc=nn.Linear(n_ftrs,2)
# # model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
# model.conv1 = nn.Conv2d(1, 64, kernel_size=(9, 3), stride=(3, 1), padding=(1, 1), bias=False)

In [None]:
# device= torch.device("cuda" if torch.cuda.is_available() else "cpu")

# model.to(device)

In [17]:
from models import TE_ResNet

# Build the classifier. The printed module tree below shows max_length sizing
# the position embedding (126 x 32) and num_layers giving six TransformerBlocks.
model = TE_ResNet(
    device=device,
    max_length=126,
    embed_size=32,
    heads=8,
    num_layers=6,
    ff_hidden_size=1024,
    dropout=0.1,
)

print(model)

TE_ResNet(
  (transformer_encoder): TransformerEncoder(
    (position_embedding): Embedding(126, 32)
    (layers): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadAttention(
          (values): Linear(in_features=4, out_features=32, bias=False)
          (keys): Linear(in_features=4, out_features=32, bias=False)
          (queries): Linear(in_features=4, out_features=32, bias=False)
          (fc_out): Linear(in_features=32, out_features=32, bias=True)
        )
        (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (feed_forward): FeedForward(
          (fc1): Linear(in_features=32, out_features=1024, bias=True)
          (fc2): Linear(in_features=1024, out_features=32, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (resnet): ResNet(
    (conv1): Conv2d(1, 64, kernel_size=(3, 

# Training

In [18]:
model.to(device)

TE_ResNet(
  (transformer_encoder): TransformerEncoder(
    (position_embedding): Embedding(126, 32)
    (layers): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): MultiHeadAttention(
          (values): Linear(in_features=4, out_features=32, bias=False)
          (keys): Linear(in_features=4, out_features=32, bias=False)
          (queries): Linear(in_features=4, out_features=32, bias=False)
          (fc_out): Linear(in_features=32, out_features=32, bias=True)
        )
        (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (feed_forward): FeedForward(
          (fc1): Linear(in_features=32, out_features=1024, bias=True)
          (fc2): Linear(in_features=1024, out_features=32, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (resnet): ResNet(
    (conv1): Conv2d(1, 64, kernel_size=(3, 

In [32]:
def train_loop(model, loader, feature_extractor, criterion, optimizer, n_epochs, device):
    """
        Train ``model`` for ``n_epochs`` passes over ``loader``, printing the
        accuracy and mean batch loss after every epoch.

        Args:
            model: module called as ``model(features, mask)``. It is not moved
                to ``device`` here, so it must already live there.
            loader: iterable of ``(waveform, label)`` batches that supports
                ``len()`` and exposes ``loader.dataset.sample_rate``.
            feature_extractor: 'MFCC', 'CQCC' or 'LPS' (dispatched to the
                matching ``feature_extr.get_*`` function), or any callable
                taking ``(waveform, sample_rate)`` and returning a tensor.
            criterion: loss called as ``criterion(output, labels)``.
            optimizer: optimizer stepping the model parameters.
            n_epochs: number of passes over ``loader``.
            device: device that labels, features and the mask are moved to.

        Raises:
            ValueError: if ``feature_extractor`` is neither a known name nor callable.
    """
    # Resolve the extractor once instead of re-dispatching on every batch.
    if callable(feature_extractor):
        extract = feature_extractor
    elif feature_extractor == 'MFCC':
        extract = feature_extr.get_MFCC
    elif feature_extractor == 'CQCC':
        extract = feature_extr.get_CQCC
    elif feature_extractor == 'LPS':
        extract = feature_extr.get_LPS
    else:
        raise ValueError(f"Invalid feature extractor: {feature_extractor}")

    sample_rate = loader.dataset.sample_rate

    for epoch in range(n_epochs):
        model.train(True)
        sum_loss = 0.0
        num_correct = 0
        total_samples = 0

        for data, labels in loader:
            labels = labels.to(device)
            features = extract(data, sample_rate)

            # NOTE(review): the layout handling below is kept unchanged, but
            # permute(1, 2, 0) moves the leading axis to the end, which does not
            # look like the "(batch_size, seq_length, feature_dim)" layout this
            # step was meant to produce. The recorded run of this notebook
            # failed with "too many values to unpack (expected 3)", presumably a
            # shape unpack inside the model - confirm the layout the model
            # expects against feature_extr and models before relying on this.
            if features.dim() == 2:
                features = features.unsqueeze(0)
            if features.dim() == 3:
                features = features.permute(1, 2, 0)

            features = features.unsqueeze(1).to(device)
            # All-True mask over the first two feature axes.
            mask = torch.ones(features.shape[:2], device=device).bool()

            # Clear gradients before backward so that leftovers from an
            # earlier, interrupted run cannot leak into this update.
            optimizer.zero_grad()
            output = model(features, mask)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            sum_loss += loss.item()
            _, predicted = output.max(1)
            num_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

        # Use this function's own loader and n_epochs (not notebook globals),
        # and report zeros instead of dividing by zero for an empty loader.
        num_batches = len(loader)
        train_accuracy = num_correct / total_samples if total_samples else 0.0
        train_avg_loss = sum_loss / num_batches if num_batches else 0.0
        print(f'Epoch [{epoch+1}/{n_epochs}], Training Accuracy: {train_accuracy:.4f}, Training Loss: {train_avg_loss:.4f}')

In [20]:
# Training configuration: epoch count, feature front-end, loss and optimizer.
num_epochs = 3
feature_extractor = 'MFCC'
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [33]:
# Fit the model on the training split using the configuration defined earlier.
train_loop(model, train_loader, feature_extractor, criterion, optimizer, num_epochs, device)

ValueError: too many values to unpack (expected 3)

# Evaluation

In [None]:
def evaluate_model(model, test_loader, feature_extractor, criterion, device):
    """
        Evaluate ``model`` on ``test_loader`` and print the mean batch loss
        and the accuracy.

        Args:
            model: module called as ``model(features, mask)``, the same way the
                training loop in this notebook calls it. It is not moved to
                ``device`` here.
            test_loader: iterable of ``(waveform, label)`` batches that supports
                ``len()`` and exposes ``test_loader.dataset.sample_rate``.
            feature_extractor: 'MFCC', 'CQCC' or 'LPS' (dispatched to the
                matching ``feature_extr.get_*`` function), or any callable
                taking ``(waveform, sample_rate)`` and returning a tensor.
            criterion: loss called as ``criterion(output, label)``.
            device: device that labels, features and the mask are moved to.

        Raises:
            ValueError: if ``feature_extractor`` is neither a known name nor callable.
    """
    # Resolve the extractor once instead of re-dispatching on every batch.
    if callable(feature_extractor):
        extract = feature_extractor
    elif feature_extractor == 'MFCC':
        extract = feature_extr.get_MFCC
    elif feature_extractor == 'CQCC':
        extract = feature_extr.get_CQCC
    elif feature_extractor == 'LPS':
        extract = feature_extr.get_LPS
    else:
        raise ValueError(f"Invalid feature extractor: {feature_extractor}")

    model.eval()
    test_loss = 0.0
    test_correct = 0
    total_test_samples = 0
    sample_rate = test_loader.dataset.sample_rate

    with torch.no_grad():
        for waveform, label in tqdm(test_loader):
            label = label.to(device)
            features = extract(waveform, sample_rate)

            # The model must see the same input layout and mask at evaluation
            # time as during training, so this mirrors the preparation done in
            # train_loop.
            # NOTE(review): that layout logic is itself unconfirmed (the
            # recorded training run failed on a shape unpack) - keep both
            # functions in sync when it is corrected.
            if features.dim() == 2:
                features = features.unsqueeze(0)
            if features.dim() == 3:
                features = features.permute(1, 2, 0)

            features = features.unsqueeze(1).to(device)
            mask = torch.ones(features.shape[:2], device=device).bool()

            output = model(features, mask)
            loss = criterion(output, label)
            test_loss += loss.item()

            _, predicted = output.max(1)
            test_correct += (predicted == label).sum().item()
            # Count samples from the labels: the leading feature axis is not
            # guaranteed to be the batch axis after the rearrangement above.
            total_test_samples += label.size(0)

    # Report zeros instead of dividing by zero for an empty loader.
    num_batches = len(test_loader)
    test_accuracy = test_correct / total_test_samples if total_test_samples else 0.0
    test_loss = test_loss / num_batches if num_batches else 0.0

    print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}')

In [None]:
# Report loss and accuracy of the model on the test split.
evaluate_model(model, test_loader, feature_extractor, criterion, device)

In [None]:
# def train_loop(loader):
#     n_epochs = 5
#     for epoch in n_epochs:
#         for data, labels in loader:
#             feature_extr.MFCC() # or any other
#             output = model(data)
#             loss = ...
#     pass