In [1]:
# !pip uninstall torch torchaudio torchvision -y

In [2]:
# !pip install torch==2.2.2 torchaudio==2.2.2

In [3]:
!pip install torch torchaudio torchvision datasets -q

[0m

In [4]:
import random
import IPython

import datasets

from datasets.utils import DownloadManager

import numpy as np

import torch
import torchaudio
import torchaudio.transforms as T

from torch.utils.data import DataLoader, Dataset

import os
import sys

from tqdm import tqdm
import torchvision
import soundfile as sf
import torch.nn as nn
import torchvision.models as models
from librosa import util
import librosa.feature

In [5]:
sys.path.append('src')

In [6]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# Load Dataset .zip from link

Logical access (LA):

- ```speaker_id:``` LA_****, a 4-digit speaker ID
- ```audio_file_name:``` name of the audio file
- ```audio:``` '****.flac', the path to the downloaded audio file in FLAC format (https://xiph.org/flac/).
- ```system_id:``` ID of the speech spoofing system (A01 - A19); for bonafide speech the system ID is left blank ('-')
- ```key:``` 'bonafide' for genuine speech or 'spoof' for spoofed speech

In [7]:
# # For kaggle
# import sys
# sys.path.append('/kaggle/input/modulesspeechdetection')


In [8]:
%%time
from load_avsspoof19 import ASVspoof2019

# Download manager handed to the dataset builder below.
dl_manager = DownloadManager()

# ASVspoof2019 is the project-local builder imported from load_avsspoof19.
speech_dataset = ASVspoof2019()
# NOTE(review): _split_generators is a private builder hook that is called
# directly here; judging by the cell output it performs the download/checksum
# step and returns one entry per split — confirm against load_avsspoof19.
asv_datasets = speech_dataset._split_generators(dl_manager)

Computing checksums: 100%|##########| 1/1 [00:23<00:00, 23.59s/it]

CPU times: user 22.5 s, sys: 1.27 s, total: 23.7 s
Wall time: 57.4 s


In [9]:
# Positions in asv_datasets: 0 - train, 1 - eval, 2 - test.
# Each entry's gen_kwargs is read for its metadata file path and audio directory.
train_kwargs = asv_datasets[0].gen_kwargs
train_metadata_filepath, train_audios_dir = (
    train_kwargs["metadata_filepath"],
    train_kwargs["audios_dir"],
)

val_kwargs = asv_datasets[1].gen_kwargs
val_metadata_filepath, val_audios_dir = (
    val_kwargs["metadata_filepath"],
    val_kwargs["audios_dir"],
)

test_kwargs = asv_datasets[2].gen_kwargs
test_metadata_filepath, test_audios_dir = (
    test_kwargs["metadata_filepath"],
    test_kwargs["audios_dir"],
)

In [10]:
# Generate the examples of every split from its metadata file and audio directory,
# in the order train, validation, test.
train_samples, val_samples, test_samples = (
    speech_dataset._generate_examples(metadata_filepath, audios_dir)
    for metadata_filepath, audios_dir in (
        (train_metadata_filepath, train_audios_dir),
        (val_metadata_filepath, val_audios_dir),
        (test_metadata_filepath, test_audios_dir),
    )
)

## Listen to random sample

Display some audio from train/validation set.

In [11]:
def get_sample(samples):
    """Return one randomly drawn element of ``samples``."""
    (picked,) = random.sample(samples, k=1)
    return picked

sample = get_sample(train_samples)
sample

{'speaker_id': 'LA_0093',
 'audio_file_name': 'LA_T_4167933',
 'system_id': 'A06',
 'key': 'spoof',
 'audio': '/root/.cache/huggingface/datasets/downloads/extracted/911103f86670b6f7e96211444d0f39fc5ffab511156a395f67b098c2f45dce18/LA/ASVspoof2019_LA_train/flac/LA_T_4167933.flac'}

In [12]:
print(sample['key'])
IPython.display.Audio(sample['audio'])

spoof


In [13]:
sample = get_sample(val_samples)
sample

{'speaker_id': 'LA_0078',
 'audio_file_name': 'LA_D_7064559',
 'system_id': 'A02',
 'key': 'spoof',
 'audio': '/root/.cache/huggingface/datasets/downloads/extracted/911103f86670b6f7e96211444d0f39fc5ffab511156a395f67b098c2f45dce18/LA/ASVspoof2019_LA_dev/flac/LA_D_7064559.flac'}

In [14]:
print(sample['key'])
IPython.display.Audio(sample['audio'])

spoof


## Audio preprocessing


In [15]:
def audio_preprocess(waveform, sample_rate, resample_rate, desired_duration):
    """
        Resample audio to the target rate and force a fixed duration.

        Args:
            waveform: 2-D tensor indexed as (channels, time); dim 1 is the time axis.
            sample_rate: sampling rate of ``waveform``.
            resample_rate: target sampling rate (e.g. 16 kHz or 22.05 kHz).
            desired_duration: output duration; the output length in samples is
                ``int(desired_duration * resample_rate)``.

        Returns:
            A tensor with the same number of channels and exactly the desired
            number of samples. Audio that is too short is repeated end-to-end
            until it is long enough; audio that is too long is cut off. The
            result never shares memory with the input.

        Raises:
            ValueError: if the waveform has no samples but a non-empty output
                is requested (there is nothing to repeat).
    """
    # Resampling to the same rate returns the input unchanged, so the
    # resampler only needs to be built when the rates actually differ.
    if sample_rate != resample_rate:
        waveform = T.Resample(sample_rate, resample_rate)(waveform)

    desired_length = int(desired_duration * resample_rate)
    # The time axis is dim 1; len(waveform) would report the channel count.
    n_samples = waveform.shape[1]

    if n_samples < desired_length:
        if n_samples == 0:
            raise ValueError("Cannot extend an empty waveform to the desired duration")
        # Ceiling division: the smallest repeat count that reaches desired_length.
        repeats = -(-desired_length // n_samples)
        return waveform.tile((1, repeats))[:, :desired_length]

    # Copy so that later in-place processing (e.g. normalization) cannot
    # modify the tensor that was passed in.
    return waveform[:, :desired_length].clone()


def peak_normalize(waveform):
    """
        Scale audio so that its largest absolute sample becomes 1.

        The division is performed in place: the tensor passed in is modified
        and the same tensor is returned.

        Empty input and all-zero (silent) input are returned unchanged, since
        dividing by a zero peak would fill the tensor with NaNs.
    """
    # torch.max raises on a tensor without elements.
    if waveform.numel() == 0:
        return waveform

    peak = torch.max(torch.abs(waveform))
    if peak > 0:
        waveform /= peak
    return waveform

In [16]:
class AudioDataset(Dataset):
    """Dataset that loads one audio file per metadata record.

    Every record of ``raw_dataset`` is read through two keys: ``'audio'``
    (passed to ``torchaudio.load``) and ``'key'`` (``'spoof'`` becomes
    label 1, any other value label 0).

    ``transform``, when truthy, is indexed as a two-item sequence: item 0 is
    called with (waveform, source rate, target rate, duration) and item 1 is
    called with the waveform returned by item 0.
    """

    def __init__(self, raw_dataset, desired_duration, resample_rate, transform=None):
        self.raw_data = raw_dataset
        self.duration = desired_duration
        self.sample_rate = resample_rate
        self.transform = transform

    def __len__(self):
        """Number of records in the wrapped dataset."""
        return len(self.raw_data)

    def __getitem__(self, idx):
        """Return ``(waveform, label)`` for the record at position ``idx``."""
        audio, native_rate = torchaudio.load(self.raw_data[idx]['audio'])
        if self.transform:
            audio = self.transform[0](audio, native_rate, self.sample_rate, self.duration)
            audio = self.transform[1](audio)

        is_spoof = self.raw_data[idx]['key'] == 'spoof'
        return audio, (1 if is_spoof else 0)

In [47]:
_DURATION = 6          # seconds of audio kept per utterance
_SAMPLE_RATE = 16_000  # target sampling rate in Hz
_BATCH_SIZE = 36

# drop_last=True discards a trailing partial batch, so every batch holds exactly
# _BATCH_SIZE items whatever the dataset size. It replaces the hard-coded
# test_samples[:-29] trim, which only fit one specific dataset/batch-size pair.
# NOTE(review): the model is constructed with batch_size=_BATCH_SIZE, so it
# presumably requires full batches — confirm against models.TE_ResNet.
train_dataset = AudioDataset(train_samples, _DURATION, _SAMPLE_RATE, transform=[audio_preprocess, peak_normalize])
train_loader = DataLoader(train_dataset, batch_size=_BATCH_SIZE, shuffle=True, drop_last=True)

test_dataset = AudioDataset(test_samples, _DURATION, _SAMPLE_RATE, transform=[audio_preprocess, peak_normalize])
test_loader = DataLoader(test_dataset, batch_size=_BATCH_SIZE, shuffle=False, drop_last=True)

In [19]:
# Draw a single batch and show the waveform and label shapes. Calling
# next(iter(train_loader)) once per element would load two separate
# shuffled batches instead of inspecting one.
batch_waveforms, batch_labels = next(iter(train_loader))
batch_waveforms.shape, batch_labels.shape

(torch.Size([36, 1, 96000]), torch.Size([36]))

# Feature Extraction Definition
Supported features: log power spectrum (LPS), Mel-frequency cepstral coefficients (MFCC), and constant-Q cepstral coefficients (CQCC).

In [20]:
from feature_ext import get_MFCC, get_CQCC, get_LPS

In [21]:
# Name of the active feature extractor; the other available keys are 'CQCC' and 'LPS'.
feature_extractor = 'MFCC'
feature_extr = dict(MFCC=get_MFCC, CQCC=get_CQCC, LPS=get_LPS)

# Run the selected extractor on one training batch and keep the size of dim 1
# of its output as the feature count.
sample = next(iter(train_loader))[0]
sample_features = feature_extr[feature_extractor](sample, train_loader.dataset.sample_rate)
n_features = sample_features.shape[1]

torch.Size([36, 12, 188])

# Model definition

In [22]:
# model = torchvision.models.resnet18(pretrained=True)
# n_ftrs = model.fc.in_features
# model.fc=nn.Linear(n_ftrs,2)
# model.conv1 = nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3, bias=False)
# model.conv1 = nn.Conv2d(1, 64, kernel_size=(9, 3), stride=(3, 1), padding=(1, 1), bias=False)

In [None]:
from models import TE_ResNet

In [24]:
# Build the project-local TE_ResNet and move it to the selected device.
# NOTE(review): n_frames=188 is hard-coded. It matches dim 2 of the feature
# batch shape shown earlier in this notebook, so it presumably has to change
# together with _DURATION, _SAMPLE_RATE or the feature extractor — confirm
# against models.TE_ResNet.
model = TE_ResNet(
    n_features=n_features,
    n_frames=188,
    num_layers=6,
    heads=8,
    hidden_size=256,
    output_channels=128,
    batch_size=_BATCH_SIZE,
    device=device,
).to(device)

In [25]:
model

TE_ResNet(
  (transformer_encoder): TransformerEncoder(
    (linear): Linear(in_features=20, out_features=256, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (multihead_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
    )
    (feed_forward): Sequential(
      (0): Linear(in_features=256, out_features=256, bias=True)
      (1): ReLU()
      (2): Linear(in_features=256, out_features=256, bias=True)
    )
    (encoder_layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (nor

# Metrics Definition

In [26]:
from metrics import compute_det_curve, compute_eer

# Training

In [27]:
def truncate_sequence(features, max_length):
    """Force dim 1 of a 3-D tensor to exactly ``max_length`` entries.

    Args:
        features: tensor of shape (batch_size, seq_length, feature_dim).
        max_length: required size of dim 1.

    Returns:
        A tensor of shape (batch_size, max_length, feature_dim). Longer input
        is cut off at ``max_length``; shorter input is zero-padded at the end.
        The padding takes the dtype and device of ``features``, so the result
        keeps the input dtype (e.g. half precision stays half precision).
    """
    batch_size, seq_length, feature_dim = features.size()
    if seq_length > max_length:
        return features[:, :max_length, :]
    if seq_length == max_length:
        return features

    # new_zeros inherits dtype and device from `features`; a plain torch.zeros
    # would be float32 and make torch.cat promote the whole result.
    padding = features.new_zeros(batch_size, max_length - seq_length, feature_dim)
    return torch.cat((features, padding), dim=1)

def train_loop(model, loader, feature_extractor, criterion, optimizer, n_epochs, device, max_length):
    """Train ``model`` for ``n_epochs`` passes over ``loader``.

    After every epoch the training accuracy and the mean batch loss are printed.

    Args:
        model: called as ``model(features)``; its output is passed to
            ``criterion`` and its arg-max over dim 1 is compared with the labels.
        loader: iterable of ``(data, labels)`` batches. ``loader.dataset.sample_rate``
            is forwarded to the feature extractor.
        feature_extractor: one of ``'MFCC'``, ``'CQCC'`` or ``'LPS'``.
        criterion: loss callable, used as ``criterion(output, labels)``.
        optimizer: optimizer that updates the model parameters.
        n_epochs: number of passes over ``loader``.
        device: device that the batches and features are moved to.
        max_length: accepted for interface compatibility; not used.

    Raises:
        ValueError: if ``feature_extractor`` is not a known name, or if the
            model output and the labels disagree on the batch size.
    """
    feature_extr = {
        'MFCC': get_MFCC,
        'CQCC': get_CQCC,
        'LPS': get_LPS
    }
    # The extractor choice does not change between batches: validate it once.
    if feature_extractor not in feature_extr:
        raise ValueError(f"Invalid feature extractor: {feature_extractor}")
    extract_features = feature_extr[feature_extractor]
    sample_rate = loader.dataset.sample_rate

    for epoch in range(n_epochs):
        model.train(True)
        sum_loss = 0.0
        num_correct = 0
        total_samples = 0
        num_batches = 0

        for data, labels in tqdm(loader):
            data = data.to(device)
            labels = labels.to(device)

            features = extract_features(data, sample_rate).to(device)

            # Clear gradients before the backward pass so that gradients
            # accumulated before this step never leak into the update.
            optimizer.zero_grad()

            # Forward pass through the model
            output = model(features)

            # Check for shape mismatch
            if output.shape[0] != labels.shape[0]:
                raise ValueError(f"Mismatch in batch sizes: output batch size {output.shape[0]}, labels batch size {labels.shape[0]}")

            # Compute loss and update the parameters
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            sum_loss += loss.item()
            _, predicted = output.max(1)
            num_correct += (predicted == labels).sum().item()
            total_samples += data.size(0)
            num_batches += 1

        # A loader that yields nothing gives NaN metrics instead of a ZeroDivisionError.
        train_accuracy = num_correct / total_samples if total_samples else float('nan')
        train_avg_loss = sum_loss / num_batches if num_batches else float('nan')
        print(f'Epoch [{epoch+1}/{n_epochs}], Training Accuracy: {train_accuracy:.4f}, Training Loss: {train_avg_loss:.4f}')



In [28]:
# Number of passes over the training loader (passed to train_loop as n_epochs).
num_epochs = 3
# Adam over all model parameters with a fixed learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Cross-entropy over the model output and the integer labels (1 = spoof, 0 = otherwise).
criterion = nn.CrossEntropyLoss()

In [29]:
# Release unused memory cached by PyTorch's CUDA allocator before training starts.
torch.cuda.empty_cache()
# NOTE: train_loop accepts max_length but never reads it, so the value
# passed here has no effect on training.
train_loop(
    model,
    train_loader,
    feature_extractor=feature_extractor,
    criterion=criterion,
    optimizer=optimizer,
    n_epochs=num_epochs,
    device=device,
    max_length=188
)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 705/705 [07:53<00:00,  1.49it/s]


Epoch [1/3], Training Accuracy: 0.8981, Training Loss: 0.3383


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 705/705 [07:53<00:00,  1.49it/s]


Epoch [2/3], Training Accuracy: 0.8983, Training Loss: 0.3318


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 705/705 [07:52<00:00,  1.49it/s]

Epoch [3/3], Training Accuracy: 0.8983, Training Loss: 0.3304





# Evaluation

In [51]:
import torch
from tqdm import tqdm

def evaluate_model(model, test_loader, feature_extractor, criterion, device, max_length):
    """Evaluate ``model`` on ``test_loader`` without gradient tracking.

    The mean batch loss and the accuracy are printed and returned together
    with the raw model outputs, split by label.

    Args:
        model: called as ``model(features)``; its arg-max over dim 1 is
            compared with the labels.
        test_loader: iterable of ``(waveform, label)`` batches.
            ``test_loader.dataset.sample_rate`` is forwarded to the feature extractor.
        feature_extractor: one of ``'MFCC'``, ``'CQCC'`` or ``'LPS'``.
        criterion: loss callable, used as ``criterion(output, label)``.
        device: device that the batches and features are moved to.
        max_length: accepted for interface compatibility; not used.

    Returns:
        ``(test_loss, test_accuracy, target_scores, non_target_scores)``.
        ``target_scores`` holds the model output rows of the samples with
        label 1 and ``non_target_scores`` those with label 0; every entry is
        the full output row converted with ``tolist()``, not a single number.

    Raises:
        ValueError: if ``feature_extractor`` is not a known name, or if the
            model output and the labels disagree on the batch size.
    """
    feature_extr = {
        'MFCC': get_MFCC,
        'CQCC': get_CQCC,
        'LPS': get_LPS
    }
    # The extractor choice does not change between batches: validate it once.
    if feature_extractor not in feature_extr:
        raise ValueError(f"Invalid feature extractor: {feature_extractor}")
    extract_features = feature_extr[feature_extractor]
    sample_rate = test_loader.dataset.sample_rate

    model.eval()
    test_loss = 0.0
    test_correct = 0
    total_test_samples = 0
    num_batches = 0
    target_scores, non_target_scores = [], []

    with torch.no_grad():
        for waveform, label in tqdm(test_loader):
            waveform, label = waveform.to(device), label.to(device)

            features = extract_features(waveform, sample_rate).to(device)

            # Forward pass through the model
            output = model(features)

            # Check for shape mismatch
            if output.shape[0] != label.shape[0]:
                raise ValueError(f"Mismatch in batch sizes: output batch size {output.shape[0]}, label batch size {label.shape[0]}")

            # Compute loss
            loss = criterion(output, label)
            test_loss += loss.item()

            _, predicted = output.max(1)
            test_correct += (predicted == label).sum().item()
            total_test_samples += label.size(0)
            num_batches += 1

            # Store the output rows separately for label 1 and label 0
            target_scores.extend(output[label == 1].tolist())
            non_target_scores.extend(output[label == 0].tolist())

    # A loader that yields nothing gives NaN metrics instead of a ZeroDivisionError.
    test_accuracy = test_correct / total_test_samples if total_test_samples else float('nan')
    test_loss = test_loss / num_batches if num_batches else float('nan')

    # The message is labelled "Val" although the loader argument is named test_loader.
    print(f'Val Loss: {test_loss:.4f}, Val Accuracy: {test_accuracy:.4f}')#, Val EER: {eer}')

    return test_loss, test_accuracy, target_scores, non_target_scores


In [52]:
# Release unused memory cached by PyTorch's CUDA allocator before evaluating.
torch.cuda.empty_cache()
# The score lists come straight from evaluate_model, so they need no
# pre-initialisation. max_length is accepted by evaluate_model but not used.
loss, acc, target_scores, non_target_scores = evaluate_model(
    model,
    test_loader,
    feature_extractor=feature_extractor,
    criterion=criterion,
    device=device,
    max_length=188
)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1978/1978 [18:29<00:00,  1.78it/s]

Val Loss: 0.3352, Val Accuracy: 0.8968





In [None]:
# evaluate_model returns one full model output row per utterance, not a single
# score. Collapse every row to one detection score, logit[1] - logit[0]
# (label 1 is 'spoof' in AudioDataset), and pass 1-D arrays to compute_eer.
# NOTE(review): this assumes a two-logit output and that compute_eer expects
# 1-D NumPy arrays of scalar scores in which targets score higher — confirm
# against metrics.compute_eer.
target_logits = np.asarray(target_scores, dtype=float).reshape(-1, 2)
non_target_logits = np.asarray(non_target_scores, dtype=float).reshape(-1, 2)

eer, threshold = compute_eer(
    target_logits[:, 1] - target_logits[:, 0],
    non_target_logits[:, 1] - non_target_logits[:, 0],
)

print(f"Equal Error Rate (EER): {eer:.4f}")
print(f"Threshold: {threshold:.4f}")