In [1]:
import scipy.signal as signal
import numpy as np
import scipy.io.wavfile as wav
import sounddevice as sd
import librosa
from pathlib import Path
import pandas
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
from transformers import Wav2Vec2Model, HubertModel
import torch.optim as optim
# Paths are resolved against the kernel's working directory, so DATA_DIR only
# points at the data when the notebook is started from the project folder.
PROJECT_ROOT = Path.cwd()
DATA_DIR = PROJECT_ROOT / "code" / "experiments" / "data" / "ASVSpoof2019"

# Audio geometry: every clip is brought to TARGET_SEC seconds at SAMPLE_RATE Hz.
SAMPLE_RATE = 16000
TARGET_SEC = 4.0
TARGET_LEN = int(SAMPLE_RATE * TARGET_SEC)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class ASVSpoofDataset(Dataset):
    """Fixed-length waveform dataset over a directory of ``.flac`` files.

    Each item is ``(waveform, label)``: ``waveform`` is a float tensor with
    ``target_len`` samples, ``label`` is a long tensor holding 1 when the
    protocol label is ``'bonafide'`` and 0 for any other label.
    """

    def __init__(self, flac_dir, labels_path, sample_rate=16000, target_sec=4.0):
        """
        Index the audio files and load the protocol (label) table.

        Args:
            flac_dir: directory scanned (non-recursively) for ``*.flac`` files.
            labels_path: whitespace-separated protocol file without a header
                row; column 1 is used as the file name (matched against the
                audio file stem) and column 4 as the label string.
            sample_rate: sampling rate requested from ``librosa.load``.
            target_sec: duration, in seconds, every clip is cropped/padded to.

        Attributes set: ``flac_dir``, ``sample_rate``, ``target_len``,
        ``files`` (sorted list of paths), ``labels_df`` (raw protocol table),
        ``file_names`` (column 1) and ``target`` (file name -> label string).
        """
        self.flac_dir = flac_dir
        self.sample_rate = sample_rate
        self.target_len = int(sample_rate * target_sec)
        self.files = sorted(Path(flac_dir).glob("*.flac"))
        self.labels_df = pandas.read_csv(labels_path, sep=r"\s+", header=None)
        self.file_names = self.labels_df[1]
        self.target = dict(zip(self.labels_df[1], self.labels_df[4]))

    def __len__(self):
        """Number of ``.flac`` files found in ``flac_dir``."""
        return len(self.files)

    def __getitem__(self, idx):
        """
        Load file ``idx``, bring it to ``target_len`` samples and attach its label.

        Raises:
            KeyError: if the file has no entry in the protocol table.
        """
        file_path = self.files[idx]

        x, _ = librosa.load(file_path, sr=self.sample_rate)
        x = self.normalize_duration(x)
        y = self._label_for(file_path.stem)

        return torch.from_numpy(x).float(), torch.tensor(y, dtype=torch.long)

    def _label_for(self, file_name):
        """Map a file stem to its class index (1 = 'bonafide', 0 = anything else)."""
        try:
            target_str = self.target[file_name]
        except KeyError:
            # A missing entry used to fall through to label 0, silently
            # turning unlabeled audio into training examples of class 0.
            raise KeyError(f"no protocol entry for audio file {file_name!r}") from None
        return 1 if target_str == 'bonafide' else 0

    def normalize_duration(self, x):
        """
        Return ``x`` cropped or zero-padded to exactly ``self.target_len`` samples.

        Longer inputs get a random contiguous window (drawn from NumPy's
        global RNG); shorter inputs are right-padded with zeros; inputs that
        already have the target length are returned unchanged.
        """
        target_len = self.target_len
        cur_len = len(x)

        if cur_len > target_len:
            # randint's upper bound is exclusive: +1 makes the final window
            # (the one ending on the last sample) selectable.
            start = np.random.randint(0, cur_len - target_len + 1)
            return x[start:start + target_len]

        if cur_len < target_len:
            return np.pad(x, (0, target_len - cur_len), mode='constant')

        return x

In [3]:
import transformers
class EnsembleModel(nn.Module):
    """Two pretrained speech encoders feeding one linear classification head.

    The input is passed unchanged to both the wav2vec 2.0 and the HuBERT
    encoder; each encoder's ``last_hidden_state`` is averaged over dim 1,
    the two averages are concatenated, and a single ``nn.Linear`` maps the
    result to ``num_classes`` logits.
    """

    def __init__(self, num_classes=2):
        """Load both pretrained encoders and size the head from their configs."""
        super().__init__()
        self.wav2vec2 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base")
        self.hubert = HubertModel.from_pretrained("facebook/hubert-base-ls960")

        # Head input width = sum of the two encoders' hidden sizes.
        self.classifier = nn.Linear(
            self.wav2vec2.config.hidden_size + self.hubert.config.hidden_size,
            num_classes,
        )

    def forward(self, x):
        """Return classifier logits for ``x``.

        NOTE(review): ``x`` is presumably a (batch, samples) waveform batch as
        produced by the data loaders in this notebook - confirm.
        """
        pooled = [
            encoder(x).last_hidden_state.mean(dim=1)
            for encoder in (self.wav2vec2, self.hubert)
        ]
        return self.classifier(torch.cat(pooled, dim=1))
    


In [None]:
def train_one_epoch(epoch_index, tb_writer, train_loader, optimizer, loss_fn, model, device, log_every=50):
    """Run one optimisation pass over ``train_loader`` and log the loss.

    Args:
        epoch_index: zero-based epoch number; used to compute the global step.
        tb_writer: object exposing ``add_scalar(tag, value, step)`` and ``flush()``.
        train_loader: sized iterable yielding ``(audios, labels)`` batches.
        optimizer: optimizer stepped once per batch.
        loss_fn: callable ``loss_fn(outputs, labels)`` returning a scalar tensor.
        model: module put into training mode and called on each batch.
        device: target passed to ``.to(...)`` for both tensors of a batch.
        log_every: number of batches averaged into each "Loss/Train" point.

    Returns:
        Sum of per-batch losses divided by ``max(1, len(train_loader))``.
    """
    model.train()
    window_sum = 0.0  # losses accumulated since the last "Loss/Train" point
    epoch_sum = 0.0   # losses accumulated over the whole epoch

    for step, (audios, labels) in enumerate(train_loader, start=1):
        audios = audios.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = loss_fn(model(audios), labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        epoch_sum += batch_loss
        window_sum += batch_loss

        if step % log_every != 0:
            continue

        # Global step counts batches across epochs, so curves from
        # consecutive epochs line up on one x-axis.
        global_step = epoch_index * len(train_loader) + step
        tb_writer.add_scalar("Loss/Train", window_sum / log_every, global_step)
        window_sum = 0.0

    epoch_avg = epoch_sum / max(1, len(train_loader))
    tb_writer.add_scalar("Loss/Train_epoch", epoch_avg, epoch_index + 1)
    tb_writer.flush()
    return epoch_avg


In [5]:
# Both the audio folder and the protocol file live under the "LA" subtree.
la_root = DATA_DIR / "LA"
train_flac_dir = la_root / "ASVSpoof2019_LA_train" / "flac"
labels_file = la_root / "ASVspoof2019_LA_cm_protocols" / "ASVspoof2019.LA.cm.train.trn.txt"

flac_dataset = ASVSpoofDataset(flac_dir=train_flac_dir, labels_path=labels_file)

In [6]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset

# 80 / 10 / 10 split over dataset indices: first carve off 80% for training,
# then halve the remainder into validation and test. Both calls use the same
# fixed random_state so the partition is repeatable.
indices = list(range(len(flac_dataset)))
train_split, temp_split = train_test_split(
    indices, train_size=0.8, shuffle=True, random_state=10
)
val_split, test_split = train_test_split(
    temp_split, train_size=0.5, random_state=10
)

train_subset, val_subset, test_subset = (
    Subset(flac_dataset, split) for split in (train_split, val_split, test_split)
)

# Settings shared by all three loaders; only the training loader shuffles
# and pins memory.
loader_kwargs = dict(batch_size=16, num_workers=0)
train_loader = DataLoader(train_subset, shuffle=True, pin_memory=True, **loader_kwargs)
val_loader = DataLoader(val_subset, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_subset, shuffle=False, **loader_kwargs)


In [7]:
# Sanity check: number of batches per loader.
for caption, loader in (
    ("len(train_loader) =", train_loader),
    ("len(val_loader)   =", val_loader),
):
    print(caption, len(loader))

len(train_loader) = 1269
len(val_loader)   = 159


In [8]:
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = EnsembleModel()
model = model.to(device)

# Cross-entropy over the classifier logits; Adam updates every model parameter.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)

Loading weights: 100%|██████████| 211/211 [00:00<00:00, 1290.48it/s, Materializing param=masked_spec_embed]                                            
Wav2Vec2Model LOAD REPORT from: facebook/wav2vec2-base
Key                          | Status     |  | 
-----------------------------+------------+--+-
quantizer.weight_proj.bias   | UNEXPECTED |  | 
quantizer.codevectors        | UNEXPECTED |  | 
project_hid.weight           | UNEXPECTED |  | 
quantizer.weight_proj.weight | UNEXPECTED |  | 
project_q.weight             | UNEXPECTED |  | 
project_hid.bias             | UNEXPECTED |  | 
project_q.bias               | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Loading weights: 100%|██████████| 211/211 [00:00<00:00, 833.19it/s, Materializing param=masked_spec_embed]                                            


In [None]:
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter

# One timestamp shared by the TensorBoard run directory and checkpoint names.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

num_epochs = 5
best_vloss = float("inf")

# Reuse the notebook-wide root instead of a machine-specific absolute path;
# `project_root` is kept as an alias for the name this cell already used.
project_root = PROJECT_ROOT

log_dir = project_root / "runs" / f"fashion_trainer_{timestamp}"
writer = SummaryWriter(str(log_dir))

print("Writing logs to:", log_dir)

print("len(train_loader) =", len(train_loader))
print("len(val_loader)   =", len(val_loader))

try:
    for epoch in range(num_epochs):
        # train_one_epoch switches the model to training mode itself.
        avg_loss = train_one_epoch(epoch, writer, train_loader, optimizer, criterion, model, device)

        # Validation pass: eval mode, no gradient tracking.
        running_vloss = 0.0
        model.eval()
        with torch.no_grad():
            for vinputs, vlabels in val_loader:
                vinputs, vlabels = vinputs.to(device), vlabels.to(device)
                voutputs = model(vinputs)
                running_vloss += criterion(voutputs, vlabels).item()

        # Divide by the loader length (guarded like train_one_epoch) rather
        # than by a loop index that is undefined when the loader is empty.
        avg_vloss = running_vloss / max(1, len(val_loader))

        print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

        writer.add_scalars('Training vs. Validation Loss',
                           {'Training': avg_loss, 'Validation': avg_vloss},
                           epoch + 1)
        writer.flush()

        # Checkpoint only when validation loss improves on the best so far.
        if avg_vloss < best_vloss:
            best_vloss = avg_vloss
            model_path = project_root / f"model_{timestamp}_{epoch}.pt"
            torch.save(model.state_dict(), str(model_path))
finally:
    # Close the event file even if training is interrupted or raises.
    writer.close()

Writing logs to: C:\Users\egorv\Desktop\BProj\runs\fashion_trainer_20260128_203238
len(train_loader) = 1269
len(val_loader)   = 159


In [None]:
from torch.utils.tensorboard import SummaryWriter
from pathlib import Path

# Smoke test for TensorBoard logging: write one scalar into a scratch run
# directory under the project root (not a machine-specific absolute path).
log_dir = PROJECT_ROOT / "runs" / "debug_tb"

# The context manager closes the writer even if the write fails.
with SummaryWriter(str(log_dir)) as w:
    w.add_scalar("debug/alive", 1.0, 0)
    w.flush()

print("Wrote to:", log_dir)

Wrote to: C:\Users\egorv\Desktop\BProj\runs\debug_tb
