In [24]:
import numpy as np
import torch
import random

def set_seed(seed):
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_seed(520)

In [25]:
import os 
import json
import torch
import random
from pathlib import Path
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

class myDataset(Dataset):
    def __init__(self, data_dir, segment_len=128):
        self.data_dir = data_dir
        self.segment_len = segment_len
        
        mapping_path = Path(data_dir) / 'mapping.json'
        mapping = json.load(mapping_path.open())
        self.speaker2id = mapping['speaker2id']
        
        metadata_path = Path(data_dir) / 'metadata.json'
        metadata = json.load(open(metadata_path))['speakers']
        
        self.speaker_num = len(metadata.keys())
        self.data = []
        for speaker in metadata.keys():
            for utterances in metadata[speaker]:
                self.data.append([utterances['feature_path'], self.speaker2id[speaker]])
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        feat_path, speaker = self.data[index]
        mel = torch.load(os.path.join(self.data_dir, feat_path))
        
        if len(mel) > self.segment_len:
            # 只取128长度的语音序列来预测speaker
            start = random.randint(0, len(mel) - self.segment_len)
            mel = torch.FloatTensor(mel[start:start+self.segment_len])
        else:
            # 这里面可能存在比128段的序列，但是在组成batch时会填充
            mel = torch.FloatTensor(mel)  
        
        speaker = torch.FloatTensor([speaker]).long()
        return mel, speaker
    
    def get_speaker_number(self):
        return self.speaker_num
    
        

In [26]:
import json
import torch
from pathlib import Path


mapping_path = Path('/kaggle/input/ml2023springhw4/Dataset') / 'mapping.json'
mapping = json.load(mapping_path.open())
mapping  # dict类型


{'speaker2id': {'id00464': 0,
  'id00559': 1,
  'id00578': 2,
  'id00905': 3,
  'id01920': 4,
  'id02368': 5,
  'id04048': 6,
  'id04070': 7,
  'id04239': 8,
  'id05743': 9,
  'id06674': 10,
  'id07109': 11,
  'id00393': 12,
  'id00805': 13,
  'id01447': 14,
  'id02546': 15,
  'id03946': 16,
  'id04282': 17,
  'id04503': 18,
  'id05574': 19,
  'id06732': 20,
  'id06922': 21,
  'id07191': 22,
  'id07198': 23,
  'id07448': 24,
  'id07664': 25,
  'id07717': 26,
  'id08185': 27,
  'id08483': 28,
  'id08806': 29,
  'id09154': 30,
  'id00931': 31,
  'id01371': 32,
  'id01488': 33,
  'id01534': 34,
  'id02011': 35,
  'id02051': 36,
  'id02104': 37,
  'id02562': 38,
  'id03816': 39,
  'id03853': 40,
  'id03858': 41,
  'id04483': 42,
  'id04802': 43,
  'id04946': 44,
  'id05508': 45,
  'id05627': 46,
  'id06422': 47,
  'id09054': 48,
  'id09094': 49,
  'id09188': 50,
  'id00036': 51,
  'id00087': 52,
  'id00388': 53,
  'id00924': 54,
  'id01109': 55,
  'id01120': 56,
  'id01503': 57,
  'id02485


Train:  49% 971/2000 [00:54<00:38, 26.53 step/s, accuracy=0.72, loss=9.73, step=34971][A

In [27]:
metadata_path = Path('/kaggle/input/ml2023springhw4/Dataset') / 'metadata.json'
metadata = json.load(open(metadata_path))['speakers']
metadata.keys()

dict_keys(['id03074', 'id05623', 'id06406', 'id01014', 'id02426', 'id01503', 'id05996', 'id05687', 'id03749', 'id01502', 'id04665', 'id05905', 'id07454', 'id02359', 'id00982', 'id05624', 'id02295', 'id00407', 'id09234', 'id06068', 'id01238', 'id07362', 'id04082', 'id02731', 'id01528', 'id05282', 'id09039', 'id07581', 'id07496', 'id02886', 'id02472', 'id04292', 'id03977', 'id08903', 'id03349', 'id06050', 'id03938', 'id07395', 'id04940', 'id01830', 'id03958', 'id05018', 'id03912', 'id00801', 'id04847', 'id00995', 'id03746', 'id03785', 'id07254', 'id04246', 'id00805', 'id05470', 'id01104', 'id08954', 'id09224', 'id07664', 'id02018', 'id01686', 'id09125', 'id07333', 'id08402', 'id06332', 'id04282', 'id04686', 'id09243', 'id07493', 'id02838', 'id03353', 'id01241', 'id07053', 'id01270', 'id05779', 'id05522', 'id08305', 'id07960', 'id06462', 'id04703', 'id07745', 'id01780', 'id00371', 'id05194', 'id04565', 'id09271', 'id04897', 'id09054', 'id01371', 'id03009', 'id03955', 'id00905', 'id04877',

In [28]:
mel = torch.load('/kaggle/input/ml2023springhw4/Dataset/uttr-0002067f80214182ab863378bdcdd68a.pt')
mel.shape

torch.Size([660, 40])

In [29]:
import torch
from torch.utils.data import DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence

def collate_batch(batch):
    mel, speaker = zip(*batch)  # 解配对
    mel = pad_sequence(mel, batch_first=True, padding_value=-20)
    return mel, torch.FloatTensor(speaker).long()

def get_dataloader(data_dir, batch_size, n_workers):
    dataset = myDataset(data_dir)
    speaker_num = dataset.get_speaker_number()
    trainlen = int(0.9 * len(dataset))
    lengths = [trainlen, len(dataset) - trainlen]
    trainset, validset = random_split(dataset, lengths)  # dataset也是可以操作的
    
    train_loader = DataLoader(trainset,
                             batch_size=batch_size,
                             shuffle=True,
                             drop_last=True,
                             num_workers=n_workers,
                             pin_memory=True,
                             collate_fn=collate_batch)
    
    valid_loader = DataLoader(validset,
                             batch_size=batch_size,
                             num_workers=n_workers,
                             drop_last=True,
                             pin_memory=True,
                             collate_fn=collate_batch)
    
    return train_loader, valid_loader, speaker_num

In [30]:
!pip install conformer
import torch
import torch.nn as nn
import torch.nn.functional as F
# from conformer import ConformerBlock
from conformer import Conformer

[0m

In [32]:
class SelfAttentionPooling(nn.Module):
    def __init__(self, input_dim):
        super(SelfAttentionPooling, self).__init__()
        self.W = nn.Linear(input_dim, 1)
        
    def forward(self, batch_rep):
        softmax = nn.functional.softmax
        
        att_w = softmax(self.W(batch_rep).squeeze(-1)).unsqueeze(-1)
        utter_rep = torch.sum(batch_rep * att_w, dim=1)

        return utter_rep

In [33]:
class CrossEntropyLabelSmooth(nn.Module):

    def __init__(self, num_classes=600, epsilon=0.1, use_gpu=True):
        super(CrossEntropyLabelSmooth, self).__init__()
        self.num_classes = num_classes
        self.epsilon = epsilon
        self.use_gpu = use_gpu
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, inputs, targets, use_label_smoothing=True):
        log_probs = self.logsoftmax(inputs)
        targets = torch.zeros(log_probs.size()).scatter_(1, targets.unsqueeze(1).data.cpu(), 1)
        if self.use_gpu: targets = targets.to(torch.device('cuda'))
        if use_label_smoothing:
            targets = (1 - self.epsilon) * targets + self.epsilon / self.num_classes
        loss = (- targets * log_probs).mean(0).sum()
        return loss
    

class AMSoftmaxLoss(nn.Module): #requires classification layer for normalization 
    def __init__(self, m=0.35, s=30, d=256, num_classes=600, use_gpu=True , epsilon=0.1):
        super(AMSoftmaxLoss, self).__init__()
        self.m = m
        self.s = s 
        self.num_classes = num_classes
        self.CrossEntropy = CrossEntropyLabelSmooth(self.num_classes , use_gpu=use_gpu)

    def forward(self, features, labels , classifier):
        # x = torch.rand(32,2048)
        # label = torch.tensor([0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,])
        features = nn.functional.normalize(features, p=2, dim=1) # normalize the features
        with torch.no_grad():
            classifier.weight.div_(torch.norm(classifier.weight, dim=1, keepdim=True))

        cos_angle = classifier(features)
        cos_angle = torch.clamp( cos_angle , min = -1 , max = 1 ) 
        b = features.size(0)
        for i in range(b):
            cos_angle[i][labels[i]] = cos_angle[i][labels[i]]  - self.m 
        weighted_cos_angle = self.s * cos_angle
        log_probs = self.CrossEntropy(weighted_cos_angle , labels, use_label_smoothing=True)
        return log_probs

In [34]:


class Classifier(nn.Module):
    def __init__(self, d_model=80, n_spks=600, dropout=0.2):
        super().__init__()
        # 预处理网络，改变输入维度
        self.prenet = nn.Linear(40, d_model)
        self.encoder = Conformer(
                dim = d_model,
                depth = 2,          # 12 blocks
                dim_head = 64,
                heads = 8,
                ff_mult = 4,
                conv_expansion_factor = 2,
                conv_kernel_size = 31,
                attn_dropout = dropout,
                ff_dropout = dropout,
                conv_dropout = dropout
            )
        self.sap = SelfAttentionPooling(d_model)

        self.pred_layer = nn.Linear(d_model, n_spks)
        
    def forward(self, mels):
        out = self.prenet(mels)
        out = out.permute(1, 0, 2)
        out = self.encoder(out)
        out = out.transpose(0, 1)
        out = self.sap(out)
        return out

In [36]:
import math

import torch
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR


def get_cosine_schedule_with_warmup(
    optimizer: Optimizer,
    num_warmup_steps: int,
    num_training_steps: int,
    num_cycles: float = 0.5,
    last_epoch: int = -1,
):
    """
    Create a schedule with a learning rate that decreases following the values of the cosine function between the
    initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
    initial lr set in the optimizer.

    Args:
        optimizer (:class:`~torch.optim.Optimizer`):
        The optimizer for which to schedule the learning rate.
        num_warmup_steps (:obj:`int`):
        The number of steps for the warmup phase.
        num_training_steps (:obj:`int`):
        The total number of training steps.
        num_cycles (:obj:`float`, `optional`, defaults to 0.5):
        The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
        following a half-cosine).
        last_epoch (:obj:`int`, `optional`, defaults to -1):
        The index of the last epoch when resuming training.

    Return:
        :obj:`torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """
    def lr_lambda(current_step):
        # Warmup
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        # decadence
        progress = float(current_step - num_warmup_steps) / float(
            max(1, num_training_steps - num_warmup_steps)
        )
        return max(
            0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
        )

    return LambdaLR(optimizer, lr_lambda, last_epoch)

In [37]:
import torch


def model_fn(batch, model, criterion, device):
    """Forward a batch through the model."""

    mels, labels = batch
    mels = mels.to(device)
    labels = labels.to(device)
    
    outs = model(mels)
    
    loss = criterion(outs, labels, model.pred_layer)

    # Get the speaker id with highest probability.
    outs = model.pred_layer(outs)
    preds = outs.argmax(1)
    # Compute accuracy.
    accuracy = torch.mean((preds == labels).float())

    # Printout model's parameters
#     for name, param in model.named_parameters():
#         if param.requires_grad:
#             print(name)
        
    return loss, accuracy

In [38]:
from tqdm import tqdm
import torch


def valid(dataloader, model, criterion, device): 
    """Validate on validation set."""

    model.eval()
    running_loss = 0.0
    running_accuracy = 0.0
    pbar = tqdm(total=len(dataloader.dataset), ncols=0, desc="Valid", unit=" uttr")

    for i, batch in enumerate(dataloader):
        with torch.no_grad():
            loss, accuracy = model_fn(batch, model, criterion, device)
            running_loss += loss.item()
            running_accuracy += accuracy.item()

        pbar.update(dataloader.batch_size)
        pbar.set_postfix(
            loss=f"{running_loss / (i+1):.2f}",
            accuracy=f"{running_accuracy / (i+1):.2f}",
        )

    pbar.close()
    model.train()

    return running_accuracy / len(dataloader)

In [39]:
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, random_split


def parse_args():
    """arguments"""
    config = {
        "data_dir": "/kaggle/input/ml2023springhw4/Dataset",
        "save_path": "model.ckpt",
        "batch_size": 32,
        "n_workers": 8,
        "valid_steps": 2000,
        "warmup_steps": 1000,
        "save_steps": 10000,
        "total_steps": 70000,
    }

    return config


def main(
    data_dir,
    save_path,
    batch_size,
    n_workers,
    valid_steps,
    warmup_steps,
    total_steps,
    save_steps,
):
    """Main function."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[Info]: Use {device} now!")

    train_loader, valid_loader, speaker_num = get_dataloader(data_dir, batch_size, n_workers)
    train_iterator = iter(train_loader)
    print(f"[Info]: Finish loading data!",flush = True)

    model = Classifier(n_spks=speaker_num).to(device)
#     criterion = nn.CrossEntropyLoss()
    criterion = AMSoftmaxLoss(m=0.4, s=30)
    optimizer = AdamW(model.parameters(), lr=1e-3)
    scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)
    print(f"[Info]: Finish creating model!",flush = True)

    best_accuracy = -1.0
    best_state_dict = None

    pbar = tqdm(total=valid_steps, ncols=0, desc="Train", unit=" step")

    for step in range(total_steps):
        # Get data
        try:
            batch = next(train_iterator) # (32, 128, 40)
        except StopIteration:
            train_iterator = iter(train_loader)
            batch = next(train_iterator)

        loss, accuracy = model_fn(batch, model, criterion, device)
        batch_loss = loss.item()
        batch_accuracy = accuracy.item()

        # Updata model
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # Log
        pbar.update()
        pbar.set_postfix(
            loss=f"{batch_loss:.2f}",
            accuracy=f"{batch_accuracy:.2f}",
            step=step + 1,
        )

        # Do validation
        if (step + 1) % valid_steps == 0:
            pbar.close()

            valid_accuracy = valid(valid_loader, model, criterion, device)

            # keep the best model
            if valid_accuracy > best_accuracy:
                best_accuracy = valid_accuracy
                best_state_dict = model.state_dict()

            pbar = tqdm(total=valid_steps, ncols=0, desc="Train", unit=" step")

        # Save the best model so far.
        if (step + 1) % save_steps == 0 and best_state_dict is not None:
            torch.save(best_state_dict, save_path)
            pbar.write(f"Step {step + 1}, best model saved. (accuracy={best_accuracy:.4f})")

    pbar.close()


if __name__ == "__main__":
    main(**parse_args())

[Info]: Use cuda now!


Train:   0% 0/2000 [27:28<?, ? step/s]


[Info]: Finish loading data!
[Info]: Finish creating model!


  if __name__ == "__main__":
Train: 100% 2000/2000 [01:10<00:00, 28.30 step/s, accuracy=0.00, loss=16.72, step=2000]
Valid: 100% 5664/5667 [00:03<00:00, 1829.72 uttr/s, accuracy=0.03, loss=16.25]
Train: 100% 2000/2000 [01:10<00:00, 28.29 step/s, accuracy=0.16, loss=14.60, step=4000]
Valid: 100% 5664/5667 [00:03<00:00, 1777.13 uttr/s, accuracy=0.16, loss=15.08]
Train: 100% 2000/2000 [01:10<00:00, 28.53 step/s, accuracy=0.47, loss=13.31, step=6000]
Valid: 100% 5664/5667 [00:03<00:00, 1804.96 uttr/s, accuracy=0.29, loss=14.26]
Train: 100% 2000/2000 [01:10<00:00, 28.41 step/s, accuracy=0.31, loss=13.36, step=8000]
Valid: 100% 5664/5667 [00:03<00:00, 1805.06 uttr/s, accuracy=0.40, loss=13.56]
Train: 100% 2000/2000 [01:10<00:00, 28.46 step/s, accuracy=0.41, loss=13.36, step=1e+4]
Valid: 100% 5664/5667 [00:03<00:00, 1839.84 uttr/s, accuracy=0.48, loss=12.97]
                                      
Train:   0% 0/2000 [00:00<?, ? step/s]                                                
Train:   0

Step 10000, best model saved. (accuracy=0.4762)


Train: 100% 2000/2000 [01:10<00:00, 28.22 step/s, accuracy=0.72, loss=10.83, step=12000]
Valid: 100% 5664/5667 [00:03<00:00, 1775.19 uttr/s, accuracy=0.54, loss=12.48]
Train: 100% 2000/2000 [01:10<00:00, 28.36 step/s, accuracy=0.75, loss=10.61, step=14000]
Valid: 100% 5664/5667 [00:03<00:00, 1824.63 uttr/s, accuracy=0.58, loss=12.02]
Train: 100% 2000/2000 [01:11<00:00, 28.13 step/s, accuracy=0.78, loss=10.68, step=16000]
Valid: 100% 5664/5667 [00:03<00:00, 1808.68 uttr/s, accuracy=0.60, loss=11.71]
Train: 100% 2000/2000 [01:10<00:00, 28.47 step/s, accuracy=0.72, loss=10.10, step=18000]
Valid: 100% 5664/5667 [00:03<00:00, 1815.65 uttr/s, accuracy=0.61, loss=11.59]
Train: 100% 2000/2000 [01:10<00:00, 28.37 step/s, accuracy=0.62, loss=12.10, step=2e+4] 
Valid: 100% 5664/5667 [00:03<00:00, 1826.32 uttr/s, accuracy=0.64, loss=11.26]
                                      
Train:   0% 0/2000 [00:00<?, ? step/s]                                                
Train:   0% 5/2000 [00:00<01:26, 2

Step 20000, best model saved. (accuracy=0.6393)


Train: 100% 2000/2000 [01:10<00:00, 28.28 step/s, accuracy=0.84, loss=8.74, step=22000] 
Valid: 100% 5664/5667 [00:03<00:00, 1795.94 uttr/s, accuracy=0.65, loss=11.05]
Train: 100% 2000/2000 [01:10<00:00, 28.28 step/s, accuracy=0.75, loss=10.63, step=24000]
Valid: 100% 5664/5667 [00:03<00:00, 1823.66 uttr/s, accuracy=0.67, loss=10.86]
Train: 100% 2000/2000 [01:10<00:00, 28.55 step/s, accuracy=0.69, loss=10.91, step=26000]
Valid: 100% 5664/5667 [00:03<00:00, 1803.21 uttr/s, accuracy=0.67, loss=10.79]
Train: 100% 2000/2000 [01:11<00:00, 28.14 step/s, accuracy=0.69, loss=11.10, step=28000]
Valid: 100% 5664/5667 [00:03<00:00, 1786.71 uttr/s, accuracy=0.68, loss=10.56]
Train: 100% 2000/2000 [01:10<00:00, 28.48 step/s, accuracy=0.72, loss=8.68, step=3e+4]  
Valid: 100% 5664/5667 [00:03<00:00, 1827.36 uttr/s, accuracy=0.69, loss=10.47]
                                      
Train:   0% 0/2000 [00:00<?, ? step/s]                                                
Train:   0% 5/2000 [00:00<01:23, 2

Step 30000, best model saved. (accuracy=0.6879)


Train: 100% 2000/2000 [01:10<00:00, 28.26 step/s, accuracy=0.69, loss=9.09, step=32000] 
Valid: 100% 5664/5667 [00:03<00:00, 1818.34 uttr/s, accuracy=0.70, loss=10.36]
Train: 100% 2000/2000 [01:10<00:00, 28.30 step/s, accuracy=0.81, loss=9.30, step=34000] 
Valid: 100% 5664/5667 [00:03<00:00, 1821.40 uttr/s, accuracy=0.71, loss=10.15]
Train: 100% 2000/2000 [01:09<00:00, 28.66 step/s, accuracy=0.84, loss=9.34, step=36000] 
Valid: 100% 5664/5667 [00:03<00:00, 1806.92 uttr/s, accuracy=0.71, loss=10.14]
Train: 100% 2000/2000 [01:10<00:00, 28.47 step/s, accuracy=0.84, loss=8.50, step=38000] 
Valid: 100% 5664/5667 [00:03<00:00, 1833.16 uttr/s, accuracy=0.71, loss=9.97] 
Train: 100% 2000/2000 [01:10<00:00, 28.19 step/s, accuracy=0.66, loss=10.53, step=4e+4] 
Valid: 100% 5664/5667 [00:03<00:00, 1791.79 uttr/s, accuracy=0.72, loss=9.90]
                                      
Train:   0% 0/2000 [00:00<?, ? step/s]                                                
Train:   0% 5/2000 [00:00<01:20, 24

Step 40000, best model saved. (accuracy=0.7156)


Train: 100% 2000/2000 [01:10<00:00, 28.41 step/s, accuracy=0.88, loss=8.22, step=42000] 
Valid: 100% 5664/5667 [00:03<00:00, 1806.05 uttr/s, accuracy=0.73, loss=9.79]
Train: 100% 2000/2000 [01:10<00:00, 28.50 step/s, accuracy=0.81, loss=8.14, step=44000] 
Valid: 100% 5664/5667 [00:03<00:00, 1812.75 uttr/s, accuracy=0.73, loss=9.75]
Train: 100% 2000/2000 [01:10<00:00, 28.37 step/s, accuracy=0.78, loss=8.57, step=46000] 
Valid: 100% 5664/5667 [00:03<00:00, 1746.94 uttr/s, accuracy=0.73, loss=9.65]
Train: 100% 2000/2000 [01:11<00:00, 28.02 step/s, accuracy=0.81, loss=8.63, step=48000] 
Valid: 100% 5664/5667 [00:03<00:00, 1814.19 uttr/s, accuracy=0.74, loss=9.58]
Train: 100% 2000/2000 [01:10<00:00, 28.29 step/s, accuracy=0.78, loss=9.41, step=5e+4]  
Valid: 100% 5664/5667 [00:03<00:00, 1804.94 uttr/s, accuracy=0.74, loss=9.52]
                                      
Train:   0% 0/2000 [00:00<?, ? step/s]                                                
Train:   0% 6/2000 [00:00<01:16, 26.11 

Step 50000, best model saved. (accuracy=0.7401)


Train: 100% 2000/2000 [01:11<00:00, 27.84 step/s, accuracy=0.84, loss=7.99, step=52000] 
Valid: 100% 5664/5667 [00:03<00:00, 1856.93 uttr/s, accuracy=0.74, loss=9.47]
Train: 100% 2000/2000 [01:09<00:00, 28.78 step/s, accuracy=0.78, loss=8.39, step=54000] 
Valid: 100% 5664/5667 [00:03<00:00, 1835.24 uttr/s, accuracy=0.74, loss=9.40]
Train: 100% 2000/2000 [01:10<00:00, 28.45 step/s, accuracy=0.91, loss=7.11, step=56000] 
Valid: 100% 5664/5667 [00:03<00:00, 1833.95 uttr/s, accuracy=0.75, loss=9.37]
Train: 100% 2000/2000 [01:09<00:00, 28.70 step/s, accuracy=0.75, loss=8.40, step=58000] 
Valid: 100% 5664/5667 [00:03<00:00, 1710.12 uttr/s, accuracy=0.75, loss=9.30]
Train: 100% 2000/2000 [01:08<00:00, 29.06 step/s, accuracy=0.78, loss=9.60, step=6e+4]  
Valid: 100% 5664/5667 [00:03<00:00, 1840.49 uttr/s, accuracy=0.75, loss=9.30]
                                      
Train:   0% 0/2000 [00:00<?, ? step/s]                                                
Train:   0% 5/2000 [00:00<01:17, 25.80 

Step 60000, best model saved. (accuracy=0.7525)


Train: 100% 2000/2000 [01:09<00:00, 28.96 step/s, accuracy=0.84, loss=7.55, step=62000] 
Valid: 100% 5664/5667 [00:03<00:00, 1826.15 uttr/s, accuracy=0.75, loss=9.33]
Train: 100% 2000/2000 [01:09<00:00, 28.58 step/s, accuracy=0.88, loss=7.34, step=64000] 
Valid: 100% 5664/5667 [00:03<00:00, 1802.31 uttr/s, accuracy=0.75, loss=9.28]
Train: 100% 2000/2000 [01:10<00:00, 28.42 step/s, accuracy=0.84, loss=8.34, step=66000] 
Valid: 100% 5664/5667 [00:03<00:00, 1826.79 uttr/s, accuracy=0.75, loss=9.26]
Train: 100% 2000/2000 [01:09<00:00, 28.66 step/s, accuracy=0.88, loss=7.36, step=68000] 
Valid: 100% 5664/5667 [00:03<00:00, 1828.43 uttr/s, accuracy=0.75, loss=9.17]
Train: 100% 2000/2000 [01:10<00:00, 28.54 step/s, accuracy=0.78, loss=8.20, step=7e+4]  
Valid: 100% 5664/5667 [00:03<00:00, 1794.07 uttr/s, accuracy=0.75, loss=9.24]
                                      
Train:   0% 0/2000 [00:00<?, ? step/s]                                                
Train:   0% 0/2000 [00:00<?, ? step/s]5

Step 70000, best model saved. (accuracy=0.7535)


In [49]:
import os
import json
import torch
from pathlib import Path
from torch.utils.data import Dataset


class InferenceDataset(Dataset):
    def __init__(self, data_dir):
        testdata_path = Path(data_dir) / "testdata.json"
        metadata = json.load(testdata_path.open())
        self.data_dir = data_dir
        self.data = metadata["utterances"]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        utterance = self.data[index]
        feat_path = utterance["feature_path"]
        mel = torch.load(os.path.join(self.data_dir, feat_path))

        return feat_path, mel


def inference_collate_batch(batch):
    """Collate a batch of data."""
    feat_paths, mels = zip(*batch)

    return feat_paths, torch.stack(mels)

In [None]:
import json
import csv
from pathlib import Path
from tqdm.notebook import tqdm

import torch
from torch.utils.data import DataLoader

def parse_args():
    """arguments"""
    config = {
        "data_dir": "/kaggle/input/ml2023springhw4/Dataset",
        "model_path": "./model.ckpt",
        "output_path": "./output.csv",
    }

    return config


def main(
    data_dir,
    model_path,
    output_path,
):
    """Main function."""
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"[Info]: Use {device} now!")

    mapping_path = Path(data_dir) / "mapping.json"
    mapping = json.load(mapping_path.open())

    dataset = InferenceDataset(data_dir)
    dataloader = DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        drop_last=False,
        num_workers=8,
        collate_fn=inference_collate_batch,
    )
    print(f"[Info]: Finish loading data!",flush = True)

    speaker_num = len(mapping["id2speaker"])
    model = Classifier(n_spks=speaker_num).to(device)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    print(f"[Info]: Finish creating model!",flush = True)

    results = [["Id", "Category"]]
    for feat_paths, mels in tqdm(dataloader):
        with torch.no_grad():
            mels = mels.to(device)
            outs = model(mels)
            outs = model.pred_layer(outs)  # AMSoftmax
            preds = outs.argmax(1).cpu().numpy()
            for feat_path, pred in zip(feat_paths, preds):
                results.append([feat_path, mapping["id2speaker"][str(pred)]])

    with open(output_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerows(results)


if __name__ == "__main__":
    main(**parse_args())

[Info]: Use cuda now!
[Info]: Finish loading data!
[Info]: Finish creating model!


  0%|          | 0/8000 [00:00<?, ?it/s]

  if __name__ == "__main__":
