# tech part

In [1]:
!nvidia-smi

Sun Nov 21 19:22:28 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.91.03    Driver Version: 460.91.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  TITAN RTX           Off  | 00000000:18:00.0 Off |                  N/A |
| 41%   29C    P8    17W / 280W |      8MiB / 24220MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  TITAN V             Off  | 00000000:3B:00.0 Off |                  N/A |
| 33%   48C    P2   166W / 250W |  12054MiB / 12066MiB |     79%      Default |
|       

In [6]:
import comet_ml
import warnings
warnings.filterwarnings("ignore")

import os
import sklearn
import librosa
import librosa.display
import random

import numpy as np
import IPython.display as ipd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve, confusion_matrix

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.loggers import CometLogger
from pytorch_lightning import Trainer, seed_everything
from transformers import AdamW

from typing import List
from tqdm.auto import tqdm

from src.utils import *
from src.DataModule import AudioPreProcessing
from src.DataModule import AudioDataset
from src.model import AudioClassifierCNN
from src.trainer import ModelTrainer
from src.loss import FocalLoss

seed_everything(294)

Global seed set to 294


294

In [133]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    from torch.cuda import LongTensor
else:
    from torch import LongTensor
print(device)

cuda


# Data

Получим path для train и test. Из-за малого кол-ва данных test буду использовать, как валидационную выборку.

В переменной files хранятся все пути с классами для обучения.

In [9]:
oksana_files = get_files('./vox-test-audio/data/oksana/')
omazh_files = get_files('./vox-test-audio/data/omazh/')

test_files = get_test_files('./vox-test-audio/data/test/')

files = oksana_files + omazh_files

## Pre-processing

Реализовал класс для предобработки аудиофайлов. Чтение, аугментация (если требуется) и генерация MFCC. Все подробности в репозитории в папке src. 

In [108]:
aud_processing = AudioPreProcessing(sr=16000)

train_audio = aud_processing.pipeline(files, True, 10)
test_audio = aud_processing.pipeline(test_files, False)

У нас звуки разный продолжительностью, а значит и размер MFCC тоже будет разный, будем паддить.

In [110]:
def pad_to_size(sig, size, mode='const'):
    if sig.shape[1] < size:
        padding = size - sig.shape[1]
        offset = padding // 2
        pad_width = ((0, 0), (offset, padding - offset))
        if mode == 'const':
            sig = np.pad(sig, pad_width, 'constant', constant_values=sig.min())
        elif mode == 'wrap':
            sig = np.pad(sig, pad_width, 'wrap')
    return sig  

def collate_fn(batch):
    lens = [sample[2] for sample in batch]
    maxlen = np.array(lens).max()
    
    mfcc = [pad_to_size(sample[0], maxlen) for sample in batch]
    label = [sample[1] for sample in batch]
            
    result = {
            'inputs': torch.tensor(mfcc).to(dtype=torch.float),
            'label': torch.tensor(label).to(dtype=torch.float),
        }
    return result

In [111]:
train = AudioDataset(train_audio)
test = AudioDataset(test_audio)

train_iter = DataLoader(train, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_iter = DataLoader(test, batch_size=5, shuffle=True, collate_fn=collate_fn)

# Model

In [123]:
CLIP = 1.0


Model = AudioClassifierCNN().to(device)
#criterion = nn.BCEWithLogitsLoss().to(device)
criterion = FocalLoss().to(device)
model_trainer = ModelTrainer(Model, criterion, 1e-6).to(device)
trainer = pl.Trainer(max_epochs=20,
                     gpus='2',
                     gradient_clip_val=CLIP,
                     progress_bar_refresh_rate=1,
                     checkpoint_callback=False,)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores


In [124]:
trainer.fit(model_trainer, train_iter, test_iter)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]

  | Name      | Type               | Params
-------------------------------------------------
0 | model     | AudioClassifierCNN | 109 K 
1 | criterion | FocalLoss          | 0     
-------------------------------------------------
109 K     Trainable params
0         Non-trainable params
109 K     Total params
0.437     Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

Global seed set to 294


Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

# test

Попробуем подобрать thresholds для улучшения классификации. 

In [125]:
trainer.test(model_trainer, test_iter)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_f1': 0.7777777777777777,
 'test_loss': 0.13521303236484528,
 'test_precision': 0.875,
 'test_recall': 0.7}
--------------------------------------------------------------------------------


[{'test_loss': 0.13521303236484528,
  'test_f1': 0.7777777777777777,
  'test_precision': 0.875,
  'test_recall': 0.7}]

In [126]:
flatten = model_trainer.flatten

test_prob = [sigmoid(float(i.cpu().numpy())) for i in flatten(model_trainer.test_res['prob'])]
test_label = flatten(model_trainer.test_res['label'])
test_pred = flatten(model_trainer.test_res['pred'])

In [127]:
fpr, tpr, thresholds = roc_curve(test_label, test_prob)
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]
optimal_threshold

0.50046716803129

In [128]:
pred_label = []
for i in test_prob:
    if i > optimal_threshold:
        pred_label.append(1)
    else:
        pred_label.append(0)

f1_score(pred_label, test_label)

0.7058823529411764

В данном случае результат немного ухудшился. 

Вот результаты предсказания и исходные классы:

In [129]:
pred_label

[1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0]

In [130]:
test_label

[1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0]

# Выводы



    1. Обучение вообще не стабильное. Различные трюки с архитектурой, например добавление GRU, не дало улучшений
    2. Требуется больше данных
    3. Аугментация немного помогает