In [1]:
import pandas as pd
import numpy as np

import os
import sys
import glob
from tqdm.notebook import tqdm
from typing import Dict, Any
from copy import deepcopy

import librosa
import librosa.display
from IPython.display import Audio
from PIL import Image


import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report,roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split


import torch
from torch import LongTensor, FloatTensor, HalfTensor, Tensor
from torch.utils.data import Dataset, DataLoader
from copy import deepcopy
from torch import nn, optim
import torch.nn.functional as F
import inspect

import timm
from catalyst import dl, utils

import warnings
warnings.filterwarnings("ignore")

In [2]:
filenames = []
for filename in glob.glob('prj/*.txt'): 
    filenames.append(filename.split('/')[-1].split('.')[0])
filename = filenames[0]
#filenames

In [3]:
def read_markup_pr_for_lang(filename):
    df_audio = pd.read_csv(f'prj/{filename}.txt', sep='	', header=None)
    df_audio.columns=['start', 'end', 'person']
    df_audio['len'] = df_audio['end'] - df_audio['start']
    df_audio = df_audio[df_audio.person.isin(['su', 'sr', 'cu', 'cr'])].reset_index(drop=True)
    df_audio['lang']  = df_audio['person'].apply(lambda row: row[1])
    df_audio['person']  = df_audio['person'].apply(lambda row: row[0])
    df_audio['filename'] = filename
    return df_audio[['filename', 'start', 'end', 'len', 'lang']]


All_audios = []
for filename in filenames:
    All_audios.append(read_markup_pr_for_lang(filename))
All_audios = pd.concat(All_audios)
All_audios = All_audios[All_audios['len'] > 1].reset_index(drop=True)
All_audios['target'] = All_audios.lang.replace({'u':0, 'r':1})
All_audios['path'] = All_audios.filename.apply(lambda x: f'prj/{filename}.mp3')

In [4]:
train, test = train_test_split(All_audios, test_size=0.2, stratify=All_audios[['lang']].astype(str).apply('_'.join, axis=1), random_state=42)
train, valid = train_test_split(train, test_size=0.1, stratify=train[['lang']].astype(str).apply('_'.join, axis=1), random_state=42)
train.shape[0], valid.shape[0], test.shape[0]

(132, 15, 37)

In [5]:
train=train.reset_index(drop=True)
valid=valid.reset_index(drop=True)
test=test.reset_index(drop=True)

In [6]:
PERIOD=5

class Lang_Dataset(Dataset):
    def __init__(self, dataset, target=None, idx=None, melspectrogram_parameters={}):
        self.path = np.array(dataset.path)
        if target is not None:
            self.target = FloatTensor(np.array(target))
        else:
            self.target = None
        self.index = idx
        self.melspectrogram_parameters = melspectrogram_parameters
        self.start = dataset.start.values
        self.end = dataset.end.values
        self.sample_rate = 8000
        
    def __len__(self):
        return len(self.path)

    def __getitem__(self, idx):
        
        y, sampling_rate = librosa.load(self.path[idx], sr=self.sample_rate, duration=60)
        
        y = y[int(self.sample_rate*self.start[idx]):int(self.sample_rate*self.end[idx])]
        len_y = len(y)
        effective_length = sampling_rate * PERIOD

        if len_y < effective_length:
            new_y = np.zeros(effective_length, dtype=y.dtype)
            start = np.random.randint(effective_length - len_y)
            new_y[start:start + len_y] = y
            y = new_y.astype(np.float32)
        elif len_y > effective_length:
            start = np.random.randint(len_y - effective_length)
            y = y[start:start + effective_length].astype(np.float32)
        else:
            y = y.astype(np.float32)
        S = np.abs(librosa.stft(y));

        melspec = librosa.power_to_db(S**2, ref=np.max)
        
        sample = {'features':  FloatTensor(np.expand_dims(melspec, 0))}
        if self.target is not None:
            sample['targets'] = self.target[idx]#labels.flatten()
            
        return sample
    
def worker_init_fn(worker_id):
    np.random.seed(np.random.get_state()[1][0] + worker_id)

In [7]:
A_temp = Lang_Dataset(train, train.target)
smp = A_temp[10]
img, labl = smp['features'].numpy(), smp['targets']
librosa.display.specshow(img[0], sr=8000, x_axis='time', y_axis='hz')   

<matplotlib.collections.QuadMesh at 0x7ffa1bd878e0>

In [8]:
img.shape

(1, 1025, 79)

In [8]:
class TimmModel(nn.Module):
    def __init__(self, name_of_model='tf_efficientnet_b0_ns', pretrained=True):
        super().__init__()

        self.img_model = timm.create_model(name_of_model, pretrained=pretrained, in_chans=1)
        self.classifier = nn.Sequential(nn.ELU(), nn.Dropout(0.2), nn.Linear(in_features=1000, out_features=1))

    def forward(self, features):
        
        return self.classifier(
            self.img_model(
                   features
            )
        ).flatten()

In [9]:
!nvidia-smi

Mon Oct  4 20:55:58 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.80       Driver Version: 460.80       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-PCIE...  On   | 00000001:00:00.0 Off |                    0 |
| N/A   25C    P0    41W / 250W |      4MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [11]:
EXP_PATH = 'Lang/New_dataset'


#for i, (fold_train, fold_val) in enumerate(cv_data):
#print('Fold', i)
#if i==0:
FOLD_PATH = f'{EXP_PATH}'

batch_size=20


loaders = {
    "train": DataLoader(Lang_Dataset(train, train['target']),
                        worker_init_fn=worker_init_fn,
                        batch_size=batch_size, shuffle=True, drop_last=True, num_workers=6, pin_memory=True),
    "valid": DataLoader(Lang_Dataset(valid, valid['target']), 
                        worker_init_fn=worker_init_fn,
                        batch_size=batch_size, shuffle=False, drop_last=False, num_workers=1, pin_memory=True),
}

model =  TimmModel()
optimizer = optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.BCEWithLogitsLoss()
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=0, factor=0.3, min_lr=1e-7)

callbacks = [
    dl.AUCCallback(input_key="logits", target_key="targets",),
#     dl.AccuracyCallback(
#             input_key="logits", target_key="targets", num_classes=2
#         ),
    dl.OptimizerCallback('loss'),
    dl.SchedulerCallback(loader_key='valid', metric_key='auc', mode='epoch'),
    dl.CheckpointCallback(loader_key='valid', metric_key='auc', minimize=False, save_n_best=3, mode='model', use_runner_logdir=True, use_logdir_postfix=True, ),
    dl.EarlyStoppingCallback(patience=4, loader_key="valid", metric_key="auc", minimize=False),
    dl.TimerCallback()
]
runner = dl.SupervisedRunner(input_key="features", target_key="targets", output_key="logits", loss_key="loss", )
runner.train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    loaders=loaders,
    num_epochs=20,#99999,#!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    callbacks=callbacks,
    #engine=dl.DeviceEngine("cpu"),
    logdir=f"{FOLD_PATH}",
    valid_loader="valid",
    valid_metric="auc",
    minimize_valid_metric=False,
    verbose=True,
    load_best_on_end=True,
    amp=True,
)

1/20 * Epoch (train):   0%|          | 0/6 [00:00<?, ?it/s]



train (1/20) auc: 0.5811111330986023 | auc/_macro: 0.5811111330986023 | auc/_micro: 0.5811111111111111 | auc/_weighted: 0.14527778327465057 | auc/class_00: 0.5811111330986023 | loss: 0.6173343658447266 | loss/mean: 0.6173343658447266 | loss/std: 0.02981703219295556 | lr: 1e-05 | momentum: 0.9


1/20 * Epoch (valid):   0%|          | 0/1 [00:00<?, ?it/s]



valid (1/20) auc: 0.7222222089767456 | auc/_macro: 0.7222222089767456 | auc/_micro: 0.7222222222222221 | auc/_weighted: 0.14444445073604584 | auc/class_00: 0.7222222089767456 | loss: 0.5531836748123169 | loss/mean: 0.5531836748123169 | loss/std: 0.0 | lr: 1e-05 | momentum: 0.9
* Epoch (1/20) lr: 1e-05 | momentum: 0.9


2/20 * Epoch (train):   0%|          | 0/6 [00:00<?, ?it/s]



train (2/20) auc: 0.4468167722225189 | auc/_macro: 0.4468167722225189 | auc/_micro: 0.44681677018633537 | auc/_weighted: 0.1042572483420372 | auc/class_00: 0.4468167722225189 | loss: 0.6348111629486084 | loss/mean: 0.6348111629486084 | loss/std: 0.06927250656478569 | lr: 1e-05 | momentum: 0.9


2/20 * Epoch (valid):   0%|          | 0/1 [00:00<?, ?it/s]



valid (2/20) auc: 0.8055555820465088 | auc/_macro: 0.8055555820465088 | auc/_micro: 0.8055555555555556 | auc/_weighted: 0.16111111640930176 | auc/class_00: 0.8055555820465088 | loss: 0.7351803183555603 | loss/mean: 0.7351803183555603 | loss/std: 0.0 | lr: 1e-05 | momentum: 0.9
* Epoch (2/20) lr: 1e-05 | momentum: 0.9


3/20 * Epoch (train):   0%|          | 0/6 [00:00<?, ?it/s]



train (3/20) auc: 0.5125448107719421 | auc/_macro: 0.5125448107719421 | auc/_micro: 0.5125448028673835 | auc/_weighted: 0.11532258242368698 | auc/class_00: 0.5125448107719421 | loss: 0.6051254272460938 | loss/mean: 0.6051254272460938 | loss/std: 0.04979358840972906 | lr: 1e-05 | momentum: 0.9


3/20 * Epoch (valid):   0%|          | 0/1 [00:00<?, ?it/s]



valid (3/20) auc: 0.5555555820465088 | auc/_macro: 0.5555555820465088 | auc/_micro: 0.5555555555555556 | auc/_weighted: 0.111111119389534 | auc/class_00: 0.5555555820465088 | loss: 0.7345924973487854 | loss/mean: 0.7345924973487854 | loss/std: 0.0 | lr: 1e-05 | momentum: 0.9
* Epoch (3/20) lr: 3e-06 | momentum: 0.9


4/20 * Epoch (train):   0%|          | 0/6 [00:00<?, ?it/s]



train (4/20) auc: 0.47200000286102295 | auc/_macro: 0.47200000286102295 | auc/_micro: 0.472 | auc/_weighted: 0.09833332896232605 | auc/class_00: 0.47200000286102295 | loss: 0.6022875308990479 | loss/mean: 0.6022875308990479 | loss/std: 0.03806253150755584 | lr: 3e-06 | momentum: 0.9


4/20 * Epoch (valid):   0%|          | 0/1 [00:00<?, ?it/s]



valid (4/20) auc: 0.5 | auc/_macro: 0.5 | auc/_micro: 0.5 | auc/_weighted: 0.10000000149011612 | auc/class_00: 0.5 | loss: 0.6012418270111084 | loss/mean: 0.6012418270111084 | loss/std: 0.0 | lr: 3e-06 | momentum: 0.9
* Epoch (4/20) lr: 9e-07 | momentum: 0.9


5/20 * Epoch (train):   0%|          | 0/6 [00:00<?, ?it/s]



train (5/20) auc: 0.5135869383811951 | auc/_macro: 0.5135869383811951 | auc/_micro: 0.5135869565217391 | auc/_weighted: 0.1198369562625885 | auc/class_00: 0.5135869383811951 | loss: 0.6042546033859253 | loss/mean: 0.6042546033859253 | loss/std: 0.1060158826164727 | lr: 9e-07 | momentum: 0.9


5/20 * Epoch (valid):   0%|          | 0/1 [00:00<?, ?it/s]



valid (5/20) auc: 0.5833333134651184 | auc/_macro: 0.5833333134651184 | auc/_micro: 0.5833333333333333 | auc/_weighted: 0.11666666716337204 | auc/class_00: 0.5833333134651184 | loss: 0.5689816474914551 | loss/mean: 0.5689816474914551 | loss/std: 0.0 | lr: 9e-07 | momentum: 0.9
* Epoch (5/20) lr: 2.6999999999999996e-07 | momentum: 0.9


6/20 * Epoch (train):   0%|          | 0/6 [00:00<?, ?it/s]



train (6/20) auc: 0.4658385217189789 | auc/_macro: 0.4658385217189789 | auc/_micro: 0.4658385093167702 | auc/_weighted: 0.10869565606117249 | auc/class_00: 0.4658385217189789 | loss: 0.6114551424980164 | loss/mean: 0.6114551424980164 | loss/std: 0.04335433632689891 | lr: 2.6999999999999996e-07 | momentum: 0.9


6/20 * Epoch (valid):   0%|          | 0/1 [00:00<?, ?it/s]



valid (6/20) auc: 0.5833333134651184 | auc/_macro: 0.5833333134651184 | auc/_micro: 0.5833333333333333 | auc/_weighted: 0.11666666716337204 | auc/class_00: 0.5833333134651184 | loss: 0.556983232498169 | loss/mean: 0.556983232498169 | loss/std: 0.0 | lr: 2.6999999999999996e-07 | momentum: 0.9
* Epoch (6/20) lr: 1e-07 | momentum: 0.9
Top best models:
Lang/New_dataset/checkpoints/train.2.pth	0.8056
Lang/New_dataset/checkpoints/train.1.pth	0.7222
Lang/New_dataset/checkpoints/train.5.pth	0.5833


In [12]:
def predict_loader(model, loader):
    pred = runner.predict_loader(model=model, loader=loader)
    #pred = [torch.sigmoid(batch['logits'].detach().cpu().float()).numpy() for batch in tqdm(pred)] 
    pred = [batch['logits'].detach().cpu().float().numpy() for batch in tqdm(pred)] 
    pred = pd.Series(np.concatenate(pred, 0), index = loader.dataset.index)
    return pred
    
def get_valid_logs(exp_path='logs'):
    valid_log = pd.read_csv(f'{exp_path}/logs/valid.csv')#.set_index('step')
    if ('step' == valid_log['step']).any():
        valid_log = valid_log.iloc[valid_log[valid_log['step'] == 'step'].index.max()+1:].reset_index(drop=True)
    return valid_log

In [13]:
test_dataloader = DataLoader(Lang_Dataset(test, test['target']), 
                    #worker_init_fn=worker_init_fn,
                    batch_size=batch_size, shuffle=False, drop_last=False, num_workers=2, pin_memory=False)
model =  TimmModel()
model.load_state_dict(torch.load(f'{EXP_PATH}/checkpoints/best.pth')['model_state_dict'])
model = model.eval().cuda()
score = get_valid_logs(EXP_PATH)['auc'].max()
#pred = predict_loader(model, test_dataloader)
#write_prediction(pred, f'best_{score}', EXP_PATH)

pred = runner.predict_loader(model=model, loader=test_dataloader)
#pred = [torch.sigmoid(batch['logits'].detach().cpu().float()).numpy() for batch in tqdm(pred)] 
pred = [batch['logits'].detach().cpu().float().numpy() for batch in tqdm(pred)] 


pred = pd.DataFrame(np.concatenate(pred, 0), index = test_dataloader.dataset.index)

test['pred'] = pred.apply(lambda row: row.argmax(), axis=1).values
from sklearn.metrics import accuracy_score
print(accuracy_score(test.target, test.pred))

0it [00:00, ?it/s]



0.7837837837837838
