In [1]:
import os
import sys
import matplotlib.pyplot as plt
import IPython.display as ipd
import pandas as pd
import re
import subprocess
import numpy as np
import math

# Reload imported modules automatically before each cell runs, so edits to the
# project sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Extend the import search path with the project-local source trees.
# NOTE(review): presumably these provide the `pytorch.models` and
# `utils.notebooks_utils` packages imported in the next cell — confirm.
sys.path.append('../audioset_tagging_cnn/')
sys.path.append('../src')

In [2]:
import logging

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data

from pytorch.models import *
from utils.notebooks_utils import *

In [4]:
# Root directory that the data and label locations below are resolved against.
data_root = '/media/maxim/SStorage/FG_2020/'

# Sub-directories, relative to `data_root`.
reduced_sample_rate = 'data/Reduced_sample_rate'
separated_audio = 'data/Separated_audio'
labels = 'labels/downgraded'
features = 'features'

# Absolute locations derived from the relative names above.
reduced_sample_rate_path, separated_audio_path, labels_path = (
    os.path.join(data_root, sub_dir)
    for sub_dir in (reduced_sample_rate, separated_audio, labels)
)

# Where training logs and TensorBoard logs are written.
log_root = '/media/maxim/SStorage/FG_2020/logs/'
tb_log_root = '/media/maxim/SStorage/FG_2020/logs/tb/'

# Tag describing the feature layout; passed to `train_model` as `features_name`.
features_type = 'mel_64x32'

# Mini-batch size shared by every DataLoader in this notebook.
batch_size = 128

# Class names; their count defines the size of the classifier output.
class_names = ['Neutral', 'Anger', 'Disgust', 'Fear', 'Happiness', 'Sadness', 'Surprise']

In [5]:
def apply_window(features, labels, window_width):
    """Cut a feature matrix into one context window per label.

    `features` is transposed first, so its last axis is the one that gets
    sliced (the caller passes a log-mel spectrogram of shape (bins, frames)).
    Each label owns `ratio = frames // len(labels)` consecutive frames, and the
    window for label `idx` spans the frames of labels
    `idx - x_center .. idx + x_center`, where `x_center = (window_width - 1) // 2`.

    Windows that would run past the start or the end of the recording are
    completed by mirroring the available neighbouring context along the time
    axis only (the bin axis keeps its order).

    Args:
        features: array whose last axis is time, e.g. (bins, frames).
        labels: sequence with one label per window.
        window_width: window size measured in labels.

    Returns:
        Tuple `(x, y)` of arrays: `x[k]` is the window for `labels[k]`,
        transposed back to the input layout, and `y` holds the labels.
        Both arrays are empty when `labels` is empty.
    """
    features = np.transpose(features)  # time-major, so plain slicing selects frames

    if len(labels) == 0:
        # Nothing to window; also avoids the division by zero below.
        return (np.asarray([]), np.asarray([]))

    ratio = int(len(features) / len(labels))
    x_center = int((window_width - 1) / 2)
    res_x = []
    res_y = []

    for idx, lab in enumerate(labels):
        center = features[idx * ratio: (idx + 1) * ratio]

        if idx < x_center:
            # Too close to the start: mirror the right-hand context to the left.
            right = features[(idx + 1) * ratio: (idx + x_center + 1) * ratio]
            # axis=0 reverses time only; a bare np.flip would also reverse the bins.
            x_arr = np.concatenate((np.flip(right, axis=0), center, right), axis=0)
        elif len(features) < (idx + x_center + 1) * ratio:
            # Too close to the end: mirror the left-hand context to the right.
            left = features[(idx - x_center) * ratio: idx * ratio]
            x_arr = np.concatenate((left, center, np.flip(left, axis=0)), axis=0)
        else:
            x_arr = features[(idx - x_center) * ratio: (idx + x_center + 1) * ratio]

        res_x.append(np.transpose(x_arr))
        res_y.append(lab)

    return (np.asarray(res_x), np.asarray(res_y))

def expand_array(x, max_len):
    """Zero-pad a 2-D array on the right of its second axis up to `max_len` columns.

    The first axis is left untouched. `max_len` must not be smaller than
    `x.shape[1]`, otherwise `np.pad` rejects the negative pad width.
    """
    missing_cols = max_len - x.shape[1]
    pad_spec = ((0, 0), (0, missing_cols))
    return np.pad(x, pad_spec, mode='constant')

def expand_tensor(x, max_len):
    """Return a list where every 2-D item of `x` has at least `max_len` columns.

    Items narrower than `max_len` are zero-padded with `expand_array`; items
    that are already wide enough are passed through unchanged (same object).
    """
    padded = []
    for item in x:
        if item.shape[1] < max_len:
            padded.append(expand_array(item, max_len))
        else:
            padded.append(item)
    return padded

In [6]:
import librosa

window_width = 5  # context window size, in labels (see apply_window)
sr = 16000  # sample rate the audio is loaded at

n_fft = int(sr * 0.032)  # FFT size: 32 ms worth of samples
hop_length = int(sr * 0.010)  # hop: 10 ms worth of samples

# The test split has no label files with values, so one placeholder label (-1)
# is generated for every `test_frames_per_label` spectrogram frames.
test_frames_per_label = 20

# Pooled windows per split. NOTE: the lines that fill this dict are disabled
# further down, so it stays empty after this cell has run.
all_data = {
    'train': {
        'x': [],
        'y': [],
    },
    'valid': {
        'x': [],
        'y': [],
    }
}

# Per-file windows: files_data[split][path] = {'x': windows, 'y': labels}.
files_data = {
    'train': {
    },
    'valid': {
    },
    'test': {
    }
}

for ds in ['train', 'valid', 'test']:
    # `tqdm` is not imported in this notebook directly; presumably it arrives
    # through one of the star imports above — confirm.
    for i in tqdm(os.listdir(os.path.join(labels_path, ds)), desc='Extract features on {} set'.format(ds)):
        # Map the label file name to its separated-vocals recording. The
        # substitutions are applied to the file name only, so a directory
        # name can never be rewritten by accident.
        audio_name = i.replace('.txt', '_vocals.wav').replace('_left', '').replace('_right', '')
        fp = os.path.join(separated_audio_path, audio_name)
        if not os.path.exists(fp):
            continue

        # Extract features: 64-band log-mel spectrogram, shape (64, frames).
        # `sr` must be passed by keyword: it is keyword-only in current librosa.
        wave, sr = librosa.load(fp, sr=sr)
        s = librosa.feature.melspectrogram(y=wave, sr=sr, n_fft=n_fft, hop_length=hop_length, n_mels=64)
        log_mels = librosa.power_to_db(s, ref=np.max)

        if ds == 'test':
            meta = np.full(int(log_mels.shape[1] / test_frames_per_label), -1)
        else:
            # atleast_1d keeps a single-row label file from collapsing to a 0-d array.
            meta = np.atleast_1d(pd.read_csv(os.path.join(labels_path, ds, i)).values.squeeze())

        if len(meta) == 0:
            # Recording too short (or label file empty): nothing to window.
            continue

        x, y = apply_window(log_mels, meta, window_width)

        # Disabled by the original author: dropping unlabeled (-1) windows and
        # pooling everything into `all_data`. The cells below that read
        # `all_data` / `max_len` need these (and the `max_len` line at the end
        # of this cell) re-enabled before they can run.
#         x = x[y != -1]
#         y = y[y != -1]

#         all_data[ds]['x'].extend(x)
#         all_data[ds]['y'].extend(y)
        if len(y) > 0:
            file_dict = {
                'x': x,
                'y': y
            }

            # Keyed by the label file name joined onto the audio directory.
            files_data[ds][os.path.join(separated_audio_path, i)] = file_dict

# max_len = max([i.shape[1] for i in all_data[ds]['x'] for ds in ['train', 'valid', 'test']])

Extract features on train set: 100%|██████████| 253/253 [00:20<00:00, 12.19it/s]
Extract features on valid set: 100%|██████████| 70/70 [00:08<00:00,  8.51it/s]
Extract features on test set: 100%|██████████| 223/223 [00:19<00:00, 11.50it/s]


In [7]:
from torchvision import transforms, models
from torch.utils.data import Dataset, WeightedRandomSampler
import torch.nn.functional as F

class CustomTensorDataset(Dataset):
    """Dataset over a pair of aligned tensors with an optional input transform.

    `tensors[0]` holds the inputs and `tensors[1]` the targets; all tensors
    must have the same size along dimension 0.
    """

    def __init__(self, tensors, transform=None):
        """Store the tensors after checking that their first dimensions agree."""
        for other in tensors:
            assert tensors[0].size(0) == other.size(0)
        self.transform = transform
        self.tensors = tensors

    def __len__(self):
        """Number of samples, i.e. the size of the first tensor along dim 0."""
        inputs = self.tensors[0]
        return inputs.size(0)

    def __getitem__(self, index):
        """Return `(input, target)` for `index`; only the input is transformed."""
        sample = self.tensors[0][index]
        if self.transform:
            sample = self.transform(sample)
        return sample, self.tensors[1][index]

In [None]:
define_seed(12)

# NOTE(review): this cell reads `all_data` and `max_len`. In the feature
# extraction cell the lines that fill `all_data` and compute `max_len` are
# commented out, so they must be restored before this cell can run.

# Stack the (padded) windows into one array first: building a tensor from a
# list of separate arrays is very slow.
x_train = torch.Tensor(np.asarray(expand_tensor(all_data['train']['x'], max_len)))
x_valid = torch.Tensor(np.asarray(expand_tensor(all_data['valid']['x'], max_len)))

y_train = torch.LongTensor(all_data['train']['y'])
y_valid = torch.LongTensor(all_data['valid']['y'])

# Inverse-frequency sample weights for class-balanced sampling.
# `class_index[k]` is the position of sample k's label inside `class_ids`, so
# the lookup stays correct even when some class id never occurs in the
# training labels (indexing `weight` by the raw label would be off, or fail).
class_ids, class_index, classes_count = np.unique(y_train.numpy(), return_inverse=True,
                                                  return_counts=True)
weight = 1. / classes_count
samples_weight = torch.from_numpy(weight[class_index]).double()

# Draws len(samples_weight) samples per epoch according to the weights.
sampler = WeightedRandomSampler(samples_weight, len(samples_weight))

train_dataset = CustomTensorDataset(tensors=(x_train, y_train), transform=None)
valid_dataset = CustomTensorDataset(tensors=(x_valid, y_valid), transform=None)

# The sampler defines the training order, so `shuffle` is not set here.
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size,
                                               num_workers=6, sampler=sampler)

valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size,
                                               shuffle=False, num_workers=6)

In [8]:
class Transfer_Cnn14(nn.Module):
    """Classifier for a new task that uses a Cnn14 network as a feature extractor.

    The base network's spectrogram extractor, log-mel extractor and spec
    augmenter are replaced with identity layers, so the model consumes
    precomputed spectrogram features instead of raw audio.
    """

    def __init__(self, pretrained_path, classes_num, pretrain, freeze_base):
        """Build the base network and a fresh linear classification head.

        Args:
            pretrained_path: checkpoint file for the base network; only read
                when `pretrain` is true.
            classes_num: number of output classes of the new head.
            pretrain: if true, load the base weights from `pretrained_path`.
            freeze_base: if true, exclude every base parameter from training.
        """
        super(Transfer_Cnn14, self).__init__()

        self.load_base(pretrain, pretrained_path)

        # Features are computed outside the model, so the audio front end of
        # the base network is bypassed.
        self.base.spectrogram_extractor = nn.Identity()
        self.base.logmel_extractor = nn.Identity()
        self.base.spec_augmenter = nn.Identity()

        # Transfer to another task layer; expects a 2048-dimensional embedding.
        self.classifier = nn.Linear(2048, classes_num, bias=True)

        if freeze_base:
            for param in self.base.parameters():
                param.requires_grad = False

        self.init_weights()

    def load_base(self, pretrain, pretrained_path):
        """Create `self.base` and optionally load its pretrained weights.

        The checkpoint is expected to be a dict whose 'model' entry holds the
        state dict of the base network.
        """
        self.base = Cnn14(sample_rate=16000, window_size=1024, hop_size=320, mel_bins=64,
                          fmin=50, fmax=14000, classes_num=527)

        if pretrain:
            logging.info('Load pretrained model from {}'.format(pretrained_path))
            # map_location='cpu' lets a checkpoint saved on a GPU be loaded on
            # machines without CUDA; callers move the model to its device later.
            checkpoint = torch.load(pretrained_path, map_location='cpu')
            self.base.load_state_dict(checkpoint['model'])

    def init_weights(self):
        """Initialise the classification head with the project's `init_layer`."""
        init_layer(self.classifier)

    def forward(self, x, mixup_lambda=None):
        """Return the class logits for a batch of feature maps.

        Input: (batch_size, mel_bins, time_steps). A channel axis is inserted
        and the last two axes are swapped, so the base network receives
        (batch_size, 1, time_steps, mel_bins).

        Output: (batch_size, classes_num) unnormalised scores.
        """
        x = x.unsqueeze(1).transpose(2, 3)
        output_dict = self.base(x, mixup_lambda)
        embedding = output_dict['embedding']

        return self.classifier(embedding)

In [None]:
%%capture output
from sklearn.metrics import recall_score, f1_score, accuracy_score

def custom_score(targets, predicts, average='macro'):
    """Weighted blend of macro F1 (0.67) and accuracy (0.33).

    NOTE: `average` is accepted but not used; the F1 term is always computed
    with macro averaging.
    """
    macro_f1 = f1_score(targets, predicts, average='macro')
    accuracy = accuracy_score(targets, predicts)
    return 0.67 * macro_f1 + 0.33 * accuracy

# Fix the random seed before the model is built, so that the freshly
# initialised classifier head is reproducible.
define_seed(12)
pretrained_path = '../models/pretrained/Cnn14_mAP=0.431.pth'
# Fine-tune the whole network: pretrained base weights, nothing frozen.
model = Transfer_Cnn14(pretrained_path=pretrained_path, classes_num=len(class_names), 
                       pretrain=True, freeze_base=False)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

loss = torch.nn.CrossEntropyLoss()
    
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.999), 
                             eps=1e-08, weight_decay=0., amsgrad=True)

# `train_model` is not defined in this notebook (presumably it comes from
# `utils.notebooks_utils`). The meaning of its fourth positional argument,
# passed as None here, is not visible from this file — TODO confirm.
# NOTE(review): the later cells load the model under the name
# 'mel_64x32_fg2020-NewWeighted-PANN-CNN14-50', which looks like
# features_name + '_' + experiment_name — confirm against train_model.
model, max_epoch, max_performance = train_model(model, loss, optimizer, None, num_epochs=50, 
                                                device=device, train_dataloader=train_dataloader, 
                                                valid_dataloader=valid_dataloader,
                                                class_names=class_names,
                                                log_root=log_root,
                                                tb_log_root=tb_log_root,
                                                features_name=features_type,
                                                experiment_name='fg2020-NewWeighted-PANN-CNN14-50',
                                                metrics=[custom_score, f1_score, accuracy_score],
                                                log_iter=[])
    
# Report the epoch and the performance values returned by train_model.
print('Epoch: {0}\n'.format(max_epoch))
print(max_performance)

In [None]:
# Show what the training cell produced; its output was redirected into
# `output` by the `%%capture output` magic.
print(output)

In [9]:
from sklearn.metrics import recall_score, f1_score, accuracy_score

def custom_score(targets, predicts, average='macro'):
    """Weighted blend of macro F1 (0.67) and accuracy (0.33).

    NOTE: `average` is accepted but not used; the F1 term is always computed
    with macro averaging.
    """
    macro_f1 = f1_score(targets, predicts, average='macro')
    accuracy = accuracy_score(targets, predicts)
    return 0.67 * macro_f1 + 0.33 * accuracy

def predict_proba(x, y, model, model_name, model_epoch, log_root, batch_size):
    """Load a saved checkpoint into `model` and return class probabilities.

    Args:
        x: input tensor; the first dimension indexes the samples.
        y: label tensor aligned with `x`; returned unchanged (as an array) so
            callers can pair predictions with labels.
        model: network to evaluate. Its weights are overwritten in place with
            the checkpoint, and it is left in eval mode on the selected device.
        model_name: name of the experiment directory below `log_root`.
        model_epoch: epoch whose checkpoint is looked up with
            `get_model_by_epoch`.
        log_root: directory that contains the experiment directories.
        batch_size: inference batch size.

    Returns:
        Tuple `(probabilities, labels)` of NumPy arrays: softmax over dim 1 of
        the model output for every sample, and the labels in the same order.

    Note:
        The checkpoint is read from disk on every call.
    """
    dataset = CustomTensorDataset(tensors=(x, y), transform=None)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=6)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    dictionary_path = get_model_by_epoch(os.path.join(log_root, '{0}'.format(model_name)), model_epoch)
    # map_location keeps a checkpoint saved on a GPU loadable on CPU-only hosts.
    checkpoint = torch.load(dictionary_path, map_location=device)

    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()

    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for inputs, labels in dataloader:
            logits = model(inputs.to(device))
            all_predictions.append(torch.nn.functional.softmax(logits, dim=1).cpu().numpy())
            # Labels are only passed through, so they never need to visit the GPU.
            all_labels.append(labels.cpu().numpy())

    return np.concatenate(all_predictions), np.concatenate(all_labels)

In [None]:
# Evaluate the saved epoch-41 checkpoint on the pooled train and valid windows.
pretrained_path = '../models/pretrained/Cnn14_mAP=0.431.pth'
model_name = 'mel_64x32_fg2020-NewWeighted-PANN-CNN14-50'

model = Transfer_Cnn14(pretrained_path=pretrained_path,
                       classes_num=len(class_names),
                       pretrain=True,
                       freeze_base=False)

x_train = torch.Tensor(expand_tensor(all_data['train']['x'], max_len))
y_train = torch.LongTensor(all_data['train']['y'])

x_valid = torch.Tensor(expand_tensor(all_data['valid']['x'], max_len))
y_valid = torch.LongTensor(all_data['valid']['y'])

# Same report for both splits: train first, then valid.
for split_x, split_y in ((x_train, y_train), (x_valid, y_valid)):
    probas, labels = predict_proba(split_x, split_y, model, model_name, 41, log_root, batch_size)
    preds = probas.argmax(axis=1)
    print('Metrics: F1: {0}, Acc: {1}, Custom: {2}'.format(f1_score(labels, preds, average='macro'),
                                                           accuracy_score(labels, preds),
                                                           custom_score(labels, preds, 'macro')))

In [10]:
pretrained_path = '../models/pretrained/Cnn14_mAP=0.431.pth'
# pretrain=False: `predict_proba` overwrites every weight with the fine-tuned
# checkpoint (strict `load_state_dict`), so loading the pretrained base
# checkpoint here would be wasted work.
model = Transfer_Cnn14(pretrained_path=pretrained_path,
                       classes_num=len(class_names),
                       pretrain=False,
                       freeze_base=False)

model_name = 'mel_64x32_fg2020-NewWeighted-PANN-CNN14-50'

# Write one CSV per recording: the class probabilities followed by the label
# in the last column (-1 for the test split, which has no labels).
for ds in ['train', 'valid', 'test']:
    # Output goes to <model_name>/<split>/, relative to the working directory.
    out_dir = os.path.join(model_name, ds)
    os.makedirs(out_dir, exist_ok=True)

    for f in tqdm(files_data[ds]):
        # Per-file tensors get their own names so that the notebook-level
        # x_train / y_train training tensors are not overwritten.
        file_x = torch.Tensor(files_data[ds][f]['x'])
        file_y = torch.LongTensor(files_data[ds][f]['y'])

        probas, labels = predict_proba(file_x, file_y, model, model_name, 41, log_root, batch_size)

        res = np.concatenate((probas, np.expand_dims(labels, axis=1)), axis=1)

        fn = os.path.splitext(os.path.basename(f))[0]
        np.savetxt(os.path.join(out_dir, "{0}.csv".format(fn)), res, delimiter=",")

100%|██████████| 253/253 [05:28<00:00,  1.30s/it]
100%|██████████| 70/70 [01:37<00:00,  1.39s/it]
100%|██████████| 223/223 [04:44<00:00,  1.28s/it]
