In [28]:
import os
import sys
import matplotlib.pyplot as plt
import IPython.display as ipd
import pandas as pd
import re
import subprocess
import numpy as np
import math

# Auto-reload imported modules before each cell executes, so edits to the
# project sources are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Extend the import path (relative to the notebook's working directory);
# presumably this is what lets the later `pytorch.models` and
# `utils.notebooks_utils` imports resolve - confirm against the repo layout.
sys.path.append('../audioset_tagging_cnn/')
sys.path.append('../src')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
import logging

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data

from pytorch.models import *
from utils.notebooks_utils import *

In [30]:
# --- Dataset layout: every sub-directory below is relative to data_root ---
data_root = '../../FG_2020/'
reduced_sample_rate = 'data/Reduced_sample_rate'
separated_audio = 'data/Separated_audio'
# Label variants: 'labels/downgraded' (active) or 'labels/dropped14_interpolated10'.
labels = 'labels/downgraded'
features = 'features'

reduced_sample_rate_path = os.path.join(data_root, reduced_sample_rate)
separated_audio_path = os.path.join(data_root, separated_audio)
labels_path = os.path.join(data_root, labels)

# --- Logging destinations ---
log_root = '../logs/'
tb_log_root = '../logs/tb/'

# --- Experiment settings ---
features_type = 'wave'
batch_size = 128
class_names = ['Neutral', 'Anger', 'Disgust', 'Fear', 'Happiness', 'Sadness', 'Surprise']

# True: collect pooled train/valid samples for training.
# False: keep per-file features for exporting predictions.
is_train_mode = True


def is_test(ds):
    """Return True for the 'test' split, which is skipped due to the memory limit."""
    return ds == 'test'

In [31]:
# apply window to wave
def apply_window(features, labels, window_width, sr):
    """Cut one window of `features` around every entry of `labels`.

    The signal is split into `len(labels)` consecutive segments of
    `ratio = len(features) // len(labels)` samples (computed before any
    padding). The window for label `idx` spans `x_center` segments on each
    side of segment `idx`, where `x_center = int((window_width - 1) / 2)`.

    * Near the start (`idx < x_center`) the left context is replaced by a
      mirrored copy of the right context.
    * When the right context would run past the end of the signal, it is
      replaced by a mirrored copy of the left context.

    Args:
        features: signal to window; the notebook passes a 1-D waveform.
        labels: per-segment labels. A scalar / 0-d array (e.g. the result of
            `.squeeze()` on a single-row label file) is treated as one label.
        window_width: window size expressed in labels.
        sr: sample rate; only used for the minimum-length zero padding.

    Returns:
        Tuple `(x, y)` of numpy arrays: one window per label and the labels
        themselves. Both are empty when `labels` is empty.
    """
    features = np.transpose(features)
    # Accept scalar / 0-d labels, which have no len().
    labels = np.atleast_1d(labels)

    # No labels means no windows; also avoids dividing by zero below.
    if len(labels) == 0:
        return np.asarray([]), np.asarray([])

    ratio = len(features) // len(labels)
    x_center = int((window_width - 1) / 2)

    # Zero-pad short signals at the end up to window_width * sr samples.
    # NOTE(review): window_width is counted in labels by the caller, so this
    # minimum looks larger than a single window - confirm it is intended.
    min_len = window_width * sr
    if len(features) < min_len:
        features = np.pad(features, [(0, min_len - len(features))], mode='constant')

    res_x = []
    res_y = []
    for idx, lab in enumerate(labels):
        center = features[idx * ratio: (idx + 1) * ratio]
        if idx < x_center:
            # Not enough left context: mirror the right context instead.
            right = features[(idx + 1) * ratio: (idx + x_center + 1) * ratio]
            x_arr = np.concatenate((np.flip(right), center, right), axis=0)
        elif len(features) < (idx + x_center + 1) * ratio:
            # Not enough right context: mirror the left context instead.
            left = features[(idx - x_center) * ratio: idx * ratio]
            x_arr = np.concatenate((left, center, np.flip(left)), axis=0)
        else:
            x_arr = features[(idx - x_center) * ratio: (idx + x_center + 1) * ratio]
        # A mirrored window never exceeds (2 * x_center + 1) * ratio samples,
        # so no trimming is required.

        res_x.append(np.transpose(x_arr))
        res_y.append(lab)

    return np.asarray(res_x), np.asarray(res_y)

In [32]:
import librosa

sr = 16000

window_width = 3 * 5 # labels (input size): 3 seconds * 5 labels = 15 labels

# Pooled samples per split, filled in train mode only.
all_data = {
    'train': {
        'x': [],
        'y': [],
    },
    'valid': {
        'x': [],
        'y': [],
    }
}

# Per-file features and labels keyed by file path, filled when not in train mode.
files_data = {
    'train': {
    },
    'valid': {
    },
    'test': {
    }
}

for ds in ['train', 'valid', 'test']:
    # Splits selected by is_test are skipped entirely.
    if is_test(ds):
        continue

    for i in tqdm(os.listdir(os.path.join(labels_path, ds)), desc='Extract features on {} set'.format(ds)):
        # Map the label file name to its separated-vocals recording.
        fp = os.path.join(separated_audio_path, i).replace('.txt', '_vocals.wav').replace('_left', '').replace('_right', '')
        if not os.path.exists(fp):
            continue

        # Extract wave. `sr` is passed by keyword: recent librosa releases
        # reject it as a positional argument.
        wave, sr = librosa.load(fp, sr=sr)

        if ds == 'test':
            # No label file is read for the test split: placeholder -1 labels.
            # NOTE(review): 3242 samples per label is a magic constant whose
            # origin is not shown here - confirm.
            meta = np.full(int(len(wave) / 3242), -1)
        else:
            meta = pd.read_csv(os.path.join(labels_path, ds, i)).values.squeeze()

        x, y = apply_window(wave, meta, window_width, sr)

        if is_train_mode:
            # Drop windows whose label is -1 before pooling them.
            x = x[y != -1]
            y = y[y != -1]

            all_data[ds]['x'].extend(x)
            all_data[ds]['y'].extend(y)
        else:
            if len(y) > 0:
                file_dict = {
                    'x': x,
                    'y': y
                }

                files_data[ds][os.path.join(separated_audio_path, i)] = file_dict

if is_train_mode:
    # Longest window across both splits; the dataset pads every sample to it.
    max_len = max([len(i) for ds in ['train', 'valid'] for i in all_data[ds]['x']])

Extract features on train set: 100%|██████████| 253/253 [00:14<00:00, 17.99it/s]
Extract features on valid set: 100%|██████████| 70/70 [00:17<00:00,  4.04it/s]


In [33]:
from torchvision import transforms, models
from torch.utils.data import Dataset, WeightedRandomSampler
import torch.nn.functional as F

class CustomTensorDataset(Dataset):
    """Dataset over (sample, label) pairs with optional padding and transform.

    Samples are stored as given in `x_np` and converted to `torch.Tensor`
    on access. When `max_len` is non-zero every sample is zero-padded at the
    end to `max_len`; `max_len == 0` disables padding.
    """

    def __init__(self, x_np, y_tensor, max_len, transform=None):
        self.transform = transform
        self.max_len = max_len
        self.y = y_tensor
        self.x_np = x_np

    def expand_array(self, x):
        """Zero-pad `x` at the end so that its length becomes `max_len`."""
        missing = self.max_len - len(x)
        return np.pad(x, [(0, missing)], mode='constant')

    def __getitem__(self, index):
        """Return the (possibly padded and transformed) sample and its label."""
        sample = self.x_np[index]
        if self.max_len != 0:
            sample = self.expand_array(sample)

        x = torch.Tensor(sample)
        if self.transform:
            x = self.transform(x)

        return x, self.y[index]

    def __len__(self):
        """Number of items, taken from the first dimension of the label tensor."""
        return self.y.size(0)

In [None]:
# run it in train mode only
if is_train_mode:
    define_seed(12)

    # Label tensors for both splits.
    y_train = torch.LongTensor(all_data['train']['y'])
    y_valid = torch.LongTensor(all_data['valid']['y'])

    # Per-class weights: largest class count divided by each class count,
    # so the most frequent class gets weight 1.
    observed_classes, class_sample_count = np.unique(y_train, return_counts=True)
    class_weights = torch.Tensor(class_sample_count.max() / class_sample_count)

    # Both datasets pad their samples to the shared max_len.
    train_dataset = CustomTensorDataset(all_data['train']['x'], y_train, max_len, transform=None)
    valid_dataset = CustomTensorDataset(all_data['valid']['x'], y_valid, max_len, transform=None)

    # Identical loader settings except that only the training data is shuffled.
    loader_kwargs = dict(batch_size=batch_size, num_workers=6)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset, shuffle=False, **loader_kwargs)

In [34]:
class Transfer_Cnn14(nn.Module):
    """Classifier for a new task using a (optionally pretrained) Cnn14 as a sub module.

    The Cnn14 base produces an `'embedding'` entry that is fed to a single
    linear layer with `classes_num` outputs.
    """

    def __init__(self, pretrained_path, classes_num, pretrain, freeze_base):
        """Build the base network and the task-specific classifier.

        Args:
            pretrained_path: checkpoint file for the base; read only when
                `pretrain` is true.
            classes_num: number of outputs of the new classifier layer.
            pretrain: whether to load base weights from `pretrained_path`.
            freeze_base: when true, base parameters are excluded from
                gradient updates.
        """
        super(Transfer_Cnn14, self).__init__()

        self.load_base(pretrain, pretrained_path)

        # Transfer to another task layer.
        # 2048 is presumably the size of Cnn14's embedding - confirm.
        self.classifier = nn.Linear(2048, classes_num, bias=True)

        if freeze_base:
            for param in self.base.parameters():
                param.requires_grad = False

        self.init_weights()

    def load_base(self, pretrain, pretrained_path):
        """Create the Cnn14 base and optionally load its pretrained weights."""
        self.base = Cnn14(sample_rate=16000, window_size=1024, hop_size=320, mel_bins=64,
                          fmin=50, fmax=14000, classes_num=527)

        if pretrain:
            logging.info('Load pretrained model from {}'.format(pretrained_path))
            # Load onto the CPU so a checkpoint saved from a GPU can also be
            # restored on a machine without CUDA; callers move the model to
            # their device afterwards.
            checkpoint = torch.load(pretrained_path, map_location='cpu')
            self.base.load_state_dict(checkpoint['model'])

    def init_weights(self):
        """Initialise the new classifier layer with the project's `init_layer`."""
        init_layer(self.classifier)

    def forward(self, x, mixup_lambda=None):
        """Return the classifier outputs (no softmax applied) for a batch.

        `x` and `mixup_lambda` are forwarded unchanged to the Cnn14 base.
        NOTE(review): the input layout is whatever Cnn14 expects; in this
        notebook `x` is a batch of 1-D waveform windows - confirm against
        Cnn14.
        """
        output_dict = self.base(x, mixup_lambda)
        return self.classifier(output_dict['embedding'])

In [None]:
from sklearn.metrics import recall_score, f1_score, accuracy_score

def custom_score(targets, predicts, average='macro'):
    """Weighted blend of F1 and accuracy: 0.67 * F1 + 0.33 * accuracy.

    Args:
        targets: ground-truth labels.
        predicts: predicted labels.
        average: averaging mode forwarded to `f1_score` (default 'macro').

    Returns:
        The combined score.
    """
    # Honour the caller's averaging mode instead of always using 'macro'.
    f1 = f1_score(targets, predicts, average=average)
    return 0.67 * f1 + 0.33 * accuracy_score(targets, predicts)

# run it in train mode only
if is_train_mode:
    define_seed(12)
    pretrained_path = '../models/pretrained/Cnn14_mAP=0.431.pth'
    model = Transfer_Cnn14(pretrained_path=pretrained_path, classes_num=len(class_names),
                           pretrain=True, freeze_base=False)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Put the class weights on the same device as the model; a hard-coded
    # .cuda() would fail on a CPU-only machine despite the fallback above.
    loss = torch.nn.CrossEntropyLoss(weight=class_weights.to(device))

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, betas=(0.9, 0.999),
                                 eps=1e-08, weight_decay=0., amsgrad=True)

    # train_model comes from the project utilities; it returns the model
    # together with an epoch index and its performance record.
    model, max_epoch, max_performance = train_model(model, loss, optimizer, None, num_epochs=50,
                                                    device=device, train_dataloader=train_dataloader,
                                                    valid_dataloader=valid_dataloader,
                                                    class_names=class_names,
                                                    log_root=log_root,
                                                    tb_log_root=tb_log_root,
                                                    features_name=features_type,
                                                    experiment_name='fg2020-LossWeighted-PANN-CNN14-50',
                                                    metrics=[custom_score, f1_score, accuracy_score],
                                                    log_iter=[])

    print('Epoch: {0}\n'.format(max_epoch))
    print(max_performance)

In [35]:
def predict_proba(x, y, max_len, model, model_name, model_epoch, log_root, batch_size):
    """Restore a saved checkpoint into `model` and return class probabilities.

    Args:
        x: samples, passed to `CustomTensorDataset`.
        y: label tensor aligned with `x`.
        max_len: padding length for the dataset (0 disables padding).
        model: network whose weights are overwritten from the checkpoint.
        model_name: experiment directory name below `log_root`.
        model_epoch: epoch used to select the checkpoint file.
        log_root: root directory of the experiment logs.
        batch_size: inference batch size.

    Returns:
        Tuple `(probabilities, labels)` of numpy arrays; probabilities are the
        softmax of the model outputs over dimension 1.

    Raises:
        ValueError: from `np.concatenate` when the dataset yields no batches.
    """
    dataset = CustomTensorDataset(x, y, max_len, transform=None)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=6)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    dictionary_path = get_model_by_epoch(os.path.join(log_root, '{0}'.format(model_name)), model_epoch)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
    checkpoint = torch.load(dictionary_path, map_location=device)

    model.load_state_dict(checkpoint['model_state_dict'])
    model.to(device)
    model.eval()

    all_predictions = []
    all_labels = []

    # Inference only: no gradients are needed anywhere in the loop.
    with torch.no_grad():
        for inputs, labels in dataloader:
            preds = model(inputs.to(device))
            all_predictions.append(torch.nn.functional.softmax(preds, dim=1).cpu().numpy())
            # Labels are only collected, so they never need to visit the device.
            all_labels.append(labels.cpu().numpy())

    return np.concatenate(all_predictions), np.concatenate(all_labels)

In [None]:
# Test performance of existing model
pretrained_path = '../models/pretrained/Cnn14_mAP=0.431.pth'
model = Transfer_Cnn14(
    pretrained_path=pretrained_path,
    classes_num=len(class_names),
    pretrain=True,
    freeze_base=False,
)

model_name = 'wave_fg2020-LossWeighted-PANN-CNN14-50'

y_train = torch.LongTensor(all_data['train']['y'])
y_valid = torch.LongTensor(all_data['valid']['y'])

# Score the epoch-45 checkpoint on the train split first, then on the valid
# split, printing the same metrics line for each.
for split_x, split_y in ((all_data['train']['x'], y_train), (all_data['valid']['x'], y_valid)):
    probas, labels = predict_proba(split_x, split_y, max_len, model, model_name, 45, log_root, batch_size)
    preds = probas.argmax(axis=1)
    print('Metrics: F1: {0}, Acc: {1}, Custom: {2}'.format(f1_score(labels, preds, average='macro'),
                                                           accuracy_score(labels, preds),
                                                           custom_score(labels, preds, 'macro')))

In [36]:
# Get train, valid, and test predictions
pretrained_path = '../models/pretrained/Cnn14_mAP=0.431.pth'
model = Transfer_Cnn14(
    pretrained_path=pretrained_path,
    classes_num=len(class_names),
    pretrain=True,
    freeze_base=False,
)

model_name = 'wave_fg2020-LossWeighted-PANN-CNN14-50'

for ds in ['train', 'valid', 'test']:
    if not is_test(ds):
        # One CSV per recording goes to <model_name>/<split>/.
        out_dir = os.path.join(model_name, ds)

        for f in tqdm(files_data[ds]):
            entry = files_data[ds][f]
            x_train = torch.Tensor(entry['x'])
            y_train = torch.LongTensor(entry['y'])

            # max_len=0: the per-file windows are used without padding.
            probas, labels = predict_proba(x_train, y_train, 0, model, model_name, 45, log_root, batch_size)

            # Rows are the class probabilities followed by the label as the last column.
            res = np.concatenate((probas, labels[:, np.newaxis]), axis=1)

            os.makedirs(out_dir, exist_ok=True)

            fn = os.path.splitext(os.path.basename(f))[0]
            np.savetxt(os.path.join(out_dir, "{0}.csv".format(fn)), res, delimiter=",")

100%|██████████| 253/253 [09:11<00:00,  2.18s/it]
100%|██████████| 70/70 [02:44<00:00,  2.35s/it]
