In [1]:
import os
import sys
import matplotlib.pyplot as plt
import IPython.display as ipd
import pandas as pd
import re
import subprocess
import numpy as np
import math

%load_ext autoreload
%autoreload 2
%matplotlib inline

sys.path.append('../src')

In [2]:
import warnings
warnings.filterwarnings('ignore')

import time
import copy
import shutil

import torch
from sklearn.metrics import recall_score

from tqdm import tqdm

from utils.notebooks_utils import *

In [3]:
data_root = '/media/maxim/SStorage/FG_2020/'
features_root = 'features/'
labels_path = ''
log_root = '../logs/'
tb_log_root = '../logs/tb/'

features_name = 'mfcc_30_0-2'

df_labels = pd.read_csv(os.path.join(data_root, labels_path))

class_names = list(map(str, df_labels['label'].unique()))
class_names.sort()

batch_size = 512

In [4]:
import pickle

with open(os.path.join(data_root, features_root, 'agender.{0}.train.pickle'.format(features_name)), 'rb') as f:
    x_train = pickle.load(f)

with open(os.path.join(data_root, features_root, 'agender.{0}.valid.pickle'.format(features_name)), 'rb') as f:
    x_valid = pickle.load(f)

with open(os.path.join(data_root, features_root, 'agender.{0}.test.pickle'.format(features_name)), 'rb') as f:
    x_test = pickle.load(f)

In [13]:
from torchvision import transforms, models
from torch.utils.data import Dataset
import torch.nn.functional as F

class CustomTensorDataset(Dataset):
    """TensorDataset with support of transforms.
    """
    def __init__(self, tensors, transform=None):
        assert all(tensors[0].size(0) == tensor.size(0) for tensor in tensors)
        self.tensors = tensors
        self.transform = transform

    def __getitem__(self, index):
        x = self.tensors[0][index]

        if self.transform:
            x = self.transform(x)

        y = self.tensors[1][index]

        return x, y

    def __len__(self):
        return self.tensors[0].size(0)

class AddPad(object):
    def __call__(self, x):
        x = F.pad(x, pad=(0, abs(196 - x.shape[1])), mode='constant', value=0)
        return x
    
addPad = AddPad()

train_transforms = transforms.Compose([addPad])

In [6]:
from sklearn.preprocessing import LabelEncoder

define_seed(12)

x_train = torch.Tensor(x_train)
x_valid = torch.Tensor(x_valid)
x_test = torch.Tensor(x_test)

le = LabelEncoder()
y_train = torch.LongTensor(le.fit_transform(df_labels['label'][df_labels['file_name'].str.startswith('train')].values))
y_valid = torch.LongTensor(le.transform(df_labels['label'][df_labels['file_name'].str.startswith('valid')].values))
y_test = torch.LongTensor(le.transform(df_labels['label'][df_labels['file_name'].str.startswith('test')].values))

train_dataset = CustomTensorDataset(tensors=(x_train, y_train), transform=train_transforms)
valid_dataset = CustomTensorDataset(tensors=(x_valid, y_valid), transform=train_transforms)
test_dataset = CustomTensorDataset(tensors=(x_test, y_test), transform=train_transforms)

train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, 
                                               shuffle=True, num_workers=6)

valid_dataloader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size, 
                                               shuffle=False, num_workers=6)

# test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, 
#                                               shuffle=False, num_workers=6)

In [18]:
from torch import nn
import torchvision.models as models
from efficientnet_pytorch import EfficientNet

class StatPool(nn.Module):
    def __init__(self, dimension, keepdim):
        super(StatPool, self).__init__()
        self.dimension = dimension
        self.keepdim = keepdim

    def forward(self, x):
        var = x.var(dim=self.dimension, keepdim=self.keepdim)
        mean = x.mean(dim=self.dimension, keepdim=self.keepdim)
        return torch.cat([var, mean], dim=1)
    
class AgenderTDNNv1(nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super(AgenderTDNNv1, self).__init__()
        self.feat_extr = nn.Sequential(
            nn.Conv1d(num_inputs, 512, 5, padding=2),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Conv1d(512, 512, 3, dilation=2, padding=2),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Conv1d(512, 512, 3, dilation=3, padding=3),
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Conv1d(512, 512, 1), 
            nn.BatchNorm1d(512),
            nn.ReLU(inplace=True),
            nn.Conv1d(512, 1500, 1), 
            nn.BatchNorm1d(1500),
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.6)
        )
        
        self.z1 = nn.Conv1d(num_inputs, 512, 5, padding=2)
        self.z2 = nn.BatchNorm1d(512)
        self.z3 = nn.ReLU(inplace=True)
        self.z4 = nn.Conv1d(512, 512, 3, dilation=2, padding=2)
        self.z5 = nn.BatchNorm1d(512)
        self.z6 = nn.ReLU(inplace=True)
        self.z7 = nn.Conv1d(512, 512, 3, dilation=3, padding=3)
        self.z8 = nn.BatchNorm1d(512)
        self.z9 = nn.ReLU(inplace=True)
        self.z10 = nn.Conv1d(512, 512, 1)
        self.z11 = nn.BatchNorm1d(512)
        self.z12 = nn.ReLU(inplace=True)
        self.z13 = nn.Conv1d(512, 1500, 1)
        self.z14 = nn.BatchNorm1d(1500)
        self.z15 = nn.ReLU(inplace=True)
        self.z16 = nn.Dropout(p=0.6)
        
        self.stp = StatPool(dimension=-1, keepdim=False)
        
        self.classifier = nn.Sequential(
            nn.Linear(3000, 400),
            nn.BatchNorm1d(400),
            nn.ReLU(inplace=True),
            nn.Linear(400, num_outputs)
        )
        
    def forward(self, x):
        print(x.shape)
#         x = self.feat_extr(x)
        x = self.z1(x)
        print(x.shape)
        x = self.z2(x)
        print(x.shape)
        x = self.z3(x)
        print(x.shape)
        x = self.z4(x)
        print(x.shape)
        x = self.z5(x)
        print(x.shape)
        x = self.z6(x)
        print(x.shape)
        x = self.z7(x)
        print(x.shape)
        x = self.z8(x)
        print(x.shape)
        x = self.z9(x)
        print(x.shape)
        x = self.z10(x)
        print(x.shape)
        x = self.z11(x)
        print(x.shape)
        x = self.z12(x)
        print(x.shape)
        x = self.z13(x)
        print(x.shape)
        x = self.z14(x)
        print(x.shape)
        x = self.z15(x)
        print(x.shape)
        x = self.z16(x)
        print(x.shape)
        x = self.stp(x)
        print(x.shape)
        x = self.classifier(x)
        return x

In [25]:
class BasicConv1d(nn.Module):
    def __init__(self, in_channels, out_channels, **kwargs):
        super(BasicConv1d, self).__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, **kwargs)
        self.bn = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        x = self.conv(x)
        x = self.bn(x)
        x = F.relu(x)
        return x
    
class AgenderCNN(nn.Module):
    def __init__(self, num_inputs, out_size):
        super(AgenderCNN, self).__init__()      
        self.feat_extr = nn.Sequential(
            BasicConv1d(num_inputs, 128, kernel_size=3, padding=1, stride=1),
            BasicConv1d(128, 128, kernel_size=3, padding=1, stride=1),
            BasicConv1d(128, 128, kernel_size=5, padding=1, stride=1),
            nn.Dropout(p=0.15),
            BasicConv1d(128, 128, kernel_size=3, padding=1, stride=1),
            BasicConv1d(128, 256, kernel_size=1, padding=1, stride=1)
        )
        
        self.stp = StatPool(dimension=-1, keepdim=False)

        self.classifier = nn.Linear(in_features=512, out_features=out_size)

    def forward(self, x):
        x = self.feat_extr(x)
        x = self.stp(x)
        x = self.classifier(x)
        return x
    
class TDNN(nn.Module):
    def __init__(self, input_dim=23, output_dim=512, context_size=5, stride=1, dilation=1,
                 batch_norm=True, dropout_p=0.0):
        """
        TDNN as defined by https://www.danielpovey.com/files/2015_interspeech_multisplice.pdf
        Affine transformation not applied globally to all frames but smaller windows with local context
        batch_norm: True to include batch normalisation after the non linearity

        Context size and dilation determine the frames selected
        (although context size is not really defined in the traditional sense)
        For example:
            context size 5 and dilation 1 is equivalent to [-2,-1,0,1,2]
            context size 3 and dilation 2 is equivalent to [-2, 0, 2]
            context size 1 and dilation 1 is equivalent to [0]
        """
        super(TDNN, self).__init__()
        self.context_size = context_size
        self.stride = stride
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.dilation = dilation
        self.dropout_p = dropout_p
        self.batch_norm = batch_norm

        self.kernel = nn.Linear(input_dim * context_size, output_dim)
        self.nonLinearity = nn.ReLU()
        if self.batch_norm:
            self.bn = nn.BatchNorm1d(output_dim)
        if self.dropout_p:
            self.drop = nn.Dropout(p=self.dropout_p)

    def forward(self, x):
        """
        input: size (batch, seq_len, input_features)
        output: size (batch, new_seq_len, output_features)
        """
        _, _, d = x.shape
        assert (d == self.input_dim), 'Input dimension was wrong. Expected ({}), got ({})'.format(self.input_dim, d)
        x = x.unsqueeze(1)

        # Unfold input into smaller temporal contexts
        x = F.unfold(
            x,
            (self.context_size, self.input_dim),
            stride=(1, self.input_dim),
            dilation=(self.dilation, 1)
        )

        # N, output_dim*context_size, new_t = x.shape
        x = x.transpose(1, 2)
        x = self.kernel(x)
        x = self.nonLinearity(x)

        if self.dropout_p:
            x = self.drop(x)

        if self.batch_norm:
            x = x.transpose(1, 2)
            x = self.bn(x)
            x = x.transpose(1, 2)

        return x

class AgenderTDNNv0(nn.Module):
    def __init__(self, num_inputs, out_size):
        # Input to frame1 is of shape (batch_size, T, 24)
        # Output of frame5 will be (batch_size, T-14, 1500)
        # -> CONV/FC -> BatchNorm -> ReLu(or other activation) -> Dropout -> CONV/FC ->
        super(AgenderTDNNv0, self).__init__()

        self.feat_extr = nn.Sequential(
            TDNN(input_dim=num_inputs, output_dim=512, context_size=5, dilation=1),
            TDNN(input_dim=512, output_dim=512, context_size=3, dilation=2),
            TDNN(input_dim=512, output_dim=512, context_size=3, dilation=3),
            TDNN(input_dim=512, output_dim=512, context_size=1, dilation=1),
            TDNN(input_dim=512, output_dim=1500, context_size=1, dilation=1),
            nn.Dropout(p=0.6)
        )
        
        self.z1 = TDNN(input_dim=num_inputs, output_dim=512, context_size=5, dilation=1)
        self.z2 = TDNN(input_dim=512, output_dim=512, context_size=3, dilation=2)
        self.z3 = TDNN(input_dim=512, output_dim=512, context_size=3, dilation=3)
        self.z4 = TDNN(input_dim=512, output_dim=512, context_size=1, dilation=1)
        self.z5 = TDNN(input_dim=512, output_dim=1500, context_size=1, dilation=1)
        self.z6 = nn.Dropout(p=0.6)

        self.stp = StatPool(dimension=1, keepdim=False)
        
        self.classifier = nn.Sequential(
            nn.Linear(in_features=3000, out_features=400),
            nn.BatchNorm1d(400),
            nn.ReLU(),
            
            nn.Linear(in_features=400, out_features=400),
            nn.BatchNorm1d(400),
            nn.ReLU(),
            
            nn.Linear(in_features=400, out_features=out_size)
        )

    def forward(self, x):
        print(x.shape)
        x = x.permute(0, 2, 1)
        print(x.shape)
#         x = self.feat_extr(x)
        x = self.z1(x)
        print(x.shape)
        x = self.z2(x)
        print(x.shape)
        x = self.z3(x)
        print(x.shape)
        x = self.z4(x)
        print(x.shape)
        x = self.z5(x)
        print(x.shape)
        x = self.z6(x)

        print(x.shape)
        x = self.stp(x)
        print(x.shape)
        x = self.classifier(x)
        return x

In [26]:
AgenderTDNNv0(90, 4)(torch.Tensor([x_train[0], x_train[1]]))

torch.Size([2, 90, 196])
torch.Size([2, 196, 90])
torch.Size([2, 192, 512])
torch.Size([2, 188, 512])
torch.Size([2, 182, 512])
torch.Size([2, 182, 512])
torch.Size([2, 182, 1500])
torch.Size([2, 182, 1500])
torch.Size([2, 3000])


tensor([[ 0.1230, -0.5362, -0.1084,  0.2549],
        [-0.5441,  0.1496, -0.7393,  0.3533]], grad_fn=<AddmmBackward>)

In [20]:
AgenderTDNNv1(90, 4)(torch.Tensor([x_train[0], x_train[1]]))

torch.Size([2, 90, 196])
torch.Size([2, 512, 196])
torch.Size([2, 512, 196])
torch.Size([2, 512, 196])
torch.Size([2, 512, 196])
torch.Size([2, 512, 196])
torch.Size([2, 512, 196])
torch.Size([2, 512, 196])
torch.Size([2, 512, 196])
torch.Size([2, 512, 196])
torch.Size([2, 512, 196])
torch.Size([2, 512, 196])
torch.Size([2, 512, 196])
torch.Size([2, 1500, 196])
torch.Size([2, 1500, 196])
torch.Size([2, 1500, 196])
torch.Size([2, 1500, 196])
torch.Size([2, 3000])


tensor([[-0.0794,  0.1774,  0.4354, -0.0005],
        [-0.2210,  0.2436,  0.2032, -0.1710]], grad_fn=<AddmmBackward>)

In [39]:
# AgenderTDNNv0
# AgenderTDNNv1
# AgenderCNN

tensor([[ 0.0468, -0.1723,  0.3643,  0.0596],
        [-0.1401, -0.2256,  0.3341,  0.1411],
        [-0.2596, -0.3465,  0.2531,  0.2095],
        [-0.2305, -0.0442,  0.3455,  0.1286],
        [-0.2236, -0.3733,  0.3789,  0.2213],
        [-0.3489, -0.0838,  0.3531,  0.0782],
        [-0.2147, -0.2110,  0.2759,  0.1107],
        [-0.1037, -0.0840,  0.2893,  0.2697]], grad_fn=<AddmmBackward>)

In [None]:
# %%capture output

define_seed(12)
# model = AgenderResNet50(len(class_names), requires_grad=True)
model = AgenderTDNNv2(90, len(class_names))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
loss = torch.nn.CrossEntropyLoss()
    
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=0.0001) # v1
# optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.01)
    
model, max_epoch, max_recall = train_model(model, loss, optimizer, scheduler, num_epochs=100, 
                                           device=device, 
                                           train_dataloader=train_dataloader, 
                                           valid_dataloader=valid_dataloader,
                                           class_names=class_names,
                                           log_root=log_root,
                                           tb_log_root=tb_log_root,
                                           features_name=features_name,
                                           experiment_name='AgenderTDNNv2_adam100',
                                           log_iter=[])
    
print('Epoch: {0}, maximum recall: {1}\n'.format(max_epoch, max_recall))

## Validate train

In [None]:
all_test_predictions = []
all_test_labels = []

model_name = 'mel_128_AgenderResNet50_adam8'
epoch = 7

all_labels = []
all_predictions = []
    
dictionary_path = get_model_by_epoch(os.path.join(log_root, model_name), epoch)
print(dictionary_path)
checkpoint = torch.load(dictionary_path)
    
model = AgenderResNet50(len(class_names), requires_grad=True)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()
    
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
    
for inputs, labels in train_dataloader:
    inputs = inputs.to(device)
    labels = labels.to(device)
    predicts = None
    with torch.set_grad_enabled(False):
        preds = model(inputs)
        
    predicts = torch.nn.functional.softmax(preds, dim=1).data.cpu().numpy()
    
    all_labels.append(labels.data.cpu().numpy())
    all_predictions.append(predicts)
        
all_labels = np.concatenate(fold_labels)
all_predictions = np.concatenate(fold_predictions)
    
all_recall = recall_score(all_labels, np.argmax(all_predictions, axis=1), average='macro')
print('Train Recall: {:.4f}'.format(all_recall))

In [None]:
for i in range(0, splits):
    res = np.concatenate((all_test_predictions[i], np.expand_dims(all_test_labels[i], axis=1)), axis=1)
    np.savetxt("SDResNet34_cv_valid_preds_{}.csv".format(i), res, delimiter=",")

## Validation set

In [None]:
all_labels = []
all_predictions = []
    
dictionary_path = get_model_by_epoch(os.path.join(log_root, model_name), epoch)
print(dictionary_path)
checkpoint = torch.load(dictionary_path)
    
model = AgenderResNet50(len(class_names), requires_grad=True)
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()
    
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
    
for inputs, labels in valid_dataloader:
    inputs = inputs.to(device)
    labels = labels.to(device)
    predicts = None
    with torch.set_grad_enabled(False):
        preds = model(inputs)
        
    predicts = torch.nn.functional.softmax(preds, dim=1).data.cpu().numpy()
    
    all_labels.append(labels.data.cpu().numpy())
    all_predictions.append(predicts)
        
all_labels = np.concatenate(fold_labels)
all_predictions = np.concatenate(fold_predictions)
    
all_recall = recall_score(all_labels, np.argmax(all_predictions, axis=1), average='macro')
print('Valid Recall: {:.4f}'.format(all_recall))

In [None]:
for i in range(0, splits):
    res = np.concatenate((all_test_predictions[i], np.expand_dims(all_test_labels[i], axis=1)), axis=1)
    np.savetxt("SDResNet34_devel_preds_{}.csv".format(i), res, delimiter=",")

## Test set

In [None]:
all_test_predictions = []
all_test_labels = []

get_model_name = lambda idx: 'mel_128_SDResNet_adam_3_4Fold'
get_model_epoch = lambda idx: [1, 2, 0][idx]

for i in range(0, splits):
    test_dataloader = torch.utils.data.DataLoader(test_dataset, 
                                                  batch_size=batch_size, 
                                                  shuffle=False, 
                                                  num_workers=6)
    
    fold_labels = []
    fold_predictions = []
    
    dictionary_path = get_model_by_epoch(os.path.join(log_root, '{0}_{1}'.format(get_model_name(i), i)), get_model_epoch(i))
    print(dictionary_path)
    checkpoint = torch.load(dictionary_path)
    
#     model = SDNetB()
    model = SDResNet(requires_grad=True)
    model.load_state_dict(checkpoint['model_state_dict'])
    model.eval()
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    for inputs, labels in test_dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        predicts = None
        with torch.set_grad_enabled(False):
            preds = model(inputs)
        
        predicts = torch.nn.functional.softmax(preds, dim=1).data.cpu().numpy()
    
        fold_labels.append(labels.data.cpu().numpy())
        fold_predictions.append(predicts)
        
    fold_labels = np.concatenate(fold_labels)
    fold_predictions = np.concatenate(fold_predictions)
    
    all_test_labels.append(fold_labels)
    all_test_predictions.append(fold_predictions)
    
    test_recall = recall_score(fold_labels, np.argmax(fold_predictions, axis=1), average='macro')
    print('Fold {}, Test Recall: {:.4f}'.format(i, test_recall))
    print('Fold {} OK'.format(i))

In [None]:
for i in range(0, splits):
    res = np.concatenate((all_test_predictions[i], np.expand_dims(all_test_labels[i], axis=1)), axis=1)
    np.savetxt("SDResNet34_test_preds_{}.csv".format(i), res, delimiter=",")

In [None]:
pred_file_name = 'ComParE19_SD.{0}.test.IIAS_new.csv'.format(features_name)

prepare_filenames = lambda x: '{0}.wav'.format(os.path.splitext(os.path.basename(x))[0])

submission_df = pd.DataFrame.from_dict({'file_name': df_labels['file_name'][df_labels['file_name'].str.startswith('test')].values, 
                                        'prediction': le.inverse_transform(np.argmax(test_preds, axis=1))})
submission_df.to_csv(pred_file_name, index=False)

In [None]:
df_pred = pd.read_csv(pred_file_name)
y_test_pred = df_pred['prediction'].values

df_labels = pd.read_csv(os.path.join(data_root, labels_path))
y_test = df_labels['label'][df_labels['file_name'].str.startswith('test')].values

print('Test UAR: {0:.1f}'.format(recall_score(y_test, y_test_pred, labels=class_names, average='macro') * 100))