In [14]:
import os
import sys
import matplotlib.pyplot as plt
import IPython.display as ipd
import pandas as pd
import re
import subprocess
import numpy as np
import math

# Reload imported modules automatically before each cell runs, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Add the sibling source directory to the import path (relative to the
# notebook's working directory). Presumably this is where the `utils`
# package imported in the next cell lives -- TODO confirm.
sys.path.append('../src')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import logging
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn.metrics import recall_score, confusion_matrix
from utils.configuration_utils import create_logger

# Configure logging through the project helper, with separate levels for console
# (INFO) and file (NOTSET) output.
# NOTE(review): '.' and 'BaselineV1.4.log' are presumably the log directory and
# log file name -- confirm against utils.configuration_utils.create_logger.
create_logger('.', 'BaselineV1.4.log', console_level=logging.INFO, file_level=logging.NOTSET)

# ---------------------------------------------------------------------------
# Experiment configuration -- everything a reader might want to tune.
# ---------------------------------------------------------------------------

# Task identity and the class labels, in the order used for metrics
task_name = 'ComParE2020_Mask'  # os.getcwd().split('/')[-2]
classes = ['clear', 'mask']

# Submission bookkeeping: enter your team name and submission number here
team_name = 'baseline'
submission_index = 1

# Whether to report the confusion matrix on devel
show_confusion = True

# Which feature set to load; must be a key of feat_conf below
feature_set = 'BoAW-2000'
# (The linear-kernel complexity grid of the official baseline,
#  [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0], is not used in this notebook.)

# feat_conf maps every available feature set to the tuple
# (number of features, offset/index of first feature, separator, header option).
# The table is assembled family by family; insertion order is kept the same as
# a hand-written literal would have.
feat_conf = {'ComParE': (6373, 1, ';', 'infer')}

# Bag-of-audio-words sets: the feature count is twice the codebook size
feat_conf.update({'BoAW-{}'.format(codebook_size): (2 * codebook_size, 1, ';', None)
                  for codebook_size in (125, 250, 500, 1000, 2000)})

# auDeep sets: one entry per individual variant, plus the fused representation
feat_conf.update({'auDeep-{}'.format(variant): (1024, 2, ',', 'infer')
                  for variant in ('30', '45', '60', '75')})
feat_conf['auDeep-fused'] = (4096, 2, ',', 'infer')

feat_conf['DeepSpectrum_resnet50'] = (2048, 1, ',', 'infer')

# Unpack the CSV layout of the selected feature set
num_feat, ind_off, sep, header = feat_conf[feature_set]

# Path of the features and labels.
# The dataset root defaults to the original machine-specific location, but can be
# redirected with the COMPARE2020_MASK_ROOT environment variable so the notebook
# runs elsewhere without editing this cell.
data_root = os.environ.get('COMPARE2020_MASK_ROOT', '/media/maxim/SStorage/ComParE2020_Mask')
features_path = os.path.join(data_root, 'features', '')  # trailing '' keeps the final separator
label_file = os.path.join(data_root, 'lab', 'labels.csv')


def _load_partition_features(partition):
    """Read the feature matrix for one partition ('train', 'devel' or 'test').

    The file name is built from `task_name`, `feature_set` and `partition`.
    Only the `num_feat` columns starting at column index `ind_off` are read,
    and the result is returned as a float32 NumPy array.
    """
    csv_path = features_path + task_name + '.' + feature_set + '.' + partition + '.csv'
    return pd.read_csv(csv_path, sep=sep, header=header,
                       usecols=range(ind_off, num_feat + ind_off), dtype=np.float32).values


def _load_partition_labels(labels_frame, partition):
    """Return the 'label' values of the rows whose 'file_name' starts with `partition`."""
    in_partition = labels_frame['file_name'].str.startswith(partition)
    return labels_frame['label'][in_partition].values


# Start
print('\nRunning ' + task_name + ' ' + feature_set + ' baseline ... (this might take a while) \n')

# Load features and labels
x_train = _load_partition_features('train')
x_devel = _load_partition_features('devel')
x_test = _load_partition_features('test')

df_labels = pd.read_csv(label_file)
y_train = _load_partition_labels(df_labels, 'train')
y_devel = _load_partition_labels(df_labels, 'devel')

# Later cells address features and labels through shared row indexes, so a length
# mismatch would silently pair samples with the wrong labels. Fail loudly instead.
# NOTE(review): this checks counts only; it assumes the label file lists samples in
# the same order as the feature files -- confirm.
for partition_name, partition_x, partition_y in (('train', x_train, y_train),
                                                 ('devel', x_devel, y_devel)):
    if len(partition_x) != len(partition_y):
        raise ValueError('{0}: {1} feature rows but {2} labels'.format(
            partition_name, len(partition_x), len(partition_y)))


Running ComParE2020_Mask BoAW-2000 baseline ... (this might take a while) 



In [16]:
from sklearn.model_selection import StratifiedKFold

# Row positions inside the pooled arrays built at the bottom of this cell:
# train rows come first, devel rows follow directly after them.
n_train_rows = len(y_train)
n_devel_rows = len(y_devel)
train_indexes = np.arange(0, n_train_rows)
valid_indexes = np.arange(n_train_rows, n_devel_rows + n_train_rows)

# Each entry is a (rows to fit on, rows to evaluate on) pair.
splits = []

# valid: hold out each stratified half of devel in turn;
# fit on all of train plus the other half of devel.
valid_skf = StratifiedKFold(n_splits=2, random_state=12, shuffle=True)
valid_splits = valid_skf.split(np.zeros(n_devel_rows), np.asarray(y_devel))
for kept_part, held_out_part in valid_splits:
    fit_rows = np.hstack((train_indexes, valid_indexes[kept_part]))
    eval_rows = valid_indexes[held_out_part]
    splits.append((fit_rows, eval_rows))

# train: hold out each stratified half of train in turn;
# fit on the other half of train plus all of devel.
train_skf = StratifiedKFold(n_splits=2, random_state=12, shuffle=True)
train_splits = train_skf.split(np.zeros(n_train_rows), np.asarray(y_train))
for kept_part, held_out_part in train_splits:
    fit_rows = np.hstack((train_indexes[kept_part], valid_indexes))
    eval_rows = train_indexes[held_out_part]
    splits.append((fit_rows, eval_rows))

# Pooled data that the index arrays above refer to
all_x = np.vstack((x_train, x_devel))
all_y = np.hstack((y_train, y_devel))

In [None]:
import joblib

logging.info('Start Logging')

# Fit one scaler + SVM per split, log the devel UAR, and persist both objects to disk.
for fold, (fit_rows, eval_rows) in enumerate(splits):
    # Slice this fold's fitting and held-out portions out of the pooled data
    train_subset_x = all_x[fit_rows]
    train_subset_y = all_y[fit_rows]
    devel_subset_x = all_x[eval_rows]
    devel_subset_y = all_y[eval_rows]

    # Feature normalisation: the scaler is fitted on the fitting portion only
    # and then applied unchanged to the held-out portion
    scaler = MinMaxScaler()
    train_subset_x = scaler.fit_transform(train_subset_x)
    devel_subset_x = scaler.transform(devel_subset_x)

    # probability=True so the saved model can serve predict_proba later on
    clf = svm.SVC(random_state=0, probability=True)
    clf.fit(train_subset_x, train_subset_y)
    y_pred = clf.predict(devel_subset_x)

    # UAR = macro-averaged recall over the configured classes
    uar_score = recall_score(devel_subset_y, y_pred, labels=classes, average='macro')
    uar_percent = uar_score * 100
    logging.info('UAR on Devel {0}'.format(uar_percent))
    if show_confusion:
        fold_confusion = confusion_matrix(devel_subset_y, y_pred, labels=classes)
        logging.info('Confusion matrix (Devel):')
        logging.info(classes)
        logging.info(fold_confusion)

    # Per-fold summary line
    logging.info('\nFOLD {0}. UAR on Devel {1:.1f}\n'.format(fold, uar_percent))

    # Persist the fitted scaler and classifier under fold-specific file names
    joblib.dump(scaler, '{0}_scaler_fold_{1}.model'.format(feature_set, fold))
    joblib.dump(clf, '{0}_cls_fold_{1}.model'.format(feature_set, fold))

2020-04-26 09:33:51,261:INFO:Start Logging


In [17]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', color_map=plt.cm.Blues, fig_path=None):
    """
    Print a confusion matrix and draw it as an annotated heat map.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix, rows = true labels, columns = predicted labels.
    classes : sequence
        Tick labels used for both axes.
    normalize : bool
        If True, every row is divided by its sum before printing/plotting.
        Rows that sum to zero (a class with no samples) are shown as all
        zeros rather than NaN.
    title : str
        Figure title. If empty/None, a default depending on `normalize` is used.
    color_map : matplotlib colormap
        Colormap passed to `imshow`.
    fig_path : str or None
        If given, the figure is saved to this path; otherwise it is shown.

    Returns
    -------
    None
    """
    if not title:
        title = 'Normalized confusion matrix' if normalize else 'Confusion matrix, without normalization'

    # Accept plain nested lists as well as arrays
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Divide only where the row total is non-zero; empty rows stay at 0
        # instead of turning into NaN (which would also break the text-colour
        # threshold computed from cm.max() below).
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    # Print with reduced precision, then restore whatever print options were
    # active before (rather than forcing a fixed precision afterwards).
    with np.printoptions(precision=3):
        print(cm)

    fig, ax = plt.subplots(figsize=(8, 6))
    im = ax.imshow(cm, interpolation='nearest', cmap=color_map)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks and label them with the class names
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the x tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell with its value; use white text on dark cells
    # (above half of the maximum) and black text otherwise.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")

    fig.tight_layout()
    if fig_path:
        plt.savefig(fig_path)
    else:
        plt.show(block=False)

In [18]:
import joblib

print('Start Validation')

# Per-fold class probabilities and true labels for the held-out rows.
# NOTE: despite "test" in the names, these hold the *held-out devel-fold* results;
# the names are kept because the cell that writes the devel CSV files reads them.
all_test_predictions = []
all_test_labels = []
for i, (_t_indexes, d_indexes) in enumerate(splits):
    print('Validating Fold {0}'.format(i))

    # Only the held-out rows are needed here; the fitting rows were used when the
    # fold's scaler and classifier were trained and saved.
    devel_subset_x = all_x[d_indexes]
    devel_subset_y = all_y[d_indexes]

    # Feature normalisation with the scaler saved for this fold.
    # joblib.load unpickles arbitrary objects: only load files written by this notebook.
    scaler = joblib.load('{0}_scaler_fold_{1}.model'.format(feature_set, i))
    devel_subset_x = scaler.transform(devel_subset_x)

    clf = joblib.load('{0}_cls_fold_{1}.model'.format(feature_set, i))
    y_pred = clf.predict_proba(devel_subset_x)

    all_test_labels.append(np.asarray(devel_subset_y))
    all_test_predictions.append(np.asarray(y_pred))

Start Validation
Validating Fold 0
Validating Fold 1
Validating Fold 2
Validating Fold 3


In [42]:
# Integer encoding of the class labels, written as the last column of each CSV.
# NOTE(review): the probability columns come from predict_proba; they are presumably
# ordered like clf.classes_ ('clear', 'mask') and thus match this encoding -- confirm.
str_to_int = {
    'clear': 0,
    'mask': 1,
}

# Write one CSV per validated fold: class probabilities followed by the true label.
# Iterates over however many folds were collected instead of a hard-coded count.
for i, (fold_labels, fold_probabilities) in enumerate(zip(all_test_labels, all_test_predictions)):
    # Guard against stale state: the test cell reuses the name all_test_predictions,
    # so re-running this cell after it would pair devel labels with test predictions.
    if len(fold_labels) != len(fold_probabilities):
        raise ValueError('Fold {0}: {1} labels but {2} prediction rows; re-run the '
                         'validation cell before this one'.format(
                             i, len(fold_labels), len(fold_probabilities)))

    int_labels = np.asarray([str_to_int[label] for label in fold_labels])
    res = np.concatenate((fold_probabilities, np.expand_dims(int_labels, axis=1)), axis=1)
    np.savetxt("subm3(svm)_devel_preds_{}.csv".format(i), res, delimiter=",")

In [44]:
import joblib

print('Start Testing')

# One probability matrix per fold model, all computed on the same test features.
# NOTE: this rebinds all_test_predictions, which the validation cell also uses.
all_test_predictions = []
for fold in range(len(splits)):
    print('Testing Fold {0}'.format(fold))

    # Feature normalisation with the scaler that was saved for this fold
    scaler_file = '{0}_scaler_fold_{1}.model'.format(feature_set, fold)
    fold_scaler = joblib.load(scaler_file)
    test_subset_x = fold_scaler.transform(x_test)

    # Class probabilities from this fold's classifier
    classifier_file = '{0}_cls_fold_{1}.model'.format(feature_set, fold)
    fold_classifier = joblib.load(classifier_file)
    all_test_predictions.append(fold_classifier.predict_proba(test_subset_x))

# Place the folds' probability columns side by side and write a single CSV
stacked_test_predictions = np.concatenate(all_test_predictions, axis=1)
np.savetxt("subm3(svm)_test_preds.csv", stacked_test_predictions, delimiter=",")

Start Testing
Testing Fold 0
Testing Fold 1
Testing Fold 2
Testing Fold 3
