In [1]:
import pandas as pd
import numpy as np
import librosa
import librosa.display
from matplotlib.pyplot import specgram
import matplotlib.pyplot as plt
import pickle
import gzip
from tqdm import tqdm_notebook, tqdm
import tensorflow as tf
import keras.backend as K
from scipy.fftpack import fft
from scipy import signal
from keras.utils import to_categorical

%matplotlib inline

from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from sklearn.metrics import classification_report

import keras
from keras.models import Sequential, Model
from keras.layers import Conv2D, MaxPool2D, Dense, Flatten, Dropout, AveragePooling2D, concatenate
from keras.layers import GRU, BatchNormalization, Reshape
from keras.utils import to_categorical
from keras.layers import Input, Conv1D, Activation, MaxPool1D
from keras.models import Model

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Limit this process to 30% of each visible GPU's memory and create the
# TensorFlow session with that configuration.
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.3)

sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

In [3]:
# Register the session created above as the one the Keras backend uses.
K.set_session(sess)

# Reading dataset

In [4]:
# Load the cached dataset: a gzip-compressed pickle holding an (X_raw, y_raw) pair.
# NOTE: pickle.load can execute arbitrary code; only open cache files from a trusted source.
with gzip.open('../cache/dataset.pkl.gz', 'rb') as f:
    X_raw, y_raw = pickle.load(f)

In [5]:
# Indices into X_raw / y_raw that make_crossval holds out, one at a time, as the test fold.
test_folds = [4, 8, 11, 12, 15]

In [6]:
['../annotations/cursach2.TextGrid',
 '../annotations/gomes.TextGrid',
 '../annotations/olya1.TextGrid',
 '../annotations/olya2.TextGrid',
 '../annotations/vika1.TextGrid']

['../annotations/cursach2.TextGrid',
 '../annotations/gomes.TextGrid',
 '../annotations/olya1.TextGrid',
 '../annotations/olya2.TextGrid',
 '../annotations/vika1.TextGrid']

## Preprocessing

In [7]:
# Sampling rate in Hz that extract_features passes to log_specgram.
sample_rate = 22050

def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
    """Compute a log-power spectrogram of a 1-D signal with a Hann window.

    Parameters
    ----------
    audio : array-like
        Samples of the signal.
    sample_rate : int or float
        Sampling frequency in Hz.
    window_size : float
        Length of each analysis window, in milliseconds.
    step_size : float
        Duration in milliseconds that is converted to samples and handed to
        scipy as ``noverlap`` (the overlap between consecutive windows).
    eps : float
        Offset added before the logarithm so that zero power stays finite.

    Returns
    -------
    freqs, times, log_spec
        The frequency bins and segment times reported by scipy, and a float32
        array of shape ``(len(times), len(freqs))`` holding the natural log
        of the spectrogram.
    """
    def ms_to_samples(duration_ms):
        # Same arithmetic order as a direct `ms * rate / 1e3` so rounding is unchanged.
        return int(round(duration_ms * sample_rate / 1e3))

    freqs, times, power = signal.spectrogram(
        audio,
        fs=sample_rate,
        window='hann',
        nperseg=ms_to_samples(window_size),
        noverlap=ms_to_samples(step_size),
        detrend=False,
    )
    # Transpose to time-major before taking the log.
    log_power = np.log(power.T.astype(np.float32) + eps)
    return freqs, times, log_power

def extract_features(x, lim=100):
    """Turn a raw clip into a single-channel log-spectrogram array.

    The spectrogram comes from ``log_specgram`` at the module-level
    ``sample_rate``. Only the first ``lim`` frequency bins are kept, and a
    trailing axis of size 1 is appended.
    """
    log_spec = log_specgram(x, sample_rate)[2]
    return log_spec[:, :lim, np.newaxis]

# Shape of one extract_features output for a 6615-sample clip:
# 28 spectrogram frames x first 100 frequency bins x 1 channel.
input_shape = (28, 100, 1)

## Cross-validation

In [10]:
def _keep_expected_length(clips, labels, clip_len):
    """Convert clips to ndarrays and keep only those with exactly `clip_len` samples.

    Returns the surviving clips as a list and their labels as an ndarray.
    """
    arrays = [c if isinstance(c, np.ndarray) else np.array(c) for c in clips]
    keep = [i for i, arr in enumerate(arrays) if len(arr) == clip_len]
    return [arrays[i] for i in keep], np.array([labels[i] for i in keep])


def _spectrogram_batch(clips):
    """Stack the `extract_features` output of every clip into one array."""
    return np.array([extract_features(clip) for clip in clips])


def _oversample(features, onehot, class_id, extra_copies):
    """Append `extra_copies` duplicates of every `class_id` sample to both arrays."""
    mask = np.argmax(onehot, 1) == class_id
    features = np.vstack((features, np.repeat(features[mask], extra_copies, 0)))
    onehot = np.vstack((onehot, np.repeat(onehot[mask], extra_copies, 0)))
    return features, onehot


def make_crossval(X_raw_, y_raw_, folds_inds, model_create, schedule, epochs,
                  clip_len=6615):
    """Train and evaluate a fresh model once per held-out fold.

    Parameters
    ----------
    X_raw_, y_raw_ : sequence
        Per-fold collections of raw clips and of their labels; entry ``i`` of
        each belongs to fold ``i``.
    folds_inds : iterable of int
        Fold indices to hold out, one at a time, as the test set.
    model_create : callable
        Called as ``model_create(input_shape)`` (module-level ``input_shape``)
        and must return an uncompiled Keras model.
    schedule : callable
        Learning-rate schedule given to ``keras.callbacks.LearningRateScheduler``.
    epochs : int
        Number of training epochs per fold.
    clip_len : int
        Exact number of samples a clip must have to be used; clips of any
        other length are dropped. The default, 6615, is 0.3 s at the
        notebook's 22050 Hz ``sample_rate`` and is what the module-level
        ``input_shape`` corresponds to, so change both together.

    Returns
    -------
    list of tuple
        One ``(fold, accuracy, classification_report, f1_micro, f1_macro)``
        tuple per held-out fold.
    """
    folds_results = []

    for fold in folds_inds:
        print('Fold:', fold)

        # Pool every fold except the held-out one into the training set.
        X_train, y_train = [], []
        for i, (clips, labels) in enumerate(zip(X_raw_, y_raw_)):
            if i != fold:
                X_train.extend(clips)
                y_train.extend(labels)

        # Filter with plain lists: np.delete on a ragged list needs an implicit
        # object array (an error in recent NumPy) and would flatten a regular one.
        X_train, y_train = _keep_expected_length(X_train, y_train, clip_len)
        X_test, y_test = _keep_expected_length(X_raw_[fold], y_raw_[fold], clip_len)

        X_spec_train = _spectrogram_batch(X_train)
        X_spec_test = _spectrogram_batch(X_test)

        # Standardise both splits with statistics from the training split only.
        mean = X_spec_train.mean()
        std = X_spec_train.std()
        X_spec_train -= mean
        X_spec_train /= std
        X_spec_test -= mean
        X_spec_test /= std

        y_train, y_test = to_categorical(y_train), to_categorical(y_test)

        # Oversample class 0 with 4 extra copies, then class 1 with 20 extra copies.
        X_spec_train, y_train = _oversample(X_spec_train, y_train, 0, 4)
        X_spec_train, y_train = _oversample(X_spec_train, y_train, 1, 20)

        # Reports the number of training clips before oversampling; X_train is a
        # list here, so the count is printed as a 1-tuple to keep the old output format.
        print('Train on:', (len(X_train),))
        model = model_create(input_shape)
        model.compile('adam', 'categorical_crossentropy')

        model.fit(X_spec_train, y_train, batch_size=64, epochs=epochs,
                  callbacks=[keras.callbacks.LearningRateScheduler(schedule, verbose=0)],
                  verbose=0)
        pr = model.predict(X_spec_test, batch_size=64, verbose=0).argmax(1)

        y_true = np.argmax(y_test, 1)
        acc = accuracy_score(y_true, pr)
        cl_rep = classification_report(y_true, pr)
        f1_micro = f1_score(y_true, pr, average='micro')
        f1_macro = f1_score(y_true, pr, average='macro')

        folds_results.append((fold, acc, cl_rep, f1_micro, f1_macro))
    return folds_results

# RNN model

In [11]:
def create_rnn_model(input_shape):
    """Build the Conv1D + two-layer GRU classifier with a 3-way softmax output.

    `input_shape` is the per-sample shape including a trailing channel axis;
    that axis is removed before the 1-D layers. Returns an uncompiled Keras
    `Model`.
    """
    inputs = Input(input_shape)
    # Drop the last axis so the 1-D convolution and GRUs see a 2-D sequence.
    seq = Reshape(input_shape[:-1])(inputs)

    seq = Conv1D(128, 3)(seq)
    seq = BatchNormalization()(seq)
    seq = Activation('relu')(seq)
    seq = Dropout(0.5)(seq)

    # Two recurrent stages; only the second collapses the sequence to a vector.
    for units, keep_sequence in ((128, True), (192, False)):
        seq = GRU(units, return_sequences=keep_sequence)(seq)
        seq = Dropout(0.5)(seq)
        seq = BatchNormalization()(seq)

    probs = Dense(3, activation='softmax')(seq)
    return Model(inputs=inputs, outputs=probs)

def schedule(i, lr):
    """Return the learning rate for epoch index `i`, given the current rate `lr`.

    The rate is halved at epoch 0 and multiplied by 0.2 at epochs 5 and 10;
    at every other epoch it is returned unchanged.
    """
    decay_at_epoch = {0: 0.5, 5: 0.2, 10: 0.2}
    if i in decay_at_epoch:
        lr *= decay_at_epoch[i]
    return lr

### Validation

In [12]:
# Cross-validate the RNN model over the held-out folds, training 10 epochs per fold.
rnn_res = make_crossval(X_raw, y_raw, test_folds, create_rnn_model, schedule, 10)

Fold: 4
Train on: (17409,)
Fold: 8
Train on: (17140,)
Fold: 11
Train on: (18467,)
Fold: 12
Train on: (18306,)
Fold: 15
Train on: (18544,)


In [None]:
# Print the classification report (index 2 of each result tuple) for every RNN fold.
# The loop previously iterated over `res`, which is never defined in this notebook.
for r in rnn_res:
    print(r[2])

# CNN model

In [13]:
def create_cnn_model(input_shape):
    """Build the 2-D convolutional classifier with a 3-way softmax output.

    Three stages of paired same-padded 3x3 convolutions with max-pooling are
    followed by two unpadded convolutions, flattening, dropout and the output
    layer. Returns an uncompiled Keras `Model`.
    """
    inputs = Input(input_shape)
    feat = inputs

    # (filters, pooling window) per stage; (2, 2) is MaxPool2D's default window.
    for n_filters, pool in ((16, (2, 3)), (32, (2, 2)), (64, (2, 2))):
        for _ in range(2):
            feat = Conv2D(n_filters, 3, activation='relu', padding='same')(feat)
        feat = MaxPool2D(pool)(feat)

    # Unpadded convolutions shrink the remaining feature map.
    feat = Conv2D(128, 3, activation='relu')(feat)
    feat = Conv2D(128, (1, 6), activation='relu')(feat)

    feat = Flatten()(feat)
    feat = Dropout(0.5)(feat)
    probs = Dense(3, activation='softmax')(feat)

    return Model(inputs=inputs, outputs=probs)

# NOTE(review): this repeats the `schedule` already defined in the RNN section;
# consider keeping a single definition.
def schedule(i, lr):
    """Return the learning rate for epoch index `i`, given the current rate `lr`.

    The rate is halved at epoch 0 and multiplied by 0.2 at epochs 5 and 10;
    at every other epoch it is returned unchanged.
    """
    decay_at_epoch = {0: 0.5, 5: 0.2, 10: 0.2}
    if i in decay_at_epoch:
        lr *= decay_at_epoch[i]
    return lr

### Validation

In [14]:
# Cross-validate the CNN model over the held-out folds, training 7 epochs per fold.
cnn_res = make_crossval(X_raw, y_raw, test_folds, create_cnn_model, schedule, 7)

Fold: 4
Train on: (17409,)
Fold: 8
Train on: (17140,)
Fold: 11
Train on: (18467,)
Fold: 12
Train on: (18306,)
Fold: 15
Train on: (18544,)


# Twin model

In [16]:
def create_twin_model(input_shape):
    """Build the two-branch (CNN + RNN) classifier with a 3-way softmax output.

    Both branches read the same input. Their output vectors are concatenated
    and passed through a 64-unit dense layer before the softmax. Returns an
    uncompiled Keras `Model`.
    """
    inputs = Input(input_shape)

    # --- Convolutional branch ---
    cnn = inputs
    # (first conv filters, second conv filters, pooling window) per stage;
    # (2, 2) is MaxPool2D's default window.
    for first, second, pool in ((16, 16, (2, 3)), (16, 32, (2, 2)), (32, 64, (2, 2))):
        cnn = Conv2D(first, 3, activation='relu', padding='same')(cnn)
        cnn = Conv2D(second, 3, activation='relu', padding='same')(cnn)
        cnn = MaxPool2D(pool)(cnn)

    cnn = Conv2D(64, 3, activation='relu')(cnn)
    cnn = Conv2D(128, (1, 6), activation='relu')(cnn)
    # Reshape to a vector as long as the channel count; this only succeeds when
    # the spatial dimensions have already been reduced to 1x1.
    n_channels = int(cnn.shape[-1])
    cnn = Reshape((n_channels,))(cnn)
    cnn = Dropout(0.5)(cnn)

    # --- Recurrent branch ---
    # Drop the last axis so the 1-D convolution and GRUs see a 2-D sequence.
    rnn = Reshape(input_shape[:-1])(inputs)
    rnn = Conv1D(96, 3)(rnn)
    rnn = BatchNormalization()(rnn)
    rnn = Activation('relu')(rnn)
    rnn = Dropout(0.5)(rnn)

    # Two recurrent stages; only the second collapses the sequence to a vector.
    for units, keep_sequence in ((96, True), (64, False)):
        rnn = GRU(units, return_sequences=keep_sequence)(rnn)
        rnn = Dropout(0.5)(rnn)
        rnn = BatchNormalization()(rnn)

    # --- Joint head ---
    merged = concatenate([cnn, rnn])
    merged = Dense(64, activation='relu')(merged)
    probs = Dense(3, activation='softmax')(merged)

    return Model(inputs=inputs, outputs=probs)

# NOTE(review): this repeats the `schedule` already defined in the RNN section;
# consider keeping a single definition.
def schedule(i, lr):
    """Return the learning rate for epoch index `i`, given the current rate `lr`.

    The rate is halved at epoch 0 and multiplied by 0.2 at epochs 5 and 10;
    at every other epoch it is returned unchanged.
    """
    decay_at_epoch = {0: 0.5, 5: 0.2, 10: 0.2}
    if i in decay_at_epoch:
        lr *= decay_at_epoch[i]
    return lr

### Validation

In [17]:
# Cross-validate the twin (CNN + RNN) model over the held-out folds, training 8 epochs per fold.
twin_res = make_crossval(X_raw, y_raw, test_folds, create_twin_model, schedule, 8)

Fold: 4
Train on: (17409,)
Fold: 8
Train on: (17140,)
Fold: 11
Train on: (18467,)
Fold: 12
Train on: (18306,)
Fold: 15
Train on: (18544,)


In [18]:
# Cache the cross-validation results of all three models as one pickled tuple.
with open('../cache/res_cv.pkl', 'wb') as f:
    pickle.dump((rnn_res, cnn_res, twin_res), f)

### F1

In [44]:
# Mean and standard deviation of macro-F1 (index 4 of each fold tuple) across the RNN folds.
np.mean([r[4] for r in rnn_res]), np.std([r[4] for r in rnn_res])

(0.7379842711614976, 0.0605726739209234)

In [45]:
# Mean and standard deviation of macro-F1 (index 4 of each fold tuple) across the CNN folds.
np.mean([r[4] for r in cnn_res]), np.std([r[4] for r in cnn_res])

(0.7337921415450412, 0.07011895338751599)

In [46]:
# Mean and standard deviation of macro-F1 (index 4 of each fold tuple) across the twin-model folds.
np.mean([r[4] for r in twin_res]), np.std([r[4] for r in twin_res])

(0.7457619903189693, 0.03550755319676306)

### Accuracy

In [47]:
# Mean and standard deviation of accuracy (index 1 of each fold tuple) across the RNN folds.
np.mean([r[1] for r in rnn_res]), np.std([r[1] for r in rnn_res])

(0.901249473158801, 0.03088177766784812)

In [48]:
# Mean and standard deviation of accuracy (index 1 of each fold tuple) across the CNN folds.
np.mean([r[1] for r in cnn_res]), np.std([r[1] for r in cnn_res])

(0.929114432334256, 0.0308061627976787)

In [49]:
# Mean and standard deviation of accuracy (index 1 of each fold tuple) across the twin-model folds.
np.mean([r[1] for r in twin_res]), np.std([r[1] for r in twin_res])

(0.9264118970437825, 0.022688938801155446)

In [43]:
# Reload the cached (rnn_res, cnn_res, twin_res) tuple from res_cv.pkl.
# NOTE(review): this cell's execution count (43) precedes the summary cells
# (44-49) although it sits at the end of the notebook; presumably it was run
# first to restore results without retraining. Consider moving it above the F1 section.
# NOTE: pickle.load can execute arbitrary code; only open cache files from a trusted source.
with open('../cache/res_cv.pkl', 'rb') as f:
    rnn_res, cnn_res, twin_res = pickle.load(f)