In [54]:
import pandas as pd
import dask
from dask import dataframe as dd
from dask.distributed import Client
import keras
from keras.models import load_model
import librosa
import numpy as np
np.random.seed(1001)

import pickle
import os
import shutil

import IPython
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook
from sklearn.model_selection import StratifiedKFold, train_test_split
import sklearn
import dask.dataframe as dd
from dask import array

import librosa
import IPython.display as ipd  # To play sound in the notebook
import librosa.display

import keras
from keras.layers import Conv1D, Dropout, Dense, MaxPooling1D, Flatten, Conv2D, MaxPooling2D
from keras import Sequential

%matplotlib inline
matplotlib.style.use('ggplot')

import dask
from dask.distributed import Client, wait, progress
from sklearn.preprocessing import StandardScaler

In [55]:
# Read every parquet file matching res/*.parquet with dask and materialise
# the result as one in-memory DataFrame.
# NOTE(review): judging by the columns shown below (dropout, n_conv, loss,
# acc, ...) these look like per-configuration results for the 2-D model - confirm.
res = dd.read_parquet("res/*.parquet").compute()

In [56]:
# Show the five rows with the lowest loss.
res.sort_values("loss").head()

Unnamed: 0,dropout,n_conv,base_dense,n_filters,loss,acc
10,0.3,2.0,64.0,16.0,1.573454,0.348993
12,0.3,2.0,32.0,16.0,1.634949,0.296552
3,0.5,2.0,32.0,32.0,1.700319,0.376712
6,0.5,3.0,32.0,32.0,1.724222,0.236934
2,0.5,4.0,32.0,8.0,1.740797,0.258567


# Utilise dask to read all parquet files across multiple cores

In [57]:
# Read everything under res_1D/ with dask and materialise the result as one
# in-memory DataFrame.
# NOTE(review): the columns shown below suggest per-configuration results
# for the 1-D model - confirm.
res_1D = dd.read_parquet("res_1D/*").compute()

In [58]:
# Show the five rows with the lowest loss.
res_1D.sort_values("loss").head()

Unnamed: 0,fiter_size,pool_size,base_dense,dropout,reg_amt,n_filt,extra_dense,loss,acc
3-5-128-0.5-0.0-128-True,3,5,128,0.5,0.0,128,True,1.514246,0.371681
6-5-256-0.5-0.0-128-True,6,5,256,0.5,0.0,128,True,1.519843,0.371681
6-4-128-0.5-0.0-128-True,6,4,128,0.5,0.0,128,True,1.52245,0.336283
3-4-256-0.3-0.0-64-True,3,4,256,0.3,0.0,64,True,1.524509,0.336283
3-4-256-0.3-0.0-32-True,3,4,256,0.3,0.0,32,True,1.528267,0.309735


In [59]:
# Load the three metadata tables from parquet. Only test_meta is used further
# down in this notebook, where its "filepath" column is read.
train_meta = pd.read_parquet("train_meta")
val_meta = pd.read_parquet("val_meta")
test_meta = pd.read_parquet("test_meta")

# Preprocessing

In [60]:
def get_stft_2d(y, sr = 22050, hop_length=32):
    """
    Compute a log-magnitude (dB) STFT spectrogram of an audio signal.

    Parameters
    ----------
    y : 1-D numpy array holding the audio samples.
    sr : unused; kept so existing callers that pass it keep working.
    hop_length : number of samples between successive STFT frames.

    Returns
    -------
    2-D numpy array of dB values relative to the loudest bin (ref=np.max),
    with one row per frequency bin (n_fft=1024) and one column per frame.
    """
    D = librosa.stft(y, hop_length=hop_length, n_fft=1024)
    # Take the magnitude explicitly: amplitude_to_db expects a real-valued
    # amplitude and only warns (then discards phase) when handed complex input.
    # NOTE(review): a `spec / 80` normalisation used to be computed here and
    # then thrown away; the raw dB values are still what gets returned, so
    # callers see the same numbers as before.
    return librosa.amplitude_to_db(np.abs(D), ref=np.max)

In [61]:
def get_length_in_seconds(y,sr):
    """
    Duration in seconds of the sample sequence `y` recorded at `sr` samples
    per second.
    """
    n_samples = len(y)
    return n_samples / sr

def create_samples_from_fn(fn, desired_sr = 22050):
    """
    Load an audio file and cut it into equal half-second chunks.

    The signal is loaded at `desired_sr`, standardised to zero mean and unit
    variance, trimmed of leading/trailing silence (top_db=12.5) and split
    into non-overlapping chunks of `desired_sr // 2` samples. A trailing
    remainder shorter than one chunk is dropped.

    Parameters
    ----------
    fn : path of the audio file to load.
    desired_sr : sampling rate the audio is loaded at.

    Returns
    -------
    A list of 1-D numpy arrays (one per chunk), or None when the file cannot
    be loaded, is empty, or is shorter than one chunk after trimming.
    """
    try:
        # Passing sr makes librosa resample on load, so no second resampling
        # pass is needed afterwards.
        y, sr = librosa.load(fn, sr=desired_sr)
    except Exception:
        # Best effort: unreadable files are reported to the caller as None.
        # KeyboardInterrupt / SystemExit are deliberately not swallowed.
        return None

    if len(y) == 0:
        # StandardScaler raises on zero samples; treat as "nothing usable".
        return None

    # Standard scaling
    y = StandardScaler().fit_transform(y.reshape(-1, 1)).ravel()

    # Trim silence
    y_trimmed = librosa.effects.trim(y, top_db=12.5)[0]

    # Split into chunks
    chunk_len = sr // 2 # .5 seconds
    n_chunks = len(y_trimmed) // chunk_len
    if n_chunks == 0:
        return None

    # np.split already returns a list of equally sized arrays.
    return np.split(y_trimmed[:n_chunks * chunk_len], n_chunks)

In [62]:
def get_ts_features(y,sr = 22050, hop_length=32):
    """
    Build a per-frame table of time-series features for one audio chunk.

    Parameters
    ----------
    y : 1-D numpy array holding the audio samples.
    sr : sampling rate of `y`.
    hop_length : samples per frame step; also the block size used for the
        downsampled waveform columns.

    Returns
    -------
    pandas DataFrame with one row per frame and the columns, in this order:
    contrast_band_0, contrast_band_1, contrast_band_2, sroll, sflat,
    downsampled, downsampled_smooth_abs. Missing values left by the rolling
    windows or by index alignment are back-filled, then forward-filled.
    """
    n_cont_bands = 2

    # Keyword arguments work on both old and new librosa; newer releases
    # reject positional audio/sr arguments.
    scont = librosa.feature.spectral_contrast(y=y, sr=sr, n_bands=n_cont_bands, hop_length=hop_length)
    sroll = librosa.feature.spectral_rolloff(y=y, sr=sr, hop_length=hop_length)[0]
    sflat = librosa.feature.spectral_flatness(y=y, hop_length=hop_length)[0]

    # spectral_contrast yields n_bands + 1 rows, one per band.
    columns = ["contrast_band_{}".format(i) for i in range(n_cont_bands + 1)]
    features = {name: scont[i] for i, name in enumerate(columns)}
    features["sroll"] = sroll
    features["sflat"] = sflat
    columns.extend(["sroll", "sflat"])
    # Explicit column list pins the column order regardless of pandas version.
    res = pd.DataFrame(features, columns=columns)

    # Mean of the raw / absolute waveform over blocks of hop_length samples,
    # smoothed with a short rolling mean. Assignment aligns on the block
    # number, which matches the frame index of `res`.
    samples = pd.Series(np.asarray(y))
    block_id = samples.index // hop_length
    res["downsampled"] = samples.groupby(block_id).mean().rolling(2).mean()
    res["downsampled_smooth_abs"] = samples.abs().groupby(block_id).mean().rolling(6).mean()

    return res.bfill().ffill()

In [63]:
def preprocess_sample(fn):
    """
    Turn one audio file into the input tensors of the two sub-models.

    Parameters
    ----------
    fn : path of the audio file.

    Returns
    -------
    (features_1d, features_2d) where, for n half-second chunks,
    features_1d has shape (n, 345, 7) and features_2d has shape
    (n, 513, 345, 1).

    Raises
    ------
    ValueError : when no chunk could be extracted from the file
        (unreadable file, or too short after silence trimming).
    """
    # Load
    chunks = create_samples_from_fn(fn)
    if chunks is None:
        # Fail with a clear message instead of "'NoneType' is not iterable".
        raise ValueError("could not extract any half-second chunk from {!r}".format(fn))

    # 2D transformation
    chunks_2d = [get_stft_2d(ch) for ch in chunks]

    # 1D transformation
    chunks_1d = [get_ts_features(ch).values for ch in chunks]

    # The fixed sizes follow from the helpers' defaults: 345 frames per chunk,
    # 7 feature columns, 513 frequency bins (n_fft=1024).
    return np.array(chunks_1d).reshape(-1, 345, 7), np.array(chunks_2d).reshape(-1, 513, 345, 1)

# Load the two sub-models 

In [64]:
# Load the two saved Keras models from disk. Further down, md_2d is fed the
# 2-D spectrogram tensor and md_1d the 1-D feature-series tensor returned by
# preprocess_sample.
md_2d = load_model("md/0.3-1-32-16-False.md")
md_1d = load_model("md_1D/3-5-128-0.5-0.0-128-True")

In [65]:
# Print the layer-by-layer architecture of the 2-D model.
md_2d.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 504, 336, 16)      1616      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 51, 34, 16)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 51, 34, 16)        0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 27744)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                887840    
_________________________________________________________________
dropout_2 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 198       
Total para

In [66]:
# Print the layer-by-layer architecture of the 1-D model.
md_1d.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 343, 128)          2816      
_________________________________________________________________
dropout_1 (Dropout)          (None, 343, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 343, 128)          49280     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 69, 128)           0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 69, 128)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 8832)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               1130624   
__________

In [67]:
# Smoke-test the preprocessing on a single file: `one` is the 1-D feature
# tensor, `two` the 2-D spectrogram tensor.
one, two = preprocess_sample("../data/listenr-ml/crema-wav/AudioWAV/1001_MTI_NEU_XX.wav")



In [68]:
# Inspect the file paths listed in the test metadata.
test_meta["filepath"]

1624    ../data/listenr-ml/crema-wav/AudioWAV/1007_WSI...
1345    ../data/listenr-ml/crema-wav/AudioWAV/1004_IWL...
104     ../data/listenr-ml/RAVDESS-Audio_Speech_Actors...
1166    ../data/listenr-ml/crema-wav/AudioWAV/1002_ITH...
957     ../data/listenr-ml/RAVDESS-Audio_Speech_Actors...
1019    ../data/listenr-ml/RAVDESS-Audio_Speech_Actors...
471     ../data/listenr-ml/RAVDESS-Audio_Speech_Actors...
1490    ../data/listenr-ml/crema-wav/AudioWAV/1006_IOM...
796     ../data/listenr-ml/RAVDESS-Audio_Speech_Actors...
366     ../data/listenr-ml/RAVDESS-Audio_Speech_Actors...
1406    ../data/listenr-ml/crema-wav/AudioWAV/1005_IOM...
829     ../data/listenr-ml/RAVDESS-Audio_Speech_Actors...
1182    ../data/listenr-ml/crema-wav/AudioWAV/1002_IWL...
1718    ../data/listenr-ml/crema-wav/AudioWAV/1009_IEO...
1252    ../data/listenr-ml/crema-wav/AudioWAV/1003_ITH...
110     ../data/listenr-ml/RAVDESS-Audio_Speech_Actors...
1178    ../data/listenr-ml/crema-wav/AudioWAV/1002_IWL...
1323    ../dat

In [69]:
# Spot-check both sub-models on 20 randomly drawn test files. Files are drawn
# with replacement, so the same file can show up more than once.
fns = test_meta["filepath"].values
preds_2 = []
preds_1 = []

for i in range(20):
    idx = np.random.randint(0, len(fns))
    fn = fns[idx]
    print(fn)
    try:
        one, two = preprocess_sample(fn)
        # argmax over the class scores is what Sequential.predict_classes
        # computed; predict_classes itself is gone from newer Keras releases.
        classes_2 = np.argmax(md_2d.predict(two, verbose=0), axis=-1)
        classes_1 = np.argmax(md_1d.predict(one, verbose=0), axis=-1)
    except Exception as err:
        # Still best effort, but say which file was skipped and why instead
        # of hiding every failure.
        print("  skipped: {!r}".format(err))
        continue
    # Append only after both models succeeded so that preds_2[k] and
    # preds_1[k] always refer to the same file.
    preds_2.append(classes_2)
    preds_1.append(classes_1)

../data/listenr-ml/RAVDESS-Audio_Speech_Actors_01-24/Actor_04/03-01-07-02-01-02-04.wav




../data/listenr-ml/RAVDESS-Audio_Speech_Actors_01-24/Actor_09/03-01-03-02-01-02-09.wav




../data/listenr-ml/crema-wav/AudioWAV/1001_MTI_NEU_XX.wav




../data/listenr-ml/crema-wav/AudioWAV/1002_IWL_NEU_XX.wav




../data/listenr-ml/RAVDESS-Audio_Speech_Actors_01-24/Actor_13/03-01-07-02-02-01-13.wav




../data/listenr-ml/crema-wav/AudioWAV/1002_IEO_FEA_HI.wav




../data/listenr-ml/RAVDESS-Audio_Speech_Actors_01-24/Actor_03/03-01-05-01-01-02-03.wav




../data/listenr-ml/crema-wav/AudioWAV/1008_IEO_SAD_MD.wav




../data/listenr-ml/crema-wav/AudioWAV/1003_TIE_HAP_XX.wav




../data/listenr-ml/RAVDESS-Audio_Speech_Actors_01-24/Actor_03/03-01-03-02-02-02-03.wav




../data/listenr-ml/RAVDESS-Audio_Speech_Actors_01-24/Actor_13/03-01-05-01-02-02-13.wav




../data/listenr-ml/RAVDESS-Audio_Speech_Actors_01-24/Actor_09/03-01-01-01-01-02-09.wav




../data/listenr-ml/RAVDESS-Audio_Speech_Actors_01-24/Actor_14/03-01-06-01-02-01-14.wav




../data/listenr-ml/RAVDESS-Audio_Speech_Actors_01-24/Actor_16/03-01-05-02-02-02-16.wav




../data/listenr-ml/crema-wav/AudioWAV/1003_ITH_SAD_XX.wav




../data/listenr-ml/crema-wav/AudioWAV/1002_IEO_FEA_HI.wav




../data/listenr-ml/crema-wav/AudioWAV/1004_ITH_NEU_XX.wav




../data/listenr-ml/crema-wav/AudioWAV/1005_IEO_FEA_LO.wav




../data/listenr-ml/RAVDESS-Audio_Speech_Actors_01-24/Actor_03/03-01-05-01-01-02-03.wav




../data/listenr-ml/RAVDESS-Audio_Speech_Actors_01-24/Actor_20/03-01-06-02-01-02-20.wav




# Generate a range of predictions from each model separately

In [70]:
# Predicted classes from the 2-D model: one array per sampled file, one entry per chunk.
preds_2

[array([3, 3, 3], dtype=int64),
 array([3, 3, 3, 3], dtype=int64),
 array([3, 3], dtype=int64),
 array([3, 3], dtype=int64),
 array([3, 3], dtype=int64),
 array([3], dtype=int64),
 array([3, 3, 3], dtype=int64),
 array([3], dtype=int64),
 array([3, 3, 3], dtype=int64),
 array([3, 3, 3], dtype=int64),
 array([3, 3], dtype=int64),
 array([3, 3], dtype=int64),
 array([3, 3], dtype=int64),
 array([3, 3, 3], dtype=int64),
 array([3, 3], dtype=int64),
 array([3], dtype=int64),
 array([3, 3, 3], dtype=int64),
 array([3], dtype=int64),
 array([3, 3, 3], dtype=int64),
 array([3, 3, 3], dtype=int64)]

In [71]:
# Predicted classes from the 1-D model: one array per sampled file, one entry per chunk.
preds_1

[array([5, 1, 5], dtype=int64),
 array([1, 1, 4, 5], dtype=int64),
 array([1, 4], dtype=int64),
 array([1, 4], dtype=int64),
 array([1, 5], dtype=int64),
 array([5], dtype=int64),
 array([5, 1, 1], dtype=int64),
 array([5], dtype=int64),
 array([4, 5, 1], dtype=int64),
 array([1, 1, 4], dtype=int64),
 array([5, 4], dtype=int64),
 array([1, 1], dtype=int64),
 array([5, 5], dtype=int64),
 array([5, 5, 5], dtype=int64),
 array([1, 1], dtype=int64),
 array([5], dtype=int64),
 array([1, 1, 1], dtype=int64),
 array([5], dtype=int64),
 array([5, 1, 1], dtype=int64),
 array([5, 1, 1], dtype=int64)]

In [72]:
# Class index per chunk for the most recently preprocessed file. argmax over
# predict() is equivalent to the removed Sequential.predict_classes for a
# multi-class output layer.
np.argmax(md_2d.predict(two, verbose=0), axis=-1)

array([3, 3, 3], dtype=int64)

In [None]:
# Per-class scores per chunk for the most recently preprocessed file.
# predict() returns the same values predict_proba did and still exists in
# newer Keras releases.
md_1d.predict(one, verbose=0)