In [2]:
import numpy as np
np.random.seed(1001)

import pickle
import os
import shutil

import IPython
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm import tqdm_notebook
from sklearn.model_selection import StratifiedKFold, train_test_split
import sklearn
import dask.dataframe as dd
from dask import array

import librosa
import IPython.display as ipd  # To play sound in the notebook
import librosa.display

import keras
from keras.layers import Conv1D, Dropout, Dense, MaxPooling1D, Flatten, Conv2D, MaxPooling2D
from keras import Sequential

%matplotlib inline
matplotlib.style.use('ggplot')

import dask
from dask.distributed import Client, wait, progress
from sklearn.preprocessing import StandardScaler

In [3]:
# Start a Dask distributed cluster with default settings and connect to it.
# The module-level `client` is what the helper functions below use to submit work.
client = Client()

In [4]:
# Show the client summary (scheduler address, dashboard link, workers, memory).
client

0,1
Client  Scheduler: tcp://127.0.0.1:52899  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 12  Cores: 12  Memory: 16.90 GB


In [5]:
# Load the train / test / validation metadata tables (read in that order).
train_meta, test_meta, val_meta = [
    pd.read_parquet(split_name)
    for split_name in ("train_meta", "test_meta", "val_meta")
]

In [6]:
def get_stft_2d(y, sr, hop_length=32, n_fft=1024):
    """
    Compute a dB-scaled STFT magnitude spectrogram of a 1-D signal.

    Parameters
    ----------
    y : 1-D array of audio samples.
    sr : sample rate. Accepted so call sites can pass it, but not used here.
    hop_length : number of samples between successive STFT frames.
    n_fft : FFT window size passed to ``librosa.stft``.

    Returns
    -------
    2-D array of magnitudes in dB, measured relative to the largest
    magnitude in the spectrogram (``ref=np.max``).
    """
    D = librosa.stft(y, hop_length=hop_length, n_fft=n_fft)
    # librosa.stft returns complex values; take the magnitude explicitly so
    # amplitude_to_db receives real amplitudes instead of discarding the phase
    # itself (which it does with a warning).
    return librosa.amplitude_to_db(np.abs(D), ref=np.max)

In [7]:
def get_length_in_seconds(y,sr):
    """Return the duration in seconds of signal ``y`` sampled at ``sr`` samples per second."""
    n_samples = len(y)
    return n_samples / sr

def create_samples_from_fn(idx, fn, return_data=False, save_data=True):
    """
    Turn one audio file into half-second spectrogram chunks.

    The signal is loaded, standard-scaled, trimmed of leading/trailing
    silence, cut into consecutive chunks of half a second (any remainder is
    dropped) and each chunk is converted with ``get_stft_2d``.

    Parameters
    ----------
    idx : identifier of the source file; used in the chunk tag and file name.
    fn : path of the audio file to load.
    return_data : if True, return the DataFrame built for the LAST chunk.
    save_data : if True, write every chunk to
        ``../data/listenr-ml/preprocessed_3/{idx}-{chunk}.parquet``.

    Returns
    -------
    None if the file cannot be loaded, if it is shorter than one chunk after
    trimming, or if ``return_data`` is False; otherwise the DataFrame of the
    last chunk (spectrogram columns followed by a ``sample`` tag column).
    """
    desired_sr = 22050
    try:
        # Load, resample if needed
        y, sr = librosa.load(fn)
    except Exception:
        # Best effort: unreadable files are skipped. Unlike a bare `except`,
        # this does not swallow KeyboardInterrupt / SystemExit.
        return None
    if sr != desired_sr:
        # Keyword arguments work on both old and new librosa releases.
        y = librosa.resample(y, orig_sr=sr, target_sr=desired_sr)
        sr = desired_sr

    # Standard scaling (the scaler expects a 2-D column, hence the reshapes)
    y = StandardScaler().fit_transform(y.reshape(-1, 1)).reshape(1, -1)[0]

    # Trim silence
    y_trimmed = librosa.effects.trim(y, top_db=12.5)[0]

    # Split into chunks of equal length, dropping the incomplete tail
    chunk_len = int(sr / 2) # .5 seconds
    n_chunks = len(y_trimmed) // chunk_len
    if n_chunks == 0:
        return None
    end = n_chunks * chunk_len
    y_feature_chunks = [get_stft_2d(chunk, desired_sr)
                        for chunk in np.split(y_trimmed[:end], n_chunks)]

    res = None
    for i, features in enumerate(y_feature_chunks):
        res = pd.DataFrame(features)
        # Parquet requires string column names
        res.columns = res.columns.astype(str)
        res["sample"] = "{}-{}".format(idx, i) # Store the "<file idx>-<chunk no>" tag
        if save_data:
            res.to_parquet("../data/listenr-ml/preprocessed_3/{}-{}.parquet".format(idx, i))
    if return_data:
        # NOTE: only the frame of the last chunk is returned.
        return res

In [8]:
def establish_data(metadata):
    """
    Save all preprocessed (and eventually feature engineered) samples.

    Submits one ``create_samples_from_fn`` task per row of ``metadata`` to the
    module-level Dask ``client``. Each task gets the row's index label and its
    ``filepath`` value, and the index label is also used as the task key.

    Parameters
    ----------
    metadata : DataFrame with a ``filepath`` column.

    Returns
    -------
    list with one future per metadata row, in row order.
    """
    return [
        client.submit(create_samples_from_fn, idx, row["filepath"],
                      return_data=False, save_data=True, key=idx)
        for idx, row in metadata.iterrows()
    ]
    

In [9]:
# futures = establish_data(metadata)

In [10]:
# def load_one(fn):
#     res = pd.read_parquet("../data/listenr-ml/preprocessed_2/{}".format(fn))
#     return res
# res = load_one("0-0.parquet")

In [11]:
import gc

def get_scaler(client, metadata):
    """
    Fit a StandardScaler over all preprocessed samples and pickle it.

    The files in the preprocessed folder are loaded through the Dask client
    in batches of 500 and fed to ``partial_fit``, so the whole data set never
    has to be held in memory at once. The last column of every file (the
    ``sample`` tag written by ``create_samples_from_fn``) is excluded.

    Parameters
    ----------
    client : Dask distributed client used to read the files in parallel.
    metadata : unused; kept so existing calls keep working.

    Returns
    -------
    The fitted scaler, which is also written to ``2DScaler.p``.
    """
    folder = "../data/listenr-ml/preprocessed_3/"
    fns = ["{}{}".format(folder, basename) for basename in os.listdir(folder)]

    sScaler = StandardScaler()

    # Number of files loaded and fitted per round
    files_per_round = 500

    for start in range(0, len(fns), files_per_round):
        end = min(start + files_per_round, len(fns))
        print(start,end)

        futures = [client.submit(pd.read_parquet, fn) for fn in fns[start:end]]
        frames = pd.concat(client.gather(futures))
        # Fit on the feature columns only, ignoring the sample name column
        sScaler.partial_fit(frames[frames.columns[:-1]])

    with open("2DScaler.p", "wb") as fp:
        pickle.dump(sScaler, fp)

    return sScaler

In [12]:
# scaler = get_scaler(client, metadata)

In [13]:
def scale_a_sample(fn, scaler):
    """
    Scale one preprocessed chunk file and save the result.

    Reads ``fn``, applies ``scaler.transform`` to every column except the last
    one (the sample name column), and writes the scaled values under the same
    base name into ``../data/listenr-ml/preprocessed_3_scaled/``.
    """
    raw = pd.read_parquet(fn)
    feature_columns = raw.columns[:-1]
    scaled = pd.DataFrame(scaler.transform(raw[feature_columns]))
    # Parquet needs string column names
    scaled.columns = list(map(str, scaled.columns))
    target = "../data/listenr-ml/preprocessed_3_scaled/{}".format(os.path.basename(fn))
    scaled.to_parquet(target)
    
def scale_all_samples(scaler):
    """
    Submit one scaling task per preprocessed chunk file.

    Every file found in ``../data/listenr-ml/preprocessed_3/`` is handed to
    ``scale_a_sample`` together with ``scaler`` through the module-level Dask
    ``client``.

    Parameters
    ----------
    scaler : fitted scaler passed on to ``scale_a_sample``.

    Returns
    -------
    list with one future per file, in directory-listing order.
    """
    folder = "../data/listenr-ml/preprocessed_3/"
    # File names are used as-is; they are deliberately not parsed here, so an
    # unexpected file name cannot abort the submission loop.
    return [
        client.submit(scale_a_sample, "{}{}".format(folder, basename), scaler)
        for basename in os.listdir(folder)
    ]

# Load the scaler that get_scaler() pickled to "2DScaler.p".
# NOTE(review): pickle.load can execute arbitrary code stored in the file --
# only load a "2DScaler.p" that this notebook produced itself.
with open("2DScaler.p", "rb") as fp:
    scaler = pickle.load(fp)
    
# ff = scale_all_samples(scaler)



In [14]:
def get_input(path):
    """Read one scaled chunk file and return its values reshaped to (513, 345, 1)."""
    frame = pd.read_parquet(path)
    target_shape = (513, 345, 1)
    return frame.values.reshape(target_shape)
    
def get_output(idx, metadata):
    """Return the values of the last six metadata columns for the row labelled ``idx``."""
    label_columns = metadata.columns[-6:]
    labels = metadata[label_columns]
    return labels.loc[idx].values

def batch_gen(metadata, batch_size = 8):
    """
    Endlessly yield ``(inputs, labels)`` batches for Keras generators.

    For every batch, ``batch_size`` index labels are drawn from
    ``metadata.index`` with ``np.random.choice`` (so with replacement), and
    ALL scaled chunk files belonging to the drawn labels are loaded. The
    number of rows in a batch is therefore the number of matching chunk
    files, not ``batch_size`` itself.

    The scaled-chunk folder is listed once, on the first ``next()`` call;
    files added afterwards are not picked up. Files whose name does not start
    with an integer followed by ``-`` are ignored. Draws that match no chunk
    file are skipped instead of yielding an empty batch.

    Parameters
    ----------
    metadata : DataFrame whose index labels are the integer file ids used as
        chunk file-name prefixes; passed on to ``get_output``.
    batch_size : number of index labels drawn per batch.

    Yields
    ------
    tuple ``(batch_x, batch_y)`` of numpy arrays built from ``get_input`` and
    ``get_output``.

    Raises
    ------
    ValueError : on the first ``next()`` if no chunk file belongs to any
        index label in ``metadata``.
    """
    folder = "../data/listenr-ml/preprocessed_3_scaled/"
    known_ids = set(metadata.index)

    # Parse the directory listing once instead of on every batch.
    chunks = []
    for name in os.listdir(folder):
        try:
            file_id = int(name.split("-")[0])
        except ValueError:
            # Not a chunk file (e.g. a hidden system file); ignore it.
            continue
        if file_id in known_ids:
            chunks.append((file_id, f"{folder}{name}"))

    if not chunks:
        # Without this guard the loop below would spin forever.
        raise ValueError("no chunk files in {} match the metadata index".format(folder))

    while True:
        # Select files (indices) for the batch
        indices = np.random.choice(a = metadata.index,
                                     size = batch_size)
        drawn = set(indices.tolist())

        data = [(file_id, path) for file_id, path in chunks if file_id in drawn]
        if not data:
            # None of the drawn files has chunks on disk; draw again.
            continue

        batch_input = []
        batch_output = []

        # Read in each input and look up its labels
        for file_id, path in data:
            batch_input.append(get_input(path))
            batch_output.append(get_output(file_id, metadata))

        # Return a tuple of (input,output) to feed the network
        yield (np.array(batch_input), np.array(batch_output))

# G-G-G-GRID SEARCH

In [15]:
def build_2D_model_test(input_shape, n_classes, dropout, 
                        base_dense, n_filters, n_conv, filter_size=10, extra_dense=False):
    """
    Build and compile a small 2-D convolutional classifier.

    The network has ``n_conv`` repetitions of Conv2D -> MaxPooling2D ->
    Dropout, where the kernel size (also used as the pool size) starts at
    ``filter_size`` and shrinks by 3 after each repetition. These are followed
    by Flatten, a Dense layer of ``base_dense`` units with Dropout, an
    optional second Dense layer of half that size with Dropout (when
    ``extra_dense`` is true), and a softmax layer of ``n_classes`` units.
    The model is compiled with Adam, categorical cross-entropy and accuracy.
    """
    net = Sequential()

    # Convolutional stack
    kernel = filter_size
    for _ in range(n_conv):
        conv_stage = (
            Conv2D(n_filters, kernel,
                   padding='valid',
                   input_shape=input_shape,
                   activation="relu"),
            MaxPooling2D(padding="same", pool_size=kernel),
            Dropout(dropout),
        )
        for layer in conv_stage:
            net.add(layer)
        kernel -= 3

    # Dense head
    head = [Flatten(), Dense(base_dense, activation="relu"), Dropout(dropout)]
    if extra_dense:
        head += [Dense(int(base_dense / 2), activation="relu"), Dropout(dropout)]
    head.append(Dense(n_classes, activation="softmax"))
    for layer in head:
        net.add(layer)

    net.compile(optimizer="adam", loss="categorical_crossentropy", metrics=['acc'])
    return net

In [16]:
# Smoke test: build one model with the full input shape to check that the
# architecture can be constructed and compiled.
md = build_2D_model_test(
    input_shape=(513, 345, 1),
    n_classes=6,
    dropout=0.1,
    base_dense=64,
    n_filters=128,
    n_conv=3,
    extra_dense=False,
)

In [17]:
# Drop the reference to the throw-away model built by the smoke-test cell.
del md

In [18]:
from keras import backend as K
import tensorflow as tf
import gc
import itertools

# NOTE(review): `metadata` is not used in this cell; it is only referenced by
# commented-out cells above. Kept in case other cells rely on it -- confirm.
metadata=pd.read_csv("file_metadata.csv")

# Parameter space
dropout_rates = [0.5, 0.3]
n_conv_layers = [2, 1]
base_dense_size = [64, 32]
n_filterses = [64, 32 ,16]
extra_denses = [False]

idx = 0
batch_size=8

# Create the output folders up front so a run cannot fail on a missing
# folder only after it has finished training.
os.makedirs("res", exist_ok=True)
os.makedirs("md", exist_ok=True)

# Same iteration order as five nested loops: the last list varies fastest.
for dropout, n_conv, base_dense, n_filters, extra_dense in itertools.product(
        dropout_rates, n_conv_layers, base_dense_size, n_filterses, extra_denses):
    print(dropout, n_conv, base_dense, n_filters, extra_dense)
    run_name = "{}-{}-{}-{}-{}".format(dropout, n_conv, base_dense, n_filters, extra_dense)
    try:
        # A fresh graph and session per configuration keeps models from
        # accumulating in one graph.
        with tf.Graph().as_default():
            with tf.Session() as sess:
                # Single row df for each result, intended to be dask-read into a large df
                df = pd.DataFrame(columns=["dropout", "n_conv", "base_dense", "n_filters", "loss", "acc"])
                vals = [dropout, n_conv, base_dense, n_filters]

                # Build this model
                md = build_2D_model_test((513,345, 1), 6,
                                         dropout=dropout,
                                         n_conv=n_conv,
                                         base_dense=base_dense,
                                         n_filters=n_filters,
                                         extra_dense=extra_dense)

                # Create generators for data streams
                train_gen = batch_gen(train_meta, batch_size)
                val_gen = batch_gen(val_meta, batch_size)
                test_gen = batch_gen(test_meta, batch_size)

                md.fit_generator(train_gen,
                                 steps_per_epoch=len(train_meta)// batch_size,
                                 validation_data=val_gen,
                                 # Size the validation pass from the validation
                                 # set, not the training set; at least one step.
                                 validation_steps=max(1, len(val_meta)// batch_size),
                                 epochs = 50,
                                 verbose=2,
                                 callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss',
                                     min_delta=0,
                                     patience=4,
                                     verbose=1, mode='auto', restore_best_weights=True)])

                # [loss, acc] on 15 test batches
                res = md.evaluate_generator(test_gen, steps = 15, verbose=1)

                # Memory management
                gc.collect()

                vals.extend(res)

                df.loc[idx] = vals
                df.to_parquet("res/{}.parquet".format(run_name))
                print(idx, res)
                idx += 1
                md.save("md/{}.md".format(run_name))
    except Exception as e:
        # Best effort: report the failed configuration and carry on with the next one.
        print("FAILED")
        print(str(e))

0.5 2 64 64 False
Epoch 1/50
 - 124s - loss: 1.8114 - acc: 0.1882 - val_loss: 1.7805 - val_acc: 0.2180
Epoch 2/50
 - 122s - loss: 1.7812 - acc: 0.2021 - val_loss: 1.7653 - val_acc: 0.2281
Epoch 3/50
 - 121s - loss: 1.7784 - acc: 0.1922 - val_loss: 1.7801 - val_acc: 0.1885
Epoch 4/50
 - 121s - loss: 1.7736 - acc: 0.1947 - val_loss: 1.7696 - val_acc: 0.2299
Epoch 5/50
 - 122s - loss: 1.7695 - acc: 0.1768 - val_loss: 1.7698 - val_acc: 0.2133
Epoch 6/50
 - 121s - loss: 1.7675 - acc: 0.1925 - val_loss: 1.7752 - val_acc: 0.2113
Restoring model weights from the end of the best epoch
Epoch 00006: early stopping
0 [1.771514255252085, 0.20711974396964108]
0.5 2 64 32 False
Epoch 1/50
 - 122s - loss: 1.8104 - acc: 0.1952 - val_loss: 1.7812 - val_acc: 0.2039
Epoch 2/50
 - 118s - loss: 1.7820 - acc: 0.1818 - val_loss: 1.7721 - val_acc: 0.2102
Epoch 3/50
 - 121s - loss: 1.7784 - acc: 0.1932 - val_loss: 1.7645 - val_acc: 0.2179
Epoch 4/50
 - 118s - loss: 1.7784 - acc: 0.1916 - val_loss: 1.7741 - val_


0.5 1 64 32 False
Epoch 1/50
 - 124s - loss: 1.9197 - acc: 0.1882 - val_loss: 1.7858 - val_acc: 0.2266
Epoch 2/50
 - 122s - loss: 1.7855 - acc: 0.1979 - val_loss: 1.7810 - val_acc: 0.2179
Epoch 3/50
 - 122s - loss: 1.7783 - acc: 0.2030 - val_loss: 1.7777 - val_acc: 0.2207
Epoch 4/50
 - 123s - loss: 1.7755 - acc: 0.1948 - val_loss: 1.7753 - val_acc: 0.2137
Epoch 5/50
 - 120s - loss: 1.7757 - acc: 0.1775 - val_loss: 1.7833 - val_acc: 0.1939
Epoch 6/50
 - 122s - loss: 1.7682 - acc: 0.1989 - val_loss: 1.7708 - val_acc: 0.2333
Epoch 7/50
 - 123s - loss: 1.7762 - acc: 0.1920 - val_loss: 1.7700 - val_acc: 0.2090
Epoch 8/50
 - 123s - loss: 1.7730 - acc: 0.1915 - val_loss: 1.7724 - val_acc: 0.2019
Epoch 9/50
 - 122s - loss: 1.7758 - acc: 0.1876 - val_loss: 1.7669 - val_acc: 0.2103
Epoch 10/50
 - 121s - loss: 1.7651 - acc: 0.2031 - val_loss: 1.7736 - val_acc: 0.1930
Epoch 11/50
 - 121s - loss: 1.7707 - acc: 0.2043 - val_loss: 1.7614 - val_acc: 0.2255
Epoch 12/50
 - 124s - loss: 1.7681 - acc: 0.


0.3 2 64 32 False
Epoch 1/50
 - 124s - loss: 1.8001 - acc: 0.1846 - val_loss: 1.7851 - val_acc: 0.1966
Epoch 2/50
 - 125s - loss: 1.7848 - acc: 0.1987 - val_loss: 1.7607 - val_acc: 0.2199
Epoch 3/50
 - 124s - loss: 1.7769 - acc: 0.1975 - val_loss: 1.7803 - val_acc: 0.2150
Epoch 4/50
 - 121s - loss: 1.7701 - acc: 0.2042 - val_loss: 1.7765 - val_acc: 0.2105
Epoch 5/50
 - 123s - loss: 1.7737 - acc: 0.1775 - val_loss: 1.7680 - val_acc: 0.2139
Epoch 6/50
 - 122s - loss: 1.7784 - acc: 0.1828 - val_loss: 1.7743 - val_acc: 0.2150
Restoring model weights from the end of the best epoch
Epoch 00006: early stopping
9 [1.7945411991992783, 0.1279461314080139]
0.3 2 64 16 False
Epoch 1/50
 - 125s - loss: 1.7947 - acc: 0.1948 - val_loss: 1.7737 - val_acc: 0.2041
Epoch 2/50
 - 122s - loss: 1.7606 - acc: 0.2360 - val_loss: 1.7568 - val_acc: 0.1682
Epoch 3/50
 - 124s - loss: 1.7371 - acc: 0.2459 - val_loss: 1.7720 - val_acc: 0.2015
Epoch 4/50
 - 124s - loss: 1.7266 - acc: 0.2517 - val_loss: 1.6888 - val

Epoch 9/50
 - 123s - loss: 1.7651 - acc: 0.2178 - val_loss: 1.7748 - val_acc: 0.1911
Epoch 10/50
 - 123s - loss: 1.7729 - acc: 0.1927 - val_loss: 1.7698 - val_acc: 0.2007
Epoch 11/50
 - 124s - loss: 1.7801 - acc: 0.1776 - val_loss: 1.7711 - val_acc: 0.2186
Epoch 12/50
 - 124s - loss: 1.7747 - acc: 0.1869 - val_loss: 1.7676 - val_acc: 0.2245
Restoring model weights from the end of the best epoch
Epoch 00012: early stopping
13 [1.7652291883609685, 0.13183279951189875]
0.3 1 64 16 False
Epoch 1/50
 - 128s - loss: 2.0961 - acc: 0.1991 - val_loss: 1.7845 - val_acc: 0.2308
Epoch 2/50
 - 123s - loss: 1.7815 - acc: 0.1984 - val_loss: 1.7780 - val_acc: 0.1657
Epoch 3/50
 - 123s - loss: 1.7772 - acc: 0.1980 - val_loss: 1.7803 - val_acc: 0.2012
Epoch 4/50
 - 125s - loss: 1.7764 - acc: 0.1960 - val_loss: 1.7805 - val_acc: 0.1919
Epoch 5/50
 - 125s - loss: 1.7734 - acc: 0.1998 - val_loss: 1.7751 - val_acc: 0.2012
Epoch 6/50
 - 122s - loss: 1.7729 - acc: 0.1816 - val_loss: 1.7791 - val_acc: 0.1969
E

In [None]:
# NOTE(review): this cell was never executed (In [None]) and `base_class` is not
# defined in any visible cell, so it raises NameError on a fresh kernel unless
# it is defined elsewhere -- confirm, or delete this scratch cell.
pd.get_dummies(base_class)

In [19]:
from keras import backend as K
import tensorflow as tf
import gc
import itertools

# NOTE(review): `metadata` is not used in this cell; it is only referenced by
# commented-out cells above. Kept in case other cells rely on it -- confirm.
metadata=pd.read_csv("file_metadata.csv")

# Parameter space (second search: deeper conv stacks, smaller dense / filter sizes)
dropout_rates = [0.5]
n_conv_layers = [4,3]
base_dense_size = [32, 16]
n_filterses = [32, 16 ,8]
extra_denses = [False]

idx = 0
batch_size=8

# Create the output folders up front so a run cannot fail on a missing
# folder only after it has finished training.
os.makedirs("res", exist_ok=True)
os.makedirs("md", exist_ok=True)

# Same iteration order as five nested loops: the last list varies fastest.
for dropout, n_conv, base_dense, n_filters, extra_dense in itertools.product(
        dropout_rates, n_conv_layers, base_dense_size, n_filterses, extra_denses):
    print(dropout, n_conv, base_dense, n_filters, extra_dense)
    run_name = "{}-{}-{}-{}-{}".format(dropout, n_conv, base_dense, n_filters, extra_dense)
    try:
        # A fresh graph and session per configuration keeps models from
        # accumulating in one graph.
        with tf.Graph().as_default():
            with tf.Session() as sess:
                # Single row df for each result, intended to be dask-read into a large df
                df = pd.DataFrame(columns=["dropout", "n_conv", "base_dense", "n_filters", "loss", "acc"])
                vals = [dropout, n_conv, base_dense, n_filters]

                # Build this model
                md = build_2D_model_test((513,345, 1), 6,
                                         dropout=dropout,
                                         n_conv=n_conv,
                                         base_dense=base_dense,
                                         n_filters=n_filters,
                                         extra_dense=extra_dense)

                # Create generators for data streams
                train_gen = batch_gen(train_meta, batch_size)
                val_gen = batch_gen(val_meta, batch_size)
                test_gen = batch_gen(test_meta, batch_size)

                md.fit_generator(train_gen,
                                 steps_per_epoch=len(train_meta)// batch_size,
                                 validation_data=val_gen,
                                 # Size the validation pass from the validation
                                 # set, not the training set; at least one step.
                                 validation_steps=max(1, len(val_meta)// batch_size),
                                 epochs = 50,
                                 verbose=2,
                                 callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss',
                                     min_delta=0,
                                     patience=4,
                                     verbose=1, mode='auto', restore_best_weights=True)])

                # [loss, acc] on 15 test batches
                res = md.evaluate_generator(test_gen, steps = 15, verbose=1)

                # Memory management
                gc.collect()

                vals.extend(res)

                df.loc[idx] = vals
                df.to_parquet("res/{}.parquet".format(run_name))
                print(idx, res)
                idx += 1
                md.save("md/{}.md".format(run_name))
    except Exception as e:
        # Best effort: report the failed configuration and carry on with the next one.
        print("FAILED")
        print(str(e))

0.5 4 32 32 False
Epoch 1/50
 - 127s - loss: 1.8084 - acc: 0.1871 - val_loss: 1.7798 - val_acc: 0.1625
Epoch 2/50
 - 122s - loss: 1.7842 - acc: 0.1829 - val_loss: 1.7786 - val_acc: 0.2069
Epoch 3/50
 - 123s - loss: 1.7680 - acc: 0.1987 - val_loss: 1.7781 - val_acc: 0.2042
Epoch 4/50
 - 124s - loss: 1.7727 - acc: 0.1952 - val_loss: 1.7693 - val_acc: 0.2004
Epoch 5/50
 - 129s - loss: 1.7743 - acc: 0.2001 - val_loss: 1.7645 - val_acc: 0.2250
Epoch 6/50
 - 124s - loss: 1.7841 - acc: 0.1978 - val_loss: 1.7758 - val_acc: 0.1952
Epoch 7/50
 - 125s - loss: 1.7757 - acc: 0.1759 - val_loss: 1.7760 - val_acc: 0.2097
Epoch 8/50
 - 125s - loss: 1.7664 - acc: 0.2015 - val_loss: 1.7711 - val_acc: 0.2193
Epoch 9/50
 - 126s - loss: 1.7599 - acc: 0.2103 - val_loss: 1.7694 - val_acc: 0.2269
Restoring model weights from the end of the best epoch
Epoch 00009: early stopping
0 [1.7725615344113774, 0.19097222403312722]
0.5 4 32 16 False
Epoch 1/50
 - 127s - loss: 1.8020 - acc: 0.1838 - val_loss: 1.7804 - val

In [20]:
# Read every per-run result file written under res/ by the grid-search cells
# into one Dask dataframe.
res = dd.read_parquet("res/*.parquet")

In [21]:
# Materialise the Dask dataframe as an in-memory pandas DataFrame.
# NOTE(review): this rebinds `res`, so re-running this cell on its own fails
# (a pandas DataFrame has no .compute()); re-run the previous cell first.
res = res.compute()

In [23]:
# Rank the collected runs by test loss, lowest first. The index is each run's
# `idx` counter from the search cell that wrote it; the counter restarts at 0
# in every search cell, so index values can repeat.
res.sort_values("loss")

Unnamed: 0,dropout,n_conv,base_dense,n_filters,loss,acc
10,0.3,2.0,64.0,16.0,1.573454,0.348993
12,0.3,2.0,32.0,16.0,1.634949,0.296552
3,0.5,2.0,32.0,32.0,1.700319,0.376712
6,0.5,3.0,32.0,32.0,1.724222,0.236934
2,0.5,4.0,32.0,8.0,1.740797,0.258567
5,0.5,1.0,64.0,32.0,1.744735,0.258065
8,0.5,1.0,32.0,16.0,1.748221,0.197987
11,0.5,3.0,16.0,8.0,1.755501,0.227414
1,0.5,4.0,32.0,16.0,1.758289,0.168874
11,0.3,2.0,32.0,32.0,1.762288,0.163763
