# **Machine learning project**

## **What did you say? I understood Speech Recognition**

### *Natascia Caria, Claudia Cozzolino, Alfredo Petrella*

In [None]:
##################################################
# Imports
##################################################

import numpy as np
import cv2
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd
import tensorflow as tf 
from tensorflow import keras 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn import svm
from sklearn.pipeline import make_pipeline
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

!pip install librosa==0.7.2
import librosa

In [None]:
##################################################
# Params
##################################################

# Folder holding every input file read by this notebook (CSV annotations and .npy arrays).
DATA_BASE_FOLDER = '/kaggle/input/ml-project-speech-recognition-challenge'
# Sampling rate handed to the audio player and used to build the spectrogram time axis.
SAMPLE_RATE = 16000
# Step, in samples, between two consecutive ticks of the spectrogram time axis.
HOP_LEN = 512

# Dataset

The dataset is a reduced version of the [`TensorFlow Speech Commands Dataset`](https://www.tensorflow.org/datasets/catalog/speech_commands) and contains audio waveforms of the words:
- `down`, 
- `go`, 
- `left`, 
- `off`, 
- `on`, 
- `right`, 
- `stop`, 
- `up`.


Train / Validation Split
- 1600 train samples, 200 for each class
- 109 validation samples

In [None]:
##################################################
# Load dataset
##################################################

# Annotation tables; the 'word' column holds the class name of each sample.
df_train = pd.read_csv(os.path.join(DATA_BASE_FOLDER, 'train.csv'))
df_validation = pd.read_csv(os.path.join(DATA_BASE_FOLDER, 'validation.csv'))

# Sorted class names: the position of a word in this list is its integer label.
labels = sorted(set(df_train['word'].values))
y_train = df_train['word'].map(labels.index).values
y_validation = df_validation['word'].map(labels.index).values

# Raw waveforms of the training samples (used only for playback below).
audio_train = np.load(os.path.join(DATA_BASE_FOLDER, 'train_audio.npy'))

# Feature Extraction

Speech is a time-series signal, and a well-known strategy for extracting a good representation of the raw audio is to mimic the processing performed by the human auditory system. A well-established feature representation for speech is the so-called "log mel-spectrum". This feature takes into account that humans perceive both the frequency and the amplitude of sound logarithmically. If you want to dig deeper into this topic, you can find some details [here](https://medium.com/@jonathan_hui/speech-recognition-feature-extraction-mfcc-plp-5455f5a69dd9).

![auditory-system](https://www.researchgate.net/profile/Morteza_Khaleghi_Meybodi/publication/322343133/figure/fig1/AS:581011472093184@1515535337239/Figure-31-Schematic-of-the-auditory-system-with-its-primary-components-including.png)

For this project these features are precomputed: for each audio waveform of 1 sec duration, the log mel-spectrum is a bi-dimensional representation (frequency vs time) of shape [128, 32]. Here, we first resize the "image" into a [32, 32] matrix and then we flatten the representation into a 32x32 = 1024 vector.

In [None]:
# Load the precomputed features of the three splits
x_train_raw = np.load(os.path.join(DATA_BASE_FOLDER, 'train_feat.npy'))
x_validation_raw = np.load(os.path.join(DATA_BASE_FOLDER, 'validation_feat.npy'))

x_test_raw = np.load(os.path.join(DATA_BASE_FOLDER, 'test_feat.npy'))

# Plot audio feature
idx = 1205
# Time axis (in seconds) used to relabel the x ticks in milliseconds
time = np.arange(1, SAMPLE_RATE + 1, HOP_LEN) / SAMPLE_RATE
plt.figure(figsize=(10, 5))
plt.title(f'Mel-Spectrogram of audio: {df_train["word"][idx]}', fontweight='bold')
# 'lower' puts row 0 at the bottom; the old spelling 'low' is rejected by current matplotlib
plt.imshow(x_train_raw[idx], aspect='auto', origin='lower', cmap='inferno')
xticks = plt.xticks()[0].astype(np.int32)
# The first and last automatic ticks are dropped before indexing the time axis
plt.xticks(xticks[1:-1], [f'{1000 * t:.0f}' for t in time[xticks[1:-1]]])
plt.xlabel('Time [ms]', fontweight='bold')
plt.ylabel('Log Mel-Spectogram', fontweight='bold')
plt.grid(lw=0.4, c='w', alpha=0.4)
plt.show()

# Play audio. The player must be displayed explicitly: it is not the last expression
# of the cell, so a bare `ipd.Audio(...)` here would be silently discarded.
ipd.display(ipd.Audio(audio_train[idx], rate=SAMPLE_RATE))

x_train_raw[idx].shape

In [None]:
# Resize the features: every sample of every split becomes a 32x32 matrix
x_train = np.array([cv2.resize(x_i, (32, 32)) for x_i in x_train_raw])
x_validation = np.array([cv2.resize(x_i, (32, 32)) for x_i in x_validation_raw])
x_test = np.array([cv2.resize(x_i, (32, 32)) for x_i in x_test_raw])

# Plot audio feature (same sample as above, after resizing)
idx = 1205
plt.figure(figsize=(5, 3))
plt.title(f'Mel-Spectrogram of audio: {df_train["word"][idx]}', fontweight='bold')
# 'lower' puts row 0 at the bottom; the old spelling 'low' is rejected by current matplotlib
plt.imshow(x_train[idx], aspect='auto', origin='lower', cmap='inferno')
plt.grid(lw=0.4, c='w', alpha=0.4)
plt.show()

# Play audio (last expression of the cell, so the player is rendered)
ipd.Audio(audio_train[idx], rate=SAMPLE_RATE)

In [None]:
# Shuffle data
# One permutation per split is drawn with a fixed seed and then applied to every
# array of that split, so annotations, audio, features and labels stay aligned.
# The test arrays are not touched by this cell.
# NOTE: the cell is not idempotent -- running it again permutes the already
# shuffled data once more (still consistently across the arrays).

# Permutation of the training sample positions
tShuff = list(range(x_train.shape[0]))
np.random.seed(0)
np.random.shuffle(tShuff)

# Permutation of the validation sample positions (seed reset, so it is reproducible on its own)
vShuff = list(range(x_validation.shape[0]))
np.random.seed(0)
np.random.shuffle(vShuff)

# Reorder the annotation rows, then renumber the index 0..n-1 so that
# positional lookups such as df_train["word"][idx] follow the new order
df_train = df_train.iloc[tShuff,:]
df_train.index = list(range(x_train.shape[0]))

df_validation = df_validation.iloc[vShuff,:]
df_validation.index = list(range(x_validation.shape[0]))

# Training waveforms (no validation audio is loaded in this notebook)
audio_train = audio_train[tShuff,:]

# Training features (raw and resized) and labels
x_train_raw = x_train_raw[tShuff,:]
x_train = x_train[tShuff,:,:]
y_train = y_train[tShuff]

# Validation features (raw and resized) and labels
x_validation_raw = x_validation_raw[vShuff,:]
x_validation = x_validation[vShuff,:,:]
y_validation = y_validation[vShuff]

In [None]:
# Matrix features representation: keep handles on the resized 2-D arrays
# under separate names (x_train & co. are rebound to flattened arrays afterwards)
x_train_mat, x_validation_mat, x_test_mat = x_train, x_validation, x_test

In [None]:
# Flatten the features: one row per sample, all remaining axes merged into one
x_train = x_train.reshape(len(x_train), -1)
x_validation = x_validation.reshape(len(x_validation), -1)
x_test = x_test.reshape(len(x_test), -1)
print(f'Features dimension size: {x_train.shape[-1]}')

In [None]:
# Hard-coded integer class labels of the test samples, passed as Y_test to the
# evaluation functions below.
# NOTE(review): presumably listed in the same order as the rows of test_feat.npy and
# encoded as positions in `labels` -- the origin of these values is not visible here; confirm.
y_test = np.array([7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                   4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
                   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                   6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
                   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
                   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
                   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Import augmented data

In [None]:
'''
from numpy import load

x_train_aug_mat = load(os.path.join(DATA_BASE_FOLDER, 'x_train_aug_mat.npy'))
x_train_aug = load(os.path.join(DATA_BASE_FOLDER, 'x_train_aug.npy'))
y_train_aug = load(os.path.join(DATA_BASE_FOLDER, 'y_train_aug.npy'))
x_train_new = load(os.path.join(DATA_BASE_FOLDER, 'x_train_new.npy'))
'''

# Model

Here you have to implement a model (or more models, for finding the most accurate) for classification.

You can use the [`sklearn`](https://scikit-learn.org/stable/) (or optionally other more advanced frameworks such as [`pytorch`](https://pytorch.org/) or [`tensorflow`](https://www.tensorflow.org/)) package that contains a pool of models already implemented that perform classification. (SVMs, NNs, LR, kNN, ...)

### Some ideas for multiclass learning

* One VS One / One VS All binary classifiers using LR / SVM
* decision tree / random forest 
* Neural Network

### Some ideas for performance improvement

* more data $\rightarrow$ data augmentation

* more complex models or ensembles

* hyperparameter tuning (e.g. vector filter on frequency, ...)

* data transformation / scaling

* feature selection / engineering

In [None]:
# func to print performance scores

def print_results(y_true, y_pred):
    """Print a classification summary for one set of predictions.

    The output contains, in order: the confusion matrix, the accuracy,
    precision / recall / F1 under micro, macro and weighted averaging, and
    scikit-learn's classification report with rows named after the
    module-level ``labels`` list.

    Args:
        y_true: ground-truth class indices.
        y_pred: predicted class indices.
    """
    print('Confusion Matrix\n')
    print(confusion_matrix(y_true, y_pred))

    print('\nAccuracy: {:.4f}\n'.format(accuracy_score(y_true, y_pred)))

    for averaging in ('micro', 'macro', 'weighted'):
        title = averaging.capitalize()
        # A blank line closes each group except the last one, which is
        # followed directly by the report header.
        group_end = '' if averaging == 'weighted' else '\n'
        print(f'{title} Precision: {precision_score(y_true, y_pred, average=averaging):.4f}')
        print(f'{title} Recall: {recall_score(y_true, y_pred, average=averaging):.4f}')
        print(f'{title} F1-score: {f1_score(y_true, y_pred, average=averaging):.4f}{group_end}')

    print('\nClassification Report\n')
    print(classification_report(y_true, y_pred, target_names=labels))

### Logistic Regression OVR

In [None]:
def logreg_ovr(X_train, Y_train, X_valid, Y_valid, X_test, Y_test):
    """Fit a logistic regression on the training split and score all three splits.

    NOTE(review): the classifier is built with ``multi_class='auto'``; whether
    that yields a one-vs-rest model, as the function name suggests, depends on
    the solver and the scikit-learn version -- confirm.

    Args:
        X_train, Y_train: training features and labels (used for fitting).
        X_valid, Y_valid: validation features and labels.
        X_test, Y_test: test features and labels.

    Returns:
        Tuple ``(Y_train_pred, Y_valid_pred, Y_test_pred)`` with the predicted
        labels of each split. The three accuracies are printed as a side effect.
    """
    classifier = LogisticRegression(random_state=0, max_iter=50000, multi_class='auto')
    classifier.fit(X_train, Y_train)

    # Predict the three splits in order: train, validation, test
    Y_train_pred, Y_valid_pred, Y_test_pred = (
        classifier.predict(features) for features in (X_train, X_valid, X_test)
    )

    train_accuracy = metrics.accuracy_score(Y_train, Y_train_pred)
    valid_accuracy = metrics.accuracy_score(Y_valid, Y_valid_pred)
    test_accuracy = metrics.accuracy_score(Y_test, Y_test_pred)
    print(f'Train Accuracy: {train_accuracy:.4f}')
    print(f'Valid Accuracy: {valid_accuracy:.4f}')
    print(f'Test  Accuracy: {test_accuracy:.4f}')

    return (Y_train_pred, Y_valid_pred, Y_test_pred)

In [None]:
# Results on original data
# (the returned predictions can be inspected with print_results in the cell further down)
y_train_pred, y_valid_pred, y_test_pred = logreg_ovr(x_train, y_train, x_validation, y_validation, x_test, y_test)

In [None]:
# Results on augmented data
#y_train_pred, y_valid_pred, y_test_pred = logreg_ovr(x_train_aug, y_train_aug, x_validation, y_validation, x_test, y_test)

In [None]:
# print_results(y_train, y_train_pred)
# print_results(y_validation, y_valid_pred)
# print_results(y_test, y_test_pred)

### SVM OVR

In [None]:
def svm_ovr(X_train, Y_train, X_valid, Y_valid, X_test, Y_test):
    """Fit a normalised RBF Nu-SVM on the training split and score all three splits.

    The model is a pipeline: each sample is first rescaled by ``Normalizer`` and
    then classified by ``NuSVC`` with ``decision_function_shape='ovr'``.

    Args:
        X_train, Y_train: training features and labels (used for fitting).
        X_valid, Y_valid: validation features and labels.
        X_test, Y_test: test features and labels.

    Returns:
        Tuple ``(Y_train_pred, Y_valid_pred, Y_test_pred)`` with the predicted
        labels of each split. The three accuracies are printed as a side effect.
    """
    normaliser = preprocessing.Normalizer()
    classifier = svm.NuSVC(kernel='rbf', decision_function_shape='ovr')
    pipeline = make_pipeline(normaliser, classifier)
    pipeline.fit(X_train, Y_train)

    # Predict the three splits in order: train, validation, test
    Y_train_pred, Y_valid_pred, Y_test_pred = (
        pipeline.predict(features) for features in (X_train, X_valid, X_test)
    )

    train_accuracy = metrics.accuracy_score(Y_train, Y_train_pred)
    valid_accuracy = metrics.accuracy_score(Y_valid, Y_valid_pred)
    test_accuracy = metrics.accuracy_score(Y_test, Y_test_pred)
    print(f'Train Accuracy: {train_accuracy:.4f}')
    print(f'Valid Accuracy: {valid_accuracy:.4f}')
    print(f'Test  Accuracy: {test_accuracy:.4f}')

    return (Y_train_pred, Y_valid_pred, Y_test_pred)

In [None]:
# Results on original data
# (overwrites the prediction variables produced by the previous model)
y_train_pred, y_valid_pred, y_test_pred = svm_ovr(x_train, y_train, x_validation, y_validation, x_test, y_test)

In [None]:
# Results on augmented data
# y_train_pred, y_valid_pred, y_test_pred = svm_ovr(x_train_aug, y_train_aug, x_validation, y_validation, x_test, y_test)

In [None]:
# print_results(y_train, y_train_pred)
# print_results(y_validation, y_valid_pred)
# print_results(y_test, y_test_pred)

### Random Forest


In [None]:
def randfor(X_train, Y_train, X_valid, Y_valid, X_test, Y_test):
    """Fit a random forest on the training split and score all three splits.

    The forest uses the entropy criterion, trees limited to depth 10 and a
    fixed random state.

    Args:
        X_train, Y_train: training features and labels (used for fitting).
        X_valid, Y_valid: validation features and labels.
        X_test, Y_test: test features and labels.

    Returns:
        Tuple ``(Y_train_pred, Y_valid_pred, Y_test_pred)`` with the predicted
        labels of each split. The three accuracies are printed as a side effect.
    """
    forest = RandomForestClassifier(criterion='entropy', random_state=0, max_depth=10)
    forest.fit(X_train, Y_train)

    # Predict the three splits in order: train, validation, test
    Y_train_pred, Y_valid_pred, Y_test_pred = (
        forest.predict(features) for features in (X_train, X_valid, X_test)
    )

    train_accuracy = metrics.accuracy_score(Y_train, Y_train_pred)
    valid_accuracy = metrics.accuracy_score(Y_valid, Y_valid_pred)
    test_accuracy = metrics.accuracy_score(Y_test, Y_test_pred)
    print(f'Train Accuracy: {train_accuracy:.4f}')
    print(f'Valid Accuracy: {valid_accuracy:.4f}')
    print(f'Test  Accuracy: {test_accuracy:.4f}')

    return (Y_train_pred, Y_valid_pred, Y_test_pred)

In [None]:
# Results on original data
# (overwrites the prediction variables produced by the previous model)
y_train_pred, y_valid_pred, y_test_pred = randfor(x_train, y_train, x_validation, y_validation, x_test, y_test)

In [None]:
# Results on augmented data
# y_train_pred, y_valid_pred, y_test_pred = randfor(x_train_aug, y_train_aug, x_validation, y_validation, x_test, y_test)

In [None]:
# print_results(y_train, y_train_pred)
# print_results(y_validation, y_valid_pred)
# print_results(y_test, y_test_pred)

## Models comparison

In [None]:
# Unfitted estimators for the cross-validated comparison below. Each one is
# configured exactly like the model built inside logreg_ovr / svm_ovr / randfor.
LogisticRegressionOVR = LogisticRegression(random_state=0, max_iter=50000 ,multi_class='auto')

# Per-sample normalisation followed by an RBF Nu-SVM
SVMOVR = make_pipeline(preprocessing.Normalizer(), svm.NuSVC(kernel='rbf', decision_function_shape='ovr'))

RandForest = RandomForestClassifier(criterion='entropy', random_state=0, max_depth=10)

In [None]:
# prepare models: (display name, estimator) pairs to be compared
models = [
    # ('LR', LogisticRegressionOVR),
    ('SVM', SVMOVR),
    ('RF', RandForest),
]

# evaluate each model in turn
results, names = [], []

def model_comparison(models, k, X, Y):
    """Compare estimators with stratified k-fold cross-validation.

    For every model the per-fold accuracies are computed, their mean and
    standard deviation are printed, and a box plot of all models is drawn,
    saved to ``cv_models.pdf`` and shown.

    Args:
        models: iterable of ``(name, estimator)`` pairs.
        k: number of cross-validation folds.
        X: feature matrix.
        Y: target labels.

    Returns:
        List with one array of per-fold accuracies per model, in the order
        of ``models``.
    """
    # Scores are collected locally: appending to module-level lists made every
    # re-run of the calling cell pile results of earlier calls into the plot.
    results = []
    names = []
    for name, model in models:
        skfold = StratifiedKFold(n_splits=k, random_state=1234, shuffle=True)
        cv_results = cross_val_score(model, X, Y, cv=skfold, scoring='accuracy')
        results.append(cv_results)
        names.append(name)
        print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

    # boxplot algorithm comparison
    fig = plt.figure()
    fig.suptitle('Models Comparison')
    ax = fig.add_subplot(111)
    ax.boxplot(results)
    ax.set_xticklabels(names)
    # Save before showing: after plt.show() the current figure may already be
    # released, in which case savefig would write an empty page.
    fig.savefig('cv_models.pdf', bbox_inches='tight')
    plt.show()

    return results

In [None]:
# Results on original data (flattened training features and their labels)
model_comparison(models, 5, x_train, y_train)

In [None]:
# Results on augmented data
# model_comparison(models, 5, x_train_aug, y_train_aug)

# Neural Networks

In [None]:
def plot_loss(history):
    """Plot training and validation loss per epoch.

    Args:
        history: object exposing ``epoch`` and a ``history`` dict with the
            keys ``'loss'`` and ``'val_loss'`` (as returned by ``model.fit``).
    """
    plt.figure(figsize=(8, 5))
    for curve in ('loss', 'val_loss'):
        plt.plot(history.epoch, history.history[curve], label=curve)
    plt.legend()
    plt.grid(True)
    plt.title('loss')


def plot_accuracy(history):
    """Plot training and validation accuracy per epoch.

    Args:
        history: object exposing ``epoch`` and a ``history`` dict with the
            keys ``'accuracy'`` and ``'val_accuracy'`` (as returned by ``model.fit``).
    """
    plt.figure(figsize=(8, 5))
    for curve in ('accuracy', 'val_accuracy'):
        plt.plot(history.epoch, history.history[curve], label=curve)
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.title('accuracy')

**Input reshape for 1D NN**

In [None]:
# Reshape input (original data): every sample becomes a 1024-long sequence with one channel
x_train_1D = x_train_mat.reshape(len(x_train_mat), 1024, 1)
x_validation_1D = x_validation_mat.reshape(len(x_validation_mat), 1024, 1)
x_test_1D = x_test_mat.reshape(len(x_test_mat), 1024, 1)


In [None]:
# Reshape input (augmented data)
# x_train_1D_aug = x_train_aug.reshape(x_train_aug.shape[0], 1024, 1)

**Input reshape for 2D NN**

In [None]:
# Reshape input: every sample becomes a 32x32 image with a single channel
x_train_2D = x_train.reshape(len(x_train), 32, 32, 1)
x_validation_2D = x_validation.reshape(len(x_validation), 32, 32, 1)
x_test_2D = x_test.reshape(len(x_test), 32, 32, 1)


In [None]:
# Reshape input (augmented data)
# x_train_2D_aug = x_train_aug.reshape(x_train_aug.shape[0], 32, 32, 1)

### DNN

#### Hyper parameters tuning

In [None]:
# create a function that creates the model (required for KerasClassifier) 
# Input: the hyperparameters we want to tune 
# Output: DNN model

def create_dnn(n_units = 64):
    """Build and compile the fully connected network.

    The input of shape ``[1024, 1]`` is flattened and passed through five
    L2-regularised ReLU layers of widths 4x, 4x, 2x, 2x and 1x ``n_units``,
    followed by an 8-way softmax output.

    Args:
        n_units: base width of the hidden layers.

    Returns:
        The compiled Keras model (Adam optimiser, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    layers = [keras.layers.Flatten(input_shape=[1024, 1])]
    # Hidden widths expressed as multiples of n_units, input side first
    for factor in (4, 4, 2, 2, 1):
        layers.append(
            keras.layers.Dense(n_units * factor, activation="relu",
                               kernel_regularizer=tf.keras.regularizers.l2(0.01))
        )
    layers.append(keras.layers.Dense(8, activation="softmax"))

    dnn_model = keras.models.Sequential(layers)
    dnn_model.compile(optimizer='adam',
                      loss="sparse_categorical_crossentropy",
                      metrics=["accuracy"])
    return dnn_model

In [None]:
# Wrap the model builder in keras.wrappers.scikit_learn.KerasRegressor so that
# scikit-learn can drive it as an estimator.
keras_reg_dnn = keras.wrappers.scikit_learn.KerasRegressor(create_dnn)

# Grid of values explored by the search
params_dnn = dict(
    n_units=[32, 64, 128],
    batch_size=[128, 256],
    epochs=[50, 100],
)


# Custom accuracy scorer, defined in order to avoid the error:
# "Classification metrics can't handle a mix of multilabel-indicator and continuous-multioutput targets"
def acc(y_true, y_pred):
    """Accuracy of the arg-max class of ``y_pred`` (one row of scores per sample) against ``y_true``.

    The score is also printed, so every fold of the search leaves a trace.
    """
    predicted_classes = np.argmax(y_pred, axis=1)
    score = accuracy_score(predicted_classes, y_true)
    print(f'score is {score}')
    return score
acc_score = make_scorer(acc)


# Stratified 5-fold CV used inside the grid search
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

grid_search = GridSearchCV(keras_reg_dnn, params_dnn, cv=inner_cv, scoring=acc_score)

In [None]:
# grid search
# uncomment next lines to run
'''
grid_result = grid_search.fit(x_train_1D, y_train, verbose=False)

# print results
print(f'\nBest Accuracy for {grid_result.best_score_} using {grid_result.best_params_}')
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f' mean={mean:.4}, std={stdev:.4} using {param}')
'''

In [None]:
'''
print(grid_search.best_params_)

# evaluate the best model training on entire training set and validating on validation set
model = grid_search.best_estimator_.model

#model.evaluate(x_train_1D, y_train)
#model.evaluate(x_validation_1D, y_validation)
#model.evaluate(x_test_1D, y_test)
'''

#### Best model and early stopping

In [None]:
def dnn(X_train, Y_train, X_valid, Y_valid, X_test, Y_test,
        n_units, epochs, batch_size):
    """Train the dense network with early stopping and score all three splits.

    Side effects: TensorBoard logs are written under ``./my_logs``, the
    checkpoint ``my_model_dnn.h5`` is saved, the loss and accuracy curves are
    plotted and the three accuracies are printed.

    Args:
        X_train, Y_train: training inputs and labels.
        X_valid, Y_valid: validation inputs and labels (monitored during training).
        X_test, Y_test: test inputs and labels.
        n_units: base layer width forwarded to ``create_dnn``.
        epochs: maximum number of training epochs.
        batch_size: mini-batch size.

    Returns:
        Tuple ``(Y_train_pred, Y_valid_pred, Y_test_pred)`` of predicted class indices.
    """
    network = create_dnn(n_units)

    # Directory receiving the TensorBoard logs
    tensorboard_dir = os.path.join(os.curdir, "my_logs")

    # Early stopping and checkpointing are handled through Keras callbacks.
    training_callbacks = [
        keras.callbacks.TensorBoard(tensorboard_dir),
        keras.callbacks.EarlyStopping(patience=5),
        # save_best_only=True: the file is only overwritten when the model improves
        keras.callbacks.ModelCheckpoint("my_model_dnn.h5", save_best_only=True),
    ]

    history = network.fit(X_train, Y_train,
                          validation_data=(X_valid, Y_valid),
                          epochs=epochs, batch_size=batch_size,
                          callbacks=training_callbacks,
                          verbose=1)

    # NOTE(review): the checkpoint is written but not reloaded here, so the
    # predictions below come from the model as it is when fit() returns --
    # confirm this is intended.
    #network = keras.models.load_model("my_model_dnn.h5")

    plot_loss(history)
    plot_accuracy(history)

    # Class index = position of the largest softmax output
    Y_train_pred, Y_valid_pred, Y_test_pred = (
        np.argmax(network.predict(inputs), axis=1)
        for inputs in (X_train, X_valid, X_test)
    )

    train_accuracy = accuracy_score(Y_train, Y_train_pred)
    valid_accuracy = accuracy_score(Y_valid, Y_valid_pred)
    test_accuracy = accuracy_score(Y_test, Y_test_pred)
    print(f'Train Accuracy: {train_accuracy:.4f}')
    print(f'Valid Accuracy: {valid_accuracy:.4f}')
    print(f'Test  Accuracy: {test_accuracy:.4f}')

    return (Y_train_pred, Y_valid_pred, Y_test_pred)

In [None]:
# best model
# NOTE: these assignments rebind the notebook-level names n_units / epochs /
# batch_size, which later cells set again for their own models.
n_units = 64
epochs = 100
batch_size = 128

# Train on the 1-D inputs and keep the predictions of the three splits
y_train_pred, y_valid_pred, y_test_pred = dnn(x_train_1D, y_train,
                                              x_validation_1D, y_validation,
                                              x_test_1D, y_test,
                                              n_units, epochs, batch_size)

# print_results(y_train, y_train_pred)
# print_results(y_validation, y_valid_pred)
# print_results(y_test, y_test_pred)

In [None]:
# best model on augmented data

'''
n_units = 64
epochs = 100
batch_size = 128

y_train_pred, y_valid_pred, y_test_pred = dnn(x_train_1D_aug, y_train_aug,
                                              x_validation_1D, y_validation,
                                              x_test_1D, y_test,
                                              n_units, epochs, batch_size)


# print_results(y_train_aug, y_train_pred)
# print_results(y_validation, y_valid_pred)
# print_results(y_test, y_test_pred)
'''

### CNN 1D

##### Hyper parameters tuning


In [None]:
# create a function that creates the model (required for KerasClassifier) 
# Input: the hyperparameters we want to tune 
# Output: CNN model

def create_cnn1(filters = 32, kernel_size = 3, pool_size=2):
    """Build and compile the 1-D convolutional network.

    Three Conv1D + MaxPool1D stages with 1x, 2x and 4x ``filters`` channels
    are applied to an input of shape ``[1024, 1]``; the result is flattened
    and fed to an 8-way softmax layer.

    Args:
        filters: number of filters of the first convolution.
        kernel_size: kernel size shared by all convolutions.
        pool_size: pool size shared by all max-pooling layers.

    Returns:
        The compiled Keras model (Adam optimiser, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    layers = []
    for stage, multiplier in enumerate((1, 2, 4)):
        conv_options = dict(filters=multiplier * filters, kernel_size=kernel_size,
                            padding="same", activation="relu")
        if stage == 0:
            # only the first layer declares the input shape
            conv_options["input_shape"] = [1024, 1]
        layers.append(keras.layers.Conv1D(**conv_options))
        layers.append(keras.layers.MaxPool1D(pool_size=pool_size))
    layers.append(keras.layers.Flatten())
    layers.append(keras.layers.Dense(8, activation="softmax"))

    cnn1_model = keras.models.Sequential(layers)
    cnn1_model.compile(optimizer='adam',
                       loss="sparse_categorical_crossentropy",
                       metrics=["accuracy"])
    return cnn1_model

In [None]:
# Wrap the model builder in keras.wrappers.scikit_learn.KerasRegressor so that
# scikit-learn can drive it as an estimator.
keras_reg_cnn1 = keras.wrappers.scikit_learn.KerasRegressor(create_cnn1)

# Grid of values explored by the search
params_cnn1 = dict(
    filters=[32, 64],
    kernel_size=[3, 5],
    pool_size=[2, 3],
    batch_size=[128, 256],
    epochs=[30],
)

# Custom accuracy scorer, defined in order to avoid the error:
# "Classification metrics can't handle a mix of multilabel-indicator and continuous-multioutput targets"
def acc(y_true, y_pred):
    """Accuracy of the arg-max class of ``y_pred`` (one row of scores per sample) against ``y_true``.

    The score is also printed, so every fold of the search leaves a trace.
    """
    predicted_classes = np.argmax(y_pred, axis=1)
    score = accuracy_score(predicted_classes, y_true)
    print(f'score is {score}')
    return score
acc_score = make_scorer(acc)


# Stratified 5-fold CV used inside the grid search
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

grid_search = GridSearchCV(keras_reg_cnn1, params_cnn1, cv=inner_cv, scoring=acc_score)

In [None]:
# grid search
# uncomment next lines to run
'''
grid_result = grid_search.fit(x_train_1D, y_train, epochs=10, verbose=False)

# print results
print(f'Best Accuracy for {grid_result.best_score_} using {grid_result.best_params_}')
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f' mean={mean:.4}, std={stdev:.4} using {param}')
'''

In [None]:
'''
print(grid_search.best_params_)

# evaluate the best model training on entire training set and validating on validation set
model = grid_search.best_estimator_.model

#model.evaluate(x_train_1D, y_train)
#model.evaluate(x_validation_1D, y_validation)
#model.evaluate(x_test_1D, y_test)
'''

##### Best model and early stopping

In [None]:
def cnn1(X_train, Y_train, X_valid, Y_valid, X_test, Y_test,
         filters, kernel_size, pool_size, epochs, batch_size):
    """Train the 1-D CNN with early stopping and score all three splits.

    Side effects: TensorBoard logs are written under ``./my_logs``, the
    checkpoint ``my_model_cnn1.h5`` is saved, the loss and accuracy curves are
    plotted and the three accuracies are printed.

    Args:
        X_train, Y_train: training inputs and labels.
        X_valid, Y_valid: validation inputs and labels (monitored during training).
        X_test, Y_test: test inputs and labels.
        filters, kernel_size, pool_size: forwarded to ``create_cnn1``.
        epochs: maximum number of training epochs.
        batch_size: mini-batch size.

    Returns:
        Tuple ``(Y_train_pred, Y_valid_pred, Y_test_pred)`` of predicted class indices.
    """
    network = create_cnn1(filters, kernel_size, pool_size)

    # Directory receiving the TensorBoard logs
    tensorboard_dir = os.path.join(os.curdir, "my_logs")

    # Early stopping and checkpointing are handled through Keras callbacks.
    training_callbacks = [
        keras.callbacks.TensorBoard(tensorboard_dir),
        keras.callbacks.EarlyStopping(patience=5),
        # save_best_only=True: the file is only overwritten when the model improves
        keras.callbacks.ModelCheckpoint("my_model_cnn1.h5", save_best_only=True),
    ]

    history = network.fit(X_train, Y_train,
                          validation_data=(X_valid, Y_valid),
                          epochs=epochs, batch_size=batch_size,
                          callbacks=training_callbacks,
                          verbose=1)

    # NOTE(review): the checkpoint is written but not reloaded here, so the
    # predictions below come from the model as it is when fit() returns --
    # confirm this is intended.
    #network = keras.models.load_model("my_model_cnn1.h5")

    plot_loss(history)
    plot_accuracy(history)

    # Class index = position of the largest softmax output
    Y_train_pred, Y_valid_pred, Y_test_pred = (
        np.argmax(network.predict(inputs), axis=1)
        for inputs in (X_train, X_valid, X_test)
    )

    train_accuracy = accuracy_score(Y_train, Y_train_pred)
    valid_accuracy = accuracy_score(Y_valid, Y_valid_pred)
    test_accuracy = accuracy_score(Y_test, Y_test_pred)
    print(f'Train Accuracy: {train_accuracy:.4f}')
    print(f'Valid Accuracy: {valid_accuracy:.4f}')
    print(f'Test  Accuracy: {test_accuracy:.4f}')

    return (Y_train_pred, Y_valid_pred, Y_test_pred)


In [None]:
# best model
# NOTE: these assignments rebind notebook-level names (filters, kernel_size,
# pool_size, epochs, batch_size) that other cells set for their own models.
filters = 32
kernel_size = 5
pool_size = 3
epochs = 100
batch_size = 128

# Train on the 1-D inputs and keep the predictions of the three splits
y_train_pred, y_valid_pred, y_test_pred = cnn1(x_train_1D, y_train,
                                               x_validation_1D, y_validation,
                                               x_test_1D, y_test,
                                               filters, kernel_size, pool_size,
                                               epochs, batch_size)

# print_results(y_train, y_train_pred)
# print_results(y_validation, y_valid_pred)
# print_results(y_test, y_test_pred)

In [None]:
# using same hyperparameter with augmented dataset
'''

filters = 32
kernel_size = 5
pool_size = 3
epochs = 100
batch_size = 128

y_train_pred, y_valid_pred, y_test_pred = cnn1(x_train_1D_aug, y_train_aug,
                                               x_validation_1D, y_validation,
                                               x_test_1D, y_test,
                                               filters, kernel_size, pool_size,
                                               epochs, batch_size)

# print_results(y_train_aug, y_train_pred)
# print_results(y_validation, y_valid_pred)
# print_results(y_test, y_test_pred)
'''

### CNN 2D

##### Hyper parameters tuning

In [None]:
# create a function that creates the model (required for KerasClassifier) 
# Input: the hyperparameters we want to tune 
# Output: CNN model

def create_cnn2(filters = 32, kernel_size = [3,3], pool_size=[2,2]):
    """Build and compile the 2-D convolutional network.

    Three Conv2D + MaxPool2D stages with 1x, 2x and 4x ``filters`` channels
    are applied to an input of shape ``[32, 32, 1]``; the result is flattened
    and fed to an 8-way softmax layer. The first two convolutions carry an L2
    kernel regulariser.

    Args:
        filters: number of filters of the first convolution.
        kernel_size: kernel size shared by all convolutions.
        pool_size: pool size shared by all max-pooling layers.

    Returns:
        The compiled Keras model (Adam optimiser, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    def conv(n_filters, **extra):
        # Convolution with the options common to every stage
        return keras.layers.Conv2D(filters=n_filters, kernel_size=kernel_size,
                                   padding="same", activation="relu", **extra)

    def pool():
        return keras.layers.MaxPool2D(pool_size=pool_size)

    model_cnn2 = keras.models.Sequential([
        conv(filters, kernel_regularizer=tf.keras.regularizers.l2(0.01), input_shape=[32, 32, 1]),
        pool(),
        conv(filters * 2, kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        pool(),
        # NOTE(review): unlike the first two, this convolution has no L2 regulariser -- confirm intended
        conv(filters * 4),
        pool(),
        keras.layers.Flatten(),
        keras.layers.Dense(8, activation="softmax"),
    ])

    model_cnn2.compile(optimizer='adam',
                       loss="sparse_categorical_crossentropy",
                       metrics=["accuracy"])

    return model_cnn2

In [None]:
# Wrap the model builder in keras.wrappers.scikit_learn.KerasRegressor so that
# scikit-learn can drive it as an estimator.
keras_reg_cnn2 = keras.wrappers.scikit_learn.KerasRegressor(create_cnn2)

# Grid of values explored by the search
params_cnn2 = dict(
    filters=[32, 64],
    kernel_size=[3, 5],
    pool_size=[2, 3],
    batch_size=[128, 256],
    epochs=[30],
)


# Custom accuracy scorer, defined in order to avoid the error:
# "Classification metrics can't handle a mix of multilabel-indicator and continuous-multioutput targets"
def acc(y_true, y_pred):
    """Accuracy of the arg-max class of ``y_pred`` (one row of scores per sample) against ``y_true``.

    The score is also printed, so every fold of the search leaves a trace.
    """
    predicted_classes = np.argmax(y_pred, axis=1)
    score = accuracy_score(predicted_classes, y_true)
    print(f'score is {score}')
    return score
acc_score = make_scorer(acc)


# Stratified 5-fold CV used inside the grid search
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

grid_search = GridSearchCV(keras_reg_cnn2, params_cnn2, cv=inner_cv, scoring=acc_score)

In [None]:
# grid search
# uncomment next lines to run
'''
grid_result = grid_search.fit(x_train_2D, y_train, epochs=40, verbose=False)

# print results
print(f'Best Accuracy for {grid_result.best_score_} using {grid_result.best_params_}')
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f' mean={mean:.4}, std={stdev:.4} using {param}')
'''

In [None]:
'''
print(grid_search.best_params_)

# evaluate the best model training on entire training set and validating on validation set
model = grid_search.best_estimator_.model

#model.evaluate(x_train_1D, y_train)
#model.evaluate(x_validation_1D, y_validation)
#model.evaluate(x_test_1D, y_test)
'''

##### Best model and early stopping

In [None]:
def cnn2(X_train, Y_train, X_valid, Y_valid, X_test, Y_test,
         filters, kernel_size, pool_size, epochs, batch_size):
    """Train the 2-D CNN with early stopping and score all three splits.

    After training, the checkpoint ``my_model_cnn2.h5`` is reloaded and used
    for the predictions.

    Side effects: TensorBoard logs are written under ``./my_logs``, the
    checkpoint file is saved, the loss and accuracy curves are plotted and the
    three accuracies are printed.

    Args:
        X_train, Y_train: training inputs and labels.
        X_valid, Y_valid: validation inputs and labels (monitored during training).
        X_test, Y_test: test inputs and labels.
        filters, kernel_size, pool_size: forwarded to ``create_cnn2``.
        epochs: maximum number of training epochs.
        batch_size: mini-batch size.

    Returns:
        Tuple ``(Y_train_pred, Y_valid_pred, Y_test_pred)`` of predicted class indices.
    """
    network = create_cnn2(filters, kernel_size, pool_size)

    # Directory receiving the TensorBoard logs
    tensorboard_dir = os.path.join(os.curdir, "my_logs")

    # Early stopping and checkpointing are handled through Keras callbacks.
    training_callbacks = [
        keras.callbacks.TensorBoard(tensorboard_dir),
        keras.callbacks.EarlyStopping(patience=5),
        # save_best_only=True: the file is only overwritten when the model improves
        keras.callbacks.ModelCheckpoint("my_model_cnn2.h5", save_best_only=True),
    ]

    history = network.fit(X_train, Y_train,
                          validation_data=(X_valid, Y_valid),
                          epochs=epochs, batch_size=batch_size,
                          callbacks=training_callbacks,
                          verbose=1)

    # Continue with the checkpointed model rather than the one left by fit()
    network = keras.models.load_model("my_model_cnn2.h5")

    plot_loss(history)
    plot_accuracy(history)

    # Class index = position of the largest softmax output
    Y_train_pred, Y_valid_pred, Y_test_pred = (
        np.argmax(network.predict(inputs), axis=1)
        for inputs in (X_train, X_valid, X_test)
    )

    train_accuracy = accuracy_score(Y_train, Y_train_pred)
    valid_accuracy = accuracy_score(Y_valid, Y_valid_pred)
    test_accuracy = accuracy_score(Y_test, Y_test_pred)
    print(f'Train Accuracy: {train_accuracy:.4f}')
    print(f'Valid Accuracy: {valid_accuracy:.4f}')
    print(f'Test  Accuracy: {test_accuracy:.4f}')

    return (Y_train_pred, Y_valid_pred, Y_test_pred)

In [None]:
# best model
# NOTE: these assignments rebind notebook-level names (filters, kernel_size,
# pool_size, epochs, batch_size) that other cells set for their own models.
filters = 64
kernel_size = 5
pool_size = 2
epochs = 30
batch_size = 128

# Train on the 2-D inputs and keep the predictions of the three splits
y_train_pred, y_valid_pred, y_test_pred = cnn2(x_train_2D, y_train,
                                               x_validation_2D, y_validation,
                                               x_test_2D, y_test,
                                               filters, kernel_size, pool_size,
                                               epochs, batch_size)

# print_results(y_train, y_train_pred)
# print_results(y_validation, y_valid_pred)
# print_results(y_test, y_test_pred)

In [None]:
# best model on augmented data

'''
filters = 64
kernel_size = 3
pool_size = 2
epochs = 30
batch_size = 128

y_train_pred, y_valid_pred, y_test_pred = cnn2(x_train_2D_aug, y_train_aug,
                                               x_validation_2D, y_validation,
                                               x_test_2D, y_test,
                                               filters, kernel_size, pool_size,
                                               epochs, batch_size)

# print_results(y_train_aug, y_train_pred)
# print_results(y_validation, y_valid_pred)
# print_results(y_test, y_test_pred)
'''

### CRNN 1D

##### Hyper parameters tuning

In [None]:
# Model factory for the 1D CRNN; it is passed as build function to the
# scikit-learn wrapper used for the grid search.
# Input: the hyperparameters we want to tune
# Output: compiled CRNN model

def create_rnn1(filters = 32, kernel_size = 3, pool_size=2, GRUunits = 32):
    """Build and compile the 1D convolutional-recurrent classifier.

    Three Conv1D + MaxPool1D stages (with `filters`, 2*`filters` and
    4*`filters` channels) feed a GRU, followed by an 8-way softmax layer.

    Args:
        filters: number of channels of the first convolution.
        kernel_size: kernel size shared by all convolutions.
        pool_size: pool size shared by all max-pooling layers.
        GRUunits: number of units of the GRU layer.

    Returns:
        A compiled `keras.models.Sequential` expecting inputs of shape (1024, 1).
    """
    layer_stack = []
    for stage, channels in enumerate((filters, filters*2, filters*4)):
        # only the very first layer declares the input shape
        first_layer_args = {"input_shape": [1024, 1]} if stage == 0 else {}
        layer_stack.append(
            keras.layers.Conv1D(filters=channels, kernel_size=kernel_size,
                                padding="same", activation="relu",
                                **first_layer_args))
        layer_stack.append(keras.layers.MaxPool1D(pool_size=pool_size))

    #keras.layers.Dropout(0.8),
    #keras.layers.Bidirectional(keras.layers.LSTM(16)),
    layer_stack.append(keras.layers.GRU(GRUunits))
    layer_stack.append(keras.layers.Flatten())
    layer_stack.append(keras.layers.Dense(8, activation="softmax"))

    model_rnn1 = keras.models.Sequential(layer_stack)
    model_rnn1.compile(
        loss="sparse_categorical_crossentropy",
        optimizer='adam',
        metrics=["accuracy"],
    )
    return model_rnn1

In [None]:
# Wrap the model factory in a scikit-learn compatible estimator.
# KerasRegressor is used here (not KerasClassifier), presumably so that
# predict() hands the raw per-class scores to the custom scorer below, which
# takes the argmax itself.
keras_reg_rnn1 = keras.wrappers.scikit_learn.KerasRegressor(create_rnn1)

# define the grid search parameters
# NOTE: this has to stay a plain dict -- a trailing comma after the closing
# brace would silently turn it into a 1-tuple.
params_rnn1 = {
    "filters": [32,64],
    "kernel_size": [3,5],
    "pool_size": [2,3],
    "GRUunits":[64],
    "batch_size":[128,256],
    "epochs":[30]}


# define accuracy score in order to avoid error:
# "Classification metrics can't handle a mix of multilabel-indicator and continuous-multioutput targets"
def acc(y_true, y_pred):
    """Accuracy between integer labels and per-class score predictions.

    Args:
        y_true: true class indices.
        y_pred: 2D array of per-class scores; the predicted class of each row
            is the argmax along axis 1.

    Returns:
        The accuracy as computed by `accuracy_score` (also printed).
    """
    score = accuracy_score(y_true, np.argmax(y_pred, axis = 1))
    print('score is {}'.format(score))
    return score
acc_score = make_scorer(acc)

# define StratifiedKFold to be used within GridSearch
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)


grid_search = GridSearchCV(keras_reg_rnn1, params_rnn1, cv=inner_cv, scoring=acc_score)

In [None]:
# grid search
# The search is disabled by wrapping it in a string literal; remove the triple
# quotes to run it (slow: every grid combination is trained once per CV fold).
# NOTE(review): fit() is called with epochs=40 while the parameter grid lists
# epochs=[30] -- confirm which value is meant to apply.

'''
grid_result = grid_search.fit(x_train_1D, y_train, epochs=40, verbose=False)

# print results
print(f'Best Accuracy for {grid_result.best_score_} using {grid_result.best_params_}')
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f' mean={mean:.4}, std={stdev:.4} using {param}')
'''

In [None]:
# Disabled (string literal): prints the best hyperparameters and grabs the
# underlying Keras model of the best estimator. Only meaningful after
# grid_search has been fitted.
'''
print(grid_search.best_params_)

# evaluate the best model training on entire training set and validating on validation set
model = grid_search.best_estimator_.model

#model.evaluate(x_train_1D, y_train)
#model.evaluate(x_validation_1D, y_validation)
#model.evaluate(x_test_1D, y_test)
'''

##### Best model and early stopping

In [None]:
def rnn1(X_train, Y_train, X_valid, Y_valid, X_test, Y_test,
         filters, kernel_size, pool_size, GRUunits, epochs, batch_size):
    """Train the 1D CRNN with early stopping and report accuracy on each split.

    Args:
        X_train, Y_train: training features and integer class labels.
        X_valid, Y_valid: validation data, monitored during training.
        X_test, Y_test: test data, used only for the final report.
        filters, kernel_size, pool_size, GRUunits: hyperparameters forwarded
            to `create_rnn1`.
        epochs: maximum number of training epochs.
        batch_size: mini-batch size.

    Returns:
        Tuple `(Y_train_pred, Y_valid_pred, Y_test_pred)` of predicted class
        indices for the three splits.
    """
    model_rnn1 = create_rnn1(filters, kernel_size, pool_size, GRUunits)

    # directory for the TensorBoard event files
    logdir = os.path.join(os.curdir, "my_logs")

    # in Keras, early stopping is managed through the callbacks argument.
    callbacks_rnn1 = [
        keras.callbacks.TensorBoard(logdir),
        # Stop after 5 epochs without improvement and roll the in-memory model
        # back to its best weights, so that the predictions below come from
        # the best epoch rather than from the last (already degraded) one.
        keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
        # Also keep the best model on disk so it can be reloaded later.
        keras.callbacks.ModelCheckpoint("my_model_rnn1.h5", save_best_only=True),
    ]

    history = model_rnn1.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,
                        validation_data=(X_valid, Y_valid),
                        callbacks=callbacks_rnn1,
                        verbose = 1)

    plot_loss(history)
    plot_accuracy(history)

    # predicted class = index of the largest softmax output
    Y_train_pred = np.argmax(model_rnn1.predict(X_train), axis=1)
    Y_valid_pred = np.argmax(model_rnn1.predict(X_valid), axis=1)
    Y_test_pred = np.argmax(model_rnn1.predict(X_test), axis=1)

    # print accuracy
    print(f'Train Accuracy: {accuracy_score(Y_train, Y_train_pred):.4f}')
    print(f'Valid Accuracy: {accuracy_score(Y_valid, Y_valid_pred):.4f}')
    print(f'Test  Accuracy: {accuracy_score(Y_test,  Y_test_pred):.4f}')

    return (Y_train_pred, Y_valid_pred, Y_test_pred)

In [None]:
# best model
# Hyperparameters of the selected 1D CRNN configuration.
filters, kernel_size, pool_size, GRUunits = 64, 5, 3, 64
epochs, batch_size = 100, 128

# Train on the 1D features; rnn1 returns the predicted class indices for the
# train, validation and test splits (in that order).
y_train_pred, y_valid_pred, y_test_pred = rnn1(
    x_train_1D, y_train,
    x_validation_1D, y_validation,
    x_test_1D, y_test,
    filters=filters, kernel_size=kernel_size, pool_size=pool_size,
    GRUunits=GRUunits, epochs=epochs, batch_size=batch_size,
)

# print_results(y_train, y_train_pred)
# print_results(y_validation, y_valid_pred)
# print_results(y_test, y_test_pred)

In [None]:
# using same hyperparameters on augmented data
# The run below is disabled by wrapping it in a string literal; remove the
# triple quotes to execute it. It reads x_train_1D_aug / y_train_aug, which
# are not defined in this cell and must exist beforehand.

'''
filters=64
kernel_size=5
pool_size=3
GRUunits=64
epochs = 100
batch_size = 128

y_train_pred, y_valid_pred, y_test_pred = rnn1(x_train_1D_aug, y_train_aug,
                                               x_validation_1D, y_validation,
                                               x_test_1D, y_test,
                                               filters, kernel_size, pool_size,
                                               GRUunits, epochs, batch_size)

# print_results(y_train_aug, y_train_pred)
# print_results(y_validation, y_valid_pred)
# print_results(y_test, y_test_pred)
'''

### CRNN 2D

##### Hyper parameters tuning

In [None]:
# Model factory for the 2D CRNN; it is passed as build function to the
# scikit-learn wrapper used for the grid search.
# Input: the hyperparameters we want to tune
# Output: compiled CRNN model

def create_rnn2(filters = 32, kernel_size = 3, pool_size=2, GRUunits=32):
    """Build and compile the 2D convolutional-recurrent classifier.

    Three L2-regularised Conv2D + MaxPool2D stages (with `filters`,
    2*`filters` and 4*`filters` channels) are followed by a reshape to a
    sequence, a GRU, a 32-unit dense layer and an 8-way softmax layer.

    Args:
        filters: number of channels of the first convolution.
        kernel_size: kernel size shared by all convolutions.
        pool_size: pool size shared by all max-pooling layers.
        GRUunits: number of units of the GRU layer.

    Returns:
        A compiled `keras.models.Sequential` expecting inputs of shape (32, 32, 1).
    """
    # define model
    model_rnn2 = keras.models.Sequential([
        keras.layers.Conv2D(filters=filters, kernel_size=kernel_size, padding="same", activation="relu",
                        kernel_regularizer = tf.keras.regularizers.l2(0.01), input_shape=[32,32,1]),
        #keras.layers.BatchNormalization(),
        keras.layers.MaxPool2D(pool_size=pool_size),
        keras.layers.Conv2D(filters=filters*2, kernel_size=kernel_size, padding="same", activation="relu",
                        kernel_regularizer = tf.keras.regularizers.l2(0.01)),
        #keras.layers.BatchNormalization(),
        keras.layers.MaxPool2D(pool_size=pool_size),
        keras.layers.Conv2D(filters=filters*4, kernel_size=kernel_size, padding="same", activation="relu",
                        kernel_regularizer = tf.keras.regularizers.l2(0.01)),
        #keras.layers.BatchNormalization(),
        keras.layers.MaxPool2D(pool_size=pool_size),
        # Collapse the two spatial axes into one sequence axis for the GRU.
        # The sequence length depends on pool_size; -1 lets Keras infer it
        # from the layer input instead of reading it off a notebook global.
        keras.layers.Reshape((-1, filters*4)),
        keras.layers.GRU(GRUunits),
        #keras.layers.Flatten(),
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dense(8, activation="softmax")])

    model_rnn2.compile(loss="sparse_categorical_crossentropy",
              optimizer='adam',
              metrics=["accuracy"])

    return model_rnn2

In [None]:
# Scikit-learn compatible estimator built around the create_rnn2 factory
# (a keras.wrappers.scikit_learn.KerasRegressor).
keras_reg_rnn2 = keras.wrappers.scikit_learn.KerasRegressor(create_rnn2)

# hyperparameter grid explored by the search
params_rnn2 = dict(
    filters=[16, 32, 64],
    kernel_size=[3, 5],
    pool_size=[2, 3],
    GRUunits=[16, 32],
    batch_size=[128, 256],
    epochs=[30],
)


# Custom accuracy scorer, needed to avoid the error:
# "Classification metrics can't handle a mix of multilabel-indicator and continuous-multioutput targets"
def acc(y_true, y_pred):
    """Accuracy between true class indices and the argmax of per-class scores (also printed)."""
    predicted_labels = np.argmax(y_pred, axis=1)
    score = accuracy_score(y_true, predicted_labels)
    print('score is {}'.format(score))
    return score


acc_score = make_scorer(acc)

# stratified 5-fold split (shuffled, fixed seed) used inside the grid search
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1234)

grid_search = GridSearchCV(
    estimator=keras_reg_rnn2,
    param_grid=params_rnn2,
    cv=inner_cv,
    scoring=acc_score,
)

In [None]:
# grid search
# The search is disabled by wrapping it in a string literal; remove the triple
# quotes to run it (slow: every grid combination is trained once per CV fold).
# NOTE(review): fit() is called with epochs=40 while the parameter grid lists
# epochs=[30] -- confirm which value is meant to apply.
'''
grid_result = grid_search.fit(x_train_2D, y_train, epochs=40, verbose=False)

# print results
print(f'Best Accuracy for {grid_result.best_score_} using {grid_result.best_params_}')
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f' mean={mean:.4}, std={stdev:.4} using {param}')
'''

In [None]:
# Disabled (string literal): prints the best hyperparameters and grabs the
# underlying Keras model of the best estimator. Only meaningful after
# grid_search has been fitted.
'''
print(grid_search.best_params_)

# evaluate the best model training on entire training set and validating on validation set
model = grid_search.best_estimator_.model

#model.evaluate(x_train_2D, y_train)
#model.evaluate(x_validation_2D, y_validation)
#model.evaluate(x_test_2D, y_test)
'''

##### Best model and early stopping

In [None]:
def rnn2(X_train, Y_train, X_valid, Y_valid, X_test, Y_test,
         filters, kernel_size, pool_size, GRUunits, epochs, batch_size):
    """Train the 2D CRNN with early stopping and report accuracy on each split.

    Args:
        X_train, Y_train: training features and integer class labels.
        X_valid, Y_valid: validation data, monitored during training.
        X_test, Y_test: test data, used only for the final report.
        filters, kernel_size, pool_size, GRUunits: hyperparameters forwarded
            to `create_rnn2`.
        epochs: maximum number of training epochs.
        batch_size: mini-batch size.

    Returns:
        Tuple `(Y_train_pred, Y_valid_pred, Y_test_pred)` of predicted class
        indices for the three splits.
    """
    model_rnn2 = create_rnn2(filters, kernel_size, pool_size, GRUunits)

    # directory for the TensorBoard event files
    logdir = os.path.join(os.curdir, "my_logs")

    # in Keras, early stopping is managed through the callbacks argument.
    callbacks_rnn2 = [
        keras.callbacks.TensorBoard(logdir),
        # Stop after 5 epochs without improvement and roll the in-memory model
        # back to its best weights, so that the predictions below come from
        # the best epoch rather than from the last (already degraded) one.
        keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True),
        # Also keep the best model on disk so it can be reloaded later.
        keras.callbacks.ModelCheckpoint("my_model_rnn2.h5", save_best_only=True),
    ]

    history = model_rnn2.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,
                        validation_data=(X_valid, Y_valid),
                        callbacks=callbacks_rnn2,
                        verbose = 1)

    plot_loss(history)
    plot_accuracy(history)

    # predicted class = index of the largest softmax output
    Y_train_pred = np.argmax(model_rnn2.predict(X_train), axis=1)
    Y_valid_pred = np.argmax(model_rnn2.predict(X_valid), axis=1)
    Y_test_pred = np.argmax(model_rnn2.predict(X_test), axis=1)

    # print accuracy
    print(f'Train Accuracy: {accuracy_score(Y_train, Y_train_pred):.4f}')
    print(f'Valid Accuracy: {accuracy_score(Y_valid, Y_valid_pred):.4f}')
    print(f'Test  Accuracy: {accuracy_score(Y_test,  Y_test_pred):.4f}')

    return (Y_train_pred, Y_valid_pred, Y_test_pred)

In [None]:
# best model
# Hyperparameters of the selected 2D CRNN configuration.
filters, kernel_size, pool_size = 32, 5, 3
GRUunits = 64  # 16
epochs, batch_size = 50, 128

# Train on the 2D features; rnn2 returns the predicted class indices for the
# train, validation and test splits (in that order).
y_train_pred, y_valid_pred, y_test_pred = rnn2(
    x_train_2D, y_train,
    x_validation_2D, y_validation,
    x_test_2D, y_test,
    filters=filters, kernel_size=kernel_size, pool_size=pool_size,
    GRUunits=GRUunits, epochs=epochs, batch_size=batch_size,
)

# print_results(y_train, y_train_pred)
# print_results(y_validation, y_valid_pred)
# print_results(y_test, y_test_pred)

In [None]:
# best model on augmented data (hyperparameters found with grid search on augmented)
# The run below is disabled by wrapping it in a string literal; remove the
# triple quotes to execute it. It reads x_train_2D_aug / y_train_aug, which
# are not defined in this cell and must exist beforehand.
'''
filters = 32
kernel_size = 5
pool_size = 3
GRUunits = 64
epochs = 50
batch_size = 128

y_train_pred, y_valid_pred, y_test_pred = rnn2(x_train_2D_aug, y_train_aug,
                                               x_validation_2D, y_validation,
                                               x_test_2D, y_test,
                                               filters, kernel_size, pool_size,
                                               GRUunits, epochs, batch_size)

# print_results(y_train_aug, y_train_pred)
# print_results(y_validation, y_valid_pred)
# print_results(y_test, y_test_pred)
'''

# Data augmentation

* time shift
* time masking
* white noise addition
* frequency masking
* mix combinations in a function


## Time shift 
Shift the spectrogram to the left or to the right by 1 to 5 time slots, filling the vacated slots with silence (a log-mel value of -9.21). Recall that each slot corresponds to 4 msec.

In [None]:
#time shift
idx=1006

# Start from an all-silence 32x32 matrix (-9.21 is the fill value used for
# silence in this section) and copy the sample in, moved 3 time slots to the left.
shifted=-9.21*np.ones([32,32])
shifted[:,:29]=x_train_mat[idx][:,3:].copy()


# Plot the original audio feature
plt.figure(figsize=(5, 3))
plt.title(f'Mel-Spectrogram of audio: {df_train.iloc[idx]["word"]}', fontweight='bold')
# imshow only accepts origin='upper' or origin='lower'
plt.imshow(x_train_mat[idx], aspect='auto', origin='lower', cmap='inferno')
plt.grid(lw=0.4, c='w', alpha=0.4)
plt.show()

# Plot the shifted audio feature
plt.figure(figsize=(5, 3))
plt.title(f'Mel-Spectrogram of audio: {df_train.iloc[idx]["word"]}', fontweight='bold')
plt.imshow(shifted, aspect='auto', origin='lower', cmap='inferno')
plt.grid(lw=0.4, c='w', alpha=0.4)
plt.show()



## Time masking

Silence a randomly selected slot of time

In [None]:
#time masking
idx=1006

# Silence one time slot (column 15) of a copy of the sample;
# -9.21 is the fill value used for silence in this section.
silenced=x_train_mat[idx].copy()
silenced[:,15]=-9.21*np.ones(32)


# Plot the original audio feature
plt.figure(figsize=(5, 3))
plt.title(f'Mel-Spectrogram of audio: {df_train.iloc[idx]["word"]}', fontweight='bold')
# imshow only accepts origin='upper' or origin='lower'
plt.imshow(x_train_mat[idx], aspect='auto', origin='lower', cmap='inferno')
plt.grid(lw=0.4, c='w', alpha=0.4)
plt.show()

# Plot the time-masked audio feature
plt.figure(figsize=(5, 3))
plt.title(f'Mel-Spectrogram of audio: {df_train.iloc[idx]["word"]}', fontweight='bold')
plt.imshow(silenced, aspect='auto', origin='lower', cmap='inferno')
plt.grid(lw=0.4, c='w', alpha=0.4)
plt.show()



## White noise

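Add white noise to the sample. White noise consists of independent random samples drawn from a normal distribution with mean 0 and standard deviation 1; in the `augment` function below, its absolute value, scaled by `noise_coeff`, is added to the spectrogram.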

In [None]:
# seaborn histogram of every value in x_train_mat (flattened), 36 bins, no KDE
sns.distplot(
    x_train_mat.ravel(),
    hist=True,
    kde=False,
    bins=36,
    color='blue',
    hist_kws=dict(edgecolor='black'),
)

## Frequency masking

Silence a randomly selected frequency band.

In [None]:
# smallest value found in the training features, then their overall shape
print(x_train_mat.min())
print(x_train_mat.shape)

In [None]:
# frequency masking
idx=406

# Silence two frequency bands (rows 9 and 10) of a copy of the sample;
# -9.21 is the fill value used for silence in this section.
fsilenced = x_train_mat[idx].copy()
fsilenced[9:11,:]=-9.21*np.ones((2,32))

# Plot original feature
plt.figure(figsize=(5, 3))
plt.title(f'Mel-Spectrogram of audio: {df_train.iloc[idx]["word"]}', fontweight='bold')
# imshow only accepts origin='upper' or origin='lower'
plt.imshow(x_train_mat[idx], aspect='auto', origin='lower', cmap='inferno')
plt.grid(lw=0.4, c='w', alpha=0.4)
plt.show()

# Plot edited feature
plt.figure(figsize=(5, 3))
plt.title(f'Mel-Spectrogram of audio: {df_train.iloc[idx]["word"]}', fontweight='bold')
plt.imshow(fsilenced, aspect='auto', origin='lower', cmap='inferno')
plt.grid(lw=0.4, c='w', alpha=0.4)
plt.show()

## Combinations

In [None]:
def augment(data_mat, y, shift=True, time=True, freq=True, noise=True, noise_coeff=0.1, bern_prob = 0.8):
    """Create one randomly perturbed copy of every spectrogram and append it.

    Each enabled per-sample transformation is applied independently with
    probability `bern_prob`; the noise step is drawn once for the whole set.
    Rows are treated as frequency bands and columns as time slots, as in the
    demo cells of this section. Silence is the minimum value of `data_mat`.

    Args:
        data_mat: array of shape (n_samples, n_bands, n_slots); it is not
            modified. A float dtype is assumed (noise is added in place to
            the copy) -- TODO confirm for other callers.
        y: labels of the samples, one per spectrogram.
        shift: shift the spectrogram by 2 to 5 time slots to the left or to
            the right (equal probability), filling the gap with silence.
        time: silence 1 or 2 adjacent time slots (columns).
        freq: silence 1 or 2 adjacent frequency bands (rows).
        noise: add `noise_coeff * |N(0, 1)|` to every value of every new sample.
        noise_coeff: scale of the additive noise.
        bern_prob: probability with which each enabled step is applied.

    Returns:
        Tuple `(augmented, augmented_flat, y_aug, suppl)`:
        the original samples followed by the perturbed ones, the same array
        with every sample flattened, the labels repeated twice, and the
        perturbed samples alone.
    """
    n_samples, n_bands, n_slots = data_mat.shape
    min_en = np.min(data_mat)
    suppl = data_mat.copy()

    for i in range(n_samples):

        if time and np.random.binomial(1, bern_prob):
            # sample the width of the silenced time window and its first slot
            t = np.random.randint(1, 3)
            t0 = np.random.randint(0, n_slots + 1 - t)
            suppl[i][:, t0:t0 + t] = min_en

        if shift and np.random.binomial(1, bern_prob):
            # sample direction (1 = left) and size of the shift
            to_left = np.random.binomial(1, 0.5)
            s = np.random.randint(2, 6)
            if to_left == 1:
                suppl[i][:, :n_slots - s] = suppl[i][:, s:].copy()
                suppl[i][:, n_slots - s:] = min_en
            else:
                suppl[i][:, s:] = suppl[i][:, :n_slots - s].copy()
                suppl[i][:, :s] = min_en

        if freq and np.random.binomial(1, bern_prob):
            # sample the width of the silenced frequency band and its first row
            f = np.random.randint(1, 3)
            f0 = np.random.randint(0, n_bands + 1 - f)
            suppl[i][f0:f0 + f, :] = min_en

    # one draw decides whether noise is added to the whole set of new samples
    if noise and np.random.binomial(1, bern_prob):
        suppl += noise_coeff * np.abs(np.random.normal(0, 1, suppl.shape))

    augmented = np.append(data_mat, suppl, 0)
    y_aug = np.append(y, y, 0)
    augmented_flat = augmented.reshape(augmented.shape[0], -1)

    return (augmented, augmented_flat, y_aug, suppl)

In [None]:
# x_train_aug_mat, x_train_aug, y_train_aug, x_train_new = augment(x_train_mat, y_train, freq=False, noise_coeff=0.05)

In [None]:
# Disabled (string literal): saves the four arrays returned by augment() as
# .npy files inside DATA_BASE_FOLDER.
# NOTE(review): DATA_BASE_FOLDER points to the input dataset directory --
# confirm it is writable before enabling this cell.
'''
from numpy import save
save(os.path.join(DATA_BASE_FOLDER, 'x_train_aug_mat.npy'), x_train_aug_mat)
save(os.path.join(DATA_BASE_FOLDER, 'x_train_aug.npy'), x_train_aug)
save(os.path.join(DATA_BASE_FOLDER, 'y_train_aug.npy'), y_train_aug)
save(os.path.join(DATA_BASE_FOLDER, 'x_train_new.npy'), x_train_new)
'''

# Audio reconstruction

In [None]:
# index of the training sample used in the reconstruction cells that follow
idx=1005

# play the original waveform of that sample

ipd.Audio(audio_train[idx], rate=SAMPLE_RATE)

In [None]:
# original data (128,32) - reconstructed audio
# This shape expression is not the last one in the cell, so nothing is displayed.
np.exp(x_train_raw[idx]).shape

# Undo the log with np.exp, then invert the mel spectrogram back to a waveform.
S = librosa.feature.inverse.mel_to_audio(np.exp(x_train_raw[idx]), sr=SAMPLE_RATE, hop_length=HOP_LEN)

ipd.Audio(S, rate=SAMPLE_RATE)

In [None]:
# original data (32,32) - reconstructed audio

# This shape expression is not the last one in the cell, so nothing is displayed.
np.exp(x_train_mat[idx]).shape

# Undo the log with np.exp, then invert the mel spectrogram back to a waveform.
S = librosa.feature.inverse.mel_to_audio(np.exp(x_train_mat[idx]), sr=SAMPLE_RATE, hop_length=HOP_LEN)

ipd.Audio(S, rate=SAMPLE_RATE)

In [None]:
# shifted data (32,32) - reconstructed audio

# all-silence matrix, then the sample copied in 3 time slots to the left
shifted = np.full((32, 32), -9.21)
shifted[:, :29] = x_train_mat[idx][:, 3:]

# shape check; not displayed because it is not the last expression of the cell
np.exp(shifted).shape

# undo the log, then invert the mel spectrogram back to a waveform
S = librosa.feature.inverse.mel_to_audio(
    np.exp(shifted),
    sr=SAMPLE_RATE,
    hop_length=HOP_LEN,
)

ipd.Audio(S, rate=SAMPLE_RATE)

In [None]:
# time mask (32,32) - reconstructed audio

# copy of the sample with time slot 15 silenced
silenced = x_train_mat[idx].copy()
silenced[:, 15] = -9.21

# shape check; not displayed because it is not the last expression of the cell
np.exp(silenced).shape

# undo the log, then invert the mel spectrogram back to a waveform
S = librosa.feature.inverse.mel_to_audio(
    np.exp(silenced),
    sr=SAMPLE_RATE,
    hop_length=HOP_LEN,
)

ipd.Audio(S, rate=SAMPLE_RATE)

In [None]:
# white noise (32,32) - reconstructed audio

# absolute value of Gaussian noise scaled by 0.5, added to the sample
noisemat = np.abs((0.5 * np.random.normal(0, 1, 32 * 32)).reshape(32, 32))
noised = x_train_mat[idx] + noisemat

# shape check; not displayed because it is not the last expression of the cell
np.exp(noised).shape

# undo the log, then invert the mel spectrogram back to a waveform
S = librosa.feature.inverse.mel_to_audio(
    np.exp(noised),
    sr=SAMPLE_RATE,
    hop_length=HOP_LEN,
)

ipd.Audio(S, rate=SAMPLE_RATE)

In [None]:
# freq mask (32,32) - reconstructed audio

# copy of the sample with frequency bands 9 and 10 silenced
fsilenced = x_train_mat[idx].copy()
fsilenced[9:11, :] = -9.21

# shape check; not displayed because it is not the last expression of the cell
np.exp(fsilenced).shape

# undo the log, then invert the mel spectrogram back to a waveform
S = librosa.feature.inverse.mel_to_audio(
    np.exp(fsilenced),
    sr=SAMPLE_RATE,
    hop_length=HOP_LEN,
)

ipd.Audio(S, rate=SAMPLE_RATE)

In [None]:
# spy test: reconstruct and play test sample 26
# (undo the log with np.exp, then invert the mel spectrogram to a waveform)

S = librosa.feature.inverse.mel_to_audio(np.exp(x_test_mat[26]), sr=SAMPLE_RATE, hop_length=HOP_LEN)
ipd.Audio(S, rate=SAMPLE_RATE)

# Send the submission for the challenge

In [None]:
##################################################
# Save your test prediction in y_test_pred
##################################################
# y_test_pred is whatever the most recently executed model cell left behind.

#y_test_pred = None

# Create submission: start from the sample file and, when predictions are
# available, replace the 'word' column with the predicted label names.
submission = pd.read_csv(os.path.join(DATA_BASE_FOLDER, 'sample_submission.csv'))
if y_test_pred is not None:
    submission = submission.assign(
        word=[labels[int(y_i)] for y_i in y_test_pred]
    )
submission.to_csv('my_submission.csv', index=False)
