# Predictor Model
****
In the notebook 11_class_CNN we trained a CNN to recognize spoken words and it performed well enough. In this notebook we will be re-creating that model in order to train it on all of the data in the training set without seperating a validation set.

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import matplotlib.image as img
import numpy as np
import librosa.display
import time
import math
import re
import os

from glob import glob
from sklearn.metrics import confusion_matrix
from sklearn.utils import resample
from datetime import timedelta
from PIL import Image
from scipy import signal
from scipy.io import wavfile

In [2]:
# Convolutional Layer 1.
filter_size1 = 6         # number of pixels in one side of the filter
num_filters1 = 72         # number of filters

# Convolutional Layer 2.
filter_size2 = 6
num_filters2 = 112

# Fully-connected layer.
fc_size = 256             # Number of neurons in fully-connected layer.
fc_size_2 = 256

# The number of pixels in each dimension of an image.
img_height = 128 # 161 for spec, 128 for mfcc
img_width = 32 #99 for spec 32 for mfcc

# The images are stored in one-dimensional arrays of this length.
img_size_flat = img_height * img_width

# Tuple with height and width of images used to reshape arrays.
img_shape = (128, 32)# (161,99)

# Number of classes, one class for each of 10 digits.
num_classes = 12

# Number of colour channels for the images: 1 channel for gray-scale.
num_channels = 1

#The file path for the original audio files
audio_path = 'train/audio/'
test_path = 'test/'

POSSIBLE_LABELS = 'yes no up down left right on off stop go silence unknown'.split()
id2name = {i: name for i, name in enumerate(POSSIBLE_LABELS)}
name2id = {name: i for i, name in id2name.items()}

In [3]:
def load_data(data_dir):
    pattern = re.compile("(.+\/)?(\w+)\/([^_]+)_") # for file types add '.+(type)'
    all_files = glob(os.path.join(data_dir, 'mfcc/train/*/*'))

    possible = set(POSSIBLE_LABELS)
    train = []
    for entry in all_files:
        r = re.match(pattern, entry)
        if r:
            label = r.group(2)
            if label == '_background_noise_':
                label = 'silence'
            if label not in possible:
                label = 'unknown'

            label_id = name2id[label]
            label_vec = np.eye(len(id2name))[label_id]
            sound_path = audio_path + "/".join(entry.strip("/").split('/')[2:]) + '.wav'#str(entry)

            sample = (label, label_id, label_vec, entry, sound_path)
            train.append(sample)

    columns_list = ['label', 'label_id', 'label_vec', 'file_name', 'audio_file']
    
    return pd.DataFrame(train, columns = columns_list)

In [4]:
train_df = load_data('')
label_df = train_df[['label','label_id']].drop_duplicates(
    subset=None, keep='first', inplace=False).set_index('label_id')
label_names = label_df['label'].sort_index()
train_df.head()

Unnamed: 0,label,label_id,label_vec,file_name,audio_file
0,right,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",mfcc/train/right/3bfd30e6_nohash_2,train/audio/right/3bfd30e6_nohash_2.wav
1,right,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",mfcc/train/right/9190045a_nohash_0,train/audio/right/9190045a_nohash_0.wav
2,right,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",mfcc/train/right/dabf67d9_nohash_0,train/audio/right/dabf67d9_nohash_0.wav
3,right,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",mfcc/train/right/b7a0754f_nohash_0,train/audio/right/b7a0754f_nohash_0.wav
4,right,5,"[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",mfcc/train/right/a1cff772_nohash_2,train/audio/right/a1cff772_nohash_2.wav


### Resampling

In [5]:
avg_size = np.int(train_df[train_df['label_id'] < 10]['label_id'].value_counts().mean())
df_maj = train_df[train_df['label_id'] == 11]
df_min = train_df[train_df['label_id'] == 10]
df_rest = train_df[train_df['label_id'] < 10]

df_majority_downsampled = resample(df_maj, replace=False, n_samples=avg_size, random_state=5)

df_minority_upsampled = resample(df_min, n_samples = avg_size, random_state=5)

train_DS = pd.concat([df_majority_downsampled, df_minority_upsampled, df_rest])
train_DS['label_id'].value_counts()

8     2380
0     2377
2     2375
1     2375
9     2372
11    2368
10    2368
6     2367
5     2367
3     2359
7     2357
4     2353
Name: label_id, dtype: int64

## Creating the CNN
****
We will leave out the info stuff like sound files, etc and get straight to rebuilding the CNN

In [6]:
def new_weights(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.05))
def new_biases(length):
    return tf.Variable(tf.constant(0.05, shape=[length]))
def new_conv_layer(input,              # The previous layer.
                   num_input_channels, # Num. channels in prev. layer.
                   filter_size,        # Width and height of each filter.
                   num_filters,        # Number of filters.
                   use_pooling=True):  # Use 2x2 max-pooling.

    shape = [filter_size, filter_size, num_input_channels, num_filters]
    weights = new_weights(shape=shape)
    biases = new_biases(length=num_filters)
    layer = tf.nn.conv2d(input=input,
                         filter=weights,
                         strides=[1, 1, 1, 1],
                         padding='SAME')
    layer += biases
    if use_pooling:
        layer = tf.nn.max_pool(value=layer,
                               ksize=[1, 2, 2, 1],
                               strides=[1, 2, 2, 1],
                               padding='SAME')

    layer = tf.nn.relu(layer)
    return layer, weights
def flatten_layer(layer):
    layer_shape = layer.get_shape()
    num_features = layer_shape[1:4].num_elements()
    layer_flat = tf.reshape(layer, [-1, num_features])
    return layer_flat, num_features
def new_fc_layer(input,          # The previous layer.
                 num_inputs,     # Num. inputs from prev. layer.
                 num_outputs,    # Num. outputs.
                 use_relu=True): # Use Rectified Linear Unit (ReLU)?
    weights = new_weights(shape=[num_inputs, num_outputs])
    biases = new_biases(length=num_outputs)
    layer = tf.matmul(input, weights) + biases
    if use_relu:
        layer = tf.nn.relu(layer)
    return layer

### Placeholders

In [7]:
x = tf.placeholder(tf.float32, shape=[None, img_size_flat], name='x')
x_image = tf.reshape(x, [-1, img_height, img_width, num_channels])
y_true = tf.placeholder(tf.float32, shape=[None, num_classes], name='y_true')
y_true_cls = tf.argmax(y_true, axis=1)
keep_prob_1 = tf.placeholder(tf.float32)
keep_prob_2 = tf.placeholder(tf.float32)
keep_prob_3 = tf.placeholder(tf.float32)

### Convolutional Layers

In [8]:
layer_conv1, weights_conv1 = new_conv_layer(input=x_image,
                                            num_input_channels=num_channels,
                                            filter_size=filter_size1,
                                            num_filters=num_filters1,
                                            use_pooling=True)
layer_conv2, weights_conv2 = new_conv_layer(input=layer_conv1,
                                            num_input_channels=num_filters1,
                                            filter_size=filter_size2,
                                            num_filters=num_filters2,
                                            use_pooling=True)
layer_flat, num_features = flatten_layer(layer_conv2)

dropout_1 = tf.nn.dropout(layer_flat, keep_prob_1)

layer_fc1 = new_fc_layer(input=dropout_1,
                         num_inputs=num_features,
                         num_outputs=fc_size,
                         use_relu=True)

dropout_2 = tf.nn.dropout(layer_fc1, keep_prob_2)

layer_fc2 = new_fc_layer(input=layer_fc1,
                         num_inputs=fc_size,
                         num_outputs=fc_size_2,
                         use_relu=True)

dropout_3 = tf.nn.dropout(layer_fc2, keep_prob_3)

layer_fc3 = new_fc_layer(input=dropout_2,
                         num_inputs=fc_size_2,
                         num_outputs=num_classes,
                         use_relu=False)

y_pred = tf.nn.softmax(layer_fc3)
y_pred_cls = tf.argmax(y_pred, axis=1)

### Optimization

In [9]:
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=layer_fc3, labels=y_true)
cost = tf.reduce_mean(cross_entropy)

optimizer = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cost)

correct_prediction = tf.equal(y_pred_cls, y_true_cls)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



## TensorFlow Run

In [10]:
session = tf.Session()
session.run(tf.global_variables_initializer())

In [11]:
RUN_NAME = 'Run 1 with 2 conv layers'

training_writer = tf.summary.FileWriter("./logs/{}/training".format(RUN_NAME), session.graph)

In [12]:
train_batch_size = 50
total_iterations = 0

def optimize(num_iterations):
    global total_iterations
    start_time = time.time()

    for i in range(total_iterations,
                   total_iterations + num_iterations):
        
        batch_df = train_DS.sample(train_batch_size)
        x_batch = np.array(
            [pd.read_csv(x, sep=',',header=None).T.values.tolist()[0] for x in batch_df['file_name']])
        y_true_batch = np.array([y for y in batch_df['label_vec']])
        
        feed_dict_train = {x: x_batch,
                           y_true: y_true_batch,
                           keep_prob_1: 0.5, keep_prob_2: 0.5, keep_prob_3: 0.5}

        session.run(optimizer, feed_dict=feed_dict_train)

        if i % 1000 == 0:
            acc = session.run(accuracy, feed_dict=feed_dict_train)
            msg = "Optimization Iteration: {0:>6}, Training Accuracy: {1:>6.1%}"
            print(msg.format(i + 1, acc))

    total_iterations += num_iterations
    end_time = time.time()
    time_dif = end_time - start_time
    print("Time usage: " + str(timedelta(seconds=int(round(time_dif)))))

### Training

In [13]:
optimize(num_iterations=2000)

Optimization Iteration:      1, Training Accuracy:   6.0%
Optimization Iteration:   1001, Training Accuracy:  66.0%
Time usage: 0:31:19


In [14]:
optimize(num_iterations=3000)

Optimization Iteration:   2001, Training Accuracy:  76.0%
Optimization Iteration:   3001, Training Accuracy:  76.0%
Optimization Iteration:   4001, Training Accuracy:  78.0%
Time usage: 0:46:47


In [15]:
optimize(num_iterations=5000)

Optimization Iteration:   5001, Training Accuracy:  82.0%
Optimization Iteration:   6001, Training Accuracy:  82.0%
Optimization Iteration:   7001, Training Accuracy:  94.0%
Optimization Iteration:   8001, Training Accuracy:  92.0%
Optimization Iteration:   9001, Training Accuracy:  94.0%
Time usage: 1:16:52


In [None]:
#optimize(num_iterations=5000)

In [16]:
def load_test(data_dir):
    all_files = glob(os.path.join(data_dir, 'mfcc/test/*'))
    test = []
    for entry in all_files:
        sound_path = test_path + "/".join(entry.split('/')[1:]) + '.wav'
        sample = (entry, sound_path)
        test.append(sample)
            
    columns_list = ['file_name', 'audio_file']
    test_df = pd.DataFrame(test, columns = columns_list)
    
    return test_df

In [17]:
Kaggle_test = load_test('')
Kaggle_test.head()

Unnamed: 0,file_name,audio_file
0,mfcc/test/clip_e5079a5ec,test/test/clip_e5079a5ec.wav
1,mfcc/test/clip_9b6cb90e7,test/test/clip_9b6cb90e7.wav
2,mfcc/test/clip_2e6d2f181,test/test/clip_2e6d2f181.wav
3,mfcc/test/clip_e75d514f3,test/test/clip_e75d514f3.wav
4,mfcc/test/clip_c5db7ac41,test/test/clip_c5db7ac41.wav


In [21]:
def predict_files(test_batch_size = 256):
    num_test = len(Kaggle_test)
    cls_pred = np.zeros(shape=num_test, dtype=np.int)
    names_pred = np.empty(shape=num_test, dtype=str)

    i = 0
    x_test_array = np.array([pd.read_csv(x, sep=',', header=None).T.values.tolist()[0] 
                             for x in Kaggle_test['file_name']])
    x_test_files = np.array([x.split('/')[-1] for x in Kaggle_test['audio_file']])

    while i < num_test:
        j = min(i + test_batch_size, num_test)
        files = x_test_array[i:j, :]
        feed_dict = {x: files, keep_prob_1: 1, keep_prob_2: 1, keep_prob_3: 1}
        cls_pred[i:j] = session.run(y_pred_cls, feed_dict=feed_dict)
        names_pred[i:j] = x_test_files[i:j]
        i = j

    return x_test_files, cls_pred

In [22]:
file_names, file_predictions = predict_files()

In [23]:
columns_kaggle = ['fname']
kaggle_df = pd.DataFrame(file_names, columns = columns_kaggle)
kaggle_df['label'] = pd.Series(file_predictions)
kaggle_df = kaggle_df.replace({"label": id2name})
kaggle_df = kaggle_df.set_index('fname')
kaggle_df.head()

Unnamed: 0_level_0,label
fname,Unnamed: 1_level_1
clip_e5079a5ec.wav,unknown
clip_9b6cb90e7.wav,down
clip_2e6d2f181.wav,unknown
clip_e75d514f3.wav,silence
clip_c5db7ac41.wav,unknown


In [24]:
kaggle_df.to_csv('Kaggle_Predictions_large.csv')

In [25]:
session.close()