In [None]:
### Notebook for a PyTorch implementation of the NN-based PCVD classifier previously built using Keras
# Potentially use as script for live coding tutorial after students have done the Keras version

# Main Goals:
# 1. Intro to pytorch via a direct comparison to keras
# 2. A lower-level look at some of the abstractions/decisions one doesn't see/make in Keras
# 3. Review of the vowel classification problem by applying the same principles via different code/implementation
# 4. more practice with FC-MLP architecture design (beyond single layer logreg/softmax)

# We will work with the PCVD dataset
# go to this website and click on Download
# https://www.kaggle.com/sabermalek/pcvcspeech

# Then put the main folder in the same directory as this .ipynb file and rename it PCVD

# mount your Google drive at /content/drive so that the dataset folder
# stored there can be reached from this notebook
from google.colab import drive
drive.mount('/content/drive')

# modify the path below so that it points at your PCVD folder.
# The data-loading cell lists the dataset files in the current working
# directory ('.'), so this %cd has to run after the mount above and
# before any data is loaded.
%cd /content/drive/MyDrive/path/to/pcvd/PCVD

In [None]:
# This cell loads and processes the data
# you do not have to do anything here

# The libraries needed
import os
import scipy.io
from scipy.signal.windows import hann
import numpy as np
import librosa

# list all the files that are part of the dataset
# - match on the '.mat' extension rather than on the substring 'mat', so
#   that stray entries (e.g. 'format.txt' or a folder called 'matlab')
#   are not handed to scipy.io.loadmat
# - sort the names: os.listdir returns entries in arbitrary order, and the
#   concatenation order below decides the row order of the final data
all_mats = sorted(f for f in os.listdir('.') if f.lower().endswith('.mat'))
if not all_mats:
  # fail early with an actionable message instead of the generic
  # "need at least one array to concatenate" error from numpy
  raise FileNotFoundError(
      "No .mat files found in " + os.getcwd()
      + "; make sure the %cd in the first cell points at the PCVD folder")

# load the time-series data stored under the key 'x' in each of the data
# files and join them along axis 1 into a single numpy array
data = np.concatenate(
    [scipy.io.loadmat(mat)['x'] for mat in all_mats], axis=1)

# turn the 4-D array into a 2-D matrix in which every row is one
# datapoint (i.e. a vowel-consonant utterance) and every column a sample.
# Column-major ('F') ordering is used for the flattening, matching the
# layout the labelling cell relies on (all rows of one vowel are contiguous).
nreps, nvow, nsamps = data.shape[1:]
data = data.reshape((nreps*nvow, nsamps), order='F')

# keep only samples 5000..14999 of every row (a window centered around
# the vowel) and taper that excerpt with a Hann window of the same length
win_start, win_stop = 5000, 15000
taper = hann(win_stop - win_start)
data = data[:, win_start:win_stop] * taper

# finally, resample the data to have a sampling
# rate of 16000 (the input is treated as being sampled at 48000)
# NOTE: orig_sr / target_sr are passed by keyword on purpose. Recent
# librosa releases (0.10+) made them keyword-only, so the positional form
# librosa.resample(d, 48000, sr) raises a TypeError there; the keyword
# form works on older releases as well.
sr_orig = 48000
sr = 16000
X = [librosa.resample(d, orig_sr=sr_orig, target_sr=sr) for d in data]
data = np.array(X)

print("The shape of the data is", data.shape)

In [None]:
# "data" has the same number of datapoints for each vowel
# In farsi, there are 6 vowels. Considering the number of
# datapoints in "data". How many points do you have per vowel?

# Derive the per-vowel count from "data" instead of hard-coding it, so the
# labels can never silently get out of step with the data if a different
# number of files is loaded. (The original notebook hard-coded 299 here.)
n_vowels = 6
ndatapoints_per_vowel, leftover = divmod(data.shape[0], n_vowels)
if leftover:
  raise ValueError(
      f"data has {data.shape[0]} rows, which is not a multiple of "
      f"{n_vowels} vowels; check the loaded dataset")

# now, the first ndatapoints_per_vowel rows in "data" contain
# datapoints that correspond to the vowel "a". The next
# ndatapoints_per_vowel rows correspond to the vowel "i", etc.

# we need to create a "labels" vector with the same number of rows
# as "data" containing class indices 0-5.  We will not use one-hot encoding
# this time, as the loss function we will use expects class indices.

labels = np.repeat(np.arange(n_vowels), ndatapoints_per_vowel)
print("the shape of labels is",labels.shape)

# now randomly select ~5% of rows in "data" to be the test set
# Hint: you can use a random choice without replacement (replace=False)
# and use the first ~5% of its output to index out the test set
# the remaining datapoints will be the "development" set

# A seeded generator makes both splits below reproducible across
# "Restart & Run All"; change SPLIT_SEED to draw a different split.
SPLIT_SEED = 0
rng_split = np.random.default_rng(SPLIT_SEED)

N = data.shape[0]
if labels.shape[0] != N:
  # indexing labels with indices drawn for data would otherwise either
  # fail later or, worse, silently pair rows with the wrong class
  raise ValueError(
      f"data has {N} rows but labels has {labels.shape[0]} entries")

# number of test datapoints, computed once and reused for every slice
n_ts = int(0.05*N)

all_idx = rng_split.choice(N, N, replace=False)
ts_idx, dv_idx = all_idx[:n_ts], all_idx[n_ts:]

data_ts = data[ts_idx]
labels_ts = labels[ts_idx]
data_dv = data[dv_idx]
labels_dv = labels[dv_idx]

print("\nThe shape of the development data is ", data_dv.shape)
print("The shape of the development labels is ", labels_dv.shape)
print("The shape of the testing data is ", data_ts.shape)
print("The shape of the testing labels is ", labels_ts.shape)

# now we randomly select ~15% of the development
# data to be your validation set, and the rest to be your training
# set. In this homework we will NOT do k-fold cross-validation.

N_dv = data_dv.shape[0]
n_vl = int(0.15*N_dv)

# same generator as above, so the second draw is reproducible too
all_idx = rng_split.choice(N_dv, N_dv, replace=False)
vl_idx, tr_idx = all_idx[:n_vl], all_idx[n_vl:]

Xvl = data_dv[vl_idx]
Yvl = labels_dv[vl_idx]
Xtr = data_dv[tr_idx]
Ytr = labels_dv[tr_idx]

print("\nThe shape of the training data is ", Xtr.shape)
print("The shape of the training labels is ", Ytr.shape)
print("The shape of the validation data is ", Xvl.shape)
print("The shape of the validation labels is ", Yvl.shape)

# now we have to standardize the data.

# Here each datapoint is a time-series, and we only have a very limited
# number of datapoints, so every datapoint is standardized on its own
# (no statistics are shared across rows). Audio time-series can simply be
# normalized to have zero mean and values in the range between -1 and 1.

def _row_mean_and_peak(X):
  """Return, per row of X, the mean and the largest |X - mean| (as column vectors)."""
  row_mean = X.mean(axis=1, keepdims=True)
  row_peak = np.abs(X - row_mean).max(axis=1, keepdims=True)
  return row_mean, row_peak

# statistics for the training and the validation rows
mu_tr, max_tr = _row_mean_and_peak(Xtr)
mu_vl, max_vl = _row_mean_and_peak(Xvl)

# center every datapoint around zero and scale it so that its largest
# magnitude is 1
Xtr = (Xtr - mu_tr) / max_tr
Xvl = (Xvl - mu_vl) / max_vl

In [None]:
# we have a very limited number of training data.
# as a result, we must "augment" the number of training datapoints
# here we suggest that you augment the data by adding noise to it
# and randomly shifting its pitch. However, you should consider augmenting
# your data with even more techniques.

# NOTE: this cell overwrites Xtr / Ytr with the augmented versions, so
# running it a second time augments the already-augmented set. Re-run the
# split/standardization cell first if you need to execute it again.

# seeded generator so that the augmented set is reproducible
AUG_SEED = 0
rng_aug = np.random.default_rng(AUG_SEED)

# create a copy of your training data with added gaussian noise
# of small variance (standard deviation 0.01)
Xnoise = Xtr + 0.01*rng_aug.standard_normal(Xtr.shape)

# create a copy of your training data in which the pitch of each
# datapoint is randomly shifted by up to 4 semitones up or down
# NOTE: sr / n_steps are passed by keyword on purpose. Recent librosa
# releases (0.10+) made them keyword-only, so the positional form
# pitch_shift(x, sr, n_steps) raises a TypeError there.
pitch_factors = rng_aug.uniform(-4,4,Xtr.shape[0])
Xpitch = [
    librosa.effects.pitch_shift(x, sr=sr, n_steps=n_steps)
    for x, n_steps in zip(Xtr, pitch_factors)
]

# now concatenate your original data with the augmented datapoints;
# the labels are repeated once per copy, in the same order
Xtr = np.concatenate((Xtr,Xnoise,np.array(Xpitch)),axis=0)
Ytr = np.concatenate((Ytr,Ytr,Ytr),axis=0)

print("The shape of the training data is ", Xtr.shape)
print("The shape of the training labels is ", Ytr.shape)
print("The shape of the validation data is ", Xvl.shape)
print("The shape of the validation labels is ", Yvl.shape)

In [None]:
# Now we will import pytorch and we will use this library
# to build our model

# import necessary libraries

# set the hardware device you will be training on

#convert training data to Torch tensors on the current device:


In [None]:
# let's first define some training parameters:


In [None]:
# Pytorch differs from Keras in that we define our computational graph (i.e. neural network) through a class that contains two main methods:
# 1) Initialization of the graph structure and parameters
# 2) Specification of the forward data flow through the graph

# see https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
# and https://pytorch.org/docs/stable/generated/torch.nn.Module.html

# Next, we will set up the training optimization using the parameters we chose above:


# Then we define the loss function outside of the model class and before the training loop.

# Next, we will create a function to compare the predicted classes and the actual classes to calculate the accuracy


############## Model Information #####################################
# Lastly we will print out the summary of our model:
# 1) The model structure
# 2) The name and size of each layer's weights + biases
# 3) The total number of trainable parameters (How do we know if a parameter is trainable?)
# Uncomment the following comment block when you have finished coding the above steps
# NOTE: as long as the two ''' lines below are in place, everything between
# them is a plain string literal and none of it is executed. The quoted code
# refers to `model`, which has to be created by the code you write above;
# remove the two ''' lines only once that is done.
'''
print(f"Model structure: {model}\n")
for name, param in model.named_parameters():
    print(f"Layer: {name} | Size: {param.size()}")

# total parameters and trainable parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"\n{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.\n")
'''

In [None]:
# now that the model has been built, let's see how it performs on the training
# data before being trained.

In [None]:
# now we can move on to train the model

# When using standard pytorch, we must define training loops similar to those we used for our softmax classification training algorithm.
# We define an outer loop that will iterate through epochs
# Within each epoch, we will:
#   1. pass the (training/validation) data through the model to get predicted outputs
#   2. calculate the loss
#   3. backpropagate the gradient
#   4. test the model on the validation set
#   5. calculate the epoch statistics we care about

# define 2 dictionaries which will store the accuracy/epoch and loss/epoch for both train and validation sets

############## Main Training Loop ###################
# implement main training loop here

In [None]:
# now that training is done, let's visualize the training and validation loss
# all of that information is readily available in the "training logs"

import matplotlib.pyplot as plt

# summarize history for loss

In [None]:
# PERFORMANCE: validation set

# let's see our model's confusion matrix with the validation set

# calculate theta using the best parameters found

# compute the confusion matrix and plot it


In [None]:
# now let's "hear" the model weights

# to extract your model's weights, you will have to use your model's layers' names
# to access the weights. You can use the .weight and .bias variables to get the actual data

# NOTE(review): `W` is not defined anywhere in the visible notebook - you are
# expected to assign it from your model's weights before running this cell.
# Presumably it should be a 2-D array-like indexed as [row, sample]; if it is
# a torch tensor it may need converting to a numpy array first - TODO confirm.
from IPython.display import Audio
# play row 5 of W as an audio clip at the sampling rate `sr`
Audio(data=W[5,:], rate=sr)

In [None]:
# Now let's see how the model does with the test data

# Standardize the test data one datapoint (row) at a time:
# subtract the row mean, then divide by the row's largest magnitude
# so that every datapoint is zero-mean with a peak magnitude of 1.
mu_ts = data_ts.mean(axis=1, keepdims=True)
centered_ts = data_ts - mu_ts
max_ts = np.abs(centered_ts).max(axis=1, keepdims=True)

Xts = centered_ts / max_ts

# Convert the test set to Torch tensors and put them on the appropriate device.

# pass the test data through the model in evaluation mode and calculate the accuracy

# compute the confusion matrix and plot it
