In [16]:
# this code implements a deep neural network for feature learning
# this code comes from Vishwa's class on constructing a data loader and creating a model

# Some common system imports
import os
import sys
import importlib
import time
import csv

# Numeric computing
import numpy as np

# Sklearn functions are useful for generating train/test splits, and metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

from scipy.io import wavfile

# pytorch
import torch
import torch.utils.data as tdata
from torchaudio import transforms
from torchmetrics import Accuracy

# Plotting (if we want it)
import matplotlib.pyplot as plt

# importing our own modules
import audio_datasets as ads

# for one-hot vectore
import torch.nn.functional as func

from sklearn.datasets import load_digits
from sklearn.decomposition import KernelPCA

In [17]:
# loading the raw data for feature learning
training_path = os.path.join(os.getcwd(), "..", "training_data") #need to change this back before pushing ***
files = os.listdir(training_path)
wav_files = []
file_type = "wav"
LABELS = {"neutral": 0, "calm": 1, "happy": 2, "sad": 3, "angry": 4, "fearful": 5, "disgust": 6, "surprised": 7}

for file in files:
    curr_path = os.path.join(training_path, file)
    if os.path.isfile(curr_path) and file_type in file:
        for label in LABELS.keys():
            if label in file:
                wav_files.append((file, LABELS[label]))

data_array = []
label_array = []
for data in wav_files: #in the my_datasets code
    data_array.append(wavfile.read(os.path.join(training_path, data[0]))[1])
    label_array.append(data[1])

max_len = max([len(data) for data in data_array])
data_array = [np.resize(data, 48000*5) for data in data_array]
print(len(data_array[0]))

  data_array.append(wavfile.read(os.path.join(training_path, data[0]))[1])


240000


In [18]:
#setting up the CNN
class CNN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        
        #convolutional layers
        KERNEL = 3
        self.conv1 = torch.nn.Conv2d(1, 1, KERNEL)
        #self.conv2 = torch.nn.Conv2d(1, 1, KERNEL)
        #self.conv3 = torch.nn.Conv2d(1, 1, KERNEL)

        #max pooling layer
        self.pooling_layer = torch.nn.MaxPool2d(3)
        
        #fully connected layers
        self.fc1 = torch.nn.Linear(in_features=2394, out_features=500)
        self.fc2 = torch.nn.Linear(in_features=500, out_features=50)
        self.fc3 = torch.nn.Linear(in_features=50, out_features=8)

    def forward(self, x):
        
        conv_x1 = torch.relu(self.conv1(x))
        #conv_x2 = torch.relu(self.conv2(conv_x1))
        #conv_x3 = torch.relu(self.conv1(conv_x2))

        pooled_x = self.pooling_layer(conv_x1)

        fc_layer_in = pooled_x.reshape((pooled_x.shape[0], -1))
        
        fc_x1 = torch.relu(self.fc1(fc_layer_in))
        fc_x2 = torch.relu(self.fc2(fc_x1))
        fc_x3 = self.fc3(fc_x2)
        
        return torch.sigmoid(fc_x3)
    

In [19]:
# split into training and testing
#transformer = KernelPCA(n_components=40, kernel='cosine')
N_MELS = 20
mfcc = transforms.MFCC(sample_rate=48000, n_mfcc=N_MELS, melkwargs={"n_fft": 400})
mfcc_data = []
for data in data_array:
    mfcc_image = mfcc(torch.tensor(data).float())
    mfcc_data.append(np.array(mfcc_image))
#mfcc_data_array = [np.array(mfcs(torch.tensor(data).float())) for data in data_array] #transform into mfcc before model
train_data, test_data, train_labels, test_labels = train_test_split(mfcc_data, label_array, train_size=0.75, test_size=0.25)
#train_data = mfcc_data_array
#train_labels = label_array
#print(data.shape, train_data.shape, test_data.shape)
for i in range(len(train_data)):
    # print(train_data[i].shape)
    train_data[i] = train_data[i].reshape(1, N_MELS, 1201)
for i in range(len(test_data)):
    test_data[i] = test_data[i].reshape(1, N_MELS, 1201)
train_ten, test_ten = torch.tensor(train_data), torch.tensor(test_data)
train_y_ten, test_y_ten = torch.tensor(train_labels), torch.tensor(test_labels)
# print(train_ten.shape)
train_dataset = tdata.TensorDataset(train_ten, train_y_ten)

train_loader = tdata.DataLoader(train_dataset, batch_size=50) #this is the data loader (just doing 1 point at a time)

In [20]:
print(train_ten.shape)
print(test_ten.shape)
print(train_y_ten.shape)
print(test_y_ten.shape)

torch.Size([843, 1, 20, 1201])
torch.Size([282, 1, 20, 1201])
torch.Size([843])
torch.Size([282])


In [26]:
def test_new_model():
  n_epochs = 100 #essentially number of iterations

  #instantiating a model
  model = CNN()

  #loss function
  criterion = torch.nn.CrossEntropyLoss() #binary cross-entropy loss, clamps log values to protect against extremes

  #optimizer mechanism
  optimizer = torch.optim.Adam(lr=1e-4, params=model.parameters())

  loss_array = np.zeros(n_epochs)

  for epoch_idx in range(n_epochs):
    for batch in train_loader:
      data_batch, labels_batch = batch

      #predict
      labels_pred = model(data_batch.to(torch.float32))
    
      #compute loss
      loss = criterion(labels_pred, labels_batch) #note: cross entropy is not symmetric

      loss_array[epoch_idx] += loss.item()/data_batch.shape[0] #mean of loss
  
      #backprop
      optimizer.zero_grad()
      loss.backward() #backprop from pytorch
      optimizer.step() #all params optimized
    #if epoch_idx%50 == 0:
    #  fig.data[0].y = loss_array[:epoch_idx]
    #  fig.update_yaxes(type='log')

  test_pred = []
  test_dataset = tdata.TensorDataset(test_ten, test_y_ten)
  test_loader = tdata.DataLoader(test_dataset, batch_size=50) 
  with torch.no_grad():
    for test in test_loader:
      label_histogram = model(test[0].to(torch.float32))
      for histogram in label_histogram:
        test_pred.append(np.argmax(histogram).item())


  test_pred = torch.tensor(test_pred)
  accuracy = Accuracy(task="multiclass", num_classes=8)
  computed_accuracy = accuracy(test_pred, test_y_ten)
  print("Accuracy =", computed_accuracy)
  return (model, computed_accuracy)

In [27]:
best_model = None
best_accuracy = 0.0
num_tests = 10

for i in range(num_tests):
    result = test_new_model()
    if result[1] > best_accuracy:
        best_accuracy = result[1]
        best_model = result[0]
        print("Accuracy improved to", best_accuracy)
    print("Model", i, "trained.")

print(best_accuracy)

Accuracy = tensor(0.3936)
Accuracy improved to tensor(0.3936)
Model 0 trained.
Accuracy = tensor(0.4468)
Accuracy improved to tensor(0.4468)
Model 1 trained.
Accuracy = tensor(0.3511)
Model 2 trained.
Accuracy = tensor(0.1099)
Model 3 trained.
Accuracy = tensor(0.3830)
Model 4 trained.
Accuracy = tensor(0.3759)
Model 5 trained.
Accuracy = tensor(0.3830)
Model 6 trained.
Accuracy = tensor(0.3511)
Model 7 trained.
Accuracy = tensor(0.3369)
Model 8 trained.
Accuracy = tensor(0.0567)
Model 9 trained.
tensor(0.4468)


In [22]:
# test_pred = []
# test_dataset = tdata.TensorDataset(test_ten, test_y_ten)
# test_loader = tdata.DataLoader(test_dataset, batch_size=50) 
# with torch.no_grad():
#   for test in test_loader:
#     # convolutional layer
    
#     label_histogram = model(test[0].to(torch.float32))
#     for histogram in label_histogram:
#       test_pred.append(np.argmax(histogram).item())


# test_pred = torch.tensor(test_pred)
# print(classification_report(test_y_ten, test_pred))

              precision    recall  f1-score   support

           0       0.18      0.25      0.21        20
           1       0.55      0.15      0.23        41
           2       0.20      0.54      0.29        35
           3       0.18      0.33      0.24        39
           4       0.89      0.24      0.37        34
           5       0.30      0.08      0.13        36
           6       0.58      0.38      0.45        40
           7       0.44      0.38      0.41        37

    accuracy                           0.29       282
   macro avg       0.41      0.29      0.29       282
weighted avg       0.43      0.29      0.30       282

