In [12]:
# this code implements a deep neural network for feature learning
# this code comes from Vishwa's class on constructing a data loader and creating a model

# Some common system imports
import os
import sys
import importlib
import time
import csv
import random
from subprocess import call

# Numeric computing
import numpy as np

# Sklearn functions are useful for generating train/test splits, and metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz

from scipy.io import wavfile

# pytorch
import torch
import torch.utils.data as tdata
from torchaudio import transforms

# Plotting (if we want it)
import matplotlib.pyplot as plt

# importing our own modules
import audio_datasets as ads

from IPython.display import Image

In [3]:
# loading the raw data for feature learning
training_path = os.path.join(os.getcwd(), "..", "training_data") #need to change this back before pushing ***
files = os.listdir(training_path)
wav_files = []
file_type = "wav"
LABELS = {"neutral": 0, "calm": 1, "happy": 2, "sad": 3, "angry": 4, "fearful": 5, "disgust": 6, "surprised": 7}

for file in files:
    curr_path = os.path.join(training_path, file)
    if os.path.isfile(curr_path) and file_type in file:
        for label in LABELS.keys():
            if label in file:
                wav_files.append((file, LABELS[label]))

data_array = []
label_array = []
max_size = 0

for data in wav_files: #in the my_datasets code
    wav_file = wavfile.read(os.path.join(training_path, data[0]))
    audio_data = wav_file[1]

    try:
        print("Two track audio file detected", audio_data[0][0])
        audio_data = audio_data[0]
    except:
        pass

    sample_array = [sample for sample in audio_data if abs(sample) > 2]

    if len(sample_array) > max_size:
        max_size = len(sample_array)

    data_array.append(np.array(sample_array))
    label_array.append(data[1])

# original_len = len(data_array)
# print(original_len)
# for i in range(original_len // 2):
#     selection = random.randint(0, original_len - 1)
#     data_array.append(data_array[selection])
#     label_array.append(label_array[selection])

max_len = max([len(data) for data in data_array])
train_data_array = np.array([np.resize(data, max_size) for data in data_array])
print(train_data_array)

  wav_file = wavfile.read(os.path.join(training_path, data[0]))


Two track audio file detected 0
Two track audio file detected 0
Two track audio file detected 0
Two track audio file detected 0
Two track audio file detected 1
[[  3.  -5. -16. ... -16. -35. -31.]
 [  8.   8.   7. ...  24.  24.  23.]
 [ -5.   6.  30. ... -15.  -5.   4.]
 ...
 [ -4.  -6.  -5. ...  -4.  -4.  -3.]
 [ -3.   3.   3. ... 302. 306. 276.]
 [ -3.  -4.  -3. ... 491. 415. 334.]]


In [5]:
# loading the raw data for feature learning
test_path = os.path.join(os.getcwd(), "..", "test_data")
test_files = os.listdir(test_path)
test_wav_files = []

for file in test_files:
    curr_path = os.path.join(test_path, file)
    if os.path.isfile(curr_path) and file_type in file:
        test_wav_files.append(file)

test_data_array = []
for data in test_wav_files: #in the my_datasets code
    waveform = wavfile.read(os.path.join(test_path, data))[1]
    audio_data = [sample for sample in waveform if abs(sample) > 2]
    test_data_array.append(np.array(audio_data))

test_data_array = np.array([np.resize(data, max_size) for data in test_data_array])

  waveform = wavfile.read(os.path.join(test_path, data))[1]


In [6]:
train_ten, test_ten = torch.tensor(train_data_array), torch.tensor(test_data_array)
train_y_ten = torch.tensor(label_array)
print(len(label_array))

1125


In [9]:
#implementing the Mel-frequency cepstrum coefficients as feature vectors
mfcc = transforms.MFCC(sample_rate=48000, n_mfcc=40)
print(train_ten.shape)
#mel_spectrogram = transforms.MelSpectrogram(sample_rate=48000, n_fft=400, n_mels=1)
#print("2", mel_spectrogram(train_ten[0].float()).shape)
#print("2.5", mfcc(mel_spectrogram(train_ten[0].float())).shape)
print(mfcc(train_ten[0].float()).shape)
mfcc_features = np.zeros((len(label_array), 40640))
for row in range(len(label_array)):
    mel_coef = np.array(mfcc(train_ten[row].float()))
    mel_coef = mel_coef.reshape((1, 40640))
    mfcc_features[row] = mel_coef

print("3", train_ten.size())
print("4", mfcc_features.shape)
print("5", train_y_ten.shape)


torch.Size([1125, 203174])
torch.Size([40, 1016])
3 torch.Size([1125, 203174])
4 (1125, 40640)
5 torch.Size([1125])


In [10]:
#implementing the Mel-frequency cepstrum coefficients as feature vectors
mfcc2 = transforms.MFCC(sample_rate=48000, n_mfcc=40)
#mel_spectrogram = transforms.MelSpectrogram(sample_rate=48000, n_fft=400, n_mels=1)
#print("2", mel_spectrogram(train_ten[0].float()).shape)
#print("2.5", mfcc(mel_spectrogram(train_ten[0].float())).shape)
mfcc_test_features = np.zeros((len(test_data_array), 40640))
for row in range(len(test_data_array)):
    mel_coef = np.array(mfcc2(test_ten[row].float()))
    mel_coef = mel_coef.reshape((1, 40640))
    mfcc_test_features[row] = mel_coef


In [20]:
forest2 = RandomForestClassifier(n_estimators=10) #gnb and y_pred lines directly from scikit-learn website documentation for Naive-Bayes
print(mfcc_features.shape, train_y_ten.shape)
model = forest2.fit(mfcc_features, label_array)
y_pred2 = model.predict(mfcc_test_features)
y_pred2 = list(y_pred2)

# print(classification_report(test_y_ten, test_pred))
# correct = 0
# for label in range(len(y_pred2)):
#     if y_pred2[label] == test_labels[label]:
#         correct += 1
# print("Number of correct labels: ", correct)

# VISUALIZATION
print(forest2.decision_path(mfcc_test_features))

estimator = model.estimators_[0]
export_graphviz(estimator, out_file='tree.dot', 
                rounded = True, proportion = False, 
                precision = 2, filled = True,
                max_depth = 3)

call(['dot', '-Tpng', 'tree.dot', '-o', 'tree.png', '-Gdpi=600'])
Image(filename = 'tree.png')

# Saving Results
LABEL_CONVERTER = dict([(value, key) for key, value in LABELS.items()])

#counting the mislabeled points
with open("random_forest_results2.csv", "w+") as result_file:
   writer = csv.writer(result_file)
   writer.writerow(["filename", "label"])

   for idx in range(len(y_pred2)):
       writer.writerow([os.path.splitext(test_wav_files[idx])[0], LABEL_CONVERTER[y_pred2[idx]]])
#print("Number of mislabeled points out of a total %d points : %d", (len(test_labels), (test_labels != y_pred2).sum()))

(1125, 40640) torch.Size([1125])
(<315x4708 sparse matrix of type '<class 'numpy.int64'>'
	with 33157 stored elements in Compressed Sparse Row format>, array([   0,  433,  928, 1391, 1862, 2333, 2818, 3285, 3746, 4203, 4708]))
