# Lab 8: Audio Classification 

## Part 1: Making a speech detector

In this section we will design a simple classifier that will let us know if its input is speech or non-speech. Download the data archive from: [ https://drive.google.com/file/d/1oAnvk-hzzgzZ4di4W0pKw6v3IWLm9u2X/view?usp=sharing ] In this part we will use the dataset in data/SpeechMusic. In it you will find two directories, speech/ and music/ containing data from each class.

Randomly select 50 soundfiles from each directory to use as training data, and use the remaining sounds as testing data. For all of the sounds we will compute a representation that makes the classification easier and we will use a simple Gaussian model to classify them. Do the following:

- Perform an STFT for each sound, take it’s magnitude and raise it to 0.3 to improve contrast
    - We will consider each spectral slice of that to be a data point
- Using the training data of each sound:
    - Calculate the mean column and the diagonal covariance of the columns
    - You will thus get two sets of Gaussian parameters that model each sound class
- For each testing data point:
    - Calculate the likelihood of each column based on the above models
	- To calculate the entire file likelihood add all the frame likelihoods
	- Assign each soundfile to the class that gets the highest likelihood

For extra credit implement the parameter estimation and model likelihood yourself. If you are too lazy for that you can instead use ```sklearn.mixture.GaussianMixture``` to learn a diagonal single-Gaussian model per class.

How do the results look like? If you rerun this with a different training/testing set, is there an appreciable difference? On average over multiple training/testing sets what accuracy do you get?

In [83]:
# YOUR CODE HERE
import numpy as np
import math
import IPython
import matplotlib.pyplot as plt
import scipy.io.wavfile as wavfile
import scipy.signal as sg
from tqdm import tqdm
from os import listdir
from sklearn.model_selection import train_test_split 
from sklearn.mixture import GaussianMixture

def sound( x, rate=8000, label=''):
    from IPython.display import display, Audio, HTML
    display( HTML( 
    '<style> table, th, td {border: 0px; }</style> <table><tr><td>' + label + 
    '</td><td>' + Audio( x, rate=rate)._repr_html_()[3:] + '</td></tr></table>'
    ))

def stft( input_sound, dft_size, hop_size, zero_pad, window):
    # YOUR CODE HERE
    padZeroes = np.zeros(dft_size)
    pad = np.append(padZeroes, input_sound)
    pad = np.append(pad, padZeroes)
    tempMatrix = []
    for i in range(0, len(pad)-dft_size, hop_size):
        tempMatrix.append(pad[i:i+dft_size] * window)
    out = []
    for i in range(len(tempMatrix)):
        rfft = np.fft.rfft(tempMatrix[i], dft_size+zero_pad)
        out.append(np.reshape(rfft, (-1, 1)))     
    return np.hstack(out)

def istft( stft_output, dft_size, hop_size, zero_pad, window):
    # YOUR CODE HERE
    irfftSounds = []
    for segment in stft_output.T:
        irfftSounds.append(np.fft.irfft(segment, dft_size+zero_pad))
    irfftSounds = np.array(irfftSounds)
    input_sound = np.zeros(dft_size+hop_size*(len(irfftSounds)-1))
    for i in range(len(irfftSounds)):
        input_sound[i*hop_size:i*hop_size+dft_size] += (irfftSounds[i][:dft_size] * window)
    input_sound = input_sound[dft_size:]
    return input_sound

def loadDir(directory):
    audioData = []
    rate = 0
    for filename in tqdm(listdir(directory)):
        if filename.endswith(".wav"):
            curFilePath = os.path.join(directory, filename)
            #rate should be same for all files, only 1 variable needed
            wavRate, wavData = wavfile.read(curFilePath)
            audioData.append(wavData)
            if rate != wavRate:
                rate = wavRate
    return audioData, rate


def getSTFT(dataList):
    stftList = []
    for data in tqdm(dataList):
        curSTFT = stft(data,  dft_size, hop_size, zero_pad, window)
        magnitude = np.abs(curSTFT)
        curSTFT = np.power(magnitude, 0.3)
        stftList.append(curSTFT)
    return stftList

def getPerformance(gaussianSpeech, gaussianMusic, speechTest, musicTest):
    correct = 0
    for speech in speechTest:
        scoreSpeech = gaussianSpeech.score(speech)
        scoreMusic = gaussianMusic.score(speech)
        if scoreSpeech > scoreMusic:
            correct += 1
    for music in musicTest:
        scoreSpeech = gaussianSpeech.score(music)
        scoreMusic = gaussianMusic.score(music)
        if scoreSpeech < scoreMusic:
            correct += 1
    testSize = len(speechTest) + len(musicTest)
    accuracy = correct/testSize
    print("accuracy: ", accuracy)
    print("correct/testSize: ", correct, "/", testSize)
    return accuracy

############### you can ignore all the stuff between here #################



#computer performance method should work, as it is from a past project of mine, 
#but the numbers it gives me are weird for accuracy, so I probably implemented it wrong

# def computePerformance(predictedLabels, actualLabels):
#     yhats = predictedLabels
#     accuracy = np.mean(yhats == actualLabels)
#     tp = np.sum([yhats[i] == actualLabels[i] and yhats[i] == 1 for i in range(len(yhats))])
#     precision = tp / np.sum([yhats[i] == 1 for i in range(len(yhats))])
#     recall = tp / (np.sum([yhats[i] != actualLabels[i] and yhats[i] == 0 for i in range(len(yhats))]) + tp)
#     f1 = 2 * (precision * recall) / (precision + recall)
#     return accuracy, f1, precision, recall

#Defunct, the fancy np.mean accuracy didn't work and I didn't have time to fix
# def getPerformance(gaussianSpeech, gaussianMusic, speechTest, musicTest):
#     predictedLabels = []
#     actualLabels = []
#     correct = 0
#     #assume 1 is speech, 0 is music
#     for speech in speechTest:
#         scoreSpeech = gaussianSpeech.score(speech)
#         scoreMusic = gaussianMusic.score(speech)
#         if scoreSpeech > scoreMusic:
#             correct += 1
#             predictedLabels.append(1) #speech
#         else:
#             predictedLabels.append(0) #music
#         actualLabels.append(1)
#     for music in musicTest:
#         scoreSpeech = gaussianSpeech.score(music)
#         scoreMusic = gaussianMusic.score(music)
#         if scoreSpeech < scoreMusic:
#             correct += 1
#             predictedLabels.append(0) #music
#         else:
#             predictedLabels.append(1) #speech
#         actualLabels.append(0)
#     print(predictedLabels)
#     print(actualLabels)
#     accuracy = np.mean(predictedLabels == actualLabels)
#     testSize = len(speechTest) + len(musicTest)
#     accuracy2 = correct/testSize
#     print("accuracy ", accuracy)
#     print("accuracy2 ", accuracy2)
# #     accuracy, f1, precision, recall = computePerformance(predictedLabels, actualLabels)
# #     return accuracy, f1, precision, recall


############### and here #################

In [86]:
dft_size = 1024
hop_size = 256
zero_pad = 0
window = sg.hann(dft_size)

dataDir = "data/SpeechMusic/"
musicDir = dataDir+"music/"
speechDir = dataDir+"speech/"

train_size = 50
test_size = 10

#note, this will still print accuracy, but that's what you would want anyway
def runTrialNoPrints(musicDir, speechDir, dft_size, hop_size, zero_pad, window, random = True, random_state = 64):
    audioDataMusic, rateMusic = loadDir(musicDir)
    audioDataSpeech, rateSpeech = loadDir(speechDir)

    stftMusic = getSTFT(audioDataMusic)
    stftSpeech = getSTFT(audioDataSpeech)

    if random:
        speechTrain, speechTest, musicTrain, musicTest = train_test_split( stftSpeech, stftMusic, train_size=train_size, test_size=test_size)
    else:
        speechTrain, speechTest, musicTrain, musicTest = train_test_split( stftSpeech, stftMusic, train_size=train_size, test_size=test_size, random_state=random_state)
    speechTrain = np.concatenate(np.array(speechTrain), axis=0)
    musicTrain = np.concatenate(np.array(musicTrain), axis=0)

    #create and fit Gaussian models
    gaussianSpeech = GaussianMixture(5, "diag",verbose=1)
    gaussianSpeech = gaussianSpeech.fit(speechTrain)

    gaussianMusic = GaussianMixture(5, "diag",verbose=1)
    gaussianMusic = gaussianMusic.fit(musicTrain)

    
    #get performance
    accuracy = getPerformance(gaussianSpeech, gaussianMusic, speechTest, musicTest)
    return accuracy


def runTrial(musicDir, speechDir, dft_size, hop_size, zero_pad, window, random = True, random_state = 64):
    print("loading files")
    audioDataMusic, rateMusic = loadDir(musicDir)
    audioDataSpeech, rateSpeech = loadDir(speechDir)
    print("--- loading files complete ---")

    print("performing STFT")
    stftMusic = getSTFT(audioDataMusic)
    stftSpeech = getSTFT(audioDataSpeech)
    print("--- STFT complete ---")


    print("split data")
    if random:
        speechTrain, speechTest, musicTrain, musicTest = train_test_split( stftSpeech, stftMusic, train_size=train_size, test_size=test_size)
    else:
        speechTrain, speechTest, musicTrain, musicTest = train_test_split( stftSpeech, stftMusic, train_size=train_size, test_size=test_size, random_state=random_state)
    speechTrain = np.concatenate(np.array(speechTrain), axis=0)
    musicTrain = np.concatenate(np.array(musicTrain), axis=0)
    print(len(speechTrain))
    print(len(speechTest))
    print(len(musicTrain))
    print(len(musicTest))
    print("---- data split complete ---- ")


    #create and fit Gaussian models
    print("fit Speech model")
    gaussianSpeech = GaussianMixture(5, "diag",verbose=1)
    gaussianSpeech = gaussianSpeech.fit(speechTrain)
    print("---- Speech model fitted ---- ")

    print("fit Music model")
    gaussianMusic = GaussianMixture(5, "diag",verbose=1)
    gaussianMusic = gaussianMusic.fit(musicTrain)
    print("---- Music model fitted ---- ")

    
    #get performance
    print("calculating performance")
    accuracy = getPerformance(gaussianSpeech, gaussianMusic, speechTest, musicTest)
    print("---- performance calculated -----")
    return accuracy
    

In [87]:
#this is an example run, because running the 10 trials takes forever

accuracy = runTrial(musicDir, speechDir, dft_size, hop_size, zero_pad, window,random=False,random_state=64)

loading files


100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1363.67it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1500.06it/s]


--- loading files complete ---
performing STFT


100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 15.97it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 15.55it/s]


--- STFT complete ---
split data
25650
10
25650
10
---- data split complete ---- 
fit Speech model




Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
Initialization converged: True
---- Speech model fitted ---- 
fit Music model




Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
Initialization converged: True
---- Music model fitted ---- 
calculating performance
accuracy:  0.9
correct/testSize:  18 / 20
---- performance calculated -----


In [91]:
accuracies = []
for i in range(10):
    print("\n\n\n--- Trial Number :", i+1,"--- ")
    accuracy = runTrialNoPrints(musicDir, speechDir, dft_size, hop_size, zero_pad, window, random=True)
    accuracies.append(accuracy)

print("accuracies are:", accuracies)




--- Trial Number : 1 --- 


100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 2000.05it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 2307.52it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 15.93it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 16.53it/s]


Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
Initialization converged: True




Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
  Iteration 50
  Iteration 60
Initialization converged: True
accuracy:  0.85
correct/testSize:  17 / 20



--- Trial Number : 2 --- 


100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1538.51it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1935.46it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:04<00:00, 14.85it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 15.49it/s]


Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
Initialization converged: True




Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
  Iteration 50
Initialization converged: True
accuracy:  0.8
correct/testSize:  16 / 20



--- Trial Number : 3 --- 


100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1153.89it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1395.44it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 15.68it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 16.38it/s]


Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
Initialization converged: True




Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
  Iteration 50
  Iteration 60
Initialization converged: True
accuracy:  0.85
correct/testSize:  17 / 20



--- Trial Number : 4 --- 


100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1579.02it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1818.27it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:04<00:00, 15.92it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 15.61it/s]


Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
Initialization converged: True




Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
Initialization converged: True
accuracy:  0.8
correct/testSize:  16 / 20



--- Trial Number : 5 --- 


100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1579.04it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1714.23it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:04<00:00, 13.63it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:04<00:00, 16.44it/s]


Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
Initialization converged: True




Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
Initialization converged: True
accuracy:  0.85
correct/testSize:  17 / 20



--- Trial Number : 6 --- 


100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1250.03it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1333.32it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 16.31it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 15.35it/s]


Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
Initialization converged: True




Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
Initialization converged: True
accuracy:  0.85
correct/testSize:  17 / 20



--- Trial Number : 7 --- 


100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1666.56it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1764.76it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 16.38it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 16.11it/s]


Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
  Iteration 50
  Iteration 60
Initialization converged: True




Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
  Iteration 50
  Iteration 60
  Iteration 70
  Iteration 80
  Iteration 90
  Iteration 100
Initialization converged: False




accuracy:  0.8
correct/testSize:  16 / 20



--- Trial Number : 8 --- 


100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1578.97it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1714.36it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 15.93it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 14.83it/s]


Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
Initialization converged: True




Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
  Iteration 50
  Iteration 60
Initialization converged: True
accuracy:  0.85
correct/testSize:  17 / 20



--- Trial Number : 9 --- 


100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1714.23it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1875.08it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 16.23it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 16.23it/s]


Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
Initialization converged: True




Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
  Iteration 50
  Iteration 60
Initialization converged: True
accuracy:  0.85
correct/testSize:  17 / 20



--- Trial Number : 10 --- 


100%|█████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 967.72it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 60/60 [00:00<00:00, 1153.99it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 16.16it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 15.67it/s]


Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
Initialization converged: True




Initialization 0
  Iteration 10
  Iteration 20
  Iteration 30
  Iteration 40
Initialization converged: True
accuracy:  0.9
correct/testSize:  18 / 20


TypeError: unsupported operand type(s) for +: 'NoneType' and 'NoneType'

## After running multiple trials, I found that accuracy varies from run to run by about 0.8 to 0.9 from randomized train and test sets. As you can see from the results of my trial run of 10, the average is roughly 0.84 

## Part 2: Making a music genre classifier

We will repeat the above, but this time we will perform music genre classification. To do so we will use a slightly more elaborate feature representation, and a stronger classification model. If you downloaded the data archive pointed to above, you will find a subset of the CTZAN dataset in the data/genre folder, this is a benchmark data set for music genre classification.

Just as before, you will find a set of directories with examples of each sound class that we want to recognize. For each class, split the soundfiles into a training set (50% of data) and testing set (remaining 50% of data).

For a representation we will use MFCC features. For extra credit, code these yourself otherwise you can use the implementation from the ```librosa``` library. Once all the files are transformed we will have a series of MFCC frames for each recording (as opposed to spectral frames as is in the case of the STFT). We will use these as the data to classify.

For each class learn a Gaussian model (with a diagonal covariance again). This will be the same process as above.
In order to evaluate how good this works we will use the following procedure. For each sound in the training data, get the likelihood of each MFCC frame based on the learned Gaussian models and sum these over the entire file just as we did before. Use the resulting values to get a classification result for each . Report how accurate your results are. Now report the accuracy using your testing data instead.

Now will use a better classifier to hopefully get better accuracy. We will use a Gaussian Mixture Model (```sklearn.mixture.GaussianMixture```). Just as before you should learn one such model for each class using the corresponding training data.

How many Gaussians do you need in your GMM to get the best results? Do the MFCC parameters make a difference? Play around with the numbers to get the best possible results.

In [141]:
import librosa

n_mfcc = 50

# YOUR CODE HERE
def loadMFCCfromDir(directory):
    mfccList = []
    for filename in listdir(directory):
        if filename.endswith(".mp3"):
            curFilePath = os.path.join(directory, filename)
#             print(curFilePath)
            file, rate = librosa.core.load(curFilePath)
            rate = 1293 # no idea what is wrong with this, but this is necessary I think
            mfccSequence = librosa.feature.mfcc(file, rate, n_mfcc=n_mfcc)
            mfccList.append(mfccSequence)
    return mfccList

def getGenrePerformance(genreClassifierList,genreTestList):
    correct = 0
    scores = []
    #indexes are parallel
    totalLength = 0
    for i, genre in enumerate(genreTestList):
        for musicFile in genre:
            scores = [genreClassifier.score(musicFile.T) for genreClassifier in genreClassifierList]
            if np.argmax(scores) == idx:
                correct += 1
            totalTests += 1
    accuracy = correct/totalTests
    print("accuracy: ", accuracy)
    print("correct/testSize: ", correct, "/", testSize)
    return accuracy

# raise NotImplementedError()

In [142]:
# DO NOT RERUN UNLESS NECESSARY, TAKES FOREVER!!
classicalMFCC = loadMFCCfromDir('data/genres/classical')
discoMFCC = loadMFCCfromDir('data/genres/disco')
metalMFCC = loadMFCCfromDir('data/genres/metal')
popMFCC = loadMFCCfromDir('data/genres/pop')
reggaeMFCC = loadMFCCfromDir('data/genres/reggae')


In [146]:
test_size = 0.5
random_state = 35

classicalTrain, classicalTest = train_test_split(classicalMFCC, test_size=test_size, random_state=random_state)
discoTrain, discoTest = train_test_split(discoMFCC, test_size=test_size, random_state=random_state)
metalTrain, metalTest = train_test_split(metalMFCC, test_size=test_size, random_state=random_state)
popTrain, popTest = train_test_split(popMFCC, test_size=test_size, random_state=random_state)
reggaeTrain, reggaeTest = train_test_split(reggaeMFCC, test_size=test_size, random_state=random_state)

print(len(classicalTrain))
print(len(classicalTrain[0]))
print(len(classicalTrain[0][0]))
print(classicalTrain)
test =  np.array(classicalTrain)
print(np.shape(test))

classicalTrain = np.concatenate(np.array(classicalTrain), axis=1)
discoTrain = np.concatenate(np.array(discoTrain), axis=1)
metalTrain = np.concatenate(np.array(metalTrain), axis=1)
popTrain = np.concatenate(np.array(popTrain), axis=1)
reggaeTrain = np.concatenate(np.array(classicalTrain), axis=1)

50
50
1293
[array([[-3.6021286e+02, -3.7209698e+02, -3.8469833e+02, ...,
        -2.9834387e+02, -2.2398697e+02, -1.7270926e+02],
       [ 1.4447214e+02,  1.6152377e+02,  1.8226184e+02, ...,
         1.8255060e+02,  1.7496545e+02,  1.6949576e+02],
       [ 9.1208054e+01,  8.5405022e+01,  7.1491074e+01, ...,
         4.6799530e+01,  3.5596073e+01,  2.6750713e+01],
       ...,
       [ 1.6207131e+00,  1.5676415e+00, -1.1479497e-01, ...,
        -4.8692570e+00, -5.3756499e+00, -5.7174134e+00],
       [-5.6478891e+00, -6.6711264e+00, -3.4443884e+00, ...,
        -1.7203186e+00,  7.3397976e-01, -4.4399238e+00],
       [-3.1149473e+00, -4.3396144e+00, -3.9715466e+00, ...,
        -6.1852798e+00, -3.8738906e+00,  2.9281535e+00]], dtype=float32), array([[-4.1607626e+02, -4.2589386e+02, -4.4093350e+02, ...,
        -4.6985095e+02, -4.5782556e+02, -4.2312402e+02],
       [ 1.5853983e+02,  1.6450873e+02,  1.7083626e+02, ...,
         1.5419751e+02,  1.5693684e+02,  1.4626538e+02],
       [ 6.8061

         2.4534011e-01,  8.5393453e-01, -2.3011351e-01]], dtype=float32)]


  test =  np.array(classicalTrain)


ValueError: could not broadcast input array from shape (50,1293) into shape (50)

In [144]:
n_components = 5

classicalGauss = GaussianMixture(n_components=n_components, covariance_type='diag')
discoGauss = GaussianMixture(n_components=n_components, covariance_type='diag')
metalGauss = GaussianMixture(n_components=n_components, covariance_type='diag')
popGauss = GaussianMixture(n_components=n_components, covariance_type='diag')
reggaeGauss = GaussianMixture(n_components=n_components, covariance_type='diag')

classicalGauss = classicalGauss.fit(classicalTrain.T)
discoGauss = discoGauss.fit(discoTrain.T)
metalGauss = metalGauss.fit(metalTrain.T)
popGauss = popGauss.fit(popTrain.T)
reggaeGauss = reggaeGauss.fit(reggaeTrain.T)

AttributeError: 'list' object has no attribute 'T'

In [145]:
genreTestList = [classicalTest, discoTest, metalTest,popTest,reggaeTest]
genreClassifierList = [classicalGauss, discoGauss, metalGauss,popGauss,reggaeGauss]
accuracy = getGenrePerformance(genreClassifierList,genreTestList)

NotFittedError: This GaussianMixture instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

## This code worked at some point, but as I started implementing more things, I somehow broke how my train data is stored.  Didn't have time to finish debugging it, but I have a feeling it is a simple mistake that I can't find.

## Part 3: Make it better (extra credit, required for 4-hour registrants)

There is no shortage of techniques (and free code) to use for classification. Revisit the two problems above and use any other type of classifier you want (Neural Nets, Boosting, Decision Trees, whatever). Also feel free to use any feature you want. Can you improve on the results you got before? How much higher can you get your accuracy for either case?

In [None]:
# YOUR CODE HERE
# I really wish I had more time to play around with this, but I had a late start :/


raise NotImplementedError()

## problem 1

## no time to implement/run this, but if you wanted to test out other classifiers, you could essentially just reuse the same methods and plug in new Sklearn models, as most models have the same fit, score, and other methods.

In [None]:
# def runTrialAnyClassifier(classifier, musicDir, speechDir, dft_size, hop_size, zero_pad, window, random = True, random_state = 64):
#     print("loading files")
#     audioDataMusic, rateMusic = loadDir(musicDir)
#     audioDataSpeech, rateSpeech = loadDir(speechDir)
#     print("--- loading files complete ---")

#     print("performing STFT")
#     stftMusic = getSTFT(audioDataMusic)
#     stftSpeech = getSTFT(audioDataSpeech)
#     print("--- STFT complete ---")


#     print("split data")
#     if random:
#         speechTrain, speechTest, musicTrain, musicTest = train_test_split( stftSpeech, stftMusic, train_size=train_size, test_size=test_size)
#     else:
#         speechTrain, speechTest, musicTrain, musicTest = train_test_split( stftSpeech, stftMusic, train_size=train_size, test_size=test_size, random_state=random_state)
#     speechTrain = np.concatenate(np.array(speechTrain), axis=0)
#     musicTrain = np.concatenate(np.array(musicTrain), axis=0)
#     print(len(speechTrain))
#     print(len(speechTest))
#     print(len(musicTrain))
#     print(len(musicTest))
#     print("---- data split complete ---- ")


#     #create and fit Gaussian models
#     print("fit Speech model")
#     gaussianSpeech = classifier
#     gaussianSpeech = gaussianSpeech.fit(speechTrain)
#     print("---- Speech model fitted ---- ")

#     print("fit Music model")
#     gaussianMusic = GaussianMixture(5, "diag",verbose=1)
#     gaussianMusic = gaussianMusic.fit(musicTrain)
#     print("---- Music model fitted ---- ")

    
#     #get performance
#     print("calculating performance")
#     accuracy = getPerformance(gaussianSpeech, gaussianMusic, speechTest, musicTest)
#     print("---- performance calculated -----")
#     return accuracy
    