# Imports

In [28]:
from functions import getClasses
from functions import createFolder
import csv
import os
import glob
import matplotlib.pyplot as plt
import sys
import numpy as np

#Comment out if not generating feature folders
sys.path.append('/home/arclab/Documents/FlorianHwk/ECE271B/pyAudioAnalysis/')
from pyAudioAnalysis import audioFeatureExtraction as aT
import audioBasicIO

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm

# Names of the classes handled by this notebook. getClasses returns four
# values; only the first one is kept, the other three are discarded.
# NOTE(review): the meaning of the three file arguments is defined in
# functions.getClasses, which is not visible here - confirm before changing.
ourClasses, _, _, _ = getClasses('classes.csv', "data/ontology.json", 'data/class_labels_indices.csv')

# Get all mid-term (mt) features for a given audio file

In [52]:
def getMidtermFeatures(filepath):
    """Return the mid-term feature vector of one audio file, or -1 on failure.

    Short-term features are extracted with non-overlapping 50 ms frames and
    then summarised over non-overlapping 1 s mid-term windows, so window size
    equals step size and every mid-term window covers a whole number of
    short-term frames.  The recording is assumed to be 10 seconds long:
    exactly ten mid-term windows are always produced.

    Args:
        filepath: path of the audio file to analyse.

    Returns:
        A 1-D numpy array holding, for every mid-term window, the mean of each
        short-term feature, followed by the per-window standard deviations.
        The int -1 is returned instead when the file could not be read as an
        array or holds less than a tenth of a second of samples; callers
        detect failure by checking for a non-ndarray result.
    """
    mtWin = 1          # mid-term window (and step) in seconds
    stWin = 0.05       # short-term window in seconds
    stStep = 0.05      # short-term step in seconds
    clipSeconds = 10   # assumed length of every recording

    [Fs, x] = audioBasicIO.readAudioFile(filepath)
    # Anything that is not an array is treated as a failed read.
    if not isinstance(x, np.ndarray):
        return -1

    x = audioBasicIO.stereo2mono(x)
    if x.shape[0] < float(Fs) / 10:
        print("  (AUDIO FILE TOO SMALL - SKIPPING)")
        return -1

    stFeatures = aT.stFeatureExtraction(x, Fs, round(stWin * Fs), round(stStep * Fs))

    # Explicit int conversions keep range() and slicing valid regardless of
    # whether "/" performs integer or true division.
    numStPerMt = int(round(mtWin / stStep))
    numMtWindows = int(round(float(clipSeconds) / mtWin))

    # One column block of short-term frames per mid-term window.
    windows = [stFeatures[:, i * numStPerMt:(i + 1) * numStPerMt]
               for i in range(numMtWindows)]

    mtFeaturesMean = np.array([np.mean(w, 1) for w in windows])
    # Standard deviation, not variance: kept as in the original feature files.
    mtFeaturesStd = np.array([np.std(w, 1) for w in windows])

    return np.hstack((mtFeaturesMean.flatten(), mtFeaturesStd.flatten()))


# Generate folders of all features!!

In [53]:
# No need to run this cell if you already have the train_audioFeaturesFull and
# test_audioFeaturesFull folders; just put them inside the data/ folder.
def _generateFeatureFolder(split):
    """Write one feature CSV per class for the given data split.

    For every class in ourClasses, each audio file found under
    data/<split>_rawAudio/<class>/ is passed to getMidtermFeatures and, when
    extraction succeeds, stored as one row of
    data/<split>_audioFeaturesFull/<class>.csv: the file name without
    directory and extension, followed by the features with 10 decimals.

    Args:
        split: prefix of the data folders, "train" or "test".
    """
    createFolder("data/" + split + "_audioFeaturesFull")
    for c in ourClasses:
        directory = "data/" + split + "_rawAudio/" + c + "/"

        wavFilesList = []
        for pattern in ('*.wav', '*.aif', '*.aiff', '*.mp3', '*.au'):
            wavFilesList.extend(glob.glob(os.path.join(directory, pattern)))

        csvPath = 'data/' + split + '_audioFeaturesFull/' + c + '.csv'
        # The csv module expects a binary file on Python 2 and a text file
        # opened with newline='' on Python 3.
        if sys.version_info[0] < 3:
            csvfile = open(csvPath, 'wb')
        else:
            csvfile = open(csvPath, 'w', newline='')

        with csvfile:
            csvWriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)

            for audioPath in wavFilesList:
                print("Extracting features from {}".format(audioPath))
                features = getMidtermFeatures(audioPath)
                # getMidtermFeatures signals failure with a non-array value.
                if not isinstance(features, np.ndarray):
                    continue

                name = os.path.splitext(os.path.basename(audioPath))[0]
                csvWriter.writerow([name] + ["%.10f" % feat for feat in features])


for split in ("train", "test"):
    _generateFeatureFolder(split)
            


# Get Generated Features

In [70]:
# Assumes the feature folders have already been generated; see generateFeatures.ipynb
def getFeatureMatrices(pathToFeatureCSV):
    """Load a feature CSV into a matrix plus the list of row names.

    Every non-empty row is expected to hold a name in the first column
    followed by numeric feature values.  NaN values are replaced by 0 and
    infinities by large finite numbers (numpy.nan_to_num).  Empty lines are
    skipped.

    NOTE(review): the writer in this file uses quotechar='|' while this reader
    keeps the csv default; that only matters if a name needs quoting.

    Args:
        pathToFeatureCSV: path of the CSV file to read.

    Returns:
        (data, names): data is a numpy array with one row of floats per CSV
        row, names is the list of first-column strings in file order.
    """
    data = []
    names = []
    with open(pathToFeatureCSV, 'r') as f:
        for row in csv.reader(f, delimiter=','):
            if not row:
                # Blank line: nothing to parse.
                continue

            names.append(row[0])
            # The builtin float replaces the removed np.float alias.
            values = np.array(row[1:]).astype(float)
            data.append(np.nan_to_num(values))

    return np.array(data), names

# Get an equal positive set and negative set for a class

In [71]:
def getFeatureSetPosNeg(pathToDirWithCSV, cName, ourClasses, negDivisor=6):
    """Build the positive and negative training sets for one class.

    The positive set is every row of <pathToDirWithCSV><cName>.csv.  The
    negative set is gathered from the CSV of every other class: rows whose
    name also appears in the positive set are skipped, and collection from a
    class stops once the per-class quota (number of positives floor-divided by
    negDivisor) is reached.  The quota is checked after appending, so at least
    one usable row is taken from every other class that has one.

    Args:
        pathToDirWithCSV: directory holding the per-class CSVs; it is
            concatenated directly with the file name, so it must end with a
            path separator.
        cName: name of the positive class.
        ourClasses: iterable of all class names.
        negDivisor: divisor applied to the positive count to obtain the
            per-class negative quota.  Defaults to 6, the previously
            hard-coded value (presumably the number of other classes, giving
            roughly balanced sets - confirm against classes.csv).

    Returns:
        (data_pos, data_neg) as numpy arrays.
    """
    data_pos, names = getFeatureMatrices(pathToDirWithCSV + cName + ".csv")
    # Set lookup keeps the overlap test cheap for large positive sets.
    positiveNames = set(names)
    # Floor division keeps the quota an integer under any division semantics.
    quotaPerClass = data_pos.shape[0] // negDivisor

    data_neg = []

    for c in ourClasses:
        if c == cName:
            continue

        data_new, names_new = getFeatureMatrices(pathToDirWithCSV + c + ".csv")
        taken = 0
        for name, featureRow in zip(names_new, data_new):
            # A clip labelled with the positive class must not be a negative.
            if name in positiveNames:
                continue

            data_neg.append(featureRow)
            taken += 1
            if taken >= quotaPerClass:
                break

    return data_pos, np.array(data_neg)

# Calculates test error given a list of classifiers

In [72]:
def getTestErrorValue(clf_list, ourClasses, pathToDirWithTestCSV):
    """Return the overall test error of a list of per-class classifiers.

    clf_list[i] is scored against the names found in the test CSV of
    ourClasses[i], so both sequences must be in the same order.  Every
    classifier is run on the test rows of every class; a prediction counts as
    correct when it is truthy for a row whose name appears in that
    classifier's class CSV, or falsy for a row whose name does not.

    Args:
        clf_list: fitted classifiers exposing predict(data).
        ourClasses: class names, one test CSV per class.
        pathToDirWithTestCSV: directory holding the test CSVs; it is
            concatenated directly with the file name, so it must end with a
            path separator.

    Returns:
        1 - (correct decisions / total decisions) as a float.

    Raises:
        ZeroDivisionError: if no decision was evaluated at all (no test rows
            or no classifiers).
    """
    dataSets = []
    nameSets = []
    for c in ourClasses:
        data, names = getFeatureMatrices(pathToDirWithTestCSV + c + ".csv")
        dataSets.append(data)
        nameSets.append(names)

    # Membership is tested once per (row, classifier) pair; sets keep it O(1).
    nameLookups = [set(names) for names in nameSets]

    numCorrect_test = 0
    num_test = 0

    for data, names in zip(dataSets, nameSets):
        for i, clf in enumerate(clf_list):
            result = clf.predict(data)
            classNames = nameLookups[i]

            for j, name in enumerate(names):
                num_test += 1
                # Correct when the prediction agrees with class membership.
                if bool(result[j]) == (name in classNames):
                    numCorrect_test += 1

    return 1 - float(numCorrect_test) / float(num_test)

# Performing LDA

In [73]:
print("Using LDA")

# One binary (class vs. rest) LDA classifier per class, in ourClasses order.
clf_list = []

for c in ourClasses:
    data_pos, data_neg = getFeatureSetPosNeg("data/train_audioFeaturesFull/", c, ourClasses)

    # Positives first (label 1), negatives after (label 0).
    dataset = np.vstack([data_pos, data_neg])
    labels = np.concatenate([np.ones(data_pos.shape[0]), np.zeros(data_neg.shape[0])])

    # A stray "LDA(..., tol=0.1)" expression used to follow fit(); it built a
    # second estimator and discarded it, so the default settings apply.
    clf = LDA()
    clf.fit(dataset, labels)

    trainError = 1 - clf.score(dataset, labels)
    print("Training classifier " + c + " training error: {}".format(trainError))
    clf_list.append(clf)

testE = getTestErrorValue(clf_list, ourClasses, "data/test_audioFeaturesFull//")
print("Test Error  : {}".format(testE))


#  Performing Neural Network

In [74]:
print("Using Neural Network")

# Hidden-layer sizes shared by every per-class network. The former
# commented-out per-class lookup (l1_list[i], l2_list[i]) has been dropped.
l1 = 200
l2 = 50

# One binary (class vs. rest) network per class, in ourClasses order.
clf_list = []

for c in ourClasses:
    data_pos, data_neg = getFeatureSetPosNeg("data/train_audioFeaturesMean/", c, ourClasses)

    # Positives first (label 1), negatives after (label 0).
    dataset = np.vstack([data_pos, data_neg])
    labels = np.concatenate([np.ones(data_pos.shape[0]), np.zeros(data_neg.shape[0])])

    clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(l1, l2), random_state=1)
    clf.fit(dataset, labels)

    trainError = 1 - clf.score(dataset, labels)
    print("Training classifier " + c + " training error: {}".format(trainError))

    clf_list.append(clf)

testE = getTestErrorValue(clf_list, ourClasses, "data/test_audioFeaturesMean//")
print("Test Error  : {}".format(testE))

# Performing Boosting

In [75]:
print("Using Adaboost")

# One binary (class vs. rest) AdaBoost model per class, in ourClasses order.
clf_list = []

for c in ourClasses:
    data_pos, data_neg = getFeatureSetPosNeg("data/train_audioFeaturesMean/", c, ourClasses)

    # Stack positives above negatives and label them 1 and 0 respectively.
    nPos = data_pos.shape[0]
    nNeg = data_neg.shape[0]
    dataset = np.vstack([data_pos, data_neg])
    labels = np.concatenate([np.ones(nPos), np.zeros(nNeg)])

    clf = AdaBoostClassifier(n_estimators=75)
    clf.fit(dataset, labels)

    trainError = 1 - clf.score(dataset, labels)
    print("Training classifier " + c + " training error: {}".format(trainError))

    clf_list.append(clf)

testE = getTestErrorValue(clf_list, ourClasses, "data/test_audioFeaturesMean/")
print("Test Error  : {}".format(testE))

# Performing SVM

In [76]:
print("Using SVM")

# One binary (class vs. rest) linear SVM per class, in ourClasses order.
clf_list = []

for c in ourClasses:
    data_pos, data_neg = getFeatureSetPosNeg("data/train_audioFeaturesMean/", c, ourClasses)

    # Rows of positives come first, so the label vector is ones then zeros.
    dataset = np.vstack([data_pos, data_neg])
    labels = np.concatenate([np.ones(data_pos.shape[0]), np.zeros(data_neg.shape[0])])

    clf = svm.SVC(kernel='linear')
    clf.fit(dataset, labels)

    trainError = 1 - clf.score(dataset, labels)
    print("Training classifier " + c + " training error: {}".format(trainError))

    clf_list.append(clf)

testE = getTestErrorValue(clf_list, ourClasses, "data/test_audioFeaturesMean/")
print("Test Error  : {}".format(testE))