In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
from scipy.io import wavfile
from pylab import*
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [11]:
# 1 == Abnormal, 0 == Normal
def loadData(trainingFolder):
    wav = pd.read_csv('{}/REFERENCE.csv'.format(trainingFolder), header=None, names =['filename', 'outcome'])
    wav_list = []
    wav_name = []
    
    wav.outcome.replace(to_replace=-1, value=0, inplace=True)
    
    for fname in wav.filename:
        path = "./{}/{}.wav".format(trainingFolder, fname)
        sampFreq, snd = wavfile.read(path)
        snd = snd/(2.**15)
        wav_list.append(snd)
        wav_name.append(fname)
    return wav_list, wav

In [12]:
def shortenTo9Sec(sndList):
    snd9SecList = []
    for snd in sndList:
        sndNP = np.asarray(snd)
        if sndNP.size <= 16384:
            #print "Case0"
            snd9sec = sndNP
        #Check left and rigth side of max index
        # max is at least 8000 from start and 8000 from end
        elif sndNP.argmax() > 8000 and (sndNP.size - sndNP.argmax()) >= 8000:
            #print "Case1"
            snd9sec = snd[sndNP.argmax() - 8000:sndNP.argmax()+8000]
        # max is within 8000 from start
        elif sndNP.argmax() < 8000 and sndNP.size - sndNP.argmax() > 8000:
            #print "Case2"
            snd9sec = sndNP[0:16000]
        # max is at least 8000 from start but within 8000 from end
        elif sndNP.argmax() > 8000 and sndNP.size - sndNP.argmax() <= 8000:
            #print "Case3"
            snd9sec = sndNP[sndNP.size - 16000:sndNP.size]
    
        snd9SecList.append(snd9sec)
    return snd9SecList

In [13]:
def average(snd, offset):
    aveList = []
    for x in range(0,len(snd), offset):
        aveList.append(np.mean(snd[x:x+offset]))
    return aveList

In [14]:
def fftProcess4(sndList, cutOffidx, offset):
    sndFFTList = []
    fList = []
    NFFT = 16384
    Fs = 2000
    for idx, snd in enumerate(sndList):
        L = len(snd)
        Ypre = fft(snd, NFFT)/L
        f = Fs/2*np.linspace(0.0, 1, NFFT/2+1)
        fList.append(f)
        Y = 2*np.abs(Ypre[0:NFFT/2+1])
        #sndFFTList.append(Y[0:maxOffset][0::4])  #Get the average of every 4th data point
        sndFFTList.append(average(Y[0:cutOffidx], 4))
        #sndFFTList.append(maverage(Y, 4))
    return sndFFTList, fList

### Training D

In [15]:
sndDMList, wavDMDF = loadData('training-d_cleaned')
sndDM9SecList = shortenTo9Sec(sndDMList)

fftDMList, fDMList = fftProcess4(sndDM9SecList, 2458, 4) #set at 300Hz?

In [16]:
modelDM = RandomForestClassifier(n_estimators = 20)
scoresDM = cross_val_score(modelDM, fftDMList, wavDMDF.outcome, scoring='roc_auc', cv=5)
print 'CV AUC {}, Average AUC {}'.format(scoresDM, scoresDM.mean())

CV AUC [ 1.          0.84722222  0.7         0.66        0.9       ], Average AUC 0.821444444444


### Training A

In [17]:
sndAMList, wavAMDF = loadData('training-a_cleaned')
sndAM9SecList = shortenTo9Sec(sndAMList)

fftAMList, fAMList = fftProcess4(sndAM9SecList, 2458, 4) #set at 300Hz?

In [18]:
modelAM = RandomForestClassifier(n_estimators = 20)
scoresAM = cross_val_score(modelAM, fftAMList, wavAMDF.outcome, scoring='roc_auc', cv=5)
print 'CV AUC {}, Average AUC {}'.format(scoresAM, scoresAM.mean())

CV AUC [ 0.57168079  0.6299435   0.57046477  0.59407796  0.52098951], Average AUC 0.577431305534


In [19]:
# Fit on Trinaing A and Score on Training E
modelAM.fit(fftAMList, wavAMDF.outcome)
print modelAM.score(fftDMList, wavDMDF.outcome)

0.509090909091


### Training B

In [20]:
sndBMList, wavBMDF = loadData('training-b_cleaned')
sndBM9SecList = shortenTo9Sec(sndBMList)

fftBMList, fBMList = fftProcess4(sndBM9SecList, 2458, 4) #set at 300Hz?

In [21]:
modelBM = RandomForestClassifier(n_estimators = 20)
scoresBM = cross_val_score(modelBM, fftBMList, wavBMDF.outcome, scoring='roc_auc', cv=5)
print 'CV AUC {}, Average AUC {}'.format(scoresBM, scoresBM.mean())

CV AUC [ 0.63339438  0.63234385  0.50401979  0.49134199  0.56461039], Average AUC 0.565142080142


In [22]:
# Fit on Trinaing B and Score on Training D
modelBM.fit(fftBMList, wavBMDF.outcome)
modelBM.score(fftDMList, wavDMDF.outcome)

0.43636363636363634

### Training C

In [23]:
sndCMList, wavCMDF = loadData('training-c_cleaned')
sndCM9SecList = shortenTo9Sec(sndCMList)

fftCMList, fCMList = fftProcess4(sndCM9SecList, 2458, 4) #set at 300Hz?

In [24]:
modelCM = RandomForestClassifier(n_estimators = 20)
scoresCM = cross_val_score(modelCM, fftCMList, wavCMDF.outcome, scoring='roc_auc', cv=5)
print 'CV AUC {}, Average AUC {}'.format(scoresCM, scoresCM.mean())

CV AUC [ 1.    0.65  0.6   0.5   0.75], Average AUC 0.7


In [25]:
modelCM.fit(fftCMList, wavCMDF.outcome)
print modelCM.score(fftDMList, wavDMDF.outcome)

0.381818181818


### Training E

In [26]:
sndEMList, wavEMDF = loadData('training-e_cleaned')
sndEM9SecList = shortenTo9Sec(sndEMList)

fftEMList, fEMList = fftProcess4(sndEM9SecList, 2458, 4) #set at 300Hz?

In [27]:
modelEM = RandomForestClassifier(n_estimators = 20)
scoresEM = cross_val_score(modelEM, fftEMList, wavEMDF.outcome, scoring='roc_auc', cv=5)
print 'CV AUC {}, Average AUC {}'.format(scoresEM, scoresEM.mean())

CV AUC [ 0.97876448  0.9767995   0.99327772  0.9741759   0.98362461], Average AUC 0.98132844207


In [28]:
# Fit on Trinaing E and Score on Training D
modelEM.fit(fftEMList, wavEMDF.outcome)
modelEM.score(fftDMList, wavDMDF.outcome)

0.49090909090909091