## Final Project

In [23]:
# Import libraries
import sys
import numpy as np
import pandas as pd
import sklearn as sk
from os import listdir
from os.path import isfile, join
from timeit import default_timer as timer

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from six.moves import cPickle as pickle
from six.moves import range

import librosa
import soundfile as sf
from python_speech_features import mfcc
from python_speech_features import logfbank

In [3]:
# # if running file for the first time.
# import tarfile
# t = tarfile.open('UrbanSound8K.tar.gz', mode="r:gz")
# t.extractall()

In [4]:
# see see dataset orginization
raw_sound = pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')
raw_sound.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [7]:
# get sense of data
data, samplerate = sf.read('UrbanSound8K/audio/fold1/102106-3-0-0.wav')
print(data)
print(samplerate)

[[-0.01174927  0.03039551]
 [-0.01153564  0.02471924]
 [-0.01644897  0.01794434]
 ..., 
 [-0.00588989  0.00012207]
 [ 0.00314331  0.00585938]
 [ 0.00540161  0.00689697]]
44100


In [12]:
fold_list = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5', 'fold6', 'fold7', 'fold8', 'fold9', 'fold10']

'3.5.2 |Anaconda 4.1.1 (64-bit)| (default, Jul  5 2016, 11:41:13) [MSC v.1900 64 bit (AMD64)]'

In [13]:
def extract_feature(file_name: str) -> tuple:
    """
    Extracts 193 chromatographic features from sound file. 
    including: MFCC's, Chroma_StFt, Melspectrogram, Spectral Contrast, and Tonnetz
    NOTE: this extraction technique changes the time series nature of the data
    """
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
    return mfccs,chroma,mel,contrast,tonnetz

In [20]:
start_time = timer()
a,b,c,d,e = extract_feature('UrbanSound8K/audio/fold1/102106-3-0-0.wav')
end_time = timer()
print('time to extract features from one file: {:.3f}sec'.format((end_time-start_time)/60))

time to extract features from one file: 0.034sec


  return array(a, dtype, copy=False, order=order)


In [21]:
print(a.shape,b.shape,c.shape,d.shape,e.shape)
print(a[0])
print(40+12+128+13)

(40,) (12,) (128,) (7,) (6,)
-253.201139054
193


In [27]:
mfcc_data = []
exception_count = 0

start_time = timer()
for i in range(10):
    # get file names
    mypath = 'UrbanSound8K/audio/'+ fold_list[i] + '/'
    files = [mypath + f for f in listdir(mypath) if isfile(join(mypath, f))]
    
    for fn in files:
        try: # extract features
            mfccs,chroma,mel,contrast,tonnetz = extract_feature(fn)
            features = np.empty((0,193))
            ext_features = np.hstack([mfccs,chroma,mel,contrast,tonnetz])
            features = np.vstack([features,ext_features])
            
        except: # else exception (.ds_store files are part of mac file systems)
            print(fn)
            exception_count += 1
            continue
            
        l_row = raw_sound.loc[raw_sound['slice_file_name']==fn.split('/')[-1]].values.tolist()
        label = l_row[0][-1]
        fold = i+1
    
        mfcc_data.append([features, features.shape, label, fold])
        
            #print(f,old_samplerate,ss)
        
print("Exceptions: ", exception_count)
end_time = timer()
print(print("time taken: {0} minutes {1:.1f} seconds".format((end_time - start_time)//60, (end_time - start_time)%60)))

UrbanSound8K/audio/fold1/.DS_Store


  return array(a, dtype, copy=False, order=order)


UrbanSound8K/audio/fold2/.DS_Store
UrbanSound8K/audio/fold3/.DS_Store
UrbanSound8K/audio/fold4/.DS_Store
UrbanSound8K/audio/fold5/.DS_Store
UrbanSound8K/audio/fold6/.DS_Store
UrbanSound8K/audio/fold7/.DS_Store
UrbanSound8K/audio/fold8/.DS_Store
UrbanSound8K/audio/fold9/.DS_Store
UrbanSound8K/audio/fold10/.DS_Store
Exceptions:  10
time taken: 79.0 minutes 43.5 seconds
None


In [50]:
cols=["features", "shape","label", "fold"]
mfcc_pd = pd.DataFrame(data = mfcc_data, columns=cols)
mfcc_pd.head(1)

Unnamed: 0,features,shape,label,fold
0,"[[-402.458131222, 92.2208198393, 19.3559458953...","(1, 193)",dog_bark,1


In [38]:
# Convert label to class number
le = LabelEncoder()
label_num = le.fit_transform(mfcc_pd["label"])

# one hot encode
ohe = OneHotEncoder()
onehot = ohe.fit_transform(label_num.reshape(-1, 1))

(8732,)

In [55]:
for i in range(10):
    mfcc_pd[le.classes_[i]] = onehot[:,i].toarray()

In [56]:
mfcc_pd.head(1)

Unnamed: 0,features,shape,label,fold,air_conditioner,car_horn,children_playing,dog_bark,drilling,engine_idling,gun_shot,jackhammer,siren,street_music
0,"[[-402.458131222, 92.2208198393, 19.3559458953...","(1, 193)",dog_bark,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [57]:
ll = [mfcc_pd['features'][i].ravel() for i in range(mfcc_pd.shape[0])]
mfcc_pd['sample'] = pd.Series(ll, index=mfcc_pd.index)
del mfcc_pd['features']

In [58]:
mfcc_pd.head(1)

Unnamed: 0,shape,label,fold,air_conditioner,car_horn,children_playing,dog_bark,drilling,engine_idling,gun_shot,jackhammer,siren,street_music,sample
0,"(1, 193)",dog_bark,1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,"[-402.458131222, 92.2208198393, 19.3559458953,..."


In [60]:
# for use in Networks with 193 features.ipynb
pickle.dump(mfcc_pd, open('193_features.p','wb'))

In [None]:
# for use in SVM_RF_NB_final.ipynb
mfcc_data["label_id"] = label_num
pickle.dump(mfcc_data, open('feature_data1.p','wb'))