<a href="https://colab.research.google.com/github/AyishaR/Spokendigit/blob/master/Spokendigit_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import os
import tarfile

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torchvision.transforms as transforms
from PIL import Image
from torch.utils.data import Dataset
from torchvision.datasets.utils import download_url

# Data

In [None]:
# Download the Speech Commands v0.01 archive into /content/.
# The call's result is not bound to a name: `data` is assigned the loaded
# waveform a few cells below, so binding it here was only confusing.
download_url("http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz", "/content/")

# Extract to the absolute folder that every later cell reads from
# ('/content/data'), rather than a './data' path that only matches when the
# working directory happens to be /content.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# the archive is fetched over plain http, so only run this on a throwaway VM.
with tarfile.open('/content/speech_commands_v0.01.tar.gz', 'r:gz') as tar:
    tar.extractall(path='/content/data')

Downloading http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz to /content/speech_commands_v0.01.tar.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

In [None]:
# Load one sample recording and display the waveform array, its shape and the
# sampling rate returned by librosa.
data, sampling_rate = librosa.load('/content/data/one/00176480_nohash_0.wav')
data, data.shape, sampling_rate

(array([-2.9995823e-05, -1.6769719e-04, -3.9024639e-04, ...,
        -6.1316596e-04, -3.3654648e-04, -1.7320579e-04], dtype=float32),
 (22050,),
 22050)

In [None]:
# Class labels; a label's position in this list is the integer class index
# used wherever `digit.index(label)` is called.
digit = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']

# Report how many entries each digit folder contains.
for x in digit:
    n_entries = len(os.listdir('/content/data/' + x))
    print(x, ": ", n_entries)

# The per-digit counts are close to each other, i.e. the classes are balanced.

zero :  2376
one :  2370
two :  2373
three :  2356
four :  2372
five :  2357
six :  2369
seven :  2377
eight :  2352
nine :  2364


In [None]:
# Entry count for every class folder in the dataset, not only the digits.
for x in os.listdir('/content/data'):
    folder = '/content/data/' + x
    if not os.path.isdir(folder):
        continue  # skip plain files that sit next to the class folders
    print(x, ": ", len(os.listdir(folder)))

four :  2372
go :  2372
off :  2357
bird :  1731
right :  2367
left :  2353
yes :  2377
happy :  1742
eight :  2352
dog :  1746
wow :  1745
stop :  2380
six :  2369
cat :  1733
_background_noise_ :  7
one :  2370
up :  2375
zero :  2376
two :  2373
house :  1750
sheila :  1734
on :  2367
bed :  1713
down :  2359
tree :  1733
three :  2356
seven :  2377
no :  2375
five :  2357
marvin :  1746
nine :  2364


# Reference csv files

Two csv files are created.

* A csv file with the path of all the recordings and the label (Spoken_digit.csv).
* A csv file that contains the file names of the recordings, grouped by label into one column per digit (Spoken_digit_X.csv).

In [None]:
# Build the two reference CSV files:
#   Spoken_digit.csv   - one row per recording: relative path and label
#   Spoken_digit_X.csv - one column per label holding that label's file names
d = {}  # label -> listing of that label's folder

# newline='' is what the csv module asks for on files handed to csv.writer.
with open("Spoken_digit.csv", 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerow(["File", "Label"])
    for x in digit:
        folder = '/content/data/' + x
        if not os.path.isdir(folder):
            continue
        # List the folder once and reuse the listing for both CSV files.
        d[x] = os.listdir(folder)
        for name in d[x]:
            if os.path.isfile(folder + "/" + name):
                csvwriter.writerow([x + '/' + name, x])

# Shuffle the rows so the labels are not grouped together. The seed is fixed
# so that re-running the notebook reproduces the same order.
df = pd.read_csv('Spoken_digit.csv')
df = df.sample(frac=1, random_state=42)
df.to_csv('Spoken_digit.csv', index = False)

# One row per label, transposed into one column per label. The digits have
# different numbers of recordings, so shorter columns end up with empty cells.
# No dtype is forced: the cells are file-name strings, which cannot be cast
# to float32.
dfx = pd.DataFrame.from_dict(d, orient='index').transpose()
dfx.to_csv('Spoken_digit_X.csv', index = False)

# Feature extraction from dataset

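One option is to extract the features every time a datapoint is requested, i.e., to call extract_features() from the dataset class, as below.

This method is time consuming, because the features are recomputed on every access.

So, instead, we extract the features once and store them beforehand (code in a later section).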

In [None]:
def extract_features(path):
    """Compute a fixed-length feature vector for one recording.

    NOTE: a later cell of this notebook defines another function with the
    same name (taking a DataFrame row); once that cell has run, it replaces
    this definition in the kernel.

    Parameters
    ----------
    path : str
        Path of the recording relative to '/content/data/'.

    Returns
    -------
    numpy.ndarray
        float32 vector made of the time-averaged MFCC, chroma,
        mel-spectrogram, spectral-contrast and tonnetz features, concatenated
        in that order.
    """
    data, sr = librosa.load('/content/data/'+path)
    # Magnitude spectrogram shared by the chroma and contrast features.
    stft = np.abs(librosa.stft(data))

    # Audio and sampling rate are passed by keyword: recent librosa releases
    # reject them as positional arguments to the feature functions.
    mfccs = np.mean(librosa.feature.mfcc(y=data, sr=sr).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sr).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(data), sr=sr).T, axis=0)

    # Spectral centroids have a different length for each recording and the
    # STFT is a 2-D array, so neither is part of the final feature vector
    # (the unused spectral-centroid computation has been dropped).
    return np.concatenate((mfccs, chroma, mel, contrast, tonnetz), axis=0).astype('float32')

In [None]:
class SpokenDigit(Dataset):
    """Dataset of the spoken-digit recordings listed in a reference CSV file.

    Indexing returns a ``(features, label_index)`` pair of tensors, where the
    features come from ``extract_features`` and the label index is the
    position of the label in the module-level ``digit`` list.

    Parameters
    ----------
    file : str
        CSV file with a ``File`` column (recording path) and a ``Label``
        column (digit name).
    rootdir : str
        Folder the ``File`` paths are relative to. Only ``getsr`` uses it;
        ``__getitem__`` hands the relative path to ``extract_features``,
        which resolves it on its own.
    """

    def __init__(self, file = None, rootdir = None):
        self.df = pd.read_csv(file)
        self.rootdir = rootdir

    def __len__(self):
        """Number of recordings listed in the CSV file."""
        return len(self.df)

    def __getitem__(self, i):
        """Return ``(feature tensor, label-index tensor)`` for recording ``i``."""
        # Positional lookup: a Dataset is indexed by position 0..len-1.
        row = self.df.iloc[i]
        fts = extract_features(row['File'])
        return torch.tensor(fts), torch.tensor(digit.index(row['Label']))

    def getsr(self, i):
        """Return the sampling rate librosa reports when loading recording ``i``.

        NOTE(review): librosa.load is called without ``sr=None`` here, so this
        is presumably librosa's resampling rate rather than the file's native
        rate - confirm which one is wanted.
        """
        # The row has to be looked up here; the method previously read an
        # undefined name `row` and raised NameError.
        row = self.df.iloc[i]
        _, sr = librosa.load(os.path.join(self.rootdir, row['File']))
        return sr


# The class was originally declared under this misspelled name while the rest
# of the notebook instantiates `SpokenDigit`; keep the old name as an alias.
SpokenDigist = SpokenDigit

In [None]:
# Dataset over every recording listed in Spoken_digit.csv.
spoken_dset = SpokenDigit(file = "Spoken_digit.csv", rootdir = "/content/data/")

# Mel CNN

Extracting a mel-spectrogram image for each recording. The images are stored in Google Drive. Takes about 4.5+ hours.

This was run separately for each digit using the Spoken_digit_X.csv file generated above.

In [None]:
from google.colab import files
from google.colab import drive
# Mount Google Drive at /content/drive; the spectrogram images generated below
# are saved underneath this mount point.
drive.mount('/content/drive')

In [None]:
# Per-label table of recording file names: one column per digit.
sp = pd.read_csv('Spoken_digit_X.csv')

In [None]:
def extract_mel(f, label):
    """Render the mel spectrogram of one recording and save it as a JPEG.

    The image is written to
    '/content/drive/My Drive/Dataset/spokendigit/<label>/<name>.jpg', where
    <name> is ``f`` without its extension.

    Parameters
    ----------
    f : str
        File name of the recording inside its label folder.
    label : str
        Name of the label folder under '/content/data/'; also the name of the
        output sub-folder.
    """
    data, sr = librosa.load('/content/data/'+label+'/'+f)

    out_dir = '/content/drive/My Drive/Dataset/spokendigit/'+label
    # savefig does not create missing folders, so make sure the target exists.
    os.makedirs(out_dir, exist_ok=True)
    # splitext drops the extension whatever its length (the old f[:-4] only
    # worked for three-letter extensions such as '.wav').
    file = out_dir + '/' + os.path.splitext(f)[0] + '.jpg'

    fig = plt.figure(figsize=[1,1])
    try:
        # Bare image: no axes, ticks or frame around the spectrogram.
        ax = fig.add_subplot(111)
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
        ax.set_frame_on(False)

        S = librosa.feature.melspectrogram(y=data, sr=sr)
        # Draw on the axes created above instead of relying on the implicit
        # "current axes".
        librosa.display.specshow(librosa.power_to_db(S, ref=np.max), x_axis='time', y_axis='mel',
                                 fmin=50, fmax=280, ax=ax)
        fig.savefig(file, dpi=500, bbox_inches='tight', pad_inches=0)
    finally:
        # This runs for thousands of recordings; always release the figure,
        # even when loading or drawing fails, so figures do not pile up.
        plt.close(fig)

In [None]:
# Render one spectrogram image per recording, one digit at a time.
for lbl in digit:
    print(lbl)
    # Keep only real file names; cells that are not strings (columns are of
    # unequal length in Spoken_digit_X.csv) are skipped.
    recordings = (t for t in sp[lbl] if type(t) is str)
    for j, t in enumerate(recordings):
        print(j)
        extract_mel(t, lbl)

# Individual features

Each feature vector is extracted and immediately written to a csv file, with every element of every feature in its own column. Takes about 4.5+ hours.

### CSV details - 

label, mfccs, chroma, mel, contrast, tonnetz

(1,) (20,) (12,) (128,) (7,) (6,)

The STFT is a 2D array, so it's not included.


In [None]:
# Reload the shuffled (File, Label) table. Note that this rebinds `sp`, which
# held the per-label Spoken_digit_X.csv table in the previous section.
sp = pd.read_csv("Spoken_digit.csv")

In [None]:
# Output file for the per-recording feature rows. It is deliberately left open:
# extract_features() writes through the module-level `csvwriter`. Close
# `csvfile` once extraction has finished so that buffered rows reach the disk.
# newline='' is what the csv module asks for on files handed to csv.writer.
csvfile = open("Spokendigit_final_fts.csv", "w", newline="")
csvwriter = csv.writer(csvfile)
# Header: the label column followed by feature columns numbered 1..173
# (20 mfcc + 12 chroma + 128 mel + 7 contrast + 6 tonnetz).
csvwriter.writerow(['Label'] + list(range(1, 174)))

In [None]:
def extract_features(files):
    """Extract the features of one recording and append them to the output CSV.

    NOTE: this redefines the ``extract_features(path)`` function from the
    "Feature extraction from dataset" section; after this cell runs, the
    earlier single-path version is no longer reachable under this name.

    Parameters
    ----------
    files : row of the Spoken_digit.csv table
        Must expose ``File`` (recording path relative to '/content/data/')
        and ``Label`` (digit name present in the module-level ``digit`` list).

    Returns
    -------
    None
        The row is written through the module-level ``csvwriter``: the label
        index first, then the time-averaged MFCC, chroma, mel-spectrogram,
        spectral-contrast and tonnetz features in that order.
    """
    data, sr = librosa.load('/content/data/'+files.File)
    # Magnitude spectrogram shared by the chroma and contrast features; the
    # STFT itself is 2-D and is not written out.
    stft = np.abs(librosa.stft(data))

    # Audio and sampling rate are passed by keyword: recent librosa releases
    # reject them as positional arguments to the feature functions.
    mfccs = np.mean(librosa.feature.mfcc(y=data, sr=sr).T, axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sr).T, axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sr).T, axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(data), sr=sr).T, axis=0)

    row = np.concatenate((mfccs, chroma, mel, contrast, tonnetz), axis=0).astype('float32')
    # The label index is concatenated with the float features, so it shares
    # their array dtype when written (kept as-is so the CSV layout is unchanged).
    csvwriter.writerow(np.concatenate(([digit.index(files.Label)], row)))

In [None]:
# Extract and write the features of every recording, one table row at a time.
try:
    sp.apply(extract_features, axis = 1)
finally:
    # Close the output file even if extraction fails part-way; otherwise rows
    # still sitting in the write buffer may never reach
    # Spokendigit_final_fts.csv. Re-run the cell that opens the file before
    # running this cell again.
    csvfile.close()