In [1]:
import numpy as np
import pandas as pd
import librosa
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from utils import audioFilesWordsIds, loadAudioFile

## Load Audio files and store amplitudes

In [3]:
dataFolder = 'dataSmol'
subFolderExceptions = np.array(['_background_noise_'])

# Sample rate requested from the loader. Later cells (framing, Gabor, MFCC)
# read this name, so the loop below must not rebind it.
sampleRate = 16000

audioFiles, words, wordIds = audioFilesWordsIds(dataFolder, subFolderExceptions)

allAmplitudes = []
allSampleRates = []
allDurations = []
allWords = []
allWordIds = []

for file, word, wordId in zip(audioFiles, words, wordIds):
    # Keep the rate reported for this file in its own variable so the requested
    # `sampleRate` is the same for every file and stays intact after the loop.
    amplitudes, fileSampleRate = loadAudioFile(file, sampleRate)

    # Keep only clips that are exactly one second long (sample count == rate).
    if len(amplitudes) != fileSampleRate:
        continue

    # Store a plain numeric array rather than dtype=object: every retained clip
    # has the same length, and object arrays box each sample as a Python float.
    allAmplitudes.append(np.asarray(amplitudes, dtype=np.float64))
    allSampleRates.append(fileSampleRate)
    allDurations.append(len(amplitudes))  # duration expressed in samples, not seconds
    allWords.append(word)
    allWordIds.append(wordId)

allSampleRates = np.array(allSampleRates)
allDurations = np.array(allDurations)
allWords = np.array(allWords)
allWordIds = np.array(allWordIds)

# One row per clip; 'amplitudesOfAudios' holds one 1-D array per cell.
df = pd.DataFrame({
    'amplitudesOfAudios': allAmplitudes,
    'sampleRates' : allSampleRates,
    'durations' : allDurations,
    'words': allWords,
    'wordIds': allWordIds
})

In [4]:
# Disable column wrapping so wide frames print on a single row block.
pd.set_option('display.expand_frame_repr', False)
print(df)


                                     amplitudesOfAudios  sampleRates  durations     words  wordIds
0     [-0.065765380859375, -0.0709228515625, -0.0753...        16000      16000  backward        1
1     [-0.000274658203125, -0.000213623046875, -0.00...        16000      16000  backward        1
2     [-6.103515625e-05, -0.00018310546875, -0.00024...        16000      16000  backward        1
3     [9.1552734375e-05, 0.000244140625, 0.000244140...        16000      16000  backward        1
4     [0.0, -9.1552734375e-05, -0.000244140625, -0.0...        16000      16000  backward        1
...                                                 ...          ...        ...       ...      ...
3085  [-3.0517578125e-05, -3.0517578125e-05, -3.0517...        16000      16000      zero       35
3086  [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...        16000      16000      zero       35
3087  [0.0, 0.0, 3.0517578125e-05, 0.0, 3.0517578125...        16000      16000      zero       35
3088  [-0.

## Storing label encoding of words in a map

In [5]:
# Each unique (wordId, word) pair becomes one entry, keyed by the numeric id.
idToWordMap = dict(df[['wordIds', 'words']].drop_duplicates().values)

# The historical name suggests word -> id, but the mapping is id -> word.
# Kept as an alias so any code that already reads `wordToIdMap` still works.
wordToIdMap = idToWordMap

print(wordToIdMap)

{1: 'backward', 2: 'bed', 3: 'bird', 4: 'cat', 5: 'dog', 6: 'down', 7: 'eight', 8: 'five', 9: 'follow', 10: 'forward', 11: 'four', 12: 'go', 13: 'happy', 14: 'house', 15: 'learn', 16: 'left', 17: 'marvin', 18: 'nine', 19: 'no', 20: 'off', 21: 'on', 22: 'one', 23: 'right', 24: 'seven', 25: 'sheila', 26: 'six', 27: 'stop', 28: 'three', 29: 'tree', 30: 'two', 31: 'up', 32: 'visual', 33: 'wow', 34: 'yes', 35: 'zero'}


## Splitting the dataset into Train:Test = 80:20

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Fixed seed so the shuffle, and every result derived from the split, is
# reproducible after Restart Kernel -> Run All.
# NOTE(review): consider also passing stratify=df['wordIds'] so each word keeps
# the same share in both splits; it requires at least 2 clips per word.
trainingDf, testingDf = train_test_split(
    df, test_size=0.2, shuffle=True, random_state=42
)

# Renumber rows 0..n-1 in each split; the shuffled original index is discarded.
trainingDf = trainingDf.reset_index(drop=True)
testingDf = testingDf.reset_index(drop=True)

print(trainingDf)
print(testingDf)

                                     amplitudesOfAudios  sampleRates  durations   words  wordIds
0     [0.0, 9.1552734375e-05, 9.1552734375e-05, 9.15...        16000      16000   seven       24
1     [0.0, -3.0517578125e-05, -3.0517578125e-05, -6...        16000      16000    zero       35
2     [-3.0517578125e-05, -0.000213623046875, -0.000...        16000      16000   three       28
3     [0.0, 0.0, 3.0517578125e-05, 3.0517578125e-05,...        16000      16000   right       23
4     [-3.0517578125e-05, -6.103515625e-05, -3.05175...        16000      16000   eight        7
...                                                 ...          ...        ...     ...      ...
2467  [0.0, 3.0517578125e-05, -3.0517578125e-05, -6....        16000      16000  visual       32
2468  [0.0, 3.0517578125e-05, 3.0517578125e-05, 3.05...        16000      16000    four       11
2469  [-0.000152587890625, -0.00018310546875, -0.000...        16000      16000     two       30
2470  [-0.00531005859375, 0.01

## Adjusting for class imbalance

### Class imbalance by samples for each class

In [8]:
# Number of training clips per word, most frequent first.
countSamples = (
    trainingDf['words']
    .value_counts()
    .rename_axis('words')
    .reset_index(name='counts')
)

print(countSamples)

       words  counts
0         on      78
1        one      76
2      eight      76
3   backward      76
4     marvin      75
5       four      75
6      learn      75
7       down      75
8       left      74
9        dog      73
10      bird      73
11      five      73
12    follow      73
13        go      73
14     house      72
15       six      72
16    visual      72
17      stop      72
18       two      71
19      zero      70
20     three      70
21     right      69
22     seven      69
23     happy      68
24       off      68
25      nine      67
26       yes      67
27        no      67
28       bed      66
29       wow      66
30        up      66
31    sheila      65
32   forward      64
33       cat      63
34      tree      63


#### Treatment: creating bootstrap samples

In [9]:
from utils import balanceClassSamples

In [10]:
# Average class size before balancing, truncated to an integer.
perClassCounts = countSamples['counts']
meanCount = int(perClassCounts.mean())
print(f"\nMean of counts column: {meanCount}\n\n")

# NOTE(review): `trainingDf` is rebound here, so re-running this cell feeds the
# already balanced frame back in. The recorded output shows every word at the
# mean count afterwards; the exact resampling rule lives in utils - confirm there.
trainingDf = balanceClassSamples(trainingDf)

# Recount per-word samples on the balanced frame.
countSamples = (
    trainingDf['words']
    .value_counts()
    .rename_axis('words')
    .reset_index(name='counts')
)
print(countSamples)
total = countSamples['counts'].sum()
print(f'Total : {total}')


Mean of counts column: 70


       words  counts
0       five      70
1     follow      70
2   backward      70
3       tree      70
4      house      70
5      three      70
6     visual      70
7        off      70
8      learn      70
9        bed      70
10        on      70
11   forward      70
12       wow      70
13      left      70
14        no      70
15      bird      70
16       one      70
17        go      70
18       two      70
19       cat      70
20     happy      70
21        up      70
22       yes      70
23      zero      70
24    sheila      70
25    marvin      70
26      nine      70
27     right      70
28      down      70
29     seven      70
30       dog      70
31     eight      70
32       six      70
33      four      70
34      stop      70
Total : 2450


### Class imbalance by durations among all classes

In [11]:
# How many training clips share each duration (in samples).
countDurations = (
    trainingDf['durations']
    .value_counts()
    .rename_axis('durations')
    .reset_index(name='counts')
)

print(countDurations)

   durations  counts
0      16000    2450


Every clip has the same duration (16000 samples, i.e. 1 second), so no treatment is needed.

## Splitting into Inputs and Outputs

In [12]:
# Inputs are the raw amplitude arrays; targets are the integer word ids.
featureColumn, labelColumn = 'amplitudesOfAudios', 'wordIds'

xTrainingDf, yTrainingDf = trainingDf[featureColumn], trainingDf[labelColumn]
xTestingDf, yTestingDf = testingDf[featureColumn], testingDf[labelColumn]

print(xTrainingDf)
print(yTrainingDf)

0       [0.00054931640625, 0.0008544921875, 0.00045776...
1       [0.001434326171875, -0.001861572265625, -0.003...
2       [-9.1552734375e-05, -0.00018310546875, -0.0002...
3       [3.0517578125e-05, -6.103515625e-05, -9.155273...
4       [3.0517578125e-05, -0.000213623046875, -0.0002...
                              ...                        
2445    [0.000946044921875, 0.000946044921875, 0.00100...
2446    [0.0008544921875, 0.00103759765625, 0.00103759...
2447    [-9.1552734375e-05, -0.00018310546875, -0.0002...
2448    [-0.00543212890625, -0.005401611328125, -0.005...
2449    [0.000518798828125, 0.0006103515625, 0.0004577...
Name: amplitudesOfAudios, Length: 2450, dtype: object
0        8
1       30
2       13
3       31
4       34
        ..
2445    24
2446     1
2447    34
2448    35
2449    10
Name: wordIds, Length: 2450, dtype: int32


## Framing : breaking array of amplitudes into smaller chunks 

Each audio clip is 1 second long. Splitting it into overlapping chunks (frames) of, say, 25 milliseconds lets us analyse the signal locally.

In [13]:
from utils import frameAudio

In [14]:
# 25 ms analysis window expressed in samples (400 samples at 16 kHz).
# Using `sampleRate` itself would make every frame a full second long.
frameDurationSeconds = 0.025
frameLength = int(frameDurationSeconds * sampleRate)
overlap = 0.5  # fraction of each frame shared with the next one
hopLength = int(frameLength * (1 - overlap))

# Frame every clip, then stack the per-clip results into a single array.
# With 1-second clips this is expected to be (clips, frames, frameLength).
framedAudio = xTrainingDf.apply(frameAudio, args=(frameLength, hopLength))
framedAudioArray = np.array([np.array(frame) for frame in framedAudio])

In [15]:
# Inspect the framed audio and its dimensions (NumPy abbreviates large arrays).
print(framedAudioArray)
print(framedAudioArray.shape)

[[[0.00054931640625 0.0008544921875 0.000457763671875 ... 0.001953125
   0.001373291015625 0.0020751953125]
  [0.001129150390625 0.001220703125 0.000640869140625 ...
   0.000762939453125 0.00054931640625 0.002716064453125]
  [0.001312255859375 0.0015869140625 0.00140380859375 ...
   0.0003662109375 0.00128173828125 0.001861572265625]
  ...
  [-0.006866455078125 -0.006561279296875 -0.0078125 ...
   -0.00531005859375 -0.005401611328125 -0.005401611328125]
  [-0.007781982421875 -0.007720947265625 -0.00701904296875 ...
   -0.003143310546875 -0.002410888671875 -0.00341796875]
  [-0.0047607421875 -0.00555419921875 -0.005859375 ... 0.0023193359375
   -0.002166748046875 -0.006683349609375]]

 [[0.001434326171875 -0.001861572265625 -0.003662109375 ...
   0.00213623046875 0.001800537109375 -0.000213623046875]
  [0.000244140625 -0.000579833984375 0.00103759765625 ...
   -0.000396728515625 -0.0010986328125 -0.001220703125]
  [0.0009765625 0.000823974609375 0.00018310546875 ... -0.00054931640625
  

## Feature Extraction

### Gabor Transform

In [16]:
from utils import optimizedGaborTransform

In [17]:
# Parameters handed to the Gabor helper. Their units are not visible in this
# notebook - presumably sigma is a window width and frequency is in Hz.
# TODO confirm against utils.optimizedGaborTransform.
sigma = 0.05
frequency = 2000
# Applied to the framed training audio; the result is printed below.
gaborResults = optimizedGaborTransform(framedAudioArray, sampleRate, sigma, frequency)

print(gaborResults)

[[[0.038384829817341086 0.05724703846257051 -0.03695540027207514 ...
   -0.11012436025004135 -0.21504995141188937 -0.1705799565657309]
  [0.08717332333583484 0.23023058947408748 0.11696364866419878 ...
   -0.18838411963107954 -0.22259015685750874 -0.11420777465324042]
  [0.1406045318553923 0.3111989285982031 0.2312718080549998 ...
   -0.16515473779332884 -0.09797345922180258 0.006732952631302367]
  ...
  [-0.03420090303036789 0.00032814851954048735 0.02693660793226258 ...
   -0.12460433670747277 -0.09891384788882829 -0.06487072397343957]
  [-0.09869411240854105 -0.0930179800625254 -0.05190586140765356 ...
   -0.05388610015493138 -0.050779284434662154 -0.03337502858114775]
  [-0.09449809131586827 -0.10321231310189277 -0.0613518330459616 ...
   0.06461557905140526 0.0415413792223792 0.03252391913332285]]

 [[0.05203565288059776 -0.057621432685510746 0.022218759411817265 ...
   0.00627939574045841 -0.10069909247594556 -0.11953258797082353]
  [0.025676174645317344 -0.06808881545318847 -0.0

In [18]:
# Dimensions of the Gabor feature array.
print(gaborResults.shape)

(2450, 79, 400)


In [19]:
import numpy as np
import pandas as pd
import librosa

def extractMFCCs(frames: np.ndarray, sampleRate: int, nMFCC: int = 13, **mfccKwargs) -> np.ndarray:
    """Compute one time-averaged MFCC vector per frame.

    Args:
        frames: Nested sequence or array indexed as [clip][frame][sample].
            Each frame is converted to a 1-D float32 signal before analysis,
            so object-dtype arrays and plain lists are accepted.
        sampleRate: Sampling rate forwarded to librosa as ``sr``.
        nMFCC: Number of coefficients requested from librosa (``n_mfcc``).
        **mfccKwargs: Extra keyword arguments forwarded unchanged to
            ``librosa.feature.mfcc`` (for example ``n_fft`` / ``hop_length``).
            With none given, librosa's own defaults apply.
            NOTE(review): librosa's default ``n_fft`` is 2048; frames shorter
            than that are padded and librosa warns - consider passing an
            ``n_fft`` no larger than the frame length.

    Returns:
        Array built from the nested per-clip / per-frame results. For a regular
        (clips, framesPerClip, samples) input the shape is
        (clips, framesPerClip, nMFCC). An empty input yields an empty array.
    """
    mfccs = []
    for frameSet in frames:
        frameMfccs = []
        for frame in frameSet:
            # np.asarray handles lists and object arrays alike; ravel gives the
            # 1-D layout librosa expects.
            signal = np.asarray(frame, dtype=np.float32).ravel()
            mfcc = librosa.feature.mfcc(y=signal, sr=sampleRate, n_mfcc=nMFCC, **mfccKwargs)
            # librosa returns (nMFCC, timeSteps); average over the time axis.
            frameMfccs.append(np.mean(mfcc, axis=1))
        mfccs.append(frameMfccs)
    return np.array(mfccs)


# MFCCs for every frame of every training clip, using the default nMFCC of 13.
mfccResults = extractMFCCs(framedAudioArray, sampleRate)

print(mfccResults)





[[[-5.31367615e+02  3.27622452e+01  1.67675018e+00 ... -4.24910116e+00
    8.25844765e+00 -3.38834858e+00]
  [-5.39339966e+02  2.71179314e+01  9.82077789e+00 ...  1.04446328e+00
    1.43125963e+01 -3.64102268e+00]
  [-5.29686584e+02  3.48548508e+01  7.69137335e+00 ... -2.21143246e+00
    8.77179527e+00 -2.13400984e+00]
  ...
  [-4.77821442e+02  7.22203140e+01  2.35592270e+01 ...  1.69313693e+00
    9.24404240e+00  2.42158461e+00]
  [-4.85281372e+02  6.38906822e+01  1.72518635e+01 ...  2.45479059e+00
    6.94578075e+00  7.07460165e-01]
  [-4.86913483e+02  4.45979919e+01  2.71645908e+01 ...  9.09841824e+00
    8.06748867e+00  7.37948045e-02]]

 [[-4.39548981e+02  4.57504883e+01 -2.58793449e+01 ... -8.08251381e+00
    2.42736831e-01 -1.04422092e+01]
  [-4.42875122e+02  4.26021957e+01 -2.09490242e+01 ... -1.16816893e+01
    5.69621658e+00 -2.95763993e+00]
  [-4.36072052e+02  4.99048462e+01 -1.98052921e+01 ... -6.59005070e+00
    5.89494658e+00 -5.13127279e+00]
  ...
  [-4.37170319e+02  3.9

In [20]:
# Dimensions of the MFCC feature array.
print(mfccResults.shape)

(2450, 79, 13)


## Combining Gabor and 13 MFCC into one DF

In [21]:
# Gabor features: flatten everything after the first axis so each clip becomes
# one row of scalar columns (columns are labelled 0..N-1).
gaborDf = pd.DataFrame(gaborResults.reshape(gaborResults.shape[0], -1))

# MFCC features: one column per coefficient. The coefficient count is read from
# the data instead of being hard-coded, so a different nMFCC still works.
# Each cell holds that coefficient's values across all frames of the clip,
# unlike the scalar Gabor columns.
numMfcc = mfccResults.shape[-1]
mfccDf = pd.DataFrame({
    f'MFCC_{i+1}': list(mfccResults[:, :, i])
    for i in range(numMfcc)
})

# Both frames must describe the same clips in the same order; concat aligns on
# the row index and would silently pad with NaN otherwise.
assert len(gaborDf) == len(mfccDf), "Gabor and MFCC features cover different numbers of clips"

# Combine both DataFrames side by side
combinedDf = pd.concat([gaborDf, mfccDf], axis=1)

# Print the final DataFrame structure
print(combinedDf)

             0         1         2         3         4         5         6         7         8         9  ...                                             MFCC_4                                             MFCC_5                                             MFCC_6                                             MFCC_7                                             MFCC_8                                             MFCC_9                                            MFCC_10                                            MFCC_11                                            MFCC_12                                            MFCC_13
0     0.038385  0.057247 -0.036955 -0.188098 -0.277405 -0.236645 -0.119185 -0.058672 -0.035391  0.023796  ...  [38.07473, 36.32576, 32.275173, 28.390907, 36....  [3.1461601, 8.868343, -0.41999578, -6.104909, ...  [19.651024, 25.30933, 27.693523, 21.107101, 21...  [-3.1288872, -3.520518, -3.7910862, 1.2677432,...  [23.192993, 18.8126, 14.973905, 11.7588215, 15...  [0.6565336, 11

In [22]:
# Rows = clips; columns = flattened Gabor columns plus one column per MFCC.
print(combinedDf.shape)

(2450, 31613)
