In [1]:
import warnings
warnings.filterwarnings('ignore')

import librosa
import os

In [2]:
TRAIN_PATH = 'data/train/' # File path for training data
TEST_PATH = 'data/test/' # File path for Test data
VAL_PATH = 'data/val/' # File path where you upload your own recording to check

speech_cats =['down', 'up'] # We process only two categories - Up and Down

In [3]:
# Connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Unzip the files
!unzip drive/MyDrive/'NLP Data'/train.zip -d ./data/
!unzip drive/MyDrive/'NLP Data'/test.zip -d ./data/

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: ./data/train/down/2151b09a_nohash_1.wav  
  inflating: ./data/train/down/215699ff_nohash_0.wav  
  inflating: ./data/train/down/215699ff_nohash_1.wav  
  inflating: ./data/train/down/2167c2ed_nohash_0.wav  
  inflating: ./data/train/down/21832144_nohash_0.wav  
  inflating: ./data/train/down/21832144_nohash_1.wav  
  inflating: ./data/train/down/21832144_nohash_2.wav  
  inflating: ./data/train/down/21832144_nohash_3.wav  
  inflating: ./data/train/down/21832144_nohash_4.wav  
  inflating: ./data/train/down/2197f41c_nohash_0.wav  
  inflating: ./data/train/down/2275edbb_nohash_0.wav  
  inflating: ./data/train/down/2296b1af_nohash_0.wav  
  inflating: ./data/train/down/2296b1af_nohash_1.wav  
  inflating: ./data/train/down/229978fd_nohash_0.wav  
  inflating: ./data/train/down/229978fd_nohash_1.wav  
  inflating: ./data/train/down/229978fd_nohash_2.wav  
  inflating: ./data/train/down/229978fd_nohash_3.wav  


In [5]:
import os
os.mkdir('./data/val')
os.mkdir('./data/val/up')
os.mkdir('./data/val/down')

In [9]:
import IPython.display as ipd

ipd.Audio('./data/test/up/f0ae7203_nohash_1.wav')

In [10]:
librosa.__version__

'0.10.1'

In [11]:
# Function to process data.
# I/P : Path to folders
# Output is a Dictionary with two lists - one for labels and one for MFCC
def prep_data(path):
    data = {'label': [],'mfcc': []}
    for x in speech_cats:
        print(path+x)
        for fn in os.listdir(path+"/"+x+"/"):
            signal, sr = librosa.load(path+"/"+x+"/"+fn)
            if len(signal) >= 22050:
                mfcc = librosa.feature.mfcc(y=signal[:22050], sr=sr, n_mfcc=13, hop_length = 512, n_fft=2048)

                data['label'].append(x)
                data['mfcc'].append(mfcc.T.tolist())
    return(data)

In [12]:
# Prepare the Test and Train data
test = prep_data(TEST_PATH)
train = prep_data(TRAIN_PATH)

data/test/down
data/test/up
data/train/down
data/train/up


In [13]:
# Import Numpy and prepare the X and y variables
import numpy as np
X_train = np.array(train['mfcc'])
X_test = np.array(test['mfcc'])
y_train = np.array(train['label'])
y_test = np.array(test['label'])

In [14]:
X_train.shape

(4214, 44, 13)

In [15]:
X_test.shape

(525, 44, 13)

In [16]:
np.unique(y_train)

array(['down', 'up'], dtype='<U4')

In [17]:
# Label Encoding for the y variables
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_c = le.fit_transform(y_train)
y_test_c = le.transform(y_test)

In [18]:
le.classes_

array(['down', 'up'], dtype='<U4')

In [20]:
le.transform(['down'])

array([0])

In [21]:
# Import relevant components from Keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense, Dropout

In [22]:
# Instantiate a model
model = Sequential()

In [23]:
# First Conv layer with Maxpool
model.add(Conv2D(128, 3, activation = 'relu',padding='same', input_shape=(X_train.shape[1], X_train.shape[2],1)))
model.add(MaxPool2D(2))

# Second Conv layer with Maxpool
model.add(Conv2D(128, 3, activation = 'relu', padding='same'))
model.add(MaxPool2D(2))

# Third Conv layer with Maxpool
model.add(Conv2D(128, 3, activation = 'relu', padding='same'))
model.add(MaxPool2D(2))

# Flatten layer
model.add(Flatten())

# Dense Layer
model.add(Dense(64, activation = 'relu'))

# Output layer
model.add(Dense(1, activation = 'sigmoid'))

In [24]:
# Compile Model
model.compile(optimizer = 'adam', loss=tf.keras.losses.BinaryCrossentropy(), metrics = ['accuracy'])

In [25]:
# Fit the model
model.fit(X_train, y_train_c, epochs = 10, validation_data = (X_test, y_test_c))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7924281271f0>

In [34]:
!rmdir ./data/val/up/.ipynb_checkpoints/

In [28]:
ipd.Audio('./data/val/up/Up2.wav')

In [35]:
# Since the self recorded files have been appropriately uploaded
# Prepare the validation data
val = prep_data(VAL_PATH)

data/val/down
data/val/up


In [36]:
X_val = np.array(val['mfcc'])

In [37]:
# Predict the validation data
model.predict(X_val)>0.5



array([[ True],
       [False]])

In [None]:
val['label']

['down', 'up']

In [None]:
le.classes_

array(['down', 'up'], dtype='<U4')