In [1]:
import os
from os import listdir

import numpy as np
import librosa # pip install librosa
import librosa.display

In [2]:
# Directory that should store the audio files. feature_extraction() takes each
# clip's label from the part of its file name before the first underscore.
audio_dir = 'audio_clips/'

In [3]:
# Function to extract features from the audio files
def feature_extraction(dir):
    """Build a feature matrix and a label list from the audio files in a directory.

    For every regular file in the directory, five librosa descriptors are
    computed, each one is averaged over its time frames, and the averages are
    concatenated into a single 1-D feature vector (in this order): MFCCs
    (40 coefficients), chromagram, mel-scaled spectrogram, spectral contrast
    and tonal centroid (tonnetz) features.

    The label of a clip is the part of its file name before the first
    underscore.

    Parameters
    ----------
    dir : str
        Directory containing the audio clips; a trailing path separator is
        optional. (The name shadows the builtin ``dir`` and is kept only for
        backward compatibility with existing callers.)

    Returns
    -------
    np_features : numpy.ndarray
        One row per audio file, in sorted file-name order.
    targets : list of str
        Label of each row of ``np_features``, in the same order.
    """
    features = []
    targets = []

    # listdir() returns entries in arbitrary order; sorting keeps the row order
    # reproducible, which matters because later cells select rows by position.
    for audio_file in sorted(listdir(dir)):
        audio_path = os.path.join(dir, audio_file)
        if not os.path.isfile(audio_path):
            continue # Skip sub-directories; only regular files can be decoded

        X, sample_rate = librosa.load(audio_path, res_type='kaiser_fast') # Convert audio file into a time series numpy array
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0) # Extract Mel-frequency cepstral coefficients
        stft = np.abs(librosa.stft(y=X)) # Magnitude of the Short-time Fourier transform (STFT)
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0) # Calculate chromagram
        # The signal must be passed by keyword: newer librosa releases reject a positional argument here
        mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0) # Calculate mel-scaled spectrogram
        contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T, axis=0) # Calculate spectral contrast
        harmonic = librosa.effects.harmonic(y=X) # Harmonic component of the signal
        tonnetz = np.mean(librosa.feature.tonnetz(y=harmonic, sr=sample_rate).T, axis=0) # Calculate tonal centroid features

        feat = np.concatenate((mfccs, chroma, mel, contrast, tonnetz), axis=0)
        features.append(feat)

        # The label is the file-name prefix up to the first underscore
        name = audio_file.split('_')[0]
        targets.append(name)

    np_features = np.array(features)

    return np_features, targets

In [5]:
from sklearn.preprocessing import LabelEncoder
# Use the public keras.utils namespace; the private keras.utils.np_utils module
# is not available in newer Keras releases.
from keras.utils import to_categorical

In [6]:
# Feature matrix for the model (one row per clip) and the matching labels
X_data, y_data = feature_extraction(audio_dir)

In [7]:
# (number of clips, number of features per clip)
X_data.shape

(15, 193)

In [8]:
# Labels taken from the file-name prefixes, one per clip
y_data

['david',
 'david',
 'david',
 'david',
 'kene',
 'kene',
 'kene',
 'moni',
 'moni',
 'moni',
 'moni',
 'steph',
 'steph',
 'steph',
 'steph']

In [9]:
# Map the label strings to integer class ids, then one-hot encode the ids
encoder = LabelEncoder()
class_ids = encoder.fit_transform(y_data)
y_data = to_categorical(class_ids)

In [10]:
# One-hot encoded labels: one row per clip, one column per class
y_data

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]], dtype=float32)

In [17]:
# Just a test: the data set is very small, so the split is done by hand to make
# sure every class is represented in the training data.
# Training the model on two audio clips per class; the remaining clips are the test set.
train_rows = [0, 2, 5, 6, 8, 10, 11, 14]
test_rows = [1, 3, 4, 7, 9, 12, 13]

X_train, y_train = X_data[train_rows], y_data[train_rows]
X_test, y_test = X_data[test_rows], y_data[test_rows]

In [18]:
# One-hot labels of the training clips
y_train

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]], dtype=float32)

In [19]:
# One-hot labels of the test clips
y_test

array([[1., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 1.]], dtype=float32)

In [20]:
# (number of test clips, number of features per clip)
X_test.shape

(7, 193)

In [21]:
from sklearn.preprocessing import StandardScaler

In [22]:
# Standardise the features. The scaler is fitted on the training data only and
# the same mean/variance is then applied to the test data, so both sets share
# one scale and no test-set statistics leak into the preprocessing.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [23]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, Convolution2D, MaxPooling2D
from keras.callbacks import EarlyStopping

In [28]:
# Fully connected classifier: three ReLU hidden layers, each followed by
# dropout, and a softmax output layer with one unit per class.
# Input and output widths are read from the data instead of being hard-coded,
# so the cell keeps working if the feature set or the number of classes changes.
n_features = X_train.shape[1]  # length of one feature vector
n_classes = y_train.shape[1]   # width of the one-hot labels

model = Sequential()

model.add(Dense(193, input_shape=(n_features,), activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))

model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(n_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

In [29]:
# Training the model
# NOTE: batch_size (256) is larger than the 8-clip training set built above, so
# each epoch should amount to a single full-batch update.
model.fit(X_train, y_train, batch_size=256, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.callbacks.History at 0x268870de6c8>

In [30]:
# Prediction: for each test clip, take the class with the highest softmax probability.
# (Sequential.predict_classes is not available in newer Keras/TensorFlow releases,
# so the class ids are derived from predict() directly.)
preds = np.argmax(model.predict(X_test), axis=-1)

In [31]:
# Predicted class ids for the test clips
preds

array([0, 2, 1, 0, 2, 3, 3], dtype=int64)

In [32]:
# Map the predicted class ids back to the original label strings
preds_names = encoder.inverse_transform(preds)

In [33]:
# Predicted labels for the test clips
preds_names

array(['david', 'moni', 'kene', 'david', 'moni', 'steph', 'steph'],
      dtype='<U5')

In [34]:
# Recover the true class id of every test clip from its one-hot row.
# argmax yields exactly one id per row, so y_tq always has the same length and
# order as preds (the previous hand-written search skipped rows without a 1).
y_tq = np.argmax(y_test, axis=1).tolist()

In [35]:
# True class ids of the test clips
y_tq

[0, 0, 1, 2, 2, 3, 3]

In [36]:
# Number of test clips whose predicted class id equals the true class id
res = sum(1 for idx, predicted in enumerate(preds) if predicted == y_tq[idx])

In [37]:
# Model accuracy: fraction of test clips that were classified correctly
res/len(preds)

0.7142857142857143