In [None]:
import pandas as pd
import numpy as np
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import recall_score, confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [None]:
# Metadata table; the cells below use its filename, is_dysarthria and gender columns.
csv_path = "../input/dysarthria-detection/torgo_data/data.csv"
data = pd.read_csv(csv_path)
print(data.shape)

In [None]:
dir_ = "../input/dysarthria-detection/"

# Prefix only the paths that do not already start with dir_, so that re-running
# this cell does not prepend the directory a second time.
needs_prefix = ~data['filename'].str.startswith(dir_, na=False)
data.loc[needs_prefix, 'filename'] = dir_ + data.loc[needs_prefix, 'filename']

In [None]:
# Preview the first rows to check the file paths and label columns.
data.head()

In [None]:
# Class balance: number of rows per is_dysarthria label.
sns.countplot(x=data['is_dysarthria'])

## **Defining functions for various plots**

In [None]:
#Waveplot
def get_waveplot(path, target, gender):
    """Load the audio file at `path` and draw its waveform.

    `target` and `gender` are strings that are only used to build the
    figure title.
    """
    signal, sample_rate = librosa.load(path)
    plt.figure(figsize=(20,5))
    librosa.display.waveshow(signal, sr=sample_rate)
    heading = "".join(["Target: ", target, " ", "Gender: ", gender])
    plt.title(heading)
    plt.show()

In [None]:
#Mel-Frequency Cepstral Coefficient -> 128 coeffs
def get_mfcc(path, target, gender):
    """Load the audio file at `path`, print the shape of its MFCC matrix and plot it.

    `target` and `gender` are strings that are only used to build the
    figure title.
    """
    signal, sample_rate = librosa.load(path)
    coeffs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=128)
    print(coeffs.shape)
    plt.figure(figsize=(20, 5))
    librosa.display.specshow(coeffs, sr=sample_rate, x_axis='time')
    heading = "".join(['MFCC of ', target, " ", gender])
    plt.title(heading)
    plt.show()

In [None]:
#Mel-Spectrogram -> 128 bands
def get_melspec(path, target, gender):
    """Load the audio file at `path`, print the shape of its mel spectrogram and plot it.

    `target` and `gender` are strings that are only used to build the
    figure title.
    """
    signal, sample_rate = librosa.load(path)
    mel_power = librosa.feature.melspectrogram(y=signal, sr=sample_rate, n_mels=128)
    print(mel_power.shape)
    plt.figure(figsize=(20, 5))
    librosa.display.specshow(mel_power, sr=sample_rate, x_axis='time')
    heading = "".join(['Mel Spectrogram of ', target, ' ', gender])
    plt.title(heading)
    plt.show()

## **Visualizing Waveplots**

**Speech of a male with no dysarthria**

In [None]:
# Rows for male speakers without dysarthria; the recording at position 20 is shown and played.
is_match = (data['is_dysarthria'] == 'non_dysarthria') & (data['gender'] == 'male')
sample = data[is_match]
clip_path, clip_target, clip_gender = (
    sample[col].values[20] for col in ('filename', 'is_dysarthria', 'gender')
)
get_waveplot(clip_path, clip_target, clip_gender)
ipd.Audio(clip_path)

**Male with dysarthria**

In [None]:
# Rows for male speakers with dysarthria; the recording at position 20 is shown and played.
is_match = (data['is_dysarthria'] == 'dysarthria') & (data['gender'] == 'male')
sample = data[is_match]
clip_path, clip_target, clip_gender = (
    sample[col].values[20] for col in ('filename', 'is_dysarthria', 'gender')
)
get_waveplot(clip_path, clip_target, clip_gender)
ipd.Audio(clip_path)

**Female with no dysarthria**

In [None]:
# Rows for female speakers without dysarthria; the recording at position 20 is shown and played.
is_match = (data['is_dysarthria'] == 'non_dysarthria') & (data['gender'] == 'female')
sample = data[is_match]
clip_path, clip_target, clip_gender = (
    sample[col].values[20] for col in ('filename', 'is_dysarthria', 'gender')
)
get_waveplot(clip_path, clip_target, clip_gender)
ipd.Audio(clip_path)

**Female with dysarthria**

In [None]:
# Rows for female speakers with dysarthria; the recording at position 20 is shown and played.
is_match = (data['is_dysarthria'] == 'dysarthria') & (data['gender'] == 'female')
sample = data[is_match]
clip_path, clip_target, clip_gender = (
    sample[col].values[20] for col in ('filename', 'is_dysarthria', 'gender')
)
get_waveplot(clip_path, clip_target, clip_gender)
ipd.Audio(clip_path)

## **Visualizing MFCC features**

In [None]:
# One MFCC plot per (label, gender) combination, always using the recording at position 20.
target_labels = np.unique(data['is_dysarthria'])
gender_labels = np.unique(data['gender'])
for target_label in target_labels:
    for gender_label in gender_labels:
        is_match = (data['is_dysarthria'] == target_label) & (data['gender'] == gender_label)
        sample = data[is_match]
        clip_path, clip_target, clip_gender = (
            sample[col].values[20] for col in ('filename', 'is_dysarthria', 'gender')
        )
        get_mfcc(clip_path, clip_target, clip_gender)

## **Visualizing Mel-Spectrograms**

In [None]:
# One mel-spectrogram plot per (label, gender) combination, always using the recording at position 20.
target_labels = np.unique(data['is_dysarthria'])
gender_labels = np.unique(data['gender'])
for target_label in target_labels:
    for gender_label in gender_labels:
        is_match = (data['is_dysarthria'] == target_label) & (data['gender'] == gender_label)
        sample = data[is_match]
        clip_path, clip_target, clip_gender = (
            sample[col].values[20] for col in ('filename', 'is_dysarthria', 'gender')
        )
        get_melspec(clip_path, clip_target, clip_gender)

**Observation:**

Dysarthric samples - the plots are much darker and show repetitive patterns, indicating the slurring present in the speech.

Non-dysarthric samples - the plots are more spread out.

## **Extracting MFCC features**

**At most 128 MFCCs can be extracted here, because librosa's default mel filter bank has 128 bands**

In [None]:
def create_mfcc_features(path):
    """Return the time-averaged MFCC vector for the audio file at `path`.

    The file is loaded with librosa, 128 MFCCs are computed per frame, and the
    frames are averaged, so a successful call returns a 1-D array with one
    value per coefficient.

    If anything goes wrong while loading or processing the file, the error is
    reported together with the offending path and an empty list is returned.
    Callers that stack the results into a DataFrame get a row of missing
    values for such files.
    """
    try:
        x, sr = librosa.load(path, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=x, sr=sr, n_mfcc=128)
        # Transposing puts frames on axis 0, so the mean collapses time.
        return np.mean(mfccs.T, axis=0)
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # through, so a long extraction run can still be interrupted.
        print(f"Error in audio '{path}': {exc}")
        # Always fall back to the empty marker, never a half-computed matrix.
        return []

In [None]:
from tqdm.notebook import tqdm
# Register tqdm's pandas integration so Series.progress_apply is available below.
tqdm.pandas()

In [None]:
# One feature vector per file (with a progress bar), stacked into one row per recording.
mfcc_rows = data['filename'].progress_apply(create_mfcc_features).tolist()
X_df = pd.DataFrame(mfcc_rows)

In [None]:
# Keep an untouched copy of the extracted features before rows with missing values are dropped.
# NOTE(review): `X` does not appear to be used again in the visible notebook — confirm it is still needed.
X= X_df.copy()

## **Data Preparation**

In [None]:
# Row labels of recordings whose feature vector contains at least one missing value
# (these are the files for which feature extraction failed).
# `axis` is passed by keyword: pandas 2.x no longer accepts it positionally.
X_df.loc[pd.isnull(X_df).any(axis=1), :].index.values

In [None]:
# Drop every row that has a missing value; the surviving rows keep their original index labels.
X_df = X_df.dropna()
print(X_df.shape)
X_df.head()

In [None]:
encoder = LabelEncoder()
y_all = encoder.fit_transform(data['is_dysarthria'])

# Keep exactly the labels whose feature row survived dropna(). X_df was built
# from a plain list, so its remaining index labels are the original row
# positions in `data`. This replaces a hard-coded `np.delete(y, 600)`, which
# is only correct when row 600 is the one and only failed file.
y = y_all[X_df.index.to_numpy()]
assert len(y) == len(X_df), "labels and features are misaligned"
y.shape

In [None]:
# Fixed seed so the train/validation/test split is identical on every run;
# stratifying keeps the class proportions the same in all three subsets.
SEED = 42

# 10% of all samples go to validation, then 10% of the remainder to test.
x_train, x_val, y_train, y_val = train_test_split(
    np.array(X_df), y, test_size=0.1, random_state=SEED, stratify=y
)
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.1, random_state=SEED, stratify=y_train
)
x_train.shape, x_val.shape, x_test.shape

In [None]:
from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training split only and reuse its statistics for the
# validation and test splits, so no information from them leaks into training.
# Upper-case X_* are the standardized versions of the lower-case x_* arrays.
sc = StandardScaler()
X_train= sc.fit_transform(x_train)
X_val = sc.transform(x_val)
X_test = sc.transform(x_test)

In [None]:
# Fold each 128-value feature vector into a 16 x 8 single-channel "image" for the CNN.
X_train, X_val, X_test = (
    split.reshape(split.shape[0], 16, 8, 1) for split in (X_train, X_val, X_test)
)

## **CNN Model**

In [None]:
tf.keras.backend.clear_session()

# Three Conv2D + MaxPooling2D stages with 32, 16 and 8 filters, followed by
# dropout, a 64-unit dense layer, and a single sigmoid unit for the binary label.
keras_layers = tf.keras.layers
network = [keras_layers.InputLayer(input_shape=(16, 8, 1))]
for n_filters in (32, 16, 8):
    network.append(
        keras_layers.Conv2D(filters=n_filters, kernel_size=(3, 3), activation='relu', padding='same')
    )
    network.append(keras_layers.MaxPooling2D((2, 2), padding='valid'))
network.extend([
    keras_layers.Flatten(),
    keras_layers.Dropout(0.5),
    keras_layers.Dense(64, activation='relu'),
    keras_layers.Dense(1, activation='sigmoid'),
])

model = tf.keras.Sequential(network)

model.summary()

**Model 1 (Adam Optimizer - Learning Rate = 0.001, Beta 1 = 0.9, Beta 2 = 0.999)**

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Stop once val_loss has not improved for 3 consecutive epochs and roll the
# model back to the best epoch, so later evaluation does not use weights from
# the 3 non-improving epochs.
callback_ = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Name the metric explicitly: the plotting cells read history.history['recall'],
# and an unnamed metric can be auto-suffixed (e.g. 'recall_1') when this cell is re-run.
metrics = [tf.keras.metrics.Recall(name='recall')]
adam1 = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999)

In [None]:
# Model 1: binary cross-entropy loss with the adam1 optimizer and the metric list defined above.
model.compile(optimizer= adam1, loss='binary_crossentropy', metrics=metrics)

In [None]:
# epochs is only an upper bound; the early-stopping callback decides when training actually ends.
history = model.fit(X_train, y_train, epochs=100, callbacks=[callback_], validation_data=(X_val, y_val))

In [None]:
# Training loss and training recall per epoch for model 1.
curves = {"LOSS": 'loss', "RECALL": 'recall'}
for curve_label, history_key in curves.items():
    plt.plot(history.history[history_key], label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Loss/ Recall')
plt.legend(loc='best')
plt.show()

In [None]:
# Evaluate once on the held-out test split. The figures are test results, not
# validation results, so they are labelled accordingly.
test_loss, test_recall = model.evaluate(X_test, y_test)
print("Test Loss: ", test_loss)
print("Test Recall: ", test_recall)

In [None]:
# The network ends in a single sigmoid unit, so predict() yields one probability
# per sample in a column; flatten it to 1-D so the label encoder receives the
# shape it expects instead of a column vector.
y_prob = model.predict(X_test).ravel()

# Threshold at 0.5, then map the 0/1 codes back to the original string labels.
y_pred = encoder.inverse_transform((y_prob >= 0.5).astype("int"))

In [None]:
# Confusion matrix
# Use the encoder's class order for both the matrix and the tick labels, so a
# row/column can never be annotated with the wrong class name (the order in
# which labels first appear in `data` is not guaranteed to match).
categories = list(encoder.classes_)
cf_matrix = confusion_matrix(encoder.inverse_transform(y_test), y_pred, labels=categories)
sns.heatmap(cf_matrix, annot = True, fmt = '', xticklabels = categories, yticklabels = categories)
plt.xlabel("Predicted values")
plt.ylabel("Actual values")
plt.title ("Confusion Matrix")
plt.show()

In [None]:
# F1 on the test split, computed on the integer-encoded labels.
# NOTE(review): f1_score treats code 1 as the positive class by default; with a
# LabelEncoder that is presumably 'non_dysarthria' — confirm this is the intended positive class.
print("F1-score of model 1: ", f1_score((y_test), encoder.transform(y_pred)))

**Model 2 (Adam Optimizer - Learning Rate = 0.001, Beta 1 = 0.8, Beta 2 = 0.9)**

In [None]:
# Same learning rate as model 1, with smaller beta_1 / beta_2 values.
adam2 = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.8, beta_2=0.9)

In [None]:
# Start model 2 from freshly initialised weights. Re-compiling the existing
# model would keep the weights already trained in the previous experiment, so
# the optimizers would not be compared from the same starting point.
# clone_model rebuilds the same architecture with new layers and new weights.
model = tf.keras.models.clone_model(model)
model.compile(
    optimizer=adam2,
    loss='binary_crossentropy',
    # Explicit name keeps the history key 'recall' that the plotting cell reads.
    metrics=[tf.keras.metrics.Recall(name='recall')],
)

In [None]:
# epochs is only an upper bound; the early-stopping callback decides when training actually ends.
history = model.fit(X_train, y_train, epochs=500, callbacks=[callback_], validation_data=(X_val, y_val))

In [None]:
# Training loss and training recall per epoch for model 2.
curves = {"LOSS": 'loss', "RECALL": 'recall'}
for curve_label, history_key in curves.items():
    plt.plot(history.history[history_key], label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Loss/ Recall')
plt.legend(loc='best')
plt.show()

In [None]:
# Evaluate once on the held-out test split (previously run three times and
# labelled as validation results).
test_loss2, test_recall2 = model.evaluate(X_test, y_test)
print("Test Loss: ", test_loss2)
print("Test Recall: ", test_recall2)
# Keep the [loss, recall] pair as the cell's displayed output.
[test_loss2, test_recall2]

In [None]:
# The network ends in a single sigmoid unit, so predict() yields one probability
# per sample in a column; flatten it to 1-D so the label encoder receives the
# shape it expects instead of a column vector.
y_prob2 = model.predict(X_test).ravel()

# Threshold at 0.5, then map the 0/1 codes back to the original string labels.
y_pred2 = encoder.inverse_transform((y_prob2 >= 0.5).astype("int"))

In [None]:
# Confusion matrix
# Use the encoder's class order for both the matrix and the tick labels, so a
# row/column can never be annotated with the wrong class name (the order in
# which labels first appear in `data` is not guaranteed to match).
categories = list(encoder.classes_)
cf_matrix = confusion_matrix(encoder.inverse_transform(y_test), y_pred2, labels=categories)
sns.heatmap(cf_matrix, annot = True, fmt = '', xticklabels = categories, yticklabels = categories)
plt.xlabel("Predicted values")
plt.ylabel("Actual values")
plt.title ("Confusion Matrix")
plt.show()

In [None]:
# F1 on the test split, computed on the integer-encoded labels.
# NOTE(review): f1_score treats code 1 as the positive class by default; with a
# LabelEncoder that is presumably 'non_dysarthria' — confirm this is the intended positive class.
print("F1-score of model 2: ", f1_score((y_test), encoder.transform(y_pred2)))

**Model 3 (Adam Optimizer - Learning Rate = 0.01, Beta 1 = 0.9, Beta 2 = 0.999)**

In [None]:
# Same beta values as model 1, with a ten times larger learning rate.
adam3 = tf.keras.optimizers.Adam(learning_rate=0.01, beta_1=0.9, beta_2=0.999)

In [None]:
# Start model 3 from freshly initialised weights. Re-compiling the existing
# model would keep the weights already trained in the previous experiments, so
# the optimizers would not be compared from the same starting point.
# clone_model rebuilds the same architecture with new layers and new weights.
model = tf.keras.models.clone_model(model)
model.compile(
    optimizer=adam3,
    loss='binary_crossentropy',
    # Explicit name keeps the history key 'recall' that the plotting cell reads.
    metrics=[tf.keras.metrics.Recall(name='recall')],
)

In [None]:
# epochs is only an upper bound; the early-stopping callback decides when training actually ends.
history = model.fit(X_train, y_train, epochs=500, callbacks=[callback_], validation_data=(X_val, y_val))

In [None]:
# Training loss and training recall per epoch for model 3.
curves = {"LOSS": 'loss', "RECALL": 'recall'}
for curve_label, history_key in curves.items():
    plt.plot(history.history[history_key], label=curve_label)
plt.xlabel('Epochs')
plt.ylabel('Loss/ Recall')
plt.legend(loc='best')
plt.show()

In [None]:
# Evaluate once on the held-out test split (previously run three times and
# labelled as validation results).
test_loss3, test_recall3 = model.evaluate(X_test, y_test)
print("Test Loss: ", test_loss3)
print("Test Recall: ", test_recall3)
# Keep the [loss, recall] pair as the cell's displayed output.
[test_loss3, test_recall3]

In [None]:
# The network ends in a single sigmoid unit, so predict() yields one probability
# per sample in a column; flatten it to 1-D so the label encoder receives the
# shape it expects instead of a column vector.
y_prob3 = model.predict(X_test).ravel()

# Threshold at 0.5, then map the 0/1 codes back to the original string labels.
y_pred3 = encoder.inverse_transform((y_prob3 >= 0.5).astype("int"))

In [None]:
# Confusion matrix
# Use the encoder's class order for both the matrix and the tick labels, so a
# row/column can never be annotated with the wrong class name (the order in
# which labels first appear in `data` is not guaranteed to match).
categories = list(encoder.classes_)
cf_matrix = confusion_matrix(encoder.inverse_transform(y_test), y_pred3, labels=categories)
sns.heatmap(cf_matrix, annot = True, fmt = '', xticklabels = categories, yticklabels = categories)
plt.xlabel("Predicted values")
plt.ylabel("Actual values")
plt.title ("Confusion Matrix")
plt.show()

In [None]:
# F1 on the test split, computed on the integer-encoded labels.
# NOTE(review): f1_score treats code 1 as the positive class by default; with a
# LabelEncoder that is presumably 'non_dysarthria' — confirm this is the intended positive class.
print("F1-score of model 3: ", f1_score((y_test), encoder.transform(y_pred3)))

**Comparison of the 3 models:**

We can see that when the values of beta_1 and beta_2 are reduced in model 2 while the learning rate is kept constant, the number of training epochs drops drastically. Model 2 produces more false positives.

When the beta values are kept the same as in model 1 and the learning rate is increased, we again see fewer epochs than for the first model (faster learning). False positives also increase in model 3.

Conclusion: The best model is model 1 (small learning rate and beta values close to 1).