In [26]:
import os
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# Define the path to the audio files
normal_audio_dir = "All Non Dys Wav"
disorder_audio_dir = "All Dys Wav"

# Create a DataFrame to store audio file paths and labels
data = []

# Load normal audio files
for filename in os.listdir(normal_audio_dir):
    if filename.endswith(".wav"):
        audio_path = os.path.join(normal_audio_dir, filename)
        data.append((audio_path, 0))

# Load disorder audio files
for filename in os.listdir(disorder_audio_dir):
    if filename.endswith(".wav"):
        audio_path = os.path.join(disorder_audio_dir, filename)
        data.append((audio_path, 1))

# Create a DataFrame
df = pd.DataFrame(data, columns=["audio_path", "label"])

# Function to extract MFCC features
def extract_mfcc_features(audio_path):
    y, sr = librosa.load(audio_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    return np.mean(mfccs, axis=1).tolist()

# Function to extract Chroma features
def extract_chroma_features(audio_path):
    y, sr = librosa.load(audio_path, sr=None)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    return np.mean(chroma, axis=1).tolist()

# Function to extract Spectral Contrast features
def extract_spectral_contrast_features(audio_path):
    y, sr = librosa.load(audio_path, sr=None)
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    return np.mean(spectral_contrast, axis=1).tolist()

# Function to extract Tonnetz features
def extract_tonnetz_features(audio_path):
    y, sr = librosa.load(audio_path, sr=None)
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)
    return np.mean(tonnetz, axis=1).tolist()

# Function to extract Zero-Crossing Rate features
def extract_zero_crossing_rate_features(audio_path):
    y, sr = librosa.load(audio_path, sr=None)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
    return np.mean(zero_crossing_rate).tolist()

# Apply MFCC feature extraction to each audio file
df["mfcc_features"] = df["audio_path"].apply(extract_mfcc_features)

# Apply Chroma feature extraction to each audio file
df["chroma_features"] = df["audio_path"].apply(extract_chroma_features)

# Apply Spectral Contrast feature extraction to each audio file
df["spectral_contrast_features"] = df["audio_path"].apply(extract_spectral_contrast_features)

# Apply Tonnetz feature extraction to each audio file
df["tonnetz_features"] = df["audio_path"].apply(extract_tonnetz_features)

# Apply Zero-Crossing Rate feature extraction to each audio file
df["zero_crossing_rate_features"] = df["audio_path"].apply(extract_zero_crossing_rate_features)

# Verify the balance of the dataset
print(df['label'].value_counts())

# Check the first few rows to ensure correct labeling
print(df.head())




label
0    32
1    32
Name: count, dtype: int64
                audio_path  label  \
0    All Non Dys Wav\1.wav      0   
1   All Non Dys Wav\10.wav      0   
2   All Non Dys Wav\11.wav      0   
3   All Non Dys Wav\12.wav      0   
4  All Non Dys Wav\13,.wav      0   

                                       mfcc_features  \
0  [-536.5430908203125, 97.0123519897461, 7.79789...   
1  [-435.2642822265625, 80.30461883544922, 3.3133...   
2  [-461.0809631347656, 114.24716186523438, 0.501...   
3  [-500.19842529296875, 100.60742950439453, -4.4...   
4  [-551.3157958984375, 95.65728759765625, -2.064...   

                                     chroma_features  \
0  [0.4485311508178711, 0.464842289686203, 0.4395...   
1  [0.3792971968650818, 0.39954376220703125, 0.36...   
2  [0.37338632345199585, 0.3548054099082947, 0.36...   
3  [0.2978111803531647, 0.32145676016807556, 0.37...   
4  [0.4390692412853241, 0.4155716598033905, 0.395...   

                          spectral_contrast_features  \

In [27]:

# Ensure the features are combined correctly
features = []
for index, row in df.iterrows():
    combined_features = np.concatenate([
        row['mfcc_features'],
        row['chroma_features'],
        row['spectral_contrast_features'],
        row['tonnetz_features'],
        [row['zero_crossing_rate_features']]
    ])
    features.append(combined_features)

X = np.array(features)

# Debug: Print shape of the combined features
print("Combined Features Shape:", X.shape)

if X.size == 0:
    raise ValueError("No valid feature arrays found. Check data preprocessing steps.")

# Ensure all combined features have the same length
feature_length = X.shape[1]
consistent_length = all(len(feature) == feature_length for feature in X)
if not consistent_length:
    raise ValueError("Inconsistent feature lengths found in the dataset.")

y = df['label'].values

# Normalize features using training data mean and std
mean = np.mean(X, axis=0)
std = np.std(X, axis=0)
X = (X - mean) / std

# Debug: Print shape of X before reshaping
print("Shape of X before reshaping:", X.shape)

# Reshape features to be compatible with Conv1D
X = X.reshape(X.shape[0], X.shape[1], 1)

# Debug: Print shape of X after reshaping
print("Shape of X after reshaping:", X.shape)

# Convert labels to categorical
y = to_categorical(y, num_classes=2)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Save mean and std for use in prediction
np.save('mean.npy', mean)
np.save('std.npy', std)


Combined Features Shape: (64, 39)
Shape of X before reshaping: (64, 39)
Shape of X after reshaping: (64, 39, 1)


In [28]:

# Step 2: Model Implementation

# Define the CNN architecture
model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),
    Conv1D(128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(2, activation='softmax')
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=50, batch_size=32)

# Save the trained model
model.save('dysarthria_model.h5')


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 207ms/step - accuracy: 0.5535 - loss: 0.7077 - val_accuracy: 0.5385 - val_loss: 0.6615
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.4700 - loss: 0.6790 - val_accuracy: 0.5385 - val_loss: 0.6504
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.6058 - loss: 0.6070 - val_accuracy: 0.6923 - val_loss: 0.5926
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.6763 - loss: 0.5624 - val_accuracy: 0.7692 - val_loss: 0.5602
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.8199 - loss: 0.5491 - val_accuracy: 0.8462 - val_loss: 0.5323
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.8329 - loss: 0.5088 - val_accuracy: 0.7692 - val_loss: 0.4981
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37



In [29]:

# Function to predict audio class
def predict_audio_class(audio_path, model):
    # Extract features from the audio file
    mfcc_features = extract_mfcc_features(audio_path)
    chroma_features = extract_chroma_features(audio_path)
    spectral_contrast_features = extract_spectral_contrast_features(audio_path)
    tonnetz_features = extract_tonnetz_features(audio_path)
    zero_crossing_rate_features = extract_zero_crossing_rate_features(audio_path)
    
    # Combine the extracted features
    combined_features = np.concatenate([mfcc_features, chroma_features, 
                                        spectral_contrast_features, tonnetz_features,
                                        [zero_crossing_rate_features]])
    
    # Load mean and std from training phase
    mean = np.load('mean.npy')
    std = np.load('std.npy')
    
    # Normalize the features using training mean and std
    combined_features_normalized = (combined_features - mean) / std
    
    # Reshape the features to match the input shape of the model
    test_input = combined_features_normalized.reshape(1, combined_features_normalized.shape[0], 1)
    
    # Predict using the trained model
    predictions = model.predict(test_input)
    
    # Print prediction probabilities for debugging
    print("Prediction probabilities:", predictions)
    
    # Get the predicted class
    predicted_class = np.argmax(predictions)
    
    return "Dys" if predicted_class == 1 else "Non Dys"

# Load the model for prediction
from tensorflow.keras.models import load_model
model = load_model('dysarthria_model.h5')






In [43]:
# Define the path to the testing dataset
testing_dataset_dir = "Testing Dataset"

# Iterate through each subfolder and predict the class of each audio file
for subfolder in os.listdir(testing_dataset_dir):
    subfolder_path = os.path.join(testing_dataset_dir, subfolder)
    if os.path.isdir(subfolder_path):
        print(f"Processing folder: {subfolder}")
        for filename in os.listdir(subfolder_path):
            if filename.endswith(".wav"):
                audio_path = os.path.join(subfolder_path, filename)
                predicted_label = predict_audio_class(audio_path, model)
                print(f"File: {filename} - Predicted class: {predicted_label}")

Processing folder: Female Dys
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
Prediction probabilities: [[7.953187e-04 9.992047e-01]]
File: MS1.wav - Predicted class: Dys
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Prediction probabilities: [[0.01335469 0.9866453 ]]
File: MS2.wav - Predicted class: Dys
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Prediction probabilities: [[0.36381826 0.6361817 ]]
File: MS3.wav - Predicted class: Dys
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Prediction probabilities: [[0.00200319 0.99799675]]
File: MS4.wav - Predicted class: Dys
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Prediction probabilities: [[0.00932248 0.9906775 ]]
File: MS5.wav - Predicted class: Dys
Processing folder: Female Non Dys
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Prediction probabilities: [[0.9922616  0.00773842]]
Fil

In [44]:
predicted_label = predict_audio_class("32.wav", model)
print("Predicted class:", predicted_label)  

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
Prediction probabilities: [[2.5436416e-04 9.9974567e-01]]
Predicted class: Dys
