In [None]:
import glob
from pathlib import Path

import librosa
import numpy as np
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt
from scipy.io import wavfile
from scipy.signal import spectrogram
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [None]:
!unzip Data2.zip

In [None]:
# Dataset files should be located at the same directory with this notebook,
# laid out as Data/<class name>/<recording>.wav.

# glob returns matches in arbitrary, file-system-dependent order. Sorting keeps
# the row order (and therefore the seeded train/test split) stable across runs.
wav_files = sorted(glob.glob("Data/**/*.wav"))
if not wav_files:
    raise FileNotFoundError(
        "No .wav files matched 'Data/**/*.wav'; extract the dataset next to this notebook."
    )

# The class label is the name of the folder that directly contains the file.
# Path copes with both '/' and '\\' separators, unlike splitting on '/'.
dataset = [{'file': file, 'class': Path(file).parent.name} for file in wav_files]

dataframe = pd.DataFrame.from_dict(dataset)
# wavfile.read returns (sample_rate, samples); only the samples are kept.
dataframe['data'] = dataframe['file'].apply(lambda path: wavfile.read(path)[1])

print("Dataset size is", dataframe['data'].size, "elements.")

In [None]:
# Feature extraction using librosa (MFCCs)
def extract_features(file, n_mfcc=13):
    """Summarise one audio file as a fixed-length vector of mean MFCCs.

    Parameters
    ----------
    file : str or path-like
        Audio file to load; passed straight to ``librosa.load``.
    n_mfcc : int, default 13
        Number of MFCC coefficients to compute. The default reproduces the
        value that used to be hard-coded.

    Returns
    -------
    numpy.ndarray
        One value per coefficient: the MFCC matrix averaged over its second
        axis.
    """
    # sr=None asks librosa to keep the file's own sampling rate.
    y, sr = librosa.load(file, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    # Transpose so axis 0 runs over the second axis of `mfcc`, then average it
    # away, leaving a vector of length n_mfcc regardless of clip duration.
    return np.mean(mfcc.T, axis=0)

In [None]:
# Compute the MFCC summary vector for every recording, one file path at a time
dataframe['features'] = dataframe['file'].map(extract_features)

In [None]:
# Optional: draw one random recording from each class to look at its features
normal = dataframe.loc[dataframe['class'].eq('Normal_2')].sample(1)
murmur = dataframe.loc[dataframe['class'].eq('Murmur_2')].sample(1)
extrastole = dataframe.loc[dataframe['class'].eq('Extrastole_2')].sample(1)

# Plot helper for a single sampled feature vector
def visualize(data, typename, color, sampling_frequency=48000):
    """Plot the feature vector held in the first element of ``data``.

    Parameters
    ----------
    data : pandas.Series
        Series whose first element is the feature vector to draw.
    typename : str
        Class name, used as the prefix of the figure title.
    color : str
        Matplotlib colour of the line.
    sampling_frequency : int, optional
        Not used by the plot; kept only so existing calls that pass it keep
        working.
    """
    plt.figure(figsize=(12, 6))
    plt.title(typename + ' MFCC features plot')
    plt.plot(data.values[0], c=color)
    # The plotted vector holds one averaged value per MFCC coefficient, so no
    # time axis is left: x is the coefficient index, y its mean value.
    plt.xlabel('MFCC coefficient index')
    plt.ylabel('Mean MFCC value')

# One figure per class, each in its own colour
for sample, label, colour in (
    (normal, "Normal_2", 'g'),
    (murmur, "Murmur_2", 'b'),
    (extrastole, "Extrastole_2", 'r'),
):
    visualize(sample['features'], label, colour)

In [None]:
# Model inputs: one row of MFCC means per recording, and the matching class labels
X = np.stack(dataframe['features'].tolist(), axis=0)
y = dataframe['class'].to_numpy()

In [None]:
# Hold out 10% of the recordings as the test set; the fixed seed keeps the
# split identical between runs. scikit-learn and train_test_split are already
# imported in the first cell, so no install or re-import is needed here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the standardisation statistics from the training rows only, so nothing
# about the test set influences the scaling.
scaler = StandardScaler().fit(X_train)

# Standardise both splits with those training-set statistics
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Hyperparameter grid for the KNN search
param_grid = {
    'n_neighbors': np.arange(1, 20),  # k = 1 .. 19 (the upper bound is exclusive)
    # 'minkowski' is left out: with KNeighborsClassifier's default p=2 it is
    # the same distance as 'euclidean', so it only repeated every fit.
    'metric': ['euclidean', 'manhattan'],
    'weights': ['uniform', 'distance'],
}

# Base estimator whose parameters the search varies
knn = KNeighborsClassifier()

# 10-fold cross-validation on the training data, scored by accuracy
grid_search = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy')

# Run the search on the scaled training data
grid_search.fit(X_train_scaled, y_train)

print("Best parameters found:", grid_search.best_params_)

# best_estimator_ is the model refitted with the best parameter combination
best_knn = grid_search.best_estimator_
knn_predictions = best_knn.predict(X_test_scaled)

# Accuracy on the held-out test set
print("Best KNN accuracy:", accuracy_score(y_test, knn_predictions))

# Mean cross-validated accuracy of the best parameter combination
print("Best Validation Accuracy from GridSearchCV:", grid_search.best_score_)

Best parameters found: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Best KNN accuracy: 0.9420289855072463
Best Validation Accuracy from GridSearchCV: 0.9290322580645162


  _data = np.array(data, dtype=dtype, copy=copy,


# Methods for Validation Accuracy

**1. Using 'GridSearchCV' Cross-Validation accuracy**

mean_test_score: This represents the average validation accuracy across all cross-validation folds.

std_test_score: This is the standard deviation of the validation accuracy across the folds, giving you a sense of how much variability there was in the validation performance.

In [None]:
# grid_search was already fitted in the previous cell. Fitting it again would
# repeat the entire search only to reach the same result, so it is reused here.
print("Best parameters found:", grid_search.best_params_)

# One row per parameter combination. mean_test_score and std_test_score are the
# mean and standard deviation of the validation accuracy over the CV folds.
cv_results = pd.DataFrame(grid_search.cv_results_)
print("Cross-validation results:")
print(cv_results[['params', 'mean_test_score', 'std_test_score']])

Best parameters found: {'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Cross-validation results:
                                                params  mean_test_score  \
0    {'metric': 'euclidean', 'n_neighbors': 1, 'wei...         0.911290   
1    {'metric': 'euclidean', 'n_neighbors': 1, 'wei...         0.911290   
2    {'metric': 'euclidean', 'n_neighbors': 2, 'wei...         0.770968   
3    {'metric': 'euclidean', 'n_neighbors': 2, 'wei...         0.911290   
4    {'metric': 'euclidean', 'n_neighbors': 3, 'wei...         0.683871   
..                                                 ...              ...   
109  {'metric': 'minkowski', 'n_neighbors': 17, 'we...         0.924194   
110  {'metric': 'minkowski', 'n_neighbors': 18, 'we...         0.712903   
111  {'metric': 'minkowski', 'n_neighbors': 18, 'we...         0.925806   
112  {'metric': 'minkowski', 'n_neighbors': 19, 'we...         0.716129   
113  {'metric': 'minkowski', 'n_neighbors': 19, 'we...       

**2. Manual Validation Split**

In [None]:
# Carve the validation subset out of the *training* data. Scoring on X_test
# here would report a test score under the name "validation" and let the test
# set take part in model selection.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# KNN is distance based, so standardise the features as the rest of the
# notebook does. The scaler is fitted on the fitting subset only, so the
# validation rows do not influence the scaling.
val_scaler = StandardScaler().fit(X_fit)

# Train the model on the fitting subset
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(val_scaler.transform(X_fit), y_fit)

# Calculate validation accuracy
val_predictions = knn.predict(val_scaler.transform(X_val))
val_accuracy = accuracy_score(y_val, val_predictions)
print("Validation accuracy:", val_accuracy)

Validation accuracy: 0.7391304347826086


**3. GridSearchCV with Validation Accuracy Display**

In [None]:
# Take the winning estimator of the grid search and score it on the scaled
# held-out features.
best_knn = grid_search.best_estimator_
knn_val_predictions = best_knn.predict(X_test_scaled)

val_accuracy = accuracy_score(y_true=y_test, y_pred=knn_val_predictions)
print("Validation/Test accuracy of the best model:", val_accuracy)

Validation/Test accuracy of the best model: 0.9420289855072463


In [None]:
from sklearn.metrics import precision_score

# Weighted-average precision of the tuned KNN model's test-set predictions
knn_predictions = best_knn.predict(X_test_scaled)
precision = precision_score(y_true=y_test, y_pred=knn_predictions, average='weighted')

print("Precision of the best KNN model:", precision)

Precision of the best KNN model: 0.945814212750512


In [None]:
from sklearn.metrics import precision_score

# Re-use the tuned estimator from the grid search (run the earlier search cell
# first if grid_search has not been fitted yet).
best_knn = grid_search.best_estimator_

# Weighted-average precision of its predictions on the scaled held-out features
val_predictions = best_knn.predict(X_test_scaled)
precision = precision_score(y_true=y_test, y_pred=val_predictions, average='weighted')

print("Validation Precision of the best KNN model:", precision)

Validation Precision of the best KNN model: 0.945814212750512


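Averaging modes for `precision_score`:

'weighted': Mean of the per-class precision, weighted by the number of true instances of each class (the mode used above).

'macro': Unweighted mean of the per-class precision, so every class counts equally regardless of its size.

'micro': Global precision computed from the total true positives and false positives over all instances; for single-label multiclass data this equals the accuracy.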

In [None]:
# Precision of the same predictions under each averaging mode, in the order
# weighted, macro, micro
precision_weighted, precision_macro, precision_micro = (
    precision_score(y_test, val_predictions, average=averaging)
    for averaging in ('weighted', 'macro', 'micro')
)

# Report the three values
print("Validation Precision (weighted):", precision_weighted)
print("Validation Precision (macro):", precision_macro)
print("Validation Precision (micro):", precision_micro)

Validation Precision (weighted): 0.945814212750512
Validation Precision (macro): 0.9153249850924269
Validation Precision (micro): 0.9420289855072463


In [None]:
import joblib

# Save the trained KNN model to a file. This happens before the Colab import
# so the model is written even when that import is unavailable.
# NOTE: only the classifier is stored. It was fitted on standardised features,
# so new inputs must go through the same scaler before predict().
joblib.dump(best_knn, 'best_knn_model.joblib')

# Optional: offer the file as a browser download when running in Google Colab.
# Elsewhere the google.colab import fails, and there is nothing to download
# through, so the step is skipped instead of crashing the cell.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('best_knn_model.joblib')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>