Naman Patidar          230679          EE Y23

Importing Libraries

In [None]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense


Loading the data and fixing errors in metadata, as 'yawm' to 'yawn', and standardising the column names for both the metadata



In [None]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
train_metadata = pd.read_csv('/content/drive/MyDrive/DCASE/metadata_train.csv')
test_metadata = pd.read_csv('/content/drive/MyDrive/DCASE/metadata_test.csv')

train_metadata.columns = train_metadata.columns.str.strip().str.replace(' ', '_')
test_metadata.columns = test_metadata.columns.str.strip().str.replace(' ', '_')

train_metadata.rename(columns={'Class_ID': 'Class_id'}, inplace=True)

# Fix "yawm" to "yawn" in test metadata
test_metadata['Classname'] = test_metadata['Classname'].replace('yawm', 'yawn')

# Check the columns of metadata to ensure they are standardized
train_metadata.head(), test_metadata.head()


(           Filename File_ID  Duration_in_ms  Class_id Classname  \
 0  108160-1_0_0.wav  108160            3730         0    breath   
 1  108160-2_0_0.wav  108160            4000         0    breath   
 2  108160-3_0_0.wav  108160            4000         0    breath   
 3  108160-4_0_0.wav  108160            2226         0    breath   
 4  146769-1_0_0.wav  146769            3540         0    breath   
 
    augmentation__id Augmentation__type                  source  
 0                 0            Orignal  https://freesound.org/  
 1                 0            Orignal  https://freesound.org/  
 2                 0            Orignal  https://freesound.org/  
 3                 0            Orignal  https://freesound.org/  
 4                 0            Orignal  https://freesound.org/  ,
            Filename  File_ID  Durationin_ms  Class_id Classname  Augment_Id  \
 0  112557-2_0_0.wav   112557           4000         0    breath           0   
 1  112557-3_0_0.wav   112557    

Extraction of MFCC features from audio file

In [None]:
def extract_mfcc(file_path, sample_rate=32000, duration=4, n_mfcc=13, n_fft=2048, hop_length=512):
    if not os.path.exists(file_path):
        raise ValueError(f"File does not exist: {file_path}")

    audio, sr = librosa.load(file_path, sr=sample_rate, duration=duration)

    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)

    mfcc_resized = np.resize(mfcc, (n_mfcc, 87))
    return mfcc_resized


Loading MFCC features from audio files

In [None]:
def load_mfcc_data(metadata, folder, sample_rate=32000, duration=4, n_mfcc=13):
    X = []
    y = []

    for index, row in metadata.iterrows():
        try:

            file_path = os.path.join('/content/drive/MyDrive/DCASE', folder, row['Filename'])


            mfcc = extract_mfcc(file_path, sample_rate, duration, n_mfcc)


            mfcc_flat = mfcc.flatten()


            y.append(row['Class_id'])
            X.append(mfcc_flat)
        except ValueError as e:
            print(f"Error processing file {row['Filename']}: {e}")

    X = np.array(X)
    y = np.array(y)

    return X, y


In [None]:
X_train, y_train = load_mfcc_data(train_metadata, 'Train')
X_test, y_test = load_mfcc_data(test_metadata, 'Test')


Loading Training and Test Data

Splitting the training data into training and validation sets

In [None]:
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

CNN Model- Padding has been added for those audio files whose duration is less to make the length of audio files uniform. For audio files of larger duration, thay have been clipped.

In [None]:
def create_cnn_feature_extractor(input_shape=(13, 87, 1)):
    model = Sequential()
    model.add(Conv2D(64, (3, 3), activation='relu', input_shape=input_shape, padding='same'))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(128, (3, 3), activation='relu', padding='same'))
    model.add(MaxPooling2D((2, 2)))

    model.add(Conv2D(256, (2, 2), activation='relu', padding='same'))
    model.add(MaxPooling2D((1,1)))
    model.add(Flatten())
    return model

 Reshape data for CNN (13x87 MFCC image shape)

In [None]:
X_train_cnn = X_train_split.reshape(-1, 13, 87, 1)
X_val_cnn = X_val_split.reshape(-1, 13, 87, 1)
X_test_cnn = X_test.reshape(-1, 13, 87, 1)


In [None]:
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)


X_train_cnn = X_train_split.reshape(-1, 13, 87, 1)
X_val_cnn = X_val_split.reshape(-1, 13, 87, 1)
X_test_cnn = X_test.reshape(-1, 13, 87, 1)

cnn_model = create_cnn_feature_extractor(input_shape=(13, 87, 1))

cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

cnn_model.fit(
    X_train_cnn, y_train_split, epochs=10, batch_size=32, validation_data=(X_val_cnn, y_val_split)
)

Epoch 1/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 239ms/step - accuracy: 0.0862 - loss: 16.2323 - val_accuracy: 0.2862 - val_loss: 9.6883
Epoch 2/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 206ms/step - accuracy: 0.2574 - loss: 9.6965 - val_accuracy: 0.2854 - val_loss: 9.7180
Epoch 3/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 210ms/step - accuracy: 0.2713 - loss: 9.7105 - val_accuracy: 0.2854 - val_loss: 9.7253
Epoch 4/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 203ms/step - accuracy: 0.2693 - loss: 9.7065 - val_accuracy: 0.2854 - val_loss: 9.7042
Epoch 5/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 212ms/step - accuracy: 0.2666 - loss: 9.6992 - val_accuracy: 0.2854 - val_loss: 9.7042
Epoch 6/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 219ms/step - accuracy: 0.2699 - loss: 9.6970 - val_accuracy: 0.2854 - val_loss: 9.7042
Epoch 7/1

<keras.src.callbacks.history.History at 0x795636c40ad0>

In [None]:
cnn_features_train = cnn_model.predict(X_train_cnn)
cnn_features_val = cnn_model.predict(X_val_cnn)
cnn_features_test = cnn_model.predict(X_test_cnn)

# Combine CNN features with raw MFCC features
X_train_combined = np.hstack((X_train_split, cnn_features_train))
X_val_combined = np.hstack((X_val_split, cnn_features_val))
X_test_combined = np.hstack((X_test, cnn_features_test))

[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 54ms/step
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 65ms/step
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 71ms/step


Flatten the CNN features

In [None]:
cnn_features_train_flat = cnn_features_train.reshape(cnn_features_train.shape[0], -1)
cnn_features_val_flat = cnn_features_val.reshape(cnn_features_val.shape[0], -1)
cnn_features_test_flat = cnn_features_test.reshape(cnn_features_test.shape[0], -1)

print("Shape of raw MFCC features (train):", X_train_split.shape)
print("Shape of CNN features (train):", cnn_features_train_flat.shape)

X_train_combined = np.hstack((X_train_split, cnn_features_train_flat))
X_val_combined = np.hstack((X_val_split, cnn_features_val_flat))
X_test_combined = np.hstack((X_test, cnn_features_test_flat))



Shape of raw MFCC features (train): (5031, 1131)
Shape of CNN features (train): (5031, 1280)


Train Random Forest Classifier using combined features

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train_combined, y_train_split)


Evaluation metrices and other scores

In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Evaluate on the validation set
rf_val_predictions = rf_classifier.predict(X_val_combined)
print("Random Forest Classification Report (Validation with CNN Features):")

# Get the classification report for validation set
report_val = classification_report(y_val_split, rf_val_predictions, output_dict=True)

for class_label, metrics in report_val.items():
    if class_label != 'accuracy':  # Accuracy will be handled separately
        for metric in metrics:
            metrics[metric] = np.round(metrics[metric] * 100, 2)

print("Validation Report (Percentages):")
for class_label, metrics in report_val.items():
    if class_label != 'accuracy':
        print(f"Class {class_label}:")
        for metric, value in metrics.items():
            print(f"  {metric.capitalize()}: {value}%")
    else:
        print(f"Accuracy: {np.round(metrics * 100, 2)}%")

# Evaluate on the test data
rf_test_predictions = rf_classifier.predict(X_test_combined)
print("\nRandom Forest Classification Report (Test with CNN Features):")

# Get the classification report for test set
report_test = classification_report(y_test, rf_test_predictions, output_dict=True)


for class_label, metrics in report_test.items():
    if class_label != 'accuracy':
        for metric in metrics:
            metrics[metric] = np.round(metrics[metric] * 100, 2)

print("Test Report (Percentages):")
for class_label, metrics in report_test.items():
    if class_label != 'accuracy':
        print(f"Class {class_label}:")
        for metric, value in metrics.items():
            print(f"  {metric.capitalize()}: {value}%")
    else:
        print(f"Accuracy: {np.round(metrics * 100, 2)}%")


Random Forest Classification Report (Validation with CNN Features):
Validation Report (Percentages):
Class 0:
  Precision: 81.32%
  Recall: 78.61%
  F1-score: 79.94%
  Support: 36000.0%
Class 1:
  Precision: 38.89%
  Recall: 35.59%
  F1-score: 37.17%
  Support: 11800.0%
Class 2:
  Precision: 55.36%
  Recall: 81.1%
  F1-score: 65.8%
  Support: 34400.0%
Class 3:
  Precision: 42.46%
  Recall: 34.08%
  F1-score: 37.81%
  Support: 22300.0%
Class 4:
  Precision: 79.52%
  Recall: 57.39%
  F1-score: 66.67%
  Support: 11500.0%
Class 5:
  Precision: 60.0%
  Recall: 33.33%
  F1-score: 42.86%
  Support: 5400.0%
Class 6:
  Precision: 50.0%
  Recall: 6.82%
  F1-score: 12.0%
  Support: 4400.0%
Accuracy: 60.97%
Class macro avg:
  Precision: 58.22%
  Recall: 46.7%
  F1-score: 48.89%
  Support: 125800.0%
Class weighted avg:
  Precision: 61.18%
  Recall: 60.97%
  F1-score: 59.41%
  Support: 125800.0%

Random Forest Classification Report (Test with CNN Features):
Test Report (Percentages):
Class 0:
  Prec

For Validation Set-

* Highest Scores-
  For class 0
  Precision: 81.32%
  Recall: 78.61%
  F1-score: 79.94%
  
* Overall-
  Precision: 61.18%
  Recall: 60.97%
  F1-score: 59.41%

For Test Set-


*   Precision: 68.83%
  Recall: 33.12%
  F1-score: 44.73%
*   Overall-
  Precision: 48.48%
  Recall: 45.24%
  F1-score: 43.84%


The low accuracy is may be due to padding and clipping  of the audio files that we did in order to uniformise the lengths of all the audio files.