In [36]:
import pandas as pd
import numpy as np

import os
import zipfile
import random
from IPython.display import Audio

import librosa

import time

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_io as tfio
import keras_tuner as kt
from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input

from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from xgboost import XGBClassifier

from plotly import graph_objects as go

In [2]:
script_dir = os.getcwd()

In [3]:
yamnet_model = hub.load('https://www.kaggle.com/models/google/yamnet/TensorFlow2/yamnet/1')













In [4]:
#dataset load
url = 'https://github.com/karoldvl/ESC-50/archive/master.zip'
file = tf.keras.utils.get_file(fname=f'{script_dir}\src\dataset\master.zip', origin=url)

with zipfile.ZipFile(file, 'r') as zipfile:
    zipfile.extractall('src/dataset/')                     

## Prepare the ESC-50 Dataset

In [5]:
def load_audio(file_path):
    # Load audio file at 16 kHz
    audio_data, sample_rate = librosa.load(file_path, sr=16000)
    return audio_data


In [111]:
#Load ESC-50 metadata
metadata = pd.read_csv('src/dataset/ESC-50-master/meta/esc50.csv')
metadata['absolute_path'] ='src/dataset/ESC-50-master/audio/' + metadata['filename']

In [106]:
#exporting map for target mapping prediction
target_map = metadata.groupby('category')['target'].unique().reset_index()
target_map['target'] = target_map['target'].astype(int)
target_map.to_csv('src/target_map.csv', index=False)

In [112]:
metadata = metadata.drop(['fold','esc10','src_file','take','category','filename'], axis=1)
metadata.head()

Unnamed: 0,target,absolute_path
0,0,src/dataset/ESC-50-master/audio/1-100032-A-0.wav
1,14,src/dataset/ESC-50-master/audio/1-100038-A-14.wav
2,36,src/dataset/ESC-50-master/audio/1-100210-A-36.wav
3,36,src/dataset/ESC-50-master/audio/1-100210-B-36.wav
4,19,src/dataset/ESC-50-master/audio/1-101296-A-19.wav


## Defining functions

In [8]:
# Extract Embeddings Using YAMNet
def extract_embedding(audio_data):
    
    # Run YAMNet to get embeddings
    # waveform = audio_data / tf.int16.max
    scores, embeddings, spectrogram = yamnet_model(audio_data)
    # Average embeddings over time frames
    embedding = tf.reduce_mean(embeddings, axis=0)
    return embedding.numpy()
    # return embeddings


In [9]:
def pitch_shift(audio):
    return librosa.effects.pitch_shift(audio, sr=16000,n_steps=1)

def freq_mask(audio):
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=16000)
    num_freq_bands = spectrogram.shape[0]
    mask_band = random.randint(1, num_freq_bands // 2)  # Randomly mask up to half of the frequency bands
    f_start = random.randint(0, num_freq_bands - mask_band)
    spectrogram[f_start:f_start + mask_band, :] = 0  # Masking the selected frequency bands
    
    return librosa.feature.inverse.mel_to_audio(spectrogram)

def time_mask(audio):
    mask_percentage = 0.1
    spectrogram = librosa.feature.melspectrogram(y=audio, sr=16000)
    num_time_steps = spectrogram.shape[1]
    mask_steps = int(mask_percentage * num_time_steps)
    
    t_start = random.randint(0, num_time_steps - mask_steps)
    spectrogram[:, t_start:t_start + mask_steps] = 0  # Masking the selected time steps
    return librosa.feature.inverse.mel_to_audio(spectrogram)

def add_noise(audio, noise_factor=0.005):
    noise = np.random.randn(len(audio))
    augmented_audio = audio + noise_factor * noise
    return augmented_audio

In [10]:
def data_augmentation(audio_data, augmentations, augment_proba=0.5):
  
    augmented_audio = []
    
    for augmentation in augmentations:
        if random.random() < augment_proba:
          augmented_audio.append(augmentation(audio_data))
          
    augmented_audio.append(audio_data)
    
    return augmented_audio
          
          

##  Create Augmented Dataset of Embeddings and Labels

In [116]:
# Create Dataset of Embeddings and Labels
file_paths = metadata['absolute_path'].tolist()
labels = metadata['target'].tolist()

augmentations_list = [pitch_shift,
                        # time_mask,
                        # freq_mask,
                        # add_noise
                    ]
embeddings_list = []
labels_list = []


i=0
for file_path, label in zip(file_paths, labels):
    print(f"{len(metadata)-i} audio files left")
    audio_data = load_audio(file_path)
    augmented_data_list = data_augmentation(audio_data, augmentations_list)
    for data in augmented_data_list:
        embedding = extract_embedding(data)
        embeddings_list.append(embedding)
        labels_list.append(label)
        
    i += 1




2000 audio files left
1999 audio files left
1998 audio files left
1997 audio files left
1996 audio files left
1995 audio files left
1994 audio files left
1993 audio files left
1992 audio files left
1991 audio files left
1990 audio files left
1989 audio files left
1988 audio files left
1987 audio files left
1986 audio files left
1985 audio files left
1984 audio files left
1983 audio files left
1982 audio files left
1981 audio files left
1980 audio files left
1979 audio files left
1978 audio files left
1977 audio files left
1976 audio files left
1975 audio files left
1974 audio files left
1973 audio files left
1972 audio files left
1971 audio files left
1970 audio files left
1969 audio files left
1968 audio files left
1967 audio files left
1966 audio files left
1965 audio files left
1964 audio files left
1963 audio files left
1962 audio files left
1961 audio files left
1960 audio files left
1959 audio files left
1958 audio files left
1957 audio files left
1956 audio files left
1955 audio

In [117]:
# Combine lists to shuffle them in the same order
combined = list(zip(embeddings_list, labels_list))
random.shuffle(combined)

# Unzip back into two lists
X, y = zip(*combined)

X = np.array(X)
y = np.array(y)

## Create Dataset of Embeddings and Labels

## Encode Labels

In [14]:
# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y)

## Split the Dataset

In [118]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training models

## Neural Network

### Training

In [119]:
deep_model = Sequential([
    Input(shape=(1024,)),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(128, activation='relu'),
    Dropout(0.5),
    # Dense(len(label_encoder.classes_), activation='softmax')
    Dense(50, activation='softmax')
])

deep_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['Accuracy'])
history = deep_model.fit(X_train, y_train, epochs=200, batch_size=32, validation_split=0.1)


Epoch 1/200
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - Accuracy: 0.0590 - loss: 4.0179 - val_Accuracy: 0.4463 - val_loss: 2.9512
Epoch 2/200
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - Accuracy: 0.2724 - loss: 3.0448 - val_Accuracy: 0.5826 - val_loss: 1.9689
Epoch 3/200
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - Accuracy: 0.3964 - loss: 2.4190 - val_Accuracy: 0.6322 - val_loss: 1.6693
Epoch 4/200
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - Accuracy: 0.4484 - loss: 2.0833 - val_Accuracy: 0.6612 - val_loss: 1.4032
Epoch 5/200
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - Accuracy: 0.4995 - loss: 1.9041 - val_Accuracy: 0.6488 - val_loss: 1.3239
Epoch 6/200
[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - Accuracy: 0.5451 - loss: 1.7751 - val_Accuracy: 0.6860 - val_loss: 1.2447
Epoch 7/200
[1m69/69[0m [32m━━━

### Evaluation

In [120]:
history = deep_model.history.history

In [121]:
loss, accuracy = deep_model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy:.2f}')


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - Accuracy: 0.7766 - loss: 1.3736 
Accuracy: 0.77


In [122]:
fig = go.Figure(data=[
                      go.Scatter(
                          y=history["loss"],
                          name="Training loss",
                          mode="lines",
                          ),
                      go.Scatter(
                          y=history["val_loss"],
                          name="Validation loss",
                          mode="lines",
                          )
])
fig.update_layout(
    title='Training and val loss across epochs',
    xaxis_title='epochs',
    yaxis_title='Cross Entropy' ,
    height=600,
    width=900
     
)
fig.show()

In [123]:
deep_model.save('model/audio_classifier.keras')

## Loop

In [19]:
models = {
    'Logistic Regression':LogisticRegression(max_iter=2000),
    'SVM':SVC(kernel = 'rbf', probability = False),
    'Random Forrest':RandomForestClassifier(),
    'XGBoost':XGBClassifier()
}

fitted_models = {}
model_scores = []

for key, model in models.items():
    
    start_time=time.time()
    
    print(f'Training {key} model...')
    model.fit(X_train, y_train)
    print("...Done.")
    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Elapsed time: {elapsed_time:1f} seconds. \n")
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    train_accuracy= accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    
    train_precision = precision_score(y_train, y_train_pred, average='weighted')
    test_precision = precision_score(y_test, y_test_pred, average='weighted')
    
    train_recall = recall_score(y_train, y_train_pred, average='weighted')
    test_recall = recall_score(y_test, y_test_pred, average='weighted')
    
    train_f1score = f1_score(y_train, y_train_pred, average='weighted')
    test_f1score = f1_score(y_test, y_test_pred, average='weighted')
    
    
    model_scores.append({
        'Model': key,
        'train_Accuracy': train_accuracy,
        'test_Accuracy': test_accuracy,
        'train_Precision': train_precision,
        'test_Precision': train_precision,
        'train_Recall': train_recall,
        'test_Recall': test_recall,
        'train_F1_score': train_f1score,
        'test_F1_score': test_f1score      
        
    })
    
    fitted_models.update({key : model})
    
metrics_df = pd.DataFrame(model_scores)
display(metrics_df)
    

    
  

Training Logistic Regression model...
...Done.
Elapsed time: 18.513875 seconds. 

Training SVM model...
...Done.
Elapsed time: 1.486653 seconds. 

Training Random Forrest model...
...Done.
Elapsed time: 7.809550 seconds. 

Training XGBoost model...
...Done.
Elapsed time: 57.273813 seconds. 



Unnamed: 0,Model,train_Accuracy,test_Accuracy,train_Precision,test_Precision,train_Recall,test_Recall,train_F1_score,test_F1_score
0,Logistic Regression,0.971163,0.844534,0.971486,0.971486,0.971163,0.844534,0.971099,0.845502
1,SVM,0.746489,0.709127,0.799749,0.799749,0.746489,0.709127,0.756406,0.7163
2,Random Forrest,1.0,0.820461,1.0,1.0,1.0,0.820461,1.0,0.819987
3,XGBoost,1.0,0.789368,1.0,1.0,1.0,0.789368,1.0,0.789826


## Test prediction

In [20]:
audio_test = load_audio('src/dataset/ESC-50-master/audio/1-977-A-39.wav')
audio_test_emb = extract_embedding(audio_test)
Audio(audio_test,rate=16000)

In [21]:
test_sample = np.reshape(audio_test_emb,(1,1024))

In [22]:
fitted_models.get('Logistic Regression').predict(test_sample)

array([26], dtype=int64)