## **Audio data analysis and classification**


Sound is represented in the form of an audio signal having parameters such as *frequency, bandwidth, decibel*, etc.

A typical audio signal can be expressed as a function of amplitude and time.

In [None]:
!pip install librosa
!pip install --upgrade resampy
!pip install --upgrade protobuf

Collecting resampy
  Downloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m33.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: resampy
Successfully installed resampy-0.4.3
Collecting protobuf
  Downloading protobuf-5.27.1-cp38-abi3-manylinux2014_x86_64.whl (309 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.2/309.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3:
      Successfully uninstalled protobuf-3.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf-cu12 24.4.1 requires protobuf<5,>=3.20, but you have protobuf 5.27.1 which is incompatible.
google-ai-generativelanguage 0

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import os
from tqdm import tqdm

In [None]:
import IPython.display as ipd
import librosa
import librosa.display
from glob import glob

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Root folder of the audio dataset on the mounted Google Drive
audio_dataset_path = '/content/drive/MyDrive/Audio_data'

# **Terms to know for audio in digital form**

# Frequency (Hz)
*   Frequency is the number of wave cycles per second; higher frequencies correspond to shorter wavelengths
*   We interpret frequency as high and low pitch

# Intensity (dB/Power)
*   Intensity describes the amplitude (i.e., height) of the wave
*   A change in intensity without a change in frequency is perceived as a change in loudness
*   A change in frequency without a change in intensity is perceived as a change in pitch

# Sample Rate

* The sample rate is the **number of samples taken** from the audio signal per second, measured in Hz or kHz
*   The sample rate determines how the computer reads the audio file (i.e., as a discrete representation of a continuous signal)
*   Any music file recorded with a sample rate and bit depth higher than 44.1kHz/16-bit is considered high definition (HD) audio.
* In a typical digital audio CD recording, the sampling rate is 44,100 or 44.1kHz.

In [None]:
# glob() returns paths in arbitrary filesystem order; sort them so that positional
# lookups such as audio_files[56] pick the same file on every run.
audio_files = sorted(glob('/content/drive/MyDrive/Audio_data/*/*.wav'))

In [None]:
print(audio_files[10])
len(audio_files)

## Audio data processing using the librosa library

In [None]:
file_name = audio_files[56]
print(file_name)

In [None]:
#Play an audio file
ipd.Audio(file_name)

In [None]:
# Extracting audio file information using librosa
# Load an audio file as a floating point time series.

data, sample_rate_default = librosa.load(file_name) # load() returns the waveform (amplitude samples over time, not frequencies) and the sampling rate
# load(path, *, sr, mono, offset, duration, ...): every parameter other than the path is optional.

In [None]:
data.shape

In [None]:
#This is the default sample rate
sample_rate_default

In [None]:
data[0:10]

In [None]:
#Printing the basic information of the audio file
print(f"Type of audio data:{type(data)}")
#print(f"Shape of audio data:{data.shape}")
print(f"Type of sample rate:{type(sample_rate_default)}")
print(f"Sample rate(by default): {sample_rate_default}") #Print the sample rate
print(f"Audio data shape: {data.shape}") #print the size of audio data
print(f"First 10 values of the Audio data : {data[:10]}") #print first 10 values of audio data

In [None]:
#Resampling the sample rate with 44.1kHz of a audio file
audio_data, sample_rate = librosa.load(file_name, sr=44100)

In [None]:
sample_rate

In [None]:
#Printing the basic information of the audio file with different sample rate
print(f"Type of audio data:{type(audio_data)}")
print(f"Type of sample rate:{type(sample_rate)}")
print(f"Sample rate: {sample_rate}") #Print the sample rate
print(f"Audio data shape: {audio_data.shape}") #Change in the sample rate changes the size of audio data
print(f"First 10 values of the Audio data : {audio_data[:10]}") #also change the values of audio data

# Handling audio data with different Channels


*   Mono Channel
*   Stereo Channel



In [None]:
audio_data_stereo_default, sample_rate = librosa.load(file_name, mono=False)

In [None]:
print(f"Audio data shape: {audio_data_stereo_default.shape}") # with mono=False the channels are kept separate: shape is (n_channels, n_samples) for a multi-channel file
# the first channel is audio_data_stereo_default[0] and the second channel is audio_data_stereo_default[1]

In [None]:
# Printing the basic information of the audio file loaded with mono=False (default sample rate)
print(f"Type of audio data:{type(audio_data_stereo_default)}")
print(f"Type of sample rate:{type(sample_rate)}")
print(f"Sample rate(by default): {sample_rate}")
print(f"Audio data shape: {audio_data_stereo_default.shape}")
# Slice the last (time) axis: a plain [:10] on a (n_channels, n_samples) array would
# select channels, not the first 10 samples. `...` also keeps this valid for a 1-D mono array.
print(f"First 10 values of the Audio data : {audio_data_stereo_default[..., :10]}")

In [None]:
print(f"First channel of the Audio data : {audio_data_stereo_default[0]}")
print(f"Second channel of the Audio data : {audio_data_stereo_default[1]}")

In [None]:
audio_data_stereo, sample_rate = librosa.load(file_name, sr=44100, mono=False)

In [None]:
# Printing the basic information of the audio file loaded with mono=False and sr=44100
print(f"Type of audio data:{type(audio_data_stereo)}")
print(f"Type of sample rate:{type(sample_rate)}")
print(f"Sample rate: {sample_rate}")
print(f"Audio data shape: {audio_data_stereo.shape}")  # resampling changes the number of samples per channel
# Slice the last (time) axis: a plain [:10] on a (n_channels, n_samples) array would
# select channels, not the first 10 samples. `...` also keeps this valid for a 1-D mono array.
print(f"First 10 values of the Audio data : {audio_data_stereo[..., :10]}")

In [None]:
print(f"First channel of the Audio data : {audio_data_stereo[0]}")
print(f"Second channel of the Audio data : {audio_data_stereo[1]}")

*Converting audio from stereo to mono channel*

In [None]:
#to_mono() convert an audio signal to mono by averaging samples across channels.
audio_data_mono = librosa.to_mono(audio_data_stereo)
print(audio_data_mono.shape)

# **Plotting audio data (Waveform Visualization)**

In [None]:
# One way to Plot audio data
librosa.display.waveshow(data, sr=sample_rate_default)

In [None]:
# audio data with different sample rate
librosa.display.waveshow(audio_data, sr=sample_rate)

In [None]:
#Alternate way to plot audio data
pd.Series(data).plot(figsize=(8, 3),
                     lw=1,
                     title="Raw audio data example")

In [None]:
#Trim the audio data
data_trimmed, _ = librosa.effects.trim(data)
print(data_trimmed)

In [None]:
pd.Series(data_trimmed).plot(figsize=(8, 3),
                     lw=1,
                     title="Raw audio data (trimmed) example")

In [None]:
#Apply slicing
pd.Series(data[7000:9000]).plot(figsize=(8, 3),
                     lw=1,
                     title="Raw audio data example")

**Spectrogram Visualization**


*   A spectrogram is a visual representation of a signal's **strength** (i.e., **loudness**) over time at the various **frequencies** present in the waveform.
* Not only can one see whether there is more or less energy at, for example, 2 Hz vs 10 Hz, but one can also see how energy levels vary over time.
*   Spectrograms are sometimes called **sonographs, voiceprints, or voicegrams**

In [None]:
#Applying fourier transformation (Short-time fourier transform)
#stft() converts data into short term Fourier transform.
#STFT converts signals such that we can know the amplitude of the given frequency at a given time.
data_transformed = librosa.stft(data)
data_db = librosa.amplitude_to_db(np.abs(data_transformed), ref=np.max)
data_db.shape

In [None]:
# Plotting the transformed audio (spectrogram)
fig, ax = plt.subplots(figsize=(10, 4))
# Pass the rate `data` was actually loaded with, so the time/frequency axes do not
# depend on specshow's implicit default.
img = librosa.display.specshow(data_db,
                               sr=sample_rate_default,
                               x_axis='time',
                               y_axis='log',
                               ax=ax)
ax.set_title('Spectogram Example')
fig.colorbar(img, ax=ax, format='%0.2f')

**Mel Spectrogram**

The mel-spectrogram, based on the auditory-based mel-frequency scale, provides better resolution for lower frequencies than the spectrogram.

In [None]:
# `data` was loaded at sample_rate_default; `sample_rate` currently holds 44100 from the
# resampled loads above, so using it here would build the mel filter bank for the wrong rate.
data_mel = librosa.feature.melspectrogram(y=data,
                                          sr=sample_rate_default,
                                          n_mels=20)
# melspectrogram returns a power spectrogram by default (power=2.0), so the dB
# conversion must be power_to_db, not amplitude_to_db.
data_mel_db = librosa.power_to_db(data_mel, ref=np.max)

In [None]:
# Plotting the mel spectrogram
fig, ax = plt.subplots(figsize=(10, 4))
# Rows of data_mel_db are mel bands, so the y axis must use the mel scale (not 'log');
# sr is the rate `data` was loaded with.
img = librosa.display.specshow(data_mel_db,
                               sr=sample_rate_default,
                               x_axis='time',
                               y_axis='mel',
                               ax=ax)
ax.set_title('Mel Spectogram Example')
fig.colorbar(img, ax=ax, format='%0.2f')

In [None]:
'''
dft_input = data[:4096]

# calculate the DFT
window = np.hanning(len(dft_input))
windowed_input = dft_input * window
dft = np.fft.rfft(windowed_input)
# get the frequency bins
frequency = librosa.fft_frequencies(sr=sample_rate, n_fft=len(dft_input))
# get the amplitude spectrum in decibels
amplitude = np.abs(dft)
amplitude_db = librosa.amplitude_to_db(amplitude, ref=np.max)

plt.figure().set_figwidth(12)
plt.plot(frequency, amplitude_db)
plt.xlabel("Frequency (Hz)")
plt.ylabel("Amplitude (dB)")
plt.xscale("log")
'''

In [None]:
# Pick another clip and play it. No matplotlib figure is created here: the audio
# widget does not draw into one, so plt.figure() only produced an empty figure.
file_name = audio_files[39]
ipd.Audio(file_name)

In [None]:
data, sample_rate = librosa.load(file_name)
librosa.display.waveshow(data, sr=sample_rate)


# Reading audio data using scipy.io

In [None]:
from scipy.io import wavfile as wav
wave_sample_rate, wave_audio = wav.read(file_name)

In [None]:
wave_sample_rate # sample rate returned by wav.read for this file (the original note reports 48.0 kHz here — verify against the actual output)

In [None]:
wave_audio

In [None]:
# Printing the basic information of the audio file as read by scipy.io.wavfile
print(f"Type of audio data:{type(wave_audio)}")
print(f"Type of sample rate:{type(wave_sample_rate)}")
print(f"Sample rate(by default): {wave_sample_rate}") # sample rate returned by wav.read
print(f"Audio data shape: {wave_audio.shape}") # shape of the array returned by wav.read
print(f"First 10 values of the Audio data : {wave_audio[:10]}") # first 10 entries along the first axis

# **Feature extraction from Audio data**
Every audio signal consists of many features.

However, we must extract the characteristics that are relevant to the problem we are trying to solve.

Audio data mainly consists of the following types of features:


*   Spectral features (or frequency-based features)
*   Time-domain features: Zero crossing rate, amplitude envelope, and RMS energy
*   Time-frequency domain features

*Spectral features (frequency-based features)*, are obtained by converting the time-based signal into the frequency domain using the Fourier Transform, like fundamental frequency, frequency components, spectral centroid, spectral flux, spectral density, spectral roll-off, etc.

*Time domain:* These are extracted from waveforms of the raw audio. Zero crossing rate, amplitude envelope, and RMS energy are examples.

*Time-frequency representation:* These features combine both the time and frequency components of the audio signal.

The time-frequency representation is obtained by applying the Short-Time Fourier Transform (STFT) on the time domain waveform. Spectrogram, mel-spectrogram, and constant-Q transform are examples.

**Mel-Frequency Cepstral Coefficients(MFCCs)**

* The Mel frequency cepstral coefficients (MFCCs) of a signal are a small set of features (usually about 10–20) that concisely describe the overall shape of a spectral envelope.
* It models the characteristics of the human voice.

In [None]:
audio, sample_rate = librosa.load(file_name)
mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate)#Number of mfcc to be returned, default value is 20

In [None]:
mfccs_features.shape

In [None]:
mfccs_scaled_features=np.mean(mfccs_features.T,axis=0)
mfccs_scaled_features.shape

In [None]:
mfccs_scaled_features

**Environmental sound classification** (on UrbanSound8K dataset)

In [None]:
#Loading the dataset
sound_data = pd.read_csv('/content/drive/MyDrive/Audio_data/UrbanSound8K.csv')


# Know your dataset
*Data preprocessing and visualization*

In [None]:
sound_data.head(10)

In [None]:
sound_data.sample(10)

In [None]:
sound_data.columns

In [None]:
sound_data.shape

In [None]:
#checking for the null values
sound_data.isnull().sum()

In [None]:
#checking for the na values
sound_data.isna().sum()

In [None]:
sound_data.info()

In [None]:
sound_data.describe()

In [None]:
#Checking whether dataset is balenced or not
sound_data['class'].value_counts()

 **Exploratory data analysis**

In [None]:
# Bar chart of how many clips each class contributes
class_distribution = sound_data['class'].value_counts()
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=class_distribution.index, y=class_distribution.values, ax=ax)
ax.set_title("Class Distribution")
ax.set_xlabel("Class")
ax.set_ylabel("Count")
ax.tick_params(axis='x', labelrotation=45)
plt.show()

In [None]:
# Clip length is the difference between the annotated end and start columns
sound_data['duration'] = sound_data['end'].sub(sound_data['start'])
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(sound_data['duration'], bins=50, kde=True, ax=ax)
ax.set_title("Distribution of Audio Duration")
ax.set_xlabel("Duration (seconds)")
ax.set_ylabel("Count")
plt.show()

In [None]:
# Define a function to plot audio waveforms
def plot_waveform(file_path):
    """Show the waveform, a linear-frequency spectrogram and an audio player for one file.

    Parameters
    ----------
    file_path : path of the audio file to load with librosa.

    Returns
    -------
    None. The two figures and the player are rendered as notebook output.
    """
    # Decode once and reuse the samples for both plots
    audio, sr = librosa.load(file_path)

    # Plot the audio in wave form
    plt.figure(figsize=(10, 2))
    librosa.display.waveshow(audio, sr=sr)
    plt.title("Audio Waveform")
    plt.show()

    # Plot the audio as a spectrogram (STFT magnitude in dB relative to the peak)
    D = librosa.amplitude_to_db(np.abs(librosa.stft(audio)), ref=np.max)
    plt.figure(figsize=(10, 6))
    librosa.display.specshow(D, sr=sr, x_axis='time', y_axis='linear')
    plt.colorbar(format='%+2.0f dB')
    plt.title('Linear-frequency spectrogram')
    plt.show()

    # A bare ipd.Audio(...) expression inside a function is discarded, so the
    # player must be displayed explicitly.
    ipd.display(ipd.Audio(file_path))


# Choose a few audio files to visualize (you can change the file paths)
# NOTE(review): this hard-coded list is overwritten by audio_files[:10] below before it
# is ever used; the loop that consumed it is commented out.
sample_audio_paths = ['/content/drive/MyDrive/Audio_data/fold5/100263-2-0-3.wav',
                      '/content/drive/MyDrive/Audio_data/fold2/102871-8-0-10.wav',
                      '/content/drive/MyDrive/Audio_data/fold3/103199-4-1-0.wav']

#for audio_path in sample_audio_paths:
#    plot_waveform(audio_path)

# Visualize the first 10 files found by glob
sample_audio_paths = audio_files[:10]
for audio_path in sample_audio_paths:
    plot_waveform(audio_path)

# Extracting MFCC features for each audio file
Here we will be using Mel-Frequency Cepstral Coefficients(MFCC) from the audio samples.

The MFCC summarises the frequency distribution across the window size, so it is possible to analyse both the frequency and time characteristics of the sound.

These audio representations will allow us to identify features for classification.

In [None]:
#Defining a function to extract MFCC audio features of a audio file

def feature_extractor(file_name, n_mfcc=40):
    """Return a fixed-length MFCC descriptor for one audio file.

    The file is decoded with librosa using ``res_type='kaiser_fast'``, ``n_mfcc``
    coefficients are computed per frame, and each coefficient is averaged over
    all frames.

    Parameters
    ----------
    file_name : path of the audio file to load.
    n_mfcc : number of MFCCs to compute per frame (defaults to 40).

    Returns
    -------
    numpy.ndarray with one time-averaged value per coefficient.
    """
    audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    # mfcc() is (n_mfcc, n_frames); the mean over axis 0 of the transpose collapses the time axis
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

    return mfccs_scaled_features


In [None]:
# Features extraction from all audio files (MFCC)
extracted_features = []

# total= gives tqdm a real progress bar (iterrows() has no length), which replaces the
# per-file print that flooded the cell output.
for index_num, row in tqdm(sound_data.iterrows(), total=len(sound_data)):
    # Loop-local names are used so the notebook-level `file_name` and `data` are not clobbered
    clip_path = os.path.join(os.path.abspath(audio_dataset_path),
                             'fold' + str(row["fold"]),
                             str(row["slice_file_name"]))
    clip_features = feature_extractor(clip_path)
    extracted_features.append([clip_features, row["class"]])


In [None]:
import pickle

features_cache_path = '/content/drive/MyDrive/Audio_data/extracted.pkl'

# Write the cache on the first run so a fresh Restart & Run All does not fail on a missing file
if not os.path.exists(features_cache_path):
    with open(features_cache_path, 'wb') as cache_file:
        pickle.dump(extracted_features, cache_file)

# NOTE: pickle.load executes arbitrary code from the file; only load a cache you created yourself.
# `loaded_model` holds the cached [feature, class] pairs (not a model); the name is kept
# because the next cell reads it.
with open(features_cache_path, 'rb') as cache_file:
    loaded_model = pickle.load(cache_file)

In [None]:
# Converting extracted_features to Pandas dataframe
extracted_features_df = pd.DataFrame(loaded_model, columns=['feature', 'class'])
# Data Frame Saving
# NOTE(review): the 'feature' column holds arrays, which CSV stores as text — confirm
# this file is only meant for inspection.
extracted_features_df.to_csv("UrbanSound8K_DF.csv")
# Kept as the last expression so the preview is actually rendered by the notebook
extracted_features_df.head()

In [None]:
# Separate the inputs from the targets: X stacks the 'feature' entries into one array,
# y holds the matching 'class' labels (the train/test split happens in a later cell).
X=np.array(extracted_features_df['feature'].tolist())
y=np.array(extracted_features_df['class'].tolist())

In [None]:
X

In [None]:
y

In [None]:
# Import Libraries for Label Encoding
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
# Encode from the DataFrame column rather than from `y`: this cell overwrites `y` with
# its one-hot form, so reading `y` here would fail if the cell were run a second time.
y_label = labelencoder.fit_transform(extracted_features_df['class'])
y = to_categorical(y_label)

In [None]:
X.shape

In [None]:
print(y.shape)
print(y_label)

In [None]:
# Train-test splitting
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# random_state makes the split (and every metric derived from it) reproducible;
# stratifying on the integer labels keeps the class proportions equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y_label
)

In [None]:
# Fit the scaler on the training split only, then apply the same statistics everywhere
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
# transform, not fit_transform: refitting on all of X would overwrite the
# training-only statistics held by `scaler` and leak test data into them.
X_scale = scaler.transform(X)

In [None]:
X_test_std

In [None]:
X_train_std.shape

In [None]:
X_test_std.shape

In [None]:
y_train.shape

In [None]:
y_test.shape

# Model training

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation
from sklearn import metrics
from tensorflow.keras.callbacks import ModelCheckpoint
from datetime import datetime
from sklearn.metrics import ConfusionMatrixDisplay, classification_report, accuracy_score, precision_score, f1_score, recall_score, roc_curve, auc

In [None]:
y.shape

In [None]:
num_labels = y.shape[1]
print(num_labels)

In [None]:
# Model creation: three Dense -> ReLU -> Dropout(0.5) stages followed by a softmax classifier
model = Sequential([
    # First layer
    Dense(1600, input_shape=(X.shape[1], )),
    Activation('relu'),
    Dropout(0.5),
    # Second layer
    Dense(800),
    Activation('relu'),
    Dropout(0.5),
    # Third layer
    Dense(400),
    Activation('relu'),
    Dropout(0.5),
    # Output layer: one unit per class
    Dense(num_labels),
    Activation('softmax'),
])

In [None]:
model.summary()

In [None]:
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer='adam')

In [None]:
num_epochs = 100
num_batch_size = 128

# save_best_only=True keeps only the best checkpoint seen so far
checkpointer = ModelCheckpoint(filepath='saved_models/audio_classification.h5', verbose=1, save_best_only=True)
start = datetime.now()

# NOTE(review): fit() returns a History object, so `y_pred` holds the training history, not predictions.
# NOTE(review): training uses the unscaled X_train/X_test; the standardized X_train_std/X_test_std
# computed earlier are not used here — confirm this is intended.
# NOTE(review): the test split doubles as validation data, so checkpoint selection sees the test set.
y_pred=model.fit(X_train, y_train, batch_size=num_batch_size, epochs=num_epochs, validation_data=(X_test, y_test), callbacks=[checkpointer], verbose=1)

# Wall-clock training time
duration = datetime.now() - start
print("Training completed in time: ", duration)

In [None]:
test_accuracy=model.evaluate(X_test,y_test,verbose=0)
print(test_accuracy[1])

In [None]:
# Use a context manager so the file is flushed and closed once the dump finishes.
# NOTE(review): pickling a Keras model depends on the installed TensorFlow/Keras version;
# model.save(...) is the documented way to persist it.
with open('/content/model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Test on New audio data

In [None]:
#Extracting feature of new audio file
#filename="/content/drive/MyDrive/Audio_test/104327-2-0-4.wav"
file_name = audio_files[354]
# Reuse the extractor that produced the training features so inference-time
# preprocessing cannot drift from training-time preprocessing.
mfccs_scaled_features = feature_extractor(file_name)

In [None]:
mfccs_scaled_features.shape

In [None]:
# Predict the class of the new clip
# Add a leading batch axis before handing the feature vector to the model
mfccs_scaled_features = mfccs_scaled_features.reshape(1, -1)
print(mfccs_scaled_features.shape)
class_scores = model.predict(mfccs_scaled_features)
# Highest-scoring output unit -> integer label -> original class name
predicted_label = np.argmax(class_scores, axis=-1)
prediction_class = labelencoder.inverse_transform(predicted_label)
print('Predicted Label:',predicted_label)
print('Predicted Class:',prediction_class[0])
ipd.Audio(file_name)

