# Audio MNIST

In [1]:
# Import python packages
from setup import Setup
from utilities import Utilities
import glob
import os
import pandas as pd
import warnings
from data_processing import DataProcessing
from data_visualization import DataVisualization
from feature_engineering import FeatureEngineering
from data_split import DataSplit
from xgboost_model import XGBoostModel

# Ignore warnings
# NOTE: this silences every warning category for the rest of the session,
# including deprecation warnings raised by third-party libraries.
warnings.filterwarnings("ignore")

# Load the project configuration
SU = Setup(cfg_filepath='config.yaml')

# Get the paths exposed by the configuration object
source_path = SU.source_path
destination_path = SU.destination_path
plot_path = SU.plot_path
result_path = SU.result_path

# Read the meta data file and the model parameter file
meta_data_path = SU.source_meta_path
meta_data = SU.read_file(meta_data_path)
model_param_path = SU.source_model_path
model_param = SU.read_file(model_param_path)

# Define the target sample rate
target_sr = 8000

# Initialize the helper classes (DataSplit is constructed exactly once,
# with explicit test and validation fractions)
UT = Utilities(destination_path)
DP = DataProcessing(target_sr)
DV = DataVisualization(plot_path)
FE = FeatureEngineering()
DS = DataSplit(test_size=0.1, val_size=0.1)

# Create empty dataframe with column names
columns = ['mean', 'std', 'avg', 'median', 'min', 'max', 'skewness', 'kurtosis', 'gender', 'digit']
df = UT.create_dataframe(columns)

# Specify total number of folders in source path
# NOTE(review): hard-coded to 2, so only folder "01" is visited; the
# commented-out expression looks like the full-dataset value - confirm
# before a full run.
all_folders = 2 #len(next(os.walk(source_path))[1])+1

# Feature rows are collected here and added to the dataframe in one step
# after the loop (DataFrame.append was removed in pandas 2.0, and growing
# a dataframe row by row copies it on every iteration).
feature_rows = []

# Loop over audio recordings in the source path
for i in range(1, all_folders):
    # Show progress
    UT.loop_progress(i, all_folders-1)

    # Assign source temp: folders are named with two-digit numbers
    src_temp = os.path.join(source_path, f"{i:02d}")
    filepath_filename = sorted(glob.glob(os.path.join(src_temp, "*.wav")))

    # Loop over files in directory
    for file in filepath_filename:
        # Split the file name (without directory and extension) into its
        # three underscore-separated parts. splitext removes the extension
        # as a suffix; str.rstrip(".wav") would strip any trailing run of
        # the characters '.', 'w', 'a', 'v' instead.
        stem = os.path.splitext(os.path.basename(file))[0]
        dig, vp, rep = stem.split("_")

        # Read audio data
        fs, audio_data = UT.read_audio(file)

        # Plot audio signal
        audio_name = f"audio_{dig[-1]}_{vp}_{rep}.png"
        DV.plot_audio(fs, audio_data, audio_name)

        # Plot STFT of audio signal
        stft_name = f"stft_{dig[-1]}_{vp}_{rep}.png"
        DV.plot_stft(fs, audio_data, stft_name)

        # Play audio signal
        #DV.play_audio(file)

        # Resample audio data
        audio_data = DP.resample_data(fs, audio_data)

        # Zero padding audio data
        audio_data = DP.zero_pad(audio_data)

        # FFT audio data
        fft_data = DP.fft_data(audio_data)

        # Feature creation
        features = DP.feature_creation(fft_data)

        # Normalize features
        n_features = DP.normalize_features(features)

        # Add gender and digit label. The second call receives the result
        # of the first, so both labels are kept whether or not the helpers
        # modify their argument in place.
        features = DP.add_gender(n_features, meta_data[vp]["gender"])
        features = DP.add_digit(features, dig[-1])

        # Collect the feature row for this recording
        feature_rows.append(features)

        # NOTE(review): stops after the first file of each folder; this
        # looks like a debugging shortcut - confirm before a full run.
        break

# Append all collected rows to the DataFrame in a single concatenation
if feature_rows:
    df = pd.concat([df, pd.DataFrame(feature_rows)], ignore_index=True)

# File name under which the extracted features are stored as CSV
csv_name = "audio_data.csv"

# Report the dimensions of the collected dataset
df_size = UT.df_shape(df)
print(f"Size of data set, columns: {df_size[1]} and rows: {df_size[0]}")

# Writing the CSV is currently disabled
#UT.save_df_to_csv(df, csv_name)

Progress: 100.00%
Size of data set, columns: 10 and rows: 1


<Figure size 432x288 with 0 Axes>

In [2]:
# Settings used by the feature-engineering steps below
columns_to_leave_out = ["gender", "digit"]  # target columns, left out of the column filters
threshold = 0.95                            # correlation threshold
plot_name = "column_distribution.png"

# Read the CSV named by csv_name back into a dataframe
df = UT.csv_to_df(csv_name)

# Drop constant columns
df = FE.remove_constant_columns(df, columns_to_leave_out)

# Pearson correlation matrix
corr_matrix = FE.pearson_correlation(df, columns_to_leave_out)

# Drop correlated columns using the threshold
df = FE.remove_correlated_columns(df, threshold, columns_to_leave_out)

# Add the label column ('female' -> 0, 'male' -> 1)
df = FE.create_label_column(df)

# The digit column is not needed any more
df = UT.remove_column(df, "digit")

# Store the final dataframe as CSV
csv_name = "final_data.csv"
UT.save_df_to_csv(df, csv_name)

# Plot the distribution of the columns
DV.column_distribution(df, plot_name)

# Load CSV file into dataframe
df = UT.csv_to_df(csv_name)

# Split data into training (80%), validation (10%), and test set (10%)
train_df, val_df, test_df = DS.split(df, "label")

# Show size of datasets
train_size = UT.df_shape(train_df)
val_size = UT.df_shape(val_df)
test_size = UT.df_shape(test_df)
print(f"Size of training set, columns: {train_size[1]} and rows: {train_size[0]}")
print(f"Size of validation set, columns: {val_size[1]} and rows: {val_size[0]}")
# The third line reports the test split, so it is labelled as such
print(f"Size of test set, columns: {test_size[1]} and rows: {test_size[0]}")

Size of training set, columns: 6 and rows: 24000
Size of validation set, columns: 6 and rows: 3000
Size of validation set, columns: 6 and rows: 3000


<Figure size 720x720 with 0 Axes>

In [4]:
# Build the XGBoost wrapper from the three data splits
XM = XGBoostModel(train_df, val_df, test_df)

# Unpack the six arrays returned for the training, validation and test splits
(X_train, y_train,
 X_val, y_val,
 X_test, y_test) = XM.prepare_data()

# Configure the model with the loaded parameters, then fit it;
# the validation pair is passed to fit together with the training pair
XM.set_params(model_param)
XM.fit(X_train, y_train, X_val, y_val)

# Predict on the test split and score the predictions
y_pred = XM.predict(X_test)
accuracy = XM.evaluate_predictions(y_test, y_pred)

# Print the accuracy as a percentage with two decimals
print("Accuracy: %.2f%%" % (accuracy * 100))


[0]	validation_0-logloss:0.64223	validation_0-accuracy:0.82696	validation_1-logloss:0.64741	validation_1-accuracy:0.78667
[1]	validation_0-logloss:0.60155	validation_0-accuracy:0.83079	validation_1-logloss:0.61094	validation_1-accuracy:0.80067
[2]	validation_0-logloss:0.56867	validation_0-accuracy:0.83021	validation_1-logloss:0.58170	validation_1-accuracy:0.80467
[3]	validation_0-logloss:0.54077	validation_0-accuracy:0.83137	validation_1-logloss:0.55923	validation_1-accuracy:0.80400
[4]	validation_0-logloss:0.51764	validation_0-accuracy:0.83283	validation_1-logloss:0.54048	validation_1-accuracy:0.80267
[5]	validation_0-logloss:0.49812	validation_0-accuracy:0.83371	validation_1-logloss:0.52501	validation_1-accuracy:0.80433
[6]	validation_0-logloss:0.48168	validation_0-accuracy:0.83521	validation_1-logloss:0.51315	validation_1-accuracy:0.80400
[7]	validation_0-logloss:0.46799	validation_0-accuracy:0.83617	validation_1-logloss:0.50258	validation_1-accuracy:0.80433
[8]	validation_0-logloss