In [None]:
import pandas as pd
import numpy as np
import os
import re
import math

from concurrent.futures import ThreadPoolExecutor

from utilities.visualizers import view_label_freq
from utilities.loaders import load_labels, save_model, concur_load_data

%load_ext autoreload
%autoreload 2

# Load audio signals and respective labels for each subject

In [None]:
# Target sampling rate (samples per second) handed to the audio loader and
# the feature extractor further down. The source recordings are described as
# 16000 samples per second.
# NOTE(review): an earlier remark here mentioned 256 Hz as typical of a human
# voice; that reads like a pitch value rather than a sampling rate — confirm
# what was meant.
hertz = 8_000

# Length of one analysis window in seconds (a quarter of a second).
# The number of samples aggregated per window is window_time * hertz,
# e.g. 0.25 s at 8000 samples per second is 2000 samples.
window_time = 1 / 4

# Step between consecutive windows in seconds (one eighth of a second).
# NOTE(review): assumes extract_features() treats this as the hop size, in
# which case neighbouring windows overlap by window_time - hop_time — confirm.
hop_time = 1 / 8

# The shorter the window and hop times, the more data points end up in the
# final dataset, which can be computationally intensive.

In [None]:
# Root directory that is scanned for the per-subject folders.
DIR = "./data/"

# Keep only the entries to load: skip the .tgz archives and anything whose
# name contains "_EXTRACTED_FEATURES" (presumably the output directory for
# extracted features).
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# help keep the seeded train/test split reproducible across runs and machines.
folders = sorted(
    entry
    for entry in os.listdir(DIR)
    if not entry.endswith(".tgz") and "_EXTRACTED_FEATURES" not in entry
)

In [None]:
folders

In [None]:
# Load the label records for every entry in `folders` under DIR using the
# project helper. The next cells wrap the result in a DataFrame with the
# columns "subject_name", "string" and "label".
labels = load_labels(DIR, folders)
labels

In [None]:
len(labels)

In [None]:
# Tabulate the loaded label records under explicit column names.
# Presumably each record is a (subject_name, string, label) triple — verify
# against load_labels().
labels_df = pd.DataFrame(labels, columns=["subject_name", "string", "label"])
labels_df

In [None]:
labels_df["label"].value_counts()

In [None]:
labels_df["string"].value_counts()

In [None]:
labels_df[labels_df["label"].isna()]

# Once all .tar file contents are extracted, we trim any insignificant parts of each audio signal so that it has the same length as its labels

In [None]:
import librosa
import librosa.display
import matplotlib.pyplot as plt

from scipy.stats import kurtosis as kurt, skew, mode, entropy
from sklearn.model_selection import train_test_split

from utilities.loaders import load_audio
from utilities.preprocessors import encode_features

In [None]:
# Load the audio for every entry in `folders` with the project helper,
# passing the sampling rate configured above. Later cells treat each element
# as a (subject_name, raw_signals) pair.
signals = load_audio(DIR, folders, hertz=hertz)

### Shape of the newly combined dataset: each subject now has a single, longer vector

In [None]:
signals[0]

In [None]:
signals[0][1].shape

In [None]:
len(signals)

In [None]:
signals

In [None]:
# Two-column table: the subject name and the signal loaded for that subject.
signals_df = pd.DataFrame(signals, columns=["subject_name", "raw_signals"])
signals_df

In [None]:
# Attach the label columns to each signal row by subject name. A left join
# keeps every row of signals_df; a subject with no match in labels_df ends up
# with missing values in the label columns.
dataset_df = pd.merge(
    signals_df,
    labels_df,
    how="left",
    on=["subject_name"],
)
dataset_df

In [None]:
# Distinct label values, in the order value_counts() reports them.
# NOTE(review): this rebinds `labels`, which until now held the result of
# load_labels(). Re-running the cell that builds labels_df after this one
# would feed it this Index instead of the original records.
labels = dataset_df["label"].value_counts().index

In [None]:
# Number of rows per label, in the same order as the value_counts() index.
# NOTE(review): `counts` is not referenced by any other cell visible here.
counts = dataset_df["label"].value_counts().values

In [None]:
# Plot how often each label occurs. The counts are computed once and both the
# values and their index are passed from that single result, so this cell does
# not depend on the global `labels` name, which is bound to different objects
# at different points of the notebook.
label_counts = dataset_df["label"].value_counts()
view_label_freq(label_counts, img_title="male to female ratio", save_img=True, labels=label_counts.index)

In [None]:
# Replace the "label" column with the values returned by encode_features()
# and keep the encoder object it returns alongside them.
# NOTE(review): the column is overwritten in place, so re-running this cell
# passes the already-encoded values back into encode_features(). Re-run the
# merge cell first if this cell needs to be executed again.
dataset_df["label"], dataset_df_le = encode_features(dataset_df["label"])
dataset_df

In [None]:
dataset_df["label"].value_counts()

### As we can see, 1 is male, 0 is female, and 2 is unknown

In [None]:
dataset_df_le.inverse_transform([0, 0, 0, 1, 2])

### We save this encoder for later when we run the training script

In [None]:
# Persist the label encoder so it can be reused outside this notebook.
# NOTE(review): whether save_model() creates ./saved/misc/ when it is missing
# is not visible here — confirm.
save_model(dataset_df_le, './saved/misc/audio_dataset_le.pkl')

In [None]:
# Hold out 20% of the rows of dataset_df for testing; random_state pins the
# shuffle so the same rows are selected on every run with the same input order.
# NOTE(review): no `stratify` argument is passed, so the label proportions may
# differ between the two splits.
train_dataset_df, test_dataset_df = train_test_split(dataset_df, test_size=0.2, random_state=0)

In [None]:
train_dataset_df

In [None]:
# Preview the raw waveform stored in the row of dataset_df labelled 0.
fig = plt.figure(figsize=(17, 5))
# waveshow() assumes sr=22050 unless told otherwise. Pass the rate that was
# requested from load_audio() so the time axis matches the signal's duration.
librosa.display.waveshow(dataset_df.loc[0, "raw_signals"], sr=hertz, alpha=0.5, color="#8442f5")
plt.show()

In [None]:
# Turn every training row into a plain tuple (no index, no namedtuple); the
# fields follow the column order of train_dataset_df.
train_dataset = [
    record
    for record in train_dataset_df.itertuples(index=False, name=None)
]
train_dataset

In [None]:
# Turn every test row into a plain tuple (no index, no namedtuple); the
# fields follow the column order of test_dataset_df.
test_dataset = [
    record
    for record in test_dataset_df.itertuples(index=False, name=None)
]
test_dataset

### All we have to do now is extract the features of each combined vector for each subject
### 521216 is the length of the 16000hz test audio signal 

In [None]:
from utilities.feature_extractors import extract_features

In [None]:
# Run the project feature extractor over the training tuples using the
# sampling rate and window/hop settings from the configuration cell.
# Later cells index the result as [0][i] (a table with columns such as "rms"
# and "zcr") and [2][i] (the matching time axis); the commented-out cells
# below also treat [1] as the labels.
train_dataset_final = extract_features(train_dataset, hertz=hertz, window_time=window_time, hop_time=hop_time)
train_dataset_final

In [None]:
# train_dataset_final[0][0]

In [None]:
# train_dataset_final[0][0].shape

In [None]:
# train_dataset_final[0][1]

### This merges the list of features returned by `extract_features()`

In [None]:
# train_features_merged = pd.concat(train_dataset_final[0], axis=0, ignore_index=True)
# train_features_merged

### This merges the list of labels returned by `extract_features()`

In [None]:
# train_labels_merged = pd.concat(train_dataset_final[1], axis=0, ignore_index=True)
# train_labels_merged

In [None]:
# train_labels_merged.value_counts()

# Saving extracted features to .csv

In [None]:
# os.makedirs('./data/_EXTRACTED_FEATURES/', exist_ok=True)
# train_features_merged.to_csv('./data/_EXTRACTED_FEATURES/train_features_merged.csv')
# train_labels_merged.to_csv('./data/_EXTRACTED_FEATURES/train_labels_merged.csv')

### Do feature extraction on test set also and save

In [None]:
# Run the same feature extraction on the test tuples, with the same sampling
# rate and window/hop settings that were used for the training set.
test_dataset_final = extract_features(test_dataset, hertz=hertz, window_time=window_time, hop_time=hop_time)
test_dataset_final

In [None]:
# test_features_merged = pd.concat(test_dataset_final[0], axis=0, ignore_index=True)
# test_features_merged

In [None]:
# test_labels_merged = pd.concat(test_dataset_final[1], axis=0, ignore_index=True)
# test_labels_merged

In [None]:
# test_features_merged.to_csv('./data/_EXTRACTED_FEATURES/test_features_merged.csv')
# test_labels_merged.to_csv('./data/_EXTRACTED_FEATURES/test_labels_merged.csv')

### We visualize the calculated root mean squared energy of the audio signal

In [None]:
# Overlay the RMS energy feature on the waveform it was computed from.
fig, ax = plt.subplots(figsize=(17, 5))

# The features below belong to the first training entry, so the waveform is
# taken from the first row of the training split rather than from row 0 of
# the full dataset (the split shuffles the rows).
# NOTE(review): assumes extract_features() returns results in input order —
# confirm.
first_train_signal = train_dataset_df["raw_signals"].iloc[0]
# waveshow() assumes sr=22050 unless told otherwise. Pass the configured rate
# so the waveform lines up with the feature time axis.
librosa.display.waveshow(first_train_signal, sr=hertz, alpha=0.5, color="#2ddae3", ax=ax)

time = train_dataset_final[2][0]
rms_values = train_dataset_final[0][0]["rms"]
ax.scatter(time, rms_values, color="#6cf542", marker='.', alpha=1)
ax.plot(time, rms_values, color="#e02f8e", alpha=0.5)
ax.set_ylabel("amplitude / rms")
fig.tight_layout()

# savefig() does not create missing directories.
os.makedirs('./figures & images', exist_ok=True)
fig.savefig('./figures & images/root mean squared energy.png')
plt.show()

### Zero crossing rate feature of audio signal

In [None]:
# Plot the "zcr" feature of the first entry returned by extract_features()
# for the training set against its time axis.
fig, ax = plt.subplots(figsize=(17, 5))
time = train_dataset_final[2][0]
zcr_values = train_dataset_final[0][0]["zcr"]
ax.scatter(time, zcr_values, color="#6cf542", marker='.', alpha=1)
ax.plot(time, zcr_values, color="#e02f8e", alpha=1)
ax.set_xlabel("time")
ax.set_ylabel("zcr")
fig.tight_layout()

# savefig() does not create missing directories.
os.makedirs('./figures & images', exist_ok=True)
fig.savefig('./figures & images/zero crossing rate feature.png')
plt.show()

### Mel Spectrogram

In [None]:
# Plot the "mean_mel" feature of the first entry returned by
# extract_features() for the training set against its time axis.
fig, ax = plt.subplots(figsize=(17, 5))
time = train_dataset_final[2][0]
mean_mel_values = train_dataset_final[0][0]["mean_mel"]
ax.scatter(time, mean_mel_values, color="#6cf542", marker='.', alpha=1)
ax.plot(time, mean_mel_values, color="#e02f8e", alpha=1)
ax.set_xlabel("time")
ax.set_ylabel("mean_mel")
fig.tight_layout()

# savefig() does not create missing directories.
os.makedirs('./figures & images', exist_ok=True)
fig.savefig('./figures & images/mel frequency mean feature.png')
plt.show()

### variance of mel frequency

In [None]:
# Plot the "variance_mel" feature of the first entry returned by
# extract_features() for the training set against its time axis.
fig, ax = plt.subplots(figsize=(17, 5))
time = train_dataset_final[2][0]
variance_mel_values = train_dataset_final[0][0]["variance_mel"]
ax.scatter(time, variance_mel_values, color="#6cf542", marker='.', alpha=1)
ax.plot(time, variance_mel_values, color="#e02f8e", alpha=1)
ax.set_xlabel("time")
ax.set_ylabel("variance_mel")
fig.tight_layout()

# savefig() does not create missing directories.
os.makedirs('./figures & images', exist_ok=True)
fig.savefig('./figures & images/mel frequency variance feature.png')
plt.show()

### spectral centroid

In [None]:
# Plot the "spect_cent" feature of the first entry returned by
# extract_features() for the training set against its time axis.
fig, ax = plt.subplots(figsize=(17, 5))
time = train_dataset_final[2][0]
spect_cent_values = train_dataset_final[0][0]["spect_cent"]
ax.scatter(time, spect_cent_values, color="#6cf542", marker='.', alpha=1)
ax.plot(time, spect_cent_values, color="#e02f8e", alpha=1)
ax.set_xlabel("time")
ax.set_ylabel("spect_cent")
fig.tight_layout()

# savefig() does not create missing directories.
os.makedirs('./figures & images', exist_ok=True)
fig.savefig('./figures & images/spectral centroid feature.png')
plt.show()

### For deep learning, however, with LSTMs, CNNs, and models of that ilk, we can use the raw audio signals themselves to extract deep features from, since these models are able to extract higher-order features automatically

In [None]:
# dl_train_dataset_final = concur_load_data(train_dataset, hertz=hertz, window_time=window_time, hop_time=hop_time, config="deep")
# dl_train_dataset_final

In [None]:
# dl_train_signals_merged = np.concatenate(dl_train_dataset_final[0], axis=0)
# dl_train_signals_merged

In [None]:
# dl_train_labels_merged = np.concatenate(dl_train_dataset_final[1], axis=0)
# dl_train_labels_merged

In [None]:
# # again in lstms we have the concept of timesteps
# # but in the case of signal processing especially biosignal
# # and audio processing the number of features can sometimes be
# # just 1 dimension and the timesteps could be the window size itself
# m, tx, nf  = dl_train_signals_merged.shape

In [None]:
# dl_train_labels_merged.shape

In [None]:
# m, tx, nf