### <i>Pre-processing & splitting normal and augmented data</i>

In [99]:
import os
import librosa
import torchaudio
import torch
import math
import pandas as pd
import numpy as np
from IPython.display import Audio
# Collect every clip to preprocess: the original training recordings plus the
# files in "augmented", in a deterministic (sorted) order.
train_dirs = [f"../data/Train/{path}" for path in os.listdir("../data/Train")]
train_dirs += [f"augmented/{path}" for path in os.listdir("augmented")]
train_dirs.sort()

TARGET_SECONDS = 13  # every clip is looped / cropped to this duration
TRIM_TOP_DB = 40     # threshold passed to librosa.effects.trim

# np.save does not create missing directories.
os.makedirs("preprocessed/all/specs", exist_ok=True)

label_rows = []
for wav_path in train_dirs:
    audio, sr = torchaudio.load(wav_path)
    file_name = wav_path.split("/")[-1].replace(".wav", ".npy")
    label = file_name[0]  # the label is the first character of the file name
    label_rows.append(f"{file_name},{label}\n")

    target_length = TARGET_SECONDS * sr

    # librosa works on NumPy arrays, so leave torch here.
    waveform = audio.numpy()
    # Peak of the untrimmed signal (all channels), as before.
    peak = np.max(np.abs(waveform))

    # Trim using all channels, then keep only the first one.
    mono = librosa.effects.trim(waveform, top_db=TRIM_TOP_DB)[0][0]
    if len(mono) == 0:
        raise ValueError(f"{wav_path}: no samples left to loop")
    if peak > 0:  # an all-zero clip would otherwise turn into NaNs
        mono = mono / peak

    # Derive the repeat count from the *trimmed* length. The previous
    # `13 // math.floor(length) + 30` divided by zero for clips shorter than
    # one second and could still fall short of the target for short clips.
    repeats = math.ceil(target_length / len(mono))
    audio = np.tile(mono, repeats)[:target_length]

    S = librosa.feature.melspectrogram(y=audio, sr=sr)
    np.save(f'preprocessed/all/specs/{file_name}', S)

labels_csv_out = "".join(label_rows)
with open("preprocessed/all/labels.csv", "w") as f:
    f.write(labels_csv_out)

In [100]:
from sklearn.model_selection import train_test_split
import shutil

# Split data
# Recording IDs are the original training file names without the extension.
train_files = [wav_name.replace(".wav", "") for wav_name in os.listdir("../data/Train")]
train_files.sort()

# Stratify on the part of the ID before the first underscore.
stratify = [name.split("_")[0] for name in train_files]

train_ids, val_ids = train_test_split(train_files, test_size=0.2, random_state=42, stratify=stratify)
train_id_set = set(train_ids)  # constant-time membership test in the copy loop

# shutil.copy treats a missing destination as a *file* path, so without these
# directories every spectrogram would be written over a single file named "specs".
split_dirs = {"train": "preprocessed/train/specs", "val": "preprocessed/val/specs"}
for split_dir in split_dirs.values():
    os.makedirs(split_dir, exist_ok=True)

for file_name in os.listdir("preprocessed/all/specs"):
    # The first 7 characters are matched against the recording IDs
    # (presumably augmented files keep their source's prefix - TODO confirm).
    recording_id = file_name[:7]
    src = f"preprocessed/all/specs/{file_name}"
    dst = split_dirs["train"] if recording_id in train_id_set else split_dirs["val"]
    shutil.copy(src, dst)

# Generate labels
# NOTE(review): files left in the split folders by an earlier run are listed
# here as well - clear the folders first if the split may have changed.
for set_type in ["train", "val"]:
    # Sorted because os.listdir returns entries in arbitrary order.
    spec_names = sorted(os.listdir(f"preprocessed/{set_type}/specs"))
    labels = "".join(f"{spec_name},{spec_name[0]}\n" for spec_name in spec_names)
    with open(f"preprocessed/{set_type}/labels.csv", "w") as f:
        f.write(labels)

## Load train data

In [47]:
import convenience
# Load the non-augmented training recordings through the project helper; the
# second return value is kept as `sample_rates` for later cells.
df_normal_train_val, sample_rates = convenience.load_train()
print(df_normal_train_val.head())  # plain-text preview of the first rows

# Keep the file-name column handy and preview it as the cell output.
file_names = df_normal_train_val["file_name"]
file_names.head()

In [None]:
# df_augment_train_val, sample_rates = convenience.load_train("augmented")
# df_augment_train_val.head()

# Work on a copy: later cells overwrite columns of df_train_val, and a plain
# alias would silently rewrite df_normal_train_val too, so those cells could
# not be re-run without reloading the data.
df_train_val = df_normal_train_val.copy()

### Combine augmented and non-augmented data together

In [None]:
import pandas as pd
# df_train_val = pd.concat([df_augment_train_val, df_normal_train_val], axis = 0, ignore_index=True)

In [None]:
# for x in range(0,len(df_train_val),1000):
#     subset = df_train_val.loc[x:x+999]
#     subset.audio = [audio.tolist() for audio in subset.audio.tolist()]
#     display(subset)
#     subset.to_csv(f"subsets/subset_{x}:{x+999}.csv")
#     break


In [None]:
import ast
from IPython.display import Audio
my_favourite_df = pd.read_csv("subsets/subset_0:999.csv")

# A CSV stores text, so a cell holding a list of samples is read back as its
# string form; parse it into numbers before handing it to the audio player.
first_clip = my_favourite_df.audio[0]
if isinstance(first_clip, str):
    first_clip = ast.literal_eval(first_clip)

Audio(first_clip, rate=16000)


In [None]:
import gc
# Run a full garbage-collection pass now; as the last expression of the cell,
# the value returned by collect() is shown as the cell output.
gc.collect()

## Trim silence

In [None]:
# Silence trimming
import librosa 

# librosa.effects.trim returns (trimmed signal, interval); only the trimmed
# signal is kept for each clip.
trimmed_audio = [
    librosa.effects.trim(clip, top_db=40)[0]
    for clip in df_train_val.audio
]

# Replace the audio column with the trimmed clips.
df_train_val["audio"] = trimmed_audio
#df_train_val["trimmed"] = trimmed_audio

In [None]:
# df_train_val["trimmed"] = trimmed_audio
from IPython.display import Audio
# Audio player for the clip stored under label 0, played back at 16 kHz.
Audio(data=df_train_val["audio"][0], rate=16000)
#df_train_val.audio[3164]

In [None]:
# Clip duration in seconds: number of samples divided by 16000
# (assumes every clip is sampled at 16 kHz - TODO confirm against sample_rates).
audio_column = df_train_val.audio
clip_lengths = []
for row in range(len(audio_column)):
    clip_lengths.append(len(audio_column[row]) / 16000)
df_train_val["length"] = clip_lengths

In [None]:
import IPython.display as ipd
# A single display() call renders its arguments one after another: the frame,
# then a player for the first clip of the frame, then one for the first entry
# of trimmed_audio.
# NOTE(review): if the trimming cell has already overwritten the audio column
# with trimmed_audio, both players hold the same clip - confirm this is intended.
display(
    df_train_val,
    ipd.Audio(df_train_val.audio[0], rate=16000),
    ipd.Audio(trimmed_audio[0], rate=16000),
)

## Loop data

In [None]:
import math

# Every clip is looped up to the longest clip's duration, rounded up to whole seconds.
target_duration = math.ceil(df_train_val.length.max()) # 13s
sr = list(sample_rates)[0] # 16kHz

# Build all looped clips first and assign the column in one step. Writing
# through `df_train_val.audio[i] = ...` is chained assignment: pandas may apply
# it to a temporary object, and with Copy-on-Write enabled it never reaches
# the DataFrame.
# NOTE(review): later cells read `df_train_val.looped_audio`, which this cell
# does not create - confirm which cell is meant to produce that column.
df_train_val["audio"] = [
    convenience.loop_audio(clip, clip_length, target_duration, sr)
    for clip, clip_length in zip(df_train_val.audio, df_train_val.length)
]

df_train_val

In [None]:
# import math

# target_duration = math.ceil(df_train_val.length.max())
# sr = list(sample_rates)[0]

# df_train_val, _ = convenience.loop_audio_df(df_train_val, target_duration, sr)
# df_train_val.head()

## Normalize data

In [None]:
import torch
# Peak-normalise: divide every looped clip by its own maximum absolute value.
looped_clips = df_train_val.looped_audio
clip_peaks = []
for row in range(len(looped_clips)):
    clip_peaks.append(torch.max(torch.abs(looped_clips[row])))
df_train_val['audio_norm'] = looped_clips / clip_peaks
df_train_val.head()

In [None]:
# Per-clip statistics of |audio_norm|: largest value, smallest value and mean.
normalised = df_train_val.audio_norm

maxs, mins, means = [], [], []
for row in range(len(normalised)):
    magnitude = torch.abs(normalised[row])
    maxs.append(torch.max(magnitude))
    mins.append(torch.min(magnitude))
    means.append(torch.sum(magnitude) / len(normalised[row]))

# Print min / max / mean of each statistic across all clips.
for heading, per_clip in (("Maximums", maxs), ("\nMinimums", mins), ("\nMeans", means)):
    print(heading)
    print("min:\t", min(per_clip))
    print("max:\t", max(per_clip))
    print("mean:\t", sum(per_clip) / len(per_clip))


## Split data

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed, stratified on the `stratify` column; each part
# gets a fresh 0..n-1 index.
df_train, df_val = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df_train_val,
        test_size=0.2,
        random_state=42,
        stratify=df_train_val["stratify"],
    )
)

In [None]:
import matplotlib.pyplot as plt

# Overlay histograms of the `stratify` column for both splits: train first,
# validation second, matching the legend order.
plt.title("Distribution of data")
for split_frame in (df_train, df_val):
    plt.hist(split_frame.sort_values('stratify').stratify)
plt.legend(("Train set", "Validation set"))
plt.xlabel("Combined labels")
plt.ylabel("Number of samples")

## Turn audio into spectrograms

In [None]:
# Preview the first rows of the training split before exporting spectrograms.
display(df_train.head())

In [None]:
import librosa
import numpy as np
import os

def _export_split(frame, split_name):
    """Write mel spectrograms and a label file for one data split.

    For every row of ``frame`` a mel spectrogram of ``looped_audio`` is saved to
    ``preprocessed/<split_name>/specs/<name>.npy``, where ``<name>`` is
    ``file_name`` without its last four characters, and a ``<name>.npy,<accent>``
    row is added to ``preprocessed/<split_name>/labels.csv``.

    Parameters
    ----------
    frame : DataFrame with ``looped_audio``, ``file_name`` and ``accent`` columns.
    split_name : folder name of the split, e.g. ``"train"`` or ``"val"``.
    """
    spec_dir = f"preprocessed/{split_name}/specs"
    os.makedirs(spec_dir, exist_ok=True)  # also creates the parent folders

    label_rows = []
    for clip, wav_name, accent in zip(frame.looped_audio, frame.file_name, frame.accent):
        npy_name = f"{wav_name[:-4]}.npy"
        # NOTE(review): the spectrogram is computed from `looped_audio`, not from
        # the normalised `audio_norm` column - confirm this is intended.
        S = librosa.feature.melspectrogram(y=clip.numpy(), sr=16000)
        np.save(f"{spec_dir}/{npy_name}", S)
        # Exactly two fields per row. Joining "\n" as a third field left a
        # trailing comma, unlike the other cells that write these label files.
        label_rows.append(f"{npy_name},{accent}\n")

    with open(f"preprocessed/{split_name}/labels.csv", "w") as f:
        f.writelines(label_rows)


_export_split(df_train, "train")
_export_split(df_val, "val")

In [None]:
# Show the shape of the mel spectrogram computed from the clip at label 0.
librosa.feature.melspectrogram(y=df_train_val.looped_audio[0].numpy(), sr=16000).shape

In [None]:
# # Sanity check
import matplotlib.pyplot as plt
import os
# specshow lives in the librosa.display submodule; import it explicitly, since
# older librosa releases do not expose it through a bare `import librosa`.
import librosa.display

# os.listdir returns entries in arbitrary order; sort so the same file is
# inspected on every run.
spec_files = sorted(os.listdir('preprocessed/train/specs'))
S = np.load(f"preprocessed/train/specs/{spec_files[1]}")

fig, ax = plt.subplots()
# Convert power to decibels relative to the spectrogram's maximum.
S_dB = librosa.power_to_db(S, ref=np.max)
img = librosa.display.specshow(S_dB, x_axis='time', y_axis='mel', sr=16000, ax=ax)

fig.colorbar(img, ax=ax, format='%+2.0f dB')

ax.set(title='Mel-frequency spectrogram')

## Process test data

In [None]:
# Pre-process test set
import convenience
import torch
# Load the test recordings through the project helper and preview them.
df_test, sample_rates = convenience.load_test()
display(df_test.head())

# Fixed loop target: 13 seconds at 16 kHz.
target_duration, sr = 13, 16000

# NOTE(review): no explicit silence trimming happens in this cell, unlike the
# training path - confirm whether loop_audio_df takes care of it.
df_test, _ = convenience.loop_audio_df(df_test, sr=sr, target_duration=target_duration)
display(df_test.head())

# Peak-normalise: divide every looped clip by its own maximum absolute value.
test_clips = df_test.looped_audio
test_peaks = [torch.max(torch.abs(test_clips[row])) for row in range(len(test_clips))]
df_test['audio_norm'] = test_clips / test_peaks
display(df_test.head())


In [None]:
# Export test set
import librosa
import numpy as np
import os

# Make sure the output folder (and its parents) exists.
os.makedirs("preprocessed/test/specs", exist_ok=True)

# Header plus one "<id>," row per clip; the label field is left empty.
csv_lines = ["Id,label\n"]

for row in range(len(df_test.looped_audio)):
    S = librosa.feature.melspectrogram(y=df_test.looped_audio[row].numpy(), sr=16000)
    csv_lines.append(f"{df_test.id[row]},\n")
    # The spectrogram file is named after file_name without its last four characters.
    np.save(f'preprocessed/test/specs/{df_test.file_name[row][:-4]}.npy', S)

labels_csv_out = "".join(csv_lines)
with open("preprocessed/test/labels.csv", "w+") as f:
    f.write(labels_csv_out)