## Load train data

In [None]:
import convenience
# Load the combined train/validation frame together with the sample rates
# reported by the project's `convenience` loader, then preview the first rows.
(df_train_val, sample_rates) = convenience.load_train()
df_train_val.head(n=5)

## Loop data

In [None]:
import math

# Target duration = largest value in the `length` column, rounded up to an integer.
longest_clip = df_train_val.length.max()
target_duration = math.ceil(longest_clip)

# NOTE(review): only the first entry of `sample_rates` is used — presumably every
# clip shares a single rate; confirm against convenience.load_train.
rate_candidates = list(sample_rates)
sr = rate_candidates[0]

# loop_audio_df returns a pair; only the first element (the updated frame) is kept.
df_train_val, _ = convenience.loop_audio_df(
    df_train_val,
    target_duration,
    sr,
)
df_train_val.head(n=5)

13 16000


KeyboardInterrupt: 

## Normalize data

In [None]:
import torch


def _peak_normalize(waveform):
    """Scale a waveform tensor by its largest absolute sample.

    Args:
        waveform: torch tensor holding the samples of one clip.

    Returns:
        A new tensor equal to ``waveform / max(|waveform|)``. An all-zero
        waveform is returned as an unchanged copy instead of being divided
        by zero (which would fill it with NaN).
    """
    peak = torch.max(torch.abs(waveform))
    if peak == 0:
        return waveform.clone()
    return waveform / peak


# Normalize clip by clip with Series.map: this does not depend on the frame
# having a 0..n-1 index, nor on how pandas coerces a list of 0-d tensors.
df_train_val['audio_norm'] = df_train_val.looped_audio.map(_peak_normalize)
df_train_val.head()

In [None]:
# Per-clip statistics of the absolute normalized samples: largest value,
# smallest value, and sum divided by the number of samples.
maxs, mins, means = [], [], []
for i in range(len(df_train_val.audio_norm)):
    magnitude = torch.abs(df_train_val.audio_norm[i])
    maxs.append(torch.max(magnitude))
    mins.append(torch.min(magnitude))
    means.append(torch.sum(magnitude)/len(df_train_val.audio_norm[i]))

# Print min / max / mean of each per-clip statistic across all clips.
for heading, per_clip in (("Maximums", maxs), ("\nMinimums", mins), ("\nMeans", means)):
    print(heading)
    print("min:\t",min(per_clip))
    print("max:\t",max(per_clip))
    print("mean:\t",sum(per_clip)/len(per_clip))


## Split data

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed, stratified on the frame's `stratify` column.
# Each part gets a fresh 0..n-1 index so later cells can address rows by position.
df_train, df_val = (
    part.reset_index(drop=True)
    for part in train_test_split(
        df_train_val,
        test_size=0.2,
        random_state=42,
        stratify=df_train_val.stratify,
    )
)

In [None]:
import matplotlib.pyplot as plt

# Overlay histograms of the `stratify` column for the train and validation parts.
fig, ax = plt.subplots()
ax.set_title("Distribution of data")
for split_df in (df_train, df_val):
    # Sort first so both histograms see the values in the same order.
    ax.hist(split_df.sort_values('stratify').stratify)
ax.legend(("Train set", "Validation set"))
ax.set_xlabel("Combined labels")
ax.set_ylabel("Number of samples")

## Turn audio into spectrograms

In [None]:
# Preview the training part before exporting spectrograms.
display(df_train.head(n=5))

In [None]:
import librosa
import numpy as np
import os


def export_labelled_split(df, split_dir, sample_rate=16000):
    """Write one mel spectrogram per row plus a labels file for a data split.

    For every row, a mel spectrogram of ``looped_audio`` is saved to
    ``<split_dir>/specs/<stem>.npy`` where ``<stem>`` is ``file_name`` without
    its extension. ``<split_dir>/labels.csv`` gets one line per row of the
    form ``<stem>.npy,<accent>,`` (no header).

    Args:
        df: frame with ``file_name``, ``accent`` and ``looped_audio`` columns;
            ``looped_audio`` entries must offer ``.numpy()``.
        split_dir: output directory for this split; created if missing.
        sample_rate: sample rate passed to librosa.
    """
    specs_dir = os.path.join(split_dir, "specs")
    # Creates split_dir and specs_dir in one call; no error if they already exist.
    os.makedirs(specs_dir, exist_ok=True)

    label_rows = []
    for file_name, accent, waveform in zip(df.file_name, df.accent, df.looped_audio):
        # splitext drops whatever extension is present instead of assuming 4 characters.
        stem = os.path.splitext(file_name)[0]
        # NOTE(review): the spectrogram is computed from `looped_audio`; the
        # `audio_norm` column built earlier in this notebook is not used — confirm intent.
        S = librosa.feature.melspectrogram(y=waveform.numpy(), sr=sample_rate)
        np.save(os.path.join(specs_dir, f"{stem}.npy"), S)
        # NOTE(review): joining with "\n" as the last item leaves a trailing comma on
        # every row; kept as-is because the consumer of labels.csv is not visible here.
        label_rows.append(",".join([f"{stem}.npy", str(accent), "\n"]))

    with open(os.path.join(split_dir, "labels.csv"), "w") as f:
        f.write("".join(label_rows))


export_labelled_split(df_train, "preprocessed/train")
export_labelled_split(df_val, "preprocessed/val")

In [None]:
# Shape of the mel spectrogram produced for the first clip.
first_clip = df_train_val.looped_audio[0].numpy()
librosa.feature.melspectrogram(y=first_clip, sr=16000).shape

In [None]:
# Sanity check: reload one exported training spectrogram and plot it in dB.
import matplotlib.pyplot as plt
import os

import librosa
import librosa.display  # specshow lives in this submodule; import it explicitly
import numpy as np

specs_dir = "preprocessed/train/specs"
# os.listdir returns entries in arbitrary order and may include non-spectrogram
# files, so keep only .npy files and sort them to show the same one on every run.
spec_files = sorted(name for name in os.listdir(specs_dir) if name.endswith(".npy"))
assert spec_files, f"no .npy spectrograms found in {specs_dir}"

S = np.load(os.path.join(specs_dir, spec_files[0]))
fig, ax = plt.subplots()
# Convert power to decibels relative to the spectrogram's maximum.
S_dB = librosa.power_to_db(S, ref=np.max)
img = librosa.display.specshow(S_dB, x_axis='time', y_axis='mel', sr=16000, ax=ax)

fig.colorbar(img, ax=ax, format='%+2.0f dB')

ax.set(title='Mel-frequency spectrogram')

## Process test data

In [1]:
# Pre-process test set
import convenience
import torch


def _peak_normalize(waveform):
    """Scale a waveform tensor by its largest absolute sample.

    Args:
        waveform: torch tensor holding the samples of one clip.

    Returns:
        A new tensor equal to ``waveform / max(|waveform|)``. An all-zero
        waveform is returned as an unchanged copy instead of being divided
        by zero (which would fill it with NaN).
    """
    peak = torch.max(torch.abs(waveform))
    if peak == 0:
        return waveform.clone()
    return waveform / peak


df_test, sample_rates = convenience.load_test()
display(df_test.head())

# NOTE(review): hard-coded here rather than derived from the data — presumably
# these must match the values used for the training set; confirm.
target_duration = 13
sr = 16000

df_test, _ = convenience.loop_audio_df(df_test, target_duration=target_duration, sr=sr)
display(df_test.head())

# Normalize clip by clip with Series.map: this does not depend on the frame
# having a 0..n-1 index, nor on how pandas coerces a list of 0-d tensors.
df_test['audio_norm'] = df_test.looped_audio.map(_peak_normalize)
display(df_test.head())


Unnamed: 0,id,file_name,audio,length
0,5775,5775.wav,"[tensor(0.0003), tensor(0.0002), tensor(0.0002...",3.498687
1,5652,5652.wav,"[tensor(-9.1553e-05), tensor(-9.1553e-05), ten...",5.125625
2,5740,5740.wav,"[tensor(0.), tensor(3.0518e-05), tensor(-3.051...",2.6375
3,6428,6428.wav,"[tensor(0.), tensor(6.1035e-05), tensor(0.0002...",7.253312
4,8021,8021.wav,"[tensor(0.0003), tensor(0.0002), tensor(0.0001...",4.010625


Unnamed: 0,id,file_name,audio,length,looped_audio,looped_length
0,5775,5775.wav,"[tensor(0.0003), tensor(0.0002), tensor(0.0002...",3.498687,"[tensor(0.0003), tensor(0.0002), tensor(0.0002...",13.0
1,5652,5652.wav,"[tensor(-9.1553e-05), tensor(-9.1553e-05), ten...",5.125625,"[tensor(-9.1553e-05), tensor(-9.1553e-05), ten...",13.0
2,5740,5740.wav,"[tensor(0.), tensor(3.0518e-05), tensor(-3.051...",2.6375,"[tensor(0.), tensor(3.0518e-05), tensor(-3.051...",13.0
3,6428,6428.wav,"[tensor(0.), tensor(6.1035e-05), tensor(0.0002...",7.253312,"[tensor(0.), tensor(6.1035e-05), tensor(0.0002...",13.0
4,8021,8021.wav,"[tensor(0.0003), tensor(0.0002), tensor(0.0001...",4.010625,"[tensor(0.0003), tensor(0.0002), tensor(0.0001...",13.0


Unnamed: 0,id,file_name,audio,length,looped_audio,looped_length,audio_norm
0,5775,5775.wav,"[tensor(0.0003), tensor(0.0002), tensor(0.0002...",3.498687,"[tensor(0.0003), tensor(0.0002), tensor(0.0002...",13.0,"[tensor(0.0007), tensor(0.0004), tensor(0.0004..."
1,5652,5652.wav,"[tensor(-9.1553e-05), tensor(-9.1553e-05), ten...",5.125625,"[tensor(-9.1553e-05), tensor(-9.1553e-05), ten...",13.0,"[tensor(-0.0001), tensor(-0.0001), tensor(-0.0..."
2,5740,5740.wav,"[tensor(0.), tensor(3.0518e-05), tensor(-3.051...",2.6375,"[tensor(0.), tensor(3.0518e-05), tensor(-3.051...",13.0,"[tensor(0.), tensor(5.0774e-05), tensor(-5.077..."
3,6428,6428.wav,"[tensor(0.), tensor(6.1035e-05), tensor(0.0002...",7.253312,"[tensor(0.), tensor(6.1035e-05), tensor(0.0002...",13.0,"[tensor(0.), tensor(0.0002), tensor(0.0005), t..."
4,8021,8021.wav,"[tensor(0.0003), tensor(0.0002), tensor(0.0001...",4.010625,"[tensor(0.0003), tensor(0.0002), tensor(0.0001...",13.0,"[tensor(0.0004), tensor(0.0003), tensor(0.0002..."


In [3]:
# Export test set: one mel spectrogram per clip plus a labels.csv listing the ids.
import librosa
import numpy as np
import os

test_dir = "preprocessed/test"
specs_dir = os.path.join(test_dir, "specs")
# Creates both directory levels in one call; no error if they already exist.
os.makedirs(specs_dir, exist_ok=True)

# Header row, then one "<id>," line per clip (the label field is left empty).
label_rows = ["Id,label\n"]

for clip_id, file_name, waveform in zip(df_test["id"], df_test.file_name, df_test.looped_audio):
    # splitext drops whatever extension is present instead of assuming 4 characters.
    stem = os.path.splitext(file_name)[0]
    # NOTE(review): the spectrogram is computed from `looped_audio`; the
    # `audio_norm` column is not used — confirm intent.
    S = librosa.feature.melspectrogram(y=waveform.numpy(), sr=16000)
    np.save(os.path.join(specs_dir, f"{stem}.npy"), S)
    label_rows.append(",".join([f"{clip_id}", "\n"]))

with open(os.path.join(test_dir, "labels.csv"), "w") as f:
    f.write("".join(label_rows))