Imports

In [33]:
import os
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split

In [34]:
def create_labels(path):
    """Return the names of the ``.npz`` files in ``path`` with the extension removed.

    Only direct entries of ``path`` are inspected (no recursion). The result
    follows the order reported by ``os.listdir``, which is arbitrary, so
    callers that need a stable order should sort it.
    """
    suffix = ".npz"
    return [
        entry[:-len(suffix)]
        for entry in os.listdir(path)
        if entry.endswith(suffix)
    ]

## Binary classification:

Create labels based on chord quality (the second character of each file name)

0: Major

1: minor

In [35]:
# Collect the sample names in a stable (sorted) order.
chords = sorted(create_labels("./data/raw/"))
print(chords)

# The second character of each name selects the class: 'M' -> 0, 'm' -> 1.
chord_num = {'M': 0, 'm': 1}
labels = dict((name, chord_num[name[1]]) for name in chords)
print(labels, "\n", len(labels))

['AM5', 'AM50', 'AM51', 'AM52', 'AM53', 'AM54', 'AM55', 'AM56', 'AM7', 'AM70', 'AM71', 'AM72', 'AM73', 'AM74', 'AM75', 'AM76', 'AM77', 'AM78', 'Am5', 'Am50', 'Am51', 'Am52', 'Am53', 'Am54', 'Am55', 'Am56', 'Am7', 'Am70', 'Am71', 'Am72', 'Am73', 'Am74', 'Am75', 'Am76', 'Am77', 'Am78', 'BM5', 'BM50', 'BM51', 'BM52', 'BM53', 'BM54', 'BM55', 'BM56', 'BM7', 'BM70', 'BM71', 'BM72', 'BM73', 'BM74', 'BM75', 'BM76', 'BM77', 'BM78', 'Bm5', 'Bm50', 'Bm51', 'Bm52', 'Bm53', 'Bm54', 'Bm55', 'Bm56', 'Bm7', 'Bm70', 'Bm71', 'Bm72', 'Bm73', 'Bm74', 'Bm75', 'Bm76', 'Bm77', 'Bm78', 'CM5', 'CM50', 'CM51', 'CM52', 'CM53', 'CM54', 'CM55', 'CM56', 'CM7', 'CM70', 'CM71', 'CM72', 'CM73', 'CM74', 'CM75', 'CM76', 'CM77', 'CM78', 'Cm5', 'Cm50', 'Cm51', 'Cm52', 'Cm53', 'Cm54', 'Cm55', 'Cm56', 'Cm7', 'Cm70', 'Cm71', 'Cm72', 'Cm73', 'Cm74', 'Cm75', 'Cm76', 'Cm77', 'Cm78', 'DM5', 'DM50', 'DM51', 'DM52', 'DM53', 'DM54', 'DM55', 'DM56', 'DM7', 'DM70', 'DM71', 'DM72', 'DM73', 'DM74', 'DM75', 'DM76', 'DM77', 'DM78', 'Dm5'

### Check that the data is relatively well-balanced

In [36]:
# Count how many sample names carry each quality marker in their second character.
M = sum(1 for name in labels if name[1] == "M")
m = sum(1 for name in labels if name[1] == "m")

print(M, m)

141 141


In [37]:
# One row per sample: the index is the sample name, column 0 is its label.
df = pd.DataFrame.from_dict(data=labels, orient='index')

# labels.csv is written before the Filename column exists, so it holds only
# the index and the label column.
df.to_csv('./data/labels.csv')

# Copy the index into a regular column so the name survives index=False exports.
df = df.assign(Filename=df.index)
# df

Make train/val split (a separate test split is currently disabled in the code below)

In [38]:
# Disabled three-way variant (train/val/test), kept for reference:
# train_val, test = train_test_split(df, test_size=0.2, random_state=0)
# train, val = train_test_split(train_val, test_size=0.25, random_state=0)
# test.to_csv('./data/test_labels.csv', index=False)

# Active variant: hold out 20% of the rows for validation with a fixed seed.
train, val = train_test_split(df, random_state=1, test_size=0.2)

# Write each split without the index; the Filename column carries the names.
for split_frame, csv_path in (
    (train, './data/train_labels.csv'),
    (val, './data/val_labels.csv'),
):
    split_frame.to_csv(csv_path, index=False)

Move data files from main directory to their respective splits

*Optional: this step is not required, and the calls below are left commented out.*

In [39]:
def move_files(df, source, dest, ext=""):
    """Move every file named in ``df['Filename']`` from ``source`` into ``dest``.

    Args:
        df: DataFrame with a ``Filename`` column holding one file name per row.
        source: Directory that currently contains the files.
        dest: Directory to move the files into; created if it does not exist.
        ext: Optional suffix appended to each ``Filename`` value before the
            paths are built (for example ``".npz"`` when the column stores
            names without their extension). The default ``""`` leaves the
            names unchanged.

    Raises:
        KeyError: If ``df`` has no ``Filename`` column.
        Errors raised by ``shutil.move`` (such as a missing source file)
        propagate to the caller.
    """
    os.makedirs(dest, exist_ok=True)
    # Only the Filename column is needed, so iterate over it directly instead
    # of materialising a Series per row with iterrows().
    for name in df['Filename']:
        filename = name + ext if ext else name
        shutil.move(os.path.join(source, filename), os.path.join(dest, filename))

In [40]:
# Disabled: physically move each sample into a per-split sub-directory.
# NOTE(review): as written this would fail if re-enabled, because
#   - test_labels.csv is never written (its to_csv call is commented out above),
#   - the Filename column stores names without the ".npz" extension, and
#   - the samples are listed from "./data/raw/", not from "./data/".
# source = "./data/"
# train_labels = pd.read_csv(source + "train_labels.csv")
# val_labels = pd.read_csv(source + "val_labels.csv")
# test_labels = pd.read_csv(source + "test_labels.csv")

# move_files(train_labels, source, os.path.join(source, "train"))
# move_files(test_labels, source, os.path.join(source, "test"))
# move_files(val_labels, source, os.path.join(source, "val"))

## Multi-class classification:

Create labels based on chord quality and type (the second and third characters of each file name)

0: Major

1: minor

2: Major 7

3: minor 7

In [41]:
# Collect the sample names in a stable (sorted) order.
chords = sorted(create_labels("./data/raw/"))
print(chords)

# Characters 2-3 of each name select one of the four classes.
chord_num = {'M5': 0, 'm5': 1, 'M7': 2, 'm7': 3}
labels = dict((name, chord_num[name[1:3]]) for name in chords)
print(labels)

['AM5', 'AM50', 'AM51', 'AM52', 'AM53', 'AM54', 'AM55', 'AM56', 'AM7', 'AM70', 'AM71', 'AM72', 'AM73', 'AM74', 'AM75', 'AM76', 'AM77', 'AM78', 'Am5', 'Am50', 'Am51', 'Am52', 'Am53', 'Am54', 'Am55', 'Am56', 'Am7', 'Am70', 'Am71', 'Am72', 'Am73', 'Am74', 'Am75', 'Am76', 'Am77', 'Am78', 'BM5', 'BM50', 'BM51', 'BM52', 'BM53', 'BM54', 'BM55', 'BM56', 'BM7', 'BM70', 'BM71', 'BM72', 'BM73', 'BM74', 'BM75', 'BM76', 'BM77', 'BM78', 'Bm5', 'Bm50', 'Bm51', 'Bm52', 'Bm53', 'Bm54', 'Bm55', 'Bm56', 'Bm7', 'Bm70', 'Bm71', 'Bm72', 'Bm73', 'Bm74', 'Bm75', 'Bm76', 'Bm77', 'Bm78', 'CM5', 'CM50', 'CM51', 'CM52', 'CM53', 'CM54', 'CM55', 'CM56', 'CM7', 'CM70', 'CM71', 'CM72', 'CM73', 'CM74', 'CM75', 'CM76', 'CM77', 'CM78', 'Cm5', 'Cm50', 'Cm51', 'Cm52', 'Cm53', 'Cm54', 'Cm55', 'Cm56', 'Cm7', 'Cm70', 'Cm71', 'Cm72', 'Cm73', 'Cm74', 'Cm75', 'Cm76', 'Cm77', 'Cm78', 'DM5', 'DM50', 'DM51', 'DM52', 'DM53', 'DM54', 'DM55', 'DM56', 'DM7', 'DM70', 'DM71', 'DM72', 'DM73', 'DM74', 'DM75', 'DM76', 'DM77', 'DM78', 'Dm5'

In [42]:
# Tally the samples per class, keyed by characters 2-3 of the sample name.
class_tally = {"M5": 0, "m5": 0, "M7": 0, "m7": 0}
for sample_name in labels:
    key = sample_name[1:3]
    if key in class_tally:
        class_tally[key] += 1

# Expose the counts under the same names the rest of the notebook may use.
M5 = class_tally["M5"]
m5 = class_tally["m5"]
M7 = class_tally["M7"]
m7 = class_tally["m7"]

print(f"M5 {M5}, m5 {m5}, M7 {M7}, m7 {m7}")

M5 71, m5 71, M7 70, m7 70


In [43]:
# One row per sample: the index is the sample name, column 0 is its class id.
df = pd.DataFrame.from_dict(data=labels, orient='index')

# Persist the full label table (index + label column) before splitting.
df.to_csv(path_or_buf='./data/multi-labels.csv')

# Last expression: show the frame as the cell output.
df

Unnamed: 0,0
AM5,0
AM50,0
AM51,0
AM52,0
AM53,0
...,...
gM51,0
gM52,0
gm50,1
gm51,1


Make train/val split (a separate test split is currently disabled in the code below)

In [46]:
# Copy the index into a regular column so the name survives index=False exports.
df = df.assign(Filename=df.index)

# Disabled three-way variant (train/val/test), kept for reference:
# train_val, test = train_test_split(df, test_size=0.2, random_state=0)
# train, val = train_test_split(train_val, test_size=0.25, random_state=0)
# test.to_csv('./data/test_multi.csv', index=False)

# Active variant: hold out 20% of the rows for validation with a fixed seed.
train, val = train_test_split(df, random_state=1, test_size=0.2)

# Write each split without the index; the Filename column carries the names.
for split_frame, csv_path in (
    (train, './data/train_multi.csv'),
    (val, './data/val_multi.csv'),
):
    split_frame.to_csv(csv_path, index=False)

*Optional: this step is not required, and the calls below are left commented out.*

In [45]:
# Disabled: physically move each sample into a per-split sub-directory
# for the multi-class task.
# NOTE(review): as written this would fail if re-enabled, because
#   - test_multi.csv is never written (its to_csv call is commented out above),
#   - the Filename column stores names without the ".npz" extension, and
#   - the samples are listed from "./data/raw/", not from "./data/".
# source = "./data/"
# train_labels = pd.read_csv(source + "train_multi.csv")
# val_labels = pd.read_csv(source + "val_multi.csv")
# test_labels = pd.read_csv(source + "test_multi.csv")

# move_files(train_labels, source, os.path.join(source, "multitrain"))
# move_files(test_labels, source, os.path.join(source, "multitest"))
# move_files(val_labels, source, os.path.join(source, "multival"))