Imports

In [1]:
import os
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split

In [2]:
def create_labels(path):
    """Return the stems of all ``.npz`` files found directly inside ``path``.

    Args:
        path: Directory to scan. Sub-directories are not descended into.

    Returns:
        list[str]: Each matching file name with its trailing ``.npz`` removed,
        sorted alphabetically. ``os.listdir`` yields entries in arbitrary
        order, so sorting here keeps everything derived from this list
        (label dicts, seeded train/val/test splits) reproducible even when a
        caller forgets to sort.

    Raises:
        FileNotFoundError: If ``path`` does not exist (raised by ``os.listdir``).
    """
    suffix = ".npz"
    return sorted(
        name[:-len(suffix)]
        for name in os.listdir(path)
        if name.endswith(suffix)
    )

## Binary classification:

Create labels based on chord quality

0: Major

1: minor

In [13]:
# Collect the chord file stems and put them in alphabetical order.
chords = create_labels("./data/raw/")
chords.sort()
print(chords)

# The second character of each stem is the quality letter: 'M' (Major) or 'm' (minor).
chord_num = {'M': 0, 'm': 1}
labels = dict((name, chord_num[name[1]]) for name in chords)
print(labels, "\n", len(labels))

['AM5', 'AM50', 'AM51', 'AM52', 'AM7', 'AM70', 'AM71', 'AM72', 'AM73', 'AM74', 'Am5', 'Am50', 'Am51', 'Am52', 'Am7', 'Am70', 'Am71', 'Am72', 'Am73', 'Am74', 'BM5', 'BM50', 'BM51', 'BM52', 'BM7', 'BM70', 'BM71', 'BM72', 'BM73', 'BM74', 'Bm5', 'Bm50', 'Bm51', 'Bm52', 'Bm7', 'Bm70', 'Bm71', 'Bm72', 'Bm73', 'Bm74', 'CM5', 'CM50', 'CM51', 'CM52', 'CM7', 'CM70', 'CM71', 'CM72', 'CM73', 'CM74', 'Cm5', 'Cm50', 'Cm51', 'Cm52', 'Cm7', 'Cm70', 'Cm71', 'Cm72', 'Cm73', 'Cm74', 'DM5', 'DM50', 'DM51', 'DM52', 'DM7', 'DM70', 'DM71', 'DM72', 'DM73', 'DM74', 'Dm5', 'Dm50', 'Dm51', 'Dm52', 'Dm7', 'Dm70', 'Dm71', 'Dm72', 'Dm73', 'Dm74', 'EM5', 'EM50', 'EM51', 'EM52', 'EM7', 'EM70', 'EM71', 'EM72', 'EM73', 'EM74', 'Em5', 'Em50', 'Em51', 'Em52', 'Em7', 'Em70', 'Em71', 'Em72', 'Em73', 'Em74', 'FM5', 'FM50', 'FM51', 'FM52', 'FM7', 'FM70', 'FM71', 'FM72', 'FM73', 'FM74', 'Fm5', 'Fm50', 'Fm51', 'Fm52', 'Fm7', 'Fm70', 'Fm71', 'Fm72', 'Fm73', 'Fm74', 'GM5', 'GM50', 'GM51', 'GM52', 'GM7', 'GM70', 'GM71', 'GM72', '

### Check that the data is relatively well-balanced

In [19]:
# Tally chord qualities using the second character of each label key.
M = sum(1 for name in labels if name[1] == "M")
m = sum(1 for name in labels if name[1] == "m")

print(M, m)

85 85


In [14]:
# One row per key of `labels`; the keys become the index and the values land
# in a single column that pandas names 0.
df = pd.DataFrame.from_dict(labels, orient='index')
# Persist the full (unsplit) label table; the index is written as the first CSV column.
df.to_csv('./data/labels.csv')
# df

Make train/test/val split

In [15]:
# Attach the on-disk file name (index value + extension) to every row.
df['Filename'] = df.index + ".npz"

# Hold out 20% for test, then 25% of the remaining 80% for validation,
# i.e. roughly 60/20/20 train/val/test. Both splits are seeded.
# NOTE(review): the split is not stratified by label — confirm that is intended.
train_val, test = train_test_split(df, test_size=0.2, random_state=0)
train, val = train_test_split(train_val, test_size=0.25, random_state=0)

# Write each split without the index; the 'Filename' column identifies the rows.
for split_frame, csv_path in (
    (train, './data/train_labels.csv'),
    (val, './data/val_labels.csv'),
    (test, './data/test_labels.csv'),
):
    split_frame.to_csv(csv_path, index=False)

Move data files from main directory to their respective splits

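*Optional: this step only reorganizes the `.npz` files on disk into per-split folders. Note that the files are moved, not copied.*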

In [3]:
def move_files(df, source, dest):
    """Move every file named in ``df['Filename']`` from ``source`` into ``dest``.

    All source paths are checked before anything is moved, so a missing file
    can no longer leave the dataset half-moved. A file that is absent from
    ``source`` but already present in ``dest`` is treated as already moved,
    which makes re-running the calling cell a no-op instead of an error.

    Args:
        df: DataFrame with a ``'Filename'`` column of file names.
        source: Directory the files currently live in.
        dest: Directory to move the files into; created if it does not exist.

    Raises:
        KeyError: If ``df`` has no ``'Filename'`` column.
        FileNotFoundError: If any listed file is in neither ``source`` nor
            ``dest``. No file is moved in that case.
    """
    os.makedirs(dest, exist_ok=True)

    pending = []   # (source_path, dest_path) pairs that still need moving
    missing = []   # source paths found in neither directory
    for filename in df['Filename']:
        source_path = os.path.join(source, filename)
        dest_path = os.path.join(dest, filename)
        if os.path.exists(source_path):
            pending.append((source_path, dest_path))
        elif not os.path.exists(dest_path):
            missing.append(source_path)

    if missing:
        raise FileNotFoundError(
            f"{len(missing)} file(s) not found in {source!r} or {dest!r}; "
            f"nothing was moved. First missing: {missing[0]}"
        )

    for source_path, dest_path in pending:
        shutil.move(source_path, dest_path)

In [16]:
source = "./data/"

# Reload the split manifests from disk.
train_labels = pd.read_csv(os.path.join(source, "train_labels.csv"))
val_labels = pd.read_csv(os.path.join(source, "val_labels.csv"))
test_labels = pd.read_csv(os.path.join(source, "test_labels.csv"))

# NOTE(review): the chord stems were listed from "./data/raw/", but the files
# are moved out of "./data/" here — confirm the .npz files really live in this
# directory. The move is destructive: once it has run, the files are no longer
# in `source` for any later cell.
for split_frame, split_dir in (
    (train_labels, "train"),
    (test_labels, "test"),
    (val_labels, "val"),
):
    move_files(split_frame, source, os.path.join(source, split_dir))

## Multi-class classification:

Create labels based on chord quality and type

0: Major

1: minor

2: Major 7

3: minor 7

In [20]:
# Collect the chord file stems and put them in alphabetical order.
chords = create_labels("./data/raw/")
chords.sort()
print(chords)

# Characters 1-2 of each stem hold quality + type: M5, m5, M7 or m7.
chord_num = {'M5': 0, 'm5': 1, 'M7': 2, 'm7': 3}
labels = dict((name, chord_num[name[1:3]]) for name in chords)
print(labels)

['AM5', 'AM50', 'AM51', 'AM52', 'AM7', 'AM70', 'AM71', 'AM72', 'AM73', 'AM74', 'Am5', 'Am50', 'Am51', 'Am52', 'Am7', 'Am70', 'Am71', 'Am72', 'Am73', 'Am74', 'BM5', 'BM50', 'BM51', 'BM52', 'BM7', 'BM70', 'BM71', 'BM72', 'BM73', 'BM74', 'Bm5', 'Bm50', 'Bm51', 'Bm52', 'Bm7', 'Bm70', 'Bm71', 'Bm72', 'Bm73', 'Bm74', 'CM5', 'CM50', 'CM51', 'CM52', 'CM7', 'CM70', 'CM71', 'CM72', 'CM73', 'CM74', 'Cm5', 'Cm50', 'Cm51', 'Cm52', 'Cm7', 'Cm70', 'Cm71', 'Cm72', 'Cm73', 'Cm74', 'DM5', 'DM50', 'DM51', 'DM52', 'DM7', 'DM70', 'DM71', 'DM72', 'DM73', 'DM74', 'Dm5', 'Dm50', 'Dm51', 'Dm52', 'Dm7', 'Dm70', 'Dm71', 'Dm72', 'Dm73', 'Dm74', 'EM5', 'EM50', 'EM51', 'EM52', 'EM7', 'EM70', 'EM71', 'EM72', 'EM73', 'EM74', 'Em5', 'Em50', 'Em51', 'Em52', 'Em7', 'Em70', 'Em71', 'Em72', 'Em73', 'Em74', 'FM5', 'FM50', 'FM51', 'FM52', 'FM7', 'FM70', 'FM71', 'FM72', 'FM73', 'FM74', 'Fm5', 'Fm50', 'Fm51', 'Fm52', 'Fm7', 'Fm70', 'Fm71', 'Fm72', 'Fm73', 'Fm74', 'GM5', 'GM50', 'GM51', 'GM52', 'GM7', 'GM70', 'GM71', 'GM72', '

In [21]:
# Count how many label keys fall into each of the four chord classes.
tally = {"M5": 0, "m5": 0, "M7": 0, "m7": 0}

for name in labels:
    kind = name[1:3]
    if kind in tally:
        tally[kind] += 1

M5, m5, M7, m7 = (tally[key] for key in ("M5", "m5", "M7", "m7"))


print(f"M5 {M5}, m5 {m5}, M7 {M7}, m7 {m7}")

M5 43, m5 43, M7 42, m7 42


In [7]:
# One row per key of `labels`; the keys become the index and the values land
# in a single column that pandas names 0.
df = pd.DataFrame.from_dict(labels, orient='index')
# Persist the full (unsplit) multi-class label table; the index is written as the first CSV column.
df.to_csv('./data/multi-labels.csv')
# Bare expression: renders the frame as this cell's output.
df

Unnamed: 0,0
AM5,0
AM50,0
AM51,0
AM52,0
AM7,2
...,...
gM51,0
gM52,0
gm50,1
gm51,1


Make train/test/val split

In [8]:
# Attach the on-disk file name (index value + extension) to every row.
df['Filename'] = df.index + ".npz"

# Hold out 20% for test, then 25% of the remaining 80% for validation,
# i.e. roughly 60/20/20 train/val/test. Both splits are seeded.
# NOTE(review): the split is not stratified by label — confirm that is intended.
train_val, test = train_test_split(df, test_size=0.2, random_state=0)
train, val = train_test_split(train_val, test_size=0.25, random_state=0)

# Write each split without the index; the 'Filename' column identifies the rows.
for split_frame, csv_path in (
    (train, './data/train_multi.csv'),
    (val, './data/val_multi.csv'),
    (test, './data/test_multi.csv'),
):
    split_frame.to_csv(csv_path, index=False)

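*Optional: this step only moves the `.npz` files into per-split folders (`multitrain`, `multitest`, `multival`). The files are moved, not copied.*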

In [9]:
source = "./data/"

# Reload the multi-class split manifests from disk.
train_labels = pd.read_csv(os.path.join(source, "train_multi.csv"))
val_labels = pd.read_csv(os.path.join(source, "val_multi.csv"))
test_labels = pd.read_csv(os.path.join(source, "test_multi.csv"))

# NOTE(review): the binary section of this notebook moves the same file names
# out of "./data/" into train/test/val. On a top-to-bottom run those files are
# no longer in `source` when this cell executes — confirm the intended order,
# or restore the files first.
for split_frame, split_dir in (
    (train_labels, "multitrain"),
    (test_labels, "multitest"),
    (val_labels, "multival"),
):
    move_files(split_frame, source, os.path.join(source, split_dir))