In [1]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import random

Train / validation / test split

In [2]:
# Folder that holds the processed CSV files
folder_path = 'processed_data'

# Names of every CSV file found directly inside that folder
csv_files = [entry for entry in os.listdir(folder_path) if entry.endswith('.csv')]

# Same names with the extension stripped, in sorted order
csv_base_names = sorted(os.path.splitext(entry)[0] for entry in csv_files)

# Uncomment to inspect the base names
# print(csv_base_names)

In [3]:
def train_test_val_split(folder_path, train_size=0.7, val_size=0.15, test_size=0.15, random_seed=42):
    """
    Randomly splits the CSV files in a folder into train, validation, and test sets.

    The split is made at file level, so every CSV file lands in exactly one set.
    File names are sorted before shuffling so that a given ``random_seed`` yields
    the same split no matter in which order ``os.listdir`` reports the files.

    :param folder_path: Path to the folder containing the CSV files.
    :param train_size: Proportion of the files to use for training.
    :param val_size: Proportion of the files to use for validation.
    :param test_size: Nominal proportion of the files to use for testing. It is not
        used in the computation: the test set receives every file that is left over
        once the train and validation sets have been filled.
    :param random_seed: Random seed for reproducibility.
    :return: Three lists of CSV file names (relative to ``folder_path``, not full
        paths): train, validation, and test.
    """
    # os.listdir returns entries in arbitrary order; sort so the seed alone
    # determines the outcome of the shuffle.
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

    # Shuffle with a private generator so the global `random` state is left untouched.
    random.Random(random_seed).shuffle(csv_files)

    # Calculate the slice boundaries for each set
    total_files = len(csv_files)
    train_end = int(train_size * total_files)
    val_end = train_end + int(val_size * total_files)

    # Split the shuffled file names; the test set takes the remainder
    train_files = csv_files[:train_end]
    val_files = csv_files[train_end:val_end]
    test_files = csv_files[val_end:]

    return train_files, val_files, test_files

In [4]:
# NOTE: despite the *_data names, these are lists of CSV file names
# (train_test_val_split returns file names), not DataFrames.
train_data, val_data, test_data = train_test_val_split(folder_path)

In [5]:
import json

def save_list_to_json(file_path, data_list):
    """
    Saves a Python list into a JSON file.

    Any missing parent directory of ``file_path`` is created first, so the
    call does not fail just because the output folder has not been made yet.

    Args:
        file_path (str): Path to the JSON file to save the list.
        data_list (list): List to save in the JSON file.

    Raises:
        ValueError: If the input data is not a list.
    """
    if not isinstance(data_list, list):
        raise ValueError("Input data must be a list.")

    # open() does not create directories; make sure the target folder exists.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data_list, json_file)

In [7]:
# Persist the file-name list of each split to its own JSON file
split_json_targets = [
    ("train_test_split/train_csv_paths.json", train_data),
    ("train_test_split/val_csv_paths.json", val_data),
    ("train_test_split/test_csv_paths.json", test_data),
]
for json_path, split_file_names in split_json_targets:
    save_list_to_json(json_path, split_file_names)

In [54]:
def load_split_frames(file_names):
    """Read the given CSV files from ``folder_path`` and stack them into one DataFrame."""
    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in file_names]
    return pd.concat(frames, axis=0, ignore_index=True)


# train_test_val_split leaves lists of CSV file names in train_data / val_data /
# test_data, but this cell and the ones below use them as DataFrames. Load each
# split here; the isinstance guard keeps the cell safe to re-run.
if isinstance(train_data, list):
    train_data, val_data, test_data = (
        load_split_frames(names) for names in (train_data, val_data, test_data)
    )

train_data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,Y
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N
1,0.04644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N
2,0.09288,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N
3,0.13932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N
4,0.18576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,N


In [55]:
# Distinct chord labels present in the training data, in sorted order
unique_chords = list(train_data['Y'].unique())
unique_chords.sort()
print("all unique chords:\n", unique_chords)

# Each chord is encoded as its position in the sorted list
chord_to_int = dict(zip(unique_chords, range(len(unique_chords))))
print("\nMapping from chord to int:\n", chord_to_int)

# Reverse lookup: integer code back to chord name
int_to_chord = dict(enumerate(unique_chords))
print("\nMapping from int to chord:\n", int_to_chord)

all unique chords:
 ['A#:maj', 'A:maj', 'A:min', 'Ab:maj', 'Ab:min', 'B:maj', 'B:min', 'Bb:maj', 'Bb:min', 'C#:maj', 'C#:min', 'C:maj', 'C:min', 'Cb:maj', 'D#:maj', 'D#:min', 'D:maj', 'D:min', 'Db:maj', 'Db:min', 'E:maj', 'E:min', 'Eb:maj', 'Eb:min', 'F#:maj', 'F#:min', 'F:maj', 'F:min', 'Fb:maj', 'G#:maj', 'G#:min', 'G:maj', 'G:min', 'Gb:maj', 'Gb:min', 'N', 'X']

Mapping from chord to int:
 {'A#:maj': 0, 'A:maj': 1, 'A:min': 2, 'Ab:maj': 3, 'Ab:min': 4, 'B:maj': 5, 'B:min': 6, 'Bb:maj': 7, 'Bb:min': 8, 'C#:maj': 9, 'C#:min': 10, 'C:maj': 11, 'C:min': 12, 'Cb:maj': 13, 'D#:maj': 14, 'D#:min': 15, 'D:maj': 16, 'D:min': 17, 'Db:maj': 18, 'Db:min': 19, 'E:maj': 20, 'E:min': 21, 'Eb:maj': 22, 'Eb:min': 23, 'F#:maj': 24, 'F#:min': 25, 'F:maj': 26, 'F:min': 27, 'Fb:maj': 28, 'G#:maj': 29, 'G#:min': 30, 'G:maj': 31, 'G:min': 32, 'Gb:maj': 33, 'Gb:min': 34, 'N': 35, 'X': 36}

Mapping from int to chord:
 {0: 'A#:maj', 1: 'A:maj', 2: 'A:min', 3: 'Ab:maj', 4: 'Ab:min', 5: 'B:maj', 6: 'B:min', 7:

In [56]:
def encode_chord_labels(frame, mapping):
    """
    Return a copy of ``frame`` whose 'Y' chord column is replaced by an integer
    'Y_Encoded' column (appended as the last column).

    Raises:
        ValueError: If 'Y' holds a value that is not a key of ``mapping``.
    """
    encoded = frame['Y'].map(mapping)

    # Series.map yields NaN for values absent from the mapping. Fail loudly
    # rather than letting NaN labels (and a float column) flow downstream.
    unmapped = encoded.isna()
    if unmapped.any():
        unknown = frame.loc[unmapped, 'Y'].unique().tolist()
        raise ValueError(f"Chord labels missing from the chord mapping: {unknown}")

    return frame.drop(columns='Y').assign(Y_Encoded=encoded.astype('int64'))


# chord_to_int is built from the training split only, so validation/test labels
# that never occur in training are reported by the check above.
train_data = encode_chord_labels(train_data, chord_to_int)
val_data = encode_chord_labels(val_data, chord_to_int)
test_data = encode_chord_labels(test_data, chord_to_int)

In [58]:
# Preview the first rows: 'Y' should now be replaced by the integer 'Y_Encoded' column
train_data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,Y_Encoded
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35
1,0.04644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35
2,0.09288,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35
3,0.13932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35
4,0.18576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35


Split features and labels (train, validation, and test)

In [59]:
def split_features_labels(frame):
    """Return (features, labels): every column except 'Y_Encoded', and 'Y_Encoded' itself."""
    return frame.drop(columns='Y_Encoded'), frame['Y_Encoded']


train_X, train_Y = split_features_labels(train_data)
val_X, val_Y = split_features_labels(val_data)
test_X, test_Y = split_features_labels(test_data)

In [61]:
# Preview the training feature columns (the 'Y_Encoded' label column has been dropped)
train_X.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.04644,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.09288,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.13932,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.18576,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
# Preview the training labels (the 'Y_Encoded' column on its own)
train_Y.head()

0    35
1    35
2    35
3    35
4    35
Name: Y_Encoded, dtype: int64

In [63]:
training_data_dir = "train_data"

# to_csv does not create missing folders, so make sure the output folder exists.
os.makedirs(training_data_dir, exist_ok=True)

# Output file name -> table to write; features and labels of each split are
# stored in separate files, without the DataFrame index.
split_csv_outputs = {
    "train_x.csv": train_X,
    "train_y.csv": train_Y,
    "val_x.csv": val_X,
    "val_y.csv": val_Y,
    "test_x.csv": test_X,
    "test_y.csv": test_Y,
}
for csv_name, table in split_csv_outputs.items():
    table.to_csv(os.path.join(training_data_dir, csv_name), index=False)