In [3]:
import numpy as np
import pandas as pd

In [4]:
# Load the raw chord dataset from disk.
# low_memory=False asks pandas to parse the file without its chunked,
# lower-memory mode (see the pandas read_csv documentation).
path_to_raw_data = '../../data/chordonomicon_raw.csv'
raw_df = pd.read_csv(filepath_or_buffer=path_to_raw_data, low_memory=False)

Since our target variables are decade, genre and popularity, remove all entries that have NA for any of the decade, main_genre and spotify_song_id (can be linked to Spotify data to get popularity metric) features.

In [5]:
# Standardize missing values: every NaN becomes pd.NA.
raw_df = raw_df.replace({np.nan: pd.NA})
# Keep only the rows in which all of the required fields are present.
# NOTE(review): the accompanying text names `decade` as a required field, but
# the filter checks `release_date` -- confirm which column is intended.
required_columns = ['release_date', 'spotify_song_id', 'main_genre']
clean_df = raw_df.dropna(subset=required_columns)

We will build all our predictive features from the chords feature, so filter the cleaned dataset to have four columns: chords, decade, main_genre and spotify_song_id

In [6]:
# Restrict the frame to the chord sequence plus the target-related columns.
kept_columns = ['chords', 'decade', 'main_genre', 'spotify_song_id']
clean_df = clean_df.loc[:, kept_columns]

The original dataset covers all decades from 1890-2020, but there is extreme class imbalance because there are few entries from the 1890s-1940s. Remove these decades.

In [7]:
# Keep only rows whose decade is strictly greater than 1940
# (i.e. drop the 1940s and everything earlier).
is_after_1940 = clean_df['decade'] > 1940
clean_df = clean_df.loc[is_after_1940]

If we are using n-grams of up to length 5 as features, then we need to remove all data entries having 5 or fewer total chords.

In [8]:
#helper functions to find the total number of chords listed in each song

import re

# Regex capturing section tags such as <verse_1>, <chorus_2>, <bridge>.
# Group 1 is the tag name without the angle brackets or surrounding whitespace.
TAG = re.compile(r"<\s*([^>]+?)\s*>", flags=re.IGNORECASE)


def song_split(chord_str: str):
    """Split a tagged chord string into a ``{section name: chords}`` dictionary.

    Parameters
    ----------
    chord_str : str
        Chords interleaved with section tags, e.g.
        ``"<intro_1> E D <verse_1> A B"``. Missing values (``None``, ``NaN``,
        ``pd.NA``) and any other non-string input are treated as an empty song.

    Returns
    -------
    dict
        Maps each tag name to the stripped text that follows it, up to the
        next tag or the end of the string. Segments of a repeated tag name are
        joined with a single space; tags followed by no chords are omitted.
        Text that precedes the first tag is discarded. If the string contains
        no tags at all, the result is ``{"whole": <stripped string>}``.
    """
    # `chord_str or ""` is not safe for missing values coming out of pandas:
    # pd.NA has an ambiguous truth value (TypeError) and NaN is truthy but has
    # no .strip() (AttributeError). Only real strings are parsed.
    s = chord_str.strip() if isinstance(chord_str, str) else ""

    tag_matches = list(TAG.finditer(s))
    if not tag_matches:
        return {"whole": s}  # no tags -> treat the whole thing as one section

    chord_dict = {}
    last = len(tag_matches) - 1
    for idx, match in enumerate(tag_matches):
        name = match.group(1).strip()
        # A section runs from the end of its tag to the start of the next tag,
        # or to the end of the string for the final tag.
        section_end = tag_matches[idx + 1].start() if idx < last else len(s)
        segment = s[match.end():section_end].strip()
        if not segment:
            continue  # only keep non-empty segments
        if name in chord_dict:
            # Repeated section name: append to what was already collected.
            chord_dict[name] += " " + segment
        else:
            chord_dict[name] = segment

    return chord_dict

# Count every chord across all sections of a sections:chords dictionary.
def total_chord_count(dict):
    """Return the total number of whitespace-separated chords in all sections.

    ``dict`` maps section names to chord strings. The parameter name shadows
    the builtin ``dict``; it is kept unchanged so existing callers still work.
    """
    return sum(len(section_chords.split()) for section_chords in dict.values())

# Make new temporary 'chord_dict' and 'total_chords' features.
# assign() returns a new frame, appends new columns at the end (positions 4 and
# 5 for the four-column frame built above) and overwrites same-named columns,
# so this cell can be re-run safely. DataFrame.insert would raise ValueError on
# a second run because the columns already exist.
clean_df = clean_df.assign(chord_dict=clean_df['chords'].apply(song_split))
clean_df = clean_df.assign(
    total_chords=clean_df['chord_dict'].apply(total_chord_count))

In [9]:
# Keep only songs with more than 5 chords in total (drops those with 5 or fewer).
has_enough_chords = clean_df['total_chords'] > 5
clean_df = clean_df.loc[has_enough_chords]

There is a single song in the dataset that contains an unrecognized chord, 'sC'; drop this entry.

In [10]:
# Exclude every row carrying this Spotify song id (the song with the
# unrecognized chord described above).
is_other_song = clean_df['spotify_song_id'] != '0cUssfb9LDMpEXy812iWCO'
final_df = clean_df.loc[is_other_song]

In [11]:
# Renumber the row index after the filtering steps have removed rows;
# drop=True discards the old index instead of keeping it as a column.
final_df=final_df.reset_index(drop=True)

In [12]:
# Remove the temporary helper columns (they were only needed for filtering).
temporary_columns = ['chord_dict', 'total_chords']
final_df = final_df.drop(temporary_columns, axis=1)

In [13]:
# add in a simplified chords column which doesn't have section labels, inversions, and is comma-separated
def replace_space_with_comma(my_string):
    """Return ``my_string`` with every single space character turned into a comma."""
    pieces = my_string.split(" ")
    return ",".join(pieces)

def remove_section_markers(my_string):
    """Return ``my_string`` with every ``<...>`` section marker removed.

    Each marker is dropped together with the single separator (``','`` or
    ``' '``) that directly follows it, if there is one. When the string ends
    with a marker, the separator left dangling in front of that marker is
    dropped as well, so the result never gains a trailing separator from the
    removal.

    An unterminated marker (a ``'<'`` with no later ``'>'``) discards the rest
    of the string from that ``'<'`` onward.

    Raises
    ------
    AssertionError
        If a ``'>'`` remains in the result (a stray ``'>'`` with no opening
        ``'<'`` before it).
    """
    separators = (',', ' ')
    kept = []
    pos = 0
    n = len(my_string)
    ended_on_marker = False
    while pos < n:
        start = my_string.find('<', pos)
        if start == -1:
            # No more markers: keep the remainder verbatim.
            kept.append(my_string[pos:])
            break
        kept.append(my_string[pos:start])
        end = my_string.find('>', start)
        if end == -1:
            # No closing '>': stop and drop everything from the '<' onward.
            ended_on_marker = True
            break
        pos = end + 1
        # Swallow the separator after the marker only if one is really there;
        # skipping unconditionally would eat the first character of a chord
        # that follows the marker directly.
        if pos < n and my_string[pos] in separators:
            pos += 1
        ended_on_marker = pos >= n
    result = ''.join(kept)
    # A marker at the very end leaves the separator that preceded it dangling.
    if ended_on_marker and result[-1:] in separators:
        result = result[:-1]
    assert '<' not in result
    assert '>' not in result
    return result
    
def remove_inversions(my_string):
    """Strip the slash part from every chord in a comma-separated string.

    Within each comma-delimited field, everything from the first ``'/'`` up to
    (but not including) the next comma is removed, e.g. ``"A/Cs,E"`` becomes
    ``"A,E"``. Commas themselves are always preserved.
    """
    # Keep only the text before the first '/' of each comma-delimited field.
    fields = my_string.split(',')
    return ','.join(field.partition('/')[0] for field in fields)

def clean_up_chord_string(my_string):
    """Produce the simplified chord string for one song.

    Applies, in order: spaces -> commas, removal of ``<...>`` section markers,
    removal of the ``/...`` inversion part of each chord.
    """
    comma_separated = replace_space_with_comma(my_string)
    without_sections = remove_section_markers(comma_separated)
    return remove_inversions(without_sections)

# Compute the simplified chord string for every song, then place it as the
# second column (position 1); allow_duplicates=False makes insert raise if a
# column with this name already exists.
simplified_chords = final_df['chords'].apply(clean_up_chord_string)
final_df.insert(1, 'simplified_chords', simplified_chords, allow_duplicates=False)

In [14]:
# Inspect the final dataset: display its first 10 rows.
final_df.head(10)

Unnamed: 0,chords,simplified_chords,decade,main_genre,spotify_song_id
0,<intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...,"E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,C,E,G,D,A,...",2000.0,metal,2ffJZ2r8HxI5DHcmf3BO6c
1,<intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...,"Csmin,A,Csmin,A,Csmin,A,Csmin,A,B,Csmin,A,Fsmi...",2000.0,metal,5KiY8SZEnvCPyIEkFGRR3y
2,<intro_1> C <verse_1> G C G C <chorus_1> F Dmi...,"C,G,C,G,C,F,Dmin,G,Dmin,G,C,G,C,F,Dmin,G,Dmin,...",2020.0,pop,3zUecdrWC3IqrNSjhnoF3G
3,<intro_1> G Bmin Amin D G Bmin <verse_1> Amin ...,"G,Bmin,Amin,D,G,Bmin,Amin,D,G,Emin,Amin,D,G,Em...",2020.0,pop,1gh9q0HsS3tVXQypDXp4gf
4,<intro_1> Fsmin Fsno3d Bno3d E/B Fsno3d Bno3d ...,"Fsmin,Fsno3d,Bno3d,E,Fsno3d,Bno3d,E,Fsmin,B,As...",2020.0,pop,4y3uAOMHISJ3OOdjPC1FFN
5,<chorus_1> C Amin Dmin G C G Amin Dmin G C <ve...,"C,Amin,Dmin,G,C,G,Amin,Dmin,G,C,Dmin,C,Dmin,C,...",2020.0,pop,7FPREUUChbE5dPuDXAzjFz
6,<chorus_1> Amin G F G Amin Fmaj7 Amin G Amin G...,"Amin,G,F,G,Amin,Fmaj7,Amin,G,Amin,G,Amin,G,Ami...",2020.0,electronic,6IgzeUSIN04yVLvZdxZ6iI
7,<intro_1> Cmaj7 C Cmaj7 C <verse_1> G D Emin D...,"Cmaj7,C,Cmaj7,C,G,D,Emin,D,Emin,Amin,D,G,C,Cma...",2020.0,pop,6DneB3VYsHvIYdsfFoI1e6
8,<intro_1> D G D G D G D G D G D Bb C D G D Bb ...,"D,G,D,G,D,G,D,G,D,G,D,Bb,C,D,G,D,Bb,C,D,G,D,Bb...",2000.0,rock,3ouHmB2wtv5CzbsWlAGZzq
9,Amin7 Emin Gmaj7 Amin Cmaj7 Emin7 Cmaj7 Amin7 ...,"Amin7,Emin,Gmaj7,Amin,Cmaj7,Emin7,Cmaj7,Amin7,...",2000.0,soul,3a3Zhb4QJx7r3yxw4j9VVP


In [15]:
# Display (number of rows, number of columns) of the final dataset.
final_df.shape

(300713, 5)

Create the final train/test split. An 85-15 split will produce a train set of size 255,606 and a test set of size 45,107.

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 15% of the rows as the test set; random_state is fixed so the split
# is the same on every run.
final_train, final_test = train_test_split(
    final_df,
    test_size=0.15,
    random_state=0,
)

In [None]:
# Write the train and test splits to CSV, without the row index.
path_to_train_data = '../../data/final_train.csv'
path_to_test_data = '../../data/final_test.csv'
for split_df, out_path in ((final_train, path_to_train_data),
                           (final_test, path_to_test_data)):
    split_df.to_csv(out_path, index=False)