In [17]:
import numpy as np
import pandas as pd

Read csv and standardize NaNs:

In [18]:
# Source CSV downloaded from https://huggingface.co/datasets/ailsntua/Chordonomicon/blob/main/chordonomicon_v2.csv
# Read the file, then swap every NaN for pd.NA in the same expression.
raw_df = (
    pd.read_csv('../../data/chordonomicon_raw.csv')
    .replace({np.nan: pd.NA})
)


  raw_df=pd.read_csv('../../data/chordonomicon_raw.csv', low_memory=False)


Since we are analyzing release dates, main genre, and popularity (via Spotify song id), remove any entries that have none of these features (an entry is kept as long as at least one of them is present). In the end, it seems that only 70% of the data could be useful to us.

In [19]:
# Keep a row when at least one of the three analysis columns is populated;
# rows where all three are missing are left out.
analysis_cols = ['release_date', 'spotify_song_id', 'main_genre']
has_any_feature = raw_df[analysis_cols].notna().any(axis=1)
clean_df = raw_df[has_any_feature]

print(clean_df.shape)
clean_df.head()

(478709, 10)


Unnamed: 0,id,chords,release_date,genres,decade,rock_genre,artist_id,main_genre,spotify_song_id,spotify_artist_id
0,1,<intro_1> C <verse_1> F C E7 Amin C F C G7 C F...,,'classic country pop',,,artist_1,pop,,4AIEGdwDzPELXYgM5JaEY5
1,2,<intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...,2003-01-01,'alternative metal' 'alternative rock' 'nu met...,2000.0,pop rock,artist_2,metal,2ffJZ2r8HxI5DHcmf3BO6c,694QW15WkebjcrWgQHzRYF
2,3,<intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...,2003-01-01,'alternative metal' 'canadian rock' 'funk meta...,2000.0,canadian rock,artist_3,metal,5KiY8SZEnvCPyIEkFGRR3y,0niJkG4tKkne3zwr7I8n9n
3,4,<intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...,2022-09-23,,2020.0,,artist_4,,01TtAcUqyLCRBZq4ZZiQWS,17BfKBemmMGO5ZAK25wraW
4,5,<intro_1> C <verse_1> G C G C <chorus_1> F Dmi...,2023-02-10,'modern country pop',2020.0,,artist_5,pop,3zUecdrWC3IqrNSjhnoF3G,4GGfAshSkqoxpZdoaHm7ky


The genres and, by extension, rock_genre labels seem too specific to be useful. To confirm, check how many unique genres are listed and view some of them.

In [20]:
#find number of unique genres

import random
#each genres entry is a single string of quoted names (e.g. "'alternative metal' 'alternative rock'"),
#so split on the "' '" that separates consecutive names
newlist=[entry.split("' '") for entry in clean_df[clean_df.genres.notna()].genres]
#unnest all the components into one list, stripping the quote left on the outer names
unnested_list=[j.strip("'") for i in newlist for j in i]
#analyze unique elements; sorted so the sampling population has a stable order
unique_genres=sorted(set(unnested_list))
print(len(unique_genres))
#a fixed seed plus a sorted population makes this preview identical on every run
#(an unseeded sample over a set changes between kernel restarts);
#min() guards against a ValueError when fewer than 20 genres exist
genre_rng=random.Random(0)
print(genre_rng.sample(unique_genres,min(20,len(unique_genres)))) #random sample of unique genres

4992
['russian hardcore', 'kenyan hip hop', 'danish singer-songwriter', 'australian indie folk', 'norwegian hardcore', 'singaporean electronic', 'suomi rock', 'wave', 'gospel amapiano', 'blackened deathcore', 'irish pop', 'oriental metal', 'urbano chileno', 'futuristic swag', 'chilean rock', 'nice indie', 'street punk espanol', 'louisville underground', 'barbadian pop', 'nordic soundtrack']


There are 4992 genres and a quick glance shows many are quite specific. This information is likely too precise to be useful and will be dropped. By extension, also drop rock_genre feature. Finally, drop the 'id' feature because this has no useful information.

In [21]:
# Remove the columns that will not be used in the analysis.
# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone, and without it drop() would raise KeyError.
clean_df=clean_df.drop(columns=['id','genres','rock_genre'],errors='ignore')
clean_df.head()

Unnamed: 0,chords,release_date,decade,artist_id,main_genre,spotify_song_id,spotify_artist_id
0,<intro_1> C <verse_1> F C E7 Amin C F C G7 C F...,,,artist_1,pop,,4AIEGdwDzPELXYgM5JaEY5
1,<intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...,2003-01-01,2000.0,artist_2,metal,2ffJZ2r8HxI5DHcmf3BO6c,694QW15WkebjcrWgQHzRYF
2,<intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...,2003-01-01,2000.0,artist_3,metal,5KiY8SZEnvCPyIEkFGRR3y,0niJkG4tKkne3zwr7I8n9n
3,<intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...,2022-09-23,2020.0,artist_4,,01TtAcUqyLCRBZq4ZZiQWS,17BfKBemmMGO5ZAK25wraW
4,<intro_1> C <verse_1> G C G C <chorus_1> F Dmi...,2023-02-10,2020.0,artist_5,pop,3zUecdrWC3IqrNSjhnoF3G,4GGfAshSkqoxpZdoaHm7ky


Create a new feature 'chord_dict' by using Juan's song_split function to separate the song sections in the 'chords' feature into dictionary entries:

In [22]:
#copy over Juan's code

import re

# regex to capture tags like <verse 1>, <chorus_2>, <bridge>, etc.
TAG = re.compile(r"<\s*([^>]+?)\s*>", flags=re.IGNORECASE)


def song_split(chord_str: str):
    """Split a chord string partitioned into tagged sections into a dictionary.

    Parameters
    ----------
    chord_str : str
        Chords with section tags, e.g. ``"<intro_1> C <verse_1> F C E7"``.
        Any non-string value (None, float NaN, pd.NA) is treated as an
        empty string.

    Returns
    -------
    dict
        ``{section name: chords in that section}``. When the text contains no
        tags the result is ``{"whole": <stripped text>}``. Sections whose chord
        text is empty are omitted.

    Notes
    -----
    * Text that appears before the first tag is not included in the result.
    * If two tags share the same name, the later section overwrites the
      earlier one in the returned dictionary.
    """
    # `chord_str or ""` raises TypeError for pd.NA (its truth value is ambiguous)
    # and lets a float NaN through to .strip(); test the type explicitly instead.
    s = chord_str.strip() if isinstance(chord_str, str) else ""

    # find all tags and their spans: (name, tag start, tag end)
    spans = [(m.group(1).strip(), m.start(), m.end()) for m in TAG.finditer(s)]
    if not spans:
        return {"whole": s}   # no tags → treat the whole thing as one section

    chord_dict = {}
    for idx, (name, _, tag_end) in enumerate(spans):
        # a section runs from the end of its tag to the start of the next tag,
        # or to the end of the string for the last tag
        next_start = spans[idx + 1][1] if idx + 1 < len(spans) else len(s)
        segment = s[tag_end:next_start].strip()
        if segment:                     # only keep non-empty segments
            chord_dict[name] = segment

    return chord_dict

# make new 'chord_dict' feature
# DataFrame.insert raises ValueError when the column already exists, so first drop
# any copy left by an earlier run of this cell; the cell can then be re-executed.
clean_df=clean_df.drop(columns=['chord_dict'],errors='ignore')
clean_df.insert(1,'chord_dict',clean_df['chords'].apply(song_split))



Create some more basic features:
* num_sections: number of sections the song is split into (verses, choruses, etc.)
* tot_chords: the total number of chords listed in the progression
* tot_unique_chords: number of unique chords appearing in the progression

In [23]:
# create num_sections: the number of entries in each song's chord_dict
# Drop a column left by an earlier run first, because DataFrame.insert raises
# ValueError when the column already exists.
clean_df=clean_df.drop(columns=['num_sections'],errors='ignore')
clean_df.insert(2,'num_sections',clean_df['chord_dict'].apply(len))


In [24]:
#helper functions to make tot_chords and tot_unique chords

def total_chord_count(dict):
    """Return how many chords appear across all sections of a song.

    `dict` maps section name -> whitespace-separated chord string; every
    occurrence of a chord is counted, including repeats.
    """
    # count the tokens of each section's progression and add them up
    return sum(len(progression.split()) for progression in dict.values())
   
def unique_chord_count(dict):
    """Return how many distinct chords appear across all sections of a song.

    `dict` maps section name -> whitespace-separated chord string; a chord
    that occurs in several sections is counted once.
    """
    distinct_chords = set()
    for progression in dict.values():
        # add every chord token of this section to the running set
        distinct_chords.update(progression.split())
    return len(distinct_chords)



In [25]:
#create tot_chords and tot_unique_chords
#drop columns left by an earlier run first: DataFrame.insert raises ValueError
#when the column already exists, which made this cell fail on re-execution
clean_df=clean_df.drop(columns=['tot_chords','tot_unique_chords'],errors='ignore')
clean_df.insert(3,'tot_chords',clean_df['chord_dict'].apply(total_chord_count))
clean_df.insert(4,'tot_unique_chords',clean_df['chord_dict'].apply(unique_chord_count))

A naive measure of song complexity would be the ratio of the number of unique chords to the total number of chords. Add this feature to the data as unique_chord_density.

In [None]:
# unique_chord_density = tot_unique_chords / tot_chords (element-wise per song)
# Drop a column left by an earlier run first, because DataFrame.insert raises
# ValueError when the column already exists.
clean_df=clean_df.drop(columns=['unique_chord_density'],errors='ignore')
clean_df.insert(5,'unique_chord_density',clean_df['tot_unique_chords']/clean_df['tot_chords'])
# insert() returns None, so end the cell with the frame to display the preview table
clean_df.head()

Unnamed: 0,chords,chord_dict,num_sections,tot_chords,tot_unique_chords,unique_chord_density,release_date,decade,artist_id,main_genre,spotify_song_id,spotify_artist_id
0,<intro_1> C <verse_1> F C E7 Amin C F C G7 C F...,"{'intro_1': 'C', 'verse_1': 'F C E7 Amin C F C...",6,67,11,0.164179,,,artist_1,pop,,4AIEGdwDzPELXYgM5JaEY5
1,<intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...,"{'intro_1': 'E D A/Cs E D A/Cs', 'verse_1': 'E...",8,122,6,0.04918,2003-01-01,2000.0,artist_2,metal,2ffJZ2r8HxI5DHcmf3BO6c,694QW15WkebjcrWgQHzRYF
2,<intro_1> Csmin <verse_1> A Csmin A Csmin A Cs...,"{'intro_1': 'Csmin', 'verse_1': 'A Csmin A Csm...",7,56,7,0.125,2003-01-01,2000.0,artist_3,metal,5KiY8SZEnvCPyIEkFGRR3y,0niJkG4tKkne3zwr7I8n9n
3,<intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...,"{'intro_1': 'D Dmaj7 D Dmaj7', 'verse_1': 'Emi...",9,138,6,0.043478,2022-09-23,2020.0,artist_4,,01TtAcUqyLCRBZq4ZZiQWS,17BfKBemmMGO5ZAK25wraW
4,<intro_1> C <verse_1> G C G C <chorus_1> F Dmi...,"{'intro_1': 'C', 'verse_1': 'G C G C', 'chorus...",8,39,5,0.128205,2023-02-10,2020.0,artist_5,pop,3zUecdrWC3IqrNSjhnoF3G,4GGfAshSkqoxpZdoaHm7ky


In [27]:
# Optional export of the cleaned frame, left disabled.
# NOTE(review): the path below uses a backslash; if this line is re-enabled, a forward
# slash or a raw string avoids Python reading "\c" as an (invalid) escape sequence.
#clean_df.to_csv('.\clean.csv')