In [9]:
import numpy as np
import pandas as pd

Read training set:

In [10]:
# relative path of the final training set
path_to_train = '../../data/final_train.csv'
# low_memory=False is passed through to the parser (see the pandas read_csv docs)
df = pd.read_csv(filepath_or_buffer=path_to_train, low_memory=False)

Some final features to test:
* unique_chord_density: number of unique chords divided by total number of chords
* unique_5gram_density: number of unique 5-grams divided by total number of chords

2-, 3- and 4-gram densities are not included because they are fairly correlated with the unique chord (1-gram) and 5-gram densities.

In [11]:
#temporary helper functions to split each song into sections and compute unique n-gram densities

import re

# Regex matching section tags such as <verse_1>, <chorus_2>, <bridge>.
# Group 1 captures the text between the angle brackets.
TAG = re.compile(r"<\s*([^>]+?)\s*>", flags=re.IGNORECASE)


def song_split(chord_str: str):
    """Split a tagged chord string into a ``{section_name: chords}`` dictionary.

    Parameters
    ----------
    chord_str : str
        Chords separated by whitespace, optionally partitioned by tags like
        ``<verse_1>``. Any non-string value (``None``, the float NaN pandas
        uses for missing cells, ...) is treated as an empty string.

    Returns
    -------
    dict
        Maps each tag name to the stripped text that follows it, up to the next
        tag or the end of the string. Sections whose text is empty are omitted,
        and repeated tag names have their segments joined with a single space.
        If the string contains no tags, returns ``{"whole": <stripped string>}``.

    Notes
    -----
    When tags are present, any text that precedes the first tag is not
    included in the result.
    """
    # NaN is truthy, so `chord_str or ""` alone would let it through to .strip()
    s = chord_str.strip() if isinstance(chord_str, str) else ""

    matches = list(TAG.finditer(s))
    if not matches:
        return {"whole": s}  # no tags → treat the whole thing as one section

    # Each section ends where the next tag starts; the last one ends with the string.
    section_ends = [m.start() for m in matches[1:]] + [len(s)]

    chord_dict = {}
    for match, section_end in zip(matches, section_ends):
        name = match.group(1).strip()
        segment = s[match.end():section_end].strip()
        if not segment:  # only keep non-empty segments
            continue
        if name in chord_dict:
            # repeated section name: append to what was collected earlier
            chord_dict[name] += " " + segment
        else:
            chord_dict[name] = segment

    return chord_dict



def unique_n_density(dict, n):
    """Return the number of distinct n-grams divided by the total number of chords.

    Parameters
    ----------
    dict : dict
        Maps section names to whitespace-separated chord strings. (The name
        shadows the builtin; it is kept so existing callers are unaffected.)
    n : int
        Length of the chord n-grams to count.

    Returns
    -------
    float
        ``(# distinct n-grams) / (# chords)``. The sections are concatenated in
        dictionary order before n-grams are taken, so n-grams may span section
        boundaries. Returns 0.0 when there are no chords at all, matching the
        value already produced when the sequence is shorter than ``n``.
    """
    # flatten every section into one chord sequence
    chords = [chord for section in dict.values() for chord in section.split()]

    # guard against division by zero for songs without any chords
    if not chords:
        return 0.0

    # tuples are hashable, so a set gives the distinct n-grams without a quadratic scan
    distinct_ngrams = {tuple(chords[i:i + n]) for i in range(len(chords) - n + 1)}

    return len(distinct_ngrams) / len(chords)


# temporary helper column: per-song {section: chords} dictionary built from 'chords'
df.insert(1, 'chord_dict', df['chords'].apply(song_split))



In [12]:
# add the density features at column position 1 (1-grams first, then 5-grams)
df.insert(loc=1, column='unique_chord_density',
          value=df['chord_dict'].apply(unique_n_density, n=1))
df.insert(loc=1, column='unique_5gram_density',
          value=df['chord_dict'].apply(unique_n_density, n=5))
# the chord_dict helper column is no longer needed
df = df.drop(columns='chord_dict')


In [13]:
# show the first ten rows of the expanded dataset
df.head(n=10)

Unnamed: 0,chords,unique_5gram_density,unique_chord_density,simplified_chords,decade,main_genre,spotify_song_id
0,<intro_1> G A Fsmin Bmin G A Fsmin Bmin <verse...,0.043478,0.043478,"G,A,Fsmin,Bmin,G,A,Fsmin,Bmin,G,A,Fsmin,Bmin,G...",2010.0,pop,7vpGKEUPrA4UEsS4o4W1tP
1,C F G C F G F Dmin G C F Dmin G C F G C F G F ...,0.545455,0.151515,"C,F,G,C,F,G,F,Dmin,G,C,F,Dmin,G,C,F,G,C,F,G,F,...",2000.0,alternative,7MTpNQUBKyyymbS3gPuqwQ
2,C F C G Amin G F C F C G Amin G F C G C F C G ...,0.407407,0.12963,"C,F,C,G,Amin,G,F,C,F,C,G,Amin,G,F,C,G,C,F,C,G,...",2000.0,alternative,6jIIMhcBPRTrkTWh3PXIc7
3,Amin G Gmin B Amin G Gmin B Amin G Gmin B Amin...,0.058824,0.058824,"Amin,G,Gmin,B,Amin,G,Gmin,B,Amin,G,Gmin,B,Amin...",2010.0,pop,2zAfQdoOeYujy7QIgDUq9p
4,<verse_1> D Dmaj7 G/D A/D D Dmaj7 G/D A/D <cho...,0.530303,0.151515,"D,Dmaj7,G,A,D,Dmaj7,G,A,G,D,Emin,D,A,G,D,Emin,...",2010.0,metal,40rChMoUd1VXb4TKgTuTSP
5,<verse_1> Eb Gmin Ab Eb Gmin Ab Eb Gmin Ab Eb ...,0.116667,0.05,"Eb,Gmin,Ab,Eb,Gmin,Ab,Eb,Gmin,Ab,Eb,Gmin,Ab,Eb...",2010.0,alternative,6Zc6CovSlkLcuqxkBgea0x
6,A Amin Emin A Amin Emin A Amin Emin A Amin Emi...,0.454545,0.116883,"A,Amin,Emin,A,Amin,Emin,A,Amin,Emin,A,Amin,Emi...",2010.0,metal,2p58AzW86Z0B0pXgE0K2NO
7,F G C Amin F G C Amin F G C Amin F G C Amin F ...,0.05,0.05,"F,G,C,Amin,F,G,C,Amin,F,G,C,Amin,F,G,C,Amin,F,...",2020.0,electronic,43jSmFYpG1mgJcLgIC53gF
8,<intro_1> C F G Amin Emin Dmin C F G Amin Emin...,0.166667,0.043011,"C,F,G,Amin,Emin,Dmin,C,F,G,Amin,Emin,Dmin,C,F,...",2000.0,rock,2O60Sr29rg9vACJXYVICEo
9,E Csmin Amin Gsmin E Csmin Amin E Csmin A E Cs...,0.617021,0.12766,"E,Csmin,Amin,Gsmin,E,Csmin,Amin,E,Csmin,A,E,Cs...",2000.0,pop rock,3Y3OcmUcS4jWsEu2PoSP31


Check correlation between variables:

In [14]:
# pairwise correlation between the two new density features
features = ['unique_chord_density', 'unique_5gram_density']
df[features].corr(method='pearson')

Unnamed: 0,unique_chord_density,unique_5gram_density
unique_chord_density,1.0,0.618137
unique_5gram_density,0.618137,1.0


Judging by the correlation values and pair plots in the EDA file in the data folder, these features are only moderately correlated.

In [16]:
# write the expanded training set to disk without the index column
path_to_save = '../../data/density_expanded_final_train.csv'
df.to_csv(path_or_buf=path_to_save, index=False)