In [34]:
import numpy as np
import pandas as pd

Read training set:

In [35]:
# Training split location, relative to the directory this notebook runs from.
path_to_train = "../../data/final_train.csv"

# low_memory=False: parse the file in one pass rather than in internal chunks,
# so each column's dtype is inferred from the whole column.
df = pd.read_csv(filepath_or_buffer=path_to_train, low_memory=False)

Some final features to test:
* unique_chord_density: number of unique chords divided by total number of chords
* unique_2gram_density: number of unique 2-grams divided by total number of chords
* unique_5gram_density: number of unique 5-grams divided by total number of chords

3- and 4-gram densities are not included because they are fairly correlated with the 2- and 5-gram densities.

In [None]:
#temporary helper functions to split each song's chord string into sections and compute unique n-gram densities

import re

# Regex to capture tags like <verse_1>, <chorus_2>, <bridge>, etc.
# Group 1 is the tag name; whitespace just inside the angle brackets is excluded.
TAG = re.compile(r"<\s*([^>]+?)\s*>", flags=re.IGNORECASE)


def song_split(chord_str: str):
    """Split a tagged chord string into a ``{section name: chords}`` dictionary.

    Parameters
    ----------
    chord_str : str
        Chord sequence in which sections are introduced by tags such as
        ``<verse_1>`` or ``<chorus>``. ``None`` and non-string values (for
        example the float NaN pandas uses for a missing cell) are treated as
        an empty string.

    Returns
    -------
    dict
        Maps each tag name to the whitespace-stripped text that follows it, up
        to the next tag (or the end of the string).
        * If a tag name occurs more than once, its segments are joined with a
          single space, in order of appearance.
        * Tags followed by no text are omitted.
        * Text that appears before the first tag is not assigned to any section.
        * If the string contains no tags at all, the result is
          ``{"whole": <stripped string>}``.
    """
    # A missing CSV cell arrives as float NaN, which is truthy, so the usual
    # `chord_str or ""` idiom would let it through and `.strip()` would fail.
    s = chord_str.strip() if isinstance(chord_str, str) else ""

    tags = list(TAG.finditer(s))
    if not tags:
        return {"whole": s}  # no tags → treat the whole thing as one section

    chord_dict = {}
    for idx, tag in enumerate(tags):
        name = tag.group(1).strip()
        # A section runs from the end of its tag to the start of the next tag,
        # or to the end of the string for the last tag. Computing the bound
        # directly avoids a named sentinel that a real tag could collide with.
        section_end = tags[idx + 1].start() if idx + 1 < len(tags) else len(s)
        segment = s[tag.end():section_end].strip()
        if not segment:  # only keep non-empty segments
            continue
        # Repeated section names accumulate into one space-separated string.
        if name in chord_dict:
            chord_dict[name] += " " + segment
        else:
            chord_dict[name] = segment

    return chord_dict



#Given a sections:chords dictionary, return the number of unique n-grams divided by the total number of chords
def unique_n_density(dict,n):
    """Return the number of distinct chord n-grams divided by the total chord count.

    Parameters
    ----------
    dict : mapping of str -> str
        Section name to whitespace-separated chord string. The chords of all
        sections are concatenated in the mapping's iteration order, so n-grams
        may span section boundaries.
        (The name shadows the builtin ``dict``; it is kept so existing callers
        are unaffected.)
    n : int
        Length of the n-grams to count.

    Returns
    -------
    float
        ``(# distinct consecutive n-grams) / (# chords)``. Returns ``0.0`` when
        there are no chords at all, and also when there are fewer than ``n``
        chords (no n-gram exists).
    """
    # Flatten every section's chords into one sequence.
    chords = [chord for section in dict.values() for chord in section.split()]

    # Guard against songs with no chords, which would otherwise divide by zero.
    if not chords:
        return 0.0

    # Tuples are hashable, so a set gives the distinct-n-gram count in linear
    # time instead of repeatedly scanning a list of already-seen n-grams.
    distinct = {tuple(chords[i:i + n]) for i in range(len(chords) - n + 1)}

    return len(distinct) / len(chords)


# Temporary helper column: each song's chord string parsed by song_split,
# placed at column position 1.
chord_sections = df['chords'].apply(song_split)
df.insert(loc=1, column='chord_dict', value=chord_sections)



In [None]:
# Insert the density features. Each one goes in at column position 1, so the
# feature inserted last ends up left-most of the three.
for feature_name, gram_size in (
    ('unique_chord_density', 1),
    ('unique_2gram_density', 2),
    ('unique_5gram_density', 5),
):
    df.insert(1, feature_name, df['chord_dict'].apply(unique_n_density, n=gram_size))


In [None]:
#save expanded df
path_to_save='../../data/expanded_final_train.csv'
# 'chord_dict' is only a temporary helper column used to compute the density
# features; written to CSV it would become a column of stringified dicts.
# Drop it from the saved copy only (df itself is left untouched).
# errors='ignore' keeps this cell runnable if the column is absent.
df.drop(columns=['chord_dict'], errors='ignore').to_csv(path_to_save)