This notebook provides tools for adding one-hot columns based on whether a song contains an n-gram which is harmonically equivalent to a fixed n-gram.

In other words, given a dataframe with a "chords" column and given a particular n-gram (such as "C,F,G"), tabulate a binary column titled "contains_C,F,G" which is 

* 1 (`True`) if the song in that row contains any 1-4-5 chord progression, i.e. "C,F,G" itself or any transposition of it

* 0 (`False`) if not

The following loads a dictionary file which speeds up this process a lot. This dictionary contains all chords organized by harmonic equivalence. It is a dictionary of dictionaries. The top-level keys are chord names (such as 'Amin'), and the value associated with a top-level key is itself a dictionary whose keys are the equivalent chords (such as 'Bmin') and whose values give the distance in semitones from the top-level key up to the low-level key. So for example, within the 'Amin' dictionary, the value of 'Bmin' is 2, since B minor is harmonically equivalent to A minor and is two semitones higher.

In [3]:
# read the equivalence dictionary file
# this is a dictionary of dictionaries
#    the top-level keys are chord names (e.g. 'C','Amin')
#    the top-level values are dictionaries, whose keys are equivalent chords, and whose values are the semitone distance between the top-level key and the low-level key
# the encoding is given explicitly so that reading the file does not depend on the platform's default locale encoding
with open('../../data/harmonic_equivalence_dictionary.json', encoding='utf-8') as file:
    equiv_dict = json.load(file)

In [4]:
# example of what the 'C' dictionary looks like:
# every key is a chord that is harmonically equivalent to 'C', and its value is
# the number of semitones from 'C' up to that chord
print(equiv_dict['C'])

{'C': 0, 'Cs': 1, 'D': 2, 'Ds': 3, 'E': 4, 'Es': 5, 'Fs': 6, 'G': 7, 'Gs': 8, 'A': 9, 'As': 10, 'B': 11, 'Db': 1, 'Eb': 3, 'F': 5, 'Gb': 6, 'Ab': 8, 'Bb': 10}


In [5]:
# decide whether two chords are harmonically equivalent and, if so, how far apart they are
def compare_chords(chord_1, chord_2):
    """Compare two chord labels using the global equivalence dictionary.

    Both arguments are string labels for chords, such as 'C' or 'Amin', and
    both must be top-level keys of `equiv_dict` (this is asserted).

    Returns (True, num_semitones) when chord_2 is listed as equivalent to
    chord_1, where num_semitones is the distance from chord_1 (up) to chord_2.
    Returns (False, None) otherwise.
    """
    assert chord_1 in equiv_dict
    assert chord_2 in equiv_dict

    # all chords equivalent to chord_1, mapped to their semitone offset from it
    offsets_from_chord_1 = equiv_dict[chord_1]
    if chord_2 not in offsets_from_chord_1:
        return (False, None)
    return (True, offsets_from_chord_1[chord_2])

# quick sanity checks
assert compare_chords('C', 'D') == (True, 2)
assert compare_chords('C', 'E') == (True, 4)
assert compare_chords('C', 'Amin') == (False, None)

In [6]:
# decide whether two n-grams are transpositions of one another and, if so, by how many semitones
def compare_n_grams(n_gram_1, n_gram_2):
    """Compare two n-grams chord by chord.

    Both arguments are comma-separated strings of chord names, without
    inversion markings.

    Returns (True, num_semitones) when the n-grams have the same length, every
    pair of corresponding chords is equivalent, and every pair is separated by
    the same distance; num_semitones is that distance from n_gram_1 (up) to
    n_gram_2. Returns (False, None) in every other case.
    """
    chords_1 = n_gram_1.split(',')
    chords_2 = n_gram_2.split(',')

    # n-grams of different lengths can never be equivalent
    if len(chords_1) != len(chords_2):
        return (False, None)

    # compare the chords position by position (every pair is evaluated up front)
    pairwise = [compare_chords(first, second) for first, second in zip(chords_1, chords_2)]

    # a single non-equivalent pair rules the whole n-gram out
    if not all(is_equivalent for is_equivalent, _ in pairwise):
        return (False, None)

    # every pair is equivalent; the n-grams only match if all pairs are shifted by the same amount,
    # so use the shift of the first pair as the reference
    shift = pairwise[0][1]
    if any(distance != shift for _, distance in pairwise):
        return (False, None)

    return (True, shift)

# quick sanity checks
assert compare_n_grams('C,D,E', 'F,G') == (False, None)
assert compare_n_grams('C,D,E', 'F,G,A') == (True, 5)
assert compare_n_grams('C,D,E', 'F,G,B') == (False, None)

In [7]:
# return true/false depending on if a song contains a harmonically equivalent n_gram to the input n_gram
def contains_n_gram(song, n_gram):
    """Report whether a song contains the given n-gram or any transposition of it.

    Parameters
    ----------
    song : str
        Comma-separated string of chord names (e.g. 'C,F,G,Amin').
    n_gram : str
        Comma-separated string of chord names to search for (e.g. 'C,F,G').

    Returns
    -------
    bool
        True if some run of consecutive chords in `song` is equivalent to
        `n_gram` according to compare_n_grams, otherwise False.
    """
    # an empty n-gram is treated as trivially contained in any song
    if not n_gram:
        return True

    # an empty song has no chords, so it cannot contain a non-empty n-gram
    if not song:
        return False

    # skip ahead and return true if the literal/raw version is in the song
    # This isn't necessary to have, but it was added because it seemed to speed things up
    # Both strings are wrapped in commas so that only whole chord names can match;
    # a bare substring test would wrongly find 'G,A' inside 'G,Amin'
    if ',' + n_gram + ',' in ',' + song + ',':
        return True

    # split the song into a list of individual chord names
    song_as_list = song.split(',')
    n = len(n_gram.split(','))

    # iterate through the possible starting points of n-grams within the song
    # a song of m chords has m - n + 1 windows of length n; the "+ 1" makes sure
    # the window that ends on the last chord of the song is checked as well
    for i in range(len(song_as_list) - n + 1):
        song_n_gram = ','.join(song_as_list[i:i+n])
        if compare_n_grams(n_gram, song_n_gram)[0]:
            return True
    return False

assert(contains_n_gram('A,B,C,D,E,F,G','C,D'))
assert(contains_n_gram('A,B,C,D,E,F','F,G'))
assert(not(contains_n_gram('A,B,C,D,E,F','C,E')))
# the only match is the final window of the song ('G,A' is a transposition of 'F,G')
assert(contains_n_gram('C,E,G,A','F,G'))
# 'G,A' is a raw substring of 'G,Amin' but is not a chord-level match
assert(not(contains_n_gram('D,G,Amin','G,A')))

In [8]:
# build the one-hot column for a fixed n-gram from the chord column of our dataframe
def get_one_hot(chord_column, n_gram):
    """Apply contains_n_gram to every entry of `chord_column`.

    `chord_column` is the column of chord strings and `n_gram` is the
    comma-separated n-gram to look for. The result is whatever
    `chord_column.apply` returns: one contains_n_gram result per song.
    """
    def song_has_n_gram(song):
        # n_gram is fixed for the whole column; only the song varies
        return contains_n_gram(song, n_gram)

    return chord_column.apply(song_has_n_gram)

Below is an illustration of using the tool above.

In [10]:
import pandas as pd
# load the cleaned test data; its 'simplified_chords' column holds each song
# as a comma-separated string of chord names
df = pd.read_csv('../../data/clean_test.csv', low_memory=False)

In [11]:
# keep only the first 10 rows as a small sample for the demonstration
sample_data = df.head(10)
display(sample_data)

Unnamed: 0,chords,simplified_chords,decade,main_genre,spotify_song_id
0,<intro_1> G A Fsmin Bmin G A Fsmin Bmin <verse...,"G,A,Fsmin,Bmin,G,A,Fsmin,Bmin,G,A,Fsmin,Bmin,G...",2010.0,pop,7vpGKEUPrA4UEsS4o4W1tP
1,C F G C F G F Dmin G C F Dmin G C F G C F G F ...,"C,F,G,C,F,G,F,Dmin,G,C,F,Dmin,G,C,F,G,C,F,G,F,...",2000.0,alternative,7MTpNQUBKyyymbS3gPuqwQ
2,C F C G Amin G F C F C G Amin G F C G C F C G ...,"C,F,C,G,Amin,G,F,C,F,C,G,Amin,G,F,C,G,C,F,C,G,...",2000.0,alternative,6jIIMhcBPRTrkTWh3PXIc7
3,Amin G Gmin B Amin G Gmin B Amin G Gmin B Amin...,"Amin,G,Gmin,B,Amin,G,Gmin,B,Amin,G,Gmin,B,Amin...",2010.0,pop,2zAfQdoOeYujy7QIgDUq9p
4,<verse_1> D Dmaj7 G/D A/D D Dmaj7 G/D A/D <cho...,"D,Dmaj7,G,A,D,Dmaj7,G,A,G,D,Emin,D,A,G,D,Emin,...",2010.0,metal,40rChMoUd1VXb4TKgTuTSP
5,<verse_1> Eb Gmin Ab Eb Gmin Ab Eb Gmin Ab Eb ...,"Eb,Gmin,Ab,Eb,Gmin,Ab,Eb,Gmin,Ab,Eb,Gmin,Ab,Eb...",2010.0,alternative,6Zc6CovSlkLcuqxkBgea0x
6,A Amin Emin A Amin Emin A Amin Emin A Amin Emi...,"A,Amin,Emin,A,Amin,Emin,A,Amin,Emin,A,Amin,Emi...",2010.0,metal,2p58AzW86Z0B0pXgE0K2NO
7,F G C Amin F G C Amin F G C Amin F G C Amin F ...,"F,G,C,Amin,F,G,C,Amin,F,G,C,Amin,F,G,C,Amin,F,...",2020.0,electronic,43jSmFYpG1mgJcLgIC53gF
8,<intro_1> C F G Amin Emin Dmin C F G Amin Emin...,"C,F,G,Amin,Emin,Dmin,C,F,G,Amin,Emin,Dmin,C,F,...",2000.0,rock,2O60Sr29rg9vACJXYVICEo
9,E Csmin Amin Gsmin E Csmin Amin E Csmin A E Cs...,"E,Csmin,Amin,Gsmin,E,Csmin,Amin,E,Csmin,A,E,Cs...",2000.0,pop rock,3Y3OcmUcS4jWsEu2PoSP31


In [12]:
# n-gram to search for; the new column is named 'contains_' + n_gram_1
n_gram_1 = 'F,G,C'

# assign() returns a new dataframe with the one-hot column appended at the end.
# If the column already exists it is simply overwritten, so this cell can be
# re-run safely (an in-place insert with allow_duplicates=False raises a
# ValueError on the second run because the column is already there).
# The column name contains commas, so it is passed through a dict rather than as a keyword.
sample_data = sample_data.assign(**{
    'contains_' + n_gram_1: get_one_hot(chord_column = sample_data['simplified_chords'], n_gram = n_gram_1),
})

display(sample_data)

Unnamed: 0,chords,simplified_chords,decade,main_genre,spotify_song_id,"contains_F,G,C"
0,<intro_1> G A Fsmin Bmin G A Fsmin Bmin <verse...,"G,A,Fsmin,Bmin,G,A,Fsmin,Bmin,G,A,Fsmin,Bmin,G...",2010.0,pop,7vpGKEUPrA4UEsS4o4W1tP,False
1,C F G C F G F Dmin G C F Dmin G C F G C F G F ...,"C,F,G,C,F,G,F,Dmin,G,C,F,Dmin,G,C,F,G,C,F,G,F,...",2000.0,alternative,7MTpNQUBKyyymbS3gPuqwQ,True
2,C F C G Amin G F C F C G Amin G F C G C F C G ...,"C,F,C,G,Amin,G,F,C,F,C,G,Amin,G,F,C,G,C,F,C,G,...",2000.0,alternative,6jIIMhcBPRTrkTWh3PXIc7,False
3,Amin G Gmin B Amin G Gmin B Amin G Gmin B Amin...,"Amin,G,Gmin,B,Amin,G,Gmin,B,Amin,G,Gmin,B,Amin...",2010.0,pop,2zAfQdoOeYujy7QIgDUq9p,False
4,<verse_1> D Dmaj7 G/D A/D D Dmaj7 G/D A/D <cho...,"D,Dmaj7,G,A,D,Dmaj7,G,A,G,D,Emin,D,A,G,D,Emin,...",2010.0,metal,40rChMoUd1VXb4TKgTuTSP,True
5,<verse_1> Eb Gmin Ab Eb Gmin Ab Eb Gmin Ab Eb ...,"Eb,Gmin,Ab,Eb,Gmin,Ab,Eb,Gmin,Ab,Eb,Gmin,Ab,Eb...",2010.0,alternative,6Zc6CovSlkLcuqxkBgea0x,False
6,A Amin Emin A Amin Emin A Amin Emin A Amin Emi...,"A,Amin,Emin,A,Amin,Emin,A,Amin,Emin,A,Amin,Emi...",2010.0,metal,2p58AzW86Z0B0pXgE0K2NO,False
7,F G C Amin F G C Amin F G C Amin F G C Amin F ...,"F,G,C,Amin,F,G,C,Amin,F,G,C,Amin,F,G,C,Amin,F,...",2020.0,electronic,43jSmFYpG1mgJcLgIC53gF,True
8,<intro_1> C F G Amin Emin Dmin C F G Amin Emin...,"C,F,G,Amin,Emin,Dmin,C,F,G,Amin,Emin,Dmin,C,F,...",2000.0,rock,2O60Sr29rg9vACJXYVICEo,False
9,E Csmin Amin Gsmin E Csmin Amin E Csmin A E Cs...,"E,Csmin,Amin,Gsmin,E,Csmin,Amin,E,Csmin,A,E,Cs...",2000.0,pop rock,3Y3OcmUcS4jWsEu2PoSP31,False
