This notebook provides tools for adding one-hot columns that indicate whether a song contains an n-gram which is harmonically equivalent to a fixed n-gram.

In other words, given a dataframe with a "chords" column and a fixed n-gram (such as "C,F,G"), tabulate a binary column titled "contains_C,F,G" which is 1 if the song in that row contains any 1-4-5 chord progression (i.e. any transposition of "C,F,G"), and 0 if not.

In [27]:
# Load the harmonic-equivalence lookup table from disk.
# It is a dictionary of dictionaries:
#    - the top-level keys are chord names (e.g. 'C', 'Amin')
#    - each top-level value is a dictionary whose keys are the chords equivalent to the top-level key,
#      and whose values are the semitone distance between the top-level key and that equivalent chord
#      (for example, equiv_dict['C']['G'] is 7)
with open('harmonic_equivalence.json') as file:
    equiv_dict = json.load(file)

In [36]:
# show one entry of the table: every chord equivalent to 'C', mapped to its semitone distance from 'C'
print(equiv_dict['C'])

{'C': 0, 'Cs': 1, 'D': 2, 'Ds': 3, 'E': 4, 'Es': 5, 'Fs': 6, 'G': 7, 'Gs': 8, 'A': 9, 'As': 10, 'B': 11, 'Db': 1, 'Eb': 3, 'F': 5, 'Gb': 6, 'Ab': 8, 'Bb': 10}


In [29]:
def compare_chords(chord_1, chord_2):
    """Check whether two chords are harmonically equivalent.

    Returns (True, num_semitones) when chord_2 is listed as equivalent to
    chord_1 in equiv_dict, where num_semitones is the distance from chord_1
    (up) to chord_2 as stored in that table. Otherwise returns (False, None).

    Raises KeyError if chord_1 is not a top-level key of equiv_dict.
    """
    equivalents = equiv_dict[chord_1]
    if chord_2 not in equivalents:
        return (False, None)
    return (True, equivalents[chord_2])

# sanity checks: 'C' is equivalent to 'D' and to 'E', but not to 'Amin'
assert compare_chords('C', 'D')[0]
assert compare_chords('C', 'E')[0]
assert not compare_chords('C', 'Amin')[0]

In [30]:
def compare_n_grams(n_gram_1, n_gram_2):
    """Check whether two n-grams are harmonically equivalent.

    Both inputs are comma-separated strings of chord names. Returns
    (True, num_semitones) when the n-grams have the same length, every
    pair of chords in matching positions is equivalent, and all of those
    pairs are separated by the same distance; num_semitones is that
    distance from n_gram_1 (up) to n_gram_2. Otherwise returns (False, None).
    """
    chords_1 = n_gram_1.split(',')
    chords_2 = n_gram_2.split(',')

    # n-grams of different lengths can never be equivalent
    if len(chords_1) != len(chords_2):
        return (False, None)

    # compare the chords position by position (all pairs are evaluated up front)
    results = [compare_chords(first, second) for first, second in zip(chords_1, chords_2)]

    # every pair of chords has to be equivalent ...
    if not all(is_equivalent for is_equivalent, _ in results):
        return (False, None)

    # ... and every pair has to be shifted by the same number of semitones
    shift = results[0][1]
    if any(distance != shift for _, distance in results):
        return (False, None)

    return (True, shift)

# sanity checks: 'F,G,A' is 'C,D,E' moved up 5 semitones, while 'F,G,B' is not a transposition of it
assert compare_n_grams('C,D,E', 'F,G,A') == (True, 5)
assert compare_n_grams('C,D,E', 'F,G,B') == (False, None)

In [42]:
def contains_n_gram(song, n_gram):
    """Return whether a song contains an n-gram harmonically equivalent to n_gram.

    Uses the equivalence dictionary (through compare_n_grams) for lookups
    rather than doing interval calculations every time.

    Parameters:
        song: comma-separated string of chord names
        n_gram: comma-separated string of chord names

    Returns:
        True if n_gram occurs literally in the song, or if any run of
        consecutive chords in the song with the same length as n_gram is
        harmonically equivalent to it; False otherwise.
    """
    # Fast path: the literal n-gram appears in the song. Both strings are
    # wrapped in commas so the match has to line up with whole chord names;
    # a bare substring test would wrongly find 'C,F' inside 'G,C,Fmin'.
    if (',' + n_gram + ',') in (',' + song + ','):
        return True

    song_chords = song.split(',')
    n = len(n_gram.split(','))

    # Slide a window of n chords across the song. The "+ 1" makes sure the
    # final window (the last n chords of the song) is checked as well.
    for start in range(len(song_chords) - n + 1):
        window = ','.join(song_chords[start:start + n])
        if compare_n_grams(n_gram, window)[0]:
            return True
    return False

# sanity checks: a literal match, a transposed match ('F,G' is a whole step, like 'A,B'),
# and an interval ('C,E') that never occurs between neighbouring chords of the song
assert contains_n_gram('A,B,C,D,E,F,G', 'C,D')
assert contains_n_gram('A,B,C,D,E,F', 'F,G')
assert not contains_n_gram('A,B,C,D,E,F', 'C,E')

In [None]:
def get_one_hot(chord_column, n_gram):
    """Build the one-hot column for a fixed n-gram.

    Applies contains_n_gram to every song in chord_column (the chord column
    of the dataframe) and returns the result of that apply call, so each
    entry is True/False depending on whether the song contains an n-gram
    harmonically equivalent to n_gram.
    """
    def song_has_n_gram(song):
        return contains_n_gram(song, n_gram)

    return chord_column.apply(song_has_n_gram)