This notebook contains some tools for dealing with chords up to harmonic equivalence. 

Mathematically, the space of (raw) chords as we are dealing with them is $X = \{0,1\}^{12}$, i.e. binary vectors of length 12. The cyclic group $G = \mathbb{Z}/12 \mathbb{Z}$ acts on this set by cyclically permuting vectors. Musically, this corresponds to transponsing all tones by a fixed number of semitones. We say that two chords are harmonically equvialent if they lie in the same orbit of this group action. That is, the set of chords up to harmonic equivalence is the quotient space $X/G$.

More generally, a chord progression or song is a finite sequence in $X$, and $G$ acts entry-wise on these sequences, and two songs are harmonically equivalent if they lie in the same $G$-orbit.

In [2]:
import pandas as pd
from collections import Counter
import time
import json

data_folder_path = '../../data/'
df = pd.read_csv(data_folder_path + 'clean_test.csv', low_memory=False)

In [3]:
sample_data = df.head(10)
display(sample_data)

Unnamed: 0,chords,simplified_chords,decade,main_genre,spotify_song_id
0,<intro_1> G A Fsmin Bmin G A Fsmin Bmin <verse...,"G,A,Fsmin,Bmin,G,A,Fsmin,Bmin,G,A,Fsmin,Bmin,G...",2010.0,pop,7vpGKEUPrA4UEsS4o4W1tP
1,C F G C F G F Dmin G C F Dmin G C F G C F G F ...,"C,F,G,C,F,G,F,Dmin,G,C,F,Dmin,G,C,F,G,C,F,G,F,...",2000.0,alternative,7MTpNQUBKyyymbS3gPuqwQ
2,C F C G Amin G F C F C G Amin G F C G C F C G ...,"C,F,C,G,Amin,G,F,C,F,C,G,Amin,G,F,C,G,C,F,C,G,...",2000.0,alternative,6jIIMhcBPRTrkTWh3PXIc7
3,Amin G Gmin B Amin G Gmin B Amin G Gmin B Amin...,"Amin,G,Gmin,B,Amin,G,Gmin,B,Amin,G,Gmin,B,Amin...",2010.0,pop,2zAfQdoOeYujy7QIgDUq9p
4,<verse_1> D Dmaj7 G/D A/D D Dmaj7 G/D A/D <cho...,"D,Dmaj7,G,A,D,Dmaj7,G,A,G,D,Emin,D,A,G,D,Emin,...",2010.0,metal,40rChMoUd1VXb4TKgTuTSP
5,<verse_1> Eb Gmin Ab Eb Gmin Ab Eb Gmin Ab Eb ...,"Eb,Gmin,Ab,Eb,Gmin,Ab,Eb,Gmin,Ab,Eb,Gmin,Ab,Eb...",2010.0,alternative,6Zc6CovSlkLcuqxkBgea0x
6,A Amin Emin A Amin Emin A Amin Emin A Amin Emi...,"A,Amin,Emin,A,Amin,Emin,A,Amin,Emin,A,Amin,Emi...",2010.0,metal,2p58AzW86Z0B0pXgE0K2NO
7,F G C Amin F G C Amin F G C Amin F G C Amin F ...,"F,G,C,Amin,F,G,C,Amin,F,G,C,Amin,F,G,C,Amin,F,...",2020.0,electronic,43jSmFYpG1mgJcLgIC53gF
8,<intro_1> C F G Amin Emin Dmin C F G Amin Emin...,"C,F,G,Amin,Emin,Dmin,C,F,G,Amin,Emin,Dmin,C,F,...",2000.0,rock,2O60Sr29rg9vACJXYVICEo
9,E Csmin Amin Gsmin E Csmin Amin E Csmin A E Cs...,"E,Csmin,Amin,Gsmin,E,Csmin,Amin,E,Csmin,A,E,Cs...",2000.0,pop rock,3Y3OcmUcS4jWsEu2PoSP31


In [4]:
# read the equivalence dictionary file
# this is a dictionary of dictionaries
#    the top-level keys are chord names (e.g. 'C','Amin')
#    the top-level values are dictionaries, whose keys are equivalent chords, and whose values are the semitone distance between the top-level key and the low-level key
with open(data_folder_path + 'harmonic_equivalence_dictionary.json') as file:
    equiv_dict = json.load(file)

In [5]:
def compare_chords(chord_1, chord_2):
    if chord_2 in equiv_dict[chord_1]:
        return (True, equiv_dict[chord_1][chord_2])
    else:
        return (False, None)

def compare_n_grams(n_gram_1, n_gram_2):
    list_1 = n_gram_1.split(',')
    list_2 = n_gram_2.split(',')

    # if they aren't the same length, we don't have to check anything
    if len(list_1) != len(list_2):
        return (False, None)

    # now we can assume they have the same length
    comparison = [compare_chords(list_1[i], list_2[i]) for i in range(len(list_1))]

    # if any pairs are not the same, return False
    for c in comparison:
        if not c[0]:
            return (False, None)

    # now we can assume every respective pair is equivalent, but we still need all of the distances to match
    dist_0 = comparison[0][1]
    for c in comparison:
        if c[1] != dist_0:
            return (False, None)

    return (True, dist_0)

In [6]:
def get_raw_n_gram_counts(chord_column, n):
    # compile a dictionary of counts
    results = Counter()
    for song in chord_column:
        song_as_list = song.split(',')
        song_n_grams = [','.join(song_as_list[i:i+n]) for i in range(len(song_as_list) - n + 1)]
        for ng in song_n_grams:
            results[ng] += 1
    return results

In [7]:
get_raw_n_gram_counts(sample_data['simplified_chords'],n=1)

Counter({'G': 133,
         'C': 107,
         'Amin': 93,
         'F': 79,
         'A': 63,
         'Gmin': 54,
         'Dmin': 38,
         'Eb': 37,
         'Ab': 37,
         'Bmin': 27,
         'D': 26,
         'Emin': 25,
         'Fsmin': 23,
         'B': 23,
         'Csmin': 14,
         'Cadd9': 8,
         'Csus4': 8,
         'E': 8,
         'Dmaj7': 5,
         'Gsmin': 5,
         'Cmin': 3,
         'Bb': 3,
         'Dsus4': 2,
         'As': 1,
         'Fmin': 1})

In [8]:
def get_unique_n_gram_counts(chord_column, n, progress_updates = False):
    raw_n_gram_counts = get_raw_n_gram_counts(chord_column, n)
    results = Counter()
    processed = set()
    if progress_updates:
        t0 = time.time()
        i = 0
        percent_complete = 0
        num_raw = len(raw_n_gram_counts)
        print("Finding counts of " + str(n) + "-grams up to harmonic equivalence.")
        print("There are " + str(num_raw) + " raw " + str(n) + "-grams to process.")
        progress_interval = int(num_raw /100)
    for ng1 in raw_n_gram_counts:
        if progress_updates:
            i += 1
            if i % progress_interval == 0 and i !=0:
                percent_complete += 1
                print("Processing " + str(n) + "-gram number " + str(i) + ".")
                print("\tTime spent so far:",time.time() - t0)
                print("\tComputation is " + str(percent_complete) + "% complete.")
        if ng1 in processed:
            continue
        total = raw_n_gram_counts[ng1]
        for ng2 in raw_n_gram_counts:
            if (ng2 not in processed) and ng1 != ng2:
                if compare_n_grams(ng1, ng2)[0]:
                    total += raw_n_gram_counts[ng2]
                    processed.add(ng2)
        results[ng1] = total
        processed.add(ng1)
    return results

In [9]:
get_unique_n_gram_counts(sample_data['simplified_chords'],n=1)

Counter({'G': 517, 'Fsmin': 283, 'Dsus4': 10, 'Cadd9': 8, 'Dmaj7': 5})

In [21]:
def get_n_gram_counts(n, chord_column, equivalence, timed =  True):
    t0 = time.time()
    if equivalence:
        #print("Finding counts of " + str(n) + "-grams up to harmonic equivalence.")
        results = get_unique_n_gram_counts(chord_column, n, progress_updates = True)
        print("Counting completed. Time taken (in seconds):",time.time() - t0)
        print("Number of harmonically distinct " + str(n) + "-grams:", len(results),'\n')
    else:
        print("Finding counts of raw " + str(n) + "-grams.")
        results = get_raw_n_gram_counts(chord_column, n)
        print("Counting completed. Time taken (in seconds):",time.time() - t0)
        print("Number of raw " + str(n) + "-grams:", len(results),'\n')
    return results

In [24]:
# This only needs to be run once per n, and only for n=1,2,3,4,5,6
# n's completed: 1, 2
n = 3
unique_n_gram_counts = get_n_gram_counts(n, chord_column = df['simplified_chords'], equivalence = True, timed = True)
with open(data_folder_path + 'harmonically_unique_' + str(n) + '_gram_counts.json','w') as f:
    json.dump(obj = unique_n_gram_counts, fp = f)

Finding counts of 3-grams up to harmonic equivalence.
There are 298213 raw 3-grams to process.


KeyboardInterrupt: 

In [26]:
raw_1_gram_counts = get_n_gram_counts(n=1, chord_column = df['simplified_chords'], equivalence = False, timed = True)
raw_2_gram_counts = get_n_gram_counts(n=2, chord_column = df['simplified_chords'], equivalence = False, timed = True)
raw_3_gram_counts = get_n_gram_counts(n=3, chord_column = df['simplified_chords'], equivalence = False, timed = True)
raw_4_gram_counts = get_n_gram_counts(n=4, chord_column = df['simplified_chords'], equivalence = False, timed = True)
raw_5_gram_counts = get_n_gram_counts(n=5, chord_column = df['simplified_chords'], equivalence = False, timed = True)
raw_6_gram_counts = get_n_gram_counts(n=6, chord_column = df['simplified_chords'], equivalence = False, timed = True)
raw_7_gram_counts = get_n_gram_counts(n=4, chord_column = df['simplified_chords'], equivalence = False, timed = True)
raw_8_gram_counts = get_n_gram_counts(n=5, chord_column = df['simplified_chords'], equivalence = False, timed = True)
raw_9_gram_counts = get_n_gram_counts(n=6, chord_column = df['simplified_chords'], equivalence = False, timed = True)

Finding counts of raw 1-grams.
Counting completed. Time taken (in seconds): 12.371426105499268
Number of raw 1-grams: 690 

Finding counts of raw 2-grams.
Counting completed. Time taken (in seconds): 14.557015657424927
Number of raw 2-grams: 42574 

Finding counts of raw 3-grams.
Counting completed. Time taken (in seconds): 15.321894884109497
Number of raw 3-grams: 298213 

Finding counts of raw 4-grams.
Counting completed. Time taken (in seconds): 15.965595722198486
Number of raw 4-grams: 888910 

Finding counts of raw 5-grams.
Counting completed. Time taken (in seconds): 16.25770592689514
Number of raw 5-grams: 1752485 

Finding counts of raw 6-grams.
Counting completed. Time taken (in seconds): 16.503804445266724
Number of raw 6-grams: 2764461 

Finding counts of raw 4-grams.
Counting completed. Time taken (in seconds): 15.865700960159302
Number of raw 4-grams: 888910 

Finding counts of raw 5-grams.
Counting completed. Time taken (in seconds): 16.573243379592896
Number of raw 5-gra