The goal of this notebook is to compile a list of chords by frequency across the entire database.
My main goal is to count chords up to harmonic equivalence (treating transposed versions of the same chord as one chord), but I will also count them literally, by chord name.

In [None]:
# importing basic packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast
import copy
from collections import Counter, deque

# load the full data set from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer='../data/chordonomicon.csv', low_memory=False)

In [None]:
# Read the mapping CSV file
chord_relations = pd.read_csv('../data/chords_mapping.csv')

# Validate that no chord label is listed twice. This has to be checked on the CSV
# column itself: once the labels are dictionary keys they are unique by construction,
# so a check on the dictionary could never fail.
assert chord_relations['Chords'].is_unique, "duplicate chord labels in chords_mapping.csv"

# Create a dictionary with keys the "chords" and values the "degrees".
# The degrees are stored in the CSV as text, so each one is parsed from its
# literal string form into a Python object with ast.literal_eval.
chord_degrees = {
    chord: ast.literal_eval(degrees)
    for chord, degrees in zip(chord_relations['Chords'], chord_relations['Degrees'])
}

# full list of chords from the chords_mapping csv
known_chords = list(chord_degrees.keys())

# some examples of what the string labels for known chords look like
print(known_chords[0:10])

In [None]:
# drop all columns except for chords and genres
# (.copy() makes chord_data an independent DataFrame, so the later in-place
# assignment to its 'chords' column does not write into a slice of df)
chord_data = df[['chords','genres']].copy()
chord_data.sample(5)

In [None]:
# turn a space-separated string into a comma-separated one
def replace_space_with_comma(my_string):
    """Return my_string with every space character replaced by a comma."""
    return ','.join(my_string.split(' '))

# Remove section markers
def remove_section_markers(my_string):
    """Return my_string with every '<...>' section marker removed.

    The string is expected to be comma-separated already (this function is
    applied after replace_space_with_comma). Each marker is dropped together
    with the single comma that directly follows it, if there is one; any other
    character after the closing '>' is kept.

    If a '<' has no closing '>', everything from that '<' onwards is dropped.

    Raises:
        AssertionError: if a '>' without a matching '<' is left in the result.
    """
    result = []
    i = 0
    n = len(my_string)
    while i < n:
        if my_string[i] == '<':
            j = my_string.find('>', i)
            if j == -1:
                break  # no closing '>', stop and discard the rest
            i = j + 1  # continue right after the closing '>'
            # only swallow the separator that belongs to the marker; never
            # blindly skip a character, which could be the start of a chord
            if i < n and my_string[i] == ',':
                i += 1
        else:
            result.append(my_string[i])
            i += 1
    cleaned = ''.join(result)
    # '<' can never be copied into the result by the loop above, so only a
    # stray '>' needs to be checked for
    assert '>' not in cleaned
    return cleaned

# Remove inversions
def remove_inversions(my_string):
    """Return my_string with the inversion suffix stripped from every chord.

    my_string is treated as a comma-separated list. Within each item,
    everything from the first '/' up to (but not including) the next comma is
    removed; the commas themselves are all kept.
    """
    return ','.join(chord.partition('/')[0] for chord in my_string.split(','))

def clean_up_chord_string(my_string):
    """Run the three clean-up steps on one chord string and return the result.

    Steps, in order: spaces become commas, section markers are removed, and
    inversions are removed.
    """
    comma_separated = replace_space_with_comma(my_string)
    without_markers = remove_section_markers(comma_separated)
    return remove_inversions(without_markers)

# clean up the chord string in every row of the data: spaces are replaced with commas,
# and section markers and inversions are removed (see clean_up_chord_string)
chord_data.loc[:,'chords'] = chord_data['chords'].apply(clean_up_chord_string)

In [None]:
# show 5 randomly chosen rows to eyeball the cleaned-up chord strings
chord_data.sample(n=5)

In [None]:
# compile a flat list of every chord occurrence in the data set
# (duplicates are kept on purpose, so that they can be counted afterwards)
list_of_chord_lists = list(chord_data.chords)
giant_chord_string = ','.join(list_of_chord_lists)
# Drop *every* empty token, not just the first one: consecutive or trailing commas
# in the cleaned strings each produce an empty string when splitting, and an empty
# string is not a chord. Filtering also works when there is no empty token at all.
data_set_chords_with_duplicates = [
    chord for chord in giant_chord_string.split(',') if chord != ''
]

In [None]:
# count how often each chord name appears in the database:
# keys are chord names, values are the number of occurrences
data_set_chord_counts = Counter()
data_set_chord_counts.update(data_set_chords_with_duplicates)

In [None]:
# pie chart with one slice per chord name (no harmonic grouping), labelled by name
plt.pie(
    list(data_set_chord_counts.values()),
    labels=list(data_set_chord_counts.keys()),
)
plt.show()

In [None]:
# scatter plot of the natural log of the chord counts, sorted in ascending order
plt.scatter(
    x=np.arange(len(data_set_chord_counts)),
    y=np.log(sorted(data_set_chord_counts.values())),
    marker='.',
)
plt.show()

Next goal: compile a list of chord counts by harmonic equivalence.

In [None]:
# method to transpose a chord in vector format
def transpose_chord_up(chord_vector, num_semitones):
    """Return a new list with the entries of chord_vector rotated cyclically.

    Each entry moves num_semitones positions towards the end of the vector,
    wrapping around to the start; a negative num_semitones rotates the other
    way. The input is not modified.
    """
    notes = list(chord_vector)
    if not notes:
        return notes
    shift = num_semitones % len(notes)
    if shift == 0:
        return notes
    # the last `shift` entries wrap around to the front
    return notes[-shift:] + notes[:-shift]
    
# method to return true if chord_1 and chord_2 are just transposed versions of each other
def is_harmonic_equivalent(chord_1, chord_2):
    """Return True if some rotation of chord_2 by 0 to 11 steps equals chord_1.

    Both chords are given in vector format. Chords whose entries add up to
    different totals are rejected immediately, without trying any rotation.
    """
    # different totals means different numbers of notes, so no transposition can match
    if sum(chord_1) != sum(chord_2):
        return False

    # otherwise try each of the 12 possible transpositions of chord_2
    return any(
        np.array_equal(chord_1, transpose_chord_up(chord_2, shift))
        for shift in range(12)
    )

# sanity checks: the C, D and E chords from the mapping must all be
# harmonically equivalent to one another, and so must Cmaj7 and Dmaj7
C, D, E = (chord_degrees[label] for label in ('C', 'D', 'E'))
assert all(
    is_harmonic_equivalent(first, second)
    for first, second in ((C, D), (C, E), (D, E))
)

Cmaj7, Dmaj7 = chord_degrees['Cmaj7'], chord_degrees['Dmaj7']
assert is_harmonic_equivalent(Cmaj7, Dmaj7)

The next code block compiles a counter of harmonically unique chords.

In [None]:
# Group the raw chord counts into classes of harmonically equivalent chords.
# Each class is keyed by its most frequent chord name, and its value is the total
# count of all chord names in the class.
# Work on a copy, because entries are deleted as they get assigned to a class.
chord_counts_copy = copy.deepcopy(data_set_chord_counts)
harmonically_unique_chord_counts = Counter()

# every pass removes at least chord_1 from the copy, so the loop terminates
while len(chord_counts_copy) > 0:
    # pick the most common chord still left, and retrieve a vector version of it
    # (the lookup raises KeyError if the chord name is not a key of chord_degrees)
    chord_1 = chord_counts_copy.most_common(1)[0][0]
    chord_1_vec = chord_degrees[chord_1]

    # start compiling a list of equivalent chords, and a running total of their counts
    equivalent_chords = [chord_1]
    running_total = chord_counts_copy[chord_1]

    # remove the chord from the remaining counts so it is not compared against itself
    del chord_counts_copy[chord_1]
    
    # go through and find all harmonically equivalent chords to the most common chord left
    for chord_2 in chord_counts_copy:
        chord_2_vec = chord_degrees[chord_2]
        if is_harmonic_equivalent(chord_1_vec, chord_2_vec):
            equivalent_chords.append(chord_2)
            # the count is read from the untouched original counter
            running_total = running_total + data_set_chord_counts[chord_2]

    # set the value in the new counter to be the accumulated sum
    harmonically_unique_chord_counts[chord_1] = running_total

    # delete all the chords that were just merged into this class from chord_counts_copy
    # (chord_1 is in the list but already deleted; deleting a missing key from a
    # Counter does not raise)
    for ec in equivalent_chords:
        del chord_counts_copy[ec]

print("Number of harmonically unique chords:",len(harmonically_unique_chord_counts))
print(harmonically_unique_chord_counts)

In [None]:
# pie chart with one slice per harmonically unique chord, labelled by its representative name
plt.pie(
    list(harmonically_unique_chord_counts.values()),
    labels=list(harmonically_unique_chord_counts.keys()),
)
plt.show()

In [None]:
# scatter plot of the natural log of the harmonically unique chord counts, sorted ascending
plt.scatter(
    x=np.arange(len(harmonically_unique_chord_counts)),
    y=np.log(sorted(harmonically_unique_chord_counts.values())),
    marker='.',
)
plt.show()

In [None]:
# build the counts for the "other" category used in the charts below: every
# harmonically unique chord except the entries keyed 'G' and 'Amin'
# (described in the original note as the major and minor chords)
harmonically_unique_chord_counts_rare = Counter({
    chord: count
    for chord, count in harmonically_unique_chord_counts.items()
    if chord not in ('G', 'Amin')
})
print(harmonically_unique_chord_counts_rare)

In [None]:
# pie chart with one slice per rare harmonically unique chord
plt.pie(
    list(harmonically_unique_chord_counts_rare.values()),
    labels=list(harmonically_unique_chord_counts_rare.keys()),
)
plt.show()

In [None]:
# scatter plot of the natural log of the rare unique chord counts, sorted ascending
plt.scatter(
    x=np.arange(len(harmonically_unique_chord_counts_rare)),
    y=np.log(sorted(harmonically_unique_chord_counts_rare.values())),
    marker='.',
)
plt.show()

Next goal: compile lists of n-gram frequencies, both in raw terms and using harmonic equivalence groupings.

In [None]:
def n_grams(n, songs=None):
    """Count every run of n consecutive chords, ignoring harmonic equivalence.

    Args:
        n: length of the chord runs to count; must be at least 1.
        songs: iterable of comma-separated chord strings, one per song.
            Defaults to the module-level list_of_chord_lists.

    Returns:
        A Counter whose keys are n-grams written as comma-joined chord names
        and whose values are the number of times each n-gram occurs.

    Raises:
        ValueError: if n is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be at least 1")
    if songs is None:
        songs = list_of_chord_lists

    chord_n_gram_counter = Counter()
    for song in songs:
        # Skip empty tokens (left behind by consecutive or trailing commas) so that
        # an n-gram always consists of n real chords; string_to_chord_matrix drops
        # empty tokens as well, so counting them here would make the two disagree.
        chords = [chord for chord in song.split(',') if chord != '']
        chord_n_gram_counter.update(
            ','.join(chords[start:start + n])
            for start in range(len(chords) - n + 1)
        )
    return chord_n_gram_counter

In [None]:
# count all 2-grams by name and report how many distinct ones there are,
# followed by the 10 most frequent (the label now matches the number printed)
chord_2_gram_counter = n_grams(2)
print("Number of 2-grams (ignoring harmonic equivalence):",len(chord_2_gram_counter))
print("Top 10 most common 2-grams:",chord_2_gram_counter.most_common(10))

In [None]:
# pie chart restricted to the 50 most frequent 2-grams (without harmonic equivalence)
pie_slices = 50
most_common = Counter({
    n_gram: count
    for n_gram, count in chord_2_gram_counter.most_common(pie_slices)
})
plt.pie(
    list(most_common.values()),
    labels=list(most_common.keys()),
)
plt.show()

In [None]:
# scatter plot of the natural log of the 2-gram counts (without harmonic equivalence), sorted ascending
plt.scatter(
    x=np.arange(len(chord_2_gram_counter)),
    y=np.log(sorted(chord_2_gram_counter.values())),
    marker='.',
)
plt.show()

In [None]:
# count all 3-grams by name and show the 10 most frequent
chord_3_gram_counter = n_grams(n=3)
print(chord_3_gram_counter.most_common(n=10))

In [None]:
# pie chart restricted to the 50 most frequent 3-grams (without harmonic equivalence)
pie_slices = 50
most_common = Counter({
    n_gram: count
    for n_gram, count in chord_3_gram_counter.most_common(pie_slices)
})
plt.pie(
    list(most_common.values()),
    labels=list(most_common.keys()),
)
plt.show()

In [None]:
# plot of the log of chord 3-gram frequencies (without harmonic equivalence)
# This reuses the chord_3_gram_counter computed in the earlier 3-gram cell, the same
# way the 2-gram plot reuses chord_2_gram_counter, instead of recounting every
# 3-gram in the data set a second time just to draw the plot.
plt.scatter(x = np.arange(len(chord_3_gram_counter.values())),
            y = np.log(np.sort(list(chord_3_gram_counter.values()))),
           marker='.')
plt.show()

In [None]:
# function to convert a string of comma-separated chords into a matrix with one row per chord
def string_to_chord_matrix(chord_sequence):
    """Return a numpy array holding one row for each chord named in chord_sequence.

    chord_sequence is split on commas and empty items are ignored. Each
    remaining chord name is looked up in chord_degrees (a KeyError is raised
    for an unknown name) and its vector is stored in reversed order.
    """
    rows = []
    for chord_name in chord_sequence.split(','):
        if chord_name == '':
            # consecutive or trailing commas yield empty items; these are not chords
            continue
        rows.append(chord_degrees[chord_name][::-1])
    return np.array(rows)

def transpose_matrix_up(chord_matrix, num_semitones):
    """Transpose every chord of a chord matrix by the same number of semitones.

    Args:
        chord_matrix: array-like with one chord vector per row (the layout
            produced by string_to_chord_matrix).
        num_semitones: number of positions to rotate each row by; negative
            values rotate the other way.

    Returns:
        A new numpy array of the same shape in which every row has been rotated
        cyclically by num_semitones positions, i.e. the same rotation that
        transpose_chord_up applies to a single chord vector. The input is not
        modified.
    """
    # rolling along the last axis rotates the entries inside each row, leaving
    # the order of the rows (the chord sequence) unchanged
    return np.roll(np.asarray(chord_matrix), num_semitones, axis=-1)

def is_harmonic_equivalent_matrix(chord_matrix_1, chord_matrix_2):
    """Return True if the two chord matrices are transposed versions of each other.

    Both arguments are array-likes with one chord vector per row (the layout
    produced by string_to_chord_matrix). They are equivalent when rotating
    every row of chord_matrix_2 by one common number of positions reproduces
    chord_matrix_1 exactly, with the rows kept in the same order.
    """
    matrix_1 = np.asarray(chord_matrix_1)
    matrix_2 = np.asarray(chord_matrix_2)

    # some basic checks to skip the looping when equivalence is already ruled out
    if matrix_1.shape != matrix_2.shape or matrix_1.sum() != matrix_2.sum():
        return False

    # two empty matrices of the same shape have nothing left to compare
    if matrix_1.size == 0:
        return True

    # Try every possible transposition. The rotation must act on the entries
    # *within* each row (last axis); rotating the rows themselves would reorder
    # the chords of the sequence instead of transposing them.
    return any(
        np.array_equal(matrix_1, np.roll(matrix_2, shift, axis=-1))
        for shift in range(matrix_2.shape[-1])
    )

In [None]:
def unique_n_grams(chord_n_gram_counter, output_messages=False, countdown=False):
    """Merge the counts of harmonically equivalent n-grams.

    Args:
        chord_n_gram_counter: Counter mapping n-grams (comma-joined chord
            names) to their number of occurrences. It is not modified.
        output_messages: if True, print a detailed progress report after each
            group of equivalent n-grams has been merged.
        countdown: if True, print how many n-grams are still unprocessed after
            each group.

    Returns:
        A Counter with one entry per group of harmonically equivalent n-grams.
        The key is the most frequent n-gram of the group and the value is the
        summed number of occurrences of all n-grams in the group.

    Note:
        Every remaining n-gram is compared against each group representative,
        so the running time grows quadratically with the number of distinct
        n-grams.
    """
    # work on a copy so the caller's counter is left untouched
    chord_n_gram_counter_copy = Counter(chord_n_gram_counter)

    if output_messages:
        print("Initial length of numbers of n-grams with duplicates:",len(chord_n_gram_counter_copy))
        print()

    # we'll build up a counter of uniques
    unique_n_gram_counter = Counter()

    while len(chord_n_gram_counter_copy) > 0:
        # the most common n-gram still left becomes the representative of its group
        n_gram_1, running_total = chord_n_gram_counter_copy.most_common(1)[0]
        matrix_1 = string_to_chord_matrix(n_gram_1)

        # initialize a list of equivalent n-grams
        equivalent_n_grams = [n_gram_1]

        # delete the current n-gram under consideration so that we don't count it as an equivalent to itself a second time
        del chord_n_gram_counter_copy[n_gram_1]

        # go through and find all n-grams harmonically equivalent to the representative
        for n_gram_2, n_gram_2_count in chord_n_gram_counter_copy.items():
            matrix_2 = string_to_chord_matrix(n_gram_2)
            if is_harmonic_equivalent_matrix(matrix_1, matrix_2):
                equivalent_n_grams.append(n_gram_2)
                # add this n-gram's own count (not the count of some single chord)
                running_total += n_gram_2_count

        # set the value in the new counter to be the accumulated sum
        unique_n_gram_counter[n_gram_1] = running_total

        # remove the n-grams that were just merged; the representative itself
        # (first list entry) has already been deleted above
        for merged_n_gram in equivalent_n_grams[1:]:
            del chord_n_gram_counter_copy[merged_n_gram]

        if output_messages:
            print("n-gram under consideration:",n_gram_1)
            print("Equivalent n-grams:",equivalent_n_grams)
            print("Number of equivalent n-grams:",len(equivalent_n_grams))
            print("New length of non-unique n-grams to check:",len(chord_n_gram_counter_copy))
            print("Updated list of unique n-grams:",unique_n_gram_counter)
            print()

        if countdown:
            print("Remaining n-grams:",len(chord_n_gram_counter_copy))

    return unique_n_gram_counter


In [None]:
# count all 2-grams by name
# NOTE(review): this repeats the n_grams(2) call made in an earlier cell; presumably
# kept so that this section can be re-run on its own - confirm before removing
chord_2_gram_counter = n_grams(n=2)

In [None]:
# merge harmonically equivalent 2-grams (with detailed progress output) and show the 10 most frequent
unique_2_gram_counter = unique_n_grams(
    chord_n_gram_counter=chord_2_gram_counter,
    output_messages=True,
)
print(unique_2_gram_counter.most_common(n=10))

In [None]:
# scatter plot of the natural log of the 2-gram counts after merging equivalent 2-grams, sorted ascending
plt.scatter(
    x=np.arange(len(unique_2_gram_counter)),
    y=np.log(sorted(unique_2_gram_counter.values())),
    marker='.',
)
plt.show()

In [None]:
# merge harmonically equivalent 3-grams and show the 10 most frequent
unique_3_gram_counter = unique_n_grams(chord_n_gram_counter=chord_3_gram_counter)
print(unique_3_gram_counter.most_common(n=10))

In [None]:
# scatter plot of the natural log of the 3-gram counts after merging equivalent 3-grams, sorted ascending
plt.scatter(
    x=np.arange(len(unique_3_gram_counter)),
    y=np.log(sorted(unique_3_gram_counter.values())),
    marker='.',
)
plt.show()

In [None]:
# count all 4-grams by name, merge the harmonically equivalent ones, and show the 10 most frequent
chord_4_gram_counter = n_grams(n=4)
unique_4_gram_counter = unique_n_grams(chord_n_gram_counter=chord_4_gram_counter)
print(unique_4_gram_counter.most_common(n=10))

In [None]:
# scatter plot of the natural log of the 4-gram counts after merging equivalent 4-grams, sorted ascending
plt.scatter(
    x=np.arange(len(unique_4_gram_counter)),
    y=np.log(sorted(unique_4_gram_counter.values())),
    marker='.',
)
plt.show()