This notebook is used to make some csv files which highlight top n-grams for features.

In [2]:
# importing basic packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast
import copy
import json
import time
from collections import Counter, deque

# read in the data set
df = pd.read_csv('../data/chordonomicon.csv', low_memory=False)

In [3]:
# Read the mapping CSV file
chord_relations = pd.read_csv('../data/chords_mapping.csv')

# Create a dictionary with keys the "chords" and values the "degrees"
chord_degrees = dict(zip(chord_relations['Chords'], chord_relations['Degrees']))
for key, value in chord_degrees.items():
    chord_degrees[key] = ast.literal_eval(value)
    
# full list of chords from the chords_mapping csv
known_chords = list(chord_degrees.keys())

In [4]:
# read the equivalence dictionary file
# this is a dictionary of dictionaries
#    the top-level keys are chord names (e.g. 'C','Amin')
#    the top-level values are dictionaries, whose keys are equivalent chords, and whose values are the semitone distance between the top-level key and the low-level key
with open('../data/harmonic_equivalence.json') as file:
    equiv_dict = json.load(file)

In [5]:
# drop all columns except for chords and genres
chord_data = df[['chords','genres']]

# drop anything that lacks chord or genre data
chord_data = chord_data.dropna()

In [6]:
# replacing spaces with commas
def replace_space_with_comma(my_string):
    return my_string.replace(" ",",")

In [7]:
# Remove section markers
def remove_section_markers(my_string):
    result = []
    i = 0
    n = len(my_string)
    while i < n:
        if my_string[i] == '<':
            # Skip until after the following ", "
            j = my_string.find('>', i)
            if j == -1:
                break  # no closing '>', stop
            i = j + 2  # skip '>,' and the space
        else:
            result.append(my_string[i])
            i += 1
    assert('<' not in result)
    assert('>' not in result)
    return ''.join(result)

In [8]:
# Remove inversions
def remove_inversions(my_string):
    result = []
    i = 0
    n = len(my_string)
    while i < n:
        if my_string[i] == '/':
            # Skip until after the following ", "
            j = my_string.find(',', i)
            if j == -1:
                break  # no closing comma, stop
            i = j  # skip comma
        else:
            result.append(my_string[i])
            i += 1
    return ''.join(result)

In [9]:
def clean_up_chord_string(my_string):
    return remove_inversions(
        remove_section_markers(
            replace_space_with_comma(my_string)))
    
# cleaning up the data in a few ways
chord_data.loc[:,'chords'] = chord_data['chords'].apply(clean_up_chord_string)

In [10]:
# simplify genre data
major_genres = ['pop','rock','country','alternative','punk','metal','rap','soul','jazz','reggae','electronic']
def simplify_genre(genre_string):
    for g in major_genres:
        if g in genre_string:
            return g
    return 'other'

chord_data.loc[:,'genres'] = chord_data['genres'].apply(simplify_genre)
major_genres = ['pop','rock','country','alternative','punk','metal','rap','soul','jazz','reggae','electronic','other']

In [11]:
full_data_genre_counter = Counter(chord_data.genres)
print(full_data_genre_counter)

Counter({'pop': 131247, 'other': 106961, 'rock': 81276, 'country': 15884, 'punk': 10060, 'alternative': 4358, 'rap': 4013, 'metal': 3007, 'soul': 2909, 'reggae': 2156, 'jazz': 1857, 'electronic': 545})


In [52]:
# if the two input chords are harmonically equivalent, return (True, num_semitones) where num_semitones is the distance from n_gram_1 (up) to n_gram_2
# otherwise, return (False, None)
def compare_chords(chord_1, chord_2):
    if chord_2 in equiv_dict[chord_1]:
        return (True, equiv_dict[chord_1][chord_2])
    else:
        return (False, None)

assert(compare_chords('C','D')[0])
assert(compare_chords('C','E')[0])
assert(not(compare_chords('C','Amin')[0]))

In [56]:
# if the two input n_grams are harmonically equivalent, return (True, num_semitones) where num_semitones is the distance from n_gram_1 (up) to n_gram_2
# otherwise, return (False, None)
def compare_n_grams(n_gram_1, n_gram_2):
    list_1 = n_gram_1.split(',')
    list_2 = n_gram_2.split(',')

    # if they aren't the same length, we don't have to check anything
    if len(list_1) != len(list_2):
        return (False, None)

    # now we can assume they have the same length
    comparison = [compare_chords(list_1[i], list_2[i]) for i in range(len(list_1))]

    # if any pairs are not the same, return False
    for c in comparison:
        if not c[0]:
            return (False, None)

    # now we can assume every respective pair is equivalent, but we still need all of the distances to match
    dist_0 = comparison[0][1]
    for c in comparison:
        if c[1] != dist_0:
            return (False, None)

    return (True, dist_0)

assert(compare_n_grams('C,D,E','F,G,A')[0])
assert(not(compare_n_grams('C,D,E','F,G,B')[0]))

In [58]:
# return true/false depending on if a song contains a harmonically equivalent n_gram to the input n_gram
# new version of this, making use of the equivalence dictionary for lookups rather than doing calculations every time
def contains_n_gram(song, n_gram):
    # assumption: input song is a comma-separated string of chord names
    # assumption: input n_gram is a comma-separated string of chord names

    # skip ahead and return true if the raw version is the song
    if n_gram in song:
        return True

    # split up the song and n_gram into lists of strings of single chords
    song_as_list = song.split(',')
    song_length = len(song_as_list)
    n_gram_as_list = n_gram.split(',')
    n = len(n_gram_as_list)

    for i in range(0,song_length - n):
        song_n_gram = ','.join(song_as_list[i:i+n])
        is_same, dist = compare_n_grams(n_gram, song_n_gram)
        if is_same:
            return True

    return False

assert(contains_n_gram('A,B,C,D,E,F,G','C,D'))
assert(contains_n_gram('A,B,C,D,E,F','F,G'))

In [17]:
def get_raw_n_grams_list(data, n):
    # return a list of all raw n-grams, ignoring harmonic equivalence
    results = []
    for song in list(data.chords):
        song_as_list = song.split(',')
        for i in range(len(song_as_list)-n+1):
            n_gram = ','.join(song_as_list[i:i+n])
            if not n_gram in results:
                results.append(n_gram)
    return results

In [62]:
# return true if n_gram (or a harmonic equivalent) belongs to the list
# false otherwise
def n_gram_belongs_to_list(list_of_n_grams, n_gram):
    for ng in list_of_n_grams:
        if compare_n_grams(ng,n_gram)[0]:
            return True
    return False

my_list = ['C','D','E','C,D','C,G']
assert(n_gram_belongs_to_list(my_list,'C'))
assert(n_gram_belongs_to_list(my_list,'D'))
assert(n_gram_belongs_to_list(my_list,'E'))
assert(n_gram_belongs_to_list(my_list,'C,D'))
assert(n_gram_belongs_to_list(my_list,'C,G'))

assert(n_gram_belongs_to_list(my_list,'A'))
assert(n_gram_belongs_to_list(my_list,'G'))
assert(n_gram_belongs_to_list(my_list,'F'))
assert(n_gram_belongs_to_list(my_list,'D,E'))
assert(n_gram_belongs_to_list(my_list,'A,B'))
assert(n_gram_belongs_to_list(my_list,'A,E'))

assert(not(n_gram_belongs_to_list(my_list,'Cmin')))
assert(not(n_gram_belongs_to_list(my_list,'C,F')))

In [100]:
# return a list of harmonically unique n-grams
def get_unique_n_grams_list(data, n, progress_updates = False, progress_spacing = 1000):
    results = []

    if progress_updates:
        t0 = time.time()
        num_songs = len(list(data.chords))
        print("Computing all unique n-grams from given data.")
        print("Number of songs to analyze:",num_songs)
    
    for index, song in enumerate(list(data.chords)):
            
        song_as_list = song.split(',')
        for i in range(len(song_as_list)-n+1):
            n_gram = ','.join(song_as_list[i:i+n])
            if not(n_gram_belongs_to_list(results,n_gram)):
                results.append(n_gram)

        if progress_updates and (index % progress_spacing == 0) and index != 0:
            t1 = time.time()
            time_so_far = t1-t0
            average_time_per_song = time_so_far/(index+1)
            remaining_songs = num_songs - index
            estimated_total_time = num_songs*average_time_per_song
            estimated_remaining_time = average_time_per_song*remaining_songs
            print("Analyzing song index number:",index)
            print("\tRemaining songs:",remaining_songs)
            print("\tTotal time spent so far: " + str(time_so_far) + " seconds")            
            print("\tAverage time per song so far: " + str(average_time_per_song) + " seconds")
            print("\tEstimate total time: " + str(estimated_total_time) + " seconds")
            print("\tEstimated remaining time: " + str(estimated_remaining_time) + " seconds")
            print()

    if progress_updates:
        t2 = time.time()
        print("Done analyzing all songs.")
        print("\tTotal time spent: " + str(t2-t0) + ' seconds')
        print()
    return results

In [104]:
# just testing on a small sample
sample_data = chord_data.sample(100)
unique_3_grams = get_unique_n_grams_list(sample_data, n=3, progress_updates = True)

Computing all unique n-grams from given data.
Number of songs to analyze: 100
Analyzing song index number: 10
	Remaining songs: 90
	Total time spent so far: 0.08959078788757324 seconds
	Average time per song so far: 0.008144617080688477 seconds
	Estimate total time: 0.8144617080688477 seconds
	Estimated remaining time: 0.7330155372619629 seconds

Analyzing song index number: 20
	Remaining songs: 80
	Total time spent so far: 0.4119689464569092 seconds
	Average time per song so far: 0.019617568878900437 seconds
	Estimate total time: 1.9617568878900438 seconds
	Estimated remaining time: 1.569405510312035 seconds

Analyzing song index number: 30
	Remaining songs: 70
	Total time spent so far: 0.7448461055755615 seconds
	Average time per song so far: 0.02402729372824392 seconds
	Estimate total time: 2.402729372824392 seconds
	Estimated remaining time: 1.6819105609770746 seconds

Analyzing song index number: 40
	Remaining songs: 60
	Total time spent so far: 1.061500072479248 seconds
	Average 

In [22]:
# return a list of harmonically unique n-grams
def get_unique_n_grams_list(data, n, progress_updates = False, progress_spacing = 10):
    results = []

    if progress_updates:
        t0 = time.time()
        num_songs = len(list(data.chords))
        print("Computing all unique n-grams from given data.")
        print("Number of songs to analyze:",num_songs)
    
    for index, song in enumerate(list(data.chords)):
        if progress_updates and (index % progress_spacing == 0) and index != 0:
            t1 = time.time()
            time_so_far = t1-t0
            average_time_per_song = (t1-t0)/index
            remaining_songs = num_songs - index
            estimated_remaining_time = average_time_per_song*remaining_songs
            print("Analyzing song index number:",index)
            print("\tTotal time spent so far: " + str(time_so_far) + " seconds")            
            print("\tAverage time per song so far: " + str(average_time_per_song) + " seconds")
            print("\tEstimated remaining time: " + str(estimated_remaining_time) + " seconds")
            
        song_as_list = song.split(',')
        for i in range(len(song_as_list)-n+1):
            n_gram = ','.join(song_as_list[i:i+n])
            if not(n_gram_belongs_to_list(results,n_gram)):
                results.append(n_gram)

    if progress_updates:
        print("Done analyzing all songs.")
        print()
    return results

In [23]:
def build_empty_df(data, n, progress_updates = False, progress_spacing = 10):
    # construct a list of all unique n_grams that are present in the data, and n ranges over a specified list
    # these will be the rows of the dataframe
    n_grams = ['baseline'] + get_unique_n_grams_list(data, n, progress_updates, progress_spacing)
    num_rows = len(n_grams)
    results_dict = {'n_gram' : n_grams}

    # initialize a bunch of columns of zeros, one for each major genre
    for g in major_genres:
        results_dict[g + '_raw'] = np.zeros(shape = num_rows, dtype = int)

    # compute the baseline row by just counting all genres
    for song, genre in zip(data.chords, data.genres):
        results_dict[genre + '_raw'][0] += 1

    # add a "counts_completed" column for tracking partial work/computations
    results_dict['counts_completed'] = [True] + [False]*(num_rows - 1) # the first true is for the baseline row, which should be ignored mostly
        
    return pd.DataFrame(results_dict)

In [108]:
# just testing on a small sample
sample_data = chord_data.sample(20)
empty_df = build_empty_df(sample_data, n=2, progress_updates = True, progress_spacing = 10)
display(empty_df)

Computing all unique n-grams from given data.
Number of songs to analyze: 20
Analyzing song index number: 10
	Remaining songs: 10
	Total time spent so far: 0.03582763671875 seconds
	Average time per song so far: 0.0032570578835227275 seconds
	Estimate total time: 0.06514115767045454 seconds
	Estimated remaining time: 0.03257057883522727 seconds

Done analyzing all songs.
	Total time spent: 0.05147957801818848 seconds



Unnamed: 0,n_gram,pop_raw,rock_raw,country_raw,alternative_raw,punk_raw,metal_raw,rap_raw,soul_raw,jazz_raw,reggae_raw,electronic_raw,other_raw,counts_completed
0,baseline,5,6,0,0,0,0,1,0,0,0,0,8,True
1,"C,D",0,0,0,0,0,0,0,0,0,0,0,0,False
2,"D,C",0,0,0,0,0,0,0,0,0,0,0,0,False
3,"D,Emin",0,0,0,0,0,0,0,0,0,0,0,0,False
4,"Emin,D",0,0,0,0,0,0,0,0,0,0,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68,"Eb7,Bbmin",0,0,0,0,0,0,0,0,0,0,0,0,False
69,"Eb7,Gb",0,0,0,0,0,0,0,0,0,0,0,0,False
70,"Db,Ebadd9",0,0,0,0,0,0,0,0,0,0,0,0,False
71,"Dsus2,Cadd9",0,0,0,0,0,0,0,0,0,0,0,0,False


In [25]:
#######################################################################
#### populate a "blank" 3-grams csv data file 
#### I have still not run this yet, I anticipate it will take a long time
#######################################################################
n=3
blank_df = build_empty_df(chord_data, n, progress_updates = True, progress_spacing = 5000)
blank_df.to_csv(str(n) + '_gram_deviations.csv', sep=',', index=False)
#######################################################################