The goal of this notebook is to complete the missing gaps in the chord_mapping.csv file provided by the Chordonomicon authors. The file is currently missing some of the chords present in the data set.

In [2]:
# importing basic packages
import numpy as np
import pandas as pd
import matplotlib as plt
import ast

# read in the data set
df = pd.read_csv('../data/chordonomicon.csv', low_memory=False)

# Read the mapping CSV file
chord_relations = pd.read_csv('../data/chords_mapping.csv')

# Create a dictionary with keys the "chords" and values the "degrees"
chord_degrees = dict(zip(chord_relations['Chords'], chord_relations['Degrees']))
for key, value in chord_degrees.items():
    chord_degrees[key] = ast.literal_eval(value)

In [3]:
print(len(chord_degrees))
print("C major: \t",chord_degrees['C'])
print("C major 7: \t",chord_degrees['Cmaj7'])
print("C minor: \t",chord_degrees['Cmin'])

2793
C major: 	 [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]
C major 7: 	 [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1]
C minor: 	 [1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]


In [4]:
# drop all columns except for chords
chord_data = df[['chords','genres']]
chord_data.sample(20)

Unnamed: 0,chords,genres
209200,<verse_1> Eb Db B E B Fs B <verse_2> E Bsus2 F...,naija worship
563503,<intro_1> D Emin7 D Emin7 D Emin7 D Emin7 <ver...,
248008,<intro_1> B Bsus4 Bsus2 B <instrumental_1> Gsm...,"j-pop""j-rock""japanese emo""visual kei"
115641,D G D G D G F E D G F E D G F E D A Bmin A9 G ...,
300796,<verse_1> Dmin Gmin A Dmin F A Dmin Bb A Dmin ...,
630023,Amin Fmaj7 Amin C F C Amin Fmaj7 F G/C C G/C A...,
474552,C G Amin F C G Amin F C G Amin F C G Amin F C ...,
529157,Dmin A A7 Dmin D D7 Gmin Dmin A Dmin C F A7 Dm...,opm
33781,<intro_1> G Emin C <verse_1> G Emin C G Emin C...,belgian punk
153230,<intro_1> F C Dmin Bb <verse_1> F C Dmin Bb F ...,"ccm""christian alternative rock""christian music"


In [5]:
# replacing spaces with commas
def replace_space_with_comma(my_string):
    return my_string.replace(" ",",")

# replacing spaces with commons in all chords in all rows of the data
chord_data.loc[:,'chords'] = chord_data['chords'].apply(replace_space_with_comma)
chord_data.sample(20)

Unnamed: 0,chords,genres
595377,"<intro_1>,C,G/B,Amin7,C/G,F,C/E,Dmin7,G7sus4,<...",
258274,"<intro_1>,Dmin,F,Dmin,Bb,Dmin,F,Bb,<verse_1>,D...","modern alternative rock""modern blues rock""mode..."
102094,"<verse_1>,Cmin,F,Bb,Gmin,Cmin,F7,Bb,Gmin,<chor...","art pop""chamber pop""chillwave""denver indie""ind..."
461783,"Emin,C,G,D,Emin,C,G,D,Emin,C,G,D,Emin,C,G,D,Em...","contemporary country""country""country dawn""coun..."
137657,"<intro_1>,G,D,A,G,D,A,<verse_1>,G,D,A,G,D,A,G,...","classic texas country""contemporary country""cou..."
583701,"<verse_1>,Dmin7,G9,C7,Dmin7,G9,Fmin,C7,Dmin7,G...",
414153,"D,Amin,G,D,Amin,G,D,Amin,G,D,Amin,G,D,Amin,G,E...",argentine alternative rock
567956,"<intro_1>,C,Amin,C/G,F/A,<verse_1>,C,F,Gsus4,C...",
343448,"C,G,C,G,D7,C,G,C,G,C,G,C,G,D7,C,G,C,G,D7,C,G,C...","classic country pop""country""country rock""honky..."
34108,"<intro_1>,Amin,Dmin,Amin,Dmin,Amin,Dmin,Amin,<...",


In [6]:
chord_data.sample(20)

Unnamed: 0,chords,genres
631598,"C,F7,G,C,Amin,D,F,G,Emin,Amin,Dmin7,G7,Emin,Am...",
562098,"<intro_1>,E,Fsmin,A,E,<verse_1>,Fsmin,A,E,B,E,...",
228217,"<intro_1>,Fsmin,A,D,Bmin7,<verse_1>,Fsmin,A,D,...","anthem worship""uk worship""world worship""worship"
490933,"Gmin,Eb,Gmin,Eb,Gmin,Eb,Bb,Ab,Amin,D,Gmin,Eb,G...","pop rock""post-grunge"
494129,"C,G,C,Bb,G,Bb,C,G,Bb,C,G,D,C,G,D,C,G,D,F,C,G,C...","classic oklahoma country""contemporary country""..."
174212,"<intro_1>,Bb,C,Bb,<chorus_1>,F,Bb,C,F,Dmin,C,B...","canadian singer-songwriter""newfoundland indie""..."
572408,"<intro_1>,Ab,<verse_1>,Db,B,Ab,Db,B,Ab,<chorus...",
438229,"G,C,G,D,G,C,G,D,C,G,F,D,C,A,As,A,G,C,E,D,G,C,E...","latin alternative""mexican rock"
155281,"D,C,Bmin,Bb,A,Gmin,D,Gmin,A,D,Bmin,Bbmin,D,Gmi...",movie tunes
646921,"<intro_1>,Emin,Amin,Emin,Amin,Emin,Amin,Emin,A...",


In [7]:
# Remove section markers
def remove_section_markers(my_string):
    result = []
    i = 0
    n = len(my_string)
    while i < n:
        if my_string[i] == '<':
            # Skip until after the following ", "
            j = my_string.find('>', i)
            if j == -1:
                break  # no closing '>', stop
            i = j + 2  # skip '>,' and the space
        else:
            result.append(my_string[i])
            i += 1
    assert('<' not in result)
    assert('>' not in result)
    return ''.join(result)

chord_data.loc[:,'chords'] = chord_data['chords'].apply(remove_section_markers)
chord_data.sample(20)

Unnamed: 0,chords,genres
235771,"C,E,F,D/Fs,G,Gsdim7,Amin,Emin7,C,E,F,D/Fs,G,Gs...","black punk""chaotic hardcore"
305854,"D,Bmin,G,A,D,Bmin,G,D,Bmin,G,A,D,Bmin,Emin,C,B...","alternative rock""britpop""permanent wave""rock"
572556,"Amin,F,G,Amin,F,G,Emin,Amin,F,G,Emin,Amin,F,G,...",
186683,"Gsmin,Fs,E,Fs,Gsmin,Fs,E,Fs,Gsmin,Fs,E,Fs,Gsmi...",modern indie folk
445609,"C,G,Amin,F,C,G,Amin,F,C,G,F,E,C,G,F,Cmin,Gs,As...","punk urbano""spanish punk"
548286,"E,Amin,D,Bmin,E,Amin,D,Bmin,E,Amin,D,Bmin,E,Am...",
264172,"E,E7,A,A7,A,E,E7,B7,E,E7,A,Amin,E,Esus4,B7,E,B...",classic belgian pop
494985,"Asus2,Amin9,Fmaj7,G13,Dmin9,Gsus2,Asus2,Amin9,...",
496946,"G,A,Bmin,A,G,A,Bmin,A/Cs,G,A,Bmin,A,G,A,Bmin,A...","alternative rock""bay area indie""garage rock""ne..."
367450,"Dno3d,Bmin,D,G,A,Bmin,D,G,D,Bmin,D,G,D,Bmin,D,...",cornwall indie


In [8]:
# get some sample chords with inversions
songs_with_inversions = chord_data.loc[['/' in ch for ch in chord_data.chords]]
songs_with_inversions.sample(10)

Unnamed: 0,chords,genres
283105,"Asmin,Gs,Asmin,Gs,Asmin,Fs,C,Asmin,Fs,C,Asmin,...",
32284,"C,Amin7,G/B,C,Csus4,C,Csus4,C,Csus4,Gadd11,G,A...",indiecoustica
125535,"Amin,F,Dmin,Amin,G,F,G,Amin,G,Amin,F,Dmin,Amin...","hurdy-gurdy""medieval folk""medieval rock"
8048,"D7,Dmin7,Gmin7,Dsus2,E7/D,Emin7/D,D7,Dmin7,Gmi...",experimental r&b
111400,"G/D,Dsus2,Gs,Csmaj7,E,B,Csmaj7,Gs,Csmaj7,Gs,Cs...","bluegrass""instrumental bluegrass""mandolin""prog..."
489288,"D,E,Bmin,D,A,Csmin,B,A/Cs,Bmin,A,D/Fs,G,Fsmin,...","candy pop""pixie""pop emo""pop punk"
366901,"C,G,Amin,F,C,G,Amin,F,C,G,Amin,F,C,G,Amin,F,C,...","neon pop punk""pixie""pop punk""pov: indie""texas ..."
346754,"D/Cs,Bmin/A,G/Fs,D,D/Cs,Bmin,Bmin/A,G,G/Fs,Emi...","cantautor""latin arena pop""latin pop""mexican pop"
590826,"F,Dmin7,Amin,C,F,Dmin7,D7sus4,Amin,C,F,Dmin7,D...",
491123,"Emin,G,C,D,Emin,G,C,D,Emin,C,D,Emin,C,G,D,C,Em...","alternative metal""nu metal""post-grunge""rock"


In [9]:
# Removing inversions
def remove_inversions(my_string):
    result = []
    i = 0
    n = len(my_string)
    while i < n:
        if my_string[i] == '/':
            # Skip until after the following ", "
            j = my_string.find(',', i)
            if j == -1:
                break  # no closing comma, stop
            i = j  # skip comma
        else:
            result.append(my_string[i])
            i += 1
    return ''.join(result)

# just a basic test on a random chord sequence with some inversions
n = 3
my_sample = songs_with_inversions.sample(5)
for i in range(n):
    s = songs_with_inversions.iloc[i].chords
    print(s)
    print()
    print(remove_inversions(s))
    print()
    print()

E,D,A/Cs,E,D,A/Cs,E,D,A/Cs,E,D,A/Cs,E,D,A/Cs,E,D,A,C,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,D,A/Cs,E,D,A/Cs,E,D,A/Cs,E,D,A/Cs,E,D,A,C,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,C,G,D,E,C,G,D,E,C,G,D,C,D,E,G,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,C,G,D,E,C,G,D,E,C,G,D,C,D,E

E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,C,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,C,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,C,G,D,E,C,G,D,E,C,G,D,C,D,E,G,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,C,G,D,E,C,G,D,E,C,G,D,C,D,E


D,Dmaj7,D,Dmaj7,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,Emin,A,D,G,Emin,A,D,G,D/Fs,Emin,A,D,G,D/Fs,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,Emin,A,D,G,Emin,A,D,G,D/Fs,Emin,A,D,G,D/Fs,Emin,A,D,G,Emin,A,D,G,D/Fs,Emin,A,D,G,D/Fs,Emin,A,D,G,D/Fs,Emin,A,D,Emin,A,D,G,Emin,A,D,G,D/Fs,Emin,A,D,G,D/Fs,Emin,A,D,G,Emin,A,D,G,D/Fs,Emin,A,D,G,D/Fs,Emin,A,D,G,D/Fs,Emin,A,D,G,Emin,A,D,G,D/Fs

D,Dmaj7,D,Dmaj7,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,Emin,A,

In [10]:
# remove inversions from the whole data set
chord_data.loc[:,'chords'] = chord_data['chords'].apply(remove_inversions)
chord_data.sample(20)

Unnamed: 0,chords,genres
152307,"G,Cadd13,G,Cadd13,Cadd13,D,Cadd13,D,G,Cadd13,G...","modern reggae""reggae rock"
659431,"Emin7,Cadd9,G,Emin7,Cadd9,G,Emin7,Cadd9,D,Emin...",
440596,"G,C,D,G,Bmin,Emin,C,D,G,C,D,G,Bmin,Emin,C,D,C,...","indie catala""rock catala"
290315,"G,C,D,C,G,C,D,C,G,C,D,C,G,C,D,C,G,C,D,C,G,C,D,...",
641618,"F,G,Amin,C,F,G,Amin,C,F,G,Amin,G,Amin,F,G,Amin...",
599747,"Bmin,D,A,E,Bmin,D,A,E,Bmin,D,A,E,Bmin,D,A,E,Bm...",
246859,"Amin,G,Amin,F,E7,C,G,F,G,F,Amin,G,Amin,F,E7,C,...","acoustic blues""blues""country blues""jazz blues""..."
289115,"A,D,E,D,E,Fsmin,E,D,Csmin,Bmin,E,Fsmin,E,D,E,F...","classic oklahoma country""contemporary country""..."
244525,"F,Dmin,Bb,C,F,Dmin,Bb,F,Dmin,Bb,C,F,Dmin,Bb,C,...",progressive metal
577038,"D,Bmin7,E7,A7,D,B7,Emin,A7,D,Bmin7,G,A7,D,Bmin...",


In [11]:
chord_data.sample(5)

Unnamed: 0,chords,genres
618337,"C,Amin,F,C,F,C,Bdim7,E7,Amin,C7,F,Bb7,Amin,Fsd...",
551615,"C,F,G,C,F,Dmin,G,Amin,C,F,G,C,F,Dmin,Amin,G,C,...",
462431,"A,E,A,E,G,A,E,G,A,E,G,A,G,A,B,E,A,E,A,E,G,A,E,...","croatian pop""croatian rock""yugoslav rock"
629915,"F,C,F,C,F,C,F,C,F,C,F,Bb,C,Bb,C,F,C,F,C,F,C,F,...",
419040,"G,D7,G,G7,C,G,D7,G,C,G,D7,G,G7,C,G,D7,G,C,G,D7...",mexican rock-and-roll


In [12]:
# full list of chords from the csv
known_chords = list(chord_degrees.keys())
print(len(known_chords))
print(len(set(known_chords))) # should be the same if known_chords has no duplicates
print(known_chords[0:100])

2793
2793
['C7', 'Cmaj7', 'C9', 'Cmaj9', 'Cmajs9', 'Cb9', 'Cb79', 'Cb7b9', 'C7b9', 'C7sus2', 'Cmaj7sus2', 'C7sus4', 'Cmaj7sus4', 'Cadd9', 'Cadd11', 'Cadd13', 'C11s', 'C11b9', 'Cmajs911s', 'Cb711b9', 'Cmaj11', 'Cmaj911s', 'C11', 'Cb11b9', 'C13b11b9b', 'C13b9', 'C1311s', 'C13b', 'Cmaj13b', 'C13bb9', 'Cmaj13s911s', 'Cb713b9', 'Cmaj13', 'Cmaj1311s', 'C13', 'C11b913b', 'C1113b', 'Cmin7', 'Cminmaj7', 'Cmin9', 'Cminmaj9', 'Cminmajs9', 'Cminb9', 'Cminb79', 'Cminb7b9', 'Cmin7b9', 'Cmin7sus2', 'Cminmaj7sus2', 'Cmin7sus4', 'Cminmaj7sus4', 'Cminadd9', 'Cminadd11', 'Cminadd13', 'Cmin11s', 'Cmin11b9', 'Cminmajs911s', 'Cminb711b9', 'Cminmaj11', 'Cminmaj911s', 'Cmin11', 'Cminb11b9', 'Cmin13b11b9b', 'Cmin13b9', 'Cmin1311s', 'Cmin13b', 'Cminmaj13b', 'Cmin13bb9', 'Cminmaj13s911s', 'Cminb713b9', 'Cminmaj13', 'Cminmaj1311s', 'Cmin13', 'Cmin11b913b', 'Cmin1113b', 'Cdim7', 'Cdimmaj7', 'Cdim9', 'Cdimmaj9', 'Cdimmajs9', 'Cdimb9', 'Cdimb79', 'Cdimb7b9', 'Cdim7b9', 'Cdim7sus2', 'Cdimmaj7sus2', 'Cdim7sus4', 'Cdim

In [13]:
# compile a list of all chords in the data set
list_of_chord_lists = list(chord_data.chords)
giant_chord_string = ','.join(list_of_chord_lists)
data_set_chords = list(set(giant_chord_string.split(',')))

In [14]:
print(list_of_chord_lists[0:5])
print()
print(giant_chord_string[0:200])
print()
print(len(data_set_chords))
print(data_set_chords[0:20])

['C,F,C,E7,Amin,C,F,C,G7,C,F,C,E7,Amin,C,F,G7,C,F,C,E7,Amin,C,F,C,G7,C,F,C,E7,Amin,C,F,G7,C,F,C,F,C,G,C,F,C,E7,Amin,C,F,G7,C,D,G,D,G,D,A,D,G,D,Fs7,Bmin,D,G,A7,D,G,A7,D', 'E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,C,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,C,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,C,G,D,E,C,G,D,E,C,G,D,C,D,E,G,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,C,G,D,E,C,G,D,E,C,G,D,C,D,E', 'D,Dmaj7,D,Dmaj7,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,Emin,A,D,G,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,Emin,A,D,G,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,Emin,A,D,G,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,Emin,A,D,G,D', 'C,G,C,G,C,F,Dmin,G,Dmin,G,C,G,C,F,Dmin,G,Dmin,G,C,F,Dmin,G,Dmin7,G,C,G,C,F,Dmin,G,Dmin,G,C,F,Dmin,G,Dmin,G,C', 'C,G,C,G,C,G,C,G,C,Bmin,Emin,Amin,D,G,C,D,G,C,D,G,D,F,E,Amin,D,G,C,D,G,C,D,G,D,F,E,Amin,D,C

In [15]:
# check if data_set_chords has repeats
print(len(data_set_chords))
print(len(set(data_set_chords)))

746
746


In [16]:
# list of all chords in the data set which are not in the chords_mapping csv file, at least if I did this right
mystery_chords = list(set(data_set_chords).difference(set(known_chords)))
print(len(mystery_chords))
print(len(set(mystery_chords)))

1
1


In [34]:
print(len(known_chords))
print(len(mystery_chords))

# why are these two not the same?
print(len(data_set_chords))
print(len(known_chords) + len(mystery_chords))

print(mystery_chords)

2793
1
746
2794
['']


In [36]:
'Fb' in known_chords

False

In [38]:
'Fb' in data_set_chords

False