The goal of this notebook is convert the chordonomicon data set to make each song a matrix where each column records a single chord. 

Steps:

1. Import the chordonomicon data set, drop all columns except for 'id' and 'chords'
2. Remove section marker info from chords i.e. remove \<intro_1\>
3. For each song, convert each chord into a vector, then concatenate them into a matrix 

In [86]:
# importing basic packages
import numpy as np
import pandas as pd
import matplotlib as plt
import ast

# read in the data set
df = pd.read_csv('../data/chordonomicon.csv', low_memory=False)

In [87]:
# Read the mapping CSV file
chord_relations = pd.read_csv('../data/chords_mapping.csv')

# Create a dictionary with keys the "chords" and values the "degrees"
chord_degrees = dict(zip(chord_relations['Chords'], chord_relations['Degrees']))
for key, value in chord_degrees.items():
    chord_degrees[key] = ast.literal_eval(value)
    
# full list of chords from the chords_mapping csv
known_chords = list(chord_degrees.keys())
assert(len(known_chords) == len(set(known_chords))) # Validating no duplicates

# some examples of what the string labels for known chords look like
print(known_chords[0:10])

['C7', 'Cmaj7', 'C9', 'Cmaj9', 'Cmajs9', 'Cb9', 'Cb79', 'Cb7b9', 'C7b9', 'C7sus2']


In [88]:
# some examples of what chords look like in this data file
print("Number of known chords: ",len(chord_degrees))
print("C major: \t",chord_degrees['C'])
print("C major 7: \t",chord_degrees['Cmaj7'])
print("C minor: \t",chord_degrees['Cmin'])

Number of known chords:  2793
C major: 	 [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]
C major 7: 	 [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1]
C minor: 	 [1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]


In [89]:
# drop all columns except for chords and genres
chord_data = df[['chords','genres']]
chord_data.sample(5)

Unnamed: 0,chords,genres
265891,<intro_1> Bmin Emin Bmin Emin Bmin G D C B Emi...,
345994,Amin E7 Amin G7 C Amin E G7 C Amin E7 Amin,
296514,<verse_1> G As F <verse_2> G As F G As F G As ...,"iskelma""suomi rock"
127926,<intro_1> Bsus2 Bmin A G G/Fs Emin Bmin <verse...,"alternative metal""nu metal""post-grunge"
132075,<chorus_1> F G Emin F Dmin7 G Dmin7 G F G Emin...,nz indie


In [90]:
# replacing spaces with commas
def replace_space_with_comma(my_string):
    return my_string.replace(" ",",")

# replacing spaces with commons in all chords in all rows of the data
chord_data.loc[:,'chords'] = chord_data['chords'].apply(replace_space_with_comma)
chord_data.sample(5)

Unnamed: 0,chords,genres
501949,"G,C,Cadd9,G,C,Cadd9,G,C,Cadd9,G,C,Cadd9,D,G,C,...",
410491,"Bmin,Fmin,G,A,Emin,G,A,Emin,G,A,Bmin,Fmin,G,A,...","banda""corrido""musica mexicana""norteno""sierreno"
531578,"C,D,E,F,C,D,E,C,D,E,F,G",garage rock
352609,"G,D7,G,G7,C,G,D7,G,D7,G,D7,G,G7,C,G,D7,G,D7,G,...","classic country pop""nashville sound""traditiona..."
616994,"D,Dmaj7,D,G,D,Dmaj7,D,G,D,Dmaj7,D,G,D,Dmaj7,D,...",


In [91]:
# Remove section markers
def remove_section_markers(my_string):
    result = []
    i = 0
    n = len(my_string)
    while i < n:
        if my_string[i] == '<':
            # Skip until after the following ", "
            j = my_string.find('>', i)
            if j == -1:
                break  # no closing '>', stop
            i = j + 2  # skip '>,' and the space
        else:
            result.append(my_string[i])
            i += 1
    assert('<' not in result)
    assert('>' not in result)
    return ''.join(result)

chord_data.loc[:,'chords'] = chord_data['chords'].apply(remove_section_markers)
chord_data.sample(5)

Unnamed: 0,chords,genres
354191,"C,Dmin,C,G,C,Emin,F,Fmin,C,Dmin,Fmin,C,G,C,Emi...",
30650,"F,G,Gmin,F,G,Gmin,C,F,As,F,Gmin,Dmin,Gmin,F,G,...","canzone d""classic italian pop""italian adult pop"
428091,"Amin,Emin,Dmin,Emin,Amin,Emin,Dmin,Emin,Amin,E...",
420604,"Ab,Eb7,Ab,Eb7,Ab,Eb7,Ab,Eb7,Ab,Eb7,Ab,Eb7,Ab,E...","musica mexicana""norteno"
179235,"Amin,F,Emin,G,C,Amin,F,Emin,G,C,Amin,F,Emin,G,...","classic texas country""modern southern rock""red..."


In [92]:
# Removing inversions
def remove_inversions(my_string):
    result = []
    i = 0
    n = len(my_string)
    while i < n:
        if my_string[i] == '/':
            # Skip until after the following ", "
            j = my_string.find(',', i)
            if j == -1:
                break  # no closing comma, stop
            i = j  # skip comma
        else:
            result.append(my_string[i])
            i += 1
    return ''.join(result)

In [93]:
# get some sample chords with inversions
songs_with_inversions = chord_data.loc[['/' in ch for ch in chord_data.chords]]
songs_with_inversions.sample(10)

# just a basic test on a random chord sequence with some inversions
n = 3
my_sample = songs_with_inversions.sample(5)
for i in range(n):
    s = songs_with_inversions.iloc[i].chords
    print(s)
    print()
    print(remove_inversions(s))
    print()
    print()

E,D,A/Cs,E,D,A/Cs,E,D,A/Cs,E,D,A/Cs,E,D,A/Cs,E,D,A,C,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,D,A/Cs,E,D,A/Cs,E,D,A/Cs,E,D,A/Cs,E,D,A,C,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,C,G,D,E,C,G,D,E,C,G,D,C,D,E,G,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,C,G,D,E,C,G,D,E,C,G,D,C,D,E

E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,C,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,C,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,C,G,D,E,C,G,D,E,C,G,D,C,D,E,G,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,C,G,D,E,C,G,D,E,C,G,D,C,D,E


D,Dmaj7,D,Dmaj7,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,Emin,A,D,G,Emin,A,D,G,D/Fs,Emin,A,D,G,D/Fs,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,Emin,A,D,G,Emin,A,D,G,D/Fs,Emin,A,D,G,D/Fs,Emin,A,D,G,Emin,A,D,G,D/Fs,Emin,A,D,G,D/Fs,Emin,A,D,G,D/Fs,Emin,A,D,Emin,A,D,G,Emin,A,D,G,D/Fs,Emin,A,D,G,D/Fs,Emin,A,D,G,Emin,A,D,G,D/Fs,Emin,A,D,G,D/Fs,Emin,A,D,G,D/Fs,Emin,A,D,G,Emin,A,D,G,D/Fs

D,Dmaj7,D,Dmaj7,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,Emin,A,

In [94]:
# remove inversions from the whole data set
chord_data.loc[:,'chords'] = chord_data['chords'].apply(remove_inversions)
chord_data.sample(5)

Unnamed: 0,chords,genres
630961,"E,Csmin,Gsmin,B,E,Csmin,Gsmin,A,E,Gsmin,Csmin,...",
274344,"F,C,F,C,F,C,F,C,F,C,F,G,Amin,F,Emin,Dmin,G,C,A...","alternative emo""dreamo""emo""stl indie"
198195,"D,A,G,D,A,G,D,A,G,D,Fsmin7,Bmin,G,Dadd11,D,Dad...",
511786,"Fs,B,Fs,Cs,B,Fs,Fs7,B,Fs,Cs,B,Fs,B,Fs,Cs,B,Fs,...",
291485,"Emin,B7sus4,B7,Bdim7,Asus4,Amin,Cmin,Cmin,Gsus...",


In [95]:
# compile a list of all chords in the data set
list_of_chord_lists = list(chord_data.chords)
giant_chord_string = ','.join(list_of_chord_lists)
data_set_chords = list(set(giant_chord_string.split(','))) # converting to a set as an intermediate step will get rid of duplicates
assert(len(data_set_chords) == len(set(data_set_chords))) # validating no duplicates

In [96]:
data_set_chords.remove('') # I still don't understand why the empty string ends up in here after what I'm doing above, but it does, so this gets rid of it. This is a very hacky solution.

In [97]:
print(list_of_chord_lists[0:5])
print()
print(giant_chord_string[0:200])
print()
print(len(data_set_chords))
print(data_set_chords[0:20])

['C,F,C,E7,Amin,C,F,C,G7,C,F,C,E7,Amin,C,F,G7,C,F,C,E7,Amin,C,F,C,G7,C,F,C,E7,Amin,C,F,G7,C,F,C,F,C,G,C,F,C,E7,Amin,C,F,G7,C,D,G,D,G,D,A,D,G,D,Fs7,Bmin,D,G,A7,D,G,A7,D', 'E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,C,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,C,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,C,G,D,E,C,G,D,E,C,G,D,C,D,E,G,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,C,G,D,E,C,G,D,E,C,G,D,C,D,E', 'D,Dmaj7,D,Dmaj7,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,Emin,A,D,G,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,Emin,A,D,G,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,Emin,A,D,G,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,Emin,A,D,G,D', 'C,G,C,G,C,F,Dmin,G,Dmin,G,C,G,C,F,Dmin,G,Dmin,G,C,F,Dmin,G,Dmin7,G,C,G,C,F,Dmin,G,Dmin,G,C,F,Dmin,G,Dmin,G,C', 'C,G,C,G,C,G,C,G,C,Bmin,Emin,Amin,D,G,C,D,G,C,D,G,D,F,E,Amin,D,G,C,D,G,C,D,G,D,F,E,Amin,D,C

In [98]:
# list of all chords in the data set which are not in the chords_mapping csv file, should be basically zero
mystery_chords = list(set(data_set_chords).difference(set(known_chords)))
assert(len(mystery_chords) == len(set(mystery_chords))) # validating no duplicates
print(len(mystery_chords))
print(mystery_chords)

0
[]


In [99]:
# function to convert a string of comma-separated chords into a matrix, where each column denotes a chord
def string_to_chord_matrix(chord_sequence):
    # split sequence over commas, ignoring any "empty string" chords
    chord_list = [c for c in chord_sequence.split(',') if c != '']
    
    # then look up each chord in chord_degrees dictionary by the key value
    return np.array([chord_degrees[c][::-1] for c in chord_list]).transpose()

# visualizing the output for a sample of a few songs
n = 3
my_sample = chord_data.sample(3)
np.set_printoptions(linewidth=400)
for i in range(n):
    s = chord_data.iloc[i].chords
    print(s)
    print()
    print(string_to_chord_matrix(s))
    print()
    print()

C,F,C,E7,Amin,C,F,C,G7,C,F,C,E7,Amin,C,F,G7,C,F,C,E7,Amin,C,F,C,G7,C,F,C,E7,Amin,C,F,G7,C,F,C,F,C,G,C,F,C,E7,Amin,C,F,G7,C,D,G,D,G,D,A,D,G,D,Fs7,Bmin,D,G,A7,D,G,A7,D

[[0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
 [0 1 0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 0 1 0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 0 1 0 1 0 1 1 1 0 1 0 0 1 0 1 1 0 1 1]
 [0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 1 0 0 1 0 1 1 1 0 1 0 0 1 0 1 1 0 1 0 0 1 0 1 1 1 0 1 0 0 1 0 1 1 0 1 0 1 1 1 0 1 0 0 1 0 1 1 0 1 0 1 0 0 0 1 0 0 0 0 1 1 0 1 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 1 0 1 0 1 1 1 1 0 0 1 0 0 1]
 [0 1 0 0 0

In [100]:
# remove inversions from the whole data set
chord_data.insert(loc = 2,
                  column = 'chord_matrix',
                  value = chord_data['chords'].apply(string_to_chord_matrix),
                  allow_duplicates = False)
chord_data.sample(5)

Unnamed: 0,chords,genres,chord_matrix
555438,"Bmin,A,Bmin,A,Bmin,A,Bmin,A,Bmin,A,Bmin,A,Bmin...",,"[[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,..."
249673,"E,A,B,A,E,A,B,A,E,A,B,A,E,A,B,A,E,A,E,B,A,E,A,...",,"[[1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,..."
242820,"Fsmin,A,Fsmin,A,Fsmin,A,Fsmin,A,Fsmin,A,Fsmin,...",,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
245066,"C,E,Amin,G,F,G,C,Amin,F,G,C,Amin,F,Dmin,G,C,E,...","american folk revival""canadian indigenous musi...","[[0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,..."
94704,"Amin,G,Amin,G,Amin,G,Amin,F,G,Amin,F,G,Amin,F,...",gospel,"[[0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,..."


In [128]:
# Illustration of how to transpose, in vector/matrix form
# For a vector chord, transposing is just a matter of cyclically permuting the vector
# It seems the easiest way to do this is using deque objects
# deque = "double-ended queue"
from collections import deque 

def transpose_chord_up(chord_vector, num_semitones):
    # transpose the input chord_vector up by num_semitones
    d = deque(chord_vector)
    d.rotate(num_semitones)
    return(list(d))

Cmaj_vec = chord_degrees['C']
Dmaj_vec = chord_degrees['D']
print("C major:\t\t",Cmaj_vec)
print("D major:\t\t",Dmaj_vec)
print("C major transposed up 2:",transpose_chord_up(Cmaj_vec,2))

C major:		 [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]
D major:		 [0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0]
C major transposed up 2: [0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0]
