The goal of this notebook is to convert the chordonomicon data set so that each song becomes a matrix in which each column records a single chord. 

Steps:

1. Import the chordonomicon data set, drop all columns except for 'chords' and 'genres'
2. Remove section markers (e.g. \<intro_1\>) and inversions (the part of a chord from '/' onwards) from the chord strings
3. For each song, convert each chord into a vector, then concatenate them into a matrix 

In [3]:
# importing basic packages
import numpy as np
import pandas as pd
import matplotlib as plt
import ast

# read in the data set
# NOTE: the path is relative to the current working directory; if the CSV is not
# present at ../data/chordonomicon.csv this call raises FileNotFoundError.
df = pd.read_csv('../data/chordonomicon.csv', low_memory=False)

FileNotFoundError: [Errno 2] No such file or directory: '../data/chordonomicon.csv'

In [None]:
# Read the mapping CSV file
chord_relations = pd.read_csv('../data/chords_mapping.csv')

# Validate that no chord label is listed twice. This has to be checked on the raw
# column *before* building the dictionary: dictionary keys are unique by construction,
# so a duplicate check on the keys afterwards could never fail.
duplicated_labels = chord_relations.loc[chord_relations['Chords'].duplicated(), 'Chords'].tolist()
if duplicated_labels:
    raise ValueError(f"Duplicate chord labels in chords_mapping.csv: {duplicated_labels}")

# Create a dictionary with keys the "chords" and values the "degrees".
# The degrees are stored in the CSV as the text of a Python literal, so they are
# parsed with ast.literal_eval (which only evaluates literals, not arbitrary code).
chord_degrees = {
    chord: ast.literal_eval(degrees)
    for chord, degrees in zip(chord_relations['Chords'], chord_relations['Degrees'])
}

# full list of chords from the chords_mapping csv
known_chords = list(chord_degrees)

# some examples of what the string labels for known chords look like
print(known_chords[0:10])

In [None]:
# a few example entries of the chord -> degrees dictionary built from the mapping file
print("Number of known chords: ", len(chord_degrees))
for caption, chord_label in (
    ("C major: \t", 'C'),
    ("C major 7: \t", 'Cmaj7'),
    ("C minor: \t", 'Cmin'),
):
    print(caption, chord_degrees[chord_label])

In [None]:
# drop all columns except for chords and genres
# .copy() makes chord_data an independent DataFrame rather than a selection of df;
# the later in-place writes (chord_data.loc[:, 'chords'] = ..., chord_data.insert(...))
# would otherwise be chained assignments on a slice (SettingWithCopyWarning).
chord_data = df[['chords', 'genres']].copy()
# display five random rows as a spot check
chord_data.sample(5)

In [None]:
# replacing spaces with commas
def replace_space_with_comma(my_string):
    """Return my_string with every single space character turned into a comma."""
    # splitting on an explicit " " keeps empty fields, so runs of spaces
    # become runs of commas, exactly one comma per space
    pieces = my_string.split(" ")
    return ",".join(pieces)

# replacing spaces with commas in all chords in all rows of the data
chord_data.loc[:,'chords'] = chord_data['chords'].apply(replace_space_with_comma)
# display five random rows as a spot check
chord_data.sample(5)

In [None]:
# Remove section markers
def remove_section_markers(my_string):
    """Return my_string with every '<...>' section marker cut out.

    Along with each marker, the single character that follows its closing '>'
    is dropped as well (in the comma-separated chord strings this is the
    separating comma). If a '<' has no matching '>', everything from that '<'
    to the end of the string is discarded. A marker at the very end of the
    string therefore leaves the preceding comma in place.
    """
    kept_pieces = []
    cursor = 0
    while True:
        marker_start = my_string.find('<', cursor)
        if marker_start == -1:
            # no further markers: keep the remainder of the string
            kept_pieces.append(my_string[cursor:])
            break
        # keep the text between the previous marker and this one
        kept_pieces.append(my_string[cursor:marker_start])
        marker_end = my_string.find('>', marker_start)
        if marker_end == -1:
            break  # no closing '>', stop
        cursor = marker_end + 2  # resume after '>' and the character following it
    cleaned = ''.join(kept_pieces)
    # sanity checks: no marker delimiters may survive
    assert '<' not in cleaned
    assert '>' not in cleaned
    return cleaned

# strip the section markers from the chord string of every row
chord_data.loc[:,'chords'] = chord_data['chords'].apply(remove_section_markers)
# display five random rows as a spot check
chord_data.sample(5)

In [None]:
# Removing inversions
def remove_inversions(my_string):
    """Return my_string with the inversion part of every chord removed.

    The string is treated as comma-separated chords; within each chord,
    everything from the first '/' onwards is dropped. The commas themselves
    are kept, so the number of comma-separated fields does not change.
    """
    root_chords = []
    for chord in my_string.split(','):
        # partition returns the text before the first '/', or the whole
        # chord when it contains no '/'
        root, _, _ = chord.partition('/')
        root_chords.append(root)
    return ','.join(root_chords)

In [None]:
# get the songs whose chord string contains at least one inversion (a '/')
songs_with_inversions = chord_data.loc[['/' in ch for ch in chord_data.chords]]

# just a basic test on a random sample of chord sequences with some inversions:
# print each sampled sequence before and after remove_inversions.
# The loop iterates over the sample itself, so different songs are shown on each run
# (indexing songs_with_inversions.iloc[i] would always show the first n rows instead).
n = 3
my_sample = songs_with_inversions.sample(n)
for s in my_sample.chords:
    print(s)
    print()
    print(remove_inversions(s))
    print()
    print()

In [None]:
# remove inversions from the whole data set
chord_data.loc[:,'chords'] = chord_data['chords'].apply(remove_inversions)
# display five random rows as a spot check
chord_data.sample(5)

In [None]:
# compile a list of all distinct chord labels that occur anywhere in the data set
list_of_chord_lists = list(chord_data['chords'])
# gluing all songs together with commas gives one long comma-separated chord string
giant_chord_string = ','.join(list_of_chord_lists)
# going through a set collapses repeated labels
data_set_chords = [*set(giant_chord_string.split(','))]
assert len(set(data_set_chords)) == len(data_set_chords)  # validating no duplicates

In [None]:
# Drop the empty-string "chord". An empty string shows up whenever a song's chord string
# has an empty comma-separated field; for instance remove_section_markers leaves a
# trailing comma when a song ends with a section marker ("C,<outro_1>" -> "C,").
# NOTE(review): presumably that is the source in this data set - confirm against the data.
# Filtering instead of list.remove('') keeps this cell safe to re-run: list.remove
# raises ValueError once the empty string is no longer in the list.
data_set_chords = [chord for chord in data_set_chords if chord != '']

In [None]:
# preview of the intermediate objects: the first few songs, the start of the joined
# string, and the number of distinct chords together with a few of them
print(list_of_chord_lists[:5], end='\n\n')
print(giant_chord_string[:200], end='\n\n')
print(len(data_set_chords))
print(data_set_chords[:20])

In [None]:
# chords that occur in the data set but have no entry in the chords_mapping csv file;
# this list should be basically empty
unmapped = set(data_set_chords) - set(known_chords)
mystery_chords = list(unmapped)
assert len(set(mystery_chords)) == len(mystery_chords)  # validating no duplicates
print(len(mystery_chords))
print(mystery_chords)

In [None]:
# function to convert a string of comma-separated chords into a matrix, where each column denotes a chord
def string_to_chord_matrix(chord_sequence):
    """Build a numpy array whose columns are the chords of chord_sequence.

    chord_sequence is split on commas and empty fields are skipped. Each
    remaining label is looked up in the module-level chord_degrees dictionary
    (an unknown label raises KeyError), its degrees sequence is reversed, and
    the reversed sequences are stacked as rows and then transposed so that each
    chord ends up as one column.
    """
    chord_rows = []
    for label in chord_sequence.split(','):
        if label == '':
            continue  # ignore "empty string" chords
        chord_rows.append(chord_degrees[label][::-1])
    return np.array(chord_rows).T

# visualizing the output for a random sample of a few songs:
# print each sampled chord string followed by its chord matrix.
# The loop iterates over the sample itself (indexing chord_data.iloc[i] would always
# show the first n rows of the data set rather than the sampled songs).
n = 3
my_sample = chord_data.sample(n)
# wide print width so that a long chord matrix is not wrapped across several lines
np.set_printoptions(linewidth=400)
for s in my_sample.chords:
    print(s)
    print()
    print(string_to_chord_matrix(s))
    print()
    print()

In [None]:
# add a 'chord_matrix' column holding the matrix form of every song's chord string
# loc = 2 places the new column at position 2, i.e. after the 'chords' and 'genres' columns.
# With allow_duplicates = False, pandas raises a ValueError if a 'chord_matrix' column
# already exists, so re-running this cell on the same chord_data fails.
chord_data.insert(loc = 2,
                  column = 'chord_matrix',
                  value = chord_data['chords'].apply(string_to_chord_matrix),
                  allow_duplicates = False)
# display five random rows as a spot check
chord_data.sample(5)

In [None]:
# Illustration of how to transpose, in vector/matrix form
# For a vector chord, transposing is just a matter of cyclically permuting the vector
# It seems the easiest way to do this is using deque objects
# deque = "double-ended queue"
from collections import deque 

def transpose_chord_up(chord_vector, num_semitones):
    """Return chord_vector cyclically shifted by num_semitones positions, as a new list.

    A positive num_semitones moves every entry that many positions towards the
    end of the vector, wrapping around to the front; a negative value shifts in
    the opposite direction. The input is not modified.
    """
    entries = list(chord_vector)
    if not entries:
        return entries  # nothing to rotate
    # reduce the shift to the range 0 .. len-1 (also maps negative shifts correctly)
    shift = num_semitones % len(entries)
    # the last `shift` entries wrap around to the front
    return entries[-shift:] + entries[:-shift]

# demo: print the C major and D major vectors from the mapping next to
# C major shifted by 2 with transpose_chord_up, so the three can be compared by eye
Cmaj_vec, Dmaj_vec = chord_degrees['C'], chord_degrees['D']
Cmaj_up_2 = transpose_chord_up(Cmaj_vec, 2)
for caption, vector in (
    ("C major:\t\t", Cmaj_vec),
    ("D major:\t\t", Dmaj_vec),
    ("C major transposed up 2:", Cmaj_up_2),
):
    print(caption, vector)