The goal of this notebook is to convert the chordonomicon data set so that each song becomes a matrix in which each column records a single chord. 

Steps:

1. Import the chordonomicon data set, drop all columns except for 'chords' and 'genres'
2. Remove section marker info from chords i.e. remove \<intro_1\>
3. For each song, convert each chord into a vector, then concatenate them into a matrix 

In [4]:
# importing basic packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt  # the plotting API lives in the pyplot submodule, not in the top-level matplotlib package
import ast

# read in the data set
df = pd.read_csv('../data/chordonomicon.csv', low_memory=False)

In [5]:
# Read the mapping CSV file
chord_relations = pd.read_csv('../data/chords_mapping.csv')

# Validate that no chord label occurs twice. This has to be checked on the CSV column itself:
# dictionary keys are unique by construction, so duplicated labels would be silently collapsed below.
assert chord_relations['Chords'].is_unique, "chords_mapping.csv contains duplicate chord labels"

# Create a dictionary with keys the "chords" and values the "degrees".
# ast.literal_eval parses each 'Degrees' entry from its textual form into a Python literal.
chord_degrees = {
    chord: ast.literal_eval(degrees)
    for chord, degrees in zip(chord_relations['Chords'], chord_relations['Degrees'])
}

# full list of chords from the chords_mapping csv
known_chords = list(chord_degrees.keys())

# some examples of what the string labels for known chords look like
print(known_chords[0:10])

['C7', 'Cmaj7', 'C9', 'Cmaj9', 'Cmajs9', 'Cb9', 'Cb79', 'Cb7b9', 'C7b9', 'C7sus2']


In [6]:
# a few example entries of the chord -> degree-vector mapping
print("Number of known chords: ",len(chord_degrees))
for label, chord_name in (("C major: \t", 'C'),
                          ("C major 7: \t", 'Cmaj7'),
                          ("C minor: \t", 'Cmin')):
    print(label, chord_degrees[chord_name])

Number of known chords:  2793
C major: 	 [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]
C major 7: 	 [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1]
C minor: 	 [1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0]


In [7]:
# drop all columns except for chords and genres
# .copy() makes chord_data an independent frame, so the column updates in the following cells
# modify chord_data itself rather than a selection taken from df (avoids SettingWithCopyWarning)
chord_data = df[['chords','genres']].copy()
chord_data.sample(5)

Unnamed: 0,chords,genres
298381,<intro_1> E A B A Amin E <verse_1> A B A Amin ...,"modern alternative rock""modern rock""pop""pov: i..."
193483,<verse_1> G D Cadd9 G D G D Cadd9 G D <chorus_...,"charlottesville indie""southern americana"
529011,Amin E Amin E F Amin F Amin E Amin E F Amin F ...,"malaysian indie""malaysian punk"
30554,<verse_1> Emin C B Emin C B Csmin A B Gs/C Csm...,"north carolina emo""screamo"
469087,D Bmin D Bmin D Bmin D Bmin G A D Bmin D Bmin ...,"art pop""chamber pop""indie rock""melancholia""new..."


In [8]:
def replace_space_with_comma(my_string):
    """Return my_string with every space character replaced by a comma."""
    # splitting on a single explicit space keeps empty pieces, so consecutive
    # spaces turn into consecutive commas, exactly as a character replacement would
    return ",".join(my_string.split(" "))

# convert the chord string of every row from space-separated to comma-separated
chord_data.loc[:,'chords'] = chord_data['chords'].map(replace_space_with_comma)
chord_data.sample(5)

Unnamed: 0,chords,genres
398762,"G,C,D,G,C,D,G,C,G,D,G,C,D,C,D,G,C,D,G,C,G,D,G,...","art punk""oc indie"
210984,"<verse_1>,Amin,Emin,Amin,Emin,C,G,Amin,E,<chor...",
167514,"<intro_1>,C,Amin,Emin,<verse_1>,C,Amin,Emin,C,...",world
324439,"<intro_1>,A,E,G,D,A,E,G,D,<chorus_1>,A,E,Esus4...","gyerekdalok""hungarian folk"
363605,"G,Emin,D,C,G,Emin,D,C,G,D,Emin,C,G,D,C,G,Emin,...","roots americana""southern americana"


In [9]:
def remove_section_markers(my_string):
    """Strip section markers such as <intro_1> from a comma-separated chord string.

    Each marker is removed together with the single character that follows its
    closing '>' (the separating comma in well-formed input). Behaviors worth knowing:

    * a marker at the very end of the string leaves the comma in front of it
      behind, so the result then ends with a trailing comma;
    * if a '<' has no matching '>', everything from that '<' onward is dropped;
    * an AssertionError is raised if a '>' survives in the result.
    """
    kept_pieces = []
    cursor = 0
    while True:
        marker_start = my_string.find('<', cursor)
        if marker_start == -1:
            # no further markers: keep the remainder of the string
            kept_pieces.append(my_string[cursor:])
            break
        kept_pieces.append(my_string[cursor:marker_start])
        marker_end = my_string.find('>', marker_start)
        if marker_end == -1:
            break  # unterminated marker, discard the rest
        cursor = marker_end + 2  # jump past '>' and the character right after it
    cleaned = ''.join(kept_pieces)
    assert '<' not in cleaned
    assert '>' not in cleaned
    return cleaned

# strip the section markers from the chord string of every row
chord_data.loc[:,'chords'] = chord_data['chords'].map(remove_section_markers)
chord_data.sample(5)

Unnamed: 0,chords,genres
67231,"Dmin,A7,Dmin,A,Dmin,A,Dmin,Gmin,Dmin,A7,D,Gmin...",classic belgian pop
655674,"Dmaj911s,Dmaj9,Dsus2,Fsmin,Asus2/E,Dmaj911s,Dm...",
279236,"C,F,C,F,C,F,C,F,C,F,C,F,C,F,C,F,C,F,C,F,C,F,C,...",japanese indie folk
437786,"F,Fmin,Cs,Ds,Gs,Fmin,Cs,Ds,C,Fmin,Cs,Ds,Cmin,F...",
97720,"A,Bmin,A,Fsmin,E,Fsmin,A,Fs,Fsmin,Fs,F,A,Fsmin...","liverpool indie""modern alternative rock""modern..."


In [10]:
def remove_inversions(my_string):
    """Drop the inversion part of every chord in a comma-separated chord string.

    Within each comma-separated entry, everything from the first '/' onward is
    discarded (e.g. 'Gmaj7/B' becomes 'Gmaj7'). Entries without a '/' and the
    commas themselves are left untouched.
    """
    return ','.join(entry.split('/', 1)[0] for entry in my_string.split(','))

In [11]:
# select the songs whose chord string contains at least one inversion ('/')
songs_with_inversions = chord_data.loc[chord_data['chords'].str.contains('/', regex=False)]

# just a basic test on a few random chord sequences with some inversions:
# print each sequence as stored, followed by the same sequence with inversions removed
n = 3
my_sample = songs_with_inversions.sample(n)
for i in range(n):
    s = my_sample.iloc[i].chords
    print(s)
    print()
    print(remove_inversions(s))
    print()
    print()

B,G,A,B,A/B,D/E,A/B,D,E,A/B,D/E,G,A,B,G,B,G,A,B,A/B,D/E,A/B,D,E,A/B,D/E,G,A,B,G,B,D,A,B,D,G,A,B

B,G,A,B,A,D,A,D,E,A,D,G,A,B,G,B,G,A,B,A,D,A,D,E,A,D,G,A,B,G,B,D,A,B,D,G,A,B


Fs7,Bmin7,Fs7,Bmin7,Gmaj7/B,Dadd13,Fs7,Bmin7,Gmaj7/B,Dadd13,Fs7,Gmaj7/B,Dadd13,Fs7,Bmin,Gmaj7/B,Dadd13,Fs7,Bmin,Gmaj7/B,Dadd13,Fs7,Bmin,Fsmajs9,Bmin7,Fs7,Bmin7,Gmaj7/B,Dadd13,Fs7,Bmin7,Gmaj7,A,Fs7,Bmin,Gmaj7/B,Dadd13,Fs7,Bmin,Gmaj7/B,Dadd13,Fs7,Bmin,Amaj7,Fsmin9,Amaj7

Fs7,Bmin7,Fs7,Bmin7,Gmaj7,Dadd13,Fs7,Bmin7,Gmaj7,Dadd13,Fs7,Gmaj7,Dadd13,Fs7,Bmin,Gmaj7,Dadd13,Fs7,Bmin,Gmaj7,Dadd13,Fs7,Bmin,Fsmajs9,Bmin7,Fs7,Bmin7,Gmaj7,Dadd13,Fs7,Bmin7,Gmaj7,A,Fs7,Bmin,Gmaj7,Dadd13,Fs7,Bmin,Gmaj7,Dadd13,Fs7,Bmin,Amaj7,Fsmin9,Amaj7


D,A,G,D,Bmin,A,Bmin,A,G,D,Bmin,Fsmin,G,A,D,A,G,D/A,Bmin,A,D,A,G,D,Bmin,A,G,A,D,A,Bmin,A,Bmin,A,Bmin,A,G,D,Bmin,Fsmin,G,A,D,A,G,D/A,Bmin,A,D,A,G,D,Bmin,A,G,A,G,A,D,Bmin,A,G,D,A,G,D/A,Bmin,A,D,A,G,D,Bmin,A,G,A,D,A,D,A,G,D,Bmin,A

D,A,G,D,Bmin,A,Bmin,A,G,D,Bmin,Fsmin,G,A,D,A,G,D,Bmin,A,D,A,G,D,Bmin,A,G

In [12]:
# strip the inversions from the chord string of every row
chord_data.loc[:,'chords'] = chord_data['chords'].map(remove_inversions)
chord_data.sample(5)

Unnamed: 0,chords,genres
618078,"Amin,Dmin,E7,F,Amin,Dmin,E7,F,Amin,Dmin,E7,F,A...",
432249,"G,D,C,A,G,C,D,G,C,D,G,D,C,G,B,B7,Emin,C,D,G,B7...",
520904,"A,Fsmin,D,A,E,Fsmin,D,E,D,E,Fsmin,D,E,D,E,D,Fs...","metalcore""pixie""screamo"
433627,"D,C,G,D,C,G,D,C,D,C,D,C,D,C,D,C,D,C,G,D",roots worship
636519,"G,C,Amin,D,A,E,G,Bmin,Amin,D,G,Bmin,Amin,Dsus4...",


In [13]:
# collect every distinct chord label that occurs anywhere in the data set
list_of_chord_lists = chord_data.chords.tolist()
giant_chord_string = ','.join(list_of_chord_lists)
all_chord_tokens = giant_chord_string.split(',')
data_set_chords = list(set(all_chord_tokens)) # going through a set gets rid of duplicates
assert len(data_set_chords) == len(set(data_set_chords)) # validating no duplicates

In [14]:
# Drop the empty-string "chord". Splitting on ',' yields '' wherever a chord string has a leading,
# trailing or doubled comma; for example, remove_section_markers leaves a trailing comma behind
# when a song ends with a section marker.
# Filtering (instead of list.remove) does not raise when no empty string is present, so the cell can be re-run.
data_set_chords = [chord for chord in data_set_chords if chord != '']

In [15]:
# quick look at the intermediate objects built above:
# the chord strings of the first five songs
print(list_of_chord_lists[0:5])
print()
# the first 200 characters of all chord strings joined together
print(giant_chord_string[0:200])
print()
# the number of distinct chord labels in the data set, and the first twenty of them
print(len(data_set_chords))
print(data_set_chords[0:20])

['C,F,C,E7,Amin,C,F,C,G7,C,F,C,E7,Amin,C,F,G7,C,F,C,E7,Amin,C,F,C,G7,C,F,C,E7,Amin,C,F,G7,C,F,C,F,C,G,C,F,C,E7,Amin,C,F,G7,C,D,G,D,G,D,A,D,G,D,Fs7,Bmin,D,G,A7,D,G,A7,D', 'E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,C,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,D,A,E,D,A,E,D,A,E,D,A,E,D,A,C,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,C,G,D,E,C,G,D,E,C,G,D,C,D,E,G,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,G,D,A,E,G,D,A,E,G,D,A,C,D,E,C,G,D,E,C,G,D,E,C,G,D,C,D,E', 'D,Dmaj7,D,Dmaj7,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,Emin,A,D,G,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,G,Emin,A,D,Emin,A,D,G,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,Emin,A,D,G,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,D,Emin,A,D,G,Emin,A,D,G,D', 'C,G,C,G,C,F,Dmin,G,Dmin,G,C,G,C,F,Dmin,G,Dmin,G,C,F,Dmin,G,Dmin7,G,C,G,C,F,Dmin,G,Dmin,G,C,F,Dmin,G,Dmin,G,C', 'C,G,C,G,C,G,C,G,C,Bmin,Emin,Amin,D,G,C,D,G,C,D,G,D,F,E,Amin,D,G,C,D,G,C,D,G,D,F,E,Amin,D,C

In [16]:
# chords that occur in the data set but have no entry in the chords_mapping csv file; should be basically zero
mystery_chords = list(set(data_set_chords) - set(known_chords))
assert len(mystery_chords) == len(set(mystery_chords)) # validating no duplicates
print(len(mystery_chords))
print(mystery_chords)

0
[]


In [17]:
def string_to_chord_matrix(chord_sequence):
    """Convert a comma-separated chord string into a matrix with one column per chord.

    Each chord is looked up in the chord_degrees dictionary and its degree vector
    is stored reversed, so the first entry of the vector ends up in the bottom row
    of the matrix. Empty entries (e.g. from a trailing comma) are ignored.

    Returns a numpy array with one row per vector entry and one column per chord.
    A sequence without any chords yields an empty 12 x 0 integer matrix.
    Raises KeyError if a chord is not present in chord_degrees.
    """
    # split sequence over commas, ignoring any "empty string" chords
    chord_list = [c for c in chord_sequence.split(',') if c != '']

    if not chord_list:
        # np.array([]) would be one-dimensional; keep the result a proper
        # (12 pitch classes) x (0 chords) matrix so callers can still index rows
        return np.zeros((12, 0), dtype=int)

    # then look up each chord in chord_degrees dictionary by the key value
    return np.array([chord_degrees[c][::-1] for c in chord_list]).transpose()

# print a few randomly chosen songs next to their chord matrices
n = 3
my_sample = chord_data.sample(n)
np.set_printoptions(linewidth=400)
for i, s in enumerate(my_sample['chords']):
    print(s, end='\n\n')
    print(string_to_chord_matrix(s), end='\n\n\n')

Emin,Amin,D7,G,Emin,Amin,B7,Emin,Amin,D7,G,Emin,Amin,B7,Emin,Dmin,D7,G,Emin,Amin,B7,Emin,Amin,D7,G,Emin,Amin,B7,Emin,Amin,D7,C,Emin,Amin,B7,Emin,Amin,D7,C,Emin,Amin,B7,Emin,Amin,D7,G,Emin,Amin,B7,Emin,Amin,D7,G,Emin,Amin,B7,Emin,Amin,D7,G,Emin,Amin,B7,Emin,Amin,D7,C,Emin,Amin,B7,Emin,Amin,D7,C,Emin,Amin,B7,Emin,B7,Emin,B7,Emin,B7,Emin,Amin,D7,G,Emin,Amin,B7,Emin,C,B7,Emin

[[1 0 0 ... 0 1 1]
 [0 0 0 ... 0 0 0]
 [0 1 1 ... 0 1 0]
 ...
 [0 0 1 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 1 ... 1 0 0]]


Bmaj7,Csmaj7,Emin,Bmaj7,Csmaj7,Emin,Bbmaj7,Bmaj7

[[1 0 1 1 0 1 0 1]
 [1 0 0 1 0 0 1 1]
 [0 0 0 0 0 0 1 0]
 [0 1 0 0 1 0 0 0]
 [0 0 1 0 0 1 0 0]
 [1 0 0 1 0 0 0 1]
 [0 1 0 0 1 0 1 0]
 [0 0 1 0 0 1 0 0]
 [1 0 0 1 0 0 0 1]
 [0 0 0 0 0 0 1 0]
 [0 1 0 0 1 0 0 0]
 [0 1 0 0 1 0 0 0]]


D,G,D,G,D,G,D,G,D,G,D,G,D,G,D,G,D,G,D,G,D,G,D

[[0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1]
 [0 0 0 0 0 0 0 0 0 0 0 0 0

In [18]:
# add a 'chord_matrix' column holding the chord matrix of every song
# the guard keeps this cell safe to re-run: DataFrame.insert raises a ValueError
# when the column already exists and allow_duplicates is False
if 'chord_matrix' not in chord_data.columns:
    chord_data.insert(loc = 2,
                      column = 'chord_matrix',
                      value = chord_data['chords'].apply(string_to_chord_matrix),
                      allow_duplicates = False)
chord_data.sample(5)

Unnamed: 0,chords,genres,chord_matrix
340649,"Amin,C,F,Amin,C,F,Amin,C,F,Emin,C,Dmin,Emin,F,...",,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0,..."
533982,"Fsmin,Bmin,D,Fsmin,Bmin,Fsmin,Bmin,Csmin,D,B,E...",,"[[0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0,..."
429283,"Fs,E,Fs,Cs,B,Cs,B,Cs,Fs,B,Fs,Cs,Fs,Cs,Eb,B,Fs,...",,"[[0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,..."
560895,"G,C,D,G,C,G,C,D,G,C,D,G,C,D,G,C,G,C,D,G,C,D,G,...",,"[[1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,..."
493532,"C,G,Amin,F,G,Amin,G,Amin,C,F,G,Amin,G,Amin,C,F...",,"[[0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,..."


In [19]:
# Illustration of how to transpose, in vector/matrix form
# For a vector chord, transposing is just a matter of cyclically permuting the vector
# It seems the easiest way to do this is using deque objects
# deque = "double-ended queue"
from collections import deque 

def transpose_chord_up(chord_vector, num_semitones):
    """Return chord_vector cyclically rotated by num_semitones positions, as a list.

    A positive count moves every entry toward higher indices (wrapping around at
    the end); a negative count rotates in the opposite direction. The input
    itself is left unchanged.
    """
    rotated = deque()
    rotated.extend(chord_vector)
    rotated.rotate(num_semitones)
    return list(rotated)

# check: C major rotated up by two semitones should match the stored D major vector
Cmaj_vec, Dmaj_vec = chord_degrees['C'], chord_degrees['D']
print("C major:\t\t",Cmaj_vec)
print("D major:\t\t",Dmaj_vec)
print("C major transposed up 2:",transpose_chord_up(Cmaj_vec,2))

C major:		 [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0]
D major:		 [0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0]
C major transposed up 2: [0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0]


In [20]:
def transpose_matrix_up(chord_matrix, num_semitones):
    """Transpose every chord (column) of chord_matrix up by num_semitones.

    Returns a new array of the same shape; chord_matrix itself is not modified.
    A negative num_semitones transposes down.
    """
    # string_to_chord_matrix stores each chord vector reversed, so the first vector
    # entry sits in the bottom row. Transposing up by n semitones therefore moves
    # every entry n rows toward the top (wrapping around), which is a cyclic roll
    # by -n along the row axis. This shifts all columns at once.
    return np.roll(chord_matrix, -num_semitones, axis=0)

# show a random song, its chord matrix, and the matrix transposed up by a few semitones
# (the transposition is computed and printed twice)
n = 1
my_sample = chord_data.sample(n)
np.set_printoptions(linewidth=400)
for i, s in enumerate(my_sample['chords']):
    s_matrix = string_to_chord_matrix(s)

    semitones = 3
    shifted_matrix = transpose_matrix_up(s_matrix,semitones)

    print(s, end='\n\n')
    print(s_matrix, end='\n\n')
    print(shifted_matrix, end='\n\n')
    print(transpose_matrix_up(s_matrix,semitones), end='\n\n')

F,C,G,Amin,C,G,Dmin,F,C,G,F,C,G,F,C,G,F,C,Amin,F,G,F,C,Amin,F,G,F,C,G,Amin,F,C,Dmin,G,Amin,Emin,Amin,C,D,F,C,G,C,Amin,F,G,F,C,Amin,F,G,F,C,G,Amin,F,C,Dmin,G,Amin,Emin,Amin,C,D,F,C,G,Dmin,F,C,G,Dmin,F,C,G,Amin,C,G,Dmin,F,C,G,F,C,G,F,C,G,F

[[0 0 1 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 1]
 ...
 [0 0 1 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [1 1 0 ... 1 0 1]]

[[0 0 0 ... 0 0 0]
 [0 1 1 ... 1 1 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 1 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 1]]

[[0 0 0 ... 0 0 0]
 [0 1 1 ... 1 1 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 1 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 1]]



In [21]:
# the chord_degrees dictionary flattened into a list of (chord name, degree vector) pairs
# the degree vectors are lists, which are unhashable and so cannot serve as dictionary keys directly
# Actually, I'm not sure this is useful, but I'll keep it here
reverse_chord_degrees = list(chord_degrees.items())
reverse_chord_degrees[:10]

[('C7', [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]),
 ('Cmaj7', [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1]),
 ('C9', [1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0]),
 ('Cmaj9', [1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1]),
 ('Cmajs9', [1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1]),
 ('Cb9', [1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]),
 ('Cb79', [1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0]),
 ('Cb7b9', [1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0]),
 ('C7b9', [1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0]),
 ('C7sus2', [1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0])]

In [22]:
def chord_matrix_to_string(chord_matrix):
    """Convert a chord matrix back into a comma-separated string of chord names.

    Every column of chord_matrix is flipped back into the orientation used by
    chord_degrees and replaced by the first chord name in chord_degrees that has
    that vector. Because several names can share one vector, a round trip
    string -> matrix -> string may return an enharmonic equivalent of the
    original name. Columns that match no known chord are skipped.
    """
    # Build the vector -> name lookup once per call instead of scanning the whole
    # dictionary for every column. setdefault keeps the first name encountered
    # for each vector, i.e. the earliest entry in chord_degrees.
    name_by_vector = {}
    for name, degrees in chord_degrees.items():
        name_by_vector.setdefault(tuple(degrees), name)

    chord_names = []
    # transpose the matrix so that we iterate over chord vectors
    for column in np.transpose(chord_matrix):
        # tolist() turns numpy scalars into plain Python numbers for the lookup
        name = name_by_vector.get(tuple(column[::-1].tolist()))
        if name is not None:
            chord_names.append(name)

    return ','.join(chord_names)

# round trip for a random song: chord string -> chord matrix -> chord string
n = 1
my_sample = chord_data.sample(n)
np.set_printoptions(linewidth=400)
for i, s in enumerate(my_sample['chords']):
    s_matrix = string_to_chord_matrix(s)
    s_again = chord_matrix_to_string(s_matrix)

    print(s, end='\n\n')
    print(s_matrix, end='\n\n')
    print(s_again, end='\n\n')

F,G,C,F,G,C,F,G,C,Amin,D,G,G7,F,G,C,F,G,C,F,G,C,Amin,F,G,C,F,G,C,F,G,C,F,G,C,Amin,D,G,G7,F,G,C,F,G,C,F,G,C,Amin,F,G,C,F,G,C,F,G,C

[[0 1 0 0 1 0 0 1 0 0 0 1 1 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 1 0 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 1 1 0 0 1 0 0 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 1 0 1 1 0 1 1 0 0 1 1 0 1 1 0 1 1 0 1 1 0 0 1 1 0 1 1 0 1 1 0 1 1 0 0 1 1 0 1 1 0 1 1 0 1 1 0 0 1 1 0 1 1 0 1 1]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 1 0 0]
 [0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 1 0 

In [23]:
# transpose a random song in matrix form and convert the result back to chord names,
# so the shifted matrix can be compared against the transposed chord string
n = 1
my_sample = chord_data.sample(n)
np.set_printoptions(linewidth=600)
for i, s in enumerate(my_sample['chords']):
    s_matrix = string_to_chord_matrix(s)

    semitones = 3
    shifted_matrix = transpose_matrix_up(s_matrix,semitones)
    shifted_string = chord_matrix_to_string(shifted_matrix)

    print(s, end='\n\n')
    print(s_matrix, end='\n\n')
    print(shifted_matrix, end='\n\n')
    print(shifted_string, end='\n\n')

Csmin,B,A,E,Gsmin,Fsmin,C7,Csmin,B,A,E,Gsmin,Fsmin,C7,Csmin,B,A,E,Gsmin,Fsmin,C7,Csmin,B,A,E,Gsmin,Fsmin,C7,B,A,Csmin,A,Csmin,A,Csmin,A,B,A,Csmin,A,Csmin,A,Csmin,A,Fsmin,Csmin,B,A,E,Gsmin,Fsmin,C7,Csmin,B,A,E,Gsmin,Fsmin,C7,Csmin,B,A,E,Gsmin,Fsmin,C7,Csmin,B,A,E,Gsmin,Fsmin,C7,B,A,Csmin,A,Csmin,A,Csmin,A,B,A,Csmin,A,Csmin,A,Csmin,A,Fsmin,Csmin,B,A,E,Fsmin,F,Csmin,B,A,E,Gsmin,Fsmin,C7,Csmin,B,A,E,Gsmin,Fsmin,C7,Csmin,B,A,E,Gsmin,Fsmin,C7,Csmin,B,A,E,Gsmin,Fsmin,F

[[0 1 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 1 1]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 1 ... 0 1 0]
 [0 0 0 ... 0 0 1]]

[[1 0 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 [0 1 0 ... 0 1 0]
 ...
 [0 1 0 ... 1 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 1 1]]

Emin,D,C,G,Bmin,Amin,Ds7,Emin,D,C,G,Bmin,Amin,Ds7,Emin,D,C,G,Bmin,Amin,Ds7,Emin,D,C,G,Bmin,Amin,Ds7,D,C,Emin,C,Emin,C,Emin,C,D,C,Emin,C,Emin,C,Emin,C,Amin,Emin,D,C,G,Bmin,Amin,Ds7,Emin,D,C,G,Bmin,Amin,Ds7,Emin,D,C,G,Bmin,Amin,Ds7,Emin,D,C,G,Bmin,Amin,Ds7,D,C,Emin,C,Emin,C,Emin,C,D,C,Emin,C

In [24]:
# names of the 12 pitch classes, indexed by the number of semitones above C
# two spellings of the same pitches: one with sharps ('s' suffix), one with flats ('b' suffix)
note_names_sharps = ['C','Cs','D','Ds','E','F','Fs','G','Gs','A','As','B']
note_names_flats = ['C','Db','D','Eb','E','F','Gb','G','Ab','A','Bb','B']

In [25]:
# some possible features

def total_chords(chord_string):
    """Count the chords in a comma-separated string.

    Empty entries (e.g. from a trailing comma or an empty string) are not
    counted, matching how string_to_chord_matrix treats them.
    """
    return sum(1 for chord in chord_string.split(',') if chord != '')

def unique_chords(chord_string):
    """Count the distinct chords in a comma-separated string, ignoring empty entries."""
    return len({chord for chord in chord_string.split(',') if chord != ''})

def unique_chord_ratio(chord_string):
    """Return the number of distinct chords divided by the total number of chords.

    Returns 0.0 for a string without any chords instead of dividing by zero.
    """
    total = total_chords(chord_string)
    if total == 0:
        return 0.0
    return float(unique_chords(chord_string)) / float(total)

def missing_notes(chord_string):
    """Return how many notes never occur in the song, i.e. the number of all-zero rows of its chord matrix."""
    matrix_rows = string_to_chord_matrix(chord_string)
    return sum(1 for note_row in matrix_rows if not note_row.any())

def note_frequency(chord_string, relative=False):
    """Return, for each of the 12 notes, the number of chords that contain it.

    The result is a list ordered "bottom-up", i.e. the reverse of the row order
    of the chord matrix. With relative=True each count is divided by the number
    of chords in the chord matrix. A string without any chords yields all zeros.
    """
    chord_matrix = np.asarray(string_to_chord_matrix(chord_string))
    if chord_matrix.ndim != 2 or chord_matrix.shape[1] == 0:
        # no chords at all: nothing to count and nothing to divide by
        return [0.0] * 12 if relative else [0] * 12

    # row sums count the chords containing each note; reverse to get "bottom-up"
    counts = chord_matrix.sum(axis=1)[::-1]
    if relative:
        # use the number of matrix columns as denominator so that it always agrees
        # with the chords actually counted (empty entries are not in the matrix)
        counts = counts / chord_matrix.shape[1]
    return list(counts)

def most_common_notes(chord_string, names = True, spelling = 'sharp'):
    """Return the note(s) that appear in the largest number of chords.

    With names == True the notes are returned as names, spelled with sharps
    (spelling == 'sharp') or flats (spelling == 'flat'). In every other case the
    notes are returned as indices 0-11. Ties produce several entries.
    """
    frequency_vector = note_frequency(chord_string)
    max_frequency = np.max(frequency_vector)

    # decide once how a note index is rendered
    if names == True and spelling == 'sharp':
        labels = note_names_sharps
    elif names == True and spelling == 'flat':
        labels = note_names_flats
    else:
        labels = list(range(12))

    return [labels[i] for i in range(12) if frequency_vector[i] == max_frequency]


In [26]:
# more possible features

def drone_ratio(chord_string):
    """Measure how close the song is to having a drone pitch.

    Returns the largest relative note frequency: 1 if the most frequent note
    appears in every chord, 0.5 if it appears in half of the chords, and so on.
    """
    relative_frequencies = note_frequency(chord_string, relative=True)
    return np.max(relative_frequencies)

def sequential_overlap(chord_string):
    """Return, for each pair of consecutive chords, the number of notes they share.

    The result is a list with one entry fewer than there are chords.
    """
    # work on a transposed copy so that every row is one chord vector
    chord_rows = np.transpose(np.copy(string_to_chord_matrix(chord_string)))
    overlaps = np.zeros(shape = len(chord_rows) - 1, dtype=int)
    for position, (current, following) in enumerate(zip(chord_rows, chord_rows[1:])):
        # the dot product of two 0/1 vectors counts the notes present in both
        overlaps[position] = np.dot(current, following)
    return list(overlaps)

def average_sequential_overlap(chord_string):
    """Return the mean number of notes shared by consecutive chords."""
    overlaps = sequential_overlap(chord_string)
    return np.mean(overlaps)

def is_major_triad(chord_vector):
    """Return 1 if chord_vector is a major triad, otherwise 0.

    chord_vector is reversed before the comparison and then checked against all
    12 rotations of the C major vector from chord_degrees.
    """
    root_triad = chord_degrees['C']
    flipped = chord_vector[::-1]
    matches = any(np.array_equal(flipped, transpose_chord_up(root_triad, shift))
                  for shift in range(12))
    return 1 if matches else 0

def major_triads(chord_string):
    """Return a 0/1 list marking which chords of the song are simple major triads."""
    chord_rows = np.transpose(string_to_chord_matrix(chord_string))
    return list(map(is_major_triad, chord_rows))

def major_triad_ratio(chord_string):
    """Return the fraction of the song's chords that are simple major triads."""
    triad_flags = major_triads(chord_string)
    return np.mean(triad_flags)

In [27]:
def transpose_to_most_frequent_note(chord_string, chord_matrix):
    """Transpose chord_matrix so that the song's most frequent note becomes C.

    The most frequent note is determined from chord_string; chord_matrix is the
    matrix that gets shifted. If several notes are tied for most frequent, no
    transposition is done and an explanatory message string is returned instead
    of a matrix (no exception is raised), so callers must check the result type.
    """
    # indices (semitones above C) of the most common note(s)
    candidates = most_common_notes(chord_string, names = False)

    if len(candidates) <= 1:
        assert len(candidates) == 1
        # shift down by the scale degree of the most common note, which moves it to C
        return transpose_matrix_up(chord_matrix, -candidates[0])

    return 'Unable to transpose to most frequent note, tied for most common notes'

In [28]:
# basic visual tests of these metrics
# pick one random song and print every feature defined above for it
my_sample = chord_data.sample(1)
np.set_printoptions(linewidth=600)
s = my_sample.iloc[0].chords
s_matrix = string_to_chord_matrix(s)
# the chord string and its matrix form
print(s)
print()
print(s_matrix)
print()
# chord-count features
print("Total chords:",total_chords(s))
print("Unique chords:",unique_chords(s))
print("Unique chord ratio:",unique_chord_ratio(s))
# note-level features
print("Missing notes:",missing_notes(s))
print("Absolute note frequencies:",note_frequency(s,relative=False))
print("Relative note frequencies:",note_frequency(s,relative=True))
print("Most common notes:",most_common_notes(s, names = True, spelling = 'sharp'))
print("Drone ratio:",drone_ratio(s))
# chord-to-chord features
print("Sequential overlap:",sequential_overlap(s))
print("Average sequential overlap:",average_sequential_overlap(s))
print("Major triads:",major_triads(s))
print("Major triad ratio:",major_triad_ratio(s))
print()
# NOTE: when several notes are tied for most frequent, this prints a message string instead of a matrix
print("Chord matrix transposed to C:")
print(transpose_to_most_frequent_note(s, s_matrix))

Bmin,Emin,G,Fs,Bmin,Emin,A,D,G,Emin,Cs,Fs,Bmin,Emin,A,D,G,Emin,Cs,Fs,B,Emin,A,D,G,Emin,Fs,B,Emin,A,D,G,Emin,Fs,Bmin,Emin,G,Fs,Bmin,Emin,A,D,G,Emin,Cs,Fs,B,Emin,A,D,G,Emin,Fs,B,Emin,A,D,G,Emin,Fs,B,Emin,A,D,G,Emin,Fs,B,Emin,A,D,G,Emin,Fs,Bmin,Emin,G,Fs,Bmin,Emin,G,Fs,Bmin

[[1 1 1 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 1 1 0 0 1 1 0 1 1 1 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 1 1 0 0 1 1 0 1 1 0 0 1 1 0 1 1 0 0 1 1 0 1 1 1 0 1 1 1 0 1]
 [0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0]
 [0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 1 0 0 1 0 0 1 1 0 0 0 1 0 0 1 1 0 0 0 1 0 0 1

In [29]:
def get_subsequences(my_list, subsequence_length):
    """Return all contiguous subsequences of the given length from my_list.

    The subsequences are returned in order of their starting position, each
    obtained by slicing my_list. A length equal to len(my_list) is allowed and
    yields the whole sequence as the single subsequence.

    Raises AssertionError if subsequence_length is larger than len(my_list).
    """
    assert subsequence_length <= len(my_list), "subsequence_length must not exceed the length of my_list"
    window_count = len(my_list) - subsequence_length + 1
    return [my_list[start:start + subsequence_length] for start in range(window_count)]

get_subsequences(my_list = [1,2,3,4,5,6,7,8,9,10], 
                 subsequence_length = 4)

[[1, 2, 3, 4],
 [2, 3, 4, 5],
 [3, 4, 5, 6],
 [4, 5, 6, 7],
 [5, 6, 7, 8],
 [6, 7, 8, 9],
 [7, 8, 9, 10]]

In [30]:
def n_grams(chord_string, n):
    """Return the list of all runs of n consecutive chords in chord_string.

    Each run is a slice of the transposed chord matrix, i.e. an array with one
    chord vector per row.
    """
    chord_rows = np.copy(string_to_chord_matrix(chord_string)).T  # now chord vectors are rows
    return get_subsequences(my_list = chord_rows, subsequence_length = n)

In [31]:
# basic visual tests of these metrics
# pick one random song, show its chord string and chord matrix, then list its 2-grams
my_sample = chord_data.sample(1)
np.set_printoptions(linewidth=600)
s = my_sample.iloc[0].chords
s_matrix = string_to_chord_matrix(s)
print(s)
print()
print(s_matrix)
print()
# each 2-gram is a pair of consecutive chord vectors
print("2-grams:",n_grams(s,2))

Amin,G,C,E,Amin,G,C,E,Amin,G,C,E,Amin,G,C,E,Amin,G,C,E,Amin,G,C,E,Amin,G,C,E,Amin,G,C,E,Amin,G,C,E,Amin,G,C,E,Amin,G,C,E,Amin,G,C,E,Amin,G,C,E,Amin,G,C,E,Amin,G,C,E,Amin,G,C,E,Amin

[[0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1]
 [0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0]
 [0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 