In [1]:
import pandas as pd
import numpy as np
import copy
import matplotlib.pyplot as plt
import time
import json

In [12]:
# load the chord and genre dataframe
# Note: this file is too big to store in GitHub, ~575+ Kb
# NOTE(review): this path goes up two directories ('../../data/') while the other
# data files in this notebook are read from '../data/' -- confirm which is intended.
chord_data = pd.read_csv(filepath_or_buffer = '../../data/chord_and_genre.csv')

In [None]:
# load the genre deviations dataframe for n-grams of length n
n = 2
filepath = f'../data/{n}_gram_deviations.csv'
deviation_df = pd.read_csv(filepath)

In [None]:
# setting the sample size threshold
###################################################################################################################
sample_size_threshold = 0.01 # make this bigger to exclude rarer chords from the model
###################################################################################################################
# NOTE: this cell overwrites deviation_df, so it can only be run once per load;
# on a second run the 'baseline' row is already gone and the .iloc[0] lookup fails.

# the 'total' value stored on the special 'baseline' row
baseline_total = deviation_df.loc[deviation_df['n_gram'].eq('baseline'), 'total'].iloc[0]

# drop the baseline row
deviation_df = deviation_df.loc[deviation_df['n_gram'].ne('baseline')]

# drop all rows not meeting a sample size threshold
print("Number of n-grams before dropping based on sample size threshold:", len(deviation_df))
deviation_df = deviation_df.loc[deviation_df['total'].ge(baseline_total * sample_size_threshold)]
print("Number of n-grams after dropping based on sample size threshold:", len(deviation_df))

# sort by maximum absolute log deviation ratio, so that the "good feature" chords are at the top
deviation_df = deviation_df.sort_values('max_abs_log_dev_ratio', ascending = False)

In [None]:
# read the equivalence dictionary file
# this is a dictionary of dictionaries:
#    outer keys   -> chord names (e.g. 'C','Amin')
#    outer values -> dictionaries mapping each equivalent chord to the semitone distance
#                    between the outer key and that equivalent chord
with open('../data/harmonic_equivalence_dictionary.json') as file:
    equiv_dict = json.loads(file.read())

In [None]:
def compare_chords(chord_1, chord_2):
    """Check whether two chords are harmonically equivalent.

    Looks chord_2 up among the equivalents listed for chord_1 in the
    module-level equiv_dict.

    Returns (True, num_semitones) when chord_2 is listed, where num_semitones
    is the value stored in equiv_dict[chord_1][chord_2] (the distance from
    chord_1 up to chord_2); otherwise returns (False, None).

    Raises KeyError if chord_1 is not a key of equiv_dict.
    """
    equivalents = equiv_dict[chord_1]
    if chord_2 not in equivalents:
        return (False, None)
    return (True, equivalents[chord_2])

# sanity checks: major chords are equivalent to each other, but not to a minor chord
assert compare_chords('C', 'D')[0]
assert compare_chords('C', 'E')[0]
assert not compare_chords('C', 'Amin')[0]

In [None]:
def compare_n_grams(n_gram_1, n_gram_2):
    """Check whether two n-grams are harmonically equivalent.

    Both inputs are comma-separated strings of chord names. They are
    equivalent when they have the same number of chords, every pair of
    chords at the same position is equivalent according to compare_chords,
    and all of those pairs report the same semitone distance.

    Returns (True, num_semitones), with num_semitones the shared distance
    from n_gram_1 (up) to n_gram_2, or (False, None) otherwise.
    """
    chords_1 = n_gram_1.split(',')
    chords_2 = n_gram_2.split(',')

    # n-grams of different lengths can never be equivalent
    if len(chords_1) != len(chords_2):
        return (False, None)

    # compare position by position (all pairs are evaluated up front)
    results = [compare_chords(first, second) for first, second in zip(chords_1, chords_2)]

    # every pair has to be equivalent on its own...
    if not all(result[0] for result in results):
        return (False, None)

    # ...and they all have to be shifted by the same distance
    shared_distance = results[0][1]
    if any(result[1] != shared_distance for result in results):
        return (False, None)

    return (True, shared_distance)

# sanity checks: a uniform shift is equivalent, a shift that differs on one chord is not
assert compare_n_grams('C,D,E', 'F,G,A')[0]
assert not compare_n_grams('C,D,E', 'F,G,B')[0]

In [None]:
def contains_n_gram(song, n_gram):
    """Return True if song contains an n-gram harmonically equivalent to n_gram.

    Uses compare_n_grams (and therefore the equivalence dictionary) for lookups
    rather than recomputing chord relationships every time.

    Parameters:
        song:   comma-separated string of chord names
        n_gram: comma-separated string of chord names

    Returns:
        True if some run of consecutive chords in song, of the same length as
        n_gram, is either literally n_gram or harmonically equivalent to it;
        False otherwise (including when song has fewer chords than n_gram).
    """
    # fast path: the n-gram appears literally in the song.
    # Both strings are padded with commas so the match is aligned to whole chords;
    # a bare substring test would let 'C,D' match inside 'C,Dmin' or 'Bb,C,D7'.
    if (',' + n_gram + ',') in (',' + song + ','):
        return True

    # split up the song into single chords and count the chords in the n_gram
    song_as_list = song.split(',')
    song_length = len(song_as_list)
    n = len(n_gram.split(','))

    # slide over every window of n chords; the last window starts at
    # song_length - n, hence the "+ 1" in the range bound
    for start in range(song_length - n + 1):
        song_n_gram = ','.join(song_as_list[start:start + n])
        is_same, _ = compare_n_grams(n_gram, song_n_gram)
        if is_same:
            return True

    return False

# sanity checks: a literal match, and a match that only exists up to transposition
assert contains_n_gram('A,B,C,D,E,F,G', 'C,D')
assert contains_n_gram('A,B,C,D,E,F', 'F,G')

With the above setup out of the way, now I want to make a classifier model which will output a genre prediction. The only features will be a series of binary columns of the form 'Contains a 2-gram harmonically equivalent to C,D' etc. To decide which chords to use, I'm using the top sorted chords from the deviation dataframe loaded above.

I will also make two different baselines to compare against:

    1. Predict most common, i.e. just predict 'pop' for every song.
    
    2. An alternate classifier of similar type, whose inputs are also binary columns, but of the form 'Contains a (literal, raw) C,D 2-gram'. In other words, the same kinds of features, but without considering harmonic equivalence. I'll have it use the same columns as the harmonic-equivalence-based classifier, unless I think of a better way to do it.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

Run cells from here down and change `features_to_try` to make and test a model.

In [None]:
def add_one_hot(feature_chords, csv_path = 'chord_and_genre.csv'):
    """Add one-hot containment columns for a list of feature chords to the chord and genre data.

    Reads the CSV at csv_path, and for every feature chord fc that does not yet
    have both of its columns, builds:
        'has_literal_<fc>'    -- True if the song's 'chords' string contains fc
                                 literally, aligned to whole chord names
        'has_equivalent_<fc>' -- True if contains_n_gram(song, fc) is True
    and then writes the dataframe back to the same CSV (without the index).

    Parameters:
        feature_chords: iterable of comma-separated chord-name strings
        csv_path:       CSV file to read and overwrite; it must have a 'chords'
                        column of comma-separated chord names

    Returns None. If every requested column already exists, nothing is written.
    Columns that already exist in the file are treated as a cache and are not
    recomputed unless their partner column is missing.
    """
    df = pd.read_csv(csv_path)
    existing_columns = set(df.columns)

    # a chord needs building when EITHER of its two columns is missing
    # (dict.fromkeys drops duplicate chords while keeping their order)
    new_feature_chords = [
        fc for fc in dict.fromkeys(feature_chords)
        if ('has_literal_' + fc) not in existing_columns
        or ('has_equivalent_' + fc) not in existing_columns
    ]
    num_new_features = len(new_feature_chords)

    if num_new_features == 0:
        print("No new feature columns to build.")
        return

    print("Building " + str(num_new_features) + " one-hot encoded columns for chord containment.\n")

    # pad with commas so that literal matches are aligned to whole chords;
    # a bare substring test would let 'C,D' match inside 'C,Dmin'
    padded_songs = ',' + df['chords'] + ','

    t0 = time.time()
    for index, fc in enumerate(new_feature_chords):
        padded_fc = ',' + fc + ','
        df['has_literal_' + fc] = padded_songs.apply(lambda song : padded_fc in song)
        df['has_equivalent_' + fc] = df['chords'].apply(lambda song : contains_n_gram(song, fc))

        print("Finished tabulating columns for:",fc)
        print("\tCompleted chords so far:",index+1)
        print("\tChords remaining:",num_new_features - (index+1))
        print("\tAverage time per chord so far:",np.round((time.time()-t0)/(index+1), decimals=1))
        print()

    print("Finished all tabulations.")
    df.to_csv(csv_path, index = False)

In [None]:
# extract "good" feature chords from the top rows of the deviation dataframe
features_to_try = 60
# never ask for more features than there are rows in the dataframe
features_to_try = min(features_to_try, len(deviation_df))
top_rows = deviation_df.iloc[:features_to_try]
feature_chords = top_rows['n_gram'].tolist()

# add one-hot columns to the chord and genre dataframe
add_one_hot(feature_chords)

In [None]:
# setting up the feature dataframes
model_df = pd.read_csv('chord_and_genre.csv')

# make a numerically encoded genre column
encoder = LabelEncoder()
genre_encoded = encoder.fit(model_df['genres']).transform(model_df['genres'])

# make some convenient handles: one column name per feature chord and feature type
literal_columns = [f'has_literal_{fc}' for fc in feature_chords]
equivalent_columns = [f'has_equivalent_{fc}' for fc in feature_chords]
feature_columns = [*literal_columns, *equivalent_columns]

In [None]:
# preview the first five rows of the model dataframe
display(model_df.head(n = 5))

In [None]:
# do an 80/20 train test split with a fixed seed;
# X holds both the literal and the equivalence feature columns
X_train, X_test, y_train, y_test = train_test_split(
    model_df[feature_columns],
    genre_encoded,
    test_size = 0.2,
    random_state = 145,
)

In [None]:
# fit the models and make predictions
#
# NOTE: `multi_class` is intentionally not passed. The explicit argument is deprecated
# in recent scikit-learn releases, and with the lbfgs solver the default already fits a
# multinomial model when there are more than two classes.
# NOTE(review): this assumes the genre column has more than two classes -- confirm.

# model 1: features are "contains a harmonically equivalent n-gram"
equivalence_model = LogisticRegression(solver = 'lbfgs', max_iter = 2000)
equivalence_model.fit(X_train[equivalent_columns], y_train)
y_pred_equiv = equivalence_model.predict(X_test[equivalent_columns])

# model 2: same feature chords, but only literal (raw) containment
literal_model = LogisticRegression(solver = 'lbfgs', max_iter = 2000)
literal_model.fit(X_train[literal_columns], y_train)
y_pred_lit = literal_model.predict(X_test[literal_columns])

# baseline: always predict the most frequent genre in the training set
dummy_model = DummyClassifier(strategy = 'most_frequent')
dummy_model.fit(X_train, y_train)
y_pred_dummy = dummy_model.predict(X_test)

# compute and output accuracy scores
accuracy_equiv = accuracy_score(y_test, y_pred_equiv)
print("Accuracy of equivalence model:\t", accuracy_equiv)
accuracy_lit = accuracy_score(y_test, y_pred_lit)
print("Accuracy of literal model:\t", accuracy_lit)
accuracy_dummy = accuracy_score(y_test, y_pred_dummy)
print("Accuracy of dummy model:\t", accuracy_dummy)
print()

# compute and output precision scores
# NOTE: each song gets exactly one predicted label, so micro-averaged precision is
# (correct predictions) / (all predictions) and will equal the accuracy printed above.
precision_equiv = precision_score(y_test, y_pred_equiv, average = 'micro')
print("Precision of equivalence model:\t", precision_equiv)
precision_lit = precision_score(y_test, y_pred_lit, average = 'micro')
print("Precision of literal model:\t", precision_lit)
precision_dummy = precision_score(y_test, y_pred_dummy, average = 'micro')
print("Precision of dummy model:\t", precision_dummy)

In [None]:
tree = DecisionTreeClassifier(
    max_depth = 10,
    min_samples_leaf = 5,
    random_state = 145)

rf = RandomForestClassifier(
    n_estimators = 500,
    max_depth = 10,
    min_samples_leaf = 5,
    #max_features = 2,
    bootstrap = True,
    max_samples = 500,
    random_state = 145)

et = ExtraTreesClassifier(
    n_estimators = 500,
    max_depth = 10,
    min_samples_leaf = 5,
    #max_features = 2,
    #bootstrap = True,
    #max_samples = 500,
    random_state = 145)

tree.fit(X_train[equivalent_columns], y_train)
rf.fit(X_train[equivalent_columns], y_train)
et.fit(X_train[equivalent_columns], y_train)

tree_pred = tree.predict(X_test[equivalent_columns])
rf_pred = rf.predict(X_test[equivalent_columns])
et_pred = et.predict(X_test[equivalent_columns])

accuracy_tree = accuracy_score(y_test, tree_pred)
print("Accuracy of decision tree model:\t", accuracy_tree)
accuracy_rf = accuracy_score(y_test, rf_pred)
print("Accuracy of random forest model:\t", accuracy_rf)
accuracy_et = accuracy_score(y_test, et_pred)
print("Accuracy of extra trees model:\t\t", accuracy_et)
print()