In [None]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import neighbors, linear_model
from sklearn.model_selection import cross_val_predict, KFold, StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, f1_score, classification_report, make_scorer
from sklearn.preprocessing import PolynomialFeatures



# Load the song table once. The 'song name' column is set aside (it is only
# used to label printed output later) and removed from the working frame.
# The raw frame keeps its own name so `data` is derived, not overwritten.
data_raw = pd.read_csv('data.csv')
song_names = data_raw['song name']
data = data_raw.drop(columns=['song name'])


<h1>Setting up </h1>

In [None]:
# Feature matrix: four summary columns followed by the 20 MFCC columns
# (mfcc1 .. mfcc20), selected from `data` in that order.
msfcc_names = [f'mfcc{i}' for i in range(1, 21)]
base_columns = ['zero crossing rate', 'spectral bandwidth', 'spectral centroid', 'chroma']
X = data[base_columns + msfcc_names]

In [None]:
# Integer code for each genre label. Y holds one code per row of `data`;
# a genre missing from this mapping raises KeyError.
genre_dict = {
    'alternative': 0,
    'westcoastrap': 1,
    'edm': 2,
    'jazz': 3,
    'classical': 4,
}
Y = [genre_dict[genre] for genre in data['genre']]

<h2>KNN-analysis</h2>

In [None]:
# Evaluate k-nearest-neighbour classifiers for k = 1..10 using shuffled,
# stratified 5-fold cross-validation; print accuracy and confusion matrix.

# One entry per genre code, so the matrix always has the same 5x5 layout.
label_ord = list(range(5))

# `shuffle` must be passed by keyword (positional use raises TypeError in
# current scikit-learn). A fixed seed makes the run reproducible and gives
# every k exactly the same folds, so the accuracies are comparable.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

for k in range(1, 11):
    model = neighbors.KNeighborsClassifier(n_neighbors=k)
    yhat = cross_val_predict(model, X, Y, cv=cv)
    # confusion_matrix takes (y_true, y_pred): rows are the true genres,
    # columns are the predicted genres.
    conf_matrix = confusion_matrix(Y, yhat, labels=label_ord)
    print(f"{k}-nearest neighbor accuracy:{np.trace(conf_matrix)/np.sum(conf_matrix)}")
    print(conf_matrix)

<h3>KDTree</h3>

In [None]:

# For every song, print the names of all songs whose feature vectors lie
# within SEARCH_RADIUS of it, using a KD-tree built over the feature matrix.
SEARCH_RADIUS = 150  # passed to query_radius as r; same units as the raw features

arr = X.to_numpy()
tree = neighbors.KDTree(arr, leaf_size=4)

# One array of row positions per query song.
all_nn_indices = tree.query_radius(arr, r=SEARCH_RADIUS)

# Translate row positions into song names, then print one list per song.
all_nns = []
for nn_indices in all_nn_indices:
    all_nns.append([song_names[idx] for idx in nn_indices])
for nns in all_nns:
    print(nns)

<h2>Logistic Regression</h2>

This performs logistic regression on polynomial feature expansions of degree 1 to 3, each evaluated with 5-fold cross-validation.

Adding more dimensions (higher polynomial degrees) did not produce a significant improvement, but logistic regression was significantly better than K-Nearest Neighbors.

In [None]:
# Logistic regression on polynomial feature expansions of degree 1..3,
# evaluated with shuffled, stratified 5-fold cross-validation.

# One entry per genre code, so the matrix always has the same 5x5 layout.
label_ord = list(range(5))

# `shuffle` must be passed by keyword (positional use raises TypeError in
# current scikit-learn). The fixed seed makes the run reproducible and gives
# every degree the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

for pval in range(1, 4):
    model = linear_model.LogisticRegression()
    pf = PolynomialFeatures(degree=pval)
    X_poly = pf.fit_transform(X)
    yhat = cross_val_predict(model, X_poly, Y, cv=cv)

    # confusion_matrix takes (y_true, y_pred): rows are the true genres,
    # columns are the predicted genres.
    conf_matrix = confusion_matrix(Y, yhat, labels=label_ord)
    print(f"{pval}-degree poly logistic regression accuracy:{np.trace(conf_matrix)/np.sum(conf_matrix)}")
    print(conf_matrix)

<h2>Experimentation: predicted percentage of each genre for each song</h2>

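Since one of the main goals of this project is to predict the genre of a song given a database of labelled songs, we perform leave-one-out cross-validation with logistic regression: each song is held out in turn, the model is fit on all the remaining songs, and the predicted probability of each genre is reported for the held-out song.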

In [None]:
print('perc_alt', 'perc_classical', 'perc_edm', 'perc_jazz', 'perc_rap')


def abx(Z, x):
    """Return a copy of ``Z`` with the element (or row) at position ``x`` left out.

    ``Z`` is sliced around ``x`` and the two pieces are joined along axis 0.
    """
    return np.append(Z[:x], Z[x+1:], axis=0)


# Degree-1 expansion: the original features plus a constant bias column.
pf = PolynomialFeatures(degree=1)
phi = pf.fit_transform(X)

# Use an integer array for the labels: slicing a Python list at either end
# yields an empty list, which np.append would promote to float labels.
y_all = np.asarray(Y)

# Genre codes in the order the columns are printed.
# order: 'alt', 'classical', 'edm', 'jazz', 'rap'
printed_codes = (0, 4, 2, 3, 1)

score = 0
for idx in range(np.shape(phi)[0]):
    model = linear_model.LogisticRegression()
    # Leave-one-out: fit on every song except the one at `idx`.
    model.fit(abx(phi, idx), abx(y_all, idx))
    predicted_scores = model.predict_proba(phi[idx:idx+1])[0]

    # predict_proba columns follow model.classes_, which only contains the
    # genres present in this training fold. Look probabilities up by genre
    # code instead of assuming column i is genre i; an absent genre gets 0.
    proba_of = dict(zip(model.classes_.tolist(), predicted_scores))

    # normalize percentages
    total = sum(predicted_scores)
    perc_alt, perc_classical, perc_edm, perc_jazz, perc_rap = (
        "%.6f" % ((proba_of.get(code, 0.0) / total) * 100) for code in printed_codes
    )

    # Accumulate the probability assigned to the true genre of the held-out song.
    score += proba_of.get(Y[idx], 0.0)
    print(song_names[idx])
    print(perc_alt, '    ', perc_classical, '       ', perc_edm, '    ', perc_jazz, '    ', perc_rap)

# Mean probability given to the correct genre across all held-out songs.
print(f"overall score: {score/np.shape(X)[0]}")