# EDA Notebook

In [1]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import sklearn.neural_network
from sklearn.tree import DecisionTreeClassifier
from collections import defaultdict

In [2]:
analysis_df = pd.read_csv("../data/test/all/metadata/all_audio_features.csv")

# Multi-valued fields are stored in the CSV as single '//'-joined strings;
# replace each of them with a column of Python lists.
analysis_df = analysis_df.assign(**{
    column: analysis_df[column].str.split('//')
    for column in ("genre(s)", "artist_names")
})

In [3]:
# Preview the first rows to check that the '//'-separated columns were parsed into lists.
analysis_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,analysis_url,duration_ms,time_signature,genre(s),artist_ids,artist_names,name
0,0.529,0.722,1,-4.815,0,0.0814,0.213,0.0,0.102,0.666,79.951,14xRAc1zbSZZzKaYgkwqdY,https://api.spotify.com/v1/audio-analysis/14xR...,212132,4,[kpop],3cjEqqelV9zb4BYE3qDQ4O,[EXO],PLAYBOY
1,0.533,0.838,0,-3.722,1,0.0319,0.103,0.0,0.394,0.451,75.013,7rLvsAO1yb7ElxPhkz60qh,https://api.spotify.com/v1/audio-analysis/7rLv...,196549,4,[kpop],0UEP2XBR9aC5NBKcAKnBIq//4ufh0WuMZh6y4Dmdnklvdl...,"[CHEN, BAEKHYUN, XIUMIN]",For You
2,0.749,0.85,0,-4.346,1,0.0468,0.0319,4.8e-05,0.0678,0.672,145.983,5EzitieoPnjyKHAq0gfRMa,https://api.spotify.com/v1/audio-analysis/5Ezi...,190423,4,[kpop],3cjEqqelV9zb4BYE3qDQ4O,[EXO],Ko Ko Bop
3,0.619,0.462,4,-9.154,1,0.0402,0.854,4e-06,0.0932,0.131,123.945,1RMUSljuiZKUNaf6xskK9n,https://api.spotify.com/v1/audio-analysis/1RMU...,193931,4,[kpop],4ufh0WuMZh6y4Dmdnklvdl,[BAEKHYUN],My Love
4,0.629,0.575,6,-5.763,0,0.0354,0.235,0.0,0.0935,0.239,136.053,5pesNiBKAx8JNwK2mQ2HEc,https://api.spotify.com/v1/audio-analysis/5pes...,218090,4,[kpop],3cjEqqelV9zb4BYE3qDQ4O,[EXO],지나갈 테니 Been Through


In [4]:
def score_preds(preds, y_test):
    """Count, per sample, how many label slots were predicted correctly.

    Parameters
    ----------
    preds : array-like of shape (n_samples, n_labels)
        Predicted label-indicator matrix.
    y_test : array-like of shape (n_samples, n_labels)
        Ground-truth label-indicator matrix; must have the same shape as
        ``preds``.

    Returns
    -------
    pandas.DataFrame
        One row per sample, with integer columns ``correct`` (matching
        slots), ``wrong`` (mismatching slots) and ``total_elems`` (number of
        label slots, identical on every row). Zero samples yield an empty
        frame that still carries the three columns.

    Raises
    ------
    ValueError
        If ``preds`` and ``y_test`` differ in shape, or are not 2-D.
    """
    preds = np.asarray(preds)
    y_test = np.asarray(y_test)

    # Comparing differently shaped inputs would either broadcast or ignore
    # part of the ground truth, so reject it up front.
    if preds.shape != y_test.shape:
        raise ValueError(
            "preds and y_test must have the same shape, got {} and {}".format(
                preds.shape, y_test.shape
            )
        )

    # An empty sequence such as [] arrives as shape (0,); treat it as
    # "zero samples" rather than failing on it.
    if preds.ndim == 1 and preds.size == 0:
        preds = preds.reshape(0, 0)
        y_test = y_test.reshape(0, 0)

    if preds.ndim != 2:
        raise ValueError(
            "preds and y_test must be 2-D (n_samples, n_labels), got {} dimension(s)".format(
                preds.ndim
            )
        )

    n_samples, elem_len = preds.shape

    # One vectorised comparison replaces the per-element Python loop.
    correct = (preds == y_test).sum(axis=1).astype(np.int64)

    return pd.DataFrame({
        "correct": correct,
        "wrong": elem_len - correct,
        "total_elems": np.full(n_samples, elem_len, dtype=np.int64),
    })

In [41]:
def get_pred_stats(preds, y_test, classes):
    """Summarise multi-label prediction accuracy overall and per class.

    Parameters
    ----------
    preds : array-like of shape (n_samples, n_labels)
        Predicted label-indicator matrix.
    y_test : array-like of shape (n_samples, n_labels)
        Ground-truth label-indicator matrix.
    classes : sequence
        Column labels, one per label slot; used to name the per-class entries.

    Returns
    -------
    pandas.Series
        Values rounded to 3 decimals and sorted in descending order. The
        ``correct_predictions`` entry is the fraction of all label slots that
        match (equal to the mean per-sample fraction of matching slots); each
        ``<class>_accuracy`` entry is the fraction of samples whose slot for
        that class matches. With zero samples every entry is NaN.
    """
    preds_df = pd.DataFrame(preds, columns=classes)
    y_test_df = pd.DataFrame(y_test, columns=classes)
    equality_df = preds_df == y_test_df

    # Every sample has the same number of label slots, so the mean of the
    # per-sample accuracies is simply the mean of the whole equality matrix.
    hits = equality_df.to_numpy()
    total_correctness = hits.mean() if hits.size else float("nan")

    out = {"correct_predictions": total_correctness}

    for genre in equality_df:
        out["{}_accuracy".format(genre)] = equality_df[genre].mean()

    return pd.Series(out).round(3).sort_values(ascending=False)

In [6]:
# Columns of analysis_df used as model inputs, in the order they are passed
# to the classifiers.
numerical_feats = [
    'danceability',
    'energy',
    'loudness',
    'key',
    'mode',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
    'duration_ms',
    'time_signature',
]

In [42]:
# Fixed seed so the split and the stochastic estimators produce the same
# numbers on every Restart & Run All.
SEED = 1

# Turn each track's genre list into one 0/1 indicator column per genre.
binarizer = MultiLabelBinarizer()
binarized_data = binarizer.fit_transform(analysis_df["genre(s)"])

# No test_size is given, so scikit-learn's default hold-out fraction applies.
X_train, X_test, y_train, y_test = train_test_split(
    analysis_df[numerical_feats], binarized_data, random_state=SEED
)

# KNeighborsClassifier takes no seed; the other three are randomised and are
# seeded explicitly.
models = {"KNeighborsClassifier":KNeighborsClassifier(),
          "DecisionTreeClassifier":DecisionTreeClassifier(random_state=SEED),
          "RandomForestClassifier":RandomForestClassifier(random_state=SEED),
          "MLPClassifier":sklearn.neural_network.MLPClassifier(max_iter=5000, random_state=SEED)}

# Fit every baseline on the same split and show its accuracy summary.
for modl_name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(modl_name)
    display(get_pred_stats(preds, y_test, binarizer.classes_))

KNeighborsClassifier


reggae_accuracy        0.950
rnb_accuracy           0.945
kpop_accuracy          0.933
jazz_accuracy          0.892
correct_predictions    0.886
country_accuracy       0.883
hiphop_accuracy        0.880
classical_accuracy     0.878
pop_accuracy           0.860
rock_accuracy          0.848
edm_dance_accuracy     0.788
dtype: float64

DecisionTreeClassifier


classical_accuracy     0.953
reggae_accuracy        0.928
kpop_accuracy          0.916
rnb_accuracy           0.910
jazz_accuracy          0.895
hiphop_accuracy        0.893
correct_predictions    0.893
country_accuracy       0.876
rock_accuracy          0.866
edm_dance_accuracy     0.864
pop_accuracy           0.829
dtype: float64

RandomForestClassifier


classical_accuracy     0.972
reggae_accuracy        0.953
rnb_accuracy           0.948
kpop_accuracy          0.947
correct_predictions    0.929
jazz_accuracy          0.926
hiphop_accuracy        0.924
edm_dance_accuracy     0.916
country_accuracy       0.913
rock_accuracy          0.910
pop_accuracy           0.880
dtype: float64

MLPClassifier


reggae_accuracy        0.950
rnb_accuracy           0.943
classical_accuracy     0.943
kpop_accuracy          0.938
correct_predictions    0.903
jazz_accuracy          0.900
hiphop_accuracy        0.891
country_accuracy       0.884
pop_accuracy           0.876
rock_accuracy          0.867
edm_dance_accuracy     0.838
dtype: float64

In [44]:
# Fixed seed shared by the shuffle and the stochastic estimators so the
# numbers below are reproducible on every Restart & Run All.
SEED = 1
train_proportion = 0.6

# Shuffle once, then cut at a single index so the train and test rows are
# guaranteed to be disjoint and to cover the whole frame.
shuffled_data = analysis_df.sample(frac=1, random_state=SEED)
split_at = int(shuffled_data.shape[0] * train_proportion)
train_set = shuffled_data.iloc[:split_at]
test_set = shuffled_data.iloc[split_at:]
X_train = train_set[numerical_feats]
X_test = test_set[numerical_feats]

# Fit the binarizer on all rows so both splits share the same class columns.
binarizer = MultiLabelBinarizer().fit(shuffled_data["genre(s)"])
y_train = binarizer.transform(train_set["genre(s)"])
y_test = binarizer.transform(test_set["genre(s)"])

# Test rows tagged "kpop", with their features and targets.
# NOTE(review): not used by the loop below; presumably consumed by a later
# cell - confirm before removing.
kpop_test_set = test_set[["kpop" in genres for genres in test_set["genre(s)"]]]
kpop_X = kpop_test_set[numerical_feats]
kpop_y = binarizer.transform(kpop_test_set["genre(s)"])

# KNeighborsClassifier takes no seed; the other three are randomised and are
# seeded explicitly.
models = {"KNeighborsClassifier":KNeighborsClassifier(),
          "DecisionTreeClassifier":DecisionTreeClassifier(random_state=SEED),
          "RandomForestClassifier":RandomForestClassifier(random_state=SEED),
          "MLPClassifier":sklearn.neural_network.MLPClassifier(max_iter=5000, random_state=SEED)}

# Fit every baseline on the same split and show its accuracy summary.
for modl_name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(modl_name)
    display(get_pred_stats(preds, y_test, binarizer.classes_))

KNeighborsClassifier


reggae_accuracy        0.954
rnb_accuracy           0.945
kpop_accuracy          0.936
jazz_accuracy          0.896
correct_predictions    0.885
country_accuracy       0.880
hiphop_accuracy        0.875
classical_accuracy     0.875
pop_accuracy           0.858
rock_accuracy          0.847
edm_dance_accuracy     0.785
dtype: float64

DecisionTreeClassifier


classical_accuracy     0.951
reggae_accuracy        0.932
kpop_accuracy          0.921
rnb_accuracy           0.913
jazz_accuracy          0.897
correct_predictions    0.894
hiphop_accuracy        0.889
country_accuracy       0.877
rock_accuracy          0.872
edm_dance_accuracy     0.869
pop_accuracy           0.818
dtype: float64

RandomForestClassifier


classical_accuracy     0.972
reggae_accuracy        0.956
rnb_accuracy           0.947
kpop_accuracy          0.947
correct_predictions    0.930
jazz_accuracy          0.929
hiphop_accuracy        0.927
edm_dance_accuracy     0.918
rock_accuracy          0.915
country_accuracy       0.911
pop_accuracy           0.878
dtype: float64

MLPClassifier


reggae_accuracy        0.954
rnb_accuracy           0.946
kpop_accuracy          0.937
jazz_accuracy          0.904
classical_accuracy     0.899
correct_predictions    0.891
hiphop_accuracy        0.876
rock_accuracy          0.863
country_accuracy       0.862
pop_accuracy           0.852
edm_dance_accuracy     0.819
dtype: float64