# EDA Notebook

In [16]:
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from collections import defaultdict

In [14]:
analysis_df = pd.read_csv("../data/test/all/metadata/all_audio_features.csv")

# Convert string-based col to list-based col where applicable
analysis_df["genre(s)"] = analysis_df["genre(s)"].str.split('//')
analysis_df["artist_names"] = analysis_df["artist_names"].str.split('//')

In [15]:
analysis_df.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,analysis_url,duration_ms,time_signature,genre(s),artist_ids,artist_names,name
0,0.529,0.722,1,-4.815,0,0.0814,0.213,0.0,0.102,0.666,79.951,14xRAc1zbSZZzKaYgkwqdY,https://api.spotify.com/v1/audio-analysis/14xR...,212132,4,[kpop],3cjEqqelV9zb4BYE3qDQ4O,[EXO],PLAYBOY
1,0.533,0.838,0,-3.722,1,0.0319,0.103,0.0,0.394,0.451,75.013,7rLvsAO1yb7ElxPhkz60qh,https://api.spotify.com/v1/audio-analysis/7rLv...,196549,4,[kpop],0UEP2XBR9aC5NBKcAKnBIq//4ufh0WuMZh6y4Dmdnklvdl...,"[CHEN, BAEKHYUN, XIUMIN]",For You
2,0.749,0.85,0,-4.346,1,0.0468,0.0319,4.8e-05,0.0678,0.672,145.983,5EzitieoPnjyKHAq0gfRMa,https://api.spotify.com/v1/audio-analysis/5Ezi...,190423,4,[kpop],3cjEqqelV9zb4BYE3qDQ4O,[EXO],Ko Ko Bop
3,0.619,0.462,4,-9.154,1,0.0402,0.854,4e-06,0.0932,0.131,123.945,1RMUSljuiZKUNaf6xskK9n,https://api.spotify.com/v1/audio-analysis/1RMU...,193931,4,[kpop],4ufh0WuMZh6y4Dmdnklvdl,[BAEKHYUN],My Love
4,0.629,0.575,6,-5.763,0,0.0354,0.235,0.0,0.0935,0.239,136.053,5pesNiBKAx8JNwK2mQ2HEc,https://api.spotify.com/v1/audio-analysis/5pes...,218090,4,[kpop],3cjEqqelV9zb4BYE3qDQ4O,[EXO],지나갈 테니 Been Through


In [172]:
def score_preds(preds, y_test):
    scores = [dict() for _ in preds]
    elem_len = len(preds[0])
    
    for i in range(len(preds)):
        correct = 0
        wrong = 0
        for j in range(len(preds[i])):
            if preds[i, j] == y_test[i, j]:
                correct += 1
            else:
                wrong += 1
        scores[i] = {
            "correct":correct,
            "wrong":wrong,
            "total_elems":elem_len
        }
        
    return pd.DataFrame(scores)

In [173]:
def get_pred_stats(preds, y_test, classes):
    scores_df = score_preds(preds, y_test)
    category_count = scores_df["total_elems"].iloc[0]
    total_correctness = np.mean(scores_df["correct"]/category_count)
    
    preds_df = pd.DataFrame(preds, columns=binarizer.classes_)
    y_test_df = pd.DataFrame(y_test, columns=binarizer.classes_)
    equality_df = preds_df == y_test_df
    
    out = {"correct_predictions":total_correctness}
    
    for genre in equality_df:
        out["{}_accuracy".format(genre)] = np.mean(equality_df[genre])
    
    return pd.Series(out).round(3)

In [66]:
numerical_feats = ['danceability','energy','loudness','key','mode',
                   'speechiness','acousticness','instrumentalness',
                   'liveness','valence','tempo', 'duration_ms',
                   'time_signature']

In [190]:
import sklearn.neural_network
from sklearn.tree import DecisionTreeClassifier

In [197]:
binarizer = MultiLabelBinarizer()
binarized_data = binarizer.fit_transform(analysis_df["genre(s)"])
train_test_data = binarized_data

X_train, X_test, y_train, y_test = train_test_split(analysis_df[numerical_feats], binarized_data)

models = {"KNeighborsClassifier":KNeighborsClassifier(),
          "DecisionTreeClassifier":DecisionTreeClassifier(),
          "RandomForestClassifier":RandomForestClassifier(),
          "MLPClassifier":sklearn.neural_network.MLPClassifier(max_iter=5000)}

for modl_name in models:
    model = models[modl_name]
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    print(modl_name)
    display(get_pred_stats(preds, y_test, binarizer.classes_))

KNeighborsClassifier


correct_predictions    0.708
hiphop_accuracy        0.695
kpop_accuracy          0.835
pop_accuracy           0.654
rock_accuracy          0.647
dtype: float64

DecisionTreeClassifier


correct_predictions    0.787
hiphop_accuracy        0.819
kpop_accuracy          0.844
pop_accuracy           0.674
rock_accuracy          0.811
dtype: float64

RandomForestClassifier


correct_predictions    0.856
hiphop_accuracy        0.887
kpop_accuracy          0.902
pop_accuracy           0.747
rock_accuracy          0.888
dtype: float64

MLPClassifier


correct_predictions    0.647
hiphop_accuracy        0.752
kpop_accuracy          0.858
pop_accuracy           0.609
rock_accuracy          0.367
dtype: float64