In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pickle
import warnings

import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, GroupKFold
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    fbeta_score,
    f1_score,
    make_scorer,
    accuracy_score,
)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

<IPython.core.display.Javascript object>

In [3]:
def print_vif(x):
    """Print the variance inflation factor (VIF) of each column of ``x``.

    Helper for checking the multicollinearity assumption. ``x`` is passed
    through ``sm.add_constant`` first, and the VIF is then computed for every
    column of the resulting frame.

    :param x: input features to check; assumed to be a pandas.DataFrame
    :return: None -- the VIFs are printed as a pandas Series
    """
    # add_constant may emit a numpy FutureWarning about .ptp; keep it quiet
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        design = sm.add_constant(x)

    # Pull the ndarray out once instead of on every iteration
    design_values = design.values
    vifs = [
        variance_inflation_factor(design_values, col_idx)
        for col_idx in range(design.shape[1])
    ]
    vif_series = pd.Series(vifs, index=design.columns)

    print("VIF results\n-------------------------------")
    print(vif_series)
    print("-------------------------------\n")

<IPython.core.display.Javascript object>

In [4]:
# Load the pre-extracted feature table. Two CSVs are referenced here
# (features_30_sec.csv and features_3_sec.csv); only the 3-second file is
# active, the alternatives are kept commented out for quick switching.

# To use all
# df_long = pd.read_csv("../data/features_30_sec.csv")
# df_short = pd.read_csv("../data/features_3_sec.csv")
# df = pd.concat((df_long, df_short))

# To use just one
# df = pd.read_csv("../data/features_30_sec.csv")
df = pd.read_csv("../data/features_3_sec.csv")

# Genre is the first dot-separated token of the filename, e.g. "blues"
df["genre"] = df["filename"].str.split(".").str[0]

# "blues.00000.0.wav" -> "blues.00000"
# and
# "blues.00000.wav" -> "blues.00000"
# logic: split on period, take first 2 elements, and put back together
df["songname"] = df["filename"].str.split(".").str[:2].str.join(".")

<IPython.core.display.Javascript object>

In [5]:
# First MFCC index to drop: every coefficient from m_start up to and including
# 20 is excluded (these sit too high in the frequency spectrum to really
# matter). With m_start = 20 only mfcc20 is dropped.
m_start = 20

# Column names to drop: all the "_mean" columns first, then all the "_var" ones
mel_freq_drops = [
    f"mfcc{coef}_{stat}" for stat in ("mean", "var") for coef in range(m_start, 21)
]

<IPython.core.display.Javascript object>

In [6]:
# Columns removed from the feature matrix: identifier / target-derived columns
# (filename, label, genre, songname), "length", and a hand-picked set of
# features. Entries that are commented out are NOT dropped, i.e. those features
# stay in the model.
# NOTE(review): the selection was presumably tuned with print_vif -- confirm.
drop_cols = [
    "filename",
    "label",
    "genre",
    "songname",
    "length",
    "chroma_stft_mean",
    #     "chroma_stft_var",
    "rms_mean",
    #     "rms_var",
    "spectral_centroid_mean",
    "spectral_centroid_var",
    "spectral_bandwidth_mean",
    #     "spectral_bandwidth_var",
    "rolloff_mean",
    #     "rolloff_var",
    #     "zero_crossing_rate_mean",
    #     "zero_crossing_rate_var",
    #     "harmony_mean",
    #     "harmony_var",
    #     "perceptr_mean",
    #     "perceptr_var",
    #     "tempo",
]

# Also drop the MFCC columns selected in mel_freq_drops
drop_cols = drop_cols + mel_freq_drops
# Uncomment to inspect the VIFs of the remaining features:
# print_vif(df.drop(columns=drop_cols))

<IPython.core.display.Javascript object>

In [7]:
# Feature matrix: every column of df that is not listed in drop_cols.
# `columns=` is used instead of the positional axis (`df.drop(drop_cols, 1)`),
# which pandas deprecated in 1.3 and rejects with a TypeError from 2.0 on.
X = df.drop(columns=drop_cols)

# Target: the genre string derived from the filename
y = df["genre"]

<IPython.core.display.Javascript object>

In [8]:
# Log-transform every "*_var" feature on a copy, leaving X itself untouched
X_logged = X.copy()
for c in X_logged.columns:
    if not c.endswith("_var"):
        continue
    X_logged[c] = np.log(X_logged[c])

<IPython.core.display.Javascript object>

In [9]:
# Multicollinearity check on the selected features. Note this is run on X
# (before the log transform), not on X_logged.
print_vif(X)

VIF results
-------------------------------
const                      431.492904
chroma_stft_var              1.997918
rms_var                      3.452525
spectral_bandwidth_var       6.569263
rolloff_var                  7.973294
zero_crossing_rate_mean      7.294004
zero_crossing_rate_var       3.147415
harmony_mean                 1.481718
harmony_var                  2.365511
perceptr_mean                1.585636
perceptr_var                 3.683834
tempo                        1.012672
mfcc1_mean                   4.190136
mfcc1_var                    2.075027
mfcc2_mean                   6.190307
mfcc2_var                    3.257947
mfcc3_mean                   2.767521
mfcc3_var                    2.125682
mfcc4_mean                   2.353447
mfcc4_var                    2.189950
mfcc5_mean                   2.816167
mfcc5_var                    2.115541
mfcc6_mean                   3.441603
mfcc6_var                    2.440623
mfcc7_mean                   3.242545
mfcc7_

<IPython.core.display.Javascript object>

In [10]:
# og: "blues.00000.0.wav"
# songname: "blues.00000"
# genre: "blues"
# One row per song, so the split is made on whole songs and all rows of a
# song end up on the same side of the train/test boundary.
song_genre = df[["songname", "genre"]].drop_duplicates()

# Use the persisted song split when it is available. Previously a fresh split
# was always computed and then immediately overwritten by these pickles, and
# the file handles were never closed.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
try:
    with open("../data/train_songs.p", "rb") as fh:
        train_songs = pickle.load(fh)
    with open("../data/test_songs.p", "rb") as fh:
        test_songs = pickle.load(fh)
except FileNotFoundError:
    # No persisted split on disk: fall back to a seeded, genre-stratified
    # 80/20 split of the songs.
    train_songs, test_songs = train_test_split(
        song_genre["songname"],
        test_size=0.2,
        random_state=42,
        stratify=song_genre["genre"],
    )

# Map the song-level split back to row labels of df
train_idxs = df[df["songname"].isin(train_songs)].index
test_idxs = df[df["songname"].isin(test_songs)].index

# The log-transformed features are what the model is trained on
X_train = X_logged.loc[train_idxs, :]
X_test = X_logged.loc[test_idxs, :]
y_train = y.loc[train_idxs]
y_test = y.loc[test_idxs]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(7990, 49) (7990,)
(2000, 49) (2000,)


<IPython.core.display.Javascript object>

In [11]:
# Sanity check: no song may appear in both train and test.
# The expected result is an empty set.
set(train_songs) & set(test_songs)

set()

<IPython.core.display.Javascript object>

In [18]:
# Every column of X is treated as a numeric feature
num_cols = list(X.columns)

# Placeholders for other column types; not used by the pipeline below
bin_cols = []

cat_cols = []
drop_cats = []


preprocessing = ColumnTransformer(
    [
        # Scale numeric columns (not needed for all models but can't hurt)
        ("scaler", StandardScaler(), num_cols)
    ],
    remainder="passthrough",
)

# Fixed boosting settings; learning_rate works out to 0.02 for 100 trees
n_trees = 100
learning_rate = 2 / n_trees

pipeline = Pipeline(
    [
        ("preprocessing", preprocessing),
        # Choose your model and put it here.
        # Hyperparameters go in the constructor rather than being assigned
        # onto the estimator after the pipeline has been built.
        ("xgb", XGBClassifier(n_estimators=n_trees, learning_rate=learning_rate)),
    ]
)


# Grid searched over; names are prefixed with the pipeline step name
params = {
    "xgb__subsample": [0.5, 0.75, 1.0],
    "xgb__colsample_bytree": [0.4, 0.6, 0.8, 1.0],
    "xgb__max_depth": [6],
}

# The eval_set handed to XGBoost bypasses the pipeline's preprocessing step,
# so the test features are scaled by hand with a scaler fitted on X_train.
# No early stopping is configured here, so the eval set is only used for the
# per-round progress log.
scaler = pipeline.named_steps["preprocessing"]
scaler.fit(X_train)
X_test_scaled = scaler.transform(X_test)

# Rows of the same song must stay in the same CV fold, matching the
# song-level train/test split; otherwise segments of one song appear on both
# sides of a fold and the CV scores used to pick hyperparameters are inflated.
train_groups = df.loc[train_idxs, "songname"]

pipeline_cv = GridSearchCV(
    pipeline, params, verbose=1, n_jobs=-1, cv=GroupKFold(n_splits=2)
)

pipeline_cv.fit(
    X_train,
    y=y_train,
    groups=train_groups,
    xgb__eval_set=[(X_test_scaled, y_test)],
)


# Score of the refit best estimator on train, then on test
print(pipeline_cv.score(X_train, y_train))
print(pipeline_cv.score(X_test, y_test))
pipeline_cv.best_params_

Fitting 2 folds for each of 12 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.


[0]	validation_0-merror:0.50100
[1]	validation_0-merror:0.46250


[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.2min finished


[2]	validation_0-merror:0.43050
[3]	validation_0-merror:0.40800
[4]	validation_0-merror:0.39700
[5]	validation_0-merror:0.38800
[6]	validation_0-merror:0.37900
[7]	validation_0-merror:0.36750
[8]	validation_0-merror:0.36000
[9]	validation_0-merror:0.35650
[10]	validation_0-merror:0.35350
[11]	validation_0-merror:0.35500
[12]	validation_0-merror:0.35550
[13]	validation_0-merror:0.35700
[14]	validation_0-merror:0.35400
[15]	validation_0-merror:0.35500
[16]	validation_0-merror:0.34800
[17]	validation_0-merror:0.34800
[18]	validation_0-merror:0.34150
[19]	validation_0-merror:0.34200
[20]	validation_0-merror:0.34150
[21]	validation_0-merror:0.33950
[22]	validation_0-merror:0.34100
[23]	validation_0-merror:0.33800
[24]	validation_0-merror:0.33800
[25]	validation_0-merror:0.33650
[26]	validation_0-merror:0.33550
[27]	validation_0-merror:0.34000
[28]	validation_0-merror:0.33250
[29]	validation_0-merror:0.33150
[30]	validation_0-merror:0.33650
[31]	validation_0-merror:0.33400
[32]	validation_0-

{'xgb__colsample_bytree': 0.4, 'xgb__max_depth': 6, 'xgb__subsample': 0.75}

<IPython.core.display.Javascript object>

In [15]:
# Predict genres for the held-out rows with the grid-searched pipeline
y_pred = pipeline_cv.predict(X_test)

# Raw confusion matrix; with confusion_matrix(y_true, y_pred) rows are the
# true classes and columns the predicted classes
print(confusion_matrix(y_test, y_pred))

[[109   0  18   2   2  27  14   0  24   4]
 [  0 190   3   0   0   5   0   0   0   2]
 [ 13   3 114  16   1  18  16   0   9  10]
 [  3   0  13 129  12   0   3  29   9   2]
 [  4   0   9  25  61   0  12  47  42   0]
 [  0  36  15   2   1 136   0   5   3   2]
 [  3   0   0   4   7   4 170   0   8   4]
 [  0   0  12   7   1   8   0 163   7   2]
 [  0   1  35  12   3   0   4  11 133   1]
 [  4   2  21  42   0  22  48  22   7  32]]


<IPython.core.display.Javascript object>

In [17]:
# Row / column labels come from the fitted search's class list
index = [f"{x}" for x in pipeline_cv.classes_]
cols = [f"{x}" for x in pipeline_cv.classes_]

# labels= pins the matrix ordering to pipeline_cv.classes_, so the labels
# above are guaranteed to line up with the rows/columns (instead of relying on
# the ordering confusion_matrix infers from y_test / y_pred). Axis names make
# it explicit which side is the truth and which the prediction.
pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=pipeline_cv.classes_),
    index=index,
    columns=cols,
).rename_axis(index="true", columns="predicted")

Unnamed: 0,blues,classical,country,disco,hiphop,jazz,metal,pop,reggae,rock
blues,109,0,18,2,2,27,14,0,24,4
classical,0,190,3,0,0,5,0,0,0,2
country,13,3,114,16,1,18,16,0,9,10
disco,3,0,13,129,12,0,3,29,9,2
hiphop,4,0,9,25,61,0,12,47,42,0
jazz,0,36,15,2,1,136,0,5,3,2
metal,3,0,0,4,7,4,170,0,8,4
pop,0,0,12,7,1,8,0,163,7,2
reggae,0,1,35,12,3,0,4,11,133,1
rock,4,2,21,42,0,22,48,22,7,32


<IPython.core.display.Javascript object>