In [1]:
# (Re)load the nb_black IPython extension, which auto-formats code cells with black
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import warnings

import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    fbeta_score,
    f1_score,
    make_scorer,
)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier


%matplotlib inline

<IPython.core.display.Javascript object>

In [3]:
def print_vif(x):
    """Print the variance inflation factor (VIF) for each column of ``x``.

    ``sm.add_constant`` is applied to ``x`` first, and a VIF is computed for every
    column of the resulting frame, so the printed series covers whatever columns
    that call returns.

    :param x: features to check for multicollinearity; assumed to be a pandas.DataFrame
    :return: None -- the VIFs are only printed, as a pandas series indexed by column name
    """
    # add_constant can emit a numpy FutureWarning about .ptp; mute warnings around it
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        design = sm.add_constant(x)

    design_values = design.values
    vifs = [
        variance_inflation_factor(design_values, col_idx)
        for col_idx in range(design.shape[1])
    ]

    print("VIF results\n-------------------------------")
    print(pd.Series(vifs, index=design.columns))
    print("-------------------------------\n")

<IPython.core.display.Javascript object>

In [4]:
# Paths to the pre-extracted feature tables, relative to this notebook.
# The file names suggest 3-second and 30-second feature windows respectively.
feat_3_loc = "../data/features_3_sec.csv"
feat_30_loc = "../data/features_30_sec.csv"

<IPython.core.display.Javascript object>

In [5]:
# `long` is read from the 30-sec feature file and `short` from the 3-sec one;
# the names are only there to tell the two frames apart
long, short = map(pd.read_csv, (feat_30_loc, feat_3_loc))

<IPython.core.display.Javascript object>

In [6]:
# Log-transform the variance columns: every column whose name contains "_var" is
# replaced by a "<name>_logged" column holding its natural log. `short` is not modified.
var_cols = short.columns[short.columns.str.contains("_var")]

# Drop all raw variance columns in one call. The `columns=` keyword is required here:
# passing the axis positionally (`drop(col, 1)`) is deprecated since pandas 1.3 and
# raises a TypeError on pandas 2.x. `drop` returns a new frame, so the column
# assignments below do not touch `short`.
logged_var_df = short.drop(columns=var_cols)
for col in var_cols:
    # Appended in `var_cols` order, after the untouched columns
    logged_var_df[col + "_logged"] = np.log(short[col])

<IPython.core.display.Javascript object>

In [7]:
# First MFCC index to drop: the list below covers mfcc{m_start} .. mfcc20.
# Per the original note, coefficients this high sit too far up the frequency
# spectrum to really matter.
m_start = 12
# All "_mean" names first, followed by all "_var_logged" names
mel_freq_drops = [
    f"mfcc{idx}_{stat}"
    for stat in ("mean", "var_logged")
    for idx in range(m_start, 21)
]

<IPython.core.display.Javascript object>

In [8]:
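# # Best balance for VIF I could tune (kept for reference only).
# # NOTE: the call at the bottom uses `logged_cc_var_df`, which is not defined in this
# # notebook; the log-transformed frame built above is `logged_var_df`.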
# drop_cols = [
#     "length",
#     "filename",
#     "label",
#     #     "zero_crossing_rate_mean",
#     #     "zero_crossing_rate_var",
#     "rolloff_mean",
#     "harmony_var",
#     "rolloff_var",
#     "spectral_centroid_var",
#     "spectral_bandwidth_var",
#     "spectral_centroid_mean",
#     "spectral_bandwidth_mean",
#     #     "rms_mean",
#     #     "rms_var",
# ]
# drop_cols = drop_cols + mel_freq_drops
# print_vif(logged_cc_var_df.drop(drop_cols, 1,))

<IPython.core.display.Javascript object>

In [9]:
# Split the log-transformed frame into predictors (X) and the "label" target (y).
# "filename" and "length" are removed together with the target itself.
drop_cols = ["filename", "length", "label"]
# drop_cols = drop_cols + mel_freq_drops
# Use the `columns=` keyword: the positional axis form (`drop(drop_cols, 1)`) is
# deprecated since pandas 1.3 and raises a TypeError on pandas 2.x.
X = logged_var_df.drop(columns=drop_cols)
y = logged_var_df["label"]

# Stratified 80/20 split with a fixed seed so the partition is reproducible.
# NOTE(review): if several rows originate from the same recording (TODO confirm against
# the "filename" column), a row-level split can place pieces of one recording in both
# sets and inflate the test score; a group-aware split would avoid that.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=34, stratify=y
)

<IPython.core.display.Javascript object>

In [14]:
# Every feature column is routed through the scaler
num_cols = list(X.columns)

# Placeholders for other column groups; not used anywhere in this cell
bin_cols = []

cat_cols = []
drop_cats = []


preprocessing = ColumnTransformer(
    [
        # Scale numeric columns (not needed for all models but can't hurt)
        ("scaler", StandardScaler(), num_cols)
    ],
    remainder="passthrough",
)

n_trees = 200

# Both stochastic steps are seeded so that re-running this cell reproduces the same
# scores and best_params_: PCA's svd_solver="auto" may select the randomized solver,
# and the random forest draws bootstrap samples / feature subsets.
pipeline = Pipeline(
    [
        ("preprocessing", preprocessing),
        ("pca", PCA(n_components=8, random_state=34)),
        # Choose your model and put it here.
        # n_estimators is passed to the constructor instead of being patched onto the
        # already-built pipeline afterwards.
        ("rfc", RandomForestClassifier(n_estimators=n_trees, random_state=34)),
    ]
)


grid = {
    # Keys are "<step name>__<hyperparameter name>"
    "rfc__criterion": ["gini", "entropy"],
    "rfc__max_depth": [10, 14, 20],
    "rfc__min_samples_leaf": [10, 16, 20],
}

pipeline_cv = GridSearchCV(
    pipeline,
    grid,
    verbose=1,
    n_jobs=-1,
    cv=2,
    # scoring=make_scorer(fbeta_score, beta=1.5, average="weighted"),
)
pipeline_cv.fit(X_train, y_train)


# Score of the refit best estimator on the training set, then on the held-out test set
print(pipeline_cv.score(X_train, y_train))
print(pipeline_cv.score(X_test, y_test))

Fitting 2 folds for each of 18 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   12.1s finished


0.8193193193193193
0.6716716716716716


<IPython.core.display.Javascript object>

In [15]:
# Hyperparameter combination selected by the grid search
pipeline_cv.best_params_

{'rfc__criterion': 'entropy',
 'rfc__max_depth': 20,
 'rfc__min_samples_leaf': 10}

<IPython.core.display.Javascript object>

In [12]:
# Predict on the held-out test set with the tuned pipeline
y_pred = pipeline_cv.predict(X_test)

# Confusion matrix first, then the per-class precision / recall / F1 report
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

[[105   1  22  11   2  15  17   0   9  18]
 [  3 169   3   0   0  20   0   0   1   4]
 [ 31   5  91  17   1  18   0   3  22  11]
 [ 15   0  15  85  14   5  16  21  11  18]
 [  8   0   4  13 130   1  13   7  21   3]
 [ 10  36  16   2   0 119   4   2   3   8]
 [  3   0   2  11  11   4 151   0   3  15]
 [  0   1   7   7   9  14   0 147  11   4]
 [ 16   0  14  11  22   6   1  10 116   4]
 [ 13   1  19  23   7  25  23  10  12  66]]
              precision    recall  f1-score   support

       blues       0.51      0.53      0.52       200
   classical       0.79      0.84      0.82       200
     country       0.47      0.46      0.46       199
       disco       0.47      0.42      0.45       200
      hiphop       0.66      0.65      0.66       200
        jazz       0.52      0.59      0.56       200
       metal       0.67      0.76      0.71       200
         pop       0.73      0.73      0.73       200
      reggae       0.56      0.58      0.57       200
        rock       0.44     

<IPython.core.display.Javascript object>