In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import warnings

import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    fbeta_score,
    f1_score,
    make_scorer,
)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier


%matplotlib inline

<IPython.core.display.Javascript object>

In [3]:
def print_vif(x):
    """Utility for checking multicollinearity assumption
    
    :param x: input features to check using VIF. This is assumed to be a pandas.DataFrame
    :return: nothing is returned the VIFs are printed as a pandas series
    """
    # Silence numpy FutureWarning about .ptp
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        x = sm.add_constant(x)

    vifs = []
    for i in range(x.shape[1]):
        vif = variance_inflation_factor(x.values, i)
        vifs.append(vif)

    print("VIF results\n-------------------------------")
    print(pd.Series(vifs, index=x.columns))
    print("-------------------------------\n")

<IPython.core.display.Javascript object>

In [4]:
feat_30_loc = "../data/features_30_sec.csv"
feat_3_loc = "../data/features_3_sec.csv"

<IPython.core.display.Javascript object>

In [5]:
# named long and short to differentiate easier
long = pd.read_csv(feat_30_loc)
short = pd.read_csv(feat_3_loc)

<IPython.core.display.Javascript object>

In [6]:
# log transform the mfcc variance
cc_var_cols = [f"mfcc{x}_var" for x in range(1, 21)]
logged_cc_var_df = short.copy()
for col in cc_var_cols:
    logged_cc_var_df[col + "_logged"] = np.log(logged_cc_var_df[col])
    logged_cc_var_df = logged_cc_var_df.drop(col, 1)

<IPython.core.display.Javascript object>

In [7]:
m_start = 12  # highest mfcc to use. higher than this is too high in the frequency spectrum to really matter
mel_freq_drops = [f"mfcc{x}_mean" for x in range(m_start, 21)] + [
    f"mfcc{x}_var_logged" for x in range(m_start, 21)
]

<IPython.core.display.Javascript object>

In [8]:
# best balance for VIF I could tune
drop_cols = [
    "length",
    "filename",
    "label",
    #     "zero_crossing_rate_mean",
    #     "zero_crossing_rate_var",
    "rolloff_mean",
    "harmony_var",
    "rolloff_var",
    "spectral_centroid_var",
    "spectral_bandwidth_var",
    "spectral_centroid_mean",
    "spectral_bandwidth_mean",
    #     "rms_mean",
    #     "rms_var",
]
drop_cols = drop_cols + mel_freq_drops
print_vif(logged_cc_var_df.drop(drop_cols, 1,))

VIF results
-------------------------------
const                      776.274686
chroma_stft_mean             3.391661
chroma_stft_var              2.546500
rms_mean                     7.666024
rms_var                      3.268666
zero_crossing_rate_mean      6.949903
zero_crossing_rate_var       2.660002
harmony_mean                 1.478593
perceptr_mean                1.573693
perceptr_var                 4.918247
tempo                        1.009578
mfcc1_mean                   7.258845
mfcc2_mean                   5.652443
mfcc3_mean                   2.547365
mfcc4_mean                   2.076686
mfcc5_mean                   2.679961
mfcc6_mean                   3.340509
mfcc7_mean                   3.014978
mfcc8_mean                   3.704902
mfcc9_mean                   2.584736
mfcc10_mean                  2.548987
mfcc11_mean                  2.098440
mfcc1_var_logged             2.949586
mfcc2_var_logged             2.583133
mfcc3_var_logged             2.590872
mfcc4_

<IPython.core.display.Javascript object>

In [9]:
X = logged_cc_var_df.drop(drop_cols, 1,)
y = logged_cc_var_df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=34, stratify=y
)

<IPython.core.display.Javascript object>

In [32]:
num_cols = list(X.columns)

bin_cols = []

cat_cols = []
drop_cats = []


preprocessing = ColumnTransformer(
    [
        # Scale numeric columns (not needed for all models but can't hurt)
        ("scaler", StandardScaler(), num_cols)
    ],
    remainder="passthrough",
)


pipeline = Pipeline(
    [
        ("preprocessing", preprocessing),
        # Choose your model and put it here
        ("rfc", RandomForestClassifier()),
    ]
)


grid = {
    # Use model__ with hyperprammeter names after
    "rfc__criterion": ["gini", "entropy"],
    "rfc__max_depth": [10, 14, 20],
    "rfc__min_samples_leaf": [10, 16, 20],
}

n_trees = 200

pipeline["rfc"].n_estimators = n_trees

pipeline_cv = GridSearchCV(
    pipeline,
    grid,
    verbose=1,
    n_jobs=-1,
    cv=2,
    # scoring=make_scorer(fbeta_score, beta=1.5, average="weighted"),
)
pipeline_cv.fit(X_train, y_train)


print(pipeline_cv.score(X_train, y_train))
print(pipeline_cv.score(X_test, y_test))

Fitting 2 folds for each of 18 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:   24.5s finished


0.9062812812812813
0.7827827827827828


<IPython.core.display.Javascript object>

In [29]:
pipeline_cv.best_params_

{'rfc__criterion': 'entropy',
 'rfc__max_depth': 20,
 'rfc__min_samples_leaf': 10}

<IPython.core.display.Javascript object>

In [30]:
y_pred = pipeline_cv.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[169   0   8   3   1   5   6   0   6   2]
 [  0 187   2   0   0  10   0   0   1   0]
 [ 15   0 144   5   0  21   0   0  10   4]
 [  0   1   4 157   4   3   3   3  10  15]
 [  5   0   3   7 152   0   3  17  13   0]
 [  5  11   5   1   0 178   0   0   0   0]
 [  3   0   2   2   1   2 177   0   5   8]
 [  0   1  15   6   6   5   1 152  10   4]
 [  8   1   7   4   9   1   1   7 160   2]
 [ 13   4  16  19   0  16   7   1  11 112]]
              precision    recall  f1-score   support

       blues       0.78      0.84      0.81       200
   classical       0.91      0.94      0.92       200
     country       0.70      0.72      0.71       199
       disco       0.77      0.79      0.78       200
      hiphop       0.88      0.76      0.82       200
        jazz       0.74      0.89      0.81       200
       metal       0.89      0.89      0.89       200
         pop       0.84      0.76      0.80       200
      reggae       0.71      0.80      0.75       200
        rock       0.76     

<IPython.core.display.Javascript object>