In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import warnings

import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    fbeta_score,
    f1_score,
    make_scorer,
)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.decomposition import PCA


%matplotlib inline

<IPython.core.display.Javascript object>

In [3]:
def print_vif(x):
    """Print the variance inflation factor (VIF) of each column of ``x``.

    ``sm.add_constant`` is applied to ``x`` first, and one VIF is computed for
    every column of the resulting frame.

    :param x: input features to check using VIF. This is assumed to be a pandas.DataFrame
    :return: None; the VIFs are printed as a pandas Series indexed by column name
    """
    # add_constant can trigger a numpy FutureWarning about .ptp; keep it quiet
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        design = sm.add_constant(x)

    design_values = design.values
    vif_by_column = pd.Series(
        [
            variance_inflation_factor(design_values, col_idx)
            for col_idx in range(design.shape[1])
        ],
        index=design.columns,
    )

    print("VIF results\n-------------------------------")
    print(vif_by_column)
    print("-------------------------------\n")

<IPython.core.display.Javascript object>

In [4]:
# Relative paths of the two feature tables read by this notebook:
# the "30_sec" file and the "3_sec" file.
feat_30_loc, feat_3_loc = (
    "../data/features_30_sec.csv",
    "../data/features_3_sec.csv",
)

<IPython.core.display.Javascript object>

In [5]:
# "long" is the table read from the 30-second file and "short" the one read
# from the 3-second file; the names make the two easier to tell apart.
long, short = (pd.read_csv(csv_path) for csv_path in (feat_30_loc, feat_3_loc))

<IPython.core.display.Javascript object>

In [6]:
# Log-transform every variance feature (any column whose name contains "_var").
# Each such column is replaced by a "<name>_logged" column; the logged columns
# are appended after the untouched columns, in their original relative order.
# `drop(columns=...)` is used because passing the axis positionally
# (`drop(col, 1)`) is rejected by pandas >= 2.0, and doing the drop/assign once
# avoids re-copying the whole frame for every variance column.
var_cols = short.columns[short.columns.str.contains("_var")]
logged_var_df = short.drop(columns=var_cols).assign(
    **{f"{col}_logged": np.log(short[col]) for col in var_cols}
)

<IPython.core.display.Javascript object>

In [16]:
# First MFCC index to drop: coefficients m_start..20 are excluded (mean and
# logged variance alike). Original rationale: anything this high is too high in
# the frequency spectrum to really matter.
m_start = 10
mel_freq_drops = [
    f"mfcc{coef}_{stat}"
    for stat in ("mean", "var_logged")
    for coef in range(m_start, 21)
]

<IPython.core.display.Javascript object>

In [18]:
# Best VIF balance found by manual tuning: drop the columns listed below plus
# the high-order MFCC columns in mel_freq_drops, then print the VIFs of what
# remains.
# Candidates that were commented out of this list in the original (i.e. kept
# as features): zero_crossing_rate_mean, zero_crossing_rate_var_logged,
# rms_var_logged, mfcc2_mean, perceptr_var_logged.
drop_cols = [
    "length",
    "filename",
    "label",
    "rolloff_mean",
    "harmony_var_logged",
    "rolloff_var_logged",
    "spectral_centroid_var_logged",
    "spectral_bandwidth_var_logged",
    "spectral_centroid_mean",
    "spectral_bandwidth_mean",
    "rms_mean",
    "mfcc1_mean",
    "chroma_stft_mean",
    "chroma_stft_var_logged",
    "harmony_mean",  # original note: this causes high training guessing
    "perceptr_mean",  # original note: "this one" (presumably the same issue; confirm)
    "tempo",
]
drop_cols = drop_cols + mel_freq_drops
# columns= keyword: a positional axis argument is rejected by pandas >= 2.0
print_vif(logged_var_df.drop(columns=drop_cols))

VIF results
-------------------------------
const                            826.190269
zero_crossing_rate_mean            5.025561
mfcc2_mean                         5.399041
mfcc3_mean                         2.220546
mfcc4_mean                         1.978505
mfcc5_mean                         2.504531
mfcc6_mean                         3.073838
mfcc7_mean                         2.834979
mfcc8_mean                         3.033714
mfcc9_mean                         2.325726
rms_var_logged                     5.503822
zero_crossing_rate_var_logged      4.491712
perceptr_var_logged                5.541972
mfcc1_var_logged                   3.008138
mfcc2_var_logged                   2.786420
mfcc3_var_logged                   2.562920
mfcc4_var_logged                   2.858780
mfcc5_var_logged                   2.869145
mfcc6_var_logged                   2.976339
mfcc7_var_logged                   2.761200
mfcc8_var_logged                   2.455166
mfcc9_var_logged                

<IPython.core.display.Javascript object>

In [19]:
# Build the modelling frame: remove the identifier/target columns and the
# high-order MFCC columns, then hold out 20% of the rows for testing.
# NOTE(review): the longer VIF-tuned drop list from the previous cell is not
# applied here; only these three columns plus mel_freq_drops are removed.
# Confirm that is intended.
drop_cols = ["filename", "length", "label"]
drop_cols = drop_cols + mel_freq_drops
# columns= keyword: a positional axis argument is rejected by pandas >= 2.0
X = logged_var_df.drop(columns=drop_cols)
y = logged_var_df["label"]

# Stratify on the label so both splits keep the same class proportions;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=34, stratify=y
)

<IPython.core.display.Javascript object>

In [20]:
# Every column of X is treated as numeric and sent through the scaler.
num_cols = X.columns.tolist()

# Empty placeholders: no binary or categorical columns are configured here.
bin_cols, cat_cols, drop_cats = [], [], []

# Scale numeric columns (not needed for all models but can't hurt);
# any column not listed in num_cols is passed through unchanged.
preprocessing = ColumnTransformer(
    transformers=[("scaler", StandardScaler(), num_cols)],
    remainder="passthrough",
)

# Preprocessing followed by the model. A PCA(n_components=10) step between the
# two was tried and is currently disabled.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("svc", SVC()),
    ]
)

# Grid keys are "<step name>__<hyperparameter name>"; the step here is "svc".
# A "svc__degree" grid of [2, 3, 5] was tried and is currently disabled.
grid = dict(
    svc__C=[10, 100],
    svc__kernel=["linear"],
    svc__decision_function_shape=["ovo", "ovr"],
)

pipeline_cv = GridSearchCV(
    estimator=pipeline,
    param_grid=grid,
    cv=2,
    n_jobs=-1,
    verbose=1,
)
pipeline_cv.fit(X_train, y_train)

# Training score first, then held-out test score.
print(
    pipeline_cv.score(X_train, y_train),
    pipeline_cv.score(X_test, y_test),
    sep="\n",
)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:   34.6s remaining:   11.5s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:   39.6s finished


0.7611361361361362
0.7167167167167167


<IPython.core.display.Javascript object>

In [21]:
# Display the hyperparameter combination selected by the fitted grid search.
pipeline_cv.best_params_

{'svc__C': 100, 'svc__decision_function_shape': 'ovo', 'svc__kernel': 'linear'}

<IPython.core.display.Javascript object>

In [12]:
# Predict the held-out set with the fitted grid search, then print the
# confusion matrix followed by the per-class classification report.
y_pred = pipeline_cv.predict(X_test)

print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

[[173   0   9   3   1   5   2   0   3   4]
 [  0 189   0   0   0   9   0   1   1   0]
 [ 20   1 148   3   0   8   1   4   6   8]
 [  2   0   6 165   4   1   0   1   9  12]
 [  2   1   1   9 173   0   2   5   4   3]
 [  6   8   7   0   0 173   0   3   0   3]
 [  2   0   4   1   8   4 172   0   2   7]
 [  0   0  11   5   5   6   1 171   1   0]
 [  9   0   8   8   6   2   2   5 158   2]
 [ 13   1  20   9   4   3   8   3  12 126]]
              precision    recall  f1-score   support

       blues       0.76      0.86      0.81       200
   classical       0.94      0.94      0.94       200
     country       0.69      0.74      0.72       199
       disco       0.81      0.82      0.82       200
      hiphop       0.86      0.86      0.86       200
        jazz       0.82      0.86      0.84       200
       metal       0.91      0.86      0.89       200
         pop       0.89      0.85      0.87       200
      reggae       0.81      0.79      0.80       200
        rock       0.76     

<IPython.core.display.Javascript object>