In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import warnings

import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    fbeta_score,
    f1_score,
    make_scorer,
    accuracy_score,
)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

%matplotlib inline

<IPython.core.display.Javascript object>

In [3]:
def print_vif(x):
    """Utility for checking multicollinearity assumption
    
    :param x: input features to check using VIF. This is assumed to be a pandas.DataFrame
    :return: nothing is returned the VIFs are printed as a pandas series
    """
    # Silence numpy FutureWarning about .ptp
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        x = sm.add_constant(x)

    vifs = []
    for i in range(x.shape[1]):
        vif = variance_inflation_factor(x.values, i)
        vifs.append(vif)

    print("VIF results\n-------------------------------")
    print(pd.Series(vifs, index=x.columns))
    print("-------------------------------\n")

<IPython.core.display.Javascript object>

In [4]:
feat_30_loc = "../data/features_30_sec.csv"
feat_3_loc = "../data/features_3_sec.csv"
# named long and short to differentiate easier
long = pd.read_csv(feat_30_loc)
short = pd.read_csv(feat_3_loc)

long_og = long.copy()
short_og = short.copy()

<IPython.core.display.Javascript object>

In [5]:
var_cols = short.columns[short.columns.str.contains("_var")]
logged_var_long_df = long.copy()
logged_var_short_df = short.copy()
for col in var_cols:
    logged_var_long_df[col + "_logged"] = np.log(logged_var_long_df[col])
    logged_var_long_df = logged_var_long_df.drop(col, 1)
    logged_var_short_df[col + "_logged"] = np.log(logged_var_short_df[col])
    logged_var_short_df = logged_var_short_df.drop(col, 1)

<IPython.core.display.Javascript object>

In [19]:
m_start = 20  # highest mfcc to use. higher than this is too high in the frequency spectrum to really matter
mel_freq_drops = [f"mfcc{x}_mean" for x in range(m_start, 21)] + [
    f"mfcc{x}_var_logged" for x in range(m_start, 21)
]

<IPython.core.display.Javascript object>

In [33]:
# best balance for VIF I could tune
drop_cols = [
    "length",
    # "filename",
    "label",
    "zero_crossing_rate_mean",
    "zero_crossing_rate_var_logged",
    "rolloff_mean",
    "harmony_var_logged",
    "rolloff_var_logged",
    "spectral_centroid_var_logged",
    "spectral_bandwidth_var_logged",
    "spectral_centroid_mean",
    "spectral_bandwidth_mean",
    "rms_mean",
    "rms_var_logged",
    "chroma_stft_mean",
    "chroma_stft_var_logged",
    "harmony_mean",
    "perceptr_mean",
    "perceptr_var_logged",
    "tempo",
    #     "mfcc1_mean",
    #         "mfcc2_mean",
]
drop_cols = drop_cols + mel_freq_drops
print_vif(logged_var_long_df.drop(drop_cols, 1,))

TypeError: ufunc 'isfinite' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

<IPython.core.display.Javascript object>

In [34]:
long_X = logged_var_long_df.drop(drop_cols, 1)
long_y = logged_var_long_df["label"]

long_X_train, long_X_test, long_y_train, long_y_test = train_test_split(
    long_X, long_y, test_size=0.2, random_state=34, stratify=long_y
)

<IPython.core.display.Javascript object>

In [36]:
long_X_train.head()

Unnamed: 0,filename,mfcc1_mean,mfcc2_mean,mfcc3_mean,mfcc4_mean,mfcc5_mean,mfcc6_mean,mfcc7_mean,mfcc8_mean,mfcc9_mean,...,mfcc10_var_logged,mfcc11_var_logged,mfcc12_var_logged,mfcc13_var_logged,mfcc14_var_logged,mfcc15_var_logged,mfcc16_var_logged,mfcc17_var_logged,mfcc18_var_logged,mfcc19_var_logged
181,classical.00081.wav,-374.643646,153.851944,-21.576778,20.847727,7.007453,-3.944745,-2.51822,-2.812335,-4.294847,...,4.078912,3.895973,3.5716,3.885432,3.910654,3.789928,3.844711,4.002104,4.205649,4.327283
150,classical.00050.wav,-449.527832,151.519836,-12.761743,44.265835,6.236342,4.053188,-1.483168,-1.618902,-2.489429,...,2.963815,3.093204,3.235443,3.874849,4.675578,4.599849,3.662588,3.861043,3.768293,3.427149
971,rock.00071.wav,-79.773857,109.916206,-17.120001,47.194103,-11.917439,33.311535,-11.698196,23.627981,-13.662325,...,4.221295,4.157164,3.761028,3.747519,3.75906,3.342199,3.696176,3.916827,3.712169,4.174981
607,metal.00007.wav,-108.049858,65.767876,-20.838789,47.557873,0.869435,8.991188,-3.199908,20.971745,-10.341425,...,3.456743,3.474491,3.56274,3.590173,3.575676,3.520266,3.491157,3.392208,3.309134,3.372939
728,pop.00028.wav,-45.586212,74.727806,15.975804,14.799959,8.998905,8.202152,5.682154,5.249434,5.390201,...,3.946248,3.974693,3.88945,3.922349,3.823894,3.887185,3.663259,3.823706,4.25873,4.395639


<IPython.core.display.Javascript object>

In [31]:
num_cols = list(long_X_train.columns)

bin_cols = []

cat_cols = []
drop_cats = []


preprocessing = ColumnTransformer(
    [
        # Scale numeric columns (not needed for all models but can't hurt)
        ("scaler", StandardScaler(), num_cols)
    ],
    remainder="passthrough",
)


pipeline = Pipeline(
    [
        ("preprocessing", preprocessing),
        #         ("pca", PCA(n_components=8)),
        # Choose your model and put it here
        ("svc", SVC()),
    ]
)


params = {
    # Use model__ with hyperprammeter names after
    "svc__C": [10, 100, 1000],
    "svc__kernel": ["linear", "rbf", "poly"],
    "svc__decision_function_shape": ["ovo", "ovr"],
    "svc__degree": [2, 3, 5],
}


pipeline_cv = GridSearchCV(pipeline, params, verbose=1, n_jobs=-1, cv=2)

pipeline_cv.fit(long_X_train, y=long_y_train)


print(pipeline_cv.score(long_X_train, long_y_train))
print(pipeline_cv.score(long_X_test, long_y_test))
pipeline_cv.best_params_

Fitting 2 folds for each of 54 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.


0.99875
0.705


[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:    0.7s finished


{'svc__C': 10,
 'svc__decision_function_shape': 'ovo',
 'svc__degree': 2,
 'svc__kernel': 'rbf'}

<IPython.core.display.Javascript object>