In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import warnings

import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    fbeta_score,
    f1_score,
    make_scorer,
)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.neighbors import KNeighborsClassifier


%matplotlib inline

<IPython.core.display.Javascript object>

In [3]:
def print_vif(x):
    """Print the variance inflation factor (VIF) of each column of ``x``.

    ``sm.add_constant`` is applied to ``x`` first, and a VIF is then computed
    for every column of the resulting frame (so the intercept column, when one
    is added, is reported as well).

    :param x: input features to check using VIF. This is assumed to be a pandas.DataFrame
    :return: nothing is returned; the VIFs are printed as a pandas Series
        indexed by column name
    """
    # add_constant can raise a numpy FutureWarning about .ptp; keep it quiet.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        x = sm.add_constant(x)

    # One VIF per column position of the design matrix.
    vifs = [variance_inflation_factor(x.values, idx) for idx in range(x.shape[1])]
    report = pd.Series(vifs, index=x.columns)

    print("VIF results\n-------------------------------")
    print(report)
    print("-------------------------------\n")

<IPython.core.display.Javascript object>

In [4]:
# Locations of the two feature tables (30-second and 3-second variants,
# going by the file names).
feat_30_loc = "../data/features_30_sec.csv"
feat_3_loc = "../data/features_3_sec.csv"
# `long` is read from the 30-second file and `short` from the 3-second file;
# the names make the two frames easier to tell apart.
long, short = (pd.read_csv(csv_path) for csv_path in (feat_30_loc, feat_3_loc))

<IPython.core.display.Javascript object>

In [5]:
# Log-transform the variance features: every column whose name contains "_var"
# gets a "<name>_logged" twin holding its natural log, and the raw column is
# then removed. `short` itself is copied first and left untouched.
# NOTE(review): np.log gives -inf for a zero and NaN for a negative input --
# confirm the variance columns contain neither.
var_cols = short.columns[short.columns.str.contains("_var")]
logged_var_df = short.copy()
for col in var_cols:
    logged_var_df[col + "_logged"] = np.log(logged_var_df[col])
# Drop all raw variance columns in one call, naming the axis by keyword:
# passing it positionally (`drop(col, 1)`) is deprecated since pandas 1.3 and
# raises a TypeError from pandas 2.0 on.
logged_var_df = logged_var_df.drop(columns=var_cols)

<IPython.core.display.Javascript object>

In [6]:
# First MFCC index that goes on the drop list: the mean and logged-variance
# columns of coefficients m_start..20 are listed below, while coefficients
# under m_start are left off the list. With m_start = 1 every MFCC column is listed.
m_start = 1
mel_freq_drops = [
    f"mfcc{idx}_{suffix}"
    for suffix in ("mean", "var_logged")
    for idx in range(m_start, 21)
]

<IPython.core.display.Javascript object>

In [15]:
# Columns excluded before the multicollinearity check -- the best balance for
# VIF that could be tuned by hand.
drop_cols = [
    "length",
    "filename",
    "label",
    # "zero_crossing_rate_mean" is intentionally NOT dropped (left commented
    # out in earlier iterations).
    "zero_crossing_rate_var_logged",
    "rolloff_mean",
    "harmony_var_logged",
    "rolloff_var_logged",
    "spectral_centroid_var_logged",
    "spectral_bandwidth_var_logged",
    "spectral_centroid_mean",
    "spectral_bandwidth_mean",
    "rms_mean",
    "rms_var_logged",
    # "mfcc1_mean" / "mfcc2_mean" were candidates here as well; they are
    # already part of mel_freq_drops whenever m_start <= 1.
    "perceptr_var_logged",
    "chroma_stft_mean",
    "chroma_stft_var_logged",
    "harmony_mean",  # This causes high training guessing
    "perceptr_mean",  # this one too
    "tempo",
]
# Add the MFCC mean / logged-variance columns selected earlier.
drop_cols = drop_cols + mel_freq_drops
# Name the axis by keyword: the positional form `drop(drop_cols, 1)` is
# deprecated since pandas 1.3 and raises a TypeError from pandas 2.0 on.
print_vif(logged_var_df.drop(columns=drop_cols))

VIF results
-------------------------------
const                      6.049541
zero_crossing_rate_mean    1.000000
dtype: float64
-------------------------------



<IPython.core.display.Javascript object>

In [16]:
# Features are whatever survives drop_cols; the target is the "label" column.
# The axis is named by keyword because the positional form `drop(drop_cols, 1)`
# is deprecated since pandas 1.3 and raises a TypeError from pandas 2.0 on.
X = logged_var_df.drop(columns=drop_cols)
y = logged_var_df["label"]

# 80/20 split, stratified on the label, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=34, stratify=y
)

<IPython.core.display.Javascript object>

# Alternative split that keeps every feature and removes only the identifier
# columns and the target.
# NOTE(review): this cell has no execution prompt in the export -- confirm
# whether it is meant to run, since it overwrites drop_cols, X, y and the splits.
drop_cols = ["filename", "length", "label"]
# drop_cols = drop_cols + mel_freq_drops
# Name the axis by keyword: the positional form `drop(drop_cols, 1)` is
# deprecated since pandas 1.3 and raises a TypeError from pandas 2.0 on.
X = logged_var_df.drop(columns=drop_cols)
y = logged_var_df["label"]

# 80/20 split, stratified on the label, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12, stratify=y
)

In [19]:
# Every column of X is treated as numeric; no binary or categorical columns
# are configured for this run.
num_cols = X.columns.tolist()
bin_cols, cat_cols, drop_cats = [], [], []

# Standardize the numeric columns (not needed for all models but can't hurt);
# anything not listed is passed through unchanged.
preprocessing = ColumnTransformer(
    transformers=[("scaler", StandardScaler(), num_cols)],
    remainder="passthrough",
)

# Preprocessing followed by the model under test. A PCA step was tried between
# the two and is currently disabled.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("knn", KNeighborsClassifier()),
    ]
)

# Hyper-parameter grid, keyed by "<step name>__<parameter>".
params = dict(
    knn__n_neighbors=[5],
    knn__weights=["distance"],
    knn__leaf_size=[30],
    knn__algorithm=["kd_tree", "ball_tree"],
)

# 5-fold cross-validated search over the grid, using all available cores.
pipeline_cv = GridSearchCV(
    estimator=pipeline, param_grid=params, cv=5, n_jobs=-1, verbose=1
)
pipeline_cv.fit(X_train, y_train)

# Score on the training split first, then on the held-out split.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(pipeline_cv.score(features, labels))
pipeline_cv.best_params_

Fitting 5 folds for each of 2 candidates, totalling 10 fits
0.9993743743743744
0.12862862862862862


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished


{'knn__algorithm': 'kd_tree',
 'knn__leaf_size': 30,
 'knn__n_neighbors': 5,
 'knn__weights': 'distance'}

<IPython.core.display.Javascript object>

In [13]:
# Candidate feature names: every column from the third one onwards, minus the
# target column "label".
all_feat_cols = logged_var_df.columns[2:].tolist()
del all_feat_cols[all_feat_cols.index("label")]
all_feat_cols

['chroma_stft_mean',
 'rms_mean',
 'spectral_centroid_mean',
 'spectral_bandwidth_mean',
 'rolloff_mean',
 'zero_crossing_rate_mean',
 'harmony_mean',
 'perceptr_mean',
 'tempo',
 'mfcc1_mean',
 'mfcc2_mean',
 'mfcc3_mean',
 'mfcc4_mean',
 'mfcc5_mean',
 'mfcc6_mean',
 'mfcc7_mean',
 'mfcc8_mean',
 'mfcc9_mean',
 'mfcc10_mean',
 'mfcc11_mean',
 'mfcc12_mean',
 'mfcc13_mean',
 'mfcc14_mean',
 'mfcc15_mean',
 'mfcc16_mean',
 'mfcc17_mean',
 'mfcc18_mean',
 'mfcc19_mean',
 'mfcc20_mean',
 'chroma_stft_var_logged',
 'rms_var_logged',
 'spectral_centroid_var_logged',
 'spectral_bandwidth_var_logged',
 'rolloff_var_logged',
 'zero_crossing_rate_var_logged',
 'harmony_var_logged',
 'perceptr_var_logged',
 'mfcc1_var_logged',
 'mfcc2_var_logged',
 'mfcc3_var_logged',
 'mfcc4_var_logged',
 'mfcc5_var_logged',
 'mfcc6_var_logged',
 'mfcc7_var_logged',
 'mfcc8_var_logged',
 'mfcc9_var_logged',
 'mfcc10_var_logged',
 'mfcc11_var_logged',
 'mfcc12_var_logged',
 'mfcc13_var_logged',
 'mfcc14_var_log

<IPython.core.display.Javascript object>

In [10]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

<IPython.core.display.Javascript object>

In [18]:
# Fit a default KNN classifier on each feature in isolation and print its
# train and test scores.
for col in all_feat_cols:
    # Single-column design matrix; the target is always "label".
    X, y = logged_var_df[[col]], logged_var_df["label"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=34
    )

    # The lone column is numeric; nothing binary or categorical is configured.
    num_cols = X.columns.tolist()
    bin_cols, cat_cols, drop_cats = [], [], []

    # Scale numeric columns (not needed for all models but can't hurt)
    preprocessing = ColumnTransformer(
        transformers=[("scaler", StandardScaler(), num_cols)],
        remainder="passthrough",
    )

    # Preprocessing followed by the model; PCA, SVC and a grid search were
    # tried here earlier and are currently disabled.
    pipeline = Pipeline(
        steps=[
            ("preprocessing", preprocessing),
            ("knn", KNeighborsClassifier()),
        ]
    )
    pipeline.fit(X_train, y_train)

    # Feature name, then the training score, then the held-out score.
    print(col)
    for features, labels in ((X_train, y_train), (X_test, y_test)):
        print(pipeline.score(features, labels))

chroma_stft_mean
0.4446946946946947
0.20770770770770772
rms_mean
0.4403153153153153
0.21171171171171171
spectral_centroid_mean
0.43355855855855857
0.19519519519519518
spectral_bandwidth_mean
0.4555805805805806
0.22322322322322322
rolloff_mean
0.4403153153153153
0.2067067067067067
zero_crossing_rate_mean
0.4025275275275275
0.16716716716716717
harmony_mean
0.3997747747747748
0.17367367367367367
perceptr_mean
0.40503003003003
0.15315315315315314
tempo
0.11211211211211211
0.09759759759759759
mfcc1_mean
0.4371871871871872
0.2047047047047047
mfcc2_mean
0.4271771771771772
0.19369369369369369
mfcc3_mean
0.4004004004004004
0.15165165165165165
mfcc4_mean
0.4152902902902903
0.17317317317317318
mfcc5_mean
0.39376876876876876
0.13813813813813813
mfcc6_mean
0.40965965965965967
0.16266266266266266
mfcc7_mean
0.3993993993993994
0.14864864864864866
mfcc8_mean
0.41178678678678676
0.15465465465465467
mfcc9_mean
0.41266266266266266
0.17367367367367367
mfcc10_mean
0.3978978978978979
0.15165165165165165
mfc

<IPython.core.display.Javascript object>

In [None]:
# Fit a classifier on each feature in isolation and print its train and test
# scores.
for col in all_feat_cols:
    # Single-column design matrix; the target is always "label".
    X = logged_var_df[[col]]
    y = logged_var_df["label"]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=34, stratify=y
    )
    num_cols = list(X.columns)

    bin_cols = []

    cat_cols = []
    drop_cats = []

    preprocessing = ColumnTransformer(
        [
            # Scale numeric columns (not needed for all models but can't hurt)
            ("scaler", StandardScaler(), num_cols)
        ],
        remainder="passthrough",
    )

    # The pipeline must end in an estimator: with every model step commented
    # out the last step was the ColumnTransformer, which has no `score`
    # method, so the `pipeline.score(...)` calls below raised AttributeError.
    # SVC is restored as the final step (the KNN variant of this loop is run
    # in the preceding cell).
    pipeline = Pipeline(
        [
            ("preprocessing", preprocessing),
            #         ("pca", PCA()),
            # Choose your model and put it here
            # ("knn", KNeighborsClassifier()),
            ("svc", SVC()),
        ]
    )

    # params = {
    #         "knn__n_neighbors": [5],
    #         "knn__weights": ["distance"],
    #         "knn__leaf_size": [30],
    #         "knn__algorithm": ["kd_tree"],
    #     }
    # pipeline_cv = GridSearchCV(pipeline, params, verbose=0, n_jobs=-1, cv=2)

    pipeline.fit(X_train, y=y_train)

    # Feature name, then the training score, then the held-out score.
    print(col)
    print(pipeline.score(X_train, y_train))
    print(pipeline.score(X_test, y_test))