In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import warnings

import pandas as pd
import numpy as np
import pickle

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    fbeta_score,
    f1_score,
    make_scorer,
    accuracy_score,
    precision_score,
)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression, Lasso, RidgeClassifier

<IPython.core.display.Javascript object>

In [3]:
def print_vif(x):
    """Print the variance inflation factor (VIF) of every column in ``x``.

    Utility for checking the multicollinearity assumption. ``sm.add_constant``
    is applied to ``x`` first, so the printed series normally also contains a
    ``const`` entry alongside the input columns.

    :param x: input features to check using VIF. This is assumed to be a
        pandas.DataFrame
    :return: None; the VIFs are printed as a pandas Series indexed by column name
    """
    # Silence only the numpy FutureWarning about .ptp that this call can
    # trigger; warnings of any other category are still shown.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)
        x = sm.add_constant(x)

    # Convert to an array once instead of on every loop iteration.
    design = x.values
    vifs = [variance_inflation_factor(design, i) for i in range(design.shape[1])]

    print("VIF results\n-------------------------------")
    print(pd.Series(vifs, index=x.columns))
    print("-------------------------------\n")

<IPython.core.display.Javascript object>

In [4]:
# Alternative inputs (swap in for the read_csv call below):
#
#   both tables stacked
#     df_long = pd.read_csv("../data/features_30_sec.csv")
#     df_short = pd.read_csv("../data/features_3_sec.csv")
#     df = pd.concat((df_long, df_short))
#
#   30-second table only
#     df = pd.read_csv("../data/features_30_sec.csv")
#
# Filenames are period-separated, e.g. "blues.00000.0.wav" or "blues.00000.wav":
#   genre    = first piece                              -> "blues"
#   songname = first two pieces, put back together      -> "blues.00000"
df = pd.read_csv("../data/features_3_sec.csv").assign(
    genre=lambda frame: frame["filename"].str.split(".").str.get(0),
    songname=lambda frame: (
        frame["filename"].str.split(".").str.slice(stop=2).str.join(".")
    ),
)

<IPython.core.display.Javascript object>

In [5]:
# Load the 30-second and the 3-second feature tables side by side.
df_long, df_short = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/features_30_sec.csv", "../data/features_3_sec.csv")
)

<IPython.core.display.Javascript object>

In [None]:
# Display the working frame `df` (the loaded CSV plus the derived
# "genre" and "songname" columns).
df

In [12]:
# First row of the 30-second table, restricted to a few columns.
df_long.head(1).loc[:, ["filename", "rolloff_mean", "mfcc1_mean"]]

Unnamed: 0,filename,rolloff_mean,mfcc1_mean
0,blues.00000.wav,3805.839606,-113.570648


<IPython.core.display.Javascript object>

In [13]:
# First ten rows of the 3-second table, restricted to the same few columns.
df_short.head(10).loc[:, ["filename", "rolloff_mean", "mfcc1_mean"]]

Unnamed: 0,filename,rolloff_mean,mfcc1_mean
0,blues.00000.0.wav,3714.560359,-118.627914
1,blues.00000.1.wav,3869.682242,-125.590706
2,blues.00000.2.wav,3997.63916,-132.44194
3,blues.00000.3.wav,3568.300218,-118.231087
4,blues.00000.4.wav,3469.992864,-105.968376
5,blues.00000.5.wav,4371.985614,-100.752792
6,blues.00000.6.wav,4325.026668,-101.773033
7,blues.00000.7.wav,3625.280386,-109.165077
8,blues.00000.8.wav,3586.934721,-113.373199
9,blues.00000.9.wav,3505.522649,-125.532906


<IPython.core.display.Javascript object>

In [15]:
# Exclude "filename", "length" and "label" from the VIF check.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and removed in 2.0.
print_vif(df_short.drop(columns=["filename", "length", "label"]))

VIF results
-------------------------------
const                      2703.515224
chroma_stft_mean              4.118726
chroma_stft_var               2.526067
rms_mean                     51.227912
rms_var                       3.840067
spectral_centroid_mean      185.494010
spectral_centroid_var        20.730610
spectral_bandwidth_mean      77.964079
spectral_bandwidth_var        6.843537
rolloff_mean                118.856952
rolloff_var                  15.139926
zero_crossing_rate_mean      26.767622
zero_crossing_rate_var        8.054137
harmony_mean                  1.487883
harmony_var                  15.550854
perceptr_mean                 1.595504
perceptr_var                  9.233448
tempo                         1.014670
mfcc1_mean                   19.701721
mfcc1_var                     2.138339
mfcc2_mean                   20.273645
mfcc2_var                     4.009512
mfcc3_mean                    5.610340
mfcc3_var                     2.204361
mfcc4_mean          

<IPython.core.display.Javascript object>