In [1]:
# (Re)load the nb_black extension, which auto-formats each executed code cell with black
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pickle
import warnings

import pandas as pd
import numpy as np

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, GroupKFold
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    fbeta_score,
    f1_score,
    make_scorer,
    accuracy_score,
)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

<IPython.core.display.Javascript object>

In [3]:
def print_vif(x):
    """Print the variance inflation factor (VIF) of every column of ``x``.

    An intercept column is added with ``sm.add_constant`` before the VIFs are
    computed, so the printed series also contains an entry for the constant.

    :param x: features to check for multicollinearity; assumed to be a pandas.DataFrame
    :return: None -- the VIFs are printed as a pandas Series indexed by column name
    """
    # add_constant can emit a numpy FutureWarning about .ptp; keep the output clean
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        design = sm.add_constant(x)

    design_values = design.values
    vif_by_column = pd.Series(
        [
            variance_inflation_factor(design_values, col_idx)
            for col_idx in range(design.shape[1])
        ],
        index=design.columns,
    )

    print("VIF results\n-------------------------------")
    print(vif_by_column)
    print("-------------------------------\n")

<IPython.core.display.Javascript object>

In [4]:
# Feature table used for modelling. To switch data sets, read
# "../data/features_30_sec.csv" instead, or pd.concat both tables to use all rows.
df = pd.read_csv("../data/features_3_sec.csv")

# File names look like "blues.00000.0.wav" or "blues.00000.wav"; split them on "." once
filename_parts = df["filename"].str.split(".")

# First token is the genre: "blues.00000.0.wav" -> "blues"
df["genre"] = filename_parts.str[0]

# First two tokens, put back together, identify the song:
# "blues.00000.0.wav" -> "blues.00000"
# "blues.00000.wav"   -> "blues.00000"
df["songname"] = filename_parts.str[:2].str.join(".")

<IPython.core.display.Javascript object>

In [5]:
# First MFCC index to drop: coefficients m_start..20 are considered too high in the
# frequency spectrum to really matter, so only the ones below m_start are kept.
m_start = 10
# "<mfcc>_mean" names for every dropped coefficient first, then the "<mfcc>_var" names
mel_freq_drops = [
    f"mfcc{idx}_{stat}" for stat in ("mean", "var") for idx in range(m_start, 21)
]

<IPython.core.display.Javascript object>

In [6]:
# Columns removed before modelling. Deliberately NOT listed (i.e. kept as features):
# zero_crossing_rate_mean, zero_crossing_rate_var, harmony_mean, harmony_var,
# perceptr_mean and tempo.
drop_cols = [
    # identifier / target-like columns
    "filename",
    "label",
    "genre",
    "songname",
    "length",
    # audio features excluded from this experiment
    "chroma_stft_mean",
    "chroma_stft_var",
    "rms_mean",
    "rms_var",
    "spectral_centroid_mean",
    "spectral_centroid_var",
    "spectral_bandwidth_mean",
    "spectral_bandwidth_var",
    "rolloff_mean",
    "rolloff_var",
    "perceptr_var",
    # the MFCC mean/var columns collected in mel_freq_drops
    *mel_freq_drops,
]

<IPython.core.display.Javascript object>

In [7]:
# Feature matrix: every column of df that is not listed in drop_cols.
# The `columns=` keyword is used because passing the axis positionally
# (df.drop(drop_cols, 1)) is deprecated and rejected by pandas >= 2.0.
X = df.drop(columns=drop_cols)
# Target: the genre column of df.
y = df["genre"]

<IPython.core.display.Javascript object>

In [8]:
# Natural-log transform of every "*_var" feature; assign() returns a new frame,
# so X itself is left untouched and the column order is preserved.
var_cols = [col for col in X.columns if col.endswith("_var")]
X_logged = X.assign(**{col: np.log(X[col]) for col in var_cols})

<IPython.core.display.Javascript object>

In [9]:
# Multicollinearity check on the retained features. Note that this runs on the raw X,
# not on the log-transformed X_logged that the train/test matrices are taken from.
print_vif(X)

VIF results
-------------------------------
const                      179.520374
zero_crossing_rate_mean      5.515683
zero_crossing_rate_var       2.654780
harmony_mean                 1.463564
harmony_var                  1.834723
perceptr_mean                1.558186
tempo                        1.007095
mfcc1_mean                   3.120024
mfcc1_var                    1.952657
mfcc2_mean                   5.215105
mfcc2_var                    2.124468
mfcc3_mean                   2.420531
mfcc3_var                    1.988724
mfcc4_mean                   1.992960
mfcc4_var                    2.148103
mfcc5_mean                   2.509799
mfcc5_var                    2.061580
mfcc6_mean                   3.093166
mfcc6_var                    2.277700
mfcc7_mean                   2.834846
mfcc7_var                    2.087279
mfcc8_mean                   2.977315
mfcc8_var                    2.045695
mfcc9_mean                   2.358157
mfcc9_var                    1.860646
dtype:

<IPython.core.display.Javascript object>

In [10]:
# og: "blues.00000.0.wav"
# songname: "blues.00000"
# genre: "blues"
# One row per song: the split is made at song level so that rows sharing a
# songname never end up on both sides of the train/test boundary.
song_genre = df[["songname", "genre"]].drop_duplicates()

# Use the persisted split when it is available. Previously a fresh split was
# computed and then immediately overwritten by the pickled one, and the pickle
# files were opened without ever being closed.
# NOTE(security): pickle.load can execute arbitrary code -- only load files you trust.
try:
    with open("../data/train_songs.p", "rb") as train_file:
        train_songs = pickle.load(train_file)
    with open("../data/test_songs.p", "rb") as test_file:
        test_songs = pickle.load(test_file)
except FileNotFoundError:
    # Fall back to a reproducible, genre-stratified 80/20 split of the songs.
    warnings.warn(
        "Persisted train/test song lists not found; using a fresh stratified "
        "split, which may differ from the persisted one."
    )
    train_songs, test_songs = train_test_split(
        song_genre["songname"],
        test_size=0.2,
        random_state=42,
        stratify=song_genre["genre"],
    )

# Row labels of df whose song belongs to each side of the split
train_idxs = df[df["songname"].isin(train_songs)].index
test_idxs = df[df["songname"].isin(test_songs)].index

# Model on the log-transformed features; .loc makes the label-based lookup explicit
X_train = X_logged.loc[train_idxs, :]
X_test = X_logged.loc[test_idxs, :]
y_train = y.loc[train_idxs]
y_test = y.loc[test_idxs]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(7990, 24) (7990,)
(2000, 24) (2000,)


<IPython.core.display.Javascript object>

In [11]:
# Sanity check: songs present in both train and test (an empty set means no overlap)
set(train_songs) & set(test_songs)

set()

<IPython.core.display.Javascript object>

In [16]:
# Every remaining feature is numeric and gets scaled.
num_cols = list(X.columns)

# Placeholders for other column types; they are empty and not used in this cell.
bin_cols = []

cat_cols = []
drop_cats = []


preprocessing = ColumnTransformer(
    [
        # Scale numeric columns (not needed for all models but can't hurt)
        ("scaler", StandardScaler(), num_cols)
    ],
    remainder="passthrough",
)


pipeline = Pipeline(
    [
        ("preprocessing", preprocessing),
        # Choose your model and put it here
        ("svc", SVC()),
    ]
)


grid = {
    # Keys are "<pipeline step name>__<hyperparameter name>"
    "svc__C": [1, 10, 100],
    "svc__kernel": ["linear"],
    # NOTE(review): per the scikit-learn docs SVC always trains one-vs-one internally and
    # this option only changes the shape of decision_function, so both values are
    # expected to score the same -- confirm before relying on which one "wins".
    "svc__decision_function_shape": ["ovo", "ovr"],
}


# Rows that share a songname must stay in the same CV fold. A plain `cv=2` splits
# individual rows, so parts of one song would appear in both the fitting and the
# validation fold and inflate the cross-validated score used to pick hyperparameters.
train_groups = df.loc[train_idxs, "songname"]

pipeline_cv = GridSearchCV(
    pipeline, grid, verbose=1, n_jobs=-1, cv=GroupKFold(n_splits=2)
)
pipeline_cv.fit(X_train, y_train, groups=train_groups)


# Score of the refit best estimator on train, then on the held-out test songs
print(pipeline_cv.score(X_train, y_train))
print(pipeline_cv.score(X_test, y_test))
pipeline_cv.best_params_

Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   39.0s finished


0.9322903629536922
0.641


{'svc__C': 10,
 'svc__decision_function_shape': 'ovo',
 'svc__degree': 5,
 'svc__kernel': 'poly'}

<IPython.core.display.Javascript object>

In [13]:
# Display the hyperparameter combination selected by the fitted grid search
pipeline_cv.best_params_

{'svc__C': 10,
 'svc__decision_function_shape': 'ovo',
 'svc__degree': 2,
 'svc__kernel': 'rbf'}

<IPython.core.display.Javascript object>