In [1]:
# (Re)load the nb_black IPython extension for this kernel session.
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import warnings

warnings.filterwarnings("ignore")

from dask.distributed import Client, progress


import pandas as pd
import numpy as np
import pickle

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
)
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

<IPython.core.display.Javascript object>

In [3]:
# Local Dask cluster: 6 workers, one thread each, 2GB memory limit per worker.
cluster_kwargs = dict(n_workers=6, threads_per_worker=1, memory_limit="2GB")
client = Client(**cluster_kwargs)

# Display the client summary (scheduler address, dashboard link, resources).
client

0,1
Client  Scheduler: tcp://127.0.0.1:38821  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 6  Cores: 6  Memory: 12.00 GB


<IPython.core.display.Javascript object>

In [12]:
import dask.dataframe as dd
import joblib

<IPython.core.display.Javascript object>

In [5]:
# Alternative inputs, kept for reference.
#   30-second and 3-second features combined:
#     df_long = pd.read_csv("../data/features_30_sec.csv")
#     df_short = pd.read_csv("../data/features_3_sec.csv")
#     df = pd.concat((df_long, df_short))
#   30-second features only:
#     df = pd.read_csv("../data/features_30_sec.csv")

# Active input: the 3-second clip features, read as a Dask DataFrame.
df = dd.read_csv("../data/features_3_sec.csv")

# File names are period-delimited; split once and reuse the pieces below.
filename_parts = df["filename"].str.split(".")

# The first piece is the genre label: "blues.00000.0.wav" -> "blues"
df["genre"] = filename_parts.str[0]

# The first two pieces, put back together, name the song:
#   "blues.00000.0.wav" -> "blues.00000"
#   "blues.00000.wav"   -> "blues.00000"
df["songname"] = filename_parts.str[:2].str.join(".")

<IPython.core.display.Javascript object>

In [9]:
# Feature columns retained for modelling. The selection was generated by tuning
# VIF and then checking model coefficients while tuning.

# Non-MFCC descriptors, in the order they are fed to the models.
base_feature_cols = [
    "chroma_stft_mean",
    "chroma_stft_var",
    "rms_var",
    "zero_crossing_rate_mean",
    "zero_crossing_rate_var",
    "harmony_mean",
    "harmony_var",
    "perceptr_mean",
    "tempo",
]

# (MFCC coefficient number, statistics kept for that coefficient), in column order.
mfcc_kept_stats = [
    (1, ("mean",)),
    (2, ("mean", "var")),
    (3, ("mean", "var")),
    (4, ("mean", "var")),
    (5, ("var",)),
    (6, ("mean", "var")),
    (7, ("mean",)),
    (8, ("mean", "var")),
    (9, ("mean", "var")),
    (10, ("var",)),
    (12, ("mean", "var")),
    (13, ("mean",)),
    (15, ("mean", "var")),
    (16, ("mean", "var")),
    (17, ("mean",)),
    (18, ("mean",)),
    (19, ("mean", "var")),
]

# Expand the spec into column names such as "mfcc2_mean", "mfcc2_var".
mfcc_feature_cols = [
    f"mfcc{coef}_{stat}" for coef, stats in mfcc_kept_stats for stat in stats
]

keep_cols = base_feature_cols + mfcc_feature_cols

<IPython.core.display.Javascript object>

In [10]:
# Target: the genre label derived from each file name.
y = df["genre"]

# Feature matrix: only the columns listed in keep_cols.
X = df[keep_cols]

<IPython.core.display.Javascript object>

In [11]:
# The variance features are log-transformed because of their distributions;
# every other column is carried over unchanged.
variance_cols = [col for col in X.columns if col.endswith("_var")]
X_logged = X.assign(**{col: np.log(X[col]) for col in variance_cols})

<IPython.core.display.Javascript object>

In [16]:
# og: "blues.00000.0.wav"
# songname: "blues.00000"
# genre: "blues"
song_genre = df[["songname", "genre"]].drop_duplicates()

# Stratification was done in another notebook, with some extra steps:
# songs were stratified on the 30 second clips, and that song-level assignment
# is projected down to the 3 second clips here via the song name.

# NOTE(review): pickle can execute arbitrary code while loading; these files are
# assumed to be artifacts produced by this project -- confirm before reusing
# this cell with files from elsewhere.
# Context managers make sure both file handles are closed after loading.
with open("../data/train_songs.p", "rb") as train_file:
    train_songs = pickle.load(train_file)
with open("../data/test_songs.p", "rb") as test_file:
    test_songs = pickle.load(test_file)

# Row membership expressed as lazy boolean masks over the clip table.
train_mask = df["songname"].isin(train_songs)
test_mask = df["songname"].isin(test_songs)

# Index objects of the selected rows; retained so that these names stay defined.
train_idxs = df[train_mask].index
test_idxs = df[test_mask].index

# Filter with the boolean masks rather than with index labels: the masks select
# rows positionally within each partition, so the result does not depend on the
# index labels being unique or on label-based lookup with a lazy Index.
X_train = X_logged.loc[train_mask, :]
X_test = X_logged.loc[test_mask, :]
y_train = y[train_mask]
y_test = y[test_mask]

# Sanity check: features and labels must have matching row counts per split.
print(X_train.compute().shape, y_train.compute().shape)
print(X_test.compute().shape, y_test.compute().shape)

(7990, 36) (7990,)
(2000, 36) (2000,)


<IPython.core.display.Javascript object>

In [17]:
# persist() does not modify the collection it is called on: it returns a new
# collection whose data is held by the cluster. Rebind each name to that result,
# otherwise the persisted data is dropped as soon as the call returns and later
# cells recompute everything from the CSV.
X_train = X_train.persist()
X_test = X_test.persist()
y_train = y_train.persist()
y_test = y_test.persist()

# Display the last persisted collection as the cell output.
y_test

Dask Series Structure:
npartitions=1
    object
       ...
Name: genre, dtype: object
Dask Name: index, 1 tasks

<IPython.core.display.Javascript object>

In [18]:
# All columns of X are routed through the scaler.
num_cols = list(X.columns)

# Scaling is not needed for all models but can't hurt; any column not listed
# in num_cols would be passed through unchanged.
scaler_spec = ("scaler", StandardScaler(), num_cols)
preprocessing = ColumnTransformer(transformers=[scaler_spec], remainder="passthrough")

# Classifier for this pipeline: 100 neighbours, uniform weights.
knn_model = KNeighborsClassifier(n_neighbors=100, weights="uniform")

# A ("pca", PCA()) step between preprocessing and the model is currently disabled.
pipeline_knn = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("knn", knn_model),
    ]
)

<IPython.core.display.Javascript object>

In [21]:
# 4-fold cross-validation of the KNN pipeline on the materialised training data,
# with joblib configured to use the Dask backend.
with joblib.parallel_backend("dask"):
    scores = cross_validate(
        estimator=pipeline_knn,
        X=X_train.compute(),
        y=y_train.compute(),
        cv=4,
    )

# Per-fold fit time, score time and test score.
scores

{'fit_time': array([0.45922542, 0.10578203, 0.1310823 , 0.30323076]),
 'score_time': array([1.30521512, 1.48301768, 1.43849897, 1.45733213]),
 'test_score': array([0.42742743, 0.49249249, 0.46069104, 0.41862794])}

<IPython.core.display.Javascript object>

In [22]:
# Fit the KNN pipeline on the training split.
pipeline_knn.fit(X_train, y=y_train)

# Report the pipeline score on the training split, then on the test split
# (one value per line).
print(
    pipeline_knn.score(X_train, y_train),
    pipeline_knn.score(X_test, y_test),
    sep="\n",
)

0.7321652065081352
0.628


<IPython.core.display.Javascript object>

In [23]:
# All columns of X are routed through the scaler.
num_cols = list(X.columns)

# Scaling is not needed for all models but can't hurt; any column not listed
# in num_cols would be passed through unchanged.
scaler_spec = ("scaler", StandardScaler(), num_cols)
preprocessing = ColumnTransformer(transformers=[scaler_spec], remainder="passthrough")

# Classifier for this pipeline: support vector machine with an RBF kernel, C=10.
svc_model = SVC(C=10, kernel="rbf")

# A ("pca", PCA()) step between preprocessing and the model is currently disabled.
pipeline_svc = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("svc", svc_model),
    ]
)



<IPython.core.display.Javascript object>

In [24]:
# 4-fold cross-validation of the SVC pipeline on the materialised training data,
# with joblib configured to use the Dask backend.
with joblib.parallel_backend("dask"):
    scores = cross_validate(
        estimator=pipeline_svc,
        X=X_train.compute(),
        y=y_train.compute(),
        cv=4,
    )

# Per-fold fit time, score time and test score.
scores

{'fit_time': array([0.90983391, 0.93542743, 0.93849683, 0.92314005]),
 'score_time': array([0.41796446, 0.39633226, 0.85081291, 0.39193892]),
 'test_score': array([0.4964965 , 0.5975976 , 0.56284427, 0.52428643])}

<IPython.core.display.Javascript object>

In [25]:
# Fit the SVC pipeline on the training split.
pipeline_svc.fit(X_train, y_train)

# Report the pipeline score on the training split, then on the test split
# (one value per line).
print(
    pipeline_svc.score(X_train, y_train),
    pipeline_svc.score(X_test, y_test),
    sep="\n",
)

0.9987484355444305
0.7165


<IPython.core.display.Javascript object>

In [26]:
# All columns of X are routed through the scaler.
num_cols = list(X.columns)

# Scaling is not needed for all models but can't hurt; any column not listed
# in num_cols would be passed through unchanged.
scaler_spec = ("scaler", StandardScaler(), num_cols)
preprocessing = ColumnTransformer(transformers=[scaler_spec], remainder="passthrough")

# Classifier for this pipeline: elastic-net logistic regression (saga solver),
# with an even L1/L2 mix, C=0.1 and up to 800 iterations.
log_model = LogisticRegression(
    penalty="elasticnet",
    solver="saga",
    l1_ratio=0.5,
    C=0.1,
    max_iter=800,
)

# A ("pca", PCA()) step between preprocessing and the model is currently disabled.
pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("log", log_model),
    ]
)



<IPython.core.display.Javascript object>

In [27]:
# 4-fold cross-validation of the logistic-regression pipeline on the materialised
# training data, with joblib configured to use the Dask backend.
with joblib.parallel_backend("dask"):
    scores = cross_validate(
        estimator=pipeline,
        X=X_train.compute(),
        y=y_train.compute(),
        cv=4,
    )

# Per-fold fit time, score time and test score.
scores

{'fit_time': array([4.71645594, 2.36453271, 4.47998762, 4.29329538]),
 'score_time': array([0.07666302, 0.03719211, 0.03573394, 0.03163028]),
 'test_score': array([0.48948949, 0.58708709, 0.54131197, 0.49474211])}

<IPython.core.display.Javascript object>

In [28]:
# Fit the logistic-regression pipeline on the training split.
pipeline.fit(X_train, y_train)

# Report the pipeline score on the training split, then on the test split
# (one value per line).
print(
    pipeline.score(X_train, y_train),
    pipeline.score(X_test, y_test),
    sep="\n",
)

0.7058823529411765
0.65


<IPython.core.display.Javascript object>

In [None]:
# Close the Dask client that this notebook created.
client.close()