In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import warnings

import pandas as pd
import numpy as np
import pickle

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    fbeta_score,
    f1_score,
    make_scorer,
    accuracy_score,
)

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

<IPython.core.display.Javascript object>

In [3]:
def print_vif(x):
    """Utility for checking multicollinearity assumption
    
    :param x: input features to check using VIF. This is assumed to be a pandas.DataFrame
    :return: nothing is returned the VIFs are printed as a pandas series
    """
    # Silence numpy FutureWarning about .ptp
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        x = sm.add_constant(x)

    vifs = []
    for i in range(x.shape[1]):
        vif = variance_inflation_factor(x.values, i)
        vifs.append(vif)

    print("VIF results\n-------------------------------")
    print(pd.Series(vifs, index=x.columns))
    print("-------------------------------\n")

<IPython.core.display.Javascript object>

In [4]:
# To use all
# df_long = pd.read_csv("../data/features_30_sec.csv")
# df_short = pd.read_csv("../data/features_3_sec.csv")
# df = pd.concat((df_long, df_short))

# To use just one
# df = pd.read_csv("../data/features_30_sec.csv")
df = pd.read_csv("../data/features_3_sec.csv")

df["genre"] = df["filename"].str.split(".").str[0]

# "blues.00000.0.wav" -> "blues.00000"
# and
# "blues.00000.wav" -> "blues.00000"
# logic: split on period, take first 2 elements, and but back together
df["songname"] = df["filename"].str.split(".").str[:2].str.join(".")

<IPython.core.display.Javascript object>

In [5]:
m_start = 20  # highest mfcc to use. higher than this is too high in the frequency spectrum to really matter
mel_freq_drops = [f"mfcc{x}_mean" for x in range(m_start, 21)] + [
    f"mfcc{x}_var" for x in range(m_start, 21)
]

<IPython.core.display.Javascript object>

In [6]:
drop_cols = [
    "filename",
    "label",
    "genre",
    "songname",
    "length",
    "chroma_stft_mean",
    #     "chroma_stft_var",
    "rms_mean",
    #     "rms_var",
    "spectral_centroid_mean",
    "spectral_centroid_var",
    "spectral_bandwidth_mean",
    #     "spectral_bandwidth_var",
    "rolloff_mean",
    #     "rolloff_var",
    "zero_crossing_rate_mean",
    #     "zero_crossing_rate_var",
    #     "harmony_mean",
    "harmony_var",
    #     "perceptr_mean",
    #     "perceptr_var",
    #     "tempo",
]

drop_cols = drop_cols + mel_freq_drops
# print_vif(df.drop(drop_cols, 1,))

<IPython.core.display.Javascript object>

In [7]:
# X = df.drop(columns=drop_cols + ["genre"])
X = df.drop(drop_cols, 1)
y = df["genre"]

<IPython.core.display.Javascript object>

In [8]:
X_logged = X.copy()
for c in X_logged:
    if c.endswith("_var"):
        X_logged[c] = np.log(X_logged[c])

<IPython.core.display.Javascript object>

In [9]:
print_vif(X)

VIF results
-------------------------------
const                     156.060365
chroma_stft_var             1.750548
rms_var                     3.113077
spectral_bandwidth_var      6.557318
rolloff_var                 7.750728
zero_crossing_rate_var      2.511541
harmony_mean                1.480923
perceptr_mean               1.584839
perceptr_var                3.515107
tempo                       1.011359
mfcc1_mean                  3.284236
mfcc1_var                   2.063002
mfcc2_mean                  3.053849
mfcc2_var                   3.255681
mfcc3_mean                  2.325006
mfcc3_var                   2.121027
mfcc4_mean                  2.222342
mfcc4_var                   2.178382
mfcc5_mean                  2.760103
mfcc5_var                   2.113111
mfcc6_mean                  3.378106
mfcc6_var                   2.427075
mfcc7_mean                  3.240837
mfcc7_var                   2.161646
mfcc8_mean                  3.979041
mfcc8_var                   2.1

<IPython.core.display.Javascript object>

In [10]:
# og: "blues.00000.0.wav"
# songname: "blues.00000"
# genre: "blues"
song_genre = df[["songname", "genre"]].drop_duplicates()

train_songs, test_songs = train_test_split(
    song_genre["songname"], test_size=0.2, random_state=42, stratify=song_genre["genre"]
)

train_songs = pickle.load(open("../data/train_songs.p", "rb"))
test_songs = pickle.load(open("../data/test_songs.p", "rb"))

train_idxs = df[df["songname"].isin(train_songs)].index
test_idxs = df[df["songname"].isin(test_songs)].index

X_train = X_logged.loc[train_idxs, :]
X_test = X_logged.loc[test_idxs, :]
y_train = y[train_idxs]
y_test = y[test_idxs]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(7990, 47) (7990,)
(2000, 47) (2000,)


<IPython.core.display.Javascript object>

In [11]:
# Prove no overlap of songs between train/test
set(train_songs).intersection(set(test_songs))

set()

<IPython.core.display.Javascript object>

num_cols = list(X.columns)

bin_cols = []

cat_cols = []
drop_cats = []


preprocessing = ColumnTransformer(
    [
        # Scale numeric columns (not needed for all models but can't hurt)
        ("scaler", StandardScaler(), num_cols)
    ],
    remainder="passthrough",
)


pipeline = Pipeline(
    [
        ("preprocessing", preprocessing),
        #         ("pca", PCA()),
        # Choose your model and put it here
        ("knn", KNeighborsClassifier()),
    ]
)


params = {
    "knn__n_neighbors": [20],
    "knn__weights": ["uniform"],
    "knn__leaf_size": [100],
    "knn__algorithm": ["kd_tree", "ball_tree"],
}


pipeline_cv = GridSearchCV(pipeline, params, verbose=1, n_jobs=-1, cv=2)

pipeline_cv.fit(X_train, y=y_train)


print(pipeline_cv.score(X_train, y_train))
print(pipeline_cv.score(X_test, y_test))
pipeline_cv.best_params_

In [12]:
num_cols = list(X.columns)

preprocessing = ColumnTransformer(
    [
        # Scale numeric columns (not needed for all models but can't hurt)
        ("scaler", StandardScaler(), num_cols)
    ],
    remainder="passthrough",
)

pipeline = Pipeline(
    [
        ("preprocessing", preprocessing),
        #         ("pca", PCA()),
        # Choose your model and put it here
        (
            "knn",
            KNeighborsClassifier(
                algorithm="kd_tree", leaf_size=400, n_neighbors=60, weights="uniform"
            ),
        ),
    ]
)

pipeline.fit(X_train, y=y_train)
print(pipeline.score(X_train, y_train))
print(pipeline.score(X_test, y_test))

0.7627033792240301
0.643


<IPython.core.display.Javascript object>

In [13]:
y_pred = pipeline.predict(X_test)

print(confusion_matrix(y_test, y_pred))

[[114   0  11  10   4  12   5   0  41   3]
 [  0 195   2   0   0   2   0   0   0   1]
 [  7   0 106  47   2   5  11   2  15   5]
 [  0   0   9 148   9   3   2  17  11   1]
 [  3   0   1  18 100   0   8  35  34   1]
 [  2  31  21  10   3 112   0   9   4   8]
 [  1   0   1  16   7   0 168   0   2   5]
 [  0   0   9  13  11   1   0 160   2   4]
 [  1   0  19  29  17   0   4   6 123   1]
 [  8   0   7  70   4  17  22   7   5  60]]


<IPython.core.display.Javascript object>

In [14]:
index = [f"{x}" for x in pipeline.classes_]
cols = [f"{x}" for x in pipeline.classes_]

pd.DataFrame(confusion_matrix(y_test, y_pred), index=index, columns=cols)

Unnamed: 0,blues,classical,country,disco,hiphop,jazz,metal,pop,reggae,rock
blues,114,0,11,10,4,12,5,0,41,3
classical,0,195,2,0,0,2,0,0,0,1
country,7,0,106,47,2,5,11,2,15,5
disco,0,0,9,148,9,3,2,17,11,1
hiphop,3,0,1,18,100,0,8,35,34,1
jazz,2,31,21,10,3,112,0,9,4,8
metal,1,0,1,16,7,0,168,0,2,5
pop,0,0,9,13,11,1,0,160,2,4
reggae,1,0,19,29,17,0,4,6,123,1
rock,8,0,7,70,4,17,22,7,5,60


<IPython.core.display.Javascript object>

In [15]:
keep_cols = X.columns

<IPython.core.display.Javascript object>

In [16]:
df["predictions"] = pipeline.predict(X_logged[keep_cols])

<IPython.core.display.Javascript object>

In [17]:
long = pd.read_csv("../data/features_30_sec.csv")

<IPython.core.display.Javascript object>

In [18]:
long["vote_pred"] = "none"

for i in range(long["filename"].size):
    curr_file = long["filename"][i]
    file_stripped = curr_file.strip(".wav")
    sub_selection = df["filename"].str.contains(file_stripped)
    prediction = (
        df[sub_selection]["predictions"]
        .value_counts()
        .sort_values(ascending=False)
        .index[0]
    )
    long["vote_pred"][i] = prediction

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


<IPython.core.display.Javascript object>

In [19]:
long_model_pred = long[keep_cols]
for c in long_model_pred:
    if c.endswith("_var"):
        long_model_pred[c] = np.log(long_model_pred[c])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


<IPython.core.display.Javascript object>

In [20]:
long["model_pred"] = pipeline.predict(long_model_pred)

<IPython.core.display.Javascript object>

In [21]:
probs_df = pd.DataFrame(pipeline.predict_proba(X_logged), columns=pipeline.classes_)
df = pd.concat([df, probs_df], 1)

<IPython.core.display.Javascript object>

In [22]:
for c in pipeline.classes_:
    long[c] = 0.0

long["avg_vote"] = "none"

for i in range(long["filename"].size):
    curr_file = long["filename"][i]
    file_stripped = curr_file.strip(".wav")
    sub_selection = df["filename"].str.contains(file_stripped)
    avg_dict = {}
    for c in pipeline.classes_:
        long[c][i] = df[sub_selection][c].mean()
        avg_dict[c] = df[sub_selection][c].mean()
        prediction = max(avg_dict, key=avg_dict.get)
    long["avg_vote"][i] = prediction

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


<IPython.core.display.Javascript object>

In [23]:
long["songname"] = long["filename"].str.split(".").str[:2].str.join(".")

train_idxs = long[long["songname"].isin(train_songs)].index
test_idxs = long[long["songname"].isin(test_songs)].index

long_train = long.loc[train_idxs, :]
long_test = long.loc[test_idxs, :]

<IPython.core.display.Javascript object>

In [24]:
long_train = long.iloc[train_idxs, :]
long_test = long.iloc[test_idxs, :]

print(confusion_matrix(long_test["label"], long_test["avg_vote"]))
print(classification_report(long_test["label"], long_test["avg_vote"]))

[[13  0  0  0  0  2  0  0  5  0]
 [ 0 20  0  0  0  0  0  0  0  0]
 [ 0  0 14  4  0  0  1  0  1  0]
 [ 0  0  1 18  0  0  0  1  0  0]
 [ 0  0  0  2  9  0  1  5  3  0]
 [ 0  5  0  0  0 14  0  1  0  0]
 [ 0  0  0  1  0  0 19  0  0  0]
 [ 0  0  1  0  1  0  0 18  0  0]
 [ 0  0  2  3  1  0  0  1 13  0]
 [ 1  0  1  8  0  1  2  1  0  6]]
              precision    recall  f1-score   support

       blues       0.93      0.65      0.76        20
   classical       0.80      1.00      0.89        20
     country       0.74      0.70      0.72        20
       disco       0.50      0.90      0.64        20
      hiphop       0.82      0.45      0.58        20
        jazz       0.82      0.70      0.76        20
       metal       0.83      0.95      0.88        20
         pop       0.67      0.90      0.77        20
      reggae       0.59      0.65      0.62        20
        rock       1.00      0.30      0.46        20

    accuracy                           0.72       200
   macro avg       

<IPython.core.display.Javascript object>

In [25]:
pd.DataFrame(
    confusion_matrix(long_test["label"], long_test["avg_vote"]),
    index=index,
    columns=cols,
)

Unnamed: 0,blues,classical,country,disco,hiphop,jazz,metal,pop,reggae,rock
blues,13,0,0,0,0,2,0,0,5,0
classical,0,20,0,0,0,0,0,0,0,0
country,0,0,14,4,0,0,1,0,1,0
disco,0,0,1,18,0,0,0,1,0,0
hiphop,0,0,0,2,9,0,1,5,3,0
jazz,0,5,0,0,0,14,0,1,0,0
metal,0,0,0,1,0,0,19,0,0,0
pop,0,0,1,0,1,0,0,18,0,0
reggae,0,0,2,3,1,0,0,1,13,0
rock,1,0,1,8,0,1,2,1,0,6


<IPython.core.display.Javascript object>

In [26]:
long.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,country,disco,hiphop,jazz,metal,pop,reggae,rock,avg_vote,songname
0,blues.00000.wav,661794,0.350088,0.088757,0.130228,0.002827,1784.16585,129774.064525,2002.44906,85882.761315,...,0.168333,0.18,0.096667,0.003333,0.013333,0.0,0.021667,0.146667,blues,blues.00000
1,blues.00001.wav,661794,0.340914,0.09498,0.095948,0.002373,1530.176679,375850.073649,2039.036516,213843.755497,...,0.121667,0.126667,0.088333,0.008333,0.005,0.0,0.351667,0.12,reggae,blues.00001
2,blues.00002.wav,661794,0.363637,0.085275,0.17557,0.002746,1552.811865,156467.643368,1747.702312,76254.192257,...,0.203333,0.171667,0.016667,0.023333,0.031667,0.0,0.025,0.156667,blues,blues.00002
3,blues.00003.wav,661794,0.404785,0.093999,0.141093,0.006346,1070.106615,184355.942417,1596.412872,166441.494769,...,0.135,0.03,0.051667,0.046667,0.0,0.008333,0.285,0.13,blues,blues.00003
4,blues.00004.wav,661794,0.308526,0.087841,0.091529,0.002303,1835.004266,343399.939274,1748.172116,88445.209036,...,0.228333,0.078333,0.02,0.03,0.015,0.0,0.051667,0.17,blues,blues.00004


<IPython.core.display.Javascript object>

In [27]:
long[
    [
        "filename",
        "avg_vote",
        "blues",
        "classical",
        "country",
        "disco",
        "hiphop",
        "jazz",
        "metal",
        "pop",
        "reggae",
        "rock",
    ]
].loc[4:4].round(2).T

Unnamed: 0,4
filename,blues.00004.wav
avg_vote,blues
blues,0.38
classical,0.03
country,0.23
disco,0.08
hiphop,0.02
jazz,0.03
metal,0.02
pop,0


<IPython.core.display.Javascript object>

In [29]:
dir(pipeline.named_steps["knn"])

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_check_algorithm_metric',
 '_estimator_type',
 '_fit',
 '_fit_X',
 '_fit_method',
 '_get_param_names',
 '_get_tags',
 '_kneighbors_reduce_func',
 '_more_tags',
 '_pairwise',
 '_tree',
 '_y',
 'algorithm',
 'classes_',
 'effective_metric_',
 'effective_metric_params_',
 'fit',
 'get_params',
 'kneighbors',
 'kneighbors_graph',
 'leaf_size',
 'metric',
 'metric_params',
 'n_jobs',
 'n_neighbors',
 'n_samples_fit_',
 'outputs_2d_',
 'p',
 'predict',
 'predict_proba',
 'radius',
 'score',
 'set_params',
 'weights']

<IPython.core.display.Javascript object>

In [33]:
pipeline.named_steps["knn"].n_samples_fit_

7990

<IPython.core.display.Javascript object>