In [1]:
%reload_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from xgboost import XGBClassifier

<IPython.core.display.Javascript object>

In [3]:
df = pd.read_csv("../data/features_3_sec.csv")
df["genre"] = df["filename"].str.split(".").str[0]

# "blues.00000.0.wav" -> "blues.00000"
df["songname"] = df["filename"].str[:-6]

<IPython.core.display.Javascript object>

In [4]:
drop_cols = [
    # ------------
    # Added by adam
    # ------------
    "songname",
    # ------------
    # Original from initial_eda.ipynb
    # ------------
    "length",
    "filename",
    "label",
    #     "zero_crossing_rate_mean",
    #     "zero_crossing_rate_var",
    "rolloff_mean",
    "harmony_var",
    "rolloff_var",
    "spectral_centroid_var",
    "spectral_bandwidth_var",
    "spectral_centroid_mean",
    "spectral_bandwidth_mean",
]

<IPython.core.display.Javascript object>

## Normal train/test split

In [5]:
X = df.drop(columns=drop_cols + ["genre"])
y = df["genre"]

<IPython.core.display.Javascript object>

In [6]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(7992, 50) (7992,)
(1998, 50) (1998,)


<IPython.core.display.Javascript object>

In [7]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

<IPython.core.display.Javascript object>

In [8]:
model = KNeighborsClassifier(10)
model.fit(X_train, y_train)

print(f"Train score: {model.score(X_train, y_train):.4f}")
print(f"Test score: {model.score(X_test, y_test):.4f}")

Train score: 0.9040
Test score: 0.8388


<IPython.core.display.Javascript object>

In [9]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

print(f"Train score: {model.score(X_train, y_train):.4f}")
print(f"Test score: {model.score(X_test, y_test):.4f}")

Train score: 0.7264
Test score: 0.7087


<IPython.core.display.Javascript object>

## Song based train/test split

In [10]:
X = df.drop(columns=drop_cols + ["genre"])
y = df["genre"]

<IPython.core.display.Javascript object>

In [11]:
song_genre = df[["songname", "genre"]].drop_duplicates()

train_songs, test_songs = train_test_split(
    song_genre["songname"], test_size=0.2, random_state=42, stratify=song_genre["genre"]
)

train_idxs = df[df["songname"].isin(train_songs)].index
test_idxs = df[df["songname"].isin(test_songs)].index

X_train = X.loc[train_idxs, :]
X_test = X.loc[test_idxs, :]
y_train = y[train_idxs]
y_test = y[test_idxs]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(7990, 50) (7990,)
(2000, 50) (2000,)


<IPython.core.display.Javascript object>

In [12]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train = pd.DataFrame(X_train, columns=X.columns)
X_test = pd.DataFrame(X_test, columns=X.columns)

<IPython.core.display.Javascript object>

In [18]:
model = KNeighborsClassifier(5)
model.fit(X_train, y_train)

print(f"Train score: {model.score(X_train, y_train):.4f}")
print(f"Test score: {model.score(X_test, y_test):.4f}")

Train score: 0.9448
Test score: 0.6495


<IPython.core.display.Javascript object>

In [14]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

print(f"Train score: {model.score(X_train, y_train):.4f}")
print(f"Test score: {model.score(X_test, y_test):.4f}")

Train score: 0.7395
Test score: 0.6445


<IPython.core.display.Javascript object>

In [15]:
model = XGBClassifier()
model.fit(X_train, y_train)

print(f"Train score: {model.score(X_train, y_train):.4f}")
print(f"Test score: {model.score(X_test, y_test):.4f}")

Train score: 0.9987
Test score: 0.7560


<IPython.core.display.Javascript object>

In [16]:
params = {
    "subsample": [0.5, 0.75, 1.0],
    "colsample_bytree": [0.4, 0.6, 0.8, 1.0],
    "max_depth": [6, 8, 10],
}

n_trees = 100
learning_rate = 2 / n_trees


model_cv = GridSearchCV(
    XGBClassifier(n_estimators=n_trees, learning_rate=learning_rate),
    params,
    verbose=1,
    n_jobs=-1,
    cv=2,
)
model_cv.fit(X_train, y=y_train, eval_set=[(X_test, y_test)])

Fitting 2 folds for each of 36 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  38 tasks      | elapsed:  1.7min


[0]	validation_0-merror:0.48650


[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  4.5min finished


[1]	validation_0-merror:0.39750
[2]	validation_0-merror:0.37100
[3]	validation_0-merror:0.36750
[4]	validation_0-merror:0.33700
[5]	validation_0-merror:0.32850
[6]	validation_0-merror:0.31700
[7]	validation_0-merror:0.31500
[8]	validation_0-merror:0.30550
[9]	validation_0-merror:0.30850
[10]	validation_0-merror:0.30700
[11]	validation_0-merror:0.30350
[12]	validation_0-merror:0.30250
[13]	validation_0-merror:0.29850
[14]	validation_0-merror:0.29300
[15]	validation_0-merror:0.29200
[16]	validation_0-merror:0.29100
[17]	validation_0-merror:0.29000
[18]	validation_0-merror:0.29050
[19]	validation_0-merror:0.29100
[20]	validation_0-merror:0.29300
[21]	validation_0-merror:0.29100
[22]	validation_0-merror:0.28750
[23]	validation_0-merror:0.28550
[24]	validation_0-merror:0.28600
[25]	validation_0-merror:0.28650
[26]	validation_0-merror:0.28550
[27]	validation_0-merror:0.28750
[28]	validation_0-merror:0.28550
[29]	validation_0-merror:0.28650
[30]	validation_0-merror:0.28450
[31]	validation_0-m

GridSearchCV(cv=2, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=0.02, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estim...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, 

<IPython.core.display.Javascript object>

In [17]:
print(f"Train score: {model_cv.score(X_train, y_train):.4f}")
print(f"Test score: {model_cv.score(X_test, y_test):.4f}")
model_cv.best_params_

Train score: 0.9982
Test score: 0.7250


{'colsample_bytree': 0.4, 'max_depth': 10, 'subsample': 1.0}

<IPython.core.display.Javascript object>