## Optimizing Random Forest, K Nearest Neighbors, and SVC hyperparameters

In [1]:
import pandas as pd
from utils import (
    music_genre_clean_up,
    divide_dataframe,
    get_preprocessor
)

# Read the raw genre dataset, then run the project's clean-up helper on it.
# The helper's return value is not captured, so it presumably modifies `df`
# in place — TODO confirm against utils.music_genre_clean_up.
df = pd.read_csv(filepath_or_buffer='../data/music_genre.csv')
music_genre_clean_up(df)

# Bare last expression: rendered as the cell output.
df

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,27.0,0.00468,0.652,,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,0.759,Electronic
1,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.002,0.531,Electronic
2,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,0.333,Electronic
3,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,0.270,Electronic
4,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,0.323,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,59.0,0.03340,0.913,,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.028,0.330,Hip-Hop
50001,72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.043,0.113,Hip-Hop
50002,51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,0.395,Hip-Hop
50003,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.886,0.354,Hip-Hop


In [2]:
# Name of the label column to predict.
TARGET = 'music_genre'

# Unpack the four values returned by the project helper. Judging by the printed
# output, `num` and `cat` are lists of numeric and categorical column names;
# `X` / `y` are presumably the feature frame and the target — TODO confirm.
X, y, num, cat = divide_dataframe(df, TARGET)

# One list per line, same text as two separate print calls.
print(num, cat, sep='\n')

['popularity', 'acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence']
['key', 'mode']


In [3]:
# Build the preprocessing step from the numeric and categorical column lists;
# it is reused as the first stage of each model pipeline in this notebook.
# NOTE(review): the transformations themselves live in utils.get_preprocessor
# and are not visible here.
preprocessor = get_preprocessor(num, cat)

In [4]:
from sklearn.model_selection import train_test_split

# First carve off 20% of the rows, then cut that hold-out in half, so the final
# proportions are 80% train / 10% validation / 10% test.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, random_state=42, test_size=0.2
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, random_state=42, test_size=0.5
)
# NOTE(review): X_val / y_val are not referenced by any of the cells visible in
# this part of the notebook.

### KNN

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# k-NN classifier behind the shared preprocessing step.
knn = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', KNeighborsClassifier()),
])

# Candidate hyperparameters; the `model__` prefix routes each one to the
# pipeline's 'model' step.
knn_params = dict(
    model__n_neighbors=[25, 75, 100, 150, 200],
    model__weights=['uniform', 'distance'],
    model__p=[1, 2],  # Minkowski power: 1 = Manhattan, 2 = Euclidean
)

# Exhaustive cross-validated search scored on accuracy, using all CPU cores.
# NOTE(review): the recorded run of this cell reported NaN scores for some
# candidates. Candidates whose fit or scoring fails are scored as NaN by
# default, so consider error_score='raise' while debugging to surface the cause.
knn_grid = GridSearchCV(
    estimator=knn,
    param_grid=knn_params,
    scoring='accuracy',
    n_jobs=-1,
)
knn_grid.fit(X_train, y_train)

      nan 0.50205  0.52335  0.50555       nan 0.503825 0.5206   0.502975
      nan 0.506775 0.518975 0.5018  ]


In [9]:
# Report the winning grid point and the accuracy of the refit best pipeline on
# the held-out test split (this is not the cross-validated `best_score_`).
best_knn = knn_grid.best_estimator_
print("Best parameters:", knn_grid.best_params_)
print("Best score:", best_knn.score(X_test, y_test))

Best parameters: {'model__n_neighbors': 100, 'model__p': 2, 'model__weights': 'uniform'}
Best score: 0.5244


Not much difference: the best parameters are mostly the defaults.

### Random forest

In [11]:
# Random forest (fixed seed) behind the shared preprocessing step.
rand_forest = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', RandomForestClassifier(random_state=42)),
])

# Candidate hyperparameters for the 'model' step: 2 * 3 * 2 * 2 * 2 * 1 * 2 = 96
# combinations in total.
rf_params = dict(
    model__n_estimators=[100, 200],
    model__max_depth=[None, 10, 30],
    model__min_samples_split=[2, 5],
    model__min_samples_leaf=[1, 2],
    model__max_features=['sqrt', 'log2'],
    model__bootstrap=[True],
    model__criterion=['gini', 'entropy'],
)

# Exhaustive cross-validated search scored on accuracy, using all CPU cores.
rf_grid = GridSearchCV(
    estimator=rand_forest,
    param_grid=rf_params,
    scoring='accuracy',
    n_jobs=-1,
)
rf_grid.fit(X_train, y_train)

In [12]:
# Report the winning grid point and the accuracy of the refit best pipeline on
# the held-out test split (this is not the cross-validated `best_score_`).
best_rf = rf_grid.best_estimator_
print("Best parameters:", rf_grid.best_params_)
print("Best score:", best_rf.score(X_test, y_test))

Best parameters: {'model__bootstrap': True, 'model__criterion': 'entropy', 'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 2, 'model__min_samples_split': 2, 'model__n_estimators': 200}
Best score: 0.5636


Minimal improvement (after a 10-minute wait).

### SVC

In [14]:
from sklearn.svm import SVC

# Support-vector classifier behind the shared preprocessing step.
svc = Pipeline([
    ('preprocessing', preprocessor),
    ('model', SVC(random_state=42))
])

# `gamma` is a kernel coefficient for the rbf/poly/sigmoid kernels only, so
# searching it together with kernel='linear' refits identical models. Giving
# each kernel its own sub-grid keeps the same effective search space with
# 9 candidates instead of 12.
svc_params = [
    {
        'model__kernel': ['linear'],
        'model__C': [0.1, 1, 10],
    },
    {
        'model__kernel': ['rbf'],
        'model__C': [0.1, 1, 10],
        'model__gamma': ['scale', 'auto'],
    },
]

# Exhaustive cross-validated search scored on accuracy, using all CPU cores.
svc_grid = GridSearchCV(svc, svc_params, scoring='accuracy', n_jobs=-1)
svc_grid.fit(X_train, y_train)

In [15]:
# Report the winning grid point and the accuracy of the refit best pipeline on
# the held-out test split (this is not the cross-validated `best_score_`).
best_svc = svc_grid.best_estimator_
print("Best parameters:", svc_grid.best_params_)
print("Best score:", best_svc.score(X_test, y_test))

Best parameters: {'model__C': 1, 'model__gamma': 'scale', 'model__kernel': 'rbf'}
Best score: 0.578


Almost no improvement (after an 8-minute wait).