## Grid search
I want to perform a grid search accross different column subsets


In [5]:
import sys
sys.path.append('..')

from metrics import default_competition_metric
from metrics import make_competition_scorer, competition_scoring, ColumnSelector


import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

np.random.seed(44)

In [6]:
# device = 'cuda' # modify if needed

In [7]:
X_train = np.load('../../data/x_train.npy')
y_train = np.load('../../data/y_train.npy')
X_val = np.load('../../data/x_val.npy')
y_val = np.load('../../data/y_val.npy')


In [9]:
model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=44)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

default_competition_metric(y_val, y_pred=y_pred, k=X_train.shape[1])

-98150.0

In [10]:
print(f"Accuracy: {np.mean(y_val == y_pred)}")

Accuracy: 0.606


### Pipeline

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[:, self.columns]

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.compose import ColumnTransformer

# pipeline
pipe = Pipeline([
    ("feature_selection", ColumnSelector(columns = [100, 101])),
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(n_estimators=1000, max_depth=5, random_state=44))
])


pipe.fit(X_train, y_train)

y_proba = pipe.predict_proba(X_val)[:, 1]
default_competition_metric(y_val, k=2, y_pred_proba=y_proba)

6450.000000000001

In [17]:
competition_scoring(pipe, X_val, y_val, scale_metric=True)

6450.000000000001

In [18]:
default_competition_metric(y_val, y_pred_proba=pipe.predict_proba(X_val)[:, 1], k=2)

6450.000000000001

In [19]:
competition_scoring(pipe, X_val, y_val)

6450.000000000001

## Grid search on features from Boruta

In [21]:
features_to_train = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 100, 101, 102, 103, 104, 105])

In [23]:
pipe = Pipeline([
    ("feature_selection", ColumnSelector()),
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=44))
])

def get_all_subsets(input_list):
    subsets = []
    n = len(input_list)
    for i in range(1, 2**n):
        subset = [input_list[j] for j in range(n) if (i & (1 << j))]
        subsets.append(subset)
    return subsets

grid = {
    'feature_selection__columns': get_all_subsets(features_to_train),
    'model__bootstrap': [True, False],
    'model__max_depth': [10, 20, 40, 60, 80, 100, None],
    'model__max_features': ['auto', 'sqrt'],
    'model__min_samples_leaf': [1, 2, 4],
    'model__min_samples_split': [2, 5, 10],
    'model__n_estimators': [200, 500, 1000, 1500, 2000]    
}

grid_search = RandomizedSearchCV(pipe, grid, cv=5, scoring=competition_scoring, verbose=2, n_iter = 100)

grid_search.fit(X_train, y_train)
print("Best score: ", grid_search.best_score_)
print("Best parameters: ", grid_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END feature_selection__columns=[1, 2, 4, 6, 7, 9, 102, 103, 105], model__bootstrap=False, model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=2000; total time=   0.0s
[CV] END feature_selection__columns=[1, 2, 4, 6, 7, 9, 102, 103, 105], model__bootstrap=False, model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=2000; total time=   0.0s
[CV] END feature_selection__columns=[1, 2, 4, 6, 7, 9, 102, 103, 105], model__bootstrap=False, model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=5, model__n_estimators=2000; total time=   0.0s
[CV] END feature_selection__columns=[1, 2, 4, 6, 7, 9, 102, 103, 105], model__bootstrap=False, model__max_depth=None, model__max_features=auto, model__min_samples_leaf=1, model__min_samples_split=5, model_

245 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
245 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\tymot\Documents\studia\aml\pro\adv_ml_project_2\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\tymot\Documents\studia\aml\pro\adv_ml_project_2\.venv\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\tymot\Documents\studia\aml\pro\adv_ml_project_2\.venv\Lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_es

Best score:  6287.5
Best parameters:  {'model__n_estimators': 1500, 'model__min_samples_split': 5, 'model__min_samples_leaf': 1, 'model__max_features': 'sqrt', 'model__max_depth': 10, 'model__bootstrap': True, 'feature_selection__columns': [5, 100, 103]}


In [24]:
best_params_boruta = {'model__n_estimators': 1500, 'model__min_samples_split': 5, 'model__min_samples_leaf': 1, 'model__max_features': 'sqrt', 'model__max_depth': 10, 'model__bootstrap': True, 'feature_selection__columns': [5, 100, 103]}

pipe.set_params(**best_params_boruta)
pipe.fit(X_train, y_train)
competition_scoring(pipe, X_val, y_val)

6400.0

## MRMR features

In [25]:
# larger training set
features_to_train = np.array([100, 102, 105,403, 466])

In [26]:
pipe = Pipeline([
    ("feature_selection", ColumnSelector()),
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=44))
])

def get_all_subsets(input_list):
    subsets = []
    n = len(input_list)
    for i in range(1, 2**n):
        subset = [input_list[j] for j in range(n) if (i & (1 << j))]
        subsets.append(subset)
    return subsets

grid = {
    'feature_selection__columns': get_all_subsets(features_to_train),
    'model__bootstrap': [True, False],
    'model__max_depth': [10, 20, 40, 60, 80, 100, None],
    'model__max_features': ['auto', 'sqrt'],
    'model__min_samples_leaf': [1, 2, 4],
    'model__min_samples_split': [2, 5, 10],
    'model__n_estimators': [200, 500, 1000, 1500, 2000]    
}


grid_search = RandomizedSearchCV(pipe, grid, cv=5, scoring=competition_scoring, verbose=2, n_iter = 100)

grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 5 folds for each of 100 candidates, totalling 500 fits
[CV] END feature_selection__columns=[102], model__bootstrap=False, model__max_depth=100, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=500; total time=   0.0s
[CV] END feature_selection__columns=[102], model__bootstrap=False, model__max_depth=100, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=500; total time=   0.0s
[CV] END feature_selection__columns=[102], model__bootstrap=False, model__max_depth=100, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=500; total time=   0.0s
[CV] END feature_selection__columns=[102], model__bootstrap=False, model__max_depth=100, model__max_features=auto, model__min_samples_leaf=4, model__min_samples_split=10, model__n_estimators=500; total time=   0.0s
[CV] END feature_selection__columns=[102], model__bootstrap=False, model__max

225 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
225 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\tymot\Documents\studia\aml\pro\adv_ml_project_2\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\tymot\Documents\studia\aml\pro\adv_ml_project_2\.venv\Lib\site-packages\sklearn\base.py", line 1474, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\tymot\Documents\studia\aml\pro\adv_ml_project_2\.venv\Lib\site-packages\sklearn\pipeline.py", line 475, in fit
    self._final_es

{'model__n_estimators': 1000,
 'model__min_samples_split': 10,
 'model__min_samples_leaf': 4,
 'model__max_features': 'sqrt',
 'model__max_depth': 20,
 'model__bootstrap': False,
 'feature_selection__columns': [100, 102, 105]}

In [31]:
import pandas as pd
results = pd.DataFrame(grid_search.cv_results_)
results.to_csv('mrmr_rf.csv')

In [27]:
print("Best score: ", grid_search.best_score_)
print("Best parameters: ", grid_search.best_params_)

Best score:  6362.5
Best parameters:  {'model__n_estimators': 1000, 'model__min_samples_split': 10, 'model__min_samples_leaf': 4, 'model__max_features': 'sqrt', 'model__max_depth': 20, 'model__bootstrap': False, 'feature_selection__columns': [100, 102, 105]}


In [28]:
best_params_mrmr = {'model__n_estimators': 1000, 'model__min_samples_split': 10, 'model__min_samples_leaf': 4, 'model__max_features': 'sqrt', 'model__max_depth': 20, 'model__bootstrap': False, 'feature_selection__columns': [100, 102, 105]}

In [29]:
pipe.set_params(**best_params_mrmr)

In [30]:
pipe.fit(X_train, y_train)

y_proba = pipe.predict_proba(X_val)[:, 1]
competition_scoring(pipe, X_val, y_val, scale_metric=True)

6650.0