In [1]:
import numpy as np
import os
import pandas as pd
from skimage.io import imread, imsave
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, mean_absolute_error
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [2]:
er_trdat_path = './data/train/ER/'
nr_trdat_path = './data/train/NR/'
cropdat_path = './data/train/all_cropped/'
balanced_path = './data/train/all_balanced/'

### form crp_ and bal_ datasets

In [3]:
conv_type = {'ER': 1, 'NR': 0}

In [4]:
%%time
X_crp = []
y_crp_typ = []
y_crp_nrj = []
for filename in os.listdir(cropdat_path):
    img = imread(cropdat_path + filename)    
    X_crp.append(img.flatten())
    fn_parts = filename.split('-')
    y_crp_typ.append(conv_type[fn_parts[1]]) 
    y_crp_nrj.append(int(fn_parts[2].split('.')[0]  ))
print(len(X_crp))
print(len(y_crp_typ))
print(len(y_crp_nrj))

13404
13404
13404
Wall time: 4.13 s


In [5]:
unique, counts = np.unique(y_crp_typ, return_counts=True)
print(dict(zip(unique, counts)))
unique, counts = np.unique(y_crp_nrj, return_counts=True)
print(dict(zip(unique, counts)))

{0: 6646, 1: 6758}
{1: 2180, 3: 2245, 6: 2256, 10: 2274, 20: 2210, 30: 2239}


# 1. Classification

In [7]:
X_crp = np.array(X_crp)
y_crp_typ = np.array(y_crp_typ)

In [8]:
cv = StratifiedKFold(n_splits=3,shuffle=True,random_state=125)

### SGD

In [10]:
%%time
param_grid = {
    'loss': ['log', 'hinge'],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [10 ** x for x in range(-6, 1)],
    'l1_ratio': [0, 0.05, 0.15, 0.25, 0.5, 0.8, 1],
}
clf = SGDClassifier(random_state=125)
gscv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
gscv.fit(X_crp, y_crp_typ)

Fitting 3 folds for each of 294 candidates, totalling 882 fits
Wall time: 34min 1s


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=125, shuffle=True),
             estimator=SGDClassifier(random_state=125), n_jobs=-2,
             param_grid={'alpha': [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.1, 1],
                         'l1_ratio': [0, 0.05, 0.15, 0.25, 0.5, 0.8, 1],
                         'loss': ['log', 'hinge'],
                         'penalty': ['l1', 'l2', 'elasticnet']},
             scoring='roc_auc', verbose=2)

In [11]:
df = pd.DataFrame(gscv.cv_results_)
print(gscv.best_params_)
df

{'alpha': 0.1, 'l1_ratio': 0.05, 'loss': 'hinge', 'penalty': 'elasticnet'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_l1_ratio,param_loss,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,28.713367,2.758909,0.066156,0.003672,0.000001,0,log,l1,"{'alpha': 1e-06, 'l1_ratio': 0, 'loss': 'log',...",0.767609,0.774988,0.760077,0.767558,0.006087,170
1,10.032924,1.849702,0.073470,0.005298,0.000001,0,log,l2,"{'alpha': 1e-06, 'l1_ratio': 0, 'loss': 'log',...",0.812571,0.807534,0.800725,0.806943,0.004854,48
2,29.004255,5.816668,0.074468,0.004179,0.000001,0,log,elasticnet,"{'alpha': 1e-06, 'l1_ratio': 0, 'loss': 'log',...",0.812571,0.807534,0.800725,0.806943,0.004854,48
3,25.532106,7.883942,0.071476,0.003082,0.000001,0,hinge,l1,"{'alpha': 1e-06, 'l1_ratio': 0, 'loss': 'hinge...",0.786167,0.770857,0.759927,0.772317,0.010762,161
4,9.849264,1.138697,0.065492,0.001695,0.000001,0,hinge,l2,"{'alpha': 1e-06, 'l1_ratio': 0, 'loss': 'hinge...",0.821683,0.801827,0.800114,0.807874,0.009789,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,23.809698,1.756732,0.046875,0.003257,1,1,log,l2,"{'alpha': 1, 'l1_ratio': 1, 'loss': 'log', 'pe...",0.802183,0.804904,0.793973,0.800353,0.004647,123
290,2.493667,0.322558,0.083776,0.007052,1,1,log,elasticnet,"{'alpha': 1, 'l1_ratio': 1, 'loss': 'log', 'pe...",0.500000,0.500000,0.500000,0.500000,0.000000,276
291,2.850380,0.439065,0.094082,0.011234,1,1,hinge,l1,"{'alpha': 1, 'l1_ratio': 1, 'loss': 'hinge', '...",0.500000,0.500000,0.500000,0.500000,0.000000,276
292,10.900860,0.324526,0.064495,0.008197,1,1,hinge,l2,"{'alpha': 1, 'l1_ratio': 1, 'loss': 'hinge', '...",0.809373,0.806578,0.793574,0.803175,0.006884,95


### Random Forest

In [12]:
%%time
param_grid = { 
    'n_estimators': [200, 500, 800],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4, 5, 6, 7, 8],
    'criterion' :['gini', 'entropy'],
}
clf = RandomForestClassifier(random_state=125)
gscv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
gscv.fit(X_crp, y_crp_typ)

Fitting 3 folds for each of 90 candidates, totalling 270 fits
Wall time: 16min 14s


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=125, shuffle=True),
             estimator=RandomForestClassifier(random_state=125), n_jobs=-2,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [4, 5, 6, 7, 8],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'n_estimators': [200, 500, 800]},
             scoring='roc_auc', verbose=2)

In [13]:
df = pd.DataFrame(gscv.cv_results_)
print(gscv.best_params_)
df

{'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 500}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,14.055762,0.078835,0.297206,0.023977,gini,4,auto,200,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.995422,0.994492,0.995584,0.995166,0.000481,61
1,34.876107,0.031118,0.606711,0.013190,gini,4,auto,500,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.996218,0.994765,0.995763,0.995582,0.000607,59
2,55.847713,0.249839,0.906245,0.013683,gini,4,auto,800,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.996775,0.995487,0.996010,0.996091,0.000529,52
3,14.079699,0.087926,0.288230,0.022845,gini,4,sqrt,200,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.995422,0.994492,0.995584,0.995166,0.000481,61
4,35.124772,0.196834,0.566486,0.014127,gini,4,sqrt,500,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.996218,0.994765,0.995763,0.995582,0.000607,59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,62.982305,0.100093,0.752988,0.045623,entropy,8,sqrt,500,"{'criterion': 'entropy', 'max_depth': 8, 'max_...",0.999584,0.999146,0.999342,0.999357,0.000179,7
86,80.055333,3.068040,0.515966,0.024556,entropy,8,sqrt,800,"{'criterion': 'entropy', 'max_depth': 8, 'max_...",0.999552,0.999103,0.999265,0.999307,0.000186,9
87,5.819111,0.052474,0.359373,0.007566,entropy,8,log2,200,"{'criterion': 'entropy', 'max_depth': 8, 'max_...",0.986421,0.991145,0.989563,0.989043,0.001963,70
88,14.263870,0.096657,0.754650,0.020621,entropy,8,log2,500,"{'criterion': 'entropy', 'max_depth': 8, 'max_...",0.993021,0.993701,0.994683,0.993802,0.000682,64


### CatBoost

In [14]:
%%time
param_grid = {
    'iterations': [200, 500, 800],
    'depth': [2, 3, 4, 6],
    'loss_function': ['Logloss', 'CrossEntropy'],
    'l2_leaf_reg': np.logspace(-20, -19, 3),
    'leaf_estimation_iterations': [10],
    'logging_level':['Silent'],
}
clf = CatBoostClassifier(random_state=125)
gscv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
gscv.fit(X_crp, y_crp_typ)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Wall time: 57min 39s


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=125, shuffle=True),
             estimator=<catboost.core.CatBoostClassifier object at 0x000001AE827EF188>,
             n_jobs=-2,
             param_grid={'depth': [2, 3, 4, 6], 'iterations': [200, 500, 800],
                         'l2_leaf_reg': array([1.00000000e-20, 3.16227766e-20, 1.00000000e-19]),
                         'leaf_estimation_iterations': [10],
                         'logging_level': ['Silent'],
                         'loss_function': ['Logloss', 'CrossEntropy']},
             scoring='roc_auc', verbose=2)

In [15]:
df = pd.DataFrame(gscv.cv_results_)
print(gscv.best_params_)
df

{'depth': 6, 'iterations': 800, 'l2_leaf_reg': 1e-20, 'leaf_estimation_iterations': 10, 'logging_level': 'Silent', 'loss_function': 'Logloss'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_iterations,param_l2_leaf_reg,param_leaf_estimation_iterations,param_logging_level,param_loss_function,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,44.684884,1.859334,4.917522,1.396057,2,200,0.0,10,Silent,Logloss,"{'depth': 2, 'iterations': 200, 'l2_leaf_reg':...",0.997771,0.997415,0.997660,0.997616,0.000149,67
1,43.419599,1.940180,5.866319,1.741196,2,200,0.0,10,Silent,CrossEntropy,"{'depth': 2, 'iterations': 200, 'l2_leaf_reg':...",0.997771,0.997415,0.997660,0.997616,0.000149,67
2,43.958493,1.913076,5.216389,1.183544,2,200,0.0,10,Silent,Logloss,"{'depth': 2, 'iterations': 200, 'l2_leaf_reg':...",0.997771,0.997415,0.997660,0.997616,0.000149,67
3,41.296608,4.224377,9.662504,5.271363,2,200,0.0,10,Silent,CrossEntropy,"{'depth': 2, 'iterations': 200, 'l2_leaf_reg':...",0.997771,0.997415,0.997660,0.997616,0.000149,67
4,42.303250,1.477618,15.520512,1.645369,2,200,0.0,10,Silent,Logloss,"{'depth': 2, 'iterations': 200, 'l2_leaf_reg':...",0.997771,0.997415,0.997660,0.997616,0.000149,67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,399.430281,2.668010,31.644410,3.377043,6,800,0.0,10,Silent,CrossEntropy,"{'depth': 6, 'iterations': 800, 'l2_leaf_reg':...",0.999984,0.999961,0.999981,0.999975,0.000010,1
68,404.644186,3.942375,26.263460,2.019886,6,800,0.0,10,Silent,Logloss,"{'depth': 6, 'iterations': 800, 'l2_leaf_reg':...",0.999980,0.999956,0.999981,0.999972,0.000012,5
69,375.893708,18.011602,27.984194,6.379581,6,800,0.0,10,Silent,CrossEntropy,"{'depth': 6, 'iterations': 800, 'l2_leaf_reg':...",0.999980,0.999956,0.999981,0.999972,0.000012,5
70,345.281538,1.756792,14.086013,1.742301,6,800,0.0,10,Silent,Logloss,"{'depth': 6, 'iterations': 800, 'l2_leaf_reg':...",0.999982,0.999959,0.999981,0.999974,0.000010,3


### XGBoost

In [16]:
%%time
param_grid = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5],
}
clf = XGBClassifier(eval_metric='auc', use_label_encoder=False, random_state=125)
gscv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=1)
gscv.fit(X_crp, y_crp_typ)

Fitting 3 folds for each of 405 candidates, totalling 1215 fits
Wall time: 3h 14min 35s


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=125, shuffle=True),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, eval_metric='auc',
                                     gamma=None, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child...
                                     num_parallel_tree=None, random_state=125,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                                     tree_method=None, use_label_encoder=False,
                

In [17]:
df = pd.DataFrame(gscv.cv_results_)
print(gscv.best_params_)
df

{'colsample_bytree': 1.0, 'gamma': 1, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 1.0}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_gamma,param_max_depth,param_min_child_weight,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,79.257797,0.286759,0.529252,0.073759,0.6,0.5,3,1,0.6,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d...",0.999922,0.999852,0.999883,0.999886,0.000029,210
1,79.656067,0.856927,0.600396,0.109225,0.6,0.5,3,1,0.8,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d...",0.999975,0.999899,0.999952,0.999942,0.000032,9
2,77.088596,0.626516,0.481048,0.008783,0.6,0.5,3,1,1.0,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d...",0.999960,0.999937,0.999943,0.999947,0.000009,4
3,76.260809,0.879234,0.385969,0.017970,0.6,0.5,3,5,0.6,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d...",0.999913,0.999859,0.999916,0.999896,0.000026,168
4,77.007148,0.355653,0.531248,0.062295,0.6,0.5,3,5,0.8,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d...",0.999912,0.999886,0.999928,0.999909,0.000018,102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
400,144.474133,0.790495,0.482045,0.040465,1.0,5,5,5,0.8,"{'colsample_bytree': 1.0, 'gamma': 5, 'max_dep...",0.999883,0.999823,0.999809,0.999838,0.000032,332
401,143.492756,3.601140,0.387963,0.052592,1.0,5,5,5,1.0,"{'colsample_bytree': 1.0, 'gamma': 5, 'max_dep...",0.999840,0.999819,0.999855,0.999838,0.000014,333
402,130.785391,1.173031,0.400264,0.030364,1.0,5,5,10,0.6,"{'colsample_bytree': 1.0, 'gamma': 5, 'max_dep...",0.999782,0.999731,0.999779,0.999764,0.000023,395
403,110.995293,14.778331,0.281248,0.038532,1.0,5,5,10,0.8,"{'colsample_bytree': 1.0, 'gamma': 5, 'max_dep...",0.999856,0.999697,0.999801,0.999785,0.000066,387


### SVM

In [9]:
%%time
param_grid = {
    'kernel': ['linear'],
    'C': [0.1, 1, 10, 100],
}
clf = svm.SVC(random_state=125)
gscv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
gscv.fit(X_crp, y_crp_typ)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
Wall time: 21min 34s


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=125, shuffle=True),
             estimator=SVC(random_state=125), n_jobs=-1,
             param_grid={'C': [0.1, 1, 10, 100], 'kernel': ['linear']},
             scoring='roc_auc', verbose=2)

In [10]:
df = pd.DataFrame(gscv.cv_results_)
print(gscv.best_params_)
df

{'C': 0.1, 'kernel': 'linear'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,799.908949,31.133793,119.266848,49.023903,0.1,linear,"{'C': 0.1, 'kernel': 'linear'}",0.774786,0.779762,0.765948,0.773498,0.005712,1
1,810.530555,6.348258,150.819172,4.927554,1.0,linear,"{'C': 1, 'kernel': 'linear'}",0.769639,0.777268,0.764201,0.770369,0.005359,2
2,852.513328,15.212272,145.274346,9.815078,10.0,linear,"{'C': 10, 'kernel': 'linear'}",0.769521,0.777597,0.760816,0.769311,0.006852,3
3,859.108698,36.125118,132.618844,14.4328,100.0,linear,"{'C': 100, 'kernel': 'linear'}",0.76749,0.775481,0.760565,0.767845,0.006095,4


In [11]:
%%time
param_grid = {
    'kernel': ['rbf'],
    'C': [0.1],
    'gamma': [0.001, 0.01, 1, 'scale', 'auto'],
}
clf = svm.SVC(random_state=125)
gscv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
gscv.fit(X_crp, y_crp_typ)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Wall time: 33min 15s


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=125, shuffle=True),
             estimator=SVC(random_state=125), n_jobs=-1,
             param_grid={'C': [0.1], 'gamma': [0.001, 0.01, 1, 'scale', 'auto'],
                         'kernel': ['rbf']},
             scoring='roc_auc', verbose=2)

In [12]:
df = pd.DataFrame(gscv.cv_results_)
print(gscv.best_params_)
df

{'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,1075.860807,5.771122,645.124352,3.390438,0.1,0.001,rbf,"{'C': 0.1, 'gamma': 0.001, 'kernel': 'rbf'}",0.734674,0.734917,0.733263,0.734285,0.000729,4
1,1100.912505,5.532121,643.225763,2.640686,0.1,0.01,rbf,"{'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}",0.81086,0.798831,0.794304,0.801332,0.006986,3
2,1478.927548,8.337664,382.606946,6.420461,0.1,1,rbf,"{'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}",0.60962,0.604491,0.60332,0.60581,0.002736,5
3,516.670035,3.717997,367.993535,4.312969,0.1,scale,rbf,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}",0.998123,0.997304,0.997647,0.997691,0.000336,1
4,666.183393,2.100989,276.848942,6.805909,0.1,auto,rbf,"{'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}",0.992815,0.991916,0.992596,0.992443,0.000383,2


### Best Estimators

sgd


{'alpha': 0.1, 'l1_ratio': 0.05, 'loss': 'hinge', 'penalty': 'elasticnet'}


rf


{'criterion': 'gini', 'max_depth': 8, 'max_features': 'auto', 'n_estimators': 500}


catboost


{'depth': 6, 'iterations': 800, 'l2_leaf_reg': 1e-20, 'leaf_estimation_iterations': 10, 'logging_level': 'Silent', 'loss_function': 'Logloss'}


xgboost


{'colsample_bytree': 1.0, 'gamma': 1, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 1.0}


svm


{'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}