In [35]:
import numpy as np
import os
import pandas as pd
from skimage.io import imread, imsave
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, mean_absolute_error
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [2]:
# Per-class source folders of the training images.
nr_trdat_path = './data/train/NR/'
er_trdat_path = './data/train/ER/'

# Folders that the loading cells below read from
# (presumably the cropped and the class-balanced image sets -- confirm).
balanced_path = './data/train/all_balanced/'
cropdat_path = './data/train/all_cropped/'

### Form the cropped (`crp_`) and balanced (`bal_`) datasets

In [3]:
# Maps the class token taken from an image filename to its integer label.
conv_type = dict(ER=1, NR=0)

In [4]:
%%time
X_crp = []
y_crp_typ = []
y_crp_enr = []
for filename in os.listdir(cropdat_path):
    img = imread(cropdat_path + filename)    
    X_crp.append(img.flatten())
    fn_parts = filename.split('-')
    y_crp_typ.append(conv_type[fn_parts[1]]) 
    y_crp_enr.append(int(fn_parts[2].split('.')[0]  ))
print(len(X_crp))
print(len(y_crp_typ))
print(len(y_crp_enr))

13404
13404
13404
Wall time: 4.07 s


In [5]:
# Print a {value: count} table for each target of the cropped set.
for labels in (y_crp_typ, y_crp_enr):
    unique, counts = np.unique(labels, return_counts=True)
    print(dict(zip(unique, counts)))

{0: 6646, 1: 6758}
{1: 2180, 3: 2245, 6: 2256, 10: 2274, 20: 2210, 30: 2239}


In [6]:
%%time
X_bal = []
y_bal_typ = []
y_bal_enr = []
for filename in os.listdir(balanced_path):
    img = imread(balanced_path + filename)    
    X_bal.append(img.flatten())
    fn_parts = filename.split('-')
    y_bal_typ.append(conv_type[fn_parts[1]]) 
    y_bal_enr.append(int(fn_parts[2].split('.')[0]  ))
print(len(X_bal))
print(len(y_bal_typ))
print(len(y_bal_enr))

26624
26624
26624
Wall time: 8.2 s


In [7]:
# Print a {value: count} table for each target of the balanced set.
for labels in (y_bal_typ, y_bal_enr):
    unique, counts = np.unique(labels, return_counts=True)
    print(dict(zip(unique, counts)))

{0: 13246, 1: 13378}
{1: 4400, 3: 4445, 6: 4456, 10: 4474, 20: 4410, 30: 4439}


# 1. Classification

In [None]:
# Turn the accumulated Python lists into NumPy arrays before fitting the models.
X_crp, y_crp_typ = np.array(X_crp), np.array(y_crp_typ)

In [8]:
# Shared 3-fold stratified splitter, passed as `cv=` to every grid search below.
# Shuffling is seeded so the fold assignment is repeatable for a given row order.
cv = StratifiedKFold(
    n_splits=3,
    shuffle=True,
    random_state=125,
)

### SVM

In [9]:
%%time
param_grid = {
    'kernel': ['linear', 'rbf', 'sigmoid', 'poly'],
    'C': [0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 1, 'scale'],
}
clf = svm.SVC(random_state=125)
gscv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
gscv.fit(X_crp, y_crp_typ)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Wall time: 19min 54s


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=125, shuffle=True),
             estimator=SVC(random_state=125), n_jobs=-1,
             param_grid={'C': [100], 'gamma': ['scale'], 'kernel': ['poly']},
             scoring='roc_auc', verbose=2)

In [15]:
# Report the best SVM hyper-parameters, then display the per-candidate CV table.
print(gscv.best_params_)
df = pd.DataFrame(gscv.cv_results_)
df

{'C': 100, 'gamma': 'scale', 'kernel': 'poly'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,530.774464,90.160325,8.478336,0.047251,100,scale,poly,"{'C': 100, 'gamma': 'scale', 'kernel': 'poly'}",0.967655,0.965624,0.968093,0.967124,0.001076,1


### SGD

In [16]:
%%time
param_grid = {
    'loss': ['log', 'hinge'],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'alpha': [10 ** x for x in range(-6, 1)],
    'l1_ratio': [0, 0.05, 0.15, 0.25, 0.5, 0.8, 1],
}
clf = SGDClassifier(random_state=125)
gscv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-2, verbose=2)
gscv.fit(X_crp, y_crp_typ)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Wall time: 47.9 s


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=125, shuffle=True),
             estimator=SGDClassifier(random_state=125), n_jobs=-2,
             param_grid={'alpha': [0.01], 'l1_ratio': [0.15], 'loss': ['log'],
                         'penalty': ['elasticnet']},
             scoring='roc_auc', verbose=2)

In [17]:
# Report the best SGD hyper-parameters, then display the per-candidate CV table.
print(gscv.best_params_)
df = pd.DataFrame(gscv.cv_results_)
df

{'alpha': 0.01, 'l1_ratio': 0.15, 'loss': 'log', 'penalty': 'elasticnet'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_l1_ratio,param_loss,param_penalty,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,21.472765,3.372449,0.050854,0.000814,0.01,0.15,log,elasticnet,"{'alpha': 0.01, 'l1_ratio': 0.15, 'loss': 'log...",0.81959,0.813956,0.79981,0.811119,0.008321,1


### Random Forest

In [18]:
%%time
param_grid = { 
    'n_estimators': [200, 500, 800],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4, 5, 6, 7, 8],
    'criterion' :['gini', 'entropy'],
}
clf = RandomForestClassifier(random_state=125)
gscv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-2, verbose=2)
gscv.fit(X_crp, y_crp_typ)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Wall time: 19.3 s


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=125, shuffle=True),
             estimator=RandomForestClassifier(random_state=125), n_jobs=-2,
             param_grid={'criterion': ['gini'], 'max_depth': [4],
                         'max_features': ['auto'], 'n_estimators': [200]},
             scoring='roc_auc', verbose=2)

In [19]:
# Report the best random-forest hyper-parameters, then display the per-candidate CV table.
print(gscv.best_params_)
df = pd.DataFrame(gscv.cv_results_)
df

{'criterion': 'gini', 'max_depth': 4, 'max_features': 'auto', 'n_estimators': 200}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_max_features,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,7.019129,0.013315,0.129964,0.001699,gini,4,auto,200,"{'criterion': 'gini', 'max_depth': 4, 'max_fea...",0.995422,0.994492,0.995584,0.995166,0.000481,1


### CatBoost

In [29]:
%%time
param_grid = {
    'iterations': [200, 500, 800],
    'depth': [2, 3, 4, 6],
    'loss_function': ['Logloss', 'CrossEntropy'],
    'l2_leaf_reg': np.logspace(-20, -19, 3),
    'leaf_estimation_iterations': [10],
    'logging_level':['Silent'],
}
clf = CatBoostClassifier(random_state=125)
gscv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-2, verbose=2)
gscv.fit(X_crp, y_crp_typ)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
Wall time: 1min 9s


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=125, shuffle=True),
             estimator=<catboost.core.CatBoostClassifier object at 0x0000020D97064C88>,
             n_jobs=-2,
             param_grid={'depth': [3], 'iterations': [200],
                         'l2_leaf_reg': array([1.00000000e-20, 3.16227766e-20, 1.00000000e-19]),
                         'leaf_estimation_iterations': [10],
                         'logging_level': ['Silent'],
                         'loss_function': ['Logloss']},
             scoring='roc_auc', verbose=2)

In [30]:
# Report the best CatBoost hyper-parameters, then display the per-candidate CV table.
print(gscv.best_params_)
df = pd.DataFrame(gscv.cv_results_)
df

{'depth': 3, 'iterations': 200, 'l2_leaf_reg': 1e-20, 'leaf_estimation_iterations': 10, 'logging_level': 'Silent', 'loss_function': 'Logloss'}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_depth,param_iterations,param_l2_leaf_reg,param_leaf_estimation_iterations,param_logging_level,param_loss_function,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,23.046476,6.216593,17.028506,2.306531,3,200,0.0,10,Silent,Logloss,"{'depth': 3, 'iterations': 200, 'l2_leaf_reg':...",0.999455,0.999572,0.999661,0.999563,8.4e-05,1
1,39.198698,3.601635,10.697126,2.878223,3,200,0.0,10,Silent,Logloss,"{'depth': 3, 'iterations': 200, 'l2_leaf_reg':...",0.999455,0.999572,0.999661,0.999563,8.4e-05,1
2,46.101943,0.805867,1.969079,0.083651,3,200,0.0,10,Silent,Logloss,"{'depth': 3, 'iterations': 200, 'l2_leaf_reg':...",0.999455,0.999572,0.999661,0.999563,8.4e-05,1


### XGBoost

In [46]:
%%time
param_grid = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5],
}
clf = XGBClassifier(eval_metric='auc', use_label_encoder=False, random_state=125)
gscv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-2, verbose=1)
gscv.fit(X_crp, y_crp_typ);

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Wall time: 33.7 s


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=125, shuffle=True),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, eval_metric='auc',
                                     gamma=None, gpu_id=None,
                                     importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child...
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=125,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
                            

In [47]:
# Report the best XGBoost hyper-parameters, then display the per-candidate CV table.
print(gscv.best_params_)
df = pd.DataFrame(gscv.cv_results_)
df

{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_depth': 3, 'min_child_weight': 1, 'subsample': 1.0}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_gamma,param_max_depth,param_min_child_weight,param_subsample,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,19.834127,0.840815,0.236368,0.021126,0.6,0.5,3,1,1.0,"{'colsample_bytree': 0.6, 'gamma': 0.5, 'max_d...",0.99996,0.999937,0.999943,0.999947,9e-06,1


### SVM