In [1]:
import numpy as np
import os
import pandas as pd
from skimage.io import imread, imsave
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, mean_absolute_error
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [2]:
er_trdat_path = './data/train/ER/'
nr_trdat_path = './data/train/NR/'
cropdat_path = './data/train/all_cropped/'

### form crp_ dataset

In [3]:
conv_type = {'ER': 1, 'NR': 0}

In [4]:
%%time
X_crp = []
y_crp_typ = []
y_crp_nrj = []
for filename in os.listdir(cropdat_path):
    img = imread(cropdat_path + filename)    
    X_crp.append(img.flatten())
    fn_parts = filename.split('-')
    y_crp_typ.append(conv_type[fn_parts[1]]) 
    y_crp_nrj.append(int(fn_parts[2].split('.')[0]  ))
print(len(X_crp))
print(len(y_crp_typ))
print(len(y_crp_nrj))

13404
13404
13404
Wall time: 4.09 s


In [5]:
unique, counts = np.unique(y_crp_typ, return_counts=True)
print(dict(zip(unique, counts)))
unique, counts = np.unique(y_crp_nrj, return_counts=True)
print(dict(zip(unique, counts)))

{0: 6646, 1: 6758}
{1: 2180, 3: 2245, 6: 2256, 10: 2274, 20: 2210, 30: 2239}


# 1. Binary Classification

In [6]:
X_crp = np.array(X_crp)
y_crp_typ = np.array(y_crp_typ)
y_crp_nrj = np.array(y_crp_nrj)

In [7]:
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=125)

### SGD

In [8]:
%%time
clf = SGDClassifier(
    alpha=0.1, 
    l1_ratio=0.05, 
    loss='hinge', 
    penalty='elasticnet', 
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
print(np.mean(cvs))

0.7011147379534307
Wall time: 1min 4s


### Random Forest

In [9]:
%%time
clf = RandomForestClassifier(
    criterion='gini', 
    max_depth=8, 
    max_features='auto', 
    n_estimators=800, 
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
print(np.mean(cvs))

0.9534102843532285
Wall time: 1min 29s


### CatBoost

In [10]:
%%time
clf = CatBoostClassifier(
    depth=6,
    iterations=800,
    l2_leaf_reg=1e-20,
    leaf_estimation_iterations=10,
    logging_level='Silent',
    loss_function='Logloss',
    eval_metric='AUC',
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
print(np.mean(cvs))

0.9923262710217695
Wall time: 4min 32s


In [12]:
%%time
clf = CatBoostClassifier(
    depth=6,
    iterations=20000,
    early_stopping_rounds=500,
    l2_leaf_reg=1e-20,
    leaf_estimation_iterations=10,
    logging_level='Silent',
    loss_function='Logloss',
    eval_metric='AUC',
    boosting_type='Plain',
    task_type='GPU',
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time= 7.7min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  7.7min remaining:    0.0s


[CV] END .................................................... total time= 7.7min
[CV] END .................................................... total time= 7.7min
[CV] END .................................................... total time= 7.8min
[CV] END .................................................... total time= 7.8min
0.9956034864522824
Wall time: 38min 37s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 38.6min finished


### XGBoost

In [9]:
%%time
clf = XGBClassifier(
    colsample_bytree=1.0,
    gamma=1,
    max_depth=3,
    min_child_weight=1,
    subsample=1.0,
    eval_metric='auc', 
    use_label_encoder=False,
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   56.8s remaining:   37.8s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   57.1s finished


0.9876824025570545
Wall time: 57.4 s


In [19]:
%%time
clf = XGBClassifier(
    n_estimators=400,
    colsample_bytree=1.0,
    gamma=1,
    max_depth=3,
    min_child_weight=1,
    subsample=1.0,
    eval_metric='auc', 
    use_label_encoder=False,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  11.6s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   11.6s remaining:    0.0s


[CV] END .................................................... total time=  11.4s
[CV] END .................................................... total time=  11.7s
[CV] END .................................................... total time=  11.2s
[CV] END .................................................... total time=  11.6s
0.9939376484170884
Wall time: 58.2 s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   58.1s finished


### SVM

In [20]:
%%time
clf = svm.SVC(
    C=0.1,
    gamma='scale',
    kernel='rbf',
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 10.8min remaining:  7.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 10.8min finished


0.9170779231945364
Wall time: 10min 48s


# 2. Six-classes Classification

In [21]:
y_crp_nrj

array([30,  3,  3, ..., 20, 20,  1])

### SGD

In [22]:
%%time
clf = SGDClassifier(
    alpha=0.1, 
    l1_ratio=0.05, 
    loss='hinge', 
    penalty='elasticnet', 
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_nrj, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  2.7min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.1min finished


nan
Wall time: 3min 3s


### RandomForestClassifier

In [23]:
%%time
clf = RandomForestClassifier(
    criterion='gini', 
    max_depth=8, 
    max_features='auto', 
    n_estimators=800, 
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.5min remaining:   58.7s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min finished


0.9534102843532285
Wall time: 1min 28s


### CatBoostClassifier

In [24]:
%%time
clf = CatBoostClassifier(
    depth=6,
    iterations=20000,
    early_stopping_rounds=500,
    l2_leaf_reg=1e-20,
    leaf_estimation_iterations=10,
    logging_level='Silent',
    loss_function='Logloss',
    eval_metric='AUC',
    boosting_type='Plain',
    task_type='GPU',
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time= 8.4min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  8.4min remaining:    0.0s


[CV] END .................................................... total time= 9.7min




[CV] END .................................................... total time= 9.3min




[CV] END .................................................... total time= 9.6min




[CV] END .................................................... total time= 9.9min
0.995638543976483
Wall time: 46min 57s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 47.0min finished


### XGBClassifier

In [25]:
%%time
clf = XGBClassifier(
    n_estimators=400,
    colsample_bytree=1.0,
    gamma=1,
    max_depth=3,
    min_child_weight=1,
    subsample=1.0,
    eval_metric='auc', 
    use_label_encoder=False,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  15.2s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   15.2s remaining:    0.0s


[CV] END .................................................... total time=  15.1s
[CV] END .................................................... total time=  15.3s
[CV] END .................................................... total time=  14.9s
[CV] END .................................................... total time=  14.9s
0.9939376484170884
Wall time: 1min 15s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.3min finished


### SVM

In [26]:
%%time
clf = svm.SVC(
    C=0.1,
    gamma='scale',
    kernel='rbf',
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 10.7min remaining:  7.1min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 10.7min finished


0.9170779231945364
Wall time: 10min 43s
