In [1]:
import numpy as np
import os
import pandas as pd
from skimage.io import imread, imsave
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score, mean_absolute_error
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [2]:
er_trdat_path = './data/train/ER/'
nr_trdat_path = './data/train/NR/'
cropdat_path = './data/train/all_cropped/'

### form crp_ dataset

In [3]:
%%time
conv_type = {'ER': 1, 'NR': 0}
conv_nrj = {'1': 0, '3': 1, '6': 2, '10': 3, '20': 4, '30': 5}
X_crp = []
y_crp_typ = []
y_crp_nrj = []
for filename in os.listdir(cropdat_path):
    img = imread(cropdat_path + filename)    
    X_crp.append(img.flatten())
    fn_parts = filename.split('-')
    y_crp_typ.append(conv_type[fn_parts[1]]) 
    y_crp_nrj.append(conv_nrj[fn_parts[2].split('.')[0]])
print(len(X_crp))
print(len(y_crp_typ))
print(len(y_crp_nrj))

13404
13404
13404
Wall time: 4.03 s


In [4]:
unique, counts = np.unique(y_crp_typ, return_counts=True)
print(dict(zip(unique, counts)))
unique, counts = np.unique(y_crp_nrj, return_counts=True)
print(dict(zip(unique, counts)))

{0: 6646, 1: 6758}
{0: 2180, 1: 2245, 2: 2256, 3: 2274, 4: 2210, 5: 2239}


# 1. Binary Classification

In [5]:
X_crp = np.array(X_crp)
y_crp_typ = np.array(y_crp_typ)
y_crp_nrj = np.array(y_crp_nrj)
cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=125)

### SGD

In [6]:
%%time
clf = SGDClassifier(
    alpha=0.1, 
    l1_ratio=0.05, 
    loss='hinge', 
    penalty='elasticnet', 
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:   46.7s remaining:   31.1s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.1min finished


0.7011147379534307
Wall time: 1min 5s


### Random Forest

In [7]:
%%time
clf = RandomForestClassifier(
    criterion='gini', 
    max_depth=8, 
    max_features='auto', 
    n_estimators=800, 
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.5min remaining:   59.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.5min finished


0.9534102843532285
Wall time: 1min 29s


### CatBoost

In [8]:
%%time
clf = CatBoostClassifier(
    depth=6,
    iterations=20000,
    early_stopping_rounds=500,
    l2_leaf_reg=1e-20,
    leaf_estimation_iterations=10,
    logging_level='Silent',
    loss_function='Logloss',
    eval_metric='AUC',
    boosting_type='Plain',
    task_type='GPU',
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time= 9.4min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  9.4min remaining:    0.0s


[CV] END .................................................... total time= 9.5min
[CV] END .................................................... total time= 9.4min
[CV] END .................................................... total time= 9.7min
[CV] END .................................................... total time= 9.5min
0.9956154053876766
Wall time: 47min 33s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 47.6min finished


### XGBoost

In [9]:
%%time
clf = XGBClassifier(
    n_estimators=500,
    colsample_bytree=1.0,
    gamma=1,
    max_depth=3,
    min_child_weight=1,
    subsample=1.0,
    eval_metric='auc', 
    use_label_encoder=False,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time=  17.9s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   17.9s remaining:    0.0s


[CV] END .................................................... total time=  17.7s
[CV] END .................................................... total time=  17.1s
[CV] END .................................................... total time=  16.7s
[CV] END .................................................... total time=  17.5s
0.9939376484170884
Wall time: 1min 27s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  1.5min finished


### SVM

In [10]:
%%time
clf = svm.SVC(
    C=0.1,
    gamma='scale',
    kernel='rbf',
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_typ, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed: 10.8min remaining:  7.2min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 10.8min finished


0.9170779231945364
Wall time: 10min 48s


# 2. Six-classes Classification

### SGD

In [11]:
y_crp_nrj

array([5, 1, 1, ..., 4, 4, 0])

In [12]:
%%time
clf = SGDClassifier(
    alpha=0.1, 
    l1_ratio=0.05, 
    loss='hinge', 
    penalty='elasticnet', 
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_nrj, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  2.9min remaining:  1.9min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  3.4min finished


nan
Wall time: 3min 26s


### RandomForestClassifier

In [13]:
%%time
clf = RandomForestClassifier(
    criterion='gini', 
    max_depth=8, 
    max_features='auto', 
    n_estimators=800, 
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_nrj, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   5 | elapsed:  1.6min remaining:  1.1min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.6min finished


nan
Wall time: 1min 35s


### CatBoostClassifier

In [None]:
%%time
clf = CatBoostClassifier(
    depth=6,
    iterations=20000,
    early_stopping_rounds=500,
    l2_leaf_reg=1e-20,
    leaf_estimation_iterations=10,
    logging_level='Silent',
    loss_function='MultiClass',
    eval_metric='AUC',
    boosting_type='Plain',
    task_type='GPU',
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_nrj, scoring='roc_auc', cv=cv, verbose=2)
print(np.mean(cvs))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
AUC is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


### XGBClassifier

In [None]:
%%time
clf = XGBClassifier(
    objective='multi:softmax',
    num_classes=6,
    n_estimators=500,
    colsample_bytree=1.0,
    gamma=1,
    max_depth=3,
    min_child_weight=1,
    subsample=1.0,
    eval_metric='auc', 
    use_label_encoder=False,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_nrj, scoring='roc_auc', cv=cv, verbose=2)
print(np.mean(cvs))

### SVM

In [None]:
%%time
clf = svm.SVC(
    C=0.1,
    gamma='scale',
    kernel='rbf',
    random_state=125)
cvs = cross_val_score(clf, X_crp, y_crp_nrj, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=2)
print(np.mean(cvs))

# Results

### 2-class

- sgd - 0.7011
- rf - 0.9534
- cat - 0.9956
- xgb - 0.9939
- svm - 0.9170

### 6-class

- sgd - 
-  rf - 
- cat - 
- xgb - 
- svm - 