In [None]:
import numpy as np
import os
import pandas as pd
from skimage.io import imread, imsave
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import roc_auc_score, mean_absolute_error
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Locations of the training images, relative to the notebook's working directory.
# Each path keeps its trailing slash.

# Per-class source folders (not read anywhere in the cells shown here).
er_trdat_path = './data/train/ER/'
nr_trdat_path = './data/train/NR/'

# Folders read by the dataset-loading cells below.
cropdat_path  = './data/train/all_cropped/'
balanced_path = './data/train/all_balanced/'

### Form the cropped (`crp_`) and balanced (`bal_`) datasets

In [None]:
# Maps the class tag found in an image file name to its binary target label.
conv_type = dict(ER=1, NR=0)

In [None]:
%%time
X_crp = []
y_crp_typ = []
y_crp_enr = []
for filename in os.listdir(cropdat_path):
    img = imread(cropdat_path + filename)    
    X_crp.append(img.flatten())
    fn_parts = filename.split('-')
    y_crp_typ.append(conv_type[fn_parts[1]]) 
    y_crp_enr.append(int(fn_parts[2].split('.')[0]  ))
print(len(X_crp))
print(len(y_crp_typ))
print(len(y_crp_enr))

In [None]:
# Print the class distribution of each label list of the cropped dataset as a
# {value: count} dict, first for the type labels and then for the second label.
for labels in (y_crp_typ, y_crp_enr):
    unique, counts = np.unique(labels, return_counts=True)
    print(dict(zip(unique, counts)))

In [None]:
%%time
X_bal = []
y_bal_typ = []
y_bal_enr = []
for filename in os.listdir(balanced_path):
    img = imread(balanced_path + filename)    
    X_bal.append(img.flatten())
    fn_parts = filename.split('-')
    y_bal_typ.append(conv_type[fn_parts[1]]) 
    y_bal_enr.append(int(fn_parts[2].split('.')[0]  ))
print(len(X_bal))
print(len(y_bal_typ))
print(len(y_bal_enr))

In [None]:
# Print the class distribution of each label list of the balanced dataset as a
# {value: count} dict, first for the type labels and then for the second label.
for labels in (y_bal_typ, y_bal_enr):
    unique, counts = np.unique(labels, return_counts=True)
    print(dict(zip(unique, counts)))

# 1. Classification

In [None]:
# Shared cross-validation splitter for every grid search below: 5 stratified
# folds, shuffled with a fixed seed so the split is repeatable for a given
# sample order.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=125,
)

### SVM

In [None]:
%%time
param_grid = {
    'kernel': ('linear', 'rbf', 'sigmoid', 'poly'),
    'C': [1, 10, 100, 1000],
    'gamma': [1e-3, 1e-4, 'scale']
}
clf = svm.SVC(random_state=125)
gscv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-2, verbose=2)
gscv.fit(X_crp, y_crp_typ)

### Random Forest

In [None]:
%%time
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
clf = RandomForestClassifier(random_state=125)
gscv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1)
gscv.fit(X_crp, y_crp_typ)

### SGD

In [None]:
%%time
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
clf = RandomForestClassifier(random_state=125)
gscv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1)
gscv.fit(X_crp, y_crp_typ)

### CatBoost


In [None]:
%%time
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
clf = RandomForestClassifier(random_state=125)
gscv = GridSearchCV(clf, param_grid, scoring='roc_auc', cv=cv, n_jobs=-1)
gscv.fit(X_crp, y_crp_typ)

### XGBoost