In [1]:
import numpy as np
import os
import pandas as pd
from skimage.io import imread
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [2]:
# Directory of cropped training images; class labels are parsed from the file names.
cropdat_path = './data/train/all_cropped/'
# Directory of augmented training images (not read by any cell visible in this notebook).
augmented_path = './data/train/all_augmented/'
# Directory of cropped test images; each file stem becomes a submission id.
cropped_testdat_path = './data/test_cropped/'
# Output directory for the submission CSV files.
submissions_path = './submissions/'

### Load Data

In [5]:
%%time
conv_type = {'ER': 1, 'NR': 0}
conv_nrj = {'1': 0, '3': 1, '6': 2, '10': 3, '20': 4, '30': 5}

X_crp = []
y_crp_typ = []
y_crp_nrj = []
for filename in os.listdir(cropdat_path):
    img = imread(cropdat_path + filename)    
    X_crp.append(img.flatten())
    fn_parts = filename.split('-')
    y_crp_typ.append(conv_type[fn_parts[1]]) 
    y_crp_nrj.append(conv_nrj[fn_parts[2].split('.')[0]])
print(len(X_crp))
print(len(y_crp_typ))
print(len(y_crp_nrj))

X_test_crp = []
X_test_labels = []
for filename in os.listdir(cropped_testdat_path):
    img = imread(cropped_testdat_path + filename)    
    X_test_crp.append(img.flatten())
    X_test_labels.append(filename.split('.')[0])
print(len(X_test_crp))
print(len(X_test_labels))

X_crp = np.array(X_crp)
y_crp_typ = np.array(y_crp_typ)
y_crp_nrj = np.array(y_crp_nrj)
X_test_crp = np.array(X_test_crp)

13404
13404
13404
16564
16564
Wall time: 9.14 s


# 1. Binary Classification

### SGD

In [None]:
%%time
clf2_sgd = SGDClassifier(
    alpha=0.1, 
    l1_ratio=0.05, 
    loss='hinge', 
    penalty='elasticnet',
    n_jobs=-1,
    random_state=125)
clf2_sgd.fit(X_crp, y_crp_typ)
y2_sgd = clf2_sgd.predict(X_test_crp)

### Random Forest

In [None]:
%%time
clf2_rf = RandomForestClassifier(
    criterion='gini', 
    max_depth=8, 
    max_features='auto', 
    n_estimators=800,
    n_jobs=-1,
    random_state=125)
clf2_rf.fit(X_crp, y_crp_typ)
y2_rf = clf2_rf.predict(X_test_crp)

### CatBoost

In [None]:
%%time
clf2_cb = CatBoostClassifier(
    depth=6,
    iterations=20000,
    early_stopping_rounds=500,
    l2_leaf_reg=1e-20,
    leaf_estimation_iterations=10,
    logging_level='Silent',
    loss_function='Logloss',
    eval_metric='AUC',
    boosting_type='Plain',
    task_type='GPU',
    random_state=125)
clf2_cb.fit(X_crp, y_crp_typ)
y2_cb = clf2_cb.predict(X_test_crp)

### XGBoost

In [6]:
%%time
clf2_xgb = XGBClassifier(
    n_estimators=800,
    colsample_bytree=1.0,
    gamma=1,
    max_depth=3,
    min_child_weight=1,
    subsample=1.0,
    eval_metric='auc', 
    use_label_encoder=False,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    random_state=125)
clf2_xgb.fit(X_crp, y_crp_typ)


Wall time: 27.2 s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, eval_metric='auc',
              gamma=1, gpu_id=0, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=800, n_jobs=12,
              num_parallel_tree=1, predictor='gpu_predictor', random_state=125,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1.0,
              tree_method='gpu_hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [7]:
# XGBoost type predictions for the test images (conv_type encoding: ER=1, NR=0).
y2_xgb = clf2_xgb.predict(X_test_crp)

### SVM

In [None]:
%%time
clf2_svc = svm.SVC(
    C=0.1,
    gamma='scale',
    kernel='rbf',
    random_state=125)
clf2_svc.fit(X_crp, y_crp_typ)
y2_svc = clf2_svc.predict(X_test_crp)

# 2. Six-class Classification

### SGD

In [None]:
%%time
clf6_sgd = SGDClassifier(
    alpha=0.1, 
    l1_ratio=0.05, 
    loss='log', 
    penalty='elasticnet',
    n_jobs=-1,
    random_state=125)
clf6_sgd.fit(X_crp, y_crp_nrj)
y6_sgd = clf6_sgd.predict(X_test_crp)

### RandomForestClassifier

In [None]:
%%time
clf6_rf = RandomForestClassifier(
    criterion='gini', 
    max_depth=8, 
    max_features='auto', 
    n_estimators=800,
    n_jobs=-1,
    random_state=125)
clf6_rf.fit(X_crp, y_crp_nrj)
y6_rf = clf6_rf.predict(X_test_crp)

### CatBoostClassifier

In [None]:
%%time
clf6_cb = CatBoostClassifier(
    depth=6,
    iterations=20000,
    early_stopping_rounds=500,
    l2_leaf_reg=1e-20,
    leaf_estimation_iterations=10,
    logging_level='Silent',
    loss_function='MultiClass',
    eval_metric='MultiClass',
    boosting_type='Plain',
    task_type='GPU',
    random_state=125)
clf6_cb.fit(X_crp, y_crp_nrj)

In [None]:
# CatBoost energy-class predictions for the test images.
# Presumably one row per sample with a single column; a later cell reduces it
# to a flat vector (y6_cat) before use — verify the shape if CatBoost changes.
y6_cb = clf6_cb.predict(X_test_crp)

### XGBoost

In [8]:
%%time
clf6_xgb = XGBClassifier(
    objective='multi:softmax',
    num_classes=6,
    n_estimators=500,
    colsample_bytree=1.0,
    gamma=1,
    max_depth=3,
    min_child_weight=1,
    subsample=1.0,
    eval_metric='mlogloss', 
    use_label_encoder=False,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    n_jobs=-1,
    random_state=125)
clf6_xgb.fit(X_crp, y_crp_nrj)

Parameters: { num_classes } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Wall time: 1min 22s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, eval_metric='mlogloss',
              gamma=1, gpu_id=0, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=500, n_jobs=-1,
              num_classes=6, num_parallel_tree=1, objective='multi:softprob',
              predictor='gpu_predictor', random_state=125, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1.0,
              tree_method='gpu_hist', use_label_encoder=False,
              validate_parameters=1, ...)

In [9]:
# XGBoost energy-class predictions for the test images (conv_nrj class indices).
y6_xgb = clf6_xgb.predict(X_test_crp)

### SVM

In [None]:
%%time
clf6_svc = OneVsRestClassifier(svm.SVC(
        C=0.1,
        gamma='scale',
        kernel='rbf',
        probability=True,
        random_state=125),
    n_jobs=-1)
clf6_svc.fit(X_crp, y_crp_nrj)
y6_svc = clf6_svc.predict(X_test_crp)

# 3. Form Submission

In [None]:
# Five independent frames, each starting with just the test ids:
# two hold per-model votes (binary / six-class), three become submissions.
sub_bin, sub_six, sub_cat, sub_xgb, sub_mode = (
    pd.DataFrame({'id': X_test_labels}) for _ in range(5)
)

In [None]:
# Flatten the CatBoost multiclass predictions to one label per sample by
# taking the first column. Reshaping first means an already 1-D prediction
# vector is accepted too, and avoids stacking one tiny array per row.
y6_cat = np.asarray(y6_cb).reshape(len(y6_cb), -1)[:, 0]

In [None]:
# Class index -> energy value written to the submission (inverse of conv_nrj).
conv_six = {0: 1, 1: 3, 2: 6, 3: 10, 4: 20, 5: 30}

# One column of binary (type) predictions per model.
sub_bin['y2_sgd'] = y2_sgd
sub_bin['y2_rf'] = y2_rf
sub_bin['y2_cb'] = y2_cb
sub_bin['y2_xgb'] = y2_xgb
sub_bin['y2_svc'] = y2_svc
# Keep only the vote columns (a row-wise mode is taken over them later).
# The axis is named explicitly because the positional form drop('id', 1) is
# rejected by pandas >= 2.0; errors='ignore' lets the cell be re-run after
# 'id' has already been removed.
sub_bin = sub_bin.drop(columns='id', errors='ignore')

# One column of energy predictions per model, mapped back to energy values.
sub_six['y6_sgd'] = [conv_six[label] for label in y6_sgd]
sub_six['y6_rf'] = [conv_six[label] for label in y6_rf]
sub_six['y6_cb'] = [conv_six[label] for label in y6_cat]
sub_six['y6_xgb'] = [conv_six[label] for label in y6_xgb]
sub_six['y6_svc'] = [conv_six[label] for label in y6_svc]
sub_six = sub_six.drop(columns='id', errors='ignore')

sub_bin.head()

In [None]:
# Preview the per-model energy predictions.
sub_six.head()

In [None]:
# Single-model submissions: copy the matching type / energy vote columns.
for frame, bin_col, six_col in (
        (sub_cat, 'y2_cb', 'y6_cb'),
        (sub_xgb, 'y2_xgb', 'y6_xgb')):
    frame['classification_predictions'] = sub_bin[bin_col]
    frame['regression_predictions'] = sub_six[six_col]

# Ensemble submission: row-wise mode across the model columns; column 0 of
# the mode frame holds the first modal value of each row.
bin_votes = sub_bin.mode(axis=1)
six_votes = sub_six.mode(axis=1)
sub_mode['classification_predictions'] = bin_votes[0].astype('int32')
sub_mode['regression_predictions'] = six_votes[0].astype('int32')
sub_mode.head()

In [None]:
# Preview the CatBoost-only submission.
sub_cat.head()

In [None]:
# Preview the XGBoost-only submission.
sub_xgb.head()

In [None]:
# Write the three submissions. The output directory is created first so a
# fresh checkout does not fail here, after all the models have been trained.
os.makedirs(submissions_path, exist_ok=True)
sub_mode.to_csv(os.path.join(submissions_path, 'basic_modes.csv'), index=False)
sub_cat.to_csv(os.path.join(submissions_path, 'basic_cat.csv'), index=False)
sub_xgb.to_csv(os.path.join(submissions_path, 'basic_xgb.csv'), index=False)

In [None]:
# Random-forest-only submission: test ids plus the RF type and energy columns.
sub_rf = pd.DataFrame({'id': X_test_labels}).assign(
    classification_predictions=sub_bin['y2_rf'],
    regression_predictions=sub_six['y6_rf'],
)
sub_rf.head()

In [None]:
# Write the random-forest submission; create the output directory if missing.
os.makedirs(submissions_path, exist_ok=True)
sub_rf.to_csv(os.path.join(submissions_path, 'basic_rf.csv'), index=False)

## Yandex Contest Scores
- rf (`basic_rf.csv`): 46.07
- cat (`basic_cat.csv`): 663.1
- xgb (`basic_xgb.csv`): 579.88
- modes (`basic_modes.csv`): 537.95

In [10]:
# Persist both trained XGBoost models. The target directory is created first
# so save_model does not fail when it is missing.
# NOTE(review): the paths carry no file extension; newer XGBoost releases
# choose the on-disk format from the extension — confirm the intended format.
os.makedirs('./saved_models', exist_ok=True)
clf6_xgb.save_model('./saved_models/xgb6')
clf2_xgb.save_model('./saved_models/xgb2')