In [1]:
import numpy as np
import os
import pandas as pd
from skimage.io import imread
from sklearn.linear_model import SGDClassifier
from sklearn import svm
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [2]:
cropdat_path = './data/train/all_cropped/'
augmented_path = './data/train/all_augmented/'
cropped_testdat_path = './data/test_cropped/'
submissions_path = './submissions/'

### Load Data

In [3]:
%%time
conv_type = {'ER': 1, 'NR': 0}
conv_nrj = {'1': 0, '3': 1, '6': 2, '10': 3, '20': 4, '30': 5}

X_crp = []
y_crp_typ = []
y_crp_nrj = []
for filename in os.listdir(cropdat_path):
    img = imread(cropdat_path + filename)    
    X_crp.append(img.flatten())
    fn_parts = filename.split('-')
    y_crp_typ.append(conv_type[fn_parts[1]]) 
    y_crp_nrj.append(conv_nrj[fn_parts[2].split('.')[0]])
print(len(X_crp))
print(len(y_crp_typ))
print(len(y_crp_nrj))

X_test_crp = []
X_test_labels = []
for filename in os.listdir(cropped_testdat_path):
    img = imread(cropped_testdat_path + filename)    
    X_test_crp.append(img.flatten())
    X_test_labels.append(filename.split('.')[0])
print(len(X_test_crp))
print(len(X_test_labels))

X_crp = np.array(X_crp)
y_crp_typ = np.array(y_crp_typ)
y_crp_nrj = np.array(y_crp_nrj)
X_test_crp = np.array(X_test_crp)

13404
13404
13404
16564
16564
Wall time: 1min 53s


# 1. Binary Classification

### SGD

In [5]:
%%time
clf2_sgd = SGDClassifier(
    alpha=0.1, 
    l1_ratio=0.05, 
    loss='hinge', 
    penalty='elasticnet',
    n_jobs=-1,
    random_state=125)
clf2_sgd.fit(X_crp, y_crp_typ)
y2_sgd = clf2_sgd.predict(X_test_crp)

Wall time: 51.1 s


### Random Forest

In [6]:
%%time
clf2_rf = RandomForestClassifier(
    criterion='gini', 
    max_depth=8, 
    max_features='auto', 
    n_estimators=800,
    n_jobs=-1,
    random_state=125)
clf2_rf.fit(X_crp, y_crp_typ)
y2_rf = clf2_rf.predict(X_test_crp)

Wall time: 16.6 s


### CatBoost

In [7]:
%%time
clf2_cb = CatBoostClassifier(
    depth=6,
    iterations=20000,
    early_stopping_rounds=500,
    l2_leaf_reg=1e-20,
    leaf_estimation_iterations=10,
    logging_level='Silent',
    loss_function='Logloss',
    eval_metric='AUC',
    boosting_type='Plain',
    task_type='GPU',
    random_state=125)
clf2_cb.fit(X_crp, y_crp_typ)
y2_cb = clf2_cb.predict(X_test_crp)

Wall time: 10min 13s


### XGBoost

In [8]:
%%time
clf2_xgb = XGBClassifier(
    n_estimators=800,
    colsample_bytree=1.0,
    gamma=1,
    max_depth=3,
    min_child_weight=1,
    subsample=1.0,
    eval_metric='auc', 
    use_label_encoder=False,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    random_state=125)
clf2_xgb.fit(X_crp, y_crp_typ)
y2_xgb = clf2_xgb.predict(X_test_crp)

Wall time: 27 s


### SVM

In [9]:
%%time
clf2_svc = svm.SVC(
    C=0.1,
    gamma='scale',
    kernel='rbf',
    random_state=125)
clf2_svc.fit(X_crp, y_crp_typ)
y2_svc = clf2_svc.predict(X_test_crp)

Wall time: 11min 32s


# 2. Six-classes Classification

### SGD

In [10]:
%%time
clf6_sgd = SGDClassifier(
    alpha=0.1, 
    l1_ratio=0.05, 
    loss='log', 
    penalty='elasticnet',
    n_jobs=-1,
    random_state=125)
clf6_sgd.fit(X_crp, y_crp_nrj)
y6_sgd = clf6_sgd.predict(X_test_crp)

Wall time: 1min 26s


### RandomForestClassifier

In [11]:
%%time
clf6_rf = RandomForestClassifier(
    criterion='gini', 
    max_depth=8, 
    max_features='auto', 
    n_estimators=800,
    n_jobs=-1,
    random_state=125)
clf6_rf.fit(X_crp, y_crp_nrj)
y6_rf = clf6_rf.predict(X_test_crp)

Wall time: 16.1 s


### CatBoostClassifier

In [12]:
%%time
clf6_cb = CatBoostClassifier(
    depth=6,
    iterations=20000,
    early_stopping_rounds=500,
    l2_leaf_reg=1e-20,
    leaf_estimation_iterations=10,
    logging_level='Silent',
    loss_function='MultiClass',
    eval_metric='MultiClass',
    boosting_type='Plain',
    task_type='GPU',
    random_state=125)
clf6_cb.fit(X_crp, y_crp_nrj)
y6_cb = clf6_cb.predict(X_test_crp)



Wall time: 19min 10s


### XGBoost

In [13]:
%%time
clf6_xgb = XGBClassifier(
    objective='multi:softmax',
    num_classes=6,
    n_estimators=500,
    colsample_bytree=1.0,
    gamma=1,
    max_depth=3,
    min_child_weight=1,
    subsample=1.0,
    eval_metric='mlogloss', 
    use_label_encoder=False,
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    n_jobs=-1,
    random_state=125)
clf6_xgb.fit(X_crp, y_crp_nrj)
y6_xgb = clf6_xgb.predict(X_test_crp)

Parameters: { num_classes } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Wall time: 1min 22s


### SVM

In [14]:
%%time
clf6_svc = OneVsRestClassifier(svm.SVC(
        C=0.1,
        gamma='scale',
        kernel='rbf',
        probability=True,
        random_state=125),
    n_jobs=-1)
clf6_svc.fit(X_crp, y_crp_nrj)
y6_svc = clf6_svc.predict(X_test_crp)

Wall time: 51min 38s


# 3. Form Submission

In [29]:
sub_bin = pd.DataFrame(X_test_labels, columns=['id'])
sub_six = pd.DataFrame(X_test_labels, columns=['id'])
sub_cat = pd.DataFrame(X_test_labels, columns=['id'])
sub_xgb = pd.DataFrame(X_test_labels, columns=['id'])
sub_mode = pd.DataFrame(X_test_labels, columns=['id'])

In [30]:
y6_cat = np.stack(y6_cb, axis=1)[0]

In [31]:
conv_six = {0: 1, 1: 3, 2: 6, 3: 10, 4: 20, 5: 30}

sub_bin['y2_sgd'] = y2_sgd
sub_bin['y2_rf'] = y2_rf
sub_bin['y2_cb'] = y2_cb
sub_bin['y2_xgb'] = y2_xgb
sub_bin['y2_svc'] = y2_svc
sub_bin.drop('id', 1, inplace=True)

sub_six['y6_sgd'] = list(map(lambda x: conv_six[x], y6_sgd))
sub_six['y6_rf'] = list(map(lambda x: conv_six[x], y6_rf))
sub_six['y6_cb'] = list(map(lambda x: conv_six[x], y6_cat))
sub_six['y6_xgb'] = list(map(lambda x: conv_six[x], y6_xgb))
sub_six['y6_svc'] = list(map(lambda x: conv_six[x], y6_svc))
sub_six.drop('id', 1, inplace=True)

sub_bin.head()

Unnamed: 0,y2_sgd,y2_rf,y2_cb,y2_xgb,y2_svc
0,1,1,1,1,1
1,1,0,0,0,0
2,1,0,0,0,0
3,1,1,1,1,1
4,1,1,0,0,0


In [33]:
sub_six.head()

Unnamed: 0,y6_sgd,y6_rf,y6_cb,y6_xgb,y6_svc
0,30,30,30,30,30
1,20,20,20,20,20
2,20,20,20,20,20
3,30,30,30,30,30
4,20,6,6,6,10


In [38]:
sub_cat['classification_predictions'] = sub_bin['y2_cb']
sub_cat['regression_predictions'] = sub_six['y6_cb']

sub_xgb['classification_predictions'] = sub_bin['y2_xgb']
sub_xgb['regression_predictions'] = sub_six['y6_xgb']

sub_mode['classification_predictions'] = sub_bin.mode(axis=1)[0].astype('int32')
sub_mode['regression_predictions'] = sub_six.mode(axis=1)[0].astype('int32')
sub_mode.head()

Unnamed: 0,id,classification_predictions,regression_predictions
0,0002894871bb30af2670648c58b2506e9801a321,1,30
1,000a95ca23df016a149ff9af94b6e9d8633d6691,0,20
2,000b4407dee7dad2ba46586e6ba1264f45965e51,0,20
3,000c45d729066044e3dd3f885fddf013b845b6d7,1,30
4,0011f80e44faa713714ead6aeab3b2f9db54aebd,0,6


In [35]:
sub_cat.head()

Unnamed: 0,id,classification_predictions,regression_predictions
0,0002894871bb30af2670648c58b2506e9801a321,1,30
1,000a95ca23df016a149ff9af94b6e9d8633d6691,0,20
2,000b4407dee7dad2ba46586e6ba1264f45965e51,0,20
3,000c45d729066044e3dd3f885fddf013b845b6d7,1,30
4,0011f80e44faa713714ead6aeab3b2f9db54aebd,0,6


In [36]:
sub_xgb.head()

Unnamed: 0,id,classification_predictions,regression_predictions
0,0002894871bb30af2670648c58b2506e9801a321,1,30
1,000a95ca23df016a149ff9af94b6e9d8633d6691,0,20
2,000b4407dee7dad2ba46586e6ba1264f45965e51,0,20
3,000c45d729066044e3dd3f885fddf013b845b6d7,1,30
4,0011f80e44faa713714ead6aeab3b2f9db54aebd,0,6


In [40]:
sub_mode.to_csv(submissions_path + 'basic_modes.csv', index=False)
sub_cat.to_csv(submissions_path + 'basic_cat.csv', index=False)
sub_xgb.to_csv(submissions_path + 'basic_xgb.csv', index=False)

In [41]:
sub_rf = pd.DataFrame(X_test_labels, columns=['id'])
sub_rf['classification_predictions'] = sub_bin['y2_rf']
sub_rf['regression_predictions'] = sub_six['y6_rf']
sub_rf.head()

Unnamed: 0,id,classification_predictions,regression_predictions
0,0002894871bb30af2670648c58b2506e9801a321,1,30
1,000a95ca23df016a149ff9af94b6e9d8633d6691,0,20
2,000b4407dee7dad2ba46586e6ba1264f45965e51,0,20
3,000c45d729066044e3dd3f885fddf013b845b6d7,1,30
4,0011f80e44faa713714ead6aeab3b2f9db54aebd,1,6


In [42]:
sub_rf.to_csv(submissions_path + 'basic_rf.csv', index=False)

## Yandex Contest Scores
-  rf - 46.07 
- cat - 663.1
- xgb - 579.88
- modes - 537.95