In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.metrics import accuracy_score, roc_auc_score

In [2]:
# Number of folds; the training cells below iterate fold_id over range(num_fold).
num_fold = 5

In [3]:
def _fill_unknown(arr, value):
    '''
    Overwrite every '?' entry of arr with value and return the resulting array.

    NumPy silently truncates values assigned into a fixed-width string array
    (e.g. np.nan becomes 'n' in a '<U1' array). To avoid that, a string array
    that is too narrow for str(value) is widened first; in that case a widened
    copy is returned, otherwise arr itself is modified in place and returned.
    '''
    mask = arr == '?'
    if arr.dtype.kind in ('U', 'S'):
        # itemsize is in bytes; divide by the per-character size to get the width.
        char_size = np.dtype(arr.dtype.kind + '1').itemsize
        needed = len(str(value))
        if arr.dtype.itemsize // char_size < needed:
            arr = arr.astype('%s%d' % (arr.dtype.kind, needed))
    arr[mask] = value
    return arr


def get_fold(fold_id, replace_unknown='zero', data_dir='05d2b4ea-c-Dataset/train_split'):
    '''
    Get X_train, y_train, X_val, y_val corresponding to a fold_id

    Parameters
    ----------
    fold_id : int
        Index used to build the file names X_fold_<id>.npy, y_fold_<id>.npy,
        X_hold_out_<id>.npy and y_hold_out_<id>.npy.
    replace_unknown : str
        'zero' replaces '?' entries of the feature arrays with 0,
        'nan' replaces them with np.nan. Any other value leaves the
        feature arrays untouched.
    data_dir : str
        Directory containing the .npy files. Defaults to the dataset
        location used throughout this notebook.

    Returns
    -------
    (X_train, y_train, X_val, y_val) as loaded by np.load, with the
    replacement applied to X_train and X_val only.
    '''
    X_train = np.load('%s/X_fold_%d.npy' % (data_dir, fold_id))
    y_train = np.load('%s/y_fold_%d.npy' % (data_dir, fold_id))
    X_val = np.load('%s/X_hold_out_%d.npy' % (data_dir, fold_id))
    y_val = np.load('%s/y_hold_out_%d.npy' % (data_dir, fold_id))

    # The two modes are mutually exclusive, hence if/elif.
    if replace_unknown == 'zero':
        X_train = _fill_unknown(X_train, 0)
        X_val = _fill_unknown(X_val, 0)
    elif replace_unknown == 'nan':
        X_train = _fill_unknown(X_train, np.nan)
        X_val = _fill_unknown(X_val, np.nan)
    return X_train, y_train, X_val, y_val

In [4]:
# Load the full training features/labels and the test features from disk.
X = np.load('05d2b4ea-c-Dataset/train_split/X_train.npy')
y = np.load('05d2b4ea-c-Dataset/train_split/y_train.npy')
X_test = np.load('05d2b4ea-c-Dataset/train_split/X_test.npy')

# Entries equal to '?' are overwritten with 0, in place, in both feature arrays.
X[X == '?'] = 0
X_test[X_test == '?'] = 0

### Random Forest

In [6]:
prediction_path = 'predictions/rfc'
# makedirs also creates the missing 'predictions' parent directory and is a
# no-op when the target already exists (os.mkdir raises FileNotFoundError if
# the parent is absent).
os.makedirs(prediction_path, exist_ok=True)

from sklearn.ensemble import RandomForestClassifier

# Fixed random_state so that re-running the cell reproduces the same forests.
rfc = RandomForestClassifier(n_estimators=50000, n_jobs=-1, random_state=0)
for fold_id in range(num_fold):
    print("-"*100)
    print("Fold %d"%fold_id)
    X_train, y_train, X_val, y_val = get_fold(fold_id)
    # fit() retrains the estimator from scratch on this fold's training split.
    rfc.fit(X_train, y_train)
    # Column of predict_proba that corresponds to class label 1.
    class_idx = list(rfc.classes_).index(1)

    # NOTE: this score is computed on the full X / y loaded earlier in the
    # notebook, not on this fold's X_train / y_train.
    y_predict_train = rfc.predict_proba(X)[:, class_idx]
    print("Traing AUC Score: ", roc_auc_score(y, y_predict_train))

    # Score on this fold's hold-out split.
    y_predict_val = rfc.predict_proba(X_val)[:, class_idx]
    print("Validation AUC Score: ", roc_auc_score(y_val, y_predict_val))

    y_predict_test = rfc.predict_proba(X_test)[:, class_idx]

    # Persist this fold's class-1 probabilities for X and X_test.
    np.save(os.path.join(prediction_path, "train_%d.npy"%fold_id), y_predict_train)
    np.save(os.path.join(prediction_path, "test_%d.npy"%fold_id), y_predict_test)

----------------------------------------------------------------------------------------------------
Fold 0
Traing AUC Score:  0.955681894074323
Validation AUC Score:  0.5608629971022928
----------------------------------------------------------------------------------------------------
Fold 1
Traing AUC Score:  0.9598877268199697
Validation AUC Score:  0.5875959569101339
----------------------------------------------------------------------------------------------------
Fold 2
Traing AUC Score:  0.9570239657653538
Validation AUC Score:  0.57262630184431
----------------------------------------------------------------------------------------------------
Fold 3
Traing AUC Score:  0.9606829504005874
Validation AUC Score:  0.576046361801762
----------------------------------------------------------------------------------------------------
Fold 4
Traing AUC Score:  0.9596395705045916
Validation AUC Score:  0.5749707121060961


### XGBoost

In [7]:
prediction_path = 'predictions/xgbc'
# makedirs also creates the missing 'predictions' parent directory and is a
# no-op when the target already exists (os.mkdir raises FileNotFoundError if
# the parent is absent).
os.makedirs(prediction_path, exist_ok=True)

from xgboost import XGBClassifier

# 'verbose' is an argument of fit(), not an XGBClassifier constructor
# parameter, so it is passed to fit() below instead of here.
xgbc = XGBClassifier(n_estimators=50000, n_jobs=-1)
for fold_id in range(num_fold):
    print("-"*100)
    print("Fold %d"%fold_id)
    X_train, y_train, X_val, y_val = get_fold(fold_id, replace_unknown='zero')
    # Progress is only printed when an eval_set is supplied; verbose=10 prints
    # the evaluation metric on the hold-out split every 10 boosting rounds.
    # No early stopping is configured, so the eval_set is for monitoring only.
    xgbc.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=10)
    # Column of predict_proba that corresponds to class label 1.
    class_idx = list(xgbc.classes_).index(1)

    # NOTE: this score is computed on the full X / y loaded earlier in the
    # notebook, not on this fold's X_train / y_train.
    y_predict_train = xgbc.predict_proba(X)[:, class_idx]
    print("Traing AUC Score: ", roc_auc_score(y, y_predict_train))

    # Score on this fold's hold-out split.
    y_predict_val = xgbc.predict_proba(X_val)[:, class_idx]
    print("Validation AUC Score: ", roc_auc_score(y_val, y_predict_val))

    y_predict_test = xgbc.predict_proba(X_test)[:, class_idx]

    # Persist this fold's class-1 probabilities for X and X_test.
    np.save(os.path.join(prediction_path, "train_%d.npy"%fold_id), y_predict_train)
    np.save(os.path.join(prediction_path, "test_%d.npy"%fold_id), y_predict_test)

----------------------------------------------------------------------------------------------------
Fold 0


KeyboardInterrupt: 