In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from imblearn.ensemble import EasyEnsembleClassifier
from lifelines.utils import concordance_index

from datetime import timedelta
from timeit import default_timer as timer
from params_manager import seed_everything, get_column_names

In [2]:
# INTERNAL_PATH is the directory that the training CSV is read from below.
from params_manager import INTERNAL_PATH

# Single seed shared by seed_everything(), the train/test split, and every
# estimator below that is given a random_state.
seed = 42

In [3]:
# Read the training table and replace every missing value with 0 in one pass.
train_df = pd.read_csv(INTERNAL_PATH / 'train.csv').fillna(0)

# Column groups are resolved by the project helper; only `num_features` and
# `target_column` are consumed by the cells visible below.
(
    feature_columns,
    target_column,
    num_features,
    cat_features,
    ignored_features,
) = get_column_names(train_df, full=True)

# Displayed as the cell output: number of feature columns.
len(feature_columns)

167

In [4]:
# Hold out 10% of the rows as a test set, using only the numeric features.
# The split is stratified on the same column that is used as the label
# (`target_column`) instead of a hard-coded column name, so the labels and the
# stratification key can never drift apart if the helper's target name changes.
target = train_df[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    train_df[num_features],
    target,
    test_size=0.1,
    random_state=seed,
    stratify=target,
)

# Displayed as the cell output: shapes of the four resulting objects.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((57272, 163), (6364, 163), (57272,), (6364,))

In [5]:
from sklearn.utils.class_weight import compute_class_weight

# Distinct labels present in the training split.
classes = np.unique(y_train)

# 'balanced' weights for those labels, computed from the training labels only.
weights = compute_class_weight('balanced', classes=classes, y=y_train)

# label -> weight mapping, passed as `class_weight` to the forest models below.
class_weights = {label: weight for label, weight in zip(classes, weights)}
class_weights

{0: 0.5458739205856001, 1: 5.949719509661334}

In [6]:
# Seed global RNGs through the project helper before any estimator is built.
seed_everything(seed)

# Candidate models as (display name, unfitted estimator) pairs.
# Every estimator that accepts a seed receives `random_state=seed` so the
# comparison is reproducible; only the two forest models are given the
# precomputed `class_weights`.
classifiers = [
    ('RandomForestClassifier',
     RandomForestClassifier(
         n_estimators=100,
         random_state=seed,
         n_jobs=-1,
         class_weight=class_weights,
     )),
    ('EasyEnsembleClassifier',
     EasyEnsembleClassifier(
         n_estimators=100,
         random_state=seed,
         n_jobs=-1,
     )),
    ('ExtraTreesClassifier',
     ExtraTreesClassifier(
         n_estimators=100,
         random_state=seed,
         n_jobs=-1,
         class_weight=class_weights,
     )),
    ('GradientBoostingClassifier',
     GradientBoostingClassifier(
         n_estimators=100,
         random_state=seed,
         learning_rate=0.02,
     )),
    ('AdaBoostClassifier',
     AdaBoostClassifier(
         n_estimators=100,
         random_state=seed,
         learning_rate=0.02,
     )),
    ('XGBoost',
     XGBClassifier(
         n_estimators=100,
         # Previously the only estimator without an explicit seed.
         random_state=seed,
         learning_rate=0.02,
         objective='binary:logistic',
         verbosity=0,
         # Requests the CUDA device; NOTE(review): confirm a GPU is available
         # wherever this notebook is re-run.
         device='cuda',
     )),
    ('CatBoost',
     CatBoostClassifier(
         random_state=seed,
         learning_rate=0.02,
         eval_metric='AUC',
         verbose=False,
         early_stopping_rounds=500,
         iterations=1000,
     )),
]

def cross_validate(classifiers, X_train, y_train, X_test, y_test):
    """Benchmark each classifier and return the one with the best CV ROC-AUC.

    For every ``(name, estimator)`` pair the function:

    1. fits the estimator on the full training split,
    2. computes the mean 5-fold cross-validated ROC-AUC on the training split,
    3. computes the concordance index of the fitted estimator's positive-class
       probabilities on the held-out test split,
    4. prints the name, the wall-clock runtime of steps 1-2, and both scores.

    Parameters
    ----------
    classifiers : sequence of (str, estimator)
        Display name and estimator exposing ``fit`` and ``predict_proba``.
    X_train, y_train
        Training features and labels (used for fitting and cross-validation).
    X_test, y_test
        Held-out features and labels (used only for the concordance index).

    Returns
    -------
    tuple
        The ``(name, estimator)`` entry of ``classifiers`` with the highest
        mean cross-validated ROC-AUC. The returned estimator has been fitted
        on ``X_train``/``y_train``.

    Raises
    ------
    ValueError
        If ``classifiers`` is empty.
    """
    if len(classifiers) == 0:
        raise ValueError('classifiers must contain at least one (name, estimator) pair')

    cv_results = []
    for name, classifier in classifiers:
        print(name)
        start = timer()

        # Explicit fit: the fitted estimator is needed for predict_proba on the
        # test split below and is what the caller gets back.
        classifier.fit(X_train, y_train)
        roc = cross_val_score(
            classifier, X_train, y_train,
            scoring="roc_auc", cv=5, n_jobs=-1,
        ).mean()

        # Runtime covers the full fit plus the 5-fold cross-validation.
        runtime = timer() - start
        print(f'Runtime: {timedelta(seconds=int(round(runtime, 0)))}')

        CI = concordance_index(y_test, classifier.predict_proba(X_test)[:, 1])
        print(f'ROC-AUC: {roc}')
        print(f'Concordance index: {CI}')
        print()
        cv_results.append((name, runtime, roc, CI))

    # Select on the cross-validated ROC-AUC column only. Taking argmax over the
    # raw (name, runtime, roc, CI) tuples would operate on a flattened mixed
    # string array and yield an index unrelated to model quality.
    roc_scores = [roc for _, _, roc, _ in cv_results]
    best_clf = classifiers[int(np.argmax(roc_scores))]
    return best_clf

# Run the benchmark over all candidates; per-model timings and scores are
# printed, and `res` keeps the entry of `classifiers` that cross_validate returns.
res = cross_validate(classifiers, X_train, y_train, X_test, y_test)

RandomForestClassifier
Runtime: 0:01:14
ROC-AUC: 0.7408428450637511
Concordance index: 0.7356604024671999

EasyEnsembleClassifier
Runtime: 0:02:42
ROC-AUC: 0.7552669773190365
Concordance index: 0.757698135170105

ExtraTreesClassifier
Runtime: 0:00:22
ROC-AUC: 0.7410630891205316
Concordance index: 0.7590794015741467

GradientBoostingClassifier
Runtime: 0:42:15
ROC-AUC: 0.7473311032731995
Concordance index: 0.7577386993488888

AdaBoostClassifier
Runtime: 0:11:03
ROC-AUC: 0.7304979138519397
Concordance index: 0.7417376539795384

XGBoost
Runtime: 0:00:10
ROC-AUC: 0.7586834537075344
Concordance index: 0.7675940311334081

CatBoost
Runtime: 0:03:05
ROC-AUC: 0.7674411575460218
Concordance index: 0.7708758816295577
