In [47]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from xgboost import cv
import time

In [48]:
train = np.array(pd.read_csv('data/train.csv'))
train_labels = pd.read_csv('data/train_labels.csv')
train_id = train_labels['id']
train_decision = np.array(train_labels['Decision'])

test = np.array(pd.read_csv('data/test.csv'))
test_labels = pd.read_csv('data/test_labels.csv')

X_train, X_test, y_train, y_test = train_test_split(
    train, train_decision, test_size=0.2, random_state=0)

In [49]:
train.shape

(7471, 34)

In [50]:
# APPROACH 1: RANDOM FOREST

In [51]:
# NOTE, I would optimize 3-4 hyper-paramaters at a time with testing
# Multiple values per hyper-parameter, however I did not do all at a time
# To avoid blowing up the search time

In [52]:
param_grid = {
    'bootstrap': [False],
    'max_depth': [10],
    'criterion': ['gini'],
    'max_features': ['auto'],
    'n_estimators': [40],
    'min_samples_split': [2],
    'min_samples_leaf': [1]
}

score = 'accuracy'

In [34]:
rf = RandomForestClassifier(random_state=0)
clf = GridSearchCV(rf, param_grid, cv=5, n_jobs=10, scoring=score) 
clf.fit(X=train, y=train_decision)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0), n_jobs=10,
             param_grid={'bootstrap': [False], 'criterion': ['gini'],
                         'max_depth': [10], 'max_features': ['auto'],
                         'min_samples_leaf': [1], 'min_samples_split': [2],
                         'n_estimators': [40]},
             scoring='accuracy')

In [53]:
print(f'Params: {clf.best_params_}')
print(f'Test: {clf.score(X_test, y_test)}')

start = time.time()
lr = RandomForestClassifier(random_state=0,
                           max_depth=10,
                           max_features='auto',
                           n_estimators=60).fit(X=train, y=train_decision)
end = time.time()
print(f'Random Forest took {end - start} seconds to train')

test_submission = test_labels.copy()
test_submission['Decision'] = lr.predict(test)
test_submission.to_csv('submissions/rf_submission.csv', index=False)

Params: {'bootstrap': False, 'criterion': 'gini', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 40}
Test: 0.8107023411371237
Random Forest took 0.546334981918335 seconds to train


In [None]:
# APPROACH 2: LOGISTIC REGRESSION

In [54]:
param_grid = {'penalty' : ['l2'],
              'C' : np.logspace(-4, 4, 20),
              'solver' : ['liblinear'],
             'fit_intercept': [True]}

lr = GridSearchCV(LogisticRegression(random_state=0), param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=2)
lr.fit(X=train, y=train_decision)

print(f'Params: {lr.best_params_}')
print(f'Test: {lr.best_score_}')

start = time.time()
lr = LogisticRegression(random_state=0,
                       C = 4.28133,
                       fit_intercept = True).fit(X=train, y=train_decision)
end = time.time()
print(f'Logistic Regression took {end - start} seconds to train')

test_submission = test_labels.copy()
test_submission['Decision'] = lr.predict(test)
test_submission.to_csv('submissions/lr_submission.csv', index=False)

Params: {'C': 4.281332398719396, 'fit_intercept': True, 'penalty': 'l2', 'solver': 'liblinear'}
Test: 0.6937491773112516
Logistic Regression took 0.1419990062713623 seconds to train


In [None]:
# APPROACH 3: XGBOOST

In [None]:
# NOTE, I would optimize 3-4 hyper-paramaters at a time with testing
# Multiple values per hyper-parameter, however I did not do all at a time
# To avoid blowing up the search time

In [3]:
data = xgb.DMatrix(data=train,label=train_decision)

xgb_clf = XGBClassifier(
    nthread=10,
    seed=7
)

params = {
    'objective': ['binary:logistic'],
    'max_depth': [2,4,6,10],
    'gamma': [0,1,2],
    'learning_rate': [0.01,0.03,0.05,0.10, 0.12,0.15],
    'subsample': [0.6,0.8,0.10],
    'colsample_bylevel': [1],
    'eval_metric' : ['error'],
    'early_stopping_rounds': [10, 30],
    'n_estimators': [35, 60, 100, 150]
}


grid_search = GridSearchCV(estimator=xgb_clf, param_grid=params, scoring = 'accuracy', 
                           n_jobs = 10, cv = 5, verbose=True
)

grid_search.fit(train, train_decision)
print("Finished")

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   2 out of  10 | elapsed:   28.5s remaining:  1.9min
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:   40.5s finished


Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Finished


In [57]:
print(f'Params: {grid_search.best_params_}')
print(f'Test: {grid_search.best_score_}')

start = time.time()
xgb_model = XGBClassifier(
    nthread=10,
    seed=0,
    n_estimators=35, learning_rate=0.12, max_depth=6, gamma = 1, early_stop_rounds = 30, subsample = 0.8, random_state=0
).fit(train, train_decision)
end = time.time()
print(f'XGBoost took {end - start} seconds to train')

test_submission = test_labels.copy()
test_submission['Decision'] = grid_search.best_estimator_.predict(test)
test_submission['Decision'] = xgb_model.predict(test)
test_submission.to_csv('submissions/xgb_submission.csv', index=False)

Params: {'colsample_bylevel': 1, 'early_stopping_rounds': 30, 'eval_metric': 'error', 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 250, 'objective': 'binary:logistic', 'subsample': 0.8}
Test: 0.7978837087480356
Parameters: { "early_stop_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


XGBoost took 0.6348779201507568 seconds to train


0.00012803077697753906
