In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from xgboost import cv

In [14]:
train = np.array(pd.read_csv('data/train.csv'))
train_labels = pd.read_csv('data/train_labels.csv')
train_id = train_labels['id']
train_decision = np.array(train_labels['Decision'])

test = np.array(pd.read_csv('data/test.csv'))
test_labels = pd.read_csv('data/test_labels.csv')

X_train, X_test, y_train, y_test = train_test_split(
    train, train_decision, test_size=0.2, random_state=0)

In [None]:
# APPROACH 1: RANDOM FOREST

In [None]:
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [2, 4, 6, 10],
    'max_features': ['auto', 'log2'],
    'n_estimators': [100, 200]
}

score = 'accuracy'

In [None]:
rf = RandomForestClassifier(random_state=0)
clf = GridSearchCV(rf, param_grid, cv=5, n_jobs=2, scoring=score) 
clf.fit(X=train, y=train_decision)

In [None]:
print(f'Params: {clf.best_params_}')
print(f'Test: {clf.score(X_test, y_test)}')

test_submission = test_labels.copy()
test_submission['Decision'] = clf.predict(test)
test_submission.to_csv('submissions/rf_submission.csv', index=False)

In [None]:
# APPROACH 2: LOGISTIC REGRESSION

In [16]:
param_grid = {'penalty' : ['l1', 'l2'],
              'C' : np.logspace(-4, 4, 20),
              'solver' : ['liblinear']}

lr = GridSearchCV(LogisticRegression(random_state=0), param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=2)
lr.fit(X=train, y=train_decision)

print(f'Params: {lr.best_params_}')
print(f'Test: {lr.best_score_}')

test_submission = test_labels.copy()
test_submission['Decision'] = lr.predict(test)
test_submission.to_csv('submissions/lr_submission.csv', index=False)

Params: {'C': 0.08858667904100823, 'penalty': 'l2', 'solver': 'liblinear'}
Test: 0.6738052320765783


In [None]:
# APPROACH 3: XGBOOST

In [18]:
data = xgb.DMatrix(data=train,label=train_decision)

xgb_clf = XGBClassifier(
    objective= 'binary:logistic',
    nthread=4,
    seed=7
)

params = {
    'max_depth': [2, 4, 6, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'subsample': [0.6, 1]
}


grid_search = GridSearchCV(estimator=xgb_clf, param_grid=params, scoring = 'accuracy', 
                           n_jobs = 10, cv = 5, verbose=True
)

grid_search.fit(train, train_decision)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   23.9s
[Parallel(n_jobs=10)]: Done 120 out of 120 | elapsed:  1.7min finished




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     enable_categorical=False, gamma=None,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale

In [22]:
test_submission = test_labels.copy()
test_submission['Decision'] = grid_search.best_estimator_.predict(test)
test_submission.to_csv('submissions/xgb_submission.csv', index=False)