In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from xgboost import cv

In [14]:
train = np.array(pd.read_csv('data/train.csv'))
train_labels = pd.read_csv('data/train_labels.csv')
train_id = train_labels['id']
train_decision = np.array(train_labels['Decision'])

test = np.array(pd.read_csv('data/test.csv'))
test_labels = pd.read_csv('data/test_labels.csv')

X_train, X_test, y_train, y_test = train_test_split(
    train, train_decision, test_size=0.2, random_state=0)

In [None]:
# APPROACH 1: RANDOM FOREST

In [None]:
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [2, 4, 6, 10],
    'max_features': ['auto', 'log2'],
    'n_estimators': [100, 200]
}

score = 'accuracy'

In [None]:
rf = RandomForestClassifier(random_state=0)
clf = GridSearchCV(rf, param_grid, cv=5, n_jobs=2, scoring=score) 
clf.fit(X=train, y=train_decision)

In [None]:
print(f'Params: {clf.best_params_}')
print(f'Test: {clf.score(X_test, y_test)}')

test_submission = test_labels.copy()
test_submission['Decision'] = clf.predict(test)
test_submission.to_csv('submissions/rf_submission.csv', index=False)

In [None]:
# APPROACH 2: LOGISTIC REGRESSION

In [13]:
param_grid = {'penalty' : ['l1', 'l2'],
              'C' : np.logspace(-4, 4, 20),
              'solver' : ['liblinear']}

lr = GridSearchCV(LogisticRegression(random_state=0), param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=2)
lr.fit(X=train, y=train_decision)

print(f'Params: {lr.best_params_}')
print(f'Test: {lr.best_score_}')

test_submission = test_labels.copy()
test_submission['Decision'] = clf.predict(test)
test_submission.to_csv('submissions/kr_submission.csv', index=False)

Params: {'C': 0.08858667904100823, 'penalty': 'l2', 'solver': 'liblinear'}


NameError: name 'X_test' is not defined

In [None]:
# APPROACH 3: XGBOOST

In [3]:
data = xgb.DMatrix(data=train,label=train_decision)

params = {
            'objective':'binary:logistic',
            'max_depth': 4,
            'alpha': 10,
            'learning_rate': 1.0,
            'n_estimators':100
}

xgb_cv = cv(dtrain=data, params=params, nfold=5,
                    num_boost_round=50, early_stopping_rounds=10, metrics="error", as_pandas=True, seed=7)


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "n_estimators" } might not be

In [4]:
xgb_cv

Unnamed: 0,train-error-mean,train-error-std,test-error-mean,test-error-std
0,0.29822,0.007351,0.314551,0.009786
1,0.292598,0.00856,0.309868,0.01796
2,0.281857,0.002948,0.303578,0.018159
3,0.275834,0.006433,0.300899,0.010686
4,0.268237,0.004108,0.297018,0.010245
5,0.264155,0.004725,0.293805,0.00969
6,0.260575,0.004497,0.293137,0.01324
7,0.255087,0.005523,0.288184,0.012771
8,0.250971,0.005657,0.288452,0.013352
9,0.246989,0.00419,0.285908,0.013982


In [9]:
xgb_clf = XGBClassifier(**params)
xgb_clf.fit(train, train_decision)
predictions = xgb_clf.predict(test)



In [11]:
test_submission = test_labels.copy()
test_submission['Decision'] = predictions
test_submission.to_csv('submissions/xgb_submission.csv', index=False)