In [1]:
import numpy as np
import pandas as pd

import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score

import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
DATA_PATH = 'data'

In [3]:
train = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
sub = pd.read_csv(os.path.join(DATA_PATH, 'sample_submission.csv'))

### EDA

In [4]:
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

Train shape: (200000, 202)
Test shape: (200000, 201)


In [5]:
train.head()

Unnamed: 0,ID_code,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,train_0,0,8.9255,-6.7863,11.9081,5.093,11.4607,-9.2834,5.1187,18.6266,...,4.4354,3.9642,3.1364,1.691,18.5227,-2.3978,7.8784,8.5635,12.7803,-1.0914
1,train_1,0,11.5006,-4.1473,13.8588,5.389,12.3622,7.0433,5.6208,16.5338,...,7.6421,7.7214,2.5837,10.9516,15.4305,2.0339,8.1267,8.7889,18.356,1.9518
2,train_2,0,8.6093,-2.7457,12.0805,7.8928,10.5825,-9.0837,6.9427,14.6155,...,2.9057,9.7905,1.6704,1.6858,21.6042,3.1417,-6.5213,8.2675,14.7222,0.3965
3,train_3,0,11.0604,-2.1518,8.9522,7.1957,12.5846,-1.8361,5.8428,14.925,...,4.4666,4.7433,0.7178,1.4214,23.0347,-1.2706,-2.9275,10.2922,17.9697,-8.9996
4,train_4,0,9.8369,-1.4834,12.8746,6.6375,12.2772,2.4486,5.9405,19.2514,...,-1.4905,9.5214,-0.1508,9.1942,13.2876,-1.5121,3.9267,9.5031,17.9974,-8.8104


In [6]:
test.head()

Unnamed: 0,ID_code,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,test_0,11.0656,7.7798,12.9536,9.4292,11.4327,-2.3805,5.8493,18.2675,2.1337,...,-2.1556,11.8495,-1.43,2.4508,13.7112,2.4669,4.3654,10.72,15.4722,-8.7197
1,test_1,8.5304,1.2543,11.3047,5.1858,9.1974,-4.0117,6.0196,18.6316,-4.4131,...,10.6165,8.8349,0.9403,10.1282,15.5765,0.4773,-1.4852,9.8714,19.1293,-20.976
2,test_2,5.4827,-10.3581,10.1407,7.0479,10.2628,9.8052,4.895,20.2537,1.5233,...,-0.7484,10.9935,1.9803,2.18,12.9813,2.1281,-7.1086,7.0618,19.8956,-23.1794
3,test_3,8.5374,-1.3222,12.022,6.5749,8.8458,3.1744,4.9397,20.566,3.3755,...,9.5702,9.0766,1.658,3.5813,15.1874,3.1656,3.9567,9.2295,13.0168,-4.2108
4,test_4,11.7058,-0.1327,14.1295,7.7506,9.1035,-8.5848,6.8595,10.6048,2.989,...,4.2259,9.1723,1.2835,3.3778,19.5542,-0.286,-5.1612,7.2882,13.926,-9.1846


In [7]:
train.target.value_counts()

0    179902
1     20098
Name: target, dtype: int64

In [8]:
na = train.isna().sum().reset_index()
na.columns = ['column', 'na_count']

n = na[na['na_count'] > 0].shape[0]
print(f'NaN in columns: {n}')

NaN in columns: 0


### Splitting data

In [9]:
X = train.drop(['ID_code', 'target'], axis=1)
y = train['target']

x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
y_train.value_counts()

0    143922
1     16078
Name: target, dtype: int64

In [11]:
y_val.value_counts()

0    35980
1     4020
Name: target, dtype: int64

### Baseline

In [12]:
predictions = np.zeros_like(y_val)

acc = accuracy_score(y_val, predictions)
auc = roc_auc_score(y_val, predictions)

print(f'Constant accuracy: {acc}')
print(f'Constant ROC AUC: {auc}')

Constant accuracy: 0.8995
Constant ROC AUC: 0.5


### XGB

In [13]:
%%time

xgb1 = XGBClassifier(
                    learning_rate=0.1,
                    n_estimators=1000,
                    max_depth=5,
                    min_child_weight=5,
                    gamma=0,
                    reg_alpha=1e-5,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    objective= 'binary:logistic',
                    nthread=4,
                    scale_pos_weight=1,
                    seed=42)

xgb1.fit(x_train, y_train)
predictions = xgb1.predict(x_val)

acc = accuracy_score(y_val, predictions)
auc = roc_auc_score(y_val, predictions)

print(f'Accuracy: {acc}')
print(f'ROC AUC: {auc}')

Accuracy: 0.919875
ROC AUC: 0.653183982809687
Wall time: 9min 25s


In [30]:
%%time

model = XGBClassifier(max_depth=2,
                      n_estimators=99999,
                      colsample_bytree=0.3,
                      learning_rate=0.02,
                      objective='binary:logistic', 
                      n_jobs=-1,
                      evals=[(valid_xgb, 'Validation')])


model.fit(x_train, y_train)
predictions = model.predict(x_val)

print(f'ROC AUC: {roc_auc_score(y_val, predictions)}')

KeyboardInterrupt: 

In [None]:
import pickle

pickle.dump(model, 'data/xgb.dat')