In [1]:
import pandas as pd
import numpy as np

from os.path import join as path_join

from sklearn.model_selection import \
    (StratifiedKFold, GridSearchCV, train_test_split)
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import roc_auc_score


import warnings
warnings.filterwarnings('ignore')

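### ROC AUC score on
 - Validation (mean 3-fold CV score of the best grid-search candidate): 0.7205158936480607
 - Pseudo_test (33% hold-out split of the training data): 0.7252473557512785
 - Leaderboard: 0.74237519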

In [2]:
# Relative path of the folder that holds the prepared Modulbank CSV files.
CSV_DIR = r'../../data/Modulbank'

# Read the train file first, then the test file, both from CSV_DIR.
train, test = (
    pd.read_csv(path_join(CSV_DIR, csv_name))
    for csv_name in ('new_train.csv', 'new_test.csv')
)

In [3]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,336,337,338,339,340,341,342,343,344,345
0,0,,1,0,0,1,0,0,0.136364,0,...,0.192984,0,1,0,0,0.222222,1,1,1,1
1,1,,1,0,0,1,0,0,0.181818,0,...,0.19569,0,1,0,0,0.0,1,1,1,0
2,2,,1,0,0,0,0,0,0.090909,0,...,0.192984,0,1,0,0,0.222222,1,1,1,0
3,3,,1,0,0,1,0,0,0.090909,0,...,0.19569,0,1,0,0,0.0,1,1,1,0
4,4,,1,0,0,1,0,0,0.090909,0,...,0.289893,0,0,1,0,0.0,1,1,1,1


In [4]:
# Split the training frame into model inputs and the target column '0'.
X_train, Y_train = train.drop(columns=['0']), train['0']

# For the test frame, 'Unnamed: 0' is kept out of the features and carried
# next to the '0' column in Y_test, whose '0' column is overwritten with
# predictions further down.
# The explicit .copy() makes Y_test an independent frame, so that later write
# is not a chained assignment on a selection of `test` (which pandas reports
# as SettingWithCopyWarning and whose effect on `test` is version dependent).
X_test = test.drop(columns=['0', 'Unnamed: 0'])
Y_test = test[['Unnamed: 0', '0']].copy()

In [5]:
# Hold out 33% of the training rows as a local evaluation set; the fixed
# random_state keeps the split reproducible between runs.
x_train, x_val, y_train, y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.33,
    random_state=42,
)

In [6]:
# Cross-validation splitter used by the grid search: 3 shuffled, stratified
# folds with a fixed seed.
skf = StratifiedKFold(
    n_splits=3,
    random_state=42,
    shuffle=True,
)

In [8]:
# Hyper-parameter grid. Only the learning rate is searched in this run;
# 'loss', 'min_samples_leaf', 'max_depth' and 'n_estimators' were left
# disabled and therefore stay at the estimator defaults.
parameters = {'learning_rate': [0.01, 0.1, 1, 2]}

# NOTE(review): a regressor is scored with 'roc_auc' here, i.e. its raw
# predictions are used as ranking scores - presumably the target is binary;
# confirm this is intended rather than a classifier.
gb = GradientBoostingRegressor(random_state=42)

# Exhaustive search over `parameters`, evaluated with the stratified folds
# from `skf` on all available cores.
gbcv = GridSearchCV(
    estimator=gb,
    param_grid=parameters,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    verbose=1,
)
gbcv.fit(x_train, y_train)

# Keep the winning estimator and its cross-validated score for later cells.
best_model = gbcv.best_estimator_
best_score = gbcv.best_score_

print(best_score)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  2.4min finished


0.7205158936480607


In [9]:
# Display the estimator selected by the grid search (its repr lists the
# hyper-parameters it was built with).
best_model

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=42,
             subsample=1.0, verbose=0, warm_start=False)

In [10]:
# Score the grid search's best estimator on the held-out x_val / y_val split;
# the raw predictions serve as ranking scores for ROC AUC.
prediction = best_model.predict(x_val)
print(roc_auc_score(y_true=y_val, y_score=prediction))

0.7252473557512785


In [11]:
# Overwrite the '0' column of Y_test with the model's predictions for X_test.
# NOTE(review): Y_test was built as a column selection of `test`, so this
# in-place write is the pattern pandas flags with SettingWithCopyWarning;
# warnings are globally ignored in this notebook, so nothing is shown.
Y_test['0'] = best_model.predict(X_test)

In [12]:
# Preview the first five rows of the frame that will be submitted.
Y_test.head(n=5)

Unnamed: 0.1,Unnamed: 0,0
0,0,0.2151
1,1,0.426352
2,2,0.24723
3,3,0.292402
4,4,0.474632


In [14]:
# Write the submission file into CSV_DIR; index=False keeps the DataFrame
# index out of the file.
Y_test.to_csv(
    path_or_buf=path_join(CSV_DIR, 'submission_7task.csv'),
    index=False,
)