In [1]:
import pandas as pd
import numpy

from os.path import join as path_join
from os import cpu_count

from sklearn.model_selection import \
    (StratifiedKFold, GridSearchCV, train_test_split)
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import roc_auc_score


import warnings
# NOTE(review): with no category/module argument this silences every warning
# for the rest of the notebook, including ones raised by pandas and
# scikit-learn. Consider narrowing the filter to the specific warning that is
# known to be noise.
warnings.filterwarnings('ignore')

### ROC AUC scores
 - Validation (best 3-fold cross-validated grid-search score): 0.714202031922375
 - Pseudo_test (held-out 33% split of the training data): 0.7157898362618176
 - Leaderboard: 0.72176007

In [2]:
# Directory (relative to the notebook) holding the prepared CSV files.
CSV_DIR = r'../../data/Modulbank'

# Read the training and test tables from CSV_DIR in one pass.
train, test = (
    pd.read_csv(path_join(CSV_DIR, csv_name))
    for csv_name in ('new_train.csv', 'new_test.csv')
)

In [3]:
# Preview the first five rows of the test table to check its column layout.
test.head(n=5)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,336,337,338,339,340,341,342,343,344,345
0,0,,1,0,0,1,0,0,0.136364,0,...,0.192984,0,1,0,0,0.222222,1,1,1,1
1,1,,1,0,0,1,0,0,0.181818,0,...,0.19569,0,1,0,0,0.0,1,1,1,0
2,2,,1,0,0,0,0,0,0.090909,0,...,0.192984,0,1,0,0,0.222222,1,1,1,0
3,3,,1,0,0,1,0,0,0.090909,0,...,0.19569,0,1,0,0,0.0,1,1,1,0
4,4,,1,0,0,1,0,0,0.090909,0,...,0.289893,0,0,1,0,0.0,1,1,1,1


In [4]:
# Column '0' is the target; every other column of `train` is used as a feature.
X_train, Y_train = train.drop(columns=['0']), train['0']

# The test features additionally exclude 'Unnamed: 0' (presumably the row id
# written when the CSV was saved -- confirm), which is kept next to the target
# column '0' in Y_test so predictions can be written back for submission.
X_test = test.drop(columns=['0', 'Unnamed: 0'])

# .copy() makes Y_test an independent frame: a later column assignment such as
# `Y_test['0'] = ...` then writes to a frame we own instead of to a subset
# selected from `test`, which is what triggers pandas' SettingWithCopyWarning.
Y_test = test[['Unnamed: 0', '0']].copy()

In [5]:
# Hold out 33% of the training rows as a validation split; the fixed seed keeps
# the partition reproducible across runs.
x_train, x_val, y_train, y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.33,
    random_state=42,
)

In [22]:
# The base estimator and the CV scheme are fixed; only the AdaBoost
# hyper-parameters listed in `parameters` are searched.
ada = AdaBoostRegressor(random_state=42)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# 5 learning rates x 3 loss functions x 4 ensemble sizes = 60 candidates,
# each scored by ROC AUC on the 3 stratified folds.
parameters = dict(
    learning_rate=[0.1, 0.5, 1, 2, 4],
    loss=['linear', 'square', 'exponential'],
    n_estimators=[10, 25, 50, 100],
)

rcv = GridSearchCV(
    estimator=ada,
    param_grid=parameters,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,  # run the candidate fits on all available cores
)
rcv.fit(x_train, y_train)

# Keep the best mean CV score and the corresponding estimator
# (GridSearchCV refits it on the whole of x_train by default).
best_score, best_model = rcv.best_score_, rcv.best_estimator_

print(best_score)

0.714202031922375


In [23]:
# Display the estimator selected by the grid search, with its hyper-parameters.
best_model

AdaBoostRegressor(base_estimator=None, learning_rate=0.1, loss='square',
         n_estimators=100, random_state=42)

In [24]:
# Score the selected estimator on the held-out validation split; the raw
# regression outputs serve as ranking scores for ROC AUC.
prediction = best_model.predict(x_val)
val_auc = roc_auc_score(y_true=y_val, y_score=prediction)

print(val_auc)

0.7157898362618176


In [25]:
# Predict on the test features with the selected estimator and store the
# scores in the target column '0' of Y_test.
test_scores = best_model.predict(X_test)
Y_test['0'] = test_scores

In [26]:
# Inspect the first five rows of Y_test before it is written to disk.
Y_test.head(n=5)

Unnamed: 0.1,Unnamed: 0,0
0,0,0.446952
1,1,0.478323
2,2,0.462714
3,3,0.468095
4,4,0.491486


In [28]:
# Write Y_test next to the input data; index=False keeps the DataFrame index
# out of the CSV.
submission_path = path_join(CSV_DIR, 'submission_SimpleEnsemble.csv')
Y_test.to_csv(submission_path, index=False)