In [11]:
import pandas as pd
import numpy as np

from os.path import join as path_join

from sklearn.model_selection import \
    (StratifiedKFold, GridSearchCV, train_test_split)
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import roc_auc_score


import warnings
warnings.filterwarnings('ignore')

### Roc_auc_score on
 - Validation: 0.7257746041247851
 - Pseudo_test: 0.724010304434074
 - leaderbord: 

In [12]:
CSV_DIR = r'../../data/Modulbank'

train = pd.read_csv(path_join(CSV_DIR, 'new_train.csv'))
test = pd.read_csv(path_join(CSV_DIR, 'new_test.csv'))

In [13]:
test.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,336,337,338,339,340,341,342,343,344,345
0,0,,1,0,0,1,0,0,0.136364,0,...,0.192984,0,1,0,0,0.222222,1,1,1,1
1,1,,1,0,0,1,0,0,0.181818,0,...,0.19569,0,1,0,0,0.0,1,1,1,0
2,2,,1,0,0,0,0,0,0.090909,0,...,0.192984,0,1,0,0,0.222222,1,1,1,0
3,3,,1,0,0,1,0,0,0.090909,0,...,0.19569,0,1,0,0,0.0,1,1,1,0
4,4,,1,0,0,1,0,0,0.090909,0,...,0.289893,0,0,1,0,0.0,1,1,1,1


In [14]:
X_train, Y_train = train.drop(columns=['0']), train['0']
X_test, Y_test = test.drop(columns=['0', 'Unnamed: 0']), test[['Unnamed: 0', '0']]

In [15]:
x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)

In [17]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [18]:
lr = LogisticRegression(class_weight='balanced')

parameters = {
    'C': [0.01, 0.1, 1, 2, 3]
}

log_reg_cv = GridSearchCV( lr, 
                           parameters, 
                           cv=skf, 
                           verbose=1, 
                           scoring='roc_auc',
                           n_jobs=-1,
                          )

log_reg_cv.fit(x_train, y_train)

log_model = log_reg_cv.best_estimator_

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:   17.4s finished


In [20]:
parameters = {'learning_rate': [0.01, 0.1, 1, 2]}


gb = GradientBoostingRegressor(random_state=42)

grad_cv = GridSearchCV( gb, 
                        parameters, 
                        cv=skf, 
                        verbose=1, 
                        scoring='roc_auc',
                        n_jobs=-1,
                    )

grad_cv.fit(x_train, y_train)

grad_model = grad_cv.best_estimator_

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  1.3min finished


In [21]:
log_predictions = []
grad_predictions = []

for train, val in skf.split(x_train, y_train):
    log_model.fit(x_train.iloc[train], y_train.iloc[train])
    grad_model.fit(x_train.iloc[train], y_train.iloc[train])
    log_predictions.append([y_train.iloc[val], log_model.predict_proba(x_train.iloc[val])[:,1]])
    grad_predictions.append([y_train.iloc[val], grad_model.predict(x_train.iloc[val])])
    
np.mean([roc_auc_score(i[0], i[1]) for i in log_predictions]), np.mean([roc_auc_score(i[0], i[1]) for i in grad_predictions])

(0.7251333997535544, 0.7205159483316551)

In [22]:
x_train["log_preds"] = 0
x_train["grad_preds"] = 0

for i, fold in enumerate(skf.split(x_train, y_train)):
    train, val = fold[0], fold[1]
    x_train.iloc[val, -2] = log_predictions[i][1]
    x_train.iloc[val, -1] = grad_predictions[i][1]
    
print(x_train.shape)
x_train.head()

(20435, 347)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,338,339,340,341,342,343,344,345,log_preds,grad_preds
14248,1,0,0,0,0,0,0.090909,0,0,1,...,0,0,0,0.0,1,1,1,1,0.101755,-0.017192
18936,1,0,0,1,0,0,0.181818,0,0,1,...,1,0,0,0.333333,0,1,1,0,0.540982,0.174505
24290,1,0,0,1,0,0,0.090909,0,0,1,...,1,0,0,0.444444,1,1,1,1,0.227982,0.143754
20990,1,0,0,1,0,0,0.090909,0,0,1,...,1,0,0,0.222222,1,1,1,1,0.8877,0.467831
2305,1,0,0,1,0,0,0.090909,0,0,1,...,0,1,0,0.222222,1,1,1,1,0.456707,0.12785


In [25]:
parameters = {'learning_rate': [0.01, 0.1, 1, 2]}


gbr = GradientBoostingRegressor(random_state=42)
              
grad_cv = GridSearchCV(gbr, 
                    parameters, 
                    n_jobs=-1, 
                    cv=skf, 
                    verbose=1, 
                    scoring='roc_auc'
                   )
              
grad_cv.fit(x_train, y_train)

model = grad_cv.best_estimator_
best_score = grad_cv.best_score_
print(best_score)

model.fit(x_train, y_train)

0.7257746041247851


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=42,
             subsample=1.0, verbose=0, warm_start=False)

In [26]:
log_prediction = log_model.predict(x_val)
grad_prediction = grad_model.predict(x_val)

x_val["log_preds"] = log_prediction
x_val["grad_preds"] = grad_prediction

prediction = model.predict(x_val)

print(roc_auc_score(y_val, prediction))

0.724010304434074


In [27]:
log_prediction = log_model.predict(X_test)
grad_prediction = grad_model.predict(X_test)

X_test["log_preds"] = log_prediction
X_test["grad_preds"] = grad_prediction

prediction = model.predict(X_test)

Y_test['0'] = prediction

In [28]:
Y_test.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,0.126743
1,1,0.342808
2,2,0.314817
3,3,0.311508
4,4,0.438408


In [29]:
Y_test.to_csv(path_join(CSV_DIR, 'submission_7task_new.csv'), index=False)