In [1]:
import pandas as pd
import numpy as np

from os.path import join as path_join

from sklearn.model_selection import \
    (StratifiedKFold, GridSearchCV, train_test_split)
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.metrics import roc_auc_score


import warnings
warnings.filterwarnings('ignore')

### Roc_auc_score on
 - Validation: 0.7205854842866108
 - Pseudo_test: 0.726261929305357
 - leaderbord: 0.73137475

In [2]:
CSV_DIR = r'../../data/Modulbank'

train = pd.read_csv(path_join(CSV_DIR, 'new_train.csv'))
test = pd.read_csv(path_join(CSV_DIR, 'new_test.csv'))

In [3]:
test.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,336,337,338,339,340,341,342,343,344,345
0,0,,1,0,0,1,0,0,0.136364,0,...,0.192984,0,1,0,0,0.222222,1,1,1,1
1,1,,1,0,0,1,0,0,0.181818,0,...,0.19569,0,1,0,0,0.0,1,1,1,0
2,2,,1,0,0,0,0,0,0.090909,0,...,0.192984,0,1,0,0,0.222222,1,1,1,0
3,3,,1,0,0,1,0,0,0.090909,0,...,0.19569,0,1,0,0,0.0,1,1,1,0
4,4,,1,0,0,1,0,0,0.090909,0,...,0.289893,0,0,1,0,0.0,1,1,1,1


In [4]:
X_train, Y_train = train.drop(columns=['0']), train['0']
X_test, Y_test = test.drop(columns=['0', 'Unnamed: 0']), test[['Unnamed: 0', '0']]

In [5]:
x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.33, random_state=42)

In [6]:
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [7]:
parameters = {'max_features': [3, 4, 5, 6, 7], 
              'min_samples_leaf': [3, 5],
              'max_depth': [2, 4, 6, 8, 10],
              'n_estimators': [5, 10, 20, 50],
             }

rfc = RandomForestRegressor(random_state=42, n_jobs=-1)

rfcv = GridSearchCV(rfc, 
                    parameters, 
                    cv=skf, 
                    verbose=1, 
                    scoring='roc_auc'
                   )
rfcv.fit(x_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed:  8.1min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
       error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_features': [3, 4, 5, 6, 7], 'min_samples_leaf': [3, 5], 'max_depth': [2, 4, 6, 8, 10], 'n_estimators': [5, 10, 20, 50]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [8]:
print(rfcv.best_score_)

0.703817174042295


In [9]:
parameters = {'max_features': [3, 4, 5, 6, 7], 
              'min_samples_leaf': [3, 5],
              'n_estimators': [5, 10, 20, 50],
              'max_depth': [2, 3, 4, 8, 12],
             }

etc = ExtraTreesRegressor(random_state=42, n_jobs=-1)

etcv = GridSearchCV(etc, 
                   parameters, 
                   cv=skf, 
                   verbose=1, 
                   scoring='roc_auc'
                  )

etcv.fit(x_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=1)]: Done 600 out of 600 | elapsed:  7.1min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=42, shuffle=True),
       error_score='raise',
       estimator=ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
          oob_score=False, random_state=42, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_features': [3, 4, 5, 6, 7], 'min_samples_leaf': [3, 5], 'n_estimators': [5, 10, 20, 50], 'max_depth': [2, 3, 4, 8, 12]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=1)

In [10]:
print(etcv.best_score_)

0.7090836562221701


In [15]:
rf = rfcv.best_estimator_
et = etcv.best_estimator_
rf_predictions = []
et_predictions = []

for train, val in skf.split(x_train, y_train):
    rf.fit(x_train.iloc[train], y_train.iloc[train])
    et.fit(x_train.iloc[train], y_train.iloc[train])
    rf_predictions.append([y_train.iloc[val], rf.predict(x_train.iloc[val])])
    et_predictions.append([y_train.iloc[val], et.predict(x_train.iloc[val])])
    
np.mean([roc_auc_score(i[0], i[1]) for i in rf_predictions]), np.mean([roc_auc_score(i[0], i[1]) for i in et_predictions])

(0.7038172425910952, 0.7090836097356151)

In [16]:
x_train["rf_preds"] = 0
x_train["et_preds"] = 0

for i, fold in enumerate(skf.split(x_train, y_train)):
    train, val = fold[0], fold[1]
    x_train.iloc[val, -2] = rf_predictions[i][1]
    x_train.iloc[val, -1] = et_predictions[i][1]
    
print(x_train.shape)
x_train.head()

(20435, 347)


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,338,339,340,341,342,343,344,345,rf_preds,et_preds
14248,1,0,0,0,0,0,0.090909,0,0,1,...,0,0,0,0.0,1,1,1,1,0.103445,0.105634
18936,1,0,0,1,0,0,0.181818,0,0,1,...,1,0,0,0.333333,0,1,1,0,0.190211,0.181321
24290,1,0,0,1,0,0,0.090909,0,0,1,...,1,0,0,0.444444,1,1,1,1,0.179309,0.170279
20990,1,0,0,1,0,0,0.090909,0,0,1,...,1,0,0,0.222222,1,1,1,1,0.406286,0.354928
2305,1,0,0,1,0,0,0.090909,0,0,1,...,0,1,0,0.222222,1,1,1,1,0.137041,0.191169


In [17]:
feature_importance = sorted(list(zip(x_train.columns, rf.feature_importances_)), key=lambda x: x[0], reverse=True)[:150]
feature_importance_list = [item[0] for item in feature_importance]
feature_importance_list.extend(['rf_preds', 'et_preds'])

parameters = {'max_features': [3, 4, 5, 6, 7], 
              'min_samples_leaf': [1, 3, 5], 
              'max_depth': [5, 10, 15, 20],
              'n_estimators': [5, 20, 25, 50, 100],
             }

etc = ExtraTreesRegressor(random_state=42, n_jobs=-1)

etcv = GridSearchCV(etc, 
                    parameters, 
                    n_jobs=-1, 
                    cv=skf, 
                    verbose=1, 
                    scoring='roc_auc'
                   )
etcv.fit(x_train[feature_importance_list], y_train)

model = etcv.best_estimator_
best_score = etcv.best_score_
print(best_score)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  6.0min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  6.9min finished


0.7205854842866108


In [20]:
rf_prediction = rf.predict(x_val)
et_prediction = et.predict(x_val)

x_val["rf_preds"] = rf_prediction
x_val["et_preds"] = et_prediction

prediction = model.predict(x_val[feature_importance_list])

print(roc_auc_score(y_val, prediction))

0.726261929305357


In [22]:
rf_prediction = rf.predict(X_test)
et_prediction = et.predict(X_test)

X_test["rf_preds"] = rf_prediction
X_test["et_preds"] = et_prediction

prediction = model.predict(X_test[feature_importance_list])

Y_test['0'] = prediction

In [23]:
Y_test.head()

Unnamed: 0.1,Unnamed: 0,0
0,0,0.180624
1,1,0.328941
2,2,0.22385
3,3,0.269618
4,4,0.46121


In [25]:
Y_test.to_csv(path_join(CSV_DIR, 'submission_Stacking.csv'), index=False)