In [1]:
import pandas as pd
import numpy as np

from os.path import join as path_join

from sklearn.model_selection import \
    (cross_val_score, StratifiedKFold, GridSearchCV, train_test_split)
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score


import warnings
# Installs a blanket 'ignore' filter: no category or module is given, so every
# warning raised after this point in the session is suppressed.
# NOTE(review): this also hides library deprecation and pandas assignment warnings;
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

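### ROC AUC score on
 - Validation (mean 5-fold cross-validation score of the best grid-search model): 0.7209632573298573
 - Pseudo_test (33% hold-out split of the training data): 0.7232600126577241
 - Leaderboard: 0.72776497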

In [2]:
# Directory with the prepared Modulbank CSV files, relative to this notebook.
CSV_DIR = r'../../data/Modulbank'

# Read the labelled and the unlabelled frame, in that order.
train, test = (
    pd.read_csv(path_join(CSV_DIR, csv_name))
    for csv_name in ('new_train.csv', 'new_test.csv')
)

In [3]:
# Preview the first five rows of the unlabelled frame.
test.head(n=5)

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,336,337,338,339,340,341,342,343,344,345
0,0,,1,0,0,1,0,0,0.136364,0,...,0.192984,0,1,0,0,0.222222,1,1,1,1
1,1,,1,0,0,1,0,0,0.181818,0,...,0.19569,0,1,0,0,0.0,1,1,1,0
2,2,,1,0,0,0,0,0,0.090909,0,...,0.192984,0,1,0,0,0.222222,1,1,1,0
3,3,,1,0,0,1,0,0,0.090909,0,...,0.19569,0,1,0,0,0.0,1,1,1,0
4,4,,1,0,0,1,0,0,0.090909,0,...,0.289893,0,0,1,0,0.0,1,1,1,1


In [4]:
# Labelled data: column '0' is the target, every remaining column is a model input.
X_train, Y_train = train.drop(columns=['0']), train['0']

# Unlabelled data: 'Unnamed: 0' is excluded from the model inputs and kept, together
# with the target column '0', as the frame that later receives the predictions.
# NOTE(review): 'Unnamed: 0' looks like a saved row index (0, 1, 2, ... in the preview)
# and is dropped from test only — presumably train has no such column; confirm.
X_test = test.drop(columns=['0', 'Unnamed: 0'])

# .copy() makes Y_test an independent frame rather than a selection of `test`,
# so assigning predictions into its '0' column is an unambiguous write
# (no SettingWithCopyWarning, and `test` itself is guaranteed to stay untouched).
Y_test = test[['Unnamed: 0', '0']].copy()

In [5]:
# Keep 33% of the labelled rows aside as a hold-out set; the seed fixes the split.
x_train, x_val, y_train, y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.33,
    random_state=42,
)

In [6]:
# Baseline: default random forest scored by ROC AUC with 5-fold stratified CV.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): oob_score=True is kept as in the original, but the resulting
# oob_score_ attribute is not read in any visible cell.
rfc = RandomForestRegressor(random_state=42, n_jobs=-1, oob_score=True)

# One ROC AUC value per fold, then averaged into a single figure.
fold_scores = cross_val_score(
    estimator=rfc,
    X=x_train,
    y=y_train,
    cv=skf,
    scoring='roc_auc',
)
results = fold_scores.mean()

print(results)

0.6721329928817211


In [7]:
# Fit the baseline forest on the training part and score it on the hold-out part.
# fit() returns the estimator itself, so the prediction can be chained onto it.
prediction = rfc.fit(x_train, y_train).predict(x_val)

print(roc_auc_score(y_true=y_val, y_score=prediction))

0.6680659030239532


In [8]:
# Exhaustive hyper-parameter search for the random forest, scored by ROC AUC.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Rebinds `rfc` to a fresh, unfitted forest (this time without oob_score).
rfc = RandomForestRegressor(random_state=42, n_jobs=-1)

# 4 values per parameter -> 4 * 4 * 4 * 4 = 256 candidates, each fitted on 5 folds.
parameters = dict(
    n_estimators=[10, 25, 50, 100],
    max_features=[4, 7, 10, 13],
    min_samples_leaf=[1, 3, 5, 7],
    max_depth=[5, 10, 15, 20],
)

gcv = GridSearchCV(
    estimator=rfc,
    param_grid=parameters,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    verbose=1,
)
gcv.fit(x_train, y_train)

# Best candidate (refitted on all of x_train by GridSearchCV) and its mean CV score.
best_model = gcv.best_estimator_
best_score = gcv.best_score_

print(best_score)

Fitting 5 folds for each of 256 candidates, totalling 1280 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   50.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  6.3min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed: 18.3min
[Parallel(n_jobs=-1)]: Done 1280 out of 1280 | elapsed: 19.9min finished


0.7209632573298573


In [9]:
# Display the estimator selected by the grid search, including its hyper-parameters.
best_model

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features=13, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=5,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=42,
           verbose=0, warm_start=False)

In [10]:
# Score the tuned forest on the hold-out part that the grid search never saw.
prediction = best_model.predict(x_val)

print(roc_auc_score(y_true=y_val, y_score=prediction))

0.7232600126577241


In [11]:
# Predict with the tuned forest on the unlabelled features and store the
# scores in the target column of the submission frame.
test_scores = best_model.predict(X_test)
Y_test['0'] = test_scores

In [12]:
# Preview the first five rows of the submission frame.
Y_test.head(n=5)

Unnamed: 0.1,Unnamed: 0,0
0,0,0.194408
1,1,0.348158
2,2,0.200254
3,3,0.197709
4,4,0.53931


In [14]:
# Write the submission next to the input data, without the DataFrame index.
submission_path = path_join(CSV_DIR, 'submission_RandomForest.csv')
Y_test.to_csv(submission_path, index=False)