In [1]:
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import numpy as np
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

In [2]:
# Load the train and test splits of the challenge data from the working directory.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('onetwotrip_challenge_train.csv', 'onetwotrip_challenge_test.csv')
)


In [3]:
# Balance the classes by random under-sampling: keep every goal1 == 1 row and
# draw an equally sized sample, without replacement, from the goal1 == 0 rows.
min_cl_ind = df_train[df_train['goal1'] == 1].index
maj_cl_ind = df_train[df_train['goal1'] == 0].index
min_cl_len = len(min_cl_ind)

# Draw from a dedicated, seeded generator rather than the global NumPy RNG so
# that the selected rows (and everything trained on them downstream) are the
# same on every Restart & Run All.
rand_maj_ind = np.random.RandomState(17).choice(maj_cl_ind, min_cl_len, replace=False)

print(len(rand_maj_ind))

4341


In [4]:
# Row labels of the balanced training set: all goal1 == 1 rows followed by the
# randomly selected goal1 == 0 rows.
under_sample_ind = np.concatenate((min_cl_ind, rand_maj_ind))

# Pull those rows, with every column, out of the full training frame.
under_sample = df_train.loc[under_sample_ind, :]

In [5]:
# Model inputs: every column whose name contains the substring 'field'.
features = [column for column in under_sample.columns if 'field' in column]

In [6]:
# Random forest whose leaf size is tuned by grid search; tree depth and the
# number of features tried per split are each pinned to a single value.
forest = RandomForestClassifier(
    n_estimators=700,
    n_jobs=-1,
    random_state=31,
)
forest_params = dict(
    max_depth=[13],
    max_features=[4],
    min_samples_leaf=[4, 5],
)

# Shuffled, stratified 5-fold splitter with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

forest_grid = GridSearchCV(
    estimator=forest,
    param_grid=forest_params,
    scoring='roc_auc',
    cv=skf,
    n_jobs=-1,
    verbose=True,
)

# Fit on the balanced (under-sampled) rows only.
forest_grid.fit(under_sample[features], under_sample['goal1'])

# Show the selected parameter combination together with its ROC AUC score.
forest_grid.best_params_, forest_grid.best_score_

Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    7.5s remaining:    5.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    9.3s finished


({'max_depth': 13, 'max_features': 4, 'min_samples_leaf': 4},
 0.6723403102514163)

In [7]:
# Predict class probabilities for the test rows, using the same feature
# columns that were used for fitting.
result = forest_grid.predict_proba(df_test.loc[:, features])

In [8]:
# Write the submission file: the second probability column (result[:, 1]) as
# 'proba', indexed by the test set's 'orderid' values.
# NOTE(review): column 1 is presumably P(goal1 == 1) — confirm via forest_grid.classes_.
pd.DataFrame(
    data=result[:, 1],
    index=df_test['orderid'],
    columns=['proba'],
).to_csv('sub01.csv')