### Import packages

In [5]:
import numpy as np
import pandas as pd
import datetime
import pickle
import itertools

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import manhattan_distances,pairwise_distances
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from metric_learn import NCA

from sklearn.metrics import roc_auc_score
from imblearn.over_sampling import SMOTE

### Read data

In [6]:
X_train = pd.read_csv('X_train_unique.csv')
X_val = pd.read_csv('X_val_unique.csv')
X_test = pd.read_csv('X_test_unique.csv')

y_train = pd.read_csv('y_train_unique.csv')
y_val = pd.read_csv('y_val_unique.csv')

X_train.set_index('unique_id',inplace=True)
X_val.set_index('unique_id',inplace=True)
X_test.set_index('unique_id',inplace=True)
y_train.set_index('unique_id',inplace=True)
y_val.set_index('unique_id',inplace=True)

X_train_experienced = X_train[X_train['experience_flag']==1]
y_train_experienced = y_train[y_train.index.isin(list(X_train_experienced.index))]
X_val_experienced = X_val[X_val['experience_flag']==1]
y_val_experienced = y_val[y_val.index.isin(list(X_val_experienced.index))]
X_test_experienced = X_test[X_test['experience_flag']==1]

X_train_cold = X_train[X_train['experience_flag']==0]
y_train_cold = y_train[y_train.index.isin(list(X_train_cold.index))]
X_val_cold = X_val[X_val['experience_flag']==0]
y_val_cold = y_val[y_val.index.isin(list(X_val_cold.index))]
X_test_cold = X_test[X_test['experience_flag']==0]

In [7]:
X_train_experienced.shape

(2063, 1539)

In [8]:
X_val_experienced.shape

(520, 1539)

In [3]:
X_train_under = pd.read_csv('X_train_under.csv')
y_train_under = pd.read_csv('y_train_under.csv')

X_train_under.set_index('unique_id',inplace=True)
y_train_under.set_index('unique_id',inplace=True)

In [3]:
X_train_experienced_under = pd.read_csv('X_train_experienced_under.csv')
y_train_experienced_under = pd.read_csv('y_train_experienced_under.csv')

X_train_experienced_under.set_index('unique_id',inplace=True)
y_train_experienced_under.set_index('unique_id',inplace=True)

In [4]:
X_train_experienced_under.shape

(3104, 1539)

In [5]:
X_train_cold_under = pd.read_csv('X_train_cold_under.csv')
y_train_cold_under = pd.read_csv('y_train_cold_under.csv')

X_train_cold_under.set_index('unique_id',inplace=True)
y_train_cold_under.set_index('unique_id',inplace=True)

In [6]:
id_test_df = pd.read_csv('test_ids_in_prediction.csv')
id_test_df

Unnamed: 0,unique_id
0,9
1,18
2,21
3,25
4,31
...,...
2375,7982
2376,7990
2377,7993
2378,7994


In [7]:
id_test_df.shape

(2380, 1)

In [8]:
X_train.sort_index(inplace=True)
X_val.sort_index(inplace=True)
X_test.sort_index(inplace=True)
y_train.sort_index(inplace=True)
y_val.sort_index(inplace=True)

X_train_under.sort_index(inplace=True)
y_train_under.sort_index(inplace=True)
X_train_experienced_under.sort_index(inplace=True)
y_train_experienced_under.sort_index(inplace=True)
X_train_cold_under.sort_index(inplace=True)
X_train_cold_under.sort_index(inplace=True)


### Submission 11

In [15]:
lgbm = LGBMClassifier(max_depth=2,learning_rate=0.01,n_estimators=100,min_child_samples=100,colsample_bytree=0.5)
lgbm.fit(X_train_cold_under,y_train_cold_under)

pred_train = pd.DataFrame(lgbm.predict_proba(X_train)[:,1],columns=['pred'],index=X_train.index)
pred_val = pd.DataFrame(lgbm.predict_proba(X_val)[:,1],columns=['pred'],index=X_val.index)
pred_test = pd.DataFrame(lgbm.predict_proba(X_test)[:,1],columns=['pred'],index=X_test.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


In [10]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.8514203072108882
Val ROC:  0.8444668833422739


In [12]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.359959,0.104381,0.464341,0.38806,0.085271,0.473331


In [17]:
sub11 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub11['pred'] = sub11['pred'].apply(lambda x: round(x,3))
sub11_txt = ''
for prob in list(sub11['pred'].values):
    sub11_txt = sub11_txt+','+str(prob)
sub11_txt = sub11_txt[1:]

sub11_txt

'0.675,0.603,0.56,0.756,0.699,0.763,0.534,0.305,0.614,0.644,0.483,0.362,0.768,0.768,0.532,0.673,0.461,0.303,0.551,0.75,0.756,0.551,0.683,0.621,0.718,0.632,0.739,0.712,0.573,0.557,0.698,0.724,0.547,0.715,0.768,0.753,0.489,0.554,0.768,0.609,0.565,0.756,0.59,0.661,0.765,0.711,0.761,0.388,0.582,0.756,0.308,0.763,0.732,0.301,0.318,0.298,0.768,0.746,0.768,0.678,0.745,0.634,0.417,0.677,0.763,0.64,0.768,0.613,0.761,0.768,0.756,0.606,0.751,0.58,0.61,0.673,0.604,0.756,0.693,0.732,0.315,0.547,0.765,0.327,0.742,0.627,0.735,0.712,0.617,0.596,0.761,0.587,0.506,0.763,0.511,0.766,0.768,0.694,0.456,0.634,0.723,0.637,0.715,0.768,0.549,0.427,0.485,0.329,0.567,0.462,0.563,0.768,0.755,0.729,0.312,0.759,0.595,0.76,0.715,0.768,0.735,0.768,0.329,0.58,0.621,0.768,0.761,0.666,0.305,0.297,0.742,0.561,0.294,0.688,0.638,0.755,0.573,0.768,0.489,0.648,0.393,0.636,0.577,0.524,0.692,0.768,0.547,0.596,0.339,0.761,0.763,0.721,0.768,0.761,0.534,0.675,0.657,0.573,0.761,0.537,0.743,0.595,0.768,0.544,0.761,0.722,0.61,0.74,0

In [20]:
sub11.to_csv('sub11.csv')

### Submission 12

In [21]:
xgb = XGBClassifier(max_depth=2,learning_rate=0.01,n_estimators=100,min_child_samples=20,colsample_bytree=1)
xgb.fit(X_train_cold_under,y_train_cold_under)

pred_train = pd.DataFrame(xgb.predict_proba(X_train)[:,1],columns=['pred'],index=X_train.index)
pred_val = pd.DataFrame(xgb.predict_proba(X_val)[:,1],columns=['pred'],index=X_val.index)
pred_test = pd.DataFrame(xgb.predict_proba(X_test)[:,1],columns=['pred'],index=X_test.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


Parameters: { "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [22]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.8570074927989235
Val ROC:  0.8496138055318896


In [23]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.359959,0.097938,0.457897,0.382632,0.080103,0.462736


In [24]:
sub12 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub12['pred'] = sub12['pred'].apply(lambda x: round(x,3))
sub12_txt = ''
for prob in list(sub12['pred'].values):
    sub12_txt = sub12_txt+','+str(prob)
sub12_txt = sub12_txt[1:]

sub12_txt

'0.677,0.618,0.581,0.736,0.711,0.765,0.515,0.3,0.645,0.613,0.459,0.354,0.763,0.765,0.577,0.688,0.487,0.3,0.595,0.748,0.758,0.549,0.686,0.636,0.738,0.628,0.733,0.728,0.58,0.581,0.714,0.731,0.577,0.73,0.765,0.744,0.494,0.581,0.765,0.609,0.577,0.76,0.59,0.692,0.75,0.685,0.758,0.419,0.586,0.742,0.306,0.765,0.724,0.308,0.308,0.291,0.765,0.746,0.765,0.688,0.74,0.628,0.408,0.684,0.765,0.634,0.763,0.628,0.762,0.765,0.762,0.609,0.744,0.558,0.618,0.675,0.626,0.763,0.68,0.731,0.311,0.577,0.752,0.325,0.746,0.645,0.741,0.722,0.611,0.576,0.765,0.566,0.503,0.763,0.515,0.765,0.763,0.707,0.461,0.628,0.689,0.626,0.711,0.765,0.587,0.433,0.483,0.341,0.57,0.449,0.587,0.765,0.756,0.72,0.323,0.754,0.602,0.762,0.712,0.765,0.726,0.765,0.339,0.582,0.598,0.765,0.763,0.686,0.3,0.293,0.735,0.574,0.29,0.677,0.618,0.754,0.585,0.765,0.494,0.61,0.375,0.631,0.556,0.515,0.702,0.765,0.526,0.561,0.347,0.758,0.752,0.726,0.765,0.756,0.532,0.675,0.649,0.614,0.756,0.515,0.733,0.607,0.763,0.552,0.756,0.74,0.638,0.747,0.546,0.3

### Submission 13

* Ensemble

In [28]:
th = 0.5
sub13 = pd.merge(sub11,sub12,how='left',on='unique_id')
sub13['pred'] = th*sub13['pred_x']+(1-th)*sub13['pred_y']
sub13['pred'] = sub13['pred'].apply(lambda x: round(x,3))
sub13_txt = ''
for prob in list(sub13['pred'].values):
    sub13_txt = sub13_txt+','+str(prob)
sub13_txt = sub13_txt[1:]

sub13_txt


'0.676,0.611,0.571,0.746,0.705,0.764,0.524,0.302,0.629,0.629,0.471,0.358,0.766,0.766,0.554,0.68,0.474,0.301,0.573,0.749,0.757,0.55,0.685,0.629,0.728,0.63,0.736,0.72,0.577,0.569,0.706,0.728,0.562,0.722,0.766,0.748,0.491,0.568,0.766,0.609,0.571,0.758,0.59,0.676,0.758,0.698,0.76,0.403,0.584,0.749,0.307,0.764,0.728,0.304,0.313,0.294,0.766,0.746,0.766,0.683,0.742,0.631,0.412,0.681,0.764,0.637,0.766,0.621,0.762,0.766,0.759,0.607,0.748,0.569,0.614,0.674,0.615,0.76,0.686,0.732,0.313,0.562,0.758,0.326,0.744,0.636,0.738,0.717,0.614,0.586,0.763,0.577,0.504,0.763,0.513,0.766,0.766,0.7,0.459,0.631,0.706,0.631,0.713,0.766,0.568,0.43,0.484,0.335,0.569,0.456,0.575,0.766,0.756,0.724,0.318,0.756,0.599,0.761,0.714,0.766,0.73,0.766,0.334,0.581,0.609,0.766,0.762,0.676,0.302,0.295,0.738,0.568,0.292,0.682,0.628,0.754,0.579,0.766,0.491,0.629,0.384,0.633,0.567,0.52,0.697,0.766,0.536,0.579,0.343,0.76,0.758,0.724,0.766,0.758,0.533,0.675,0.653,0.593,0.758,0.526,0.738,0.601,0.766,0.548,0.758,0.731,0.624,0.744,0.56

In [25]:
sub13_prep = pd.merge(sub11,sub12,how='left',on='unique_id')
sub13_prep

Unnamed: 0,unique_id,pred_x,pred_y
0,9,0.675,0.677
1,18,0.603,0.618
2,21,0.560,0.581
3,25,0.756,0.736
4,31,0.699,0.711
...,...,...,...
2375,7982,0.486,0.485
2376,7990,0.337,0.332
2377,7993,0.485,0.485
2378,7994,0.485,0.480


In [26]:
sub13_prep['pred'] = th*sub13_prep['pred_x']+(1-th)*sub13_prep['pred_y']

### Submission 14

In [9]:
imp = SimpleImputer()
imp.fit(X_train)

SimpleImputer()

In [10]:
X_train_imputed = pd.DataFrame(imp.transform(X_train),columns= X_train.columns, index=X_train.index)
X_train_under_imputed = pd.DataFrame(imp.transform(X_train_under),columns= X_train_under.columns, index=X_train_under.index)
X_train_experienced_imputed = pd.DataFrame(imp.transform(X_train_experienced),columns= X_train_experienced.columns, index=X_train_experienced.index)
X_train_experienced_under_imputed = pd.DataFrame(imp.transform(X_train_experienced_under),columns= X_train_experienced_under.columns, index=X_train_experienced_under.index)
X_train_cold_imputed = pd.DataFrame(imp.transform(X_train_cold),columns= X_train_cold.columns, index=X_train_cold.index)
X_train_cold_under_imputed = pd.DataFrame(imp.transform(X_train_cold_under),columns= X_train_cold_under.columns, index=X_train_cold_under.index)

In [11]:
X_val_imputed = pd.DataFrame(imp.transform(X_val),columns= X_val.columns, index=X_val.index)
X_val_experienced_imputed = pd.DataFrame(imp.transform(X_val_experienced),columns= X_val_experienced.columns, index=X_val_experienced.index)
X_val_cold_imputed = pd.DataFrame(imp.transform(X_val_cold),columns= X_val_cold.columns, index=X_val_cold.index)

In [12]:
X_test_imputed = pd.DataFrame(imp.transform(X_test),columns= X_test.columns, index=X_test.index)
X_test_experienced_imputed = pd.DataFrame(imp.transform(X_test_experienced),columns= X_test_experienced.columns, index=X_test_experienced.index)
X_test_cold_imputed = pd.DataFrame(imp.transform(X_test_cold),columns= X_test_cold.columns, index=X_test_cold.index)

In [13]:
rf = XGBClassifier(max_depth=10,criterion='entropy',n_estimators=100,min_child_samples=20,colsample_bytree=0.8)
rf.fit(X_train_experienced_under_imputed,y_train_experienced_under)

pred_train = pd.DataFrame(rf.predict_proba(X_train_imputed)[:,1],columns=['pred'],index=X_train_imputed.index)
pred_val = pd.DataFrame(rf.predict_proba(X_val_imputed)[:,1],columns=['pred'],index=X_val_imputed.index)
pred_test = pd.DataFrame(rf.predict_proba(X_test_imputed)[:,1],columns=['pred'],index=X_test_imputed.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


Parameters: { "criterion", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [14]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.9916020292668568
Val ROC:  0.8513037350246653


In [15]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.126105,0.0,0.126105,0.278155,0.134367,0.412522


In [16]:
sub14 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub14['pred'] = sub14['pred'].apply(lambda x: round(x,3))
sub14_txt = ''
for prob in list(sub14['pred'].values):
    sub14_txt = sub14_txt+','+str(prob)
sub14_txt = sub14_txt[1:]

sub14_txt

'0.997,0.965,0.994,0.998,0.997,1.0,0.49,0.011,0.958,0.994,0.055,0.014,0.999,1.0,0.959,0.998,0.011,0.002,0.985,0.996,1.0,0.992,0.999,0.999,0.999,0.997,0.966,0.999,0.892,0.925,0.998,0.998,0.995,0.976,1.0,0.997,0.546,0.964,0.991,0.974,0.913,0.988,0.998,0.904,0.999,0.994,0.999,0.311,0.56,0.849,0.007,0.999,0.996,0.002,0.009,0.009,0.997,0.999,0.997,0.998,1.0,0.02,0.041,0.998,1.0,0.759,0.999,0.995,0.999,0.999,1.0,0.998,0.999,0.999,0.887,0.99,0.985,1.0,0.997,0.995,0.019,0.962,0.999,0.08,0.999,0.906,0.718,1.0,0.997,0.969,1.0,0.985,0.441,0.999,0.012,0.977,0.996,0.98,0.052,1.0,0.989,0.999,0.999,0.999,0.859,0.348,0.775,0.32,0.998,0.845,0.982,1.0,0.998,0.997,0.931,0.998,0.928,0.998,0.992,0.999,0.996,1.0,0.519,0.453,0.959,0.983,0.997,1.0,0.008,0.001,0.983,0.312,0.002,0.997,0.97,0.999,0.243,0.998,0.993,0.998,0.068,0.208,0.998,0.965,0.929,0.999,0.935,0.999,0.021,0.983,1.0,0.995,0.999,0.999,0.66,0.974,0.991,0.988,0.999,0.601,1.0,0.747,0.996,0.971,1.0,0.994,0.794,0.994,0.996,0.212,0.999,0.478,0.548,0.99

In [18]:
pred_train_rf1 = pred_train.copy()
pred_val_rf1 = pred_val.copy()
pred_test_rf1 = pred_test.copy()

### Submission 15

In [19]:
rf = XGBClassifier(max_depth=10,criterion='entropy',n_estimators=200,min_child_samples=20,colsample_bytree=0.8)
rf.fit(X_train_under_imputed,y_train_under)

pred_train = pd.DataFrame(rf.predict_proba(X_train_imputed)[:,1],columns=['pred'],index=X_train_imputed.index)
pred_val = pd.DataFrame(rf.predict_proba(X_val_imputed)[:,1],columns=['pred'],index=X_val_imputed.index)
pred_test = pd.DataFrame(rf.predict_proba(X_test_imputed)[:,1],columns=['pred'],index=X_test_imputed.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


Parameters: { "criterion", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [20]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.9876854583809317
Val ROC:  0.8522538821046284


In [21]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.124405,0.0,0.124405,0.276798,0.155039,0.431837


In [22]:
sub15 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub15['pred'] = sub15['pred'].apply(lambda x: round(x,3))
sub15_txt = ''
for prob in list(sub15['pred'].values):
    sub15_txt = sub15_txt+','+str(prob)
sub15_txt = sub15_txt[1:]

sub15_txt

'0.999,0.939,0.994,1.0,1.0,0.999,0.361,0.005,0.981,0.997,0.005,0.001,0.999,1.0,0.974,1.0,0.02,0.007,0.982,0.997,1.0,0.998,1.0,1.0,1.0,0.986,0.997,1.0,0.981,0.997,1.0,0.994,0.994,0.982,0.992,1.0,0.864,0.96,0.997,0.963,0.982,0.993,0.994,1.0,1.0,0.994,1.0,0.293,0.029,0.966,0.019,1.0,0.99,0.004,0.023,0.003,0.999,1.0,0.998,1.0,1.0,0.435,0.009,0.997,1.0,0.975,1.0,0.998,0.991,1.0,1.0,0.99,0.999,0.998,0.829,0.904,0.999,1.0,0.995,0.987,0.002,0.714,0.999,0.086,1.0,0.981,0.849,1.0,0.994,0.997,1.0,0.999,0.933,0.998,0.482,0.998,0.997,0.932,0.714,1.0,0.949,0.997,1.0,0.999,0.886,0.463,0.312,0.113,0.983,0.336,0.996,1.0,0.996,1.0,0.501,0.999,0.984,0.998,0.994,0.999,1.0,1.0,0.581,0.859,0.995,0.964,1.0,1.0,0.002,0.0,0.999,0.842,0.008,0.998,0.994,1.0,0.758,0.999,0.91,0.999,0.105,0.557,0.993,0.981,0.907,1.0,0.803,0.995,0.019,0.997,1.0,0.996,0.998,1.0,0.401,0.999,0.695,0.996,0.999,0.9,1.0,0.968,0.997,0.899,1.0,0.995,0.85,0.997,0.999,0.074,1.0,0.03,0.749,1.0,0.994,0.838,0.999,0.967,0.215,0.936,0.067,0.002,0.

In [23]:
pred_train_rf2 = pred_train.copy()
pred_val_rf2 = pred_val.copy()
pred_test_rf2 = pred_test.copy()

### Submission 16

In [27]:
pred_test_rf1['pred_cold'] = pred_test_rf1['pred']
pred_test_rf2['pred_exp'] = pred_test_rf2['pred']

sub16_df = pd.merge(X_test[['experience_flag']],pred_test_rf1,how='left',right_index=True,left_index=True)
sub16_df = pd.merge(sub16_df,pred_test_rf2,how='left',right_index=True,left_index=True)
sub16_df

Unnamed: 0_level_0,experience_flag,pred_x,pred_cold,pred_y,pred_exp
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,1,0.997364,0.997364,0.998875,0.998875
18,1,0.964852,0.964852,0.939222,0.939222
21,1,0.994150,0.994150,0.994372,0.994372
25,1,0.998056,0.998056,0.999699,0.999699
31,1,0.996662,0.996662,0.999612,0.999612
...,...,...,...,...,...
7982,0,0.077783,0.077783,0.606835,0.606835
7990,0,0.422620,0.422620,0.086012,0.086012
7993,0,0.380592,0.380592,0.794154,0.794154
7994,0,0.880442,0.880442,0.049052,0.049052


In [28]:
sub16_df['pred'] = np.where(sub16_df['experience_flag']==1,sub16_df['pred_exp'],sub16_df['pred_cold'])

In [29]:
sub16 = pd.merge(id_test_df,sub16_df.reset_index(),how='left',on='unique_id')
sub16['pred'] = sub16['pred'].apply(lambda x: round(x,3))
sub16_txt = ''
for prob in list(sub16['pred'].values):
    sub16_txt = sub16_txt+','+str(prob)
sub16_txt = sub16_txt[1:]

sub16_txt

'0.999,0.939,0.994,1.0,1.0,0.999,0.361,0.005,0.981,0.997,0.005,0.001,0.999,1.0,0.974,1.0,0.02,0.007,0.982,0.997,1.0,0.998,1.0,1.0,1.0,0.986,0.997,1.0,0.981,0.997,1.0,0.994,0.994,0.982,0.992,1.0,0.546,0.96,0.997,0.963,0.982,0.993,0.998,1.0,1.0,0.994,1.0,0.293,0.56,0.966,0.019,1.0,0.99,0.004,0.009,0.003,0.999,1.0,0.998,1.0,1.0,0.435,0.009,0.997,1.0,0.759,1.0,0.998,0.991,1.0,1.0,0.99,0.999,0.998,0.829,0.99,0.999,1.0,0.995,0.987,0.002,0.714,0.999,0.086,1.0,0.981,0.849,1.0,0.994,0.997,1.0,0.999,0.933,0.998,0.482,0.998,0.997,0.932,0.714,1.0,0.949,0.997,1.0,0.999,0.886,0.348,0.775,0.32,0.983,0.336,0.996,1.0,0.996,1.0,0.931,0.999,0.984,0.998,0.994,0.999,1.0,1.0,0.519,0.859,0.995,0.964,1.0,1.0,0.002,0.0,0.999,0.842,0.008,0.998,0.994,1.0,0.758,0.999,0.993,0.999,0.105,0.557,0.993,0.981,0.907,1.0,0.803,0.995,0.019,0.997,1.0,0.996,0.998,1.0,0.401,0.974,0.695,0.996,0.999,0.9,1.0,0.968,0.997,0.971,1.0,0.995,0.85,0.997,0.999,0.212,1.0,0.03,0.749,1.0,0.994,0.838,0.999,0.967,0.113,0.936,0.067,0.002,0.99

### Submission 17

In [9]:
lgbm = LGBMClassifier(max_depth=2,learning_rate=0.1,n_estimators=200,min_child_samples=20,colsample_bytree=1)
lgbm.fit(X_train,y_train)

pred_train = pd.DataFrame(lgbm.predict_proba(X_train)[:,1],columns=['pred'],index=X_train.index)
pred_val = pd.DataFrame(lgbm.predict_proba(X_val)[:,1],columns=['pred'],index=X_val.index)
pred_test = pd.DataFrame(lgbm.predict_proba(X_test)[:,1],columns=['pred'],index=X_test.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


In [10]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.9425785329076932
Val ROC:  0.8703101827017133


In [11]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.094494,0.21134,0.305834,0.137042,0.317829,0.454872


In [12]:
sub17 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub17['pred'] = sub17['pred'].apply(lambda x: round(x,3))
sub17_txt = ''
for prob in list(sub17['pred'].values):
    sub17_txt = sub17_txt+','+str(prob)
sub17_txt = sub17_txt[1:]

sub17_txt

'0.987,0.965,0.946,0.994,0.961,0.996,0.686,0.291,0.898,0.982,0.371,0.088,0.994,0.992,0.943,0.992,0.353,0.169,0.952,0.959,0.998,0.917,0.994,0.981,0.991,0.98,0.974,0.994,0.956,0.83,0.99,0.879,0.958,0.988,0.984,0.99,0.734,0.653,0.991,0.966,0.951,0.895,0.887,0.927,0.986,0.947,0.996,0.429,0.737,0.751,0.217,0.99,0.953,0.051,0.294,0.101,0.984,0.994,0.985,0.991,0.99,0.732,0.296,0.981,0.982,0.786,0.99,0.957,0.966,0.986,0.994,0.961,0.98,0.976,0.926,0.834,0.926,0.995,0.983,0.945,0.329,0.853,0.995,0.095,0.988,0.973,0.713,0.993,0.946,0.934,0.995,0.95,0.857,0.993,0.459,0.952,0.916,0.804,0.388,0.984,0.959,0.993,0.991,0.97,0.677,0.65,0.678,0.598,0.946,0.399,0.987,0.999,0.966,0.998,0.482,0.948,0.913,0.982,0.968,0.943,0.998,0.996,0.537,0.898,0.962,0.993,0.975,0.991,0.247,0.067,0.996,0.818,0.11,0.965,0.925,0.983,0.809,0.978,0.727,0.984,0.191,0.758,0.968,0.954,0.671,0.982,0.678,0.962,0.22,0.972,0.993,0.983,0.965,0.995,0.37,0.785,0.967,0.946,0.986,0.816,0.983,0.958,0.989,0.831,0.989,0.966,0.865,0.954,0.976

In [13]:
pred_train_lgbm_cold = pred_train.copy()
pred_val_lgbm_cold = pred_val.copy()
pred_test_lgbm_cold = pred_test.copy()

### Submission 18

In [14]:
lgbm = LGBMClassifier(max_depth=2,learning_rate=0.1,n_estimators=200,min_child_samples=20,colsample_bytree=0.5)
lgbm.fit(X_train,y_train)

pred_train = pd.DataFrame(lgbm.predict_proba(X_train)[:,1],columns=['pred'],index=X_train.index)
pred_val = pd.DataFrame(lgbm.predict_proba(X_val)[:,1],columns=['pred'],index=X_val.index)
pred_test = pd.DataFrame(lgbm.predict_proba(X_test)[:,1],columns=['pred'],index=X_test.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


In [15]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.9392778205092265
Val ROC:  0.8698596517062327


In [16]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.094154,0.226804,0.320958,0.142469,0.297158,0.439627


In [17]:
sub18 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub18['pred'] = sub18['pred'].apply(lambda x: round(x,3))
sub18_txt = ''
for prob in list(sub18['pred'].values):
    sub18_txt = sub18_txt+','+str(prob)
sub18_txt = sub18_txt[1:]

sub18_txt

'0.988,0.964,0.936,0.993,0.962,0.995,0.724,0.281,0.841,0.978,0.376,0.108,0.99,0.991,0.906,0.995,0.198,0.23,0.916,0.947,0.996,0.871,0.986,0.983,0.99,0.976,0.979,0.992,0.958,0.751,0.988,0.924,0.964,0.984,0.978,0.989,0.73,0.603,0.99,0.969,0.952,0.887,0.898,0.881,0.991,0.958,0.996,0.463,0.782,0.67,0.175,0.99,0.942,0.072,0.32,0.093,0.973,0.986,0.982,0.992,0.986,0.668,0.172,0.977,0.983,0.803,0.983,0.956,0.98,0.983,0.996,0.947,0.955,0.972,0.935,0.829,0.882,0.99,0.978,0.923,0.306,0.851,0.992,0.144,0.983,0.978,0.645,0.993,0.946,0.893,0.995,0.964,0.866,0.987,0.514,0.931,0.957,0.755,0.357,0.977,0.951,0.983,0.988,0.974,0.672,0.547,0.641,0.574,0.953,0.521,0.974,0.999,0.951,0.996,0.572,0.95,0.903,0.968,0.943,0.94,0.996,0.997,0.585,0.862,0.955,0.99,0.963,0.985,0.267,0.115,0.996,0.861,0.118,0.968,0.939,0.986,0.729,0.984,0.736,0.971,0.194,0.71,0.968,0.948,0.55,0.974,0.674,0.936,0.201,0.976,0.991,0.984,0.951,0.993,0.334,0.746,0.969,0.948,0.991,0.828,0.98,0.958,0.97,0.84,0.991,0.956,0.798,0.913,0.966,0.3

In [18]:
pred_train_lgbm_exp = pred_train.copy()
pred_val_lgbm_exp = pred_val.copy()
pred_test_lgbm_exp = pred_test.copy()

### Submission 19

In [20]:
pred_test_lgbm_cold['pred_cold'] = pred_test_lgbm_cold['pred']
pred_test_lgbm_exp['pred_exp'] = pred_test_lgbm_exp['pred']

sub19_df = pd.merge(X_test[['experience_flag']],pred_test_lgbm_cold,how='left',right_index=True,left_index=True)
sub19_df = pd.merge(sub19_df,pred_test_lgbm_exp,how='left',right_index=True,left_index=True)
sub19_df

Unnamed: 0_level_0,experience_flag,pred_x,pred_cold,pred_y,pred_exp
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,1,0.986811,0.986811,0.987962,0.987962
18,1,0.964566,0.964566,0.964265,0.964265
21,1,0.945521,0.945521,0.935638,0.935638
25,1,0.994099,0.994099,0.993278,0.993278
31,1,0.960582,0.960582,0.962000,0.962000
...,...,...,...,...,...
7982,0,0.648155,0.648155,0.640795,0.640795
7990,0,0.380371,0.380371,0.406924,0.406924
7993,0,0.549757,0.549757,0.568044,0.568044
7994,0,0.584830,0.584830,0.610451,0.610451


In [21]:
sub19_df['pred'] = np.where(sub19_df['experience_flag']==1,sub19_df['pred_exp'],sub19_df['pred_cold'])

In [22]:
sub19 = pd.merge(id_test_df,sub19_df.reset_index(),how='left',on='unique_id')
sub19['pred'] = sub19['pred'].apply(lambda x: round(x,3))
sub19_txt = ''
for prob in list(sub19['pred'].values):
    sub19_txt = sub19_txt+','+str(prob)
sub19_txt = sub19_txt[1:]

sub19_txt

'0.988,0.964,0.936,0.993,0.962,0.995,0.724,0.281,0.841,0.978,0.376,0.108,0.99,0.991,0.906,0.995,0.198,0.23,0.916,0.947,0.996,0.871,0.986,0.983,0.99,0.976,0.979,0.992,0.958,0.751,0.988,0.924,0.964,0.984,0.978,0.989,0.734,0.603,0.99,0.969,0.952,0.887,0.887,0.881,0.991,0.958,0.996,0.463,0.737,0.67,0.175,0.99,0.942,0.072,0.294,0.093,0.973,0.986,0.982,0.992,0.986,0.668,0.172,0.977,0.983,0.786,0.983,0.956,0.98,0.983,0.996,0.947,0.955,0.972,0.935,0.834,0.882,0.99,0.978,0.923,0.306,0.851,0.992,0.144,0.983,0.978,0.645,0.993,0.946,0.893,0.995,0.964,0.866,0.987,0.514,0.931,0.957,0.755,0.357,0.977,0.951,0.983,0.988,0.974,0.672,0.65,0.678,0.598,0.953,0.521,0.974,0.999,0.951,0.996,0.482,0.95,0.903,0.968,0.943,0.94,0.996,0.997,0.537,0.862,0.955,0.99,0.963,0.985,0.267,0.115,0.996,0.861,0.118,0.968,0.939,0.986,0.729,0.984,0.727,0.971,0.194,0.71,0.968,0.948,0.55,0.974,0.674,0.936,0.201,0.976,0.991,0.984,0.951,0.993,0.334,0.785,0.969,0.948,0.991,0.828,0.98,0.958,0.97,0.831,0.991,0.956,0.798,0.913,0.966,0

### Submission 20

In [13]:
rf = XGBClassifier(max_depth=50,criterion='gini',n_estimators=200,min_child_samples=20,colsample_bytree=0.5)
rf.fit(X_train_experienced_under_imputed,y_train_experienced_under)

pred_train = pd.DataFrame(rf.predict_proba(X_train_imputed)[:,1],columns=['pred'],index=X_train_imputed.index)
pred_val = pd.DataFrame(rf.predict_proba(X_val_imputed)[:,1],columns=['pred'],index=X_val_imputed.index)
pred_test = pd.DataFrame(rf.predict_proba(X_test_imputed)[:,1],columns=['pred'],index=X_test_imputed.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


Parameters: { "criterion", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [14]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.9922903803429884
Val ROC:  0.8491632745364088


In [15]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.126785,0.0,0.126785,0.265943,0.142119,0.408062


In [16]:
sub20 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub20['pred'] = sub20['pred'].apply(lambda x: round(x,3))
sub20_txt = ''
for prob in list(sub20['pred'].values):
    sub20_txt = sub20_txt+','+str(prob)
sub20_txt = sub20_txt[1:]

sub20_txt

'0.999,0.993,0.863,1.0,0.997,1.0,0.483,0.001,0.997,0.99,0.041,0.007,1.0,1.0,0.978,0.999,0.008,0.011,0.998,0.998,1.0,0.988,0.999,1.0,1.0,0.999,0.994,0.999,0.84,0.991,0.999,0.984,0.993,0.98,0.999,0.999,0.788,0.418,0.997,0.825,0.981,0.978,0.991,0.996,1.0,0.994,1.0,0.182,0.31,0.961,0.002,1.0,1.0,0.024,0.014,0.007,0.998,1.0,1.0,0.999,1.0,0.296,0.097,1.0,0.999,0.91,0.996,0.993,0.997,1.0,0.998,0.999,0.998,0.999,0.955,0.265,0.998,1.0,0.992,0.998,0.024,0.918,1.0,0.112,1.0,0.929,0.63,1.0,0.995,0.994,1.0,0.993,0.313,1.0,0.049,0.999,0.977,0.951,0.375,0.996,0.993,0.999,1.0,1.0,0.963,0.104,0.67,0.641,0.999,0.667,0.994,1.0,0.996,0.999,0.826,0.991,0.992,0.996,0.998,0.994,1.0,1.0,0.188,0.967,0.999,0.968,1.0,1.0,0.004,0.002,0.998,0.725,0.001,0.996,0.917,1.0,0.527,0.999,0.996,0.996,0.028,0.525,0.999,0.956,0.965,0.999,0.956,0.999,0.017,1.0,1.0,0.999,1.0,1.0,0.2,0.967,0.991,0.995,1.0,0.945,1.0,0.994,0.998,0.971,0.999,0.987,0.961,0.992,0.996,0.145,1.0,0.714,0.717,0.996,1.0,0.903,0.999,0.993,0.025,0.999,0.28

### Submission 21

In [17]:
rf = XGBClassifier(max_depth=200,criterion='gini',n_estimators=200,min_child_samples=20,colsample_bytree=0.5)
rf.fit(X_train_experienced_under_imputed,y_train_experienced_under)

pred_train = pd.DataFrame(rf.predict_proba(X_train_imputed)[:,1],columns=['pred'],index=X_train_imputed.index)
pred_val = pd.DataFrame(rf.predict_proba(X_val_imputed)[:,1],columns=['pred'],index=X_val_imputed.index)
pred_test = pd.DataFrame(rf.predict_proba(X_test_imputed)[:,1],columns=['pred'],index=X_test_imputed.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


Parameters: { "criterion", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [18]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.9922903803429884
Val ROC:  0.8491632745364088


In [19]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.126785,0.0,0.126785,0.265943,0.142119,0.408062


In [20]:
sub21 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub21['pred'] = sub21['pred'].apply(lambda x: round(x,3))
sub21_txt = ''
for prob in list(sub21['pred'].values):
    sub21_txt = sub21_txt+','+str(prob)
sub21_txt = sub21_txt[1:]

sub21_txt

'0.999,0.993,0.863,1.0,0.997,1.0,0.483,0.001,0.997,0.99,0.041,0.007,1.0,1.0,0.978,0.999,0.008,0.011,0.998,0.998,1.0,0.988,0.999,1.0,1.0,0.999,0.994,0.999,0.84,0.991,0.999,0.984,0.993,0.98,0.999,0.999,0.788,0.418,0.997,0.825,0.981,0.978,0.991,0.996,1.0,0.994,1.0,0.182,0.31,0.961,0.002,1.0,1.0,0.024,0.014,0.007,0.998,1.0,1.0,0.999,1.0,0.296,0.097,1.0,0.999,0.91,0.996,0.993,0.997,1.0,0.998,0.999,0.998,0.999,0.955,0.265,0.998,1.0,0.992,0.998,0.024,0.918,1.0,0.112,1.0,0.929,0.63,1.0,0.995,0.994,1.0,0.993,0.313,1.0,0.049,0.999,0.977,0.951,0.375,0.996,0.993,0.999,1.0,1.0,0.963,0.104,0.67,0.641,0.999,0.667,0.994,1.0,0.996,0.999,0.826,0.991,0.992,0.996,0.998,0.994,1.0,1.0,0.188,0.967,0.999,0.968,1.0,1.0,0.004,0.002,0.998,0.725,0.001,0.996,0.917,1.0,0.527,0.999,0.996,0.996,0.028,0.525,0.999,0.956,0.965,0.999,0.956,0.999,0.017,1.0,1.0,0.999,1.0,1.0,0.2,0.967,0.991,0.995,1.0,0.945,1.0,0.994,0.998,0.971,0.999,0.987,0.961,0.992,0.996,0.145,1.0,0.714,0.717,0.996,1.0,0.903,0.999,0.993,0.025,0.999,0.28

### Submission 21

In [21]:
rf = XGBClassifier(max_depth=200,criterion='gini',n_estimators=200,min_child_samples=20,colsample_bytree=0.5,class_weight='balanced')
rf.fit(X_train_experienced_under_imputed,y_train_experienced_under)

pred_train = pd.DataFrame(rf.predict_proba(X_train_imputed)[:,1],columns=['pred'],index=X_train_imputed.index)
pred_val = pd.DataFrame(rf.predict_proba(X_val_imputed)[:,1],columns=['pred'],index=X_val_imputed.index)
pred_test = pd.DataFrame(rf.predict_proba(X_test_imputed)[:,1],columns=['pred'],index=X_test_imputed.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


Parameters: { "class_weight", "criterion", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [22]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.9922903803429884
Val ROC:  0.8491632745364088


In [23]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.126785,0.0,0.126785,0.265943,0.142119,0.408062


In [24]:
sub27 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub27['pred'] = sub27['pred'].apply(lambda x: round(x,3))
sub27_txt = ''
for prob in list(sub27['pred'].values):
    sub27_txt = sub27_txt+','+str(prob)
sub27_txt = sub27_txt[1:]

sub27_txt

'0.999,0.993,0.863,1.0,0.997,1.0,0.483,0.001,0.997,0.99,0.041,0.007,1.0,1.0,0.978,0.999,0.008,0.011,0.998,0.998,1.0,0.988,0.999,1.0,1.0,0.999,0.994,0.999,0.84,0.991,0.999,0.984,0.993,0.98,0.999,0.999,0.788,0.418,0.997,0.825,0.981,0.978,0.991,0.996,1.0,0.994,1.0,0.182,0.31,0.961,0.002,1.0,1.0,0.024,0.014,0.007,0.998,1.0,1.0,0.999,1.0,0.296,0.097,1.0,0.999,0.91,0.996,0.993,0.997,1.0,0.998,0.999,0.998,0.999,0.955,0.265,0.998,1.0,0.992,0.998,0.024,0.918,1.0,0.112,1.0,0.929,0.63,1.0,0.995,0.994,1.0,0.993,0.313,1.0,0.049,0.999,0.977,0.951,0.375,0.996,0.993,0.999,1.0,1.0,0.963,0.104,0.67,0.641,0.999,0.667,0.994,1.0,0.996,0.999,0.826,0.991,0.992,0.996,0.998,0.994,1.0,1.0,0.188,0.967,0.999,0.968,1.0,1.0,0.004,0.002,0.998,0.725,0.001,0.996,0.917,1.0,0.527,0.999,0.996,0.996,0.028,0.525,0.999,0.956,0.965,0.999,0.956,0.999,0.017,1.0,1.0,0.999,1.0,1.0,0.2,0.967,0.991,0.995,1.0,0.945,1.0,0.994,0.998,0.971,0.999,0.987,0.961,0.992,0.996,0.145,1.0,0.714,0.717,0.996,1.0,0.903,0.999,0.993,0.025,0.999,0.28

### Submission 22 no

In [14]:
oversample = SMOTE()
X_tr2, y_tr2 = oversample.fit_resample(X_train_imputed, y_train)

In [15]:
rf = XGBClassifier(max_depth=10,criterion='entropy',n_estimators=100,min_child_samples=20,colsample_bytree=0.8)
rf.fit(X_tr2,y_tr2)

pred_train = pd.DataFrame(rf.predict_proba(X_train_imputed)[:,1],columns=['pred'],index=X_train_imputed.index)
pred_val = pd.DataFrame(rf.predict_proba(X_val_imputed)[:,1],columns=['pred'],index=X_val_imputed.index)
pred_test = pd.DataFrame(rf.predict_proba(X_test_imputed)[:,1],columns=['pred'],index=X_test_imputed.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Parameters: { "criterion", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [16]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  1.0
Val ROC:  0.8514544963694566


In [17]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.0,0.0,0.0,0.17232,0.317829,0.49015


In [18]:
sub22 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub22['pred'] = sub22['pred'].apply(lambda x: round(x,3))
sub22_txt = ''
for prob in list(sub22['pred'].values):
    sub22_txt = sub22_txt+','+str(prob)
sub22_txt = sub22_txt[1:]

sub22_txt

'0.998,0.996,0.995,0.999,0.995,0.999,0.364,0.006,0.989,0.999,0.068,0.194,0.999,1.0,0.986,1.0,0.725,0.104,0.998,0.996,1.0,0.982,0.999,1.0,1.0,0.996,0.998,1.0,0.982,0.999,1.0,0.992,0.997,0.998,0.997,0.998,0.864,0.921,0.998,0.897,0.907,0.983,0.998,1.0,1.0,0.987,1.0,0.273,0.129,0.483,0.063,1.0,0.998,0.0,0.029,0.016,1.0,0.999,0.998,1.0,1.0,0.856,0.075,0.998,1.0,0.955,1.0,0.975,0.998,0.999,1.0,0.999,0.999,0.999,0.94,0.908,0.991,1.0,0.999,0.994,0.019,0.937,0.999,0.122,1.0,0.982,0.544,1.0,0.999,0.998,1.0,0.995,0.883,1.0,0.107,0.999,0.994,0.988,0.341,0.999,0.959,1.0,1.0,1.0,0.957,0.693,0.864,0.889,0.996,0.803,0.947,1.0,0.996,1.0,0.896,0.995,0.949,0.999,0.998,1.0,1.0,1.0,0.98,0.842,1.0,0.998,1.0,1.0,0.03,0.0,0.999,0.978,0.003,0.999,0.997,0.999,0.474,0.999,0.973,0.997,0.178,0.989,0.998,0.996,0.926,1.0,0.959,0.997,0.222,0.999,0.999,0.996,0.999,1.0,0.748,0.996,0.995,0.997,1.0,0.97,0.999,0.999,1.0,0.789,1.0,0.998,0.973,0.994,1.0,0.127,1.0,0.212,0.93,1.0,0.998,0.966,1.0,0.999,0.635,0.999,0.36,0.026,0

### Submission 23

In [19]:
oversample = SMOTE()
X_tr2, y_tr2 = oversample.fit_resample(X_train_experienced_imputed, y_train_experienced)

In [20]:
rf = XGBClassifier(max_depth=10,criterion='entropy',n_estimators=100,min_child_samples=20,colsample_bytree=0.8)
rf.fit(X_tr2,y_tr2)

pred_train = pd.DataFrame(rf.predict_proba(X_train_imputed)[:,1],columns=['pred'],index=X_train_imputed.index)
pred_val = pd.DataFrame(rf.predict_proba(X_val_imputed)[:,1],columns=['pred'],index=X_val_imputed.index)
pred_test = pd.DataFrame(rf.predict_proba(X_test_imputed)[:,1],columns=['pred'],index=X_test_imputed.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Parameters: { "criterion", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [21]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.8949664519192359
Val ROC:  0.8175559832970454


In [22]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.07172,0.46134,0.53306,0.126187,0.496124,0.622311


In [18]:
sub22 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub22['pred'] = sub22['pred'].apply(lambda x: round(x,3))
sub22_txt = ''
for prob in list(sub22['pred'].values):
    sub22_txt = sub22_txt+','+str(prob)
sub22_txt = sub22_txt[1:]

sub22_txt

'0.998,0.996,0.995,0.999,0.995,0.999,0.364,0.006,0.989,0.999,0.068,0.194,0.999,1.0,0.986,1.0,0.725,0.104,0.998,0.996,1.0,0.982,0.999,1.0,1.0,0.996,0.998,1.0,0.982,0.999,1.0,0.992,0.997,0.998,0.997,0.998,0.864,0.921,0.998,0.897,0.907,0.983,0.998,1.0,1.0,0.987,1.0,0.273,0.129,0.483,0.063,1.0,0.998,0.0,0.029,0.016,1.0,0.999,0.998,1.0,1.0,0.856,0.075,0.998,1.0,0.955,1.0,0.975,0.998,0.999,1.0,0.999,0.999,0.999,0.94,0.908,0.991,1.0,0.999,0.994,0.019,0.937,0.999,0.122,1.0,0.982,0.544,1.0,0.999,0.998,1.0,0.995,0.883,1.0,0.107,0.999,0.994,0.988,0.341,0.999,0.959,1.0,1.0,1.0,0.957,0.693,0.864,0.889,0.996,0.803,0.947,1.0,0.996,1.0,0.896,0.995,0.949,0.999,0.998,1.0,1.0,1.0,0.98,0.842,1.0,0.998,1.0,1.0,0.03,0.0,0.999,0.978,0.003,0.999,0.997,0.999,0.474,0.999,0.973,0.997,0.178,0.989,0.998,0.996,0.926,1.0,0.959,0.997,0.222,0.999,0.999,0.996,0.999,1.0,0.748,0.996,0.995,0.997,1.0,0.97,0.999,0.999,1.0,0.789,1.0,0.998,0.973,0.994,1.0,0.127,1.0,0.212,0.93,1.0,0.998,0.966,1.0,0.999,0.635,0.999,0.36,0.026,0

In [23]:
sub23 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub23['pred'] = sub23['pred'].apply(lambda x: round(x,3))
sub23_txt = ''
for prob in list(sub23['pred'].values):
    sub23_txt = sub23_txt+','+str(prob)
sub23_txt = sub23_txt[1:]

sub23_txt

'0.995,0.998,0.957,1.0,0.997,1.0,0.619,0.008,0.998,0.997,0.44,0.005,0.999,1.0,0.988,0.999,0.014,0.045,0.999,0.999,1.0,0.99,0.999,0.999,0.999,0.995,0.999,1.0,0.996,0.998,1.0,0.995,0.996,0.999,0.999,0.999,0.938,0.978,0.997,0.965,0.987,0.938,0.98,0.994,1.0,0.937,1.0,0.345,0.321,0.658,0.008,1.0,0.998,0.003,0.942,0.011,1.0,0.999,0.999,0.999,1.0,0.201,0.033,0.998,0.999,0.972,1.0,0.984,0.998,1.0,1.0,0.982,1.0,1.0,0.999,0.943,0.993,1.0,0.999,0.998,0.025,0.991,1.0,0.011,1.0,0.994,0.432,1.0,0.998,0.997,1.0,0.998,0.846,0.999,0.02,0.997,0.999,0.981,0.493,0.997,0.997,1.0,0.996,1.0,0.92,0.494,0.997,0.51,0.998,0.204,0.977,1.0,0.999,0.999,0.732,1.0,0.696,0.998,0.999,0.998,1.0,1.0,0.766,0.977,0.999,0.998,0.999,1.0,0.01,0.012,1.0,0.942,0.014,0.997,0.999,0.999,0.91,0.999,0.947,0.992,0.215,0.89,0.997,0.998,0.846,1.0,0.937,0.992,0.68,1.0,1.0,0.999,0.999,1.0,0.346,0.99,0.992,0.997,1.0,0.897,0.993,0.989,1.0,0.969,1.0,0.996,0.852,0.999,0.998,0.167,1.0,0.385,0.987,1.0,0.985,0.901,0.999,0.999,0.075,1.0,0.076,0.

### Submission 24

In [24]:
oversample = SMOTE()
X_tr2, y_tr2 = oversample.fit_resample(X_train_imputed, y_train)

In [25]:
rf = XGBClassifier(max_depth=20,criterion='gini',n_estimators=200,min_child_samples=20,colsample_bytree=0.5)
rf.fit(X_tr2,y_tr2)

pred_train = pd.DataFrame(rf.predict_proba(X_train_imputed)[:,1],columns=['pred'],index=X_train_imputed.index)
pred_val = pd.DataFrame(rf.predict_proba(X_val_imputed)[:,1],columns=['pred'],index=X_val_imputed.index)
pred_test = pd.DataFrame(rf.predict_proba(X_test_imputed)[:,1],columns=['pred'],index=X_test_imputed.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Parameters: { "criterion", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [26]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  1.0
Val ROC:  0.8582983602074197


In [27]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.0,0.0,0.0,0.17232,0.258398,0.430718


In [29]:
sub24 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub24['pred'] = sub24['pred'].apply(lambda x: round(x,3))
sub24_txt = ''
for prob in list(sub24['pred'].values):
    sub24_txt = sub24_txt+','+str(prob)
sub24_txt = sub24_txt[1:]

sub24_txt

'0.999,0.998,0.984,1.0,0.997,0.999,0.702,0.013,0.999,0.997,0.011,0.009,1.0,1.0,0.994,1.0,0.147,0.011,0.995,0.998,1.0,0.998,1.0,1.0,1.0,0.998,1.0,1.0,0.999,1.0,1.0,0.996,0.999,0.991,1.0,0.999,0.688,0.974,0.999,0.976,0.974,0.998,1.0,0.999,1.0,0.998,1.0,0.884,0.034,0.521,0.019,1.0,0.998,0.001,0.025,0.001,1.0,0.996,0.997,1.0,1.0,0.957,0.583,0.998,1.0,0.942,1.0,0.997,0.999,1.0,1.0,0.979,1.0,0.999,0.984,0.992,0.998,1.0,0.999,0.999,0.098,0.989,1.0,0.019,1.0,0.993,0.93,1.0,0.998,0.995,1.0,0.999,0.884,1.0,0.031,0.999,0.999,0.963,0.641,0.999,0.996,0.999,1.0,1.0,0.982,0.005,0.195,0.726,0.999,0.838,0.996,1.0,0.996,1.0,0.865,0.999,0.788,0.999,0.993,0.999,1.0,1.0,0.897,0.937,0.998,0.999,1.0,1.0,0.017,0.001,0.999,0.988,0.012,0.998,0.999,1.0,0.61,1.0,0.992,0.998,0.069,0.557,0.997,0.998,0.978,0.999,0.972,0.999,0.189,1.0,1.0,1.0,1.0,1.0,0.873,0.999,0.99,0.993,1.0,0.951,1.0,0.995,0.999,0.977,1.0,0.998,0.968,0.995,1.0,0.206,1.0,0.43,0.788,1.0,0.999,0.985,1.0,0.998,0.824,0.997,0.094,0.014,0.989,0.029,0.595

### Submission 28

In [16]:
var_list = pd.read_csv('variable_list.csv')
var_list.columns=['index','feature']
var_list = [x[2:] for x in list(var_list['feature'].values)]

In [17]:
var_list

['sellingprice_median_basket',
 'all_favorite_Level3_Category_Id_cnt',
 'sellingprice_min_order',
 'sellingprice_median_search',
 'weekly_visit_contentid_cnt',
 'weekly_favorite_secs_btw_first_last',
 'sellingprice_mean_ALTINYILDIZ CLASSICS',
 'sellingprice_sum_Addax',
 'sellingprice_median_Apple',
 'sellingprice_mean_Aqua Di Polo 1987',
 'sellingprice_max_Avva',
 'sellingprice_max_Bambi',
 'sellingprice_sum_Bershka',
 'sellingprice_mean_Cool & Sexy',
 'sellingprice_max_DeFacto',
 'sellingprice_max_Dilvin',
 'sellingprice_max_Elle Shoes',
 'sellingprice_sum_English Home',
 'sellingprice_min_HUMMEL',
 'sellingprice_sum_Happiness İst.',
 'sellingprice_sum_Karaca',
 'sellingprice_sum_Kitchen Life',
 'sellingprice_mean_Koton',
 'sellingprice_mean_Koton Kids',
 'sellingprice_sum_LC Waikiki',
 'sellingprice_sum_MANGO Woman',
 'sellingprice_min_MUGGO',
 'sellingprice_sum_Madame Coco',
 'sellingprice_count_Mavi',
 'sellingprice_min_Nike',
 'sellingprice_sum_Olalook',
 'sellingprice_mean_Oysho'

In [18]:
oversample = SMOTE()
X_tr2, y_tr2 = oversample.fit_resample(X_train_imputed[var_list], y_train)

In [23]:
rf = XGBClassifier(max_depth=20,criterion='gini',n_estimators=200,min_child_samples=20,colsample_bytree=1)
rf.fit(X_tr2,y_tr2)

pred_train = pd.DataFrame(rf.predict_proba(X_train_imputed[var_list])[:,1],columns=['pred'],index=X_train_imputed.index)
pred_val = pd.DataFrame(rf.predict_proba(X_val_imputed[var_list])[:,1],columns=['pred'],index=X_val_imputed.index)
pred_test = pd.DataFrame(rf.predict_proba(X_test_imputed[var_list])[:,1],columns=['pred'],index=X_test_imputed.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Parameters: { "criterion", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [24]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  1.0
Val ROC:  0.841115072979009


In [25]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.0,0.0,0.0,0.185889,0.30491,0.490798


In [26]:
sub28 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub28['pred'] = sub28['pred'].apply(lambda x: round(x,3))
sub28_txt = ''
for prob in list(sub28['pred'].values):
    sub28_txt = sub28_txt+','+str(prob)
sub28_txt = sub28_txt[1:]

sub28_txt

'0.995,0.991,0.99,1.0,0.999,1.0,0.646,0.299,0.998,0.999,0.191,0.045,1.0,1.0,0.999,1.0,0.262,0.02,0.999,0.998,1.0,0.999,1.0,1.0,1.0,0.999,1.0,1.0,0.994,1.0,1.0,0.999,0.989,0.999,0.999,1.0,0.737,0.904,0.998,0.883,0.998,0.989,0.999,0.998,1.0,0.996,1.0,0.459,0.867,0.131,0.014,1.0,0.997,0.0,0.006,0.04,1.0,0.999,0.999,1.0,1.0,0.832,0.15,0.996,1.0,0.991,0.999,0.992,0.969,0.999,1.0,0.999,0.999,0.996,0.998,0.973,1.0,1.0,1.0,0.989,0.104,0.924,0.999,0.295,1.0,0.909,0.906,1.0,1.0,0.976,1.0,0.998,0.98,1.0,0.036,0.995,1.0,0.996,0.733,0.999,1.0,0.999,1.0,1.0,0.983,0.025,0.977,0.99,1.0,0.272,0.984,1.0,1.0,0.999,0.993,0.999,0.968,0.999,1.0,0.999,1.0,1.0,0.461,0.975,0.999,1.0,1.0,1.0,0.003,0.003,0.999,0.995,0.002,1.0,0.988,1.0,0.721,1.0,0.991,1.0,0.556,0.691,0.998,0.998,0.995,0.999,0.983,0.994,0.0,1.0,1.0,1.0,1.0,0.999,0.857,0.994,0.999,0.994,1.0,0.998,1.0,0.972,0.998,0.972,1.0,1.0,0.46,0.998,0.998,0.197,1.0,0.339,0.375,1.0,0.997,0.995,1.0,0.994,0.232,0.999,0.622,0.278,1.0,0.119,0.251,1.0,1.0,1.0,1.0,1.

### Submission 30

In [41]:
X_train_and_val = pd.concat([X_train,X_val])
y_train_and_val = pd.concat([y_train,y_val])

In [42]:
lgbm = LGBMClassifier(max_depth=2,learning_rate=0.1,n_estimators=200,min_child_samples=20,colsample_bytree=1)
lgbm.fit(X_train_and_val,y_train_and_val)

pred_train = pd.DataFrame(lgbm.predict_proba(X_train)[:,1],columns=['pred'],index=X_train.index)
pred_val = pd.DataFrame(lgbm.predict_proba(X_val)[:,1],columns=['pred'],index=X_val.index)
pred_test = pd.DataFrame(lgbm.predict_proba(X_test)[:,1],columns=['pred'],index=X_test.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [43]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.9322050843804972
Val ROC:  0.9282218225293547


In [44]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.103331,0.243557,0.346888,0.111262,0.222222,0.333484


In [45]:
pred_train_lgbm_cold = pred_train.copy()
pred_val_lgbm_cold = pred_val.copy()
pred_test_lgbm_cold = pred_test.copy()

In [47]:
rf = XGBClassifier(max_depth=50,criterion='gini',n_estimators=200,min_child_samples=20,colsample_bytree=0.5)
rf.fit(X_train_experienced_under_imputed,y_train_experienced_under)

pred_train = pd.DataFrame(rf.predict_proba(X_train_imputed)[:,1],columns=['pred'],index=X_train_imputed.index)
pred_val = pd.DataFrame(rf.predict_proba(X_val_imputed)[:,1],columns=['pred'],index=X_val_imputed.index)
pred_test = pd.DataFrame(rf.predict_proba(X_test_imputed)[:,1],columns=['pred'],index=X_test_imputed.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Parameters: { "criterion", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [48]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.9922903803429884
Val ROC:  0.8491632745364088


In [49]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.126785,0.0,0.126785,0.265943,0.142119,0.408062


In [50]:
pred_train_rf = pred_train.copy()
pred_val_rf = pred_val.copy()
pred_test_rf = pred_test.copy()

In [51]:
pred_train_rf

Unnamed: 0_level_0,pred,female_label,pred_binary
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.999889,1,1
2,0.999861,1,1
3,0.998537,1,1
4,0.954260,1,1
5,0.998548,1,1
...,...,...,...
7987,0.996615,1,1
7988,0.007574,0,0
7989,0.000132,0,0
7992,0.006232,1,0


In [52]:
th = 0.5
sub30 = pd.merge(pred_test_rf,pred_test_lgbm_cold,how='left',left_index=True,right_index=True)
sub30['pred'] = th*sub30['pred_x']+(1-th)*sub30['pred_y']
sub30['pred'] = sub30['pred'].apply(lambda x: round(x,3))
sub30_txt = ''
for prob in list(sub30['pred'].values):
    sub30_txt = sub30_txt+','+str(prob)
sub30_txt = sub30_txt[1:]

sub30_txt


'0.986,0.971,0.905,0.997,0.964,0.996,0.483,0.158,0.959,0.98,0.182,0.029,0.992,0.997,0.921,0.993,0.094,0.078,0.988,0.968,0.994,0.935,0.981,0.989,0.998,0.977,0.987,0.997,0.899,0.932,0.996,0.914,0.972,0.982,0.995,0.995,0.729,0.606,0.993,0.875,0.958,0.921,0.956,0.96,0.991,0.975,0.996,0.331,0.519,0.792,0.087,0.995,0.967,0.052,0.162,0.044,0.984,0.987,0.991,0.993,0.99,0.538,0.12,0.99,0.996,0.873,0.993,0.959,0.977,0.994,0.998,0.983,0.977,0.986,0.936,0.572,0.972,0.997,0.99,0.927,0.115,0.89,0.996,0.122,0.994,0.945,0.723,0.995,0.979,0.976,0.998,0.99,0.574,0.99,0.248,0.974,0.964,0.888,0.347,0.98,0.976,0.993,0.985,0.993,0.806,0.409,0.651,0.598,0.986,0.619,0.984,0.999,0.976,0.997,0.673,0.936,0.857,0.978,0.974,0.973,0.998,0.998,0.341,0.924,0.976,0.972,0.988,0.988,0.109,0.073,0.995,0.8,0.054,0.977,0.892,0.993,0.632,0.989,0.886,0.989,0.218,0.622,0.987,0.957,0.79,0.993,0.767,0.986,0.076,0.985,0.995,0.994,0.985,0.996,0.291,0.923,0.961,0.976,0.988,0.862,0.996,0.867,0.992,0.873,0.993,0.98,0.825,0.957,0.977

### Submission 31

In [53]:
X_train_and_val = pd.concat([X_train,X_val])
y_train_and_val = pd.concat([y_train,y_val])

In [54]:
lgbm = LGBMClassifier(max_depth=2,learning_rate=0.1,n_estimators=200,min_child_samples=20,colsample_bytree=1)
lgbm.fit(X_train_and_val,y_train_and_val)

pred_train = pd.DataFrame(lgbm.predict_proba(X_train)[:,1],columns=['pred'],index=X_train.index)
pred_val = pd.DataFrame(lgbm.predict_proba(X_val)[:,1],columns=['pred'],index=X_val.index)
pred_test = pd.DataFrame(lgbm.predict_proba(X_test)[:,1],columns=['pred'],index=X_test.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [55]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.9322050843804972
Val ROC:  0.9282218225293547


In [56]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.103331,0.243557,0.346888,0.111262,0.222222,0.333484


In [57]:
pred_train_lgbm_cold = pred_train.copy()
pred_val_lgbm_cold = pred_val.copy()
pred_test_lgbm_cold = pred_test.copy()

In [58]:
rf = XGBClassifier(max_depth=50,criterion='gini',n_estimators=200,min_child_samples=20,colsample_bytree=0.5,class_weight= 'balanced')
rf.fit(X_train_and_val,y_train_and_val)

pred_train = pd.DataFrame(rf.predict_proba(X_train_imputed)[:,1],columns=['pred'],index=X_train_imputed.index)
pred_val = pd.DataFrame(rf.predict_proba(X_val_imputed)[:,1],columns=['pred'],index=X_val_imputed.index)
pred_test = pd.DataFrame(rf.predict_proba(X_test_imputed)[:,1],columns=['pred'],index=X_test_imputed.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Parameters: { "class_weight", "criterion", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [59]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.8046013739864177
Val ROC:  0.821288203100074


In [60]:
err_rate_df = pd.DataFrame()

for th in [0.5]:

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])

err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.5,0.0017,0.947165,0.948864,0.002714,0.943152,0.945866


In [61]:
pred_train_rf = pred_train.copy()
pred_val_rf = pred_val.copy()
pred_test_rf = pred_test.copy()

In [62]:
pred_train_rf

Unnamed: 0_level_0,pred,female_label,pred_binary
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.999894,1,1
2,0.999960,1,1
3,0.994198,1,1
4,0.992091,1,1
5,0.999467,1,1
...,...,...,...
7987,0.933481,1,1
7988,0.961368,0,1
7989,0.942432,0,1
7992,0.989745,1,1


In [63]:
th = 0.5
sub31 = pd.merge(pred_test_rf,pred_test_lgbm_cold,how='left',left_index=True,right_index=True)
sub31['pred'] = th*sub31['pred_x']+(1-th)*sub31['pred_y']
sub31['pred'] = sub31['pred'].apply(lambda x: round(x,3))
sub31_txt = ''
for prob in list(sub31['pred'].values):
    sub31_txt = sub31_txt+','+str(prob)
sub31_txt = sub31_txt[1:]

sub31_txt


'0.982,0.962,0.972,0.997,0.964,0.995,0.731,0.605,0.961,0.984,0.652,0.482,0.992,0.997,0.93,0.993,0.449,0.525,0.987,0.969,0.994,0.934,0.981,0.988,0.998,0.971,0.99,0.997,0.977,0.937,0.996,0.905,0.971,0.992,0.994,0.995,0.834,0.897,0.993,0.959,0.964,0.931,0.896,0.935,0.991,0.975,0.996,0.724,0.86,0.811,0.571,0.995,0.966,0.483,0.655,0.529,0.984,0.987,0.976,0.992,0.99,0.888,0.434,0.989,0.995,0.917,0.995,0.959,0.978,0.988,0.998,0.983,0.978,0.986,0.954,0.92,0.971,0.997,0.993,0.926,0.49,0.929,0.988,0.435,0.994,0.962,0.896,0.995,0.982,0.95,0.998,0.993,0.913,0.99,0.667,0.974,0.974,0.906,0.653,0.978,0.978,0.988,0.984,0.993,0.824,0.822,0.802,0.757,0.987,0.78,0.98,0.999,0.975,0.997,0.725,0.925,0.813,0.98,0.974,0.975,0.998,0.998,0.718,0.903,0.965,0.988,0.988,0.988,0.545,0.571,0.996,0.935,0.523,0.978,0.932,0.993,0.856,0.986,0.825,0.953,0.7,0.859,0.988,0.978,0.801,0.993,0.763,0.986,0.45,0.985,0.994,0.994,0.985,0.996,0.69,0.936,0.961,0.979,0.988,0.857,0.996,0.843,0.993,0.883,0.993,0.987,0.841,0.958,0.978,