### Import packages

In [1]:
import numpy as np
import pandas as pd
import datetime
import pickle
import itertools

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import manhattan_distances,pairwise_distances
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

from metric_learn import NCA

from sklearn.metrics import roc_auc_score

### Read data

In [2]:
X_train_raw = pd.read_csv('X_train_raw.csv')
X_val_raw = pd.read_csv('X_val_raw.csv')
X_test_raw = pd.read_csv('X_test_raw.csv')

y_train_raw = pd.read_csv('y_train_raw.csv')
y_val_raw = pd.read_csv('y_val_raw.csv')

X_train_raw_under = pd.read_csv('X_train_raw_under.csv')
y_train_raw_under = pd.read_csv('y_train_raw_under.csv')

In [3]:
X_train_raw.head()

Unnamed: 0,ID,sellingprice,weekend_flag,campaign_flag,hour,4_hour_interval,8_hour_interval,week_no,seconds_between_consecutives_all,seconds_between_consecutives_daily,...,Level3_Category_Name_other_3,Level3_Category_Name_Çizme,Level3_Category_Name_Çorap,day_name_Friday,day_name_Monday,day_name_Saturday,day_name_Sunday,day_name_Thursday,day_name_Tuesday,day_name_Wednesday
0,TRAIN_791869,69.9,0,0,10,3,2,42,5.0,5.0,...,1,0,0,0,0,0,0,0,0,1
1,TRAIN_791512,442.0,0,0,22,6,3,42,55.0,55.0,...,1,0,0,0,0,0,0,1,0,0
2,TRAIN_793084,0.0,0,0,11,3,2,42,17.0,17.0,...,1,0,0,1,0,0,0,0,0,0
3,TRAIN_792916,0.0,0,0,11,3,2,42,2.0,2.0,...,1,0,0,1,0,0,0,0,0,0
4,TRAIN_792175,49.99,1,0,14,4,2,42,12.0,12.0,...,0,0,0,0,0,1,0,0,0,0


In [4]:
X_train_raw.set_index('ID',inplace=True)
X_val_raw.set_index('ID',inplace=True)
X_test_raw.set_index('ID',inplace=True)
y_train_raw.set_index('ID',inplace=True)
y_val_raw.set_index('ID',inplace=True)
X_train_raw_under.set_index('ID',inplace=True)
y_train_raw_under.set_index('ID',inplace=True)

In [5]:
id_test_df = pd.read_csv('test_ids_in_prediction.csv')

In [6]:
id_test_df.shape

(2380, 1)

In [10]:
X_train_raw.sort_index(inplace=True)
X_val_raw.sort_index(inplace=True)
X_test_raw.sort_index(inplace=True)
y_train_raw.sort_index(inplace=True)
y_val_raw.sort_index(inplace=True)
X_train_raw_under.sort_index(inplace=True)
y_train_raw_under.sort_index(inplace=True)

In [16]:
train_unique_id = pd.read_csv('train_unique_id.csv')
val_unique_id = pd.read_csv('val_unique_id.csv')
test_unique_id = pd.read_csv('test_unique_id.csv')

In [18]:
train_unique_id.set_index('ID',inplace=True)
val_unique_id.set_index('ID',inplace=True)
test_unique_id.set_index('ID',inplace=True)

### Submission 6

In [11]:
lgbm = LGBMClassifier(max_depth=2,learning_rate=0.01,n_estimators=100,min_child_samples=100,colsample_bytree=0.5)
lgbm.fit(X_train_raw,y_train_raw)

pred_train = pd.DataFrame(lgbm.predict_proba(X_train_raw)[:,1],columns=['pred'],index=X_train_raw.index)
pred_val = pd.DataFrame(lgbm.predict_proba(X_val_raw)[:,1],columns=['pred'],index=X_val_raw.index)
pred_test = pd.DataFrame(lgbm.predict_proba(X_test_raw)[:,1],columns=['pred'],index=X_test_raw.index)

pred_train = pd.merge(pred_train,y_train_raw,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val_raw,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


In [12]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.7890786122664951
Val ROC:  0.7856708849975005


In [13]:
err_rate_df = pd.DataFrame()

for th in np.arange(0,1,0.1):

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])



In [14]:
err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
0,0.1,0.0,1.0,1.0,0.0,1.0,1.0
0,0.2,0.0,1.0,1.0,0.0,1.0,1.0
0,0.3,0.0,1.0,1.0,0.0,1.0,1.0
0,0.4,0.0,1.0,1.0,0.0,1.0,1.0
0,0.5,0.0,1.0,1.0,0.0,1.0,1.0
0,0.6,0.003662,0.94654,0.950202,0.002769,0.973433,0.976202
0,0.7,0.056693,0.614519,0.671212,0.057549,0.594595,0.652144
0,0.8,0.099882,0.529106,0.628988,0.100565,0.501183,0.601749
0,0.9,0.889543,0.016998,0.906541,0.88122,0.030496,0.911716


In [15]:
pred_test

Unnamed: 0_level_0,pred
ID,Unnamed: 1_level_1
TEST_0,0.886529
TEST_1,0.874606
TEST_10,0.798317
TEST_100,0.886713
TEST_1000,0.891372
...,...
TEST_99995,0.869851
TEST_99996,0.882476
TEST_99997,0.630236
TEST_99998,0.869851


In [20]:
pred_test2 = pd.merge(pred_test,test_unique_id[['unique_id']],how='left',left_index=True,right_index=True)

In [22]:
pred_test = pred_test2.groupby('unique_id').agg({'pred':np.mean})
pred_test.describe()

Unnamed: 0,pred
count,2380.0
mean,0.812808
std,0.063453
min,0.588505
25%,0.793603
50%,0.829235
75%,0.857032
max,0.904495


In [23]:
pred_test.reset_index(inplace=True)

In [24]:
pred_test[pred_test['unique_id']==108]

Unnamed: 0,unique_id,pred
28,108,0.865845


In [26]:
minn = pred_test['pred'].min()
maxx = pred_test['pred'].max()

pred_test['pred'] = (pred_test['pred']-minn)/(maxx-minn)

In [27]:
pred_test.describe()

Unnamed: 0,unique_id,pred
count,2380.0,2380.0
mean,3922.168908,0.709841
std,2307.573917,0.200808
min,9.0,0.0
25%,1938.5,0.649066
50%,3842.5,0.761828
75%,5925.25,0.849795
max,7998.0,1.0


In [28]:
sub6 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub6

Unnamed: 0,unique_id,index,pred
0,9,0,0.896322
1,18,1,0.870544
2,21,2,0.866504
3,25,3,0.864755
4,31,4,0.846263
...,...,...,...
2375,7982,2375,0.967845
2376,7990,2376,0.130801
2377,7993,2377,0.726886
2378,7994,2378,0.661322


In [29]:
sub6['pred'] = sub6['pred'].apply(lambda x: round(x,3))

In [30]:
sub6_txt = ''
for prob in list(sub6['pred'].values):
    sub6_txt = sub6_txt+','+str(prob)
sub6_txt = sub6_txt[1:]

sub6_txt

'0.896,0.871,0.867,0.865,0.846,0.905,0.514,0.427,0.801,0.866,0.502,0.233,0.815,0.917,0.847,0.799,0.576,0.266,0.828,0.786,0.939,0.764,0.818,0.815,0.863,0.896,0.912,0.893,0.878,0.76,0.898,0.786,0.922,0.787,0.928,0.859,0.712,0.751,0.896,0.839,0.811,0.807,0.874,0.706,0.865,0.832,0.84,0.715,0.903,0.702,0.354,0.903,0.736,0.342,0.607,0.311,0.758,0.835,0.842,0.822,0.787,0.684,0.31,0.775,0.919,0.754,0.902,0.749,0.769,0.826,0.876,0.887,0.85,0.92,0.782,0.844,0.655,0.851,0.938,0.847,0.66,0.783,0.783,0.377,0.883,0.877,0.75,0.924,0.875,0.776,0.831,0.792,0.773,0.843,0.652,0.881,0.887,0.788,0.71,0.823,0.761,0.831,0.811,0.835,0.723,0.737,0.73,0.71,0.774,0.708,0.845,0.85,0.808,0.824,0.716,0.766,0.743,0.884,0.877,0.845,0.851,0.865,0.646,0.692,0.812,0.842,0.924,0.904,0.312,0.468,0.903,0.605,0.544,0.906,0.79,0.819,0.807,0.918,0.777,0.796,0.693,0.661,0.886,0.762,0.769,0.908,0.693,0.796,0.377,0.837,0.787,0.863,0.811,0.913,0.705,0.835,0.891,0.792,0.857,0.507,0.844,0.791,0.85,0.821,0.823,0.882,0.738,0.928,0.86

### Submission 7

In [31]:
lgbm = LGBMClassifier(max_depth=2,learning_rate=0.01,n_estimators=100,min_child_samples=100,colsample_bytree=0.5)
lgbm.fit(X_train_raw_under,y_train_raw_under)

pred_train = pd.DataFrame(lgbm.predict_proba(X_train_raw)[:,1],columns=['pred'],index=X_train_raw.index)
pred_val = pd.DataFrame(lgbm.predict_proba(X_val_raw)[:,1],columns=['pred'],index=X_val_raw.index)
pred_test = pd.DataFrame(lgbm.predict_proba(X_test_raw)[:,1],columns=['pred'],index=X_test_raw.index)

pred_train = pd.merge(pred_train,y_train_raw,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val_raw,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


In [32]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.7889178376458337
Val ROC:  0.7854915086904803


In [33]:
err_rate_df = pd.DataFrame()

for th in np.arange(0,1,0.1):

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])



In [34]:
err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
0,0.1,0.0,1.0,1.0,0.0,1.0,1.0
0,0.2,0.0,1.0,1.0,0.0,1.0,1.0
0,0.3,0.0,1.0,1.0,0.0,1.0,1.0
0,0.4,0.0,1.0,1.0,0.0,1.0,1.0
0,0.5,0.040482,0.708379,0.748861,0.039817,0.713202,0.753019
0,0.6,0.056693,0.614519,0.671212,0.057549,0.594595,0.652144
0,0.7,0.192887,0.374316,0.567204,0.195689,0.359985,0.555674
0,0.8,0.624355,0.092771,0.717127,0.624844,0.106238,0.731082
0,0.9,1.0,0.0,1.0,1.0,0.0,1.0


In [35]:
pred_test

Unnamed: 0_level_0,pred
ID,Unnamed: 1_level_1
TEST_0,0.798157
TEST_1,0.779986
TEST_10,0.667117
TEST_100,0.798930
TEST_1000,0.808469
...,...
TEST_99995,0.769575
TEST_99996,0.790180
TEST_99997,0.489081
TEST_99998,0.769575


In [36]:
pred_test2 = pd.merge(pred_test,test_unique_id[['unique_id']],how='left',left_index=True,right_index=True)

In [37]:
pred_test = pred_test2.groupby('unique_id').agg({'pred':np.mean})
pred_test.describe()

Unnamed: 0,pred
count,2380.0
mean,0.700609
std,0.078219
min,0.449356
25%,0.666345
50%,0.716065
75%,0.75786
max,0.832121


In [38]:
pred_test.reset_index(inplace=True)

In [39]:
pred_test[pred_test['unique_id']==108]

Unnamed: 0,unique_id,pred
28,108,0.77268


In [40]:
minn = pred_test['pred'].min()
maxx = pred_test['pred'].max()

pred_test['pred'] = (pred_test['pred']-minn)/(maxx-minn)

In [41]:
pred_test.describe()

Unnamed: 0,unique_id,pred
count,2380.0,2380.0
mean,3922.168908,0.656414
std,2307.573917,0.204352
min,9.0,0.0
25%,1938.5,0.566899
50%,3842.5,0.696795
75%,5925.25,0.805988
max,7998.0,1.0


In [42]:
sub7 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub7

Unnamed: 0,unique_id,index,pred
0,9,0,0.871136
1,18,1,0.841140
2,21,2,0.829149
3,25,3,0.827655
4,31,4,0.795869
...,...,...,...
2375,7982,2375,0.956234
2376,7990,2376,0.107890
2377,7993,2377,0.639419
2378,7994,2378,0.566899


In [43]:
sub7['pred'] = sub7['pred'].apply(lambda x: round(x,3))

In [44]:
sub7_txt = ''
for prob in list(sub7['pred'].values):
    sub7_txt = sub7_txt+','+str(prob)
sub7_txt = sub7_txt[1:]

sub7_txt

'0.871,0.841,0.829,0.828,0.796,0.872,0.468,0.383,0.745,0.831,0.448,0.191,0.772,0.889,0.812,0.741,0.515,0.221,0.797,0.72,0.914,0.707,0.767,0.769,0.815,0.864,0.877,0.857,0.845,0.689,0.869,0.727,0.9,0.734,0.909,0.812,0.63,0.684,0.856,0.805,0.763,0.758,0.832,0.651,0.827,0.792,0.78,0.638,0.865,0.636,0.291,0.871,0.684,0.288,0.531,0.254,0.69,0.785,0.795,0.77,0.725,0.608,0.266,0.714,0.886,0.704,0.87,0.69,0.703,0.771,0.833,0.856,0.821,0.901,0.729,0.799,0.603,0.8,0.922,0.8,0.575,0.745,0.718,0.328,0.847,0.848,0.699,0.897,0.839,0.73,0.776,0.753,0.709,0.795,0.593,0.849,0.851,0.725,0.639,0.768,0.705,0.781,0.752,0.782,0.688,0.667,0.654,0.639,0.715,0.633,0.809,0.8,0.756,0.777,0.658,0.699,0.691,0.849,0.846,0.795,0.802,0.821,0.6,0.634,0.748,0.79,0.892,0.879,0.274,0.404,0.865,0.545,0.482,0.879,0.742,0.767,0.747,0.891,0.705,0.746,0.616,0.608,0.854,0.733,0.714,0.876,0.658,0.758,0.348,0.786,0.722,0.824,0.752,0.886,0.632,0.788,0.854,0.744,0.822,0.481,0.811,0.742,0.81,0.763,0.772,0.845,0.695,0.907,0.836,0.264

### KMeans - 50

In [46]:
X_train_raw_50 = pd.read_csv('X_train_raw_50.csv')
X_val_raw_50 = pd.read_csv('X_val_raw_50.csv')
X_test_raw_50 = pd.read_csv('X_test_raw_50.csv')

In [48]:
y_train = pd.read_csv('y_train_50.csv')
y_val = pd.read_csv('y_val_50.csv')

In [50]:
X_train_raw_50.set_index('unique_id',inplace=True)
X_val_raw_50.set_index('unique_id',inplace=True)
X_test_raw_50.set_index('unique_id',inplace=True)
y_train.set_index('unique_id',inplace=True)
y_val.set_index('unique_id',inplace=True)

In [51]:
X_train_raw_50.sort_index(inplace=True)
X_val_raw_50.sort_index(inplace=True)
X_test_raw_50.sort_index(inplace=True)
y_train.sort_index(inplace=True)
y_val.sort_index(inplace=True)

In [52]:
lgbm = LGBMClassifier(max_depth=2,learning_rate=0.01,n_estimators=100,min_child_samples=100,colsample_bytree=0.5)
lgbm.fit(X_train_raw_50,y_train)

pred_train = pd.DataFrame(lgbm.predict_proba(X_train_raw_50)[:,1],columns=['pred'],index=X_train_raw_50.index)
pred_val = pd.DataFrame(lgbm.predict_proba(X_val_raw_50)[:,1],columns=['pred'],index=X_val_raw_50.index)
pred_test = pd.DataFrame(lgbm.predict_proba(X_test_raw_50)[:,1],columns=['pred'],index=X_test_raw_50.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


In [53]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.8338222385360966
Val ROC:  0.8170476020180983


In [54]:
err_rate_df = pd.DataFrame()

for th in np.arange(0,1,0.1):

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])



In [55]:
err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
0,0.1,0.0,1.0,1.0,0.0,1.0,1.0
0,0.2,0.0,1.0,1.0,0.0,1.0,1.0
0,0.3,0.0,1.0,1.0,0.0,1.0,1.0
0,0.4,0.00136,0.957474,0.958834,0.001357,0.956072,0.957429
0,0.5,0.020394,0.793814,0.814209,0.031208,0.821705,0.852913
0,0.6,0.354861,0.14433,0.499191,0.374491,0.124031,0.498522
0,0.7,0.470088,0.073454,0.543542,0.497965,0.041344,0.539308
0,0.8,0.672332,0.016108,0.68844,0.702849,0.023256,0.726105
0,0.9,1.0,0.0,1.0,1.0,0.0,1.0


In [56]:
pred_test

Unnamed: 0_level_0,pred
unique_id,Unnamed: 1_level_1
9,0.823875
18,0.791611
21,0.840235
25,0.819388
31,0.841512
...,...
7982,0.590442
7990,0.564747
7993,0.590442
7994,0.590442


In [57]:
pred_test.reset_index(inplace=True)

In [58]:
pred_test[pred_test['unique_id']==108]

Unnamed: 0,unique_id,pred
28,108,0.791611


In [59]:
sub8 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub8

Unnamed: 0,unique_id,index,pred
0,9,0,0.823875
1,18,1,0.791611
2,21,2,0.840235
3,25,3,0.819388
4,31,4,0.841512
...,...,...,...
2375,7982,2375,0.590442
2376,7990,2376,0.564747
2377,7993,2377,0.590442
2378,7994,2378,0.590442


In [60]:
sub8['pred'] = sub8['pred'].apply(lambda x: round(x,3))

In [61]:
sub8_txt = ''
for prob in list(sub8['pred'].values):
    sub8_txt = sub8_txt+','+str(prob)
sub8_txt = sub8_txt[1:]

sub8_txt

'0.824,0.792,0.84,0.819,0.842,0.847,0.683,0.416,0.7,0.8,0.595,0.398,0.782,0.845,0.807,0.821,0.636,0.474,0.794,0.798,0.84,0.708,0.8,0.8,0.847,0.843,0.793,0.846,0.792,0.663,0.82,0.821,0.789,0.823,0.846,0.817,0.559,0.598,0.84,0.817,0.795,0.804,0.683,0.831,0.839,0.818,0.845,0.574,0.598,0.764,0.381,0.807,0.751,0.388,0.562,0.401,0.79,0.829,0.846,0.842,0.826,0.55,0.476,0.799,0.845,0.802,0.842,0.811,0.808,0.836,0.846,0.817,0.802,0.825,0.788,0.574,0.791,0.845,0.847,0.818,0.547,0.797,0.847,0.425,0.845,0.845,0.786,0.845,0.805,0.797,0.845,0.827,0.771,0.846,0.531,0.753,0.691,0.684,0.645,0.844,0.836,0.827,0.838,0.69,0.785,0.69,0.588,0.6,0.826,0.648,0.818,0.819,0.816,0.798,0.506,0.815,0.677,0.846,0.799,0.755,0.842,0.845,0.569,0.623,0.813,0.792,0.844,0.829,0.425,0.448,0.842,0.689,0.522,0.821,0.807,0.754,0.545,0.699,0.588,0.842,0.485,0.78,0.819,0.797,0.7,0.841,0.789,0.798,0.471,0.794,0.785,0.752,0.776,0.846,0.639,0.744,0.847,0.81,0.847,0.785,0.814,0.842,0.829,0.537,0.847,0.843,0.731,0.847,0.817,0.518,0

### Random Tree - 10-20

In [85]:
train_bag_df_10_20 = pd.read_csv('train_bag_df_10_20.csv')
val_bag_df_10_20 = pd.read_csv('val_bag_df_10_20.csv')
test_bag_df_10_20 = pd.read_csv('test_bag_df_10_20.csv')

In [86]:
y_train = pd.read_csv('y_train_50.csv')
y_val = pd.read_csv('y_val_50.csv')

In [87]:
train_bag_df_10_20.set_index('unique_id',inplace=True)
val_bag_df_10_20.set_index('unique_id',inplace=True)
test_bag_df_10_20.set_index('unique_id',inplace=True)
y_train.set_index('unique_id',inplace=True)
y_val.set_index('unique_id',inplace=True)

In [89]:
train_bag_df_10_20.sort_index(inplace=True)
val_bag_df_10_20.sort_index(inplace=True)
test_bag_df_10_20.sort_index(inplace=True)
y_train.sort_index(inplace=True)
y_val.sort_index(inplace=True)

In [90]:
train_bag_df_10_20.drop('Unnamed: 0',axis=1,inplace=True)
val_bag_df_10_20.drop('Unnamed: 0',axis=1,inplace=True)
test_bag_df_10_20.drop('Unnamed: 0',axis=1,inplace=True)

In [91]:
lgbm = LGBMClassifier(max_depth=2,learning_rate=0.01,n_estimators=100,min_child_samples=100,colsample_bytree=0.5)
lgbm.fit(train_bag_df_10_20,y_train)

pred_train = pd.DataFrame(lgbm.predict_proba(train_bag_df_10_20)[:,1],columns=['pred'],index=train_bag_df_10_20.index)
pred_val = pd.DataFrame(lgbm.predict_proba(val_bag_df_10_20)[:,1],columns=['pred'],index=val_bag_df_10_20.index)
pred_test = pd.DataFrame(lgbm.predict_proba(test_bag_df_10_20)[:,1],columns=['pred'],index=test_bag_df_10_20.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


In [92]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.8277067987973676
Val ROC:  0.8226310308920515


In [93]:
err_rate_df = pd.DataFrame()

for th in np.arange(0,1,0.1):

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])



In [94]:
err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
0,0.1,0.0,1.0,1.0,0.0,1.0,1.0
0,0.2,0.0,1.0,1.0,0.0,1.0,1.0
0,0.3,0.0,1.0,1.0,0.0,1.0,1.0
0,0.4,0.003399,0.952964,0.956363,0.008141,0.943152,0.951294
0,0.5,0.058124,0.645619,0.703742,0.065129,0.669251,0.73438
0,0.6,0.139361,0.449742,0.589103,0.150611,0.444444,0.595055
0,0.7,0.447995,0.068943,0.516938,0.476255,0.064599,0.540855
0,0.8,0.651937,0.011598,0.663535,0.663501,0.015504,0.679005
0,0.9,1.0,0.0,1.0,1.0,0.0,1.0


In [95]:
pred_test

Unnamed: 0_level_0,pred
unique_id,Unnamed: 1_level_1
9,0.789218
18,0.815701
21,0.792594
25,0.826026
31,0.816466
...,...
7982,0.619746
7990,0.616601
7993,0.619746
7994,0.619746


In [96]:
pred_test.reset_index(inplace=True)

In [97]:
pred_test[pred_test['unique_id']==108]

Unnamed: 0,unique_id,pred
28,108,0.715972


In [99]:
sub9 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub9

Unnamed: 0,unique_id,index,pred
0,9,0,0.789218
1,18,1,0.815701
2,21,2,0.792594
3,25,3,0.826026
4,31,4,0.816466
...,...,...,...
2375,7982,2375,0.619746
2376,7990,2376,0.616601
2377,7993,2377,0.619746
2378,7994,2378,0.619746


In [100]:
sub9['pred'] = sub9['pred'].apply(lambda x: round(x,3))

In [101]:
sub9_txt = ''
for prob in list(sub9['pred'].values):
    sub9_txt = sub9_txt+','+str(prob)
sub9_txt = sub9_txt[1:]

sub9_txt

'0.789,0.816,0.793,0.826,0.816,0.842,0.731,0.41,0.558,0.745,0.673,0.463,0.791,0.842,0.721,0.839,0.43,0.402,0.7,0.809,0.842,0.62,0.781,0.808,0.839,0.842,0.821,0.837,0.716,0.546,0.838,0.837,0.786,0.82,0.842,0.818,0.62,0.54,0.809,0.77,0.801,0.841,0.803,0.693,0.84,0.778,0.842,0.543,0.609,0.837,0.449,0.842,0.842,0.399,0.493,0.473,0.842,0.842,0.842,0.725,0.842,0.519,0.402,0.789,0.842,0.838,0.842,0.761,0.84,0.842,0.842,0.799,0.775,0.775,0.73,0.722,0.772,0.842,0.818,0.763,0.543,0.818,0.842,0.497,0.816,0.838,0.806,0.839,0.804,0.737,0.842,0.81,0.668,0.839,0.458,0.805,0.789,0.751,0.598,0.826,0.842,0.842,0.799,0.841,0.735,0.58,0.62,0.582,0.745,0.603,0.818,0.842,0.788,0.823,0.531,0.841,0.702,0.836,0.814,0.691,0.825,0.842,0.543,0.616,0.842,0.742,0.842,0.752,0.402,0.402,0.839,0.498,0.402,0.815,0.74,0.814,0.592,0.808,0.62,0.762,0.464,0.822,0.822,0.817,0.673,0.842,0.69,0.811,0.402,0.842,0.841,0.832,0.841,0.839,0.504,0.766,0.842,0.818,0.819,0.69,0.838,0.825,0.84,0.732,0.781,0.839,0.78,0.73,0.773,0.547,0

### Random Tree - 20-10

In [102]:
train_bag_df_20_10 = pd.read_csv('train_bag_df_20_10.csv')
val_bag_df_20_10 = pd.read_csv('val_bag_df_20_10.csv')
test_bag_df_20_10 = pd.read_csv('test_bag_df_20_10.csv')

In [103]:
y_train = pd.read_csv('y_train_50.csv')
y_val = pd.read_csv('y_val_50.csv')

In [104]:
train_bag_df_20_10.set_index('unique_id',inplace=True)
val_bag_df_20_10.set_index('unique_id',inplace=True)
test_bag_df_20_10.set_index('unique_id',inplace=True)
y_train.set_index('unique_id',inplace=True)
y_val.set_index('unique_id',inplace=True)

In [105]:
train_bag_df_20_10.sort_index(inplace=True)
val_bag_df_20_10.sort_index(inplace=True)
test_bag_df_20_10.sort_index(inplace=True)
y_train.sort_index(inplace=True)
y_val.sort_index(inplace=True)

In [106]:
train_bag_df_20_10.drop('Unnamed: 0',axis=1,inplace=True)
val_bag_df_20_10.drop('Unnamed: 0',axis=1,inplace=True)
test_bag_df_20_10.drop('Unnamed: 0',axis=1,inplace=True)

In [107]:
lgbm = LGBMClassifier(max_depth=2,learning_rate=0.01,n_estimators=100,min_child_samples=100,colsample_bytree=0.5)
lgbm.fit(train_bag_df_20_10,y_train)

pred_train = pd.DataFrame(lgbm.predict_proba(train_bag_df_20_10)[:,1],columns=['pred'],index=train_bag_df_20_10.index)
pred_val = pd.DataFrame(lgbm.predict_proba(val_bag_df_20_10)[:,1],columns=['pred'],index=val_bag_df_20_10.index)
pred_test = pd.DataFrame(lgbm.predict_proba(test_bag_df_20_10)[:,1],columns=['pred'],index=test_bag_df_20_10.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


In [108]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.8326132986887383
Val ROC:  0.8171825860128532


In [109]:
err_rate_df = pd.DataFrame()

for th in np.arange(0,1,0.1):

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])



In [110]:
err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
0,0.1,0.0,1.0,1.0,0.0,1.0,1.0
0,0.2,0.0,1.0,1.0,0.0,1.0,1.0
0,0.3,0.0,1.0,1.0,0.0,1.0,1.0
0,0.4,0.0,1.0,1.0,0.0,1.0,1.0
0,0.5,0.029232,0.76933,0.798562,0.037992,0.801034,0.839025
0,0.6,0.148199,0.411727,0.559925,0.16825,0.431525,0.599774
0,0.7,0.485384,0.064433,0.549817,0.50882,0.046512,0.555331
0,0.8,0.745071,0.007088,0.752159,0.75848,0.005168,0.763648
0,0.9,1.0,0.0,1.0,1.0,0.0,1.0


In [111]:
pred_test

Unnamed: 0_level_0,pred
unique_id,Unnamed: 1_level_1
9,0.812072
18,0.772804
21,0.806367
25,0.840528
31,0.812865
...,...
7982,0.624551
7990,0.603584
7993,0.619567
7994,0.619567


In [112]:
pred_test.reset_index(inplace=True)

In [113]:
pred_test[pred_test['unique_id']==108]

Unnamed: 0,unique_id,pred
28,108,0.772804


In [120]:
sub10 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub10

Unnamed: 0,unique_id,index,pred
0,9,0,0.812072
1,18,1,0.772804
2,21,2,0.806367
3,25,3,0.840528
4,31,4,0.812865
...,...,...,...
2375,7982,2375,0.624551
2376,7990,2376,0.603584
2377,7993,2377,0.619567
2378,7994,2378,0.619567


In [121]:
sub10['pred'] = sub10['pred'].apply(lambda x: round(x,3))

In [122]:
sub10['pred']

0       0.812
1       0.773
2       0.806
3       0.841
4       0.813
        ...  
2375    0.625
2376    0.604
2377    0.620
2378    0.620
2379    0.620
Name: pred, Length: 2380, dtype: float64

In [123]:
list(sub10['pred'].values)

[0.812,
 0.773,
 0.806,
 0.841,
 0.813,
 0.844,
 0.738,
 0.567,
 0.759,
 0.783,
 0.512,
 0.536,
 0.8,
 0.844,
 0.773,
 0.783,
 0.584,
 0.557,
 0.772,
 0.694,
 0.822,
 0.674,
 0.813,
 0.799,
 0.844,
 0.807,
 0.81,
 0.844,
 0.773,
 0.716,
 0.829,
 0.776,
 0.772,
 0.82,
 0.806,
 0.841,
 0.62,
 0.524,
 0.669,
 0.813,
 0.791,
 0.821,
 0.703,
 0.763,
 0.823,
 0.801,
 0.844,
 0.499,
 0.606,
 0.755,
 0.542,
 0.805,
 0.782,
 0.423,
 0.559,
 0.589,
 0.841,
 0.834,
 0.834,
 0.808,
 0.829,
 0.626,
 0.468,
 0.755,
 0.844,
 0.662,
 0.843,
 0.751,
 0.826,
 0.828,
 0.844,
 0.8,
 0.773,
 0.772,
 0.791,
 0.738,
 0.773,
 0.811,
 0.806,
 0.793,
 0.537,
 0.788,
 0.822,
 0.442,
 0.841,
 0.805,
 0.751,
 0.844,
 0.8,
 0.76,
 0.818,
 0.778,
 0.751,
 0.817,
 0.556,
 0.758,
 0.667,
 0.779,
 0.549,
 0.808,
 0.779,
 0.84,
 0.841,
 0.739,
 0.704,
 0.739,
 0.62,
 0.671,
 0.805,
 0.546,
 0.791,
 0.844,
 0.791,
 0.79,
 0.537,
 0.798,
 0.791,
 0.802,
 0.79,
 0.567,
 0.813,
 0.844,
 0.531,
 0.663,
 0.79,
 0.642,
 0.844,

In [124]:
sub10_txt = ''
for prob in list(sub10['pred'].values):
    sub10_txt = sub10_txt+','+str(prob)
sub10_txt = sub10_txt[1:]

sub10_txt

'0.812,0.773,0.806,0.841,0.813,0.844,0.738,0.567,0.759,0.783,0.512,0.536,0.8,0.844,0.773,0.783,0.584,0.557,0.772,0.694,0.822,0.674,0.813,0.799,0.844,0.807,0.81,0.844,0.773,0.716,0.829,0.776,0.772,0.82,0.806,0.841,0.62,0.524,0.669,0.813,0.791,0.821,0.703,0.763,0.823,0.801,0.844,0.499,0.606,0.755,0.542,0.805,0.782,0.423,0.559,0.589,0.841,0.834,0.834,0.808,0.829,0.626,0.468,0.755,0.844,0.662,0.843,0.751,0.826,0.828,0.844,0.8,0.773,0.772,0.791,0.738,0.773,0.811,0.806,0.793,0.537,0.788,0.822,0.442,0.841,0.805,0.751,0.844,0.8,0.76,0.818,0.778,0.751,0.817,0.556,0.758,0.667,0.779,0.549,0.808,0.779,0.84,0.841,0.739,0.704,0.739,0.62,0.671,0.805,0.546,0.791,0.844,0.791,0.79,0.537,0.798,0.791,0.802,0.79,0.567,0.813,0.844,0.531,0.663,0.79,0.642,0.844,0.812,0.438,0.47,0.829,0.657,0.502,0.819,0.774,0.777,0.611,0.812,0.615,0.772,0.532,0.795,0.772,0.759,0.671,0.824,0.753,0.764,0.474,0.826,0.835,0.832,0.83,0.812,0.654,0.63,0.775,0.754,0.814,0.733,0.82,0.8,0.841,0.593,0.817,0.834,0.73,0.841,0.773,0.528,0