In [1]:
import numpy as np
import pandas as pd
import datetime
import pickle
import itertools

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import manhattan_distances,pairwise_distances
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from metric_learn import NCA

from sklearn.metrics import roc_auc_score

In [35]:
id_test_df = pd.read_csv('test_ids_in_prediction.csv')
id_test_df

Unnamed: 0,unique_id
0,9
1,18
2,21
3,25
4,31
...,...
2375,7982
2376,7990
2377,7993
2378,7994


In [2]:
X_train = pd.read_csv('X_train_unique.csv')
X_val = pd.read_csv('X_val_unique.csv')
X_test = pd.read_csv('X_test_unique.csv')

y_train = pd.read_csv('y_train_unique.csv')
y_val = pd.read_csv('y_val_unique.csv')

In [3]:
X_train.set_index('unique_id',inplace=True)
X_val.set_index('unique_id',inplace=True)
X_test.set_index('unique_id',inplace=True)
y_train.set_index('unique_id',inplace=True)
y_val.set_index('unique_id',inplace=True)

In [4]:
X_train_experienced = X_train[X_train['experience_flag']==1]
y_train_experienced = y_train[y_train.index.isin(list(X_train_experienced.index))]
X_val_experienced = X_val[X_val['experience_flag']==1]
y_val_experienced = y_val[y_val.index.isin(list(X_val_experienced.index))]
X_test_experienced = X_test[X_test['experience_flag']==1]

X_train_cold = X_train[X_train['experience_flag']==0]
y_train_cold = y_train[y_train.index.isin(list(X_train_cold.index))]
X_val_cold = X_val[X_val['experience_flag']==0]
y_val_cold = y_val[y_val.index.isin(list(X_val_cold.index))]
X_test_cold = X_test[X_test['experience_flag']==0]

In [5]:
X_train_under = pd.read_csv('X_train_under.csv')
y_train_under = pd.read_csv('y_train_under.csv')

X_train_under.set_index('unique_id',inplace=True)
y_train_under.set_index('unique_id',inplace=True)

In [6]:
X_train_experienced_under = pd.read_csv('X_train_experienced_under.csv')
y_train_experienced_under = pd.read_csv('y_train_experienced_under.csv')

X_train_experienced_under.set_index('unique_id',inplace=True)
y_train_experienced_under.set_index('unique_id',inplace=True)

y_train_experienced_under['female_label'].sum()/y_train_experienced_under['female_label'].count()

0.5

In [7]:
X_train_cold_under = pd.read_csv('X_train_cold_under.csv')
y_train_cold_under = pd.read_csv('y_train_cold_under.csv')

X_train_cold_under.set_index('unique_id',inplace=True)
y_train_cold_under.set_index('unique_id',inplace=True)

y_train_cold_under['female_label'].sum()/y_train_cold_under['female_label'].count()

0.5

In [20]:
class coarse_class_classifier_numeric:
    

    def __init__(self, min_class_size,target_name='y',target_mean=None,coarse_class_table=None):
                    
        self.min_class_size = min_class_size
        self.target_name = target_name   
        self.target_mean = 0
        self.coarse_class_table = pd.DataFrame()

    
    def fit(self, X, y=None):
        
        self.target_mean = y.mean()
        
        for col in X.columns:
            dt = DecisionTreeClassifier(min_samples_leaf=self.min_class_size)
            dt.fit(X[X[col].isnull()==False][[col]],y[y.index.isin(X[X[col].isnull()==False].index)])

            dt_df = pd.merge(pd.DataFrame(dt.predict_proba(X[X[col].isnull()==False][[col]])[:,1],columns=['dt'],index=X[X[col].isnull()==False][[col]].index),X[X[col].isnull()==False][[col]],how='left',left_index=True,right_index=True)
            dt_df = pd.merge(dt_df, y[y.index.isin(X[X[col].isnull()==False].index)], how='left',left_index=True,right_index=True)

            roc = roc_auc_score(dt_df[self.target_name],dt_df['dt'])
            
            dt_df = dt_df.groupby('dt').agg({col:[min,max,'count'],self.target_name:np.mean})
            dt_df.columns = ['class_min','class_max','class_count','class_target_mean']
            dt_df = pd.concat([dt_df,pd.DataFrame({'class_min':[np.nan],'class_max':[np.nan],'class_count':[X[X[col].isnull()][[col]].shape[0]],'class_target_mean':[y[y.index.isin(X[X[col].isnull()].index)].mean()[0]]})])
            dt_df.reset_index(inplace=True)
            dt_df['class_no'] = np.where(dt_df['class_min'].isnull(),'nulls',dt_df.index + 1)
            dt_df['class_percent'] = dt_df['class_count'] / X.shape[0]
            dt_df['class_target_deviation'] = (dt_df['class_target_mean'] - self.target_mean[0]) / self.target_mean[0]
            dt_df['variable'] = col
            dt_df['roc'] = roc
            dt_df.drop('index',axis=1,inplace=True)
            self.coarse_class_table = pd.concat([self.coarse_class_table,dt_df])
            del dt_df

        return self
    
    def transform(self, X, y = None):

        base_df = X.drop(X.columns,axis=1)
        for col in X.columns:
            #print(col)
            cc_table = self.coarse_class_table[self.coarse_class_table['variable']==col]
            nulls_count = cc_table[cc_table['class_no']=='nulls']['class_count'].values[0]
            nulls_target_mean = cc_table[cc_table['class_no']=='nulls']['class_target_mean'].values[0]
            cc_table = cc_table[cc_table['class_no']!='nulls']
            nulls_target_mean = float(np.where(nulls_count>=self.min_class_size,nulls_target_mean,self.target_mean))
        
            x_df = X[[col]]
            x_df['Z_'+col] = np.nan
            x_df['Z_'+col] = np.where(x_df[col].isnull(),nulls_target_mean,x_df['Z_'+col])

            lenn = cc_table.shape[0]
            for cc_ind in range(cc_table['class_no'].astype(int).min(),cc_table['class_no'].astype(int).max()+1):
                maxx = cc_table[cc_table['class_no']==str(cc_ind)]['class_max'].values[0]
                meann = cc_table[cc_table['class_no']==str(cc_ind)]['class_target_mean'].values[0]
                x_df['Z_'+col] = np.where((x_df['Z_'+col].isnull()),meann,x_df['Z_'+col])

            #if cc_table[cc_table['class_no']==str(cc_ind+1)].shape[0]>1:
            #    meann = cc_table[cc_table['class_no']==str(cc_ind+1)]['class_target_mean'].values[0]    
            #    x_df['Z_'+col] = np.where((x_df['Z_'+col].isnull())&(x_df[col]>maxx),meann,x_df['Z_'+col])
#
            x_df.drop(col,axis=1,inplace=True)

            base_df = pd.merge(base_df,x_df,how='left',right_index=True,left_index=True)
            del x_df

        return base_df
    
    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)
    
    def get_coarse_class_table(self):
        return self.coarse_class_table

In [21]:
cc = coarse_class_classifier_numeric(min_class_size=50,target_name='female_label')
cc.fit(X_train,y_train)

<__main__.coarse_class_classifier_numeric at 0x19e45954820>

In [22]:
X_train_cc = cc.transform(X_train)
X_val_cc = cc.transform(X_val)
X_test_cc = cc.transform(X_test)

X_train_under_cc = cc.transform(X_train_under)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_df['Z_'+col] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_df['Z_'+col] = np.where(x_df[col].isnull(),nulls_target_mean,x_df['Z_'+col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_df['Z_'+col] = np.where((x_df['Z_'+col].isnull()),meann,x_df['Z_'+col])
A value is trying to be set 

In [23]:
X_val_cc.isnull().any()

Z_sellingprice_count_basket      False
Z_sellingprice_count_favorite    False
Z_sellingprice_count_order       False
Z_sellingprice_count_search      False
Z_sellingprice_count_visit       False
                                 ...  
Z_time_btw_two_orders_mean       False
Z_time_btw_two_orders_max        False
Z_time_btw_two_orders_min        False
Z_transaction_count              False
Z_experience_flag                False
Length: 1539, dtype: bool

In [24]:
lr = LogisticRegression(penalty='l1',C=1,solver='saga')

In [25]:
lr.fit(X_train_cc,y_train)

  return f(*args, **kwargs)


LogisticRegression(C=1, penalty='l1', solver='saga')

In [26]:
pred_train = pd.DataFrame(lr.predict_proba(X_train_cc)[:,1],columns=['pred'],index=X_train_cc.index)
pred_val = pd.DataFrame(lr.predict_proba(X_val_cc)[:,1],columns=['pred'],index=X_val_cc.index)
pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

pred_train['pred_binary'] = np.where(pred_train['pred']>0.5,1,0)
pred_val['pred_binary'] = np.where(pred_val['pred']>0.5,1,0)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
err_rate_train = err_rate1_train+err_rate0_train

err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
err_rate_val = err_rate1_val+err_rate0_val

In [27]:
roc_val

0.8587015591527913

In [28]:
err_rate_val

0.48898215055799227

In [33]:
pd.DataFrame(lr.coef_.T,columns=['coefficient'],index=X_train_cc.columns).sort_values(by='coefficient',ascending=False)

Unnamed: 0,coefficient
Z_sellingprice_count_TRENDYOLMİLLA,0.736971
Z_time_btw_basketand_order_max,0.691350
Z_sellingprice_count_PL Woman,0.584203
Z_sellingprice_median_Elektronik,0.571614
Z_sellingprice_sum_Erkek,0.564160
...,...
Z_hourly_favorite_secs_btw_consecutives_median,-0.574231
Z_daily_favorite_secs_btw_consecutives_median,-0.590596
Z_sellingprice_median_Kadın,-0.797179
Z_sellingprice_min_Kadın,-1.023228


In [34]:
pred_test = pd.DataFrame(lr.predict_proba(X_test_cc)[:,1],columns=['pred'],index=X_test_cc.index)

* Submission 22

In [36]:
sub22 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub22['pred'] = sub22['pred'].apply(lambda x: round(x,3))
sub22_txt = ''
for prob in list(sub22['pred'].values):
    sub22_txt = sub22_txt+','+str(prob)
sub22_txt = sub22_txt[1:]

sub22_txt

'0.984,0.964,0.976,0.984,0.982,0.993,0.654,0.573,0.858,0.955,0.448,0.127,0.932,0.989,0.984,0.989,0.326,0.125,0.979,0.918,0.989,0.964,0.976,0.977,0.985,0.997,0.898,0.984,0.933,0.955,0.99,0.946,0.948,0.942,0.956,0.988,0.73,0.72,0.945,0.969,0.955,0.777,0.978,0.982,0.989,0.966,0.98,0.596,0.511,0.188,0.298,0.991,0.892,0.049,0.202,0.072,0.951,0.948,0.988,0.979,0.994,0.725,0.141,0.979,0.995,0.649,0.969,0.946,0.932,0.992,0.981,0.959,0.91,0.965,0.836,0.749,0.979,0.992,0.967,0.903,0.444,0.855,0.996,0.193,0.977,0.966,0.748,0.994,0.972,0.961,0.992,0.971,0.788,0.98,0.485,0.872,0.859,0.881,0.598,0.969,0.955,0.989,0.995,0.918,0.579,0.568,0.647,0.753,0.929,0.62,0.949,0.984,0.973,0.988,0.819,0.982,0.966,0.957,0.98,0.812,0.991,0.996,0.556,0.907,0.994,0.904,0.972,0.986,0.16,0.082,0.986,0.639,0.095,0.976,0.93,0.986,0.648,0.983,0.802,0.995,0.258,0.772,0.948,0.969,0.592,0.951,0.972,0.956,0.262,0.912,0.996,0.974,0.975,0.99,0.515,0.902,0.977,0.924,0.982,0.912,0.961,0.974,0.951,0.739,0.994,0.957,0.728,0.943,0.

* Submission 22

In [37]:
rf = XGBClassifier(max_depth=10,criterion='entropy',n_estimators=100,min_child_samples=20,colsample_bytree=0.8)
rf.fit(X_train_cc,y_train)

pred_train = pd.DataFrame(rf.predict_proba(X_train_cc)[:,1],columns=['pred'],index=X_train_cc.index)
pred_val = pd.DataFrame(rf.predict_proba(X_val_cc)[:,1],columns=['pred'],index=X_val_cc.index)
pred_test = pd.DataFrame(rf.predict_proba(X_test_cc)[:,1],columns=['pred'],index=X_test_cc.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


Parameters: { "criterion", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [38]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.999999890494579
Val ROC:  0.838832616340426


In [39]:
sub23 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub23['pred'] = sub23['pred'].apply(lambda x: round(x,3))
sub23_txt = ''
for prob in list(sub23['pred'].values):
    sub23_txt = sub23_txt+','+str(prob)
sub23_txt = sub23_txt[1:]

sub23_txt

'0.999,0.998,0.98,1.0,0.997,1.0,0.958,0.55,0.977,0.997,0.141,0.041,0.995,0.999,0.998,0.997,0.506,0.005,0.999,0.997,1.0,0.987,0.999,0.989,0.999,0.999,0.99,1.0,0.981,0.995,1.0,0.98,0.997,0.998,0.999,1.0,0.958,0.966,0.992,0.999,0.995,0.972,0.997,0.998,0.999,0.994,1.0,0.212,0.511,0.049,0.19,1.0,0.996,0.0,0.018,0.006,0.993,0.999,1.0,0.997,1.0,0.935,0.068,0.998,0.999,0.905,0.999,0.994,0.967,1.0,0.998,0.999,0.984,0.994,0.913,0.968,0.987,1.0,1.0,0.977,0.827,0.927,1.0,0.087,0.997,0.998,0.911,0.999,0.998,0.984,1.0,0.998,0.741,1.0,0.662,0.941,0.997,0.982,0.901,0.999,0.995,1.0,0.999,0.998,0.231,0.159,0.769,0.968,0.996,0.563,0.996,0.997,1.0,0.999,0.992,1.0,0.955,0.999,0.999,0.979,0.999,1.0,0.186,0.918,1.0,0.99,0.998,0.999,0.004,0.048,0.999,0.699,0.025,0.995,0.954,0.998,0.888,0.999,0.999,0.999,0.003,0.736,0.998,0.998,0.886,0.999,0.999,0.984,0.012,0.947,1.0,0.997,0.999,1.0,0.32,0.997,0.999,0.94,1.0,0.932,1.0,0.999,1.0,0.79,0.999,0.999,0.71,0.992,0.999,0.91,1.0,0.89,0.979,1.0,1.0,0.993,0.997,0.93,0.06

In [50]:
imp_df = pd.DataFrame(rf.feature_importances_.T,columns=['importance'],index=X_train_cc.columns).sort_values(by='importance',ascending=False)
var_list = list(imp_df[imp_df['importance']>0].index)

* LR trials

In [54]:
lr = LogisticRegression(penalty='l1',C=1000,solver='saga')
lr.fit(X_train_cc[var_list],y_train)
pred_train = pd.DataFrame(lr.predict_proba(X_train_cc[var_list])[:,1],columns=['pred'],index=X_train_cc.index)
pred_val = pd.DataFrame(lr.predict_proba(X_val_cc[var_list])[:,1],columns=['pred'],index=X_val_cc.index)
pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

pred_train['pred_binary'] = np.where(pred_train['pred']>0.5,1,0)
pred_val['pred_binary'] = np.where(pred_val['pred']>0.5,1,0)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
err_rate_train = err_rate1_train+err_rate0_train

err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
err_rate_val = err_rate1_val+err_rate0_val

print(roc_val)
print(err_rate_val)
pd.DataFrame(lr.coef_.T,columns=['coefficient'],index=X_train_cc[var_list].columns).sort_values(by='coefficient',ascending=False)

  return f(*args, **kwargs)


0.8547291730214328
0.5073294556113022




Unnamed: 0,coefficient
Z_time_btw_basketand_order_max,1.529302
Z_time_btw_basketand_order_mean,1.357291
Z_sellingprice_count_LC Waikiki,1.123619
Z_sellingprice_count_Ev Tekstil,1.065566
Z_hour_sellingprice_mean_4,1.009217
...,...
Z_sellingprice_median_Kadın,-1.035228
Z_sellingprice_count_Çorap,-1.061797
Z_sellingprice_sum_Apple,-1.109767
Z_sellingprice_min_Kadın,-1.256233


In [55]:
pred_test = pd.DataFrame(lr.predict_proba(X_test_cc[var_list])[:,1],columns=['pred'],index=X_test_cc.index)

* Submission 24

In [56]:
sub24 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub24['pred'] = sub24['pred'].apply(lambda x: round(x,3))
sub24_txt = ''
for prob in list(sub24['pred'].values):
    sub24_txt = sub24_txt+','+str(prob)
sub24_txt = sub24_txt[1:]

sub24_txt

'0.993,0.971,0.986,0.989,0.989,0.997,0.688,0.581,0.928,0.978,0.353,0.082,0.958,0.993,0.99,0.994,0.276,0.071,0.991,0.925,0.994,0.979,0.982,0.979,0.993,0.999,0.907,0.994,0.949,0.975,0.998,0.938,0.975,0.973,0.972,0.995,0.816,0.812,0.962,0.98,0.977,0.751,0.987,0.979,0.991,0.971,0.988,0.591,0.467,0.139,0.317,0.996,0.919,0.025,0.185,0.061,0.964,0.975,0.992,0.988,0.997,0.817,0.082,0.987,0.997,0.524,0.981,0.96,0.925,0.993,0.988,0.987,0.964,0.987,0.849,0.672,0.988,0.997,0.981,0.922,0.428,0.828,0.998,0.131,0.99,0.974,0.666,0.997,0.984,0.978,0.996,0.985,0.844,0.984,0.41,0.823,0.891,0.9,0.507,0.984,0.966,0.995,0.997,0.947,0.526,0.601,0.691,0.73,0.939,0.605,0.971,0.992,0.989,0.994,0.94,0.99,0.975,0.979,0.984,0.836,0.996,0.999,0.553,0.959,0.997,0.929,0.974,0.992,0.107,0.041,0.996,0.721,0.055,0.987,0.97,0.995,0.631,0.993,0.815,0.999,0.184,0.785,0.967,0.982,0.544,0.935,0.986,0.968,0.273,0.94,0.998,0.986,0.979,0.995,0.626,0.905,0.988,0.939,0.986,0.919,0.979,0.984,0.942,0.698,0.997,0.972,0.731,0.965,0.9

### Correlation Analysis

In [83]:
uni_roc = cc.get_coarse_class_table()[['variable','roc']].drop_duplicates().sort_values(by = 'roc',ascending=False)

In [84]:
uni_roc['variable'] = uni_roc['variable'].apply(lambda x: 'Z_'+x)
uni_roc.set_index('variable',inplace=True)

In [85]:
uni_roc

Unnamed: 0_level_0,roc
variable,Unnamed: 1_level_1
Z_sellingprice_mean_Oysho,0.827269
Z_sellingprice_median_Oysho,0.826220
Z_sellingprice_count_Kadın,0.788881
Z_sellingprice_median_Pantolon,0.788565
Z_sellingprice_median_529.0,0.788565
...,...
Z_daily_order_secs_btw_consecutives_min,0.529611
Z_hourly_order_secs_btw_first_last,0.527055
Z_hourly_order_secs_btw_consecutives_min,0.524916
Z_sellingprice_max_Terapi Men,0.519928


In [73]:
corr_df = X_train_cc.corr(method='spearman')

In [74]:
corr_df

Unnamed: 0,Z_sellingprice_count_basket,Z_sellingprice_count_favorite,Z_sellingprice_count_order,Z_sellingprice_count_search,Z_sellingprice_count_visit,Z_sellingprice_sum_basket,Z_sellingprice_sum_favorite,Z_sellingprice_sum_order,Z_sellingprice_sum_search,Z_sellingprice_sum_visit,...,Z_favorite_cnt_before_order_12hour,Z_favorite_cnt_before_order_24hour,Z_time_btw_basketand_order_mean,Z_time_btw_basketand_order_max,Z_time_btw_basketand_order_min,Z_time_btw_two_orders_mean,Z_time_btw_two_orders_max,Z_time_btw_two_orders_min,Z_transaction_count,Z_experience_flag
Z_sellingprice_count_basket,1.000000,-0.270760,0.489683,0.415717,0.294884,1.000000,0.270760,0.489683,0.415717,0.294884,...,-0.234207,-0.249585,0.573644,0.573644,0.573644,0.440678,0.440678,0.440678,,
Z_sellingprice_count_favorite,-0.270760,1.000000,-0.238396,-0.214980,-0.099177,-0.270760,-1.000000,-0.238396,-0.214980,-0.099177,...,0.376103,0.399822,-0.283978,-0.283978,-0.283978,-0.300478,-0.300478,-0.300478,,
Z_sellingprice_count_order,0.489683,-0.238396,1.000000,0.194701,0.105027,0.489683,0.238396,1.000000,0.194701,0.105027,...,-0.388591,-0.413097,0.928729,0.928729,0.928729,0.794102,0.794102,0.794102,,
Z_sellingprice_count_search,0.415717,-0.214980,0.194701,1.000000,0.364051,0.415717,0.214980,0.194701,1.000000,0.364051,...,-0.133130,-0.141967,0.276398,0.276398,0.276398,0.214505,0.214505,0.214505,,
Z_sellingprice_count_visit,0.294884,-0.099177,0.105027,0.364051,1.000000,0.294884,0.099177,0.105027,0.364051,1.000000,...,-0.107404,-0.114177,0.205723,0.205723,0.205723,0.149270,0.149270,0.149270,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Z_time_btw_two_orders_mean,0.440678,-0.300478,0.794102,0.214505,0.149270,0.440678,0.300478,0.794102,0.214505,0.149270,...,-0.446959,-0.472197,0.792685,0.792685,0.792685,1.000000,1.000000,1.000000,,
Z_time_btw_two_orders_max,0.440678,-0.300478,0.794102,0.214505,0.149270,0.440678,0.300478,0.794102,0.214505,0.149270,...,-0.446959,-0.472197,0.792685,0.792685,0.792685,1.000000,1.000000,1.000000,,
Z_time_btw_two_orders_min,0.440678,-0.300478,0.794102,0.214505,0.149270,0.440678,0.300478,0.794102,0.214505,0.149270,...,-0.446959,-0.472197,0.792685,0.792685,0.792685,1.000000,1.000000,1.000000,,
Z_transaction_count,,,,,,,,,,,...,,,,,,,,,,


In [75]:
corr_df.iloc[0,0]

1.0

In [106]:
keep_list = []

In [107]:
for col in corr_df.columns:

    tmp_df = corr_df[[col]]
    tmp_df = tmp_df[(tmp_df[col]>0.9)]

    if tmp_df.shape[0]>1:
        tmp_df = pd.merge(tmp_df,uni_roc,how='left',left_index=True,right_index=True)
        tmp_df = tmp_df.sort_values(by='roc',ascending = False)

        keep_list = keep_list + list(tmp_df.head(1).index)
    else:
        keep_list = keep_list + [col]

In [108]:
len(keep_list)

1539

In [117]:
variable_list = list(pd.DataFrame(keep_list,columns=['variable_name']).drop_duplicates()['variable_name'])

* LR trials

In [123]:
lr = LogisticRegression(penalty='l1',C=10,solver='saga')
lr.fit(X_train_cc[variable_list],y_train)
pred_train = pd.DataFrame(lr.predict_proba(X_train_cc[variable_list])[:,1],columns=['pred'],index=X_train_cc.index)
pred_val = pd.DataFrame(lr.predict_proba(X_val_cc[variable_list])[:,1],columns=['pred'],index=X_val_cc.index)
pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

pred_train['pred_binary'] = np.where(pred_train['pred']>0.5,1,0)
pred_val['pred_binary'] = np.where(pred_val['pred']>0.5,1,0)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
err_rate_train = err_rate1_train+err_rate0_train

err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
err_rate_val = err_rate1_val+err_rate0_val

print(roc_val)
print(err_rate_val)
pd.DataFrame(lr.coef_.T,columns=['coefficient'],index=X_train_cc[variable_list].columns).sort_values(by='coefficient',ascending=False)

  return f(*args, **kwargs)


0.8539578359085476
0.5101728846956199




Unnamed: 0,coefficient
Z_sellingprice_sum_PL Woman,3.897415
Z_sellingprice_median_Makyaj_x,3.508219
Z_time_btw_basketand_order_max,3.334961
Z_sellingprice_mean_SOHO,3.289930
Z_sellingprice_max_Saat,2.358966
...,...
Z_sellingprice_sum_Ev & Mobilya,-1.986683
Z_sellingprice_median_Apple,-2.211903
Z_sellingprice_mean_Elektrikli Ev Aletleri,-2.357573
Z_weekly_favorite_secs_btw_consecutives_max,-2.952093


In [124]:
pred_test = pd.DataFrame(lr.predict_proba(X_test_cc[variable_list])[:,1],columns=['pred'],index=X_test_cc.index)

In [125]:
sub25 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub25['pred'] = sub25['pred'].apply(lambda x: round(x,3))
sub25_txt = ''
for prob in list(sub25['pred'].values):
    sub25_txt = sub25_txt+','+str(prob)
sub25_txt = sub25_txt[1:]

sub25_txt

'0.992,0.973,0.987,0.988,0.989,0.997,0.537,0.565,0.925,0.971,0.37,0.104,0.964,0.993,0.99,0.995,0.303,0.093,0.991,0.931,0.993,0.977,0.984,0.982,0.993,0.999,0.909,0.995,0.951,0.976,0.998,0.95,0.972,0.962,0.983,0.994,0.796,0.794,0.961,0.974,0.981,0.638,0.985,0.976,0.992,0.976,0.984,0.657,0.464,0.159,0.208,0.994,0.892,0.029,0.178,0.057,0.958,0.982,0.992,0.989,0.998,0.813,0.092,0.987,0.996,0.568,0.974,0.961,0.94,0.995,0.989,0.979,0.947,0.99,0.845,0.655,0.991,0.997,0.977,0.924,0.516,0.733,0.998,0.169,0.989,0.983,0.721,0.997,0.978,0.983,0.997,0.983,0.884,0.989,0.415,0.843,0.92,0.899,0.628,0.986,0.97,0.995,0.997,0.93,0.508,0.649,0.656,0.783,0.934,0.575,0.979,0.987,0.99,0.995,0.875,0.988,0.975,0.974,0.983,0.785,0.994,0.999,0.562,0.952,0.998,0.887,0.964,0.994,0.128,0.052,0.996,0.705,0.054,0.983,0.968,0.994,0.6,0.99,0.787,0.999,0.217,0.753,0.958,0.979,0.528,0.95,0.986,0.96,0.22,0.95,0.998,0.989,0.977,0.993,0.618,0.896,0.991,0.943,0.989,0.887,0.974,0.984,0.95,0.689,0.997,0.968,0.785,0.948,0.97,0.2

* Submission 26

In [130]:
rf = XGBClassifier(max_depth=10,criterion='entropy',n_estimators=100,min_child_samples=20,colsample_bytree=0.8)
rf.fit(X_train_cc[variable_list],y_train)

pred_train = pd.DataFrame(rf.predict_proba(X_train_cc[variable_list])[:,1],columns=['pred'],index=X_train_cc.index)
pred_val = pd.DataFrame(rf.predict_proba(X_val_cc[variable_list])[:,1],columns=['pred'],index=X_val_cc.index)
pred_test = pd.DataFrame(rf.predict_proba(X_test_cc[variable_list])[:,1],columns=['pred'],index=X_test_cc.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


Parameters: { "criterion", "min_child_samples" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [131]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.999999890494579
Val ROC:  0.8507392565011447


In [132]:
sub26 = pd.merge(id_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub26['pred'] = sub26['pred'].apply(lambda x: round(x,3))
sub26_txt = ''
for prob in list(sub26['pred'].values):
    sub26_txt = sub26_txt+','+str(prob)
sub26_txt = sub26_txt[1:]

sub26_txt

'1.0,0.998,0.985,1.0,0.994,1.0,0.918,0.345,0.949,0.997,0.039,0.041,0.985,1.0,0.996,0.999,0.57,0.006,1.0,0.999,1.0,0.995,0.998,0.923,0.998,1.0,0.99,1.0,0.99,0.986,1.0,0.982,0.998,0.998,1.0,1.0,0.908,0.952,0.999,0.998,0.994,0.951,0.999,1.0,0.999,0.976,1.0,0.542,0.24,0.035,0.157,1.0,0.999,0.001,0.044,0.007,0.993,0.999,1.0,0.998,1.0,0.867,0.025,0.998,0.999,0.938,0.999,0.989,0.954,1.0,0.999,0.997,0.986,0.993,0.965,0.861,0.997,1.0,1.0,0.967,0.544,0.833,1.0,0.1,0.995,0.997,0.841,0.998,0.999,0.982,1.0,0.998,0.762,0.999,0.032,0.946,0.992,0.997,0.068,1.0,0.999,1.0,1.0,0.998,0.584,0.325,0.884,0.943,0.996,0.145,0.995,0.999,1.0,0.998,0.918,1.0,0.918,0.999,0.991,0.985,0.998,1.0,0.074,0.963,1.0,0.946,0.998,1.0,0.004,0.109,0.998,0.768,0.008,0.999,0.931,1.0,0.689,0.999,0.996,0.997,0.009,0.745,0.999,0.999,0.572,0.994,0.999,0.978,0.096,0.973,1.0,0.997,0.999,1.0,0.326,0.996,0.997,0.948,0.999,0.929,0.999,0.998,0.999,0.935,1.0,0.998,0.899,0.988,0.999,0.821,1.0,0.771,0.78,1.0,1.0,0.998,0.997,0.999,0.109,0.96

In [136]:
pd.DataFrame(variable_list).to_csv('variable_list.csv')

In [137]:
X_train_cc.to_csv('X_train_cc.csv')
X_val_cc.to_csv('X_val_cc.csv')
X_test_cc.to_csv('X_test_cc.csv')