### Import packages

In [1]:
import numpy as np
import pandas as pd
import datetime
import pickle
import itertools

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import manhattan_distances,pairwise_distances
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

from metric_learn import NCA

from sklearn.metrics import roc_auc_score

### Read data

In [2]:
X_train = pd.read_csv('X_train_unique.csv')
X_val = pd.read_csv('X_val_unique.csv')
X_test = pd.read_csv('X_test_unique.csv')

y_train = pd.read_csv('y_train_unique.csv')
y_val = pd.read_csv('y_val_unique.csv')

In [3]:
X_train.set_index('unique_id',inplace=True)
X_val.set_index('unique_id',inplace=True)
X_test.set_index('unique_id',inplace=True)
y_train.set_index('unique_id',inplace=True)
y_val.set_index('unique_id',inplace=True)

In [10]:
X_test.shape

(2380, 1539)

In [4]:
raw_test_df = pd.read_csv('test.csv')

In [5]:
raw_test_df.shape

(2324814, 19)

In [6]:
raw_test_df = raw_test_df[['unique_id']].drop_duplicates()

In [7]:
raw_test_df.shape

(2380, 1)

### Submission 1

In [8]:
lgbm = LGBMClassifier(max_depth=2,learning_rate=0.01,n_estimators=100,min_child_samples=100,colsample_bytree=0.5)
lgbm.fit(X_train,y_train)

pred_train = pd.DataFrame(lgbm.predict_proba(X_train)[:,1],columns=['pred'],index=X_train.index)
pred_val = pd.DataFrame(lgbm.predict_proba(X_val)[:,1],columns=['pred'],index=X_val.index)
pred_test = pd.DataFrame(lgbm.predict_proba(X_test)[:,1],columns=['pred'],index=X_test.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


In [9]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.857003441098348
Val ROC:  0.8480658721894403


In [11]:
err_rate_df = pd.DataFrame()

for th in np.arange(0,1,0.1):

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])



In [12]:
err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
0,0.1,0.0,1.0,1.0,0.0,1.0,1.0
0,0.2,0.0,1.0,1.0,0.0,1.0,1.0
0,0.3,0.0,1.0,1.0,0.0,1.0,1.0
0,0.4,0.002719,0.970361,0.97308,0.002714,0.976744,0.979458
0,0.5,0.078858,0.552191,0.631049,0.082768,0.576227,0.658995
0,0.6,0.133923,0.38982,0.523742,0.143826,0.413437,0.557263
0,0.7,0.414684,0.063789,0.478473,0.443691,0.046512,0.490202
0,0.8,0.619307,0.016108,0.635415,0.658073,0.02584,0.683913
0,0.9,1.0,0.0,1.0,1.0,0.0,1.0


In [42]:
pred_test

Unnamed: 0_level_0,pred
unique_id,Unnamed: 1_level_1
108,0.722639
2724,0.798873
717,0.760979
578,0.828495
147,0.726179
...,...
6191,0.633658
6121,0.632104
3245,0.596183
6640,0.627353


In [43]:
pred_test.reset_index(inplace=True)

In [45]:
pred_test[pred_test['unique_id']==108]

Unnamed: 0,unique_id,pred
0,108,0.722639


In [46]:
sub1 = pd.merge(raw_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub1

Unnamed: 0,unique_id,index,pred
0,108,0,0.722639
1,2724,1,0.798873
2,717,2,0.760979
3,578,3,0.828495
4,147,4,0.726179
...,...,...,...
2375,6191,2375,0.633658
2376,6121,2376,0.632104
2377,3245,2377,0.596183
2378,6640,2378,0.627353


In [58]:
sub1['pred'] = sub1['pred'].apply(lambda x: round(x,3))

In [59]:
sub1_txt = ''
for prob in list(sub1['pred'].values):
    sub1_txt = sub1_txt+','+str(prob)
sub1_txt = sub1_txt[1:]

sub1_txt

'0.723,0.799,0.761,0.828,0.726,0.841,0.841,0.748,0.713,0.839,0.439,0.632,0.521,0.632,0.645,0.654,0.752,0.768,0.787,0.839,0.654,0.838,0.841,0.759,0.838,0.447,0.426,0.704,0.726,0.624,0.743,0.749,0.668,0.841,0.759,0.84,0.839,0.66,0.733,0.737,0.841,0.537,0.515,0.839,0.841,0.718,0.748,0.765,0.471,0.626,0.766,0.838,0.513,0.841,0.452,0.832,0.761,0.525,0.841,0.841,0.41,0.578,0.407,0.825,0.515,0.752,0.499,0.685,0.834,0.634,0.737,0.65,0.684,0.439,0.704,0.841,0.604,0.839,0.807,0.838,0.634,0.812,0.624,0.627,0.645,0.45,0.618,0.634,0.41,0.841,0.825,0.599,0.758,0.754,0.816,0.757,0.677,0.536,0.665,0.822,0.839,0.718,0.71,0.841,0.513,0.675,0.807,0.673,0.841,0.632,0.834,0.841,0.712,0.663,0.726,0.839,0.834,0.632,0.754,0.596,0.64,0.565,0.455,0.562,0.731,0.841,0.762,0.834,0.634,0.709,0.811,0.736,0.819,0.759,0.677,0.839,0.833,0.627,0.84,0.838,0.679,0.645,0.841,0.634,0.739,0.732,0.841,0.74,0.746,0.689,0.841,0.617,0.694,0.743,0.488,0.795,0.636,0.841,0.444,0.632,0.841,0.832,0.749,0.696,0.837,0.841,0.44,0.759,0.

In [76]:
sub1.sort_values(by='unique_id')

Unnamed: 0,unique_id,index,pred
612,9,612,0.738
203,18,203,0.750
186,21,186,0.729
92,25,92,0.758
39,31,39,0.737
...,...,...,...
1266,7982,1266,0.634
1222,7990,1222,0.510
1610,7993,1610,0.634
1559,7994,1559,0.632


* Right order:

In [77]:
sub1_txt = ''
for prob in list(sub1.sort_values(by='unique_id')['pred'].values):
    sub1_txt = sub1_txt+','+str(prob)
sub1_txt = sub1_txt[1:]

sub1_txt

'0.738,0.75,0.729,0.758,0.737,0.841,0.675,0.409,0.698,0.701,0.593,0.535,0.807,0.841,0.708,0.746,0.617,0.426,0.71,0.832,0.822,0.762,0.761,0.738,0.78,0.759,0.817,0.813,0.723,0.692,0.791,0.733,0.712,0.817,0.84,0.76,0.644,0.698,0.841,0.731,0.726,0.819,0.834,0.8,0.813,0.785,0.838,0.674,0.743,0.832,0.41,0.841,0.779,0.416,0.433,0.405,0.839,0.806,0.841,0.744,0.787,0.807,0.562,0.759,0.841,0.748,0.837,0.759,0.759,0.841,0.761,0.736,0.787,0.707,0.75,0.832,0.733,0.834,0.743,0.749,0.457,0.714,0.84,0.433,0.754,0.761,0.819,0.814,0.738,0.719,0.841,0.718,0.726,0.807,0.696,0.839,0.839,0.823,0.58,0.759,0.757,0.757,0.79,0.841,0.713,0.513,0.632,0.499,0.715,0.595,0.734,0.841,0.768,0.829,0.652,0.839,0.735,0.834,0.819,0.841,0.839,0.841,0.481,0.714,0.749,0.839,0.836,0.738,0.41,0.409,0.818,0.692,0.399,0.776,0.712,0.768,0.714,0.841,0.65,0.747,0.491,0.752,0.709,0.677,0.774,0.837,0.675,0.715,0.555,0.838,0.838,0.821,0.841,0.837,0.63,0.841,0.748,0.737,0.766,0.675,0.822,0.732,0.839,0.736,0.768,0.822,0.739,0.814,0.701,

### Preprocessings

* Imputation

In [60]:
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)

X_train_imp = pd.DataFrame(imputer.transform(X_train),columns=X_train.columns,index=X_train.index)
X_val_imp = pd.DataFrame(imputer.transform(X_val),columns=X_val.columns,index=X_val.index)
X_test_imp = pd.DataFrame(imputer.transform(X_test),columns=X_test.columns,index=X_test.index)

* Min-max scaler

In [61]:
mmscaler = MinMaxScaler()
mmscaler.fit(X_train_imp)

X_train_scaled = pd.DataFrame(mmscaler.transform(X_train_imp),columns=X_train_imp.columns,index=X_train_imp.index)
X_val_scaled = pd.DataFrame(mmscaler.transform(X_val_imp),columns=X_val_imp.columns,index=X_val_imp.index)
X_test_scaled = pd.DataFrame(mmscaler.transform(X_test_imp),columns=X_test_imp.columns,index=X_test_imp.index)

### Submission 2

In [63]:
knn = KNeighborsClassifier(n_neighbors=20,weights='uniform')
knn.fit(X_train_scaled,y_train)

pred_train = pd.DataFrame(knn.predict(X_train_scaled),columns=['pred'],index=X_train.index)
pred_val = pd.DataFrame(knn.predict(X_val_scaled),columns=['pred'],index=X_val.index)
pred_test = pd.DataFrame(knn.predict(X_test_scaled),columns=['pred'],index=X_test.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return self._fit(X, y)


In [64]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.6366143639574733
Val ROC:  0.5936508437376191


In [67]:
pred_train.dtypes

pred            int64
female_label    int64
dtype: object

In [68]:
err_rate_df = pd.DataFrame()

for th in [1]:

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred'].sum()/pred_train[pred_train['female_label']==1]['pred'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred'].sum()/pred_train[pred_train['female_label']==0]['pred'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred'].sum()/pred_val[pred_val['female_label']==1]['pred'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred'].sum()/pred_val[pred_val['female_label']==0]['pred'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])



In [69]:
err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,1,0.147519,0.579253,0.726771,0.189959,0.622739,0.812698


In [71]:
pred_test.reset_index(inplace=True)

In [72]:
pred_test[pred_test['unique_id']==108]

Unnamed: 0,unique_id,pred
0,108,1


In [73]:
sub2 = pd.merge(raw_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub2

Unnamed: 0,unique_id,index,pred
0,108,0,1
1,2724,1,1
2,717,2,1
3,578,3,1
4,147,4,1
...,...,...,...
2375,6191,2375,1
2376,6121,2376,1
2377,3245,2377,1
2378,6640,2378,1


In [74]:
sub2['pred'] = sub2['pred'].apply(lambda x: round(x,3))

In [75]:
sub2_txt = ''
for prob in list(sub2['pred'].values):
    sub2_txt = sub2_txt+','+str(prob)
sub2_txt = sub2_txt[1:]

sub2_txt

'1,1,1,1,1,1,1,1,1,1,0,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,0,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,0,1,1,1,0,1,1,1,1,0,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,0,1,1,1,0,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,0,1,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,1,0,1,1,1,1,1,1,0,0,1,1,1,1,1,1,0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,0,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,1,0,1,1,1,1,1,0,1,1,0,1,1,1,1,1

* Right order

In [79]:
sub2_txt = ''
for prob in list(sub2.sort_values(by='unique_id')['pred'].values):
    sub2_txt = sub2_txt+','+str(prob)
sub2_txt = sub2_txt[1:]

sub2_txt

'1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,0,0,1,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1

### Submission 5 - Undersampling

In [8]:
y_train.sum()

female_label    2942
dtype: int64

In [9]:
y_train.count()

female_label    4494
dtype: int64

In [10]:
y_train.sum()/y_train.count()

female_label    0.654651
dtype: float64

In [23]:
(y_train.count()-y_train.sum()).values[0]

1552

In [13]:
y_train_0 = y_train[y_train['female_label']==0]
y_train_1 = y_train[y_train['female_label']==1]

In [17]:
y_train_1['rand'] = np.random.rand(y_train_1.shape[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_train_1['rand'] = np.random.rand(y_train_1.shape[0])


In [19]:
y_train_1.sort_values(by='rand',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [25]:
y_train_1 = y_train_1.head((y_train.count()-y_train.sum()).values[0])

In [27]:
y_train_under = pd.concat([y_train_1[['female_label']],y_train_0])
y_train_under.sort_index(inplace=True)

In [31]:
len(list(y_train_under.index))

3104

In [32]:
X_train_under = X_train[X_train.index.isin(list(y_train_under.index))]

In [36]:
X_train_under.sort_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().sort_index(


In [37]:
lgbm = LGBMClassifier(max_depth=2,learning_rate=0.01,n_estimators=100,min_child_samples=100,colsample_bytree=0.5)
lgbm.fit(X_train_under,y_train_under)

pred_train = pd.DataFrame(lgbm.predict_proba(X_train)[:,1],columns=['pred'],index=X_train.index)
pred_val = pd.DataFrame(lgbm.predict_proba(X_val)[:,1],columns=['pred'],index=X_val.index)
pred_test = pd.DataFrame(lgbm.predict_proba(X_test)[:,1],columns=['pred'],index=X_test.index)

pred_train = pd.merge(pred_train,y_train,how='left',left_index=True, right_index=True)
pred_val = pd.merge(pred_val,y_val,how='left',left_index=True, right_index=True)

roc_train = roc_auc_score(pred_train['female_label'],pred_train['pred'])
roc_val = roc_auc_score(pred_val['female_label'],pred_val['pred'])

  return f(*args, **kwargs)


In [38]:
print('Train ROC: ',roc_train)
print('Val ROC: ',roc_val)

Train ROC:  0.8555797611204944
Val ROC:  0.8452995768164113


In [39]:
err_rate_df = pd.DataFrame()

for th in np.arange(0,1,0.1):

    pred_train['pred_binary'] = np.where(pred_train['pred']>=th,1,0)
    pred_val['pred_binary'] = np.where(pred_val['pred']>=th,1,0)

    err_rate1_train = 1-(pred_train[pred_train['female_label']==1]['pred_binary'].sum()/pred_train[pred_train['female_label']==1]['pred_binary'].count())
    err_rate0_train = pred_train[pred_train['female_label']==0]['pred_binary'].sum()/pred_train[pred_train['female_label']==0]['pred_binary'].count()
    err_rate_train = err_rate1_train+err_rate0_train

    err_rate1_val = 1-(pred_val[pred_val['female_label']==1]['pred_binary'].sum()/pred_val[pred_val['female_label']==1]['pred_binary'].count())
    err_rate0_val = pred_val[pred_val['female_label']==0]['pred_binary'].sum()/pred_val[pred_val['female_label']==0]['pred_binary'].count()
    err_rate_val = err_rate1_val+err_rate0_val

    err_rate_df_tmp = pd.DataFrame({'threshold':[th],
                                    'err_rate1_train':[err_rate1_train],
                                    'err_rate0_train':[err_rate0_train],
                                    'err_rate_train':[err_rate_train],
                                    'err_rate1_val':[err_rate1_val],
                                    'err_rate0_val':[err_rate0_val],
                                    'err_rate_val':[err_rate_val]
                                   })
    err_rate_df = pd.concat([err_rate_df,err_rate_df_tmp])



In [40]:
err_rate_df

Unnamed: 0,threshold,err_rate1_train,err_rate0_train,err_rate_train,err_rate1_val,err_rate0_val,err_rate_val
0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
0,0.1,0.0,1.0,1.0,0.0,1.0,1.0
0,0.2,0.0,1.0,1.0,0.0,1.0,1.0
0,0.3,0.017335,0.858892,0.876227,0.014925,0.883721,0.898646
0,0.4,0.116927,0.449742,0.56667,0.127544,0.478036,0.60558
0,0.5,0.325629,0.11018,0.435809,0.331072,0.103359,0.434431
0,0.6,0.471788,0.046392,0.51818,0.507463,0.046512,0.553974
0,0.7,0.633923,0.012242,0.646165,0.662144,0.020672,0.682816
0,0.8,1.0,0.0,1.0,1.0,0.0,1.0
0,0.9,1.0,0.0,1.0,1.0,0.0,1.0


In [41]:
pred_test

Unnamed: 0_level_0,pred
unique_id,Unnamed: 1_level_1
108,0.579206
2724,0.688762
717,0.628206
578,0.736456
147,0.550859
...,...
6191,0.479936
6121,0.478385
3245,0.458107
6640,0.473278


In [42]:
pred_test.reset_index(inplace=True)

In [43]:
pred_test[pred_test['unique_id']==108]

Unnamed: 0,unique_id,pred
0,108,0.579206


In [44]:
sub5 = pd.merge(raw_test_df,pred_test.reset_index(),how='left',on='unique_id')
sub5

Unnamed: 0,unique_id,index,pred
0,108,0,0.579206
1,2724,1,0.688762
2,717,2,0.628206
3,578,3,0.736456
4,147,4,0.550859
...,...,...,...
2375,6191,2375,0.479936
2376,6121,2376,0.478385
2377,3245,2377,0.458107
2378,6640,2378,0.473278


In [45]:
sub5['pred'] = sub5['pred'].apply(lambda x: round(x,3))

In [48]:
sub5.sort_values(by='unique_id')

Unnamed: 0,unique_id,index,pred
612,9,612,0.621
203,18,203,0.610
186,21,186,0.549
92,25,92,0.606
39,31,39,0.619
...,...,...,...
1266,7982,1266,0.485
1222,7990,1222,0.335
1610,7993,1610,0.480
1559,7994,1559,0.480


* Right order:

In [47]:
sub5_txt = ''
for prob in list(sub5.sort_values(by='unique_id')['pred'].values):
    sub5_txt = sub5_txt+','+str(prob)
sub5_txt = sub5_txt[1:]

sub5_txt

'0.621,0.61,0.549,0.606,0.619,0.768,0.543,0.316,0.514,0.549,0.452,0.43,0.72,0.768,0.543,0.63,0.488,0.342,0.545,0.75,0.744,0.667,0.646,0.621,0.658,0.644,0.712,0.694,0.579,0.597,0.68,0.614,0.553,0.683,0.752,0.626,0.501,0.565,0.761,0.609,0.551,0.724,0.75,0.703,0.737,0.7,0.75,0.52,0.625,0.748,0.289,0.768,0.688,0.309,0.3,0.301,0.762,0.699,0.768,0.626,0.689,0.684,0.451,0.644,0.768,0.662,0.751,0.629,0.626,0.768,0.651,0.56,0.672,0.56,0.61,0.746,0.608,0.743,0.584,0.624,0.321,0.561,0.768,0.31,0.616,0.63,0.738,0.702,0.592,0.61,0.768,0.573,0.615,0.71,0.535,0.764,0.757,0.729,0.442,0.644,0.647,0.64,0.717,0.766,0.555,0.357,0.477,0.347,0.564,0.449,0.559,0.768,0.656,0.76,0.511,0.754,0.619,0.756,0.711,0.763,0.764,0.768,0.335,0.536,0.644,0.757,0.758,0.621,0.305,0.293,0.723,0.554,0.289,0.682,0.578,0.638,0.569,0.766,0.506,0.636,0.388,0.606,0.563,0.543,0.611,0.764,0.545,0.572,0.426,0.76,0.763,0.7,0.766,0.76,0.521,0.763,0.594,0.566,0.636,0.543,0.71,0.585,0.762,0.635,0.656,0.712,0.586,0.73,0.558,0.304,0.768,0