In [14]:
import os
import numpy as np 
import pandas as pd 
from subprocess import check_output

In [15]:
# Directory holding one submission CSV per model. A raw string keeps the
# Windows backslashes literal instead of relying on "\d", "\s" and "\k" being
# unrecognised escape sequences (which newer Python versions warn about).
sub_path = r"D:\dataset\submit\kaggle_iceberg"

# os.listdir() returns entries in arbitrary order, so sort them to make the
# is_iceberg_<n> numbering reproducible, and skip sub-directories (e.g.
# .ipynb_checkpoints) that pd.read_csv cannot open.
all_files = sorted(
    f for f in os.listdir(sub_path)
    if os.path.isfile(os.path.join(sub_path, f))
)
if not all_files:
    raise ValueError("No submission files found in " + sub_path)

# Read and concatenate submissions: the first CSV column becomes the index,
# and the frames are placed side by side (axis=1).
outs = [pd.read_csv(os.path.join(sub_path, f), index_col=0) for f in all_files]
concat_sub = pd.concat(outs, axis=1)

# Rename the columns positionally to is_iceberg_0, is_iceberg_1, ...
cols = ["is_iceberg_" + str(i) for i in range(len(concat_sub.columns))]
concat_sub.columns = cols

# Move the index back into an ordinary column (rebinding instead of
# inplace=True so the cell stays a plain sequence of assignments).
concat_sub = concat_sub.reset_index()
concat_sub.head()



Unnamed: 0,id,is_iceberg_0,is_iceberg_1,is_iceberg_2,is_iceberg_3
0,5941774d,0.128001,0.007656341,0.004421687,0.02407886
1,4023181e,0.378543,0.327098,0.2339116,0.5688198
2,b20200e4,5.7e-05,2.564777e-15,2.151925e-12,9.120745e-09
3,e7f018bb,0.997554,0.9991825,0.9993609,0.9999933
4,4371c8c3,0.162165,0.1127017,0.1747521,0.2794653


In [16]:
# check correlation
# Correlate only the per-model prediction columns (is_iceberg_0, is_iceberg_1, ...).
# Calling .corr() on the whole frame would include the non-numeric 'id' column,
# which raises on pandas >= 2.0, and would also mix in the derived
# max/min/mean/median columns if this cell is re-run after they are added.
concat_sub.filter(regex=r'^is_iceberg_\d+$').corr()

Unnamed: 0,is_iceberg_0,is_iceberg_1,is_iceberg_2,is_iceberg_3
is_iceberg_0,1.0,0.836502,0.852531,0.955068
is_iceberg_1,0.836502,1.0,0.989168,0.847326
is_iceberg_2,0.852531,0.989168,1.0,0.865881
is_iceberg_3,0.955068,0.847326,0.865881,1.0


In [17]:
# get the data fields ready for stacking
# Select the per-model prediction columns by name rather than by position.
# The previous positional slice iloc[:, 1:6] assumed five model columns; with
# fewer models the freshly added is_iceberg_max column fell inside the slice,
# so the mean and median were computed over the predictions *plus* their max.
# filter() returns a separate frame, so the columns added below never leak
# into the statistics, and re-running the cell gives the same result.
base_preds = concat_sub.filter(regex=r'^is_iceberg_\d+$')

# Row-wise summary statistics across the model predictions.
concat_sub['is_iceberg_max'] = base_preds.max(axis=1)
concat_sub['is_iceberg_min'] = base_preds.min(axis=1)
concat_sub['is_iceberg_mean'] = base_preds.mean(axis=1)
concat_sub['is_iceberg_median'] = base_preds.median(axis=1)

In [18]:
# Cutoff thresholds for the PushOut / MinMax stacking cells; easy to tweak.
# NOTE: the names read inverted relative to their values. The stacking cells
# test "every model > cutoff_lo" for the confident-high case and
# "every model < cutoff_hi" for the confident-low case.
cutoff_lo, cutoff_hi = 0.8, 0.2


# stack mean

In [19]:
# Mean stacking: the submitted value is the row-wise mean column.
concat_sub['is_iceberg'] = concat_sub['is_iceberg_mean']

# Write only the id and is_iceberg columns, without the index, six decimals.
concat_sub.to_csv('stack_mean.csv',
                  columns=['id', 'is_iceberg'],
                  index=False,
                  float_format='%.6f')

# stack median

In [20]:
# Median stacking: the submitted value is the row-wise median column.
concat_sub['is_iceberg'] = concat_sub['is_iceberg_median']

# Write only the id and is_iceberg columns, without the index, six decimals.
concat_sub.to_csv('stack_median.csv',
                  columns=['id', 'is_iceberg'],
                  index=False,
                  float_format='%.6f')

# PushOut + Median Stacking

In [21]:
# PushOut + Median: when every model is above cutoff_lo the value is pushed to
# 1, when every model is below cutoff_hi it is pushed to 0, otherwise the
# row-wise median is used.
# The per-model columns are selected by name; the previous positional slice
# iloc[:, 1:6] assumed five model columns, so it could pick up derived columns
# (or miss models) whenever the number of submission files differs.
base_preds = concat_sub.filter(regex=r'^is_iceberg_\d+$')
all_above = (base_preds > cutoff_lo).all(axis=1)
all_below = (base_preds < cutoff_hi).all(axis=1)

concat_sub['is_iceberg'] = np.where(all_above, 1,
                                    np.where(all_below, 0,
                                             concat_sub['is_iceberg_median']))
concat_sub[['id', 'is_iceberg']].to_csv('stack_pushout_median.csv',
                                        index=False, float_format='%.6f')

# MinMax + Mean Stacking

In [22]:
# MinMax + Mean: when every model is above cutoff_lo take the row-wise max,
# when every model is below cutoff_hi take the row-wise min, otherwise fall
# back to the row-wise mean.
# The per-model columns are selected by name; the previous positional slice
# iloc[:, 1:6] assumed five model columns, so it could pick up derived columns
# (or miss models) whenever the number of submission files differs.
base_preds = concat_sub.filter(regex=r'^is_iceberg_\d+$')
all_above = (base_preds > cutoff_lo).all(axis=1)
all_below = (base_preds < cutoff_hi).all(axis=1)

concat_sub['is_iceberg'] = np.where(all_above,
                                    concat_sub['is_iceberg_max'],
                                    np.where(all_below,
                                             concat_sub['is_iceberg_min'],
                                             concat_sub['is_iceberg_mean']))
concat_sub[['id', 'is_iceberg']].to_csv('stack_minmax_mean.csv',
                                        index=False, float_format='%.6f')



# MinMax + Median Stacking

In [23]:
# MinMax + Median: when every model is above cutoff_lo take the row-wise max,
# when every model is below cutoff_hi take the row-wise min, otherwise fall
# back to the row-wise median.
# The per-model columns are selected by name; the previous positional slice
# iloc[:, 1:6] assumed five model columns, so it could pick up derived columns
# (or miss models) whenever the number of submission files differs.
base_preds = concat_sub.filter(regex=r'^is_iceberg_\d+$')
all_above = (base_preds > cutoff_lo).all(axis=1)
all_below = (base_preds < cutoff_hi).all(axis=1)

concat_sub['is_iceberg'] = np.where(all_above,
                                    concat_sub['is_iceberg_max'],
                                    np.where(all_below,
                                             concat_sub['is_iceberg_min'],
                                             concat_sub['is_iceberg_median']))
concat_sub[['id', 'is_iceberg']].to_csv('stack_minmax_median.csv',
                                        index=False, float_format='%.6f')

