In [13]:
import os
import pickle
import shutil
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import preprocessing, model_selection, metrics, decomposition

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Raise the pandas display limit so up to 500 columns are shown for wide frames.
pd.options.display.max_columns = 500

## Initial Examination of Training Data

In [2]:
# Notebook-wide configuration: extraction directory, competition archive, and
# the random seed reused by later cells.
data_dir = 'input_data'
file_name = "santander-value-prediction-challenge.zip"
seed = 0

# Extract only the archive members that are not already present under
# data_dir, so re-running this cell does not unpack everything again.
with ZipFile(file_name, 'r') as zip_file:
    for member in zip_file.namelist():
        if not os.path.exists(os.path.join(data_dir, member)):
            zip_file.extract(member, data_dir)

In [3]:
# Load train.csv from the extraction directory, using the 'ID' column as the row index.
train_df = pd.read_csv(os.path.join(data_dir, 'train.csv'), index_col='ID')

In [None]:
# Display the training frame (how much is rendered depends on the pandas display options).
train_df 

In [None]:
# (number of rows, number of columns) of the training frame.
train_df.shape

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Looking for missing values: list the non-null count of every row that has
# fewer non-null entries than there are columns. An empty list means no row
# contains a missing value.
# DataFrame.count(axis=1) is the vectorized form of a per-row apply(count).
non_null_per_row = train_df.count(axis=1)
non_null_per_row[non_null_per_row < len(train_df.columns)].tolist()

In [None]:
# Call info() so the frame summary is printed; the bare attribute
# `train_df.info` only displays the bound method's repr.
train_df.info()

In [None]:
# Looking at column names to see whether they carry any usable meaning
train_df.columns

Most of the data appears to consist of zeros, with some apparently continuous values. The feature names do not carry any relevant information. Next, checking whether any columns look like binary (0/1) categorical data.

In [9]:
# Collect every column whose non-missing values are all drawn from {0, 1}.
# The resulting list is bound to both `binary_columns` and `bool_cols`.
binary_columns = bool_cols = [
    name
    for name in train_df.columns
    if np.all(np.isin(train_df[name].dropna().unique(), [0, 1]))
]

In [14]:
# Summary statistics for the candidate binary columns in the training data.
train_df[binary_columns].describe()

Unnamed: 0,d5308d8bc,c330f1a67,eeac16933,7df8788e8,5b91580ee,6f29fbbc7,46dafc868,ae41a98b6,f416800e9,6d07828ca,7ac332a1d,70ee7950a,833b35a7c,2f9969eab,8b1372217,68322788b,2288ac1a6,dc7f76962,467044c26,39ebfbfd9,9a5ff8c23,f6fac27c8,664e2800e,ae28689a2,d87dcac58,4065efbb6,f944d9d43,c2c4491d5,a4346e2e2,1af366d4f,cfff5b7c8,da215e99e,5acd26139,9be9c6cef,1210d0271,21b0a54cb,da35e792b,754c502dd,0b346adbd,0f196b049,b603ed95d,2a50e001c,1e81432e7,10350ea43,3c7c7e24c,7585fce2a,64d036163,f25d9935c,d98484125,95c85e227,9a5273600,746cdb817,6377a6293,7d944fb0c,87eb21c50,5ea313a8c,0987a65a1,2fb7c2443,f5dde409b,1ae50d4c3,2b21cd7d8,0db8a9272,804d8b55b,76f135fa6,7d7182143,f88e61ae6,378ed28e0,ca4ba131e,1352ddae5,2b601ad67,6e42ff7c7,22196a84c,0e410eb3d,992e6d1d3,90a742107,08b9ec4ae,d95203ded,58ad51def,9f69ae59f,863de8a31,be10df47c,f006d9618,a7e39d23d,5ed0abe85,6c578fe94,7fa4fcee9,5e0571f07,fd5659511,e06b9f40f,c506599c8,99de8c2dc,b05f4b229,5e0834175,eb1cc0d9c,b281a62b9,00fcf67e4,e37b65992,2308e2b29,c342e8709,708471ebf,f614aac15,15ecf7b68,3bfe540f1,7a0d98f3c,e642315a5,c16d456a7,0c9b5bcfa,b778ab129,2ace87cdd,697a566f0,97b1f84fc,34eff114b,5281333d7,c89f3ba7e,cd6d3c7e6,fc7c8f2e8,abbbf9f82,24a233e8f,8e26b560e,a28ac1049,504502ce1,d9a8615f3,4efd6d283,34cc56e83,93e98252a,2b6cef19e,c7f70a49b,0d29ab7eb,e4a0d39b7,a4d1a8409,bc694fc8f,3a36fc3a2,4ffba44d3,9bfdec4bc,66a866d2f,f941e9df7,e7af4dbf3,dc9a54a3e,748168a04,bba8ce4bb,ff6f62aa4,b06fe66ba,ae87ebc42,f26589e57,963bb53b1,a531a4bf0,9fc79985d,9350d55c1,de06e884c,fc10bdf18,e0907e883,c586d79a1,e15e1513d,a06067897,643e42fcb,217cd3838,047ebc242,9b6ce40cf,3b2c972b3,17a7bf25a,c9028d46b,9e0473c91,6b041d374,783c50218,19122191d,ce573744f,1c4ea481e,fbd6e0a0b,69831c049,b87e3036b,54ba515ee,a09ba0b15,90f77ec55,fb02ef0ea,3b0cccd29,fe9ed417c,589e8bd6f,17b5a03fd,80e16b49a,a3d5c2c2a,1bd3a4e92,611d81daa,3d7780b1c,113fd0206,5e5894826,cb36204f9,bc4e3d600,c66e2deb0,c25851298,a7f6de992,3f93a3272,c1b95c2ec,6bda21fee,4a64e56e7,943743753,20854f8bf,ac2e428a9,5ee7de0be,316423a21
,2e52b0c6a,8bdf6bc7e,8f523faf2,4758340d5,8411096ec,9678b95b7,a185e35cc,fa980a778,c8d90f7d7,080540c81,32591c8b4,5779da33c,bb425b41e,01599af81,1654ab770,d334a588e,b4353599c,51b53eaec,2cc0fbc52,45ffef194,c15ac04ee,5b055c8ea,d0466eb58,a80633823,a117a5409,7ddac276f,8c32df8b3,e5649663e,6c16efbb8,9118fd5ca,ca8d565f1,16a5bb8d2,fd6347461,f5179fb9c,97428b646,f684b0a96,e4b2caa9f,2c2d9f267,96eb14eaf,cb2cb460c,86f843927,ecd16fc60,801c6dc8e,f859a25b8,ae846f332,2252c7403,fb9e07326,d196ca1fd,a8e562e8e,eb6bb7ce1,5beff147e,52b347cdc,4600aadcf,6fa0b9dab,43d70cc4d,408021ef8,e29d22b59
count,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0,4459.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The result consists entirely of all-zero columns: in the training data these features are constant rather than genuinely binary. Checking whether the same holds in the test data.

In [20]:
# Load from test.csv only the columns flagged above (selected via usecols).
test_df_binary = pd.read_csv(os.path.join(data_dir, 'test.csv'), usecols=binary_columns)

In [18]:
# Summary statistics for the same columns in the test data, for comparison with train.
test_df_binary.describe()

Unnamed: 0,d5308d8bc,c330f1a67,eeac16933,7df8788e8,5b91580ee,6f29fbbc7,46dafc868,ae41a98b6,f416800e9,6d07828ca,7ac332a1d,70ee7950a,833b35a7c,2f9969eab,8b1372217,68322788b,2288ac1a6,dc7f76962,467044c26,39ebfbfd9,9a5ff8c23,f6fac27c8,664e2800e,ae28689a2,d87dcac58,4065efbb6,f944d9d43,c2c4491d5,a4346e2e2,1af366d4f,cfff5b7c8,da215e99e,5acd26139,9be9c6cef,1210d0271,21b0a54cb,da35e792b,754c502dd,0b346adbd,0f196b049,b603ed95d,2a50e001c,1e81432e7,10350ea43,3c7c7e24c,7585fce2a,64d036163,f25d9935c,d98484125,95c85e227,9a5273600,746cdb817,6377a6293,7d944fb0c,87eb21c50,5ea313a8c,0987a65a1,2fb7c2443,f5dde409b,1ae50d4c3,2b21cd7d8,0db8a9272,804d8b55b,76f135fa6,7d7182143,f88e61ae6,378ed28e0,ca4ba131e,1352ddae5,2b601ad67,6e42ff7c7,22196a84c,0e410eb3d,992e6d1d3,90a742107,08b9ec4ae,d95203ded,58ad51def,9f69ae59f,863de8a31,be10df47c,f006d9618,a7e39d23d,5ed0abe85,6c578fe94,7fa4fcee9,5e0571f07,fd5659511,e06b9f40f,c506599c8,99de8c2dc,b05f4b229,5e0834175,eb1cc0d9c,b281a62b9,00fcf67e4,e37b65992,2308e2b29,c342e8709,708471ebf,f614aac15,15ecf7b68,3bfe540f1,7a0d98f3c,e642315a5,c16d456a7,0c9b5bcfa,b778ab129,2ace87cdd,697a566f0,97b1f84fc,34eff114b,5281333d7,c89f3ba7e,cd6d3c7e6,fc7c8f2e8,abbbf9f82,24a233e8f,8e26b560e,a28ac1049,504502ce1,d9a8615f3,4efd6d283,34cc56e83,93e98252a,2b6cef19e,c7f70a49b,0d29ab7eb,e4a0d39b7,a4d1a8409,bc694fc8f,3a36fc3a2,4ffba44d3,9bfdec4bc,66a866d2f,f941e9df7,e7af4dbf3,dc9a54a3e,748168a04,bba8ce4bb,ff6f62aa4,b06fe66ba,ae87ebc42,f26589e57,963bb53b1,a531a4bf0,9fc79985d,9350d55c1,de06e884c,fc10bdf18,e0907e883,c586d79a1,e15e1513d,a06067897,643e42fcb,217cd3838,047ebc242,9b6ce40cf,3b2c972b3,17a7bf25a,c9028d46b,9e0473c91,6b041d374,783c50218,19122191d,ce573744f,1c4ea481e,fbd6e0a0b,69831c049,b87e3036b,54ba515ee,a09ba0b15,90f77ec55,fb02ef0ea,3b0cccd29,fe9ed417c,589e8bd6f,17b5a03fd,80e16b49a,a3d5c2c2a,1bd3a4e92,611d81daa,3d7780b1c,113fd0206,5e5894826,cb36204f9,bc4e3d600,c66e2deb0,c25851298,a7f6de992,3f93a3272,c1b95c2ec,6bda21fee,4a64e56e7,943743753,20854f8bf,ac2e428a9,5ee7de0be,316423a21
,2e52b0c6a,8bdf6bc7e,8f523faf2,4758340d5,8411096ec,9678b95b7,a185e35cc,fa980a778,c8d90f7d7,080540c81,32591c8b4,5779da33c,bb425b41e,01599af81,1654ab770,d334a588e,b4353599c,51b53eaec,2cc0fbc52,45ffef194,c15ac04ee,5b055c8ea,d0466eb58,a80633823,a117a5409,7ddac276f,8c32df8b3,e5649663e,6c16efbb8,9118fd5ca,ca8d565f1,16a5bb8d2,fd6347461,f5179fb9c,97428b646,f684b0a96,e4b2caa9f,2c2d9f267,96eb14eaf,cb2cb460c,86f843927,ecd16fc60,801c6dc8e,f859a25b8,ae846f332,2252c7403,fb9e07326,d196ca1fd,a8e562e8e,eb6bb7ce1,5beff147e,52b347cdc,4600aadcf,6fa0b9dab,43d70cc4d,408021ef8,e29d22b59
count,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49
342.0,49342.0,49342.0,49342.0,49342.0,49342.0,49342.0
mean,62748.62,66266.68,131080.7,83556.7,57906.13,76203.71,69154.79,86495.39,60365.43,86084.84,82954.49,40983.0,80515.78,74080.65,74079.42,77482.45,68776.2,67673.91,53960.14,171637.5,89432.37,63937.08,78615.77,61523.21,57316.09,69615.33,59507.21,59158.91,83260.24,60455.07,85474.02,57740.45,44628.45,56692.89,55743.63,45357.48,72182.16,78633.5,63075.18,50701.75,60302.76,54107.6,62703.93,52524.47,170075.3,77889.45,58602.0,61175.47,76674.12,62269.18,57047.23,62496.83,75240.03,65491.2,50964.51,63300.84,52482.32,76888.55,98167.4,75855.22,82162.09,74231.41,56488.05,75606.32,89677.68,99150.58,65710.3,70305.46,71497.97,73180.12,65196.25,76417.16,65471.18,85900.68,75220.27,83398.39,75903.04,112736.2,53295.92,53623.97,67259.66,57950.53,72092.87,88642.24,82245.03,55210.41,74335.9,73911.14,69423.57,67837.86,50201.5,105702.0,77289.13,85428.01,49095.29,54683.88,59553.64,93756.2,60621.74,54720.97,53319.07,66095.22,82166.0,77443.49,74027.21,69411.24,94407.65,61818.63,81834.92,74481.01,53188.73,67602.63,89788.15,59595.96,77976.6,71730.22,70183.44,69849.95,87850.76,69777.05,169807.5,72768.21,77674.79,62542.97,90735.1,78009.93,68958.9,87650.69,67563.63,57307.93,54767.52,80488.67,68625.75,74056.22,94071.77,66261.9,70059.31,71811.88,66668.54,72283.91,81963.1,78625.24,75325.74,66107.38,77917.04,70908.64,86230.64,83166.03,59701.45,47344.98,53434.79,82453.94,84619.43,60591.9,83765.04,62714.97,55961.23,57230.84,58356.98,62661.61,52611.62,55140.08,56999.88,74990.63,78499.37,67497.33,69686.14,73762.47,65186.55,65207.34,43605.78,53927.65,96036.25,72804.42,56736.57,77579.36,83131.63,68254.97,75469.78,75816.74,73887.0,70790.59,50877.67,50530.42,87912.56,75435.91,85630.62,94609.31,72981.03,61598.46,82355.78,64113.46,112270.3,54908.35,66265.75,59311.0,63342.05,48867.07,82797.97,53283.66,80653.53,88570.62,74155.88,67967.5,56562.01,70380.74,93066.49,77321.5,95532.45,83247.18,49427.08,79198.47,86903.89,80099.11,55327.91,89086.74,52970.22,76392.28,92791.95,94880.88,68501.83,58909.66,59049.59,77031.69,64
897.43,72578.0,71827.99,68182.84,63262.43,51689.09,62797.44,57640.59,58516.72,78244.26,42723.76,57837.14,64251.63,58445.85,101407.4,83437.52,58852.64,58447.86,89362.94,56788.01,55599.93,75406.95,68039.21,64900.16,70999.98,78542.21,76063.48,70320.37,78634.84,78648.36,86006.96,80328.72
std,1912701.0,3025246.0,10740900.0,2947519.0,1935330.0,2490503.0,2730657.0,4982128.0,2330954.0,3773596.0,3217847.0,1202397.0,2885604.0,3062078.0,3289250.0,2731729.0,2340723.0,2434567.0,1604841.0,21086430.0,3043253.0,2384621.0,3555467.0,2284894.0,2005298.0,2646775.0,1868921.0,2210904.0,5120700.0,1740838.0,5228891.0,1821578.0,1636304.0,2605920.0,1794277.0,1390285.0,3098674.0,3052368.0,1879016.0,2032699.0,2295181.0,1590347.0,2622603.0,1604589.0,20851850.0,3389435.0,2114657.0,1811593.0,4399081.0,2620870.0,2205711.0,2143539.0,3113712.0,2768544.0,2275249.0,2937262.0,1760720.0,2671105.0,3683667.0,3046756.0,3176588.0,3219725.0,1618616.0,2482278.0,4458176.0,5590245.0,2476508.0,3240691.0,2085826.0,3359579.0,2480035.0,2784565.0,2790332.0,3020709.0,2522352.0,3406351.0,2440266.0,6057416.0,1607852.0,1767765.0,4375075.0,1867639.0,2930953.0,4777714.0,2617326.0,1625735.0,2481458.0,2553893.0,2355715.0,2332568.0,2061339.0,5707971.0,2736518.0,4883650.0,1586519.0,1750485.0,2052577.0,4503162.0,1690889.0,1544814.0,1650927.0,3196506.0,3493248.0,2691079.0,2689424.0,2364056.0,4070705.0,1908797.0,2695647.0,2648963.0,3008100.0,2129316.0,3194543.0,2078411.0,2756920.0,2335934.0,2434456.0,2830833.0,3045000.0,2678435.0,19996630.0,2456843.0,2350355.0,2137529.0,3028681.0,3197244.0,2201615.0,4748968.0,2028222.0,1855920.0,2370675.0,3594722.0,2134327.0,2389493.0,3557580.0,2225593.0,2060415.0,2037818.0,2046769.0,2343863.0,3132632.0,2885217.0,2866803.0,2191942.0,2842369.0,2545707.0,2787505.0,3409191.0,1901480.0,1441656.0,2543539.0,3152627.0,2786027.0,2747586.0,3109313.0,2098559.0,1552747.0,1875659.0,1922567.0,2022297.0,1767976.0,1948662.0,1895542.0,2378257.0,3059950.0,2172532.0,2968030.0,3076346.0,2151616.0,1946089.0,1337219.0,1481050.0,4824007.0,2659021.0,2529528.0,2938701.0,5243046.0,2499498.0,3110229.0,2569879.0,2944402.0,2581898.0,1743312.0,1815619.0,3465922.0,2921398.0,4765534.0,3271645.0,2069189.0,4028036.0,2952254.0,2503458.0,10453930.0,1772431.0,2262135.0,1997936.0,2705385.0,1809358.0,3086153.0,1
601813.0,3777298.0,3896355.0,3831528.0,2323302.0,1656734.0,2548832.0,4966275.0,2792443.0,2956498.0,2712473.0,1412522.0,4364733.0,4756508.0,3075047.0,1749014.0,4143312.0,1837888.0,3237311.0,3877372.0,5105191.0,2547660.0,2082341.0,2112021.0,2949444.0,2110505.0,2256165.0,3024988.0,2410885.0,2039486.0,1539373.0,2044454.0,1705038.0,2063697.0,2333781.0,1274470.0,1924567.0,2514063.0,1800227.0,4035555.0,3506643.0,2269159.0,2149843.0,2868568.0,1971918.0,1772508.0,2975533.0,2233237.0,2464174.0,2154137.0,2672268.0,2460398.0,2904047.0,4919788.0,3267872.0,3403785.0,3054114.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,209060900.0,427187800.0,2297832000.0,344381200.0,231383300.0,280543100.0,312296500.0,863460400.0,213892700.0,487507800.0,391226200.0,93972590.0,311020400.0,474236100.0,498130300.0,316749600.0,377436900.0,252821000.0,120990400.0,4651199000.0,300600900.0,385001300.0,588005100.0,343481800.0,274060200.0,389786100.0,162196100.0,318017800.0,1047040000.0,139649300.0,1057578000.0,189289300.0,197917500.0,359363000.0,175027800.0,144840100.0,415388300.0,486398700.0,200368500.0,364563200.0,288488600.0,161874300.0,332116300.0,188796800.0,4592494000.0,412966200.0,278005300.0,155910300.0,847644500.0,444659200.0,300938500.0,186552400.0,406342500.0,345577500.0,422325900.0,419875100.0,139688100.0,231650600.0,472035300.0,407383100.0,360771600.0,414832200.0,118954300.0,276336200.0,839389600.0,872637800.0,242138200.0,510613700.0,164305200.0,529322600.0,315197000.0,403834800.0,372755400.0,401769000.0,262959600.0,415978400.0,238272000.0,1026199000.0,125891500.0,168508600.0,878006900.0,265636900.0,353396700.0,882258600.0,225569600.0,153229600.0,292967400.0,372008200.0,299928000.0,243944000.0,268054100.0,986839200.0,322198100.0,787347600.0,147873300.0,175027800.0,272451000.0,842268700.0,106418400.0,146403900.0,175504900.0,435666700.0,509675500.0,315197000.0,325895300.0,294206800.0,684184900.0,181676500.0,273275100.0,336596600.0,597937900.0,210372800.0,414321200.0,244002600.0,299879000.0,290210300.0,393143200.0,400000000.0,400998000.0,340733900.0,4375799000.0,276511800.0,256055500.0,232740400.0,252821000.0,302092400.0,236507700.0,936951500.0,191510000.0,186942600.0,371178400.0,525786900.0,220383200.0,324426300.0,435575000.0,179116900.0,163451800.0,143774400.0,216953800.0,197824300.0,422325900.0,332711700.0,463987600.0,218431600.0,343336100.0,292814200.0,321996600.0,602935600.0,159158300.0,127278600.0,472179900.0,415984100.0,221549900.0,328023100.0,338928000.0,258271200.0,111725300.0,252585000.0,218447600.0,165188900.0,192164800.0,218878500.0,216740600.0,228172900.0,415438400.0,175295000.
0,315197000.0,508963700.0,238574800.0,171875700.0,164576800.0,106840200.0,777359100.0,350014100.0,356923500.0,375386600.0,1057578000.0,353035300.0,387552000.0,322415400.0,377982600.0,361360400.0,216740600.0,190855400.0,397848400.0,315197000.0,787347600.0,375838200.0,179368200.0,841013400.0,377830800.0,303495900.0,2271166000.0,155598900.0,248596700.0,194249500.0,408873500.0,299365600.0,402523500.0,133783600.0,433179700.0,514886200.0,626789500.0,228879500.0,145285800.0,419603100.0,918494200.0,334854100.0,340373800.0,240243600.0,155022700.0,831141000.0,882258600.0,423305300.0,201130900.0,614055600.0,176057300.0,434798500.0,419109300.0,847259300.0,315530300.0,265887800.0,280383100.0,291016300.0,202858500.0,238237100.0,529076100.0,272965500.0,236906600.0,152135200.0,193265800.0,149104400.0,171182100.0,260145400.0,114826000.0,171797700.0,366321000.0,143479400.0,402523500.0,612945500.0,294158600.0,355821500.0,323976000.0,264773400.0,170209300.0,494196400.0,211783600.0,290473900.0,198166700.0,246844700.0,275766300.0,377846700.0,985721100.0,569951500.0,400000000.0,381695500.0


The same features in the test data do have nonzero values, including extreme outliers, which indicates that the two datasets may have different underlying distributions.

## Examining Target Distribution

Now examining the characteristics of the target. The training target spans a very wide range of values, with significant outliers. There is a strong apparent right (positive) skew.

In [None]:
# Summary statistics of the training target.
train_df.target.describe()

In [None]:
# Two side-by-side views of the training target: the sorted values plotted
# against their rank, and a 30-bin histogram.
sorted_target = np.sort(train_df['target'].values)
label_size, title_size = 12, 16

fig, (ax_sorted, ax_hist) = plt.subplots(ncols=2, figsize=(20, 6))

# Left panel: target values in ascending order.
ax_sorted.scatter(range(len(sorted_target)), sorted_target)
ax_sorted.set_title("Target Distribution", fontsize=title_size)
ax_sorted.set_xlabel('Index Number', fontsize=label_size)
ax_sorted.set_ylabel('Target', fontsize=label_size)

# Right panel: frequency of target values.
ax_hist.hist(x=sorted_target, bins=30)
ax_hist.set_title('Target Histogram', fontsize=title_size)
ax_hist.set_xlabel('Transaction Value', fontsize=label_size)
ax_hist.set_ylabel('Frequency', fontsize=label_size)

plt.show()

Now, exploring transforms of target data to see if either results in a more normal distribution. Taking a log of the values helps normalize the data some, but a Box-Cox transform appears to do much better.

In [None]:
# Compare two candidate transforms of the target side by side:
# natural log (left) and Box-Cox (right).
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; confirm
# the installed version before upgrading.
target_values = train_df.target.values
log_target = np.log(target_values)
boxcox_target = stats.boxcox(target_values)[0]  # [0] = transformed values

fig, (ax_log, ax_boxcox) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))

log_plot = sns.distplot(log_target, bins=30, ax=ax_log, kde=True)
log_plot.set_title('Distribution of Target Log Transform', fontsize=14)

boxcox_plot = sns.distplot(boxcox_target, bins=30, ax=ax_boxcox, kde=True)
boxcox_plot.set_title('Distribution of Target Box-Cox Transform', fontsize=14)


## Exploring Feature Data Distributions

Quickly examining the distributions of a random subsample of the feature data. Most appear to have a vast majority of zero values.

In [None]:
# Histogram of each column in a random subsample of feature columns.
n_sampled = 30
n_grid_cols = 2
# Size the grid to the sample so the figure has no unused rows.
n_grid_rows = int(np.ceil(n_sampled / n_grid_cols))

# iloc[:, 1:] skips the first column (presumably the target -- confirm against
# train.csv). random_state=seed makes the subsample the same on every run, so
# the written observations keep matching the plots.
sampled_features = train_df.iloc[:, 1:].sample(n_sampled, axis=1, random_state=seed)

# 4 inches of height per grid row, the same ratio as a 25-row, 100-inch figure.
fig = plt.figure(figsize=(20, 4 * n_grid_rows))

for i, col in enumerate(sampled_features):
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, i + 1)
    sns.distplot(train_df[col].values, bins=10, ax=ax, kde=False).set_title('{} Distribution'.format(col))

Now, looking at the general relationship between a random subsample of features and the target.

In [None]:
# Regression plot of the target against each column in a random subsample of
# feature columns.
n_sampled = 50
n_grid_cols = 2
n_grid_rows = int(np.ceil(n_sampled / n_grid_cols))

# iloc[:, 1:] skips the first column (presumably the target -- confirm against
# train.csv). random_state=seed keeps the subsample identical across runs.
sampled_features = train_df.iloc[:, 1:].sample(n_sampled, axis=1, random_state=seed)

fig = plt.figure(figsize=(20, 100))

for i, col in enumerate(sampled_features):
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, i + 1)
    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally, while keywords work on older releases too.
    sns.regplot(x=train_df[col].values, y=train_df['target'], ax=ax, seed=seed).set_title('Relationship Between {} and Target'.format(col))

Next, examining different potential transformations of the data.

In [None]:
# For a random subsample of features, compare four feature/target transform
# pairings, one grid row per feature:
#   col 0: log(feature)         vs Box-Cox(target)
#   col 1: raw feature          vs Box-Cox(target)
#   col 2: Yeo-Johnson(feature) vs raw target
#   col 3: Yeo-Johnson(feature) vs Box-Cox(target)
sample = 50

# The Box-Cox transform of the target does not depend on the feature, so it is
# computed once here rather than inside the loop. [0] = transformed values.
target_boxcox = stats.boxcox(train_df['target'])[0]

# iloc[:, 1:] skips the first column (presumably the target -- confirm against
# train.csv). random_state=seed keeps the subsample identical across runs.
sampled_features = train_df.iloc[:, 1:].sample(sample, axis=1, random_state=seed)

fig, axs = plt.subplots(ncols=4, nrows=sample, figsize=(20, 150))

for i, col in enumerate(sampled_features):
    feature = train_df[col]
    # Computed once per feature and shared by the last two panels.
    feature_yeojohnson = stats.yeojohnson(feature)[0]

    # x and y are passed by keyword: newer seaborn releases no longer accept
    # them positionally, while keywords work on older releases too.
    # NOTE(review): np.log maps zero entries to -inf; np.log1p may be worth
    # considering if those rows should be kept.
    sns.regplot(x=np.log(feature), y=target_boxcox, ax=axs[i, 0]).set_title('Relationship Between Log of {} and Target'.format(col), fontsize=8, pad=15)
    sns.regplot(x=feature, y=target_boxcox, ax=axs[i, 1], color='magenta').set_title('Relationship Between {} and Box-Cox of Target'.format(col), fontsize=8, pad=20)
    sns.regplot(x=feature_yeojohnson, y=train_df['target'], color='green', ax=axs[i, 2]).set_title('Relationship Between Yeo-Johnson of {} and Target'.format(col), fontsize=8, pad=20)
    sns.regplot(x=feature_yeojohnson, y=target_boxcox, color='orange', ax=axs[i, 3]).set_title('Relationship Between Yeo-Johnson of {} and Box-Cox of Target'.format(col), fontsize=8, pad=20)

fig.tight_layout(pad=2.0)


The Yeo-Johnson transforms don't seem to work well with this data. Log-transforming the features and Box-Cox-transforming the target seems to result in decent variation, but drops most of the values, because the logarithm of the many zero entries is not finite ($\log 0 = -\infty$).

# ONLY WHEN FINISHED

In [None]:
# Cleaning up saved data files: remove the extraction directory and
# everything under it.
# Guarded so re-running the cell after the directory is already gone does not
# raise. `shutil` is imported with the other imports at the top of the notebook.
if os.path.isdir(data_dir):
    shutil.rmtree(data_dir)