In [59]:
# Import modules

import pandas as pd
import numpy as np
import os
import sys
import random
import copy

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import colorlover as cl

src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)

from sklearn.model_selection import ShuffleSplit
from scipy.stats import spearmanr

import plotting_methods as pm

init_notebook_mode(connected=True)

%reload_ext autoreload
%autoreload 2

pd.options.display.float_format = '{:,.4f}'.format

In [2]:
# Constants

split_seed = 4
train_prop = 0.5
valid_prop = 0.25
test_propr = 0.25

In [3]:
# Load data

raw_data_dir = os.path.join(os.getcwd(), os.pardir, 'data')

f = open(os.path.join(raw_data_dir, 'train.csv'), 'r')
train_data = pd.read_csv(f)
f.close()

id_col = 'ID'
tar_col = 'target'

In [4]:
# Get data stats:

num_samps = train_data.shape[0]
feat_names = [x for x in train_data.columns if x not in [id_col, tar_col]]
num_feats = len(feat_names)


In [5]:
print 'Num samples: ' + str(num_samps)
print 'Num features: ' + str(num_feats)

Num samples: 4459
Num features: 4991


Feature data types:

In [41]:
train_data.dtypes

ID            object
target       float64
48df886f9    float64
0deb4b6a8      int64
34b15f335    float64
a8cb14b00      int64
2f0771a37      int64
30347e683      int64
d08d1fbe3      int64
6ee66e115      int64
20aa07010    float64
dc5a8f1d8    float64
11d86fa6a    float64
77c9823f2      int64
8d6c2a0b2      int64
4681de4fd      int64
adf119b9a      int64
cff75dd09    float64
96f83a237      int64
b8a716ebf    float64
6c7a4567c      int64
4fcfd2b4d      int64
f3b9c0b95    float64
71cebf11c      int64
d966ac62c      int64
68b647452    float64
c88d108c9      int64
ff7b471cd      int64
d5308d8bc      int64
0d866c3d7    float64
              ...   
cdfc2b069    float64
2a879b4f7    float64
6b119d8ce    float64
98dea9e42      int64
9f2471031      int64
88458cb21      int64
f40da20f4      int64
7ad6b38bd    float64
c901e7df1      int64
8f55955dc      int64
85dcc913d    float64
5ca0b9b0c      int64
eab8abf7a      int64
8d8bffbae    float64
2a1f6c7f9      int64
9437d8b64      int64
5831f4c76    

Many of the features are integer types thus there are ordinal or categorical features. 

To explore further, we calculate the %unique values for each feature

In [22]:
perc_unique = train_data.apply(lambda x: float(100.0 * len(np.unique(x))) / float(len(x)))
iplot([go.Bar(
    x = train_data.columns,
    y = list(perc_unique))])

In [23]:
iplot([go.Histogram(x = list(perc_unique))])

This shows many of these features could be considered categorical or ordinal, since there are few unique values compared to the total number of values.  The target variable itself has the following percent unique values:

In [45]:
perc_unique.loc[tar_col]

31.688719443821483

Thus the target variable isn't strictly continuous but varies more than the features.

Looking further into this, we plot the % of samples in the biggest category to determine how skewed the distributions are.

In [36]:
perc_big_cat = train_data.apply(lambda x: float(max(x.value_counts())) * 100.0 / float(len(x)))

In [38]:
iplot([go.Histogram(x = list(perc_big_cat))])

For most features, the vast majority of values lies in one category.  So most features have highly skewed distributions. Listing the mode for each feature shows that the majority of values for the features is 0.

In [55]:
train_data.loc[:, feat_names].mode().T



Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike



Unnamed: 0,0
48df886f9,0.0000
0deb4b6a8,0.0000
34b15f335,0.0000
a8cb14b00,0.0000
2f0771a37,0.0000
30347e683,0.0000
d08d1fbe3,0.0000
6ee66e115,0.0000
20aa07010,0.0000
dc5a8f1d8,0.0000


We also plot the standard deviation of the features to look for any zero-variance.

Feature Distributions

Given that many of the features are highly skewed, plotting their distributions won't given much.  We should, however plot the target distribution.

In [48]:
iplot([go.Histogram(x = train_data[tar_col])])

It is skewed, albeit not as skewed as the features.  Next we plot it in a log(n+1) scale

In [49]:
iplot([go.Histogram(x = np.log(train_data[tar_col]))])

There appears to be clusters at certain values (shown by the peaks) thus it is not continuous.

Data Cleaning

Here we remove features with no variation and find any missing values

In [6]:
train_data = train_data.drop(columns = train_data.columns[np.where(train_data.std() == 0.0)[0]])

In [7]:
new_feat_names = [x for x in train_data.columns if x not in [id_col, tar_col]]

In [8]:
print 'num dropped features: ' + str(len(feat_names) - len(new_feat_names))

num dropped features: 256


Next we look for missing values in samples

In [11]:
any(train_data.isnull().sum() > 0)

False

There are no missing values.

Data Splitting

I split the data into training, validation and test set, using proportions 50-25-25.

In [57]:
inds = list(train_data.index)

ss = ShuffleSplit(n_splits=1, train_size=train_prop, random_state=split_seed)
split_inds = [(train_index, test_index) for train_index, test_index in ss.split(train_data)]
num_valid_test = len(split_inds[0][1]) / 2

train_inds = split_inds[0][0]
valid_inds = split_inds[0][1][0:num_valid_test]
test_inds = split_inds[0][1][num_valid_test:len(split_inds[0][1])]

In [58]:
train_feats = train_data.loc[train_inds, :]

Data Visualization

Here we visualize the features of the training data to get a sense of the data and how it could predict the target variable.

First we visualize the distribution of correlations between the features and the target variable.  Given the data is not continuous and highly skewed, we use spearman correlation.

In [62]:
corrs = train_data.apply(lambda x: spearmanr(x, train_data[tar_col])[0])
iplot([go.Histogram(x = list(corrs))])


This shows that there aren't any feature highly correlated with the target.

In [47]:
shuff_feat_names = copy.deepcopy(feat_names)
plt_data = log_feats.copy()
plt_data['grp'] = [1] * plt_data.shape[0]

random.seed(4)
random.shuffle(shuff_feat_names)
fig = pm.feature_distribution_plot(plt_data, shuff_feat_names[0:50], 
                                   'grp', {1: 'black'}, 5,
                              plot_ranges = None, dens_est = False)
fig['layout'].update(height = 1000)
iplot(fig)
