In [None]:
import os
import pickle
import shutil
from zipfile import ZipFile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn import preprocessing, model_selection, metrics, decomposition

%matplotlib inline

## Initial Examination of Training Data

In [None]:
# Notebook-wide configuration: where extracted files live, which archive to
# read them from, and the random seed shared by later cells.
data_dir = 'input_data'
file_name = "santander-value-prediction-challenge.zip"
seed = 0

# Extract only the archive members that are not already present on disk, so
# re-running this cell does not unpack the same files again.
with ZipFile(file_name, 'r') as zip_file:
    for member in zip_file.namelist():
        already_extracted = os.path.exists(f'{data_dir}/{member}')
        if not already_extracted:
            zip_file.extract(member, data_dir)

In [None]:
# Load the training set, using the 'ID' column as the row index.
train_csv_path = os.path.join(data_dir, 'train.csv')
train_df = pd.read_csv(train_csv_path, index_col='ID')

In [None]:
# Show the training frame; as the cell's last expression it is rendered by the notebook.
train_df 

In [None]:
# (number of rows, number of columns) of the training frame.
train_df.shape

In [None]:
# Preview the first rows of the training frame (head() defaults to 5 rows).
train_df.head()

To begin, check for any observations with missing (N/A) values in the training data. There do not appear to be any missing values.

In [None]:
# Count the non-null entries in each row. A row whose count is smaller than
# the number of columns holds at least one missing value; an empty list
# therefore means no row has missing data.
n_columns = len(train_df.columns)
non_null_per_row = train_df.apply(lambda row: row.count(), axis=1)
[n_present for n_present in non_null_per_row if n_present < n_columns]

In [None]:
# Call info() (the original referenced the method without calling it, which
# only displays the bound-method repr) to print the index range, column
# dtype summary and memory usage of the training frame.
train_df.info()

In [None]:
# Column labels of the training frame.
train_df.columns

Feature names do not contain any meaningful information, so the necessary reduction of features will have to be based on predictive relevance.

## Examining Target Distribution

Next, examining the characteristics of the target data. There is a very wide range of values in the training target, with significant outliers. There is a strong apparent right (positive) skew, with a long tail of large values.

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the target.
train_df['target'].describe()

In [None]:
# Sort the target once and reuse it for both panels.
sorted_target = np.sort(train_df['target'].values)

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(20, 6))

# Left panel: sorted target values against their rank.
ax1.scatter(range(len(sorted_target)), sorted_target)
ax1.set_title("Target Distribution", fontsize=16)
ax1.set_xlabel('Index Number', fontsize=12)
ax1.set_ylabel('Target', fontsize=12)

# Right panel: histogram of the same values.
ax2.hist(x=sorted_target, bins=30)
ax2.set_title('Target Histogram', fontsize=16)
ax2.set_xlabel('Transaction Value', fontsize=12)
ax2.set_ylabel('Frequency', fontsize=12)

plt.show()

Now, exploring transforms of target data to see if either results in a more normal distribution. Taking a log of the values helps normalize the data some, but a Box-Cox transform appears to do much better.

In [None]:
# Compare two candidate normalising transforms of the target side by side.
raw_target = train_df.target.values
log_target = np.log(raw_target)
# stats.boxcox returns (transformed values, fitted lambda); keep the values.
boxcox_target = stats.boxcox(raw_target)[0]

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))

sns.distplot(log_target, bins=30, ax=ax1, kde=True)
ax1.set_title('Distribution of Target Log Transform', fontsize=14)

sns.distplot(boxcox_target, bins=30, ax=ax2, kde=True)
ax2.set_title('Distribution of Target Box-Cox Transform', fontsize=14)


## Exploring Feature Data Distributions

Quickly examining the distributions of a random subsample of the feature data. Most appear to have a vast majority of zero values.

In [None]:
# Histogram of 30 randomly chosen feature columns (column 0 is the target,
# so it is excluded from the draw). The draw is seeded with the notebook's
# `seed` so that Restart & Run All reproduces the same figure.
fig = plt.figure(figsize=(20, 100))

sampled_features = train_df.iloc[:, 1:].sample(30, axis=1, random_state=seed)

for i, col in enumerate(sampled_features):
    # The grid has 25 x 2 slots; only the first 30 are filled.
    ax = fig.add_subplot(25, 2, i + 1)
    sns.distplot(train_df[col].values, bins=10, ax=ax, kde=False).set_title('{} Distribution'.format(col))

Now, looking at the general relationship between a random subsample of features and the target.

In [None]:
# Regression plot of the target against each of 50 randomly chosen feature
# columns (column 0 is the target, so it is excluded from the draw). The
# draw is seeded so the figure is reproducible.
fig = plt.figure(figsize=(20, 100))

sampled_features = train_df.iloc[:, 1:].sample(50, axis=1, random_state=seed)

for i, col in enumerate(sampled_features):
    ax = fig.add_subplot(25, 2, i + 1)
    # x and y are passed by keyword: positional data vectors are deprecated
    # in seaborn 0.11 and rejected with a TypeError from 0.12 onwards.
    sns.regplot(
        x=train_df[col].values,
        y=train_df['target'],
        ax=ax,
        seed=seed,
    ).set_title('Relationship Between {} and Target'.format(col))

Next, examining different potential transformations to data

In [None]:
# For `sample` randomly chosen features, draw one row of four regression
# plots comparing combinations of feature and target transforms.
sample = 50

# The Box-Cox transform of the target does not depend on the feature being
# plotted, so compute it once instead of three times per loop iteration.
target_raw = train_df['target']
target_boxcox = stats.boxcox(target_raw)[0]

fig, axs = plt.subplots(ncols=4, nrows=sample, figsize=(20, 150))

# Column 0 is the target, so it is excluded from the draw; the draw is
# seeded so the figure is reproducible.
sampled_features = train_df.iloc[:, 1:].sample(sample, axis=1, random_state=seed)

for i, col in enumerate(sampled_features):
    feature = train_df[col]
    # Yeo-Johnson of the feature is shared by the last two panels.
    feature_yeojohnson = stats.yeojohnson(feature)[0]

    # x and y are passed by keyword: positional data vectors are deprecated
    # in seaborn 0.11 and rejected with a TypeError from 0.12 onwards.
    # NOTE: np.log maps zero-valued entries to -inf, so those points do not
    # appear in the first panel.
    sns.regplot(x=np.log(feature), y=target_boxcox, ax=axs[i, 0]).set_title('Relationship Between Log of {} and Target'.format(col), fontsize=8, pad=15)
    sns.regplot(x=feature, y=target_boxcox, ax=axs[i, 1], color='magenta').set_title('Relationship Between {} and Box-Cox of Target'.format(col), fontsize=8, pad=20)
    sns.regplot(x=feature_yeojohnson, y=target_raw, color='green', ax=axs[i, 2]).set_title('Relationship Between Yeo-Johnson of {} and Target'.format(col), fontsize=8, pad=20)
    sns.regplot(x=feature_yeojohnson, y=target_boxcox, color='orange', ax=axs[i, 3]).set_title('Relationship Between Yeo-Johnson of {} and Box-Cox of Target'.format(col), fontsize=8, pad=20)

fig.tight_layout(pad=2.0)


The Yeo-Johnson transforms don't seem to work well with the data. Log-transforming the features and Box-Cox-transforming the target seem to result in decent variation, but drop most of the values: the log of zero is undefined, and most feature values are zero.

# ONLY WHEN FINISHED

In [None]:
# Cleaning up saved data files.
# Guarded so that re-running this cell after the directory has already been
# removed is a no-op instead of raising FileNotFoundError.
if os.path.isdir(data_dir):
    shutil.rmtree(data_dir)