In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn import preprocessing

In [None]:
# Folder holding the input CSVs (relative to the notebook's working directory).
data_dir = 'input_data'

# Read the training set, promoting the 'ID' column to the row index.
train_csv_path = os.path.join(data_dir, 'train.csv')
train_df = pd.read_csv(train_csv_path, index_col='ID')

## Manual Feature Selection

Examining the correlation between each feature and the target using Spearman's rank correlation coefficient. Spearman's rank-order correlation was chosen because of the extreme range of values in the target data.

In [None]:
# Spearman correlation of every feature with the target. The result is cached
# on disk so a fresh "Run All" does not have to recompute it.
correlation_cache_path = os.path.join(data_dir, 'correlation_matrix.csv')

if os.path.exists(correlation_cache_path):
    # The cache is a headerless two-column CSV (feature name, coefficient).
    # `read_csv(..., squeeze=True)` was removed in pandas 2.0, so the single
    # data column is collapsed into a Series explicitly instead.
    correlation_matrix = pd.read_csv(
        correlation_cache_path, header=None, index_col=0
    ).squeeze('columns')
else:
    # Only the feature-vs-target coefficients are needed, so correlate each
    # feature with the target directly instead of building the full
    # feature-by-feature matrix and keeping one column of it. The target is
    # excluded by name rather than by assuming it is the first column.
    correlation_matrix = train_df.drop(columns='target').corrwith(
        train_df['target'], method='spearman'
    )
    correlation_matrix.to_csv(correlation_cache_path, header=False)

In [None]:
# Minimum absolute Spearman coefficient a feature needs to be retained.
spearman_threshold = 0.1

# Keep features whose |rho| exceeds the threshold, ordered from the most
# positive coefficient to the most negative.
passes_threshold = correlation_matrix.abs().gt(spearman_threshold)
correlated_features = correlation_matrix.loc[passes_threshold].sort_values(ascending=False)

# Target first, followed by the retained features in the order above.
retained_feature_names = correlated_features.index.tolist()
train_df_reduced = pd.concat(
    [train_df['target'], train_df.loc[:, retained_feature_names]],
    axis=1,
)

The features have been reduced to those whose absolute Spearman correlation with the target exceeds 0.1. Next, I examine the correlation of the remaining features with one another and remove those that are highly correlated (absolute Spearman coefficient of 0.7 or more) with another feature.

In [None]:
# Absolute pairwise Spearman correlation between the retained features.
# Column 0 of train_df_reduced is the target, so it is skipped.
feature_only_df = train_df_reduced.iloc[:, 1:]
feature_matrix = feature_only_df.corr(method='spearman').abs()

# Heatmap of the feature-vs-feature correlations on a 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(feature_matrix, cmap='Reds', ax=ax)

In [None]:
# Removing correlated features
# Greedy filter: walk the features in their current order and, for every
# feature that is still kept, drop each later feature whose absolute Spearman
# correlation with it reaches the cut-off.
correlation_cutoff = 0.7

abs_corr = feature_matrix.values
n_features = abs_corr.shape[0]
columns = np.full((n_features,), True, dtype=bool)

for i in range(n_features):
    # A feature that has already been dropped must not eliminate others.
    # Otherwise a feature that is uncorrelated with every *kept* feature
    # could still be removed (chain A~B, B~C: B goes because of A, and C
    # would then go because of the already-removed B).
    if not columns[i]:
        continue
    # NaN coefficients compare False, so they never trigger a removal.
    columns[i + 1:] &= ~(abs_corr[i, i + 1:] >= correlation_cutoff)

# Take the labels from feature_matrix (the frame the mask was built from) so
# that re-running this cell after train_df_reduced has shrunk does not fail
# with a mask/column length mismatch.
selected_columns = feature_matrix.columns[columns]
train_df_reduced = pd.concat((train_df_reduced['target'], train_df_reduced[selected_columns]), axis=1)

Features have been further reduced based on their Spearman correlation to one another. Now applying transforms.

## Feature and Target Data Transformations

Based on the results of the EDA, I opted to normalize the target using the Box-Cox method and to transform the features with a log transform (with zeros retained).

In [None]:
# Feature transform applied to both the training and the test data.
transform_method = 'log'

def train_transform(df, method=None):
    """Box-Cox transform the target and transform the feature columns.

    The frame is expected to hold a 'target' column in position 0 followed by
    the feature columns. The input frame is left untouched; a transformed copy
    is returned, so re-running the defining cell or calling the function twice
    on the same raw frame gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Training data with 'target' as the first column. The target must be
        strictly positive, as required by ``scipy.stats.boxcox``.
    method : {'log', 'yeo-johnson', None}
        'log' replaces every strictly positive feature value by its natural
        logarithm and every other value (zero, negative, missing) by 0.
        'yeo-johnson' fits a standardizing ``PowerTransformer`` on the
        features and applies it. Any other value leaves the features as is.

    Returns
    -------
    (pandas.DataFrame, transformer)
        The transformed copy and the fitted ``PowerTransformer`` (``None``
        unless ``method == 'yeo-johnson'``).
    """
    transformer = None
    df = df.copy()  # never mutate the caller's frame

    # NOTE: the fitted Box-Cox lambda (second element of the result) is
    # discarded here, as in the original interface.
    df['target'] = stats.boxcox(df['target'])[0]

    features = df.iloc[:, 1:]
    if method == 'log':
        # Non-positive values become NaN before the log and 0 afterwards.
        transformed = np.log(features.where(features > 0)).fillna(0)
    elif method == 'yeo-johnson':
        transformer = preprocessing.PowerTransformer(method='yeo-johnson', standardize=True)
        transformed = pd.DataFrame(
            transformer.fit_transform(features.values),
            index=df.index,
            columns=features.columns,
        )
    else:
        return df, transformer

    # Rebuild the frame instead of writing floats into existing (possibly
    # integer) columns through iloc, which relies on implicit upcasting.
    return pd.concat([df.iloc[:, :1], transformed], axis=1), transformer


In [None]:
# Transform the reduced training set. `transformer` is the fitted
# PowerTransformer for the 'yeo-johnson' method and None otherwise.
train_df_reduced, transformer = train_transform(
    df=train_df_reduced,
    method=transform_method,
)

In [None]:
# Persist the reduced, transformed training set next to the input data.
train_reduced_path = os.path.join(data_dir, 'train_reduced.csv')
train_df_reduced.to_csv(train_reduced_path)

I will apply the same steps to the test dataset here:

In [None]:
#Specifying columns to be used, then reading in reduced dataset
# Feature columns in the exact order used for training (target excluded).
feature_cols = [col for col in train_df_reduced.columns if col != 'target']
# Same columns plus the index column ('ID'), as needed by read_csv.
cols_to_use = list(train_df_reduced.reset_index().drop('target', axis=1).columns)
chunksize = 5000

# Read the test set in chunks, loading only the selected columns.
file_reader = pd.read_csv(os.path.join(data_dir, 'test.csv'), index_col='ID', chunksize=chunksize, usecols=cols_to_use)
# read_csv ignores the order of `usecols` and returns columns in file order.
# Re-align them with the training feature order so that the fitted transformer
# and any positional consumer see the columns in the same order as training.
test_df_reduce = pd.concat(file_reader, ignore_index=False).reindex(columns=feature_cols)


In [None]:
#Transforming test dataset, applying same method as that to training data

def test_transform(df, method=None):
    if method == 'log':
        log_vals = np.log(df.iloc[:, 1:].mask(df <=0)).fillna(0).values
        df.iloc[:, 1:] = log_vals
        return df
    elif method == 'yeo-johnson':
        global transformer
        yeo_vals = transformer.transform(df.values)
        df.iloc[:, :] = yeo_vals
        return df
    else:
        return df

In [None]:
# Bind the returned frame explicitly instead of relying on the function
# mutating its argument, and preview a few rows rather than echoing the
# whole frame as the cell output.
test_df_reduce = test_transform(test_df_reduce, method=transform_method)
test_df_reduce.head()

In [None]:
# Persist the reduced, transformed test set next to the input data.
test_reduced_path = os.path.join(data_dir, 'test_reduced.csv')
test_df_reduce.to_csv(test_reduced_path)