In [1]:
import pandas as pd 
import numpy as np 
import sklearn
import matplotlib.pyplot as plt

In [2]:
# Print the kernel's working directory; the data files read below are addressed relative to it.
! pwd

/home/arbiterelegantiae/cs/master/fds/project/repo/homecreditdefault


In [3]:
# Load the raw application tables for the train and the test split
train, test = [
    pd.read_csv(csv_path)
    for csv_path in ("../../data/application_train.csv",
                     "../../data/application_test.csv")
]

In [ ]:
# Exploration basics: check how balanced the train set is.
# Prints the share of rows with TARGET == 0. The denominator comes from the frame
# itself rather than a hard-coded row count, so the ratio stays correct when the
# input file (or a subsample of it) changes.
nodef = train[train['TARGET'] == 0]
print(len(nodef) / len(train))

In [ ]:
train.dtypes
# The dtypes show many categorical features, which we want to turn into numerical ones:
#  - binary categorical features -> a single 0/1 column
#  - features with more than two categories: numbering them 0, 1, 2, 3, ... would make the model treat the codes as magnitudes, although they are merely different categories. Replace each such feature with one indicator column per category instead.

In [ ]:
# List every object-dtype (categorical) column together with its distinct values
for col in train.columns:
    if train[col].dtype != object:
        continue
    print(col, train[col].unique(), sep="\n")

In [ ]:
def onehot_binenc(df):
    """Encode the categorical (object-dtype) columns of ``df`` as numbers.

    Object columns that have no missing values and at most two distinct
    values are replaced by a single integer column: every value becomes its
    position among the column's sorted distinct values (0 or 1), which is the
    code a label encoder fitted on that column assigns. All remaining object
    columns are expanded by ``pd.get_dummies`` into one indicator column per
    category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is left unmodified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the binary columns integer-coded and the other
        categorical columns replaced by indicator columns.
    """
    # Work on a copy so the caller's frame is never altered as a side effect.
    encoded = df.copy()

    for col in encoded.columns:
        values = encoded[col]
        if values.dtype != object:
            continue
        # A missing value is not a category: such columns are left to
        # get_dummies (which gives missing rows all-zero indicators) instead of
        # being pushed through a sort that mixes strings and NaN.
        if values.isnull().any() or values.nunique() > 2:
            continue
        codes, _ = pd.factorize(values, sort=True)
        encoded[col] = codes

    # One-hot encode whatever is still categorical (k > 2 categories, or missing values)
    return pd.get_dummies(encoded)
        

In [ ]:
# Encode train and test in ONE pass.
# onehot_binenc decides per column, from the values it sees, whether to emit a
# single 0/1 column or one indicator column per category. Encoding the two frames
# separately lets that decision differ for the same column (e.g. two categories in
# one frame, three in the other); the column then has different names in each frame
# and is silently lost when only the common columns are kept afterwards.
# Assumes train and test share every column except TARGET.
train_target = train['TARGET']
n_train_rows = len(train)

combined = pd.concat([train.drop('TARGET', axis=1), test], ignore_index=True)
combined = onehot_binenc(combined)

# Split back by position: the encoding keeps the row order
train = combined.iloc[:n_train_rows].copy()
train['TARGET'] = train_target.values
test = combined.iloc[n_train_rows:].reset_index(drop=True)
del combined

In [ ]:
# (rows, columns) of the encoded train frame
train.shape

In [ ]:
test.shape
# Train can end up with more columns than test: it carries TARGET, and every category that occurs only in the train set gets an indicator column that the test set lacks.

In [ ]:
# Train and test must expose the same set of feature columns.
# Set the label aside first: the alignment below removes it from train.
target = train['TARGET']

# Inner join on the column axis: keep only the columns present in both frames
train, test = train.align(test, axis=1, join='inner')

# Put the label back into train
train = train.assign(TARGET=target)

print('Training Features shape: ', train.shape)
print('Testing Features shape: ', test.shape)

In [ ]:
# Count the missing values per column and keep only the columns that have any
nulls = train.isnull().sum()
nulls = nulls.loc[nulls.gt(0)]

# Fraction of rows that is missing in each of those columns
nulls.div(len(train))

In [ ]:
# As the previous cell shows, a number of features have missing values; replace
# them with the column median.
#
# The medians are computed once, on the train features only, and applied to both
# frames, so test is filled with exactly the statistics the model is trained on
# (previously a second imputer was fitted on test itself).
# This is done with pandas because sklearn.preprocessing.Imputer has been removed
# from scikit-learn, which made the old import fail on current installs.
feature_medians = train[train.columns.difference(['TARGET'])].median()

train = train.fillna(feature_medians)
test = test.fillna(feature_medians)


In [ ]:
# Re-check train: which columns still contain missing values?
nulls = train.isnull().sum()
nulls = nulls.loc[nulls.gt(0)]

# Shape of the remaining list, i.e. how many columns still have missing values
nulls.shape

In [ ]:
# Same check for test: which columns still contain missing values?
nulls = test.isnull().sum()
nulls = nulls.loc[nulls.gt(0)]

# Missing-value count per affected column (empty when nothing is missing)
nulls

In [ ]:
# Preview the first rows of the train frame
train.head()

In [ ]:
### 1 Feature exploration ###

from featexp import get_univariate_plots

# Only DAYS_BIRTH is plotted here; per the featexp note, plots are drawn for all features when features_list is not passed.
get_univariate_plots(data=train, target_col='TARGET', 
                     features_list=['DAYS_BIRTH'], bins=10)
# RESULT: the younger the customers are, the more they tend to default. From the article: "The plot tells us that customers with high negative values for DAYS_BIRTH (higher age) have lower default rates"

In [ ]:
#### 2. Identifying noisy features ####

# Build a validation set: every row goes to the train part with probability 0.6
# and to the validation part otherwise.
# A dedicated, seeded generator makes the split (and everything derived from it)
# identical on every run without touching numpy's global random state.
split_rng = np.random.RandomState(42)
msk = split_rng.rand(len(train)) < 0.6
trainset = train[msk].astype(np.float32)
validationset = train[~msk].astype(np.float32)

In [ ]:
get_univariate_plots(data=trainset, target_col='TARGET', data_test=validationset, features_list=['DAYS_EMPLOYED'], bins=10)

# RESULT: the trend is highly correlated between the train and the validation part, so DAYS_EMPLOYED should not be a noisy feature.
# What if the trend changed a lot between the two parts? That may be a symptom of noise, but it may also come from this particular split:
# the rows falling into a bin can differ a lot in their other features, which alone can produce different default-rate trends.
# How to tell the two apart? Repeat the comparison over several splits (cross-validation).

In [ ]:
from featexp import get_trend_stats
# Trend statistics computed on trainset against validationset; later cells read the 'Feature' and 'Trend_correlation' columns of the result.
stats = get_trend_stats(data=trainset, target_col='TARGET', data_test=validationset)

In [ ]:
# Display the trend statistics of the first split
stats

In [ ]:
# Average the trend correlation over several random splits so that it is more representative.
# These are N_SPLITS independent random 60/40 splits (the one already held in
# `stats` plus N_SPLITS - 1 new ones), not the disjoint folds of a k-fold scheme.
N_SPLITS = 10

# Start from a copy: accumulating with += directly on stats['Trend_correlation']
# can write the running sum back into `stats`, so re-running the cell would keep
# inflating the stored values.
total_trend_correlations = stats['Trend_correlation'].copy()

# Seeded generator so the additional splits are the same on every run
resplit_rng = np.random.RandomState(2018)

for i in range(N_SPLITS - 1):
    msk = resplit_rng.rand(len(train)) < 0.6
    trainset = train[msk].astype(np.float32)
    validationset = train[~msk].astype(np.float32)

    ith_stats = get_trend_stats(data=trainset, target_col='TARGET', data_test=validationset)
    ith_tc = ith_stats['Trend_correlation']

    # Build a new Series each round instead of mutating in place
    total_trend_correlations = total_trend_correlations + ith_tc

averaged_trend_correlations = total_trend_correlations / N_SPLITS

In [ ]:
# Display the trend correlation averaged over the random splits
averaged_trend_correlations

In [ ]:
# Drop from train and test every column labelled as noisy, i.e. every feature
# whose averaged trend correlation is below the threshold.
# NOTE(review): the earlier comment announced a threshold of 0.9 while the code has
# always filtered with 0.80. The value the code uses is kept; confirm which one is intended.
NOISY_TREND_CORRELATION_THRESHOLD = 0.80

stats['Trend_correlation'] = averaged_trend_correlations

# SK_ID_CURR is kept whatever its score is, because it is needed later
is_noisy = (stats['Feature'] != 'SK_ID_CURR') & (stats['Trend_correlation'] < NOISY_TREND_CORRELATION_THRESHOLD)
noisy_f = stats.loc[is_noisy]['Feature'].reset_index()

# errors='ignore' keeps the cell re-runnable once the columns are already gone
train = train.drop(noisy_f['Feature'], axis=1, errors='ignore')
test = test.drop(noisy_f['Feature'], axis=1, errors='ignore')

In [ ]:
# Summary statistics of the test columns that remain after dropping the noisy features
test.describe()

In [ ]:
#### 3. Anomalies ####
# Look for anomalies with the describe method.
# DAYS_BIRTH is divided by -365 so that the summary reads as a positive age in years.
(train['DAYS_BIRTH'] / -365).describe()
# N: Those ages look reasonable. There are no outliers for the age on either the high or the low end.


In [ ]:
# How about the days of employment?
train['DAYS_EMPLOYED'].describe()
# N: Big outliers! describe shows a record that has apparently been working for about 1k years (and the value is a positive number).
#    Moreover, the mean is 174 years, which is still unrealistic.

In [ ]:
# Plot the DAYS_EMPLOYED distribution twice, side by side.
# Each histogram gets its own explicit axes: when both are drawn through the
# implicit pyplot state inside one cell, the second is painted onto the axes of
# the first and the two resolutions cannot be compared.
fig, (ax_coarse, ax_fine) = plt.subplots(1, 2, figsize=(14, 4))

# Default binning: the distribution is clearly split in two groups. Call the values
# around 1k years "anomalies" and the values around 0 "healthy".
train['DAYS_EMPLOYED'].plot.hist(title = 'Years Employment Histogram', ax=ax_coarse)
ax_coarse.set_xlabel('Days Employment')

# Increase the number of bins to see whether the distribution has a finer structure
train['DAYS_EMPLOYED'].plot.hist(title = 'Years Employment Histogram', bins=150, ax=ax_fine)
ax_fine.set_xlabel('Days Employment');

In [ ]:
# The previous plot suggests that all anomalies have exactly the same value
# (365243 days, about 1k years of employment); check it analytically.
anomalies = train[train['DAYS_EMPLOYED'] == 365243]
# Healthy records: between -17912 and 0 days, both bounds included
healthy = train[train['DAYS_EMPLOYED'].between(-17912, 0)]
print("Number of anomalies with 1k years of work: %d" % len(anomalies))
print("Number of healty records: %d" % len(healthy))
# Rows that belong to neither group. The total comes from the frame itself rather
# than a hard-coded row count, so the check stays valid if the data changes.
print("Is there anyone missing?: %d" % (len(train) - (len(healthy) + len(anomalies))))

In [ ]:
# All anomalies share exactly the same outlier value; now check whether they also differ in their default rate.
# The mean of TARGET, times 100, is printed as the percentage of defaulted loans in each group.
print('The non-anomalies default on %0.2f%% of loans' % (100 * healthy['TARGET'].mean()))
print('The anomalies default on %0.2f%% of loans' % (100 * anomalies['TARGET'].mean()))
# Compared with the default rate of the healthy records, the anomalies have a lower default rate.

In [ ]:
# There seems to be some correlation between the loans whose employment was wrongly
# recorded and the default rate, as if the anomaly encoded one or more hidden features.
# Tell the classifier about it with an explicit "employment anomaly" flag. The idea of
# days of employment should not be lost either, so every anomalous value is replaced
# by one common value: the median of the healthy train records.
DAYS_EMPLOYED_SENTINEL = 365243

# Create the anomaly flag column
train['DAYS_EMPLOYED_ANOM'] = train["DAYS_EMPLOYED"] == DAYS_EMPLOYED_SENTINEL

# Replace the anomalous values with the median of the remaining ones.
# The result is assigned back to the frame: replace/fillna with inplace=True on
# train['DAYS_EMPLOYED'] mutate an intermediate Series and are not guaranteed to
# reach the frame (they do not under pandas Copy-on-Write).
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].replace({DAYS_EMPLOYED_SENTINEL: np.nan})
healthy_median = train['DAYS_EMPLOYED'].median()
train['DAYS_EMPLOYED'] = train['DAYS_EMPLOYED'].fillna(healthy_median)

# The test set shows the same anomaly, so it gets the same preprocessing. It is filled
# with the train median so that both frames use one and the same replacement value.
test['DAYS_EMPLOYED_ANOM'] = test["DAYS_EMPLOYED"] == DAYS_EMPLOYED_SENTINEL
test['DAYS_EMPLOYED'] = test['DAYS_EMPLOYED'].replace({DAYS_EMPLOYED_SENTINEL: np.nan}).fillna(healthy_median)

In [ ]:
# Summary of the new anomaly flag column
train['DAYS_EMPLOYED_ANOM'].describe()

In [ ]:
# Take a look at the remaining columns to see whether any salient feature should be inspected for anomalies
from IPython.display import display
# Remove the limit on displayed columns so that describe() is shown in full.
# This sets the pandas option itself, so it stays in effect for the cells that follow.
pd.options.display.max_columns = None
display(train.describe())
train.shape

In [ ]:
#After an in-depth check of all the remaining features we did not find any substantial indication of anomalies. Moreover, most of the remaining features are normalized and hence difficult to interpret.

In [ ]:
# Save what has been done so far: write both preprocessed frames next to the raw data.
# index=False leaves the row index out of the files.
train.to_csv('../../data/train.csv', index=False)
test.to_csv('../../data/test.csv', index=False)

In [ ]:
# Continue to the merge phase