In [74]:
# necessary libraries for pre-processing
import utils
import pandas as pd
import numpy as np
import os

# Useful functions for preprocessing

## Imputation

In [75]:
def get_null_summary(dataset):
    '''Show, for every column of `dataset`, the fraction of its entries that are null.

    The per-column null rates (values between 0 and 1) are rendered with
    `display`; nothing is returned.'''
    null_rate_per_column = dataset.isna().mean()
    display(null_rate_per_column)

In [76]:
def clean_nulls(dataset, threshold=0.7):
    '''Remove the columns, and then the rows, that are mostly null.

    A column is kept only when its fraction of null values is strictly below
    `threshold`. Rows are filtered afterwards with the same rule, with their
    null fraction measured over the columns that survived the first step.

    `threshold` is a ratio between 0 and 1. The filtered table is returned.'''
    # Column pass: null ratio of every column, computed on the full table
    column_null_rate = dataset.isnull().mean()
    kept_columns = dataset.columns[column_null_rate < threshold]
    without_sparse_columns = dataset[kept_columns]

    # Row pass: null ratio of every row, computed on the remaining columns only
    row_null_rate = without_sparse_columns.isnull().mean(axis=1)

    return without_sparse_columns.loc[row_null_rate < threshold]

In [77]:
def numerical_imputation(dataset, replacer=None):
    '''Fill the null values of `dataset` and return the filled copy.

    dataset:  the DataFrame to impute; it is not modified.
    replacer: value (or per-column mapping/Series accepted by `fillna`) used to
              replace nulls. When None, each numeric column is filled with its
              own median and non-numeric columns are left untouched.'''
    # Compare against None explicitly: a legitimate replacer such as 0 is falsy,
    # and a Series replacer has no unambiguous truth value.
    if replacer is None:
        # numeric_only=True keeps the median from raising on text columns
        replacer = dataset.median(numeric_only=True)

    return dataset.fillna(replacer)

In [78]:
def categorical_imputation(dataset, column_name, replacer=None):
    '''Replace the null values of one column and return the dataset.

    dataset:     the DataFrame holding the column; the column is updated on
                 this same object, which is also the return value.
    column_name: name of the column to impute.
    replacer:    value used to replace nulls. When None, the most frequent
                 non-null value of the column is used. If the column has no
                 non-null value at all there is nothing to impute with, and
                 the dataset is returned unchanged.'''
    column = dataset[column_name]

    # Compare against None explicitly so falsy replacers (0, '') are honoured
    if replacer is None:
        frequencies = column.value_counts()
        if frequencies.empty:
            return dataset
        replacer = frequencies.idxmax()

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selection: the chained in-place form is deprecated and has no effect
    # on `dataset` when pandas copy-on-write is enabled.
    dataset[column_name] = column.fillna(replacer)

    return dataset

## Handling Outliers

In [79]:
def display_to_drop_std(dataset, column, mult_factor=3):
    '''Display the rows that the standard deviation approach would drop.

    A row is kept when its `column` value lies strictly inside
    mean +/- mult_factor * std; every other row is displayed. Rows whose value
    is null fail both comparisons and are therefore displayed as well.
    Nothing is returned.'''
    values = dataset[column]
    center = values.mean()
    spread = values.std() * mult_factor

    kept = (values < center + spread) & (values > center - spread)

    # Show the complement of the kept rows. Requiring a value to be above the
    # upper limit AND below the lower limit at once can never match anything.
    display(dataset[~kept])

def drop_outliers_std(dataset, column, mult_factor=3):
    '''Return the rows whose `column` value is not an outlier by the std rule.

    Only rows lying strictly inside mean +/- mult_factor * std of the column
    are kept; the input dataset itself is left as it is.'''
    values = dataset[column]
    center = values.mean()
    margin = values.std() * mult_factor

    inside_band = (values < center + margin) & (values > center - margin)

    return dataset[inside_band]

In [80]:
def display_to_drop_percentile(dataset, column, lower_quantile=.05, upper_quantile=.95):
    '''Display the rows that the percentiles approach would drop.

    A row is kept when its `column` value lies strictly between the
    `lower_quantile` and `upper_quantile` quantiles of the column (5% and 95%
    by default); every other row is displayed. Rows whose value is null fail
    both comparisons and are therefore displayed as well.
    Nothing is returned.'''
    values = dataset[column]
    upper_lim = values.quantile(upper_quantile)
    lower_lim = values.quantile(lower_quantile)

    kept = (values < upper_lim) & (values > lower_lim)

    # Show the complement of the kept rows. Requiring a value to be above the
    # upper limit AND below the lower limit at once can never match anything.
    display(dataset[~kept])

def drop_outliers_percentile(dataset, column, lower_quantile=.05, upper_quantile=.95):
    '''Return the rows whose `column` value is not an outlier by the percentiles rule.

    Only rows lying strictly between the `lower_quantile` and `upper_quantile`
    quantiles of the column (5% and 95% by default) are kept. The input
    dataset itself is left as it is.'''
    values = dataset[column]
    upper_lim = values.quantile(upper_quantile)
    lower_lim = values.quantile(lower_quantile)

    # The filtered table has to be returned: it used to be stored in a local
    # variable only, so callers always received None.
    return dataset[(values < upper_lim) & (values > lower_lim)]

## Binning

In [81]:
def numerical_binning(dataset):
    '''Placeholder for the binning of numerical columns; not implemented yet.

    For now `dataset` is ignored and False is always returned.'''
    # TODO: implement the numerical binning
    return False

In [82]:
def categorical_binning(dataset):
    '''Placeholder for the binning of categorical columns; not implemented yet.

    For now `dataset` is ignored and False is always returned.'''
    # TODO: implement the categorical binning
    return False

# Predictions

### For a first, simpler approach, we will only use the 'loan' table, joined with the 'account' table

In [83]:
# Reading the tables used below: the training loans and the accounts
# (both files are ';'-delimited)
loan_df = utils.read_csv_to_df('competition_dataset/loan_train.csv', delimiter=';')
account_df = utils.read_csv_to_df('competition_dataset/account.csv', delimiter=';')

# Render both tables, loans first
display(loan_df, account_df)

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
0,5314,1787,930705,96396,12,8033,-1
1,5316,1801,930711,165960,36,4610,1
2,6863,9188,930728,127080,60,2118,1
3,5325,1843,930803,105804,36,2939,1
4,7240,11013,930906,274740,60,4579,1
...,...,...,...,...,...,...,...
323,6818,9030,961212,155616,48,3242,1
324,5625,3189,961215,222180,60,3703,-1
325,6805,8972,961221,45024,48,938,1
326,7233,10963,961225,115812,36,3217,1


Unnamed: 0,account_id,district_id,frequency,date
0,576,55,monthly issuance,930101
1,3818,74,monthly issuance,930101
2,704,55,monthly issuance,930101
3,2378,16,monthly issuance,930101
4,2632,24,monthly issuance,930102
...,...,...,...,...
4495,124,55,monthly issuance,971228
4496,3958,59,monthly issuance,971228
4497,777,30,monthly issuance,971228
4498,1573,63,monthly issuance,971229


In [84]:
# Joining the different tables: the accounts get their 'date' column renamed to
# 'account_date' (so it cannot clash with the loan 'date') and are indexed by
# 'account_id'; every loan then receives the columns of its account.
account_df = account_df.rename(columns={'date': 'account_date'})\
                       .set_index('account_id')
loan_df = loan_df.join(account_df, on='account_id')
display(loan_df)

Unnamed: 0_level_0,district_id,frequency,account_date
account_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
576,55,monthly issuance,930101
3818,74,monthly issuance,930101
704,55,monthly issuance,930101
2378,16,monthly issuance,930101
2632,24,monthly issuance,930102
...,...,...,...
124,55,monthly issuance,971228
3958,59,monthly issuance,971228
777,30,monthly issuance,971228
1573,63,monthly issuance,971229


Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status,district_id,frequency,account_date
0,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,930322
1,5316,1801,930711,165960,36,4610,1,46,monthly issuance,930213
2,6863,9188,930728,127080,60,2118,1,45,monthly issuance,930208
3,5325,1843,930803,105804,36,2939,1,12,monthly issuance,930130
4,7240,11013,930906,274740,60,4579,1,1,weekly issuance,930214
...,...,...,...,...,...,...,...,...,...,...
323,6818,9030,961212,155616,48,3242,1,72,monthly issuance,950121
324,5625,3189,961215,222180,60,3703,-1,29,monthly issuance,951129
325,6805,8972,961221,45024,48,938,1,70,monthly issuance,960521
326,7233,10963,961225,115812,36,3217,1,16,monthly issuance,950520


In [85]:
# We should start 'cleansing' the data here.
# For now this cell only inspects the joined table and does not modify it:
# it shows the table, then the ratio of nulls per column, then whatever
# display_to_drop_std reports for the 'duration' and 'amount' columns.
display(loan_df)
get_null_summary(loan_df)
display_to_drop_std(loan_df, 'duration')
display_to_drop_std(loan_df, 'amount')

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status,district_id,frequency,account_date
0,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,930322
1,5316,1801,930711,165960,36,4610,1,46,monthly issuance,930213
2,6863,9188,930728,127080,60,2118,1,45,monthly issuance,930208
3,5325,1843,930803,105804,36,2939,1,12,monthly issuance,930130
4,7240,11013,930906,274740,60,4579,1,1,weekly issuance,930214
...,...,...,...,...,...,...,...,...,...,...
323,6818,9030,961212,155616,48,3242,1,72,monthly issuance,950121
324,5625,3189,961215,222180,60,3703,-1,29,monthly issuance,951129
325,6805,8972,961221,45024,48,938,1,70,monthly issuance,960521
326,7233,10963,961225,115812,36,3217,1,16,monthly issuance,950520


loan_id         0.0
account_id      0.0
date            0.0
amount          0.0
duration        0.0
payments        0.0
status          0.0
district_id     0.0
frequency       0.0
account_date    0.0
dtype: float64

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status,district_id,frequency,account_date


Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status,district_id,frequency,account_date


In [86]:
# Good tutorial for feature engineering:
# https://medium.com/datadriveninvestor/a-simple-guide-to-creating-predictive-models-in-python-part-1-8e3ddc3d7008

In [87]:
# Outputting the resulting table to a final csv.
# NOTE(review): presumably 'dataset' is the target directory and
# 'preprocessed_data.csv' the file name - confirm against utils.write_df_to_csv.
utils.write_df_to_csv(loan_df, 'dataset', 'preprocessed_data.csv')