In [1]:
# necessary libraries for pre-processing
import utils
import pandas as pd
import numpy as np
import os

# Useful functions for preprocessing

## Imputation

In [2]:
def get_null_summary(dataset):
    '''Display, for every column, the fraction of its entries that are null.'''
    null_share_per_column = dataset.isnull().mean()
    display(null_share_per_column)

In [3]:
def clean_nulls(dataset, threshold=0.7):
    '''Remove sparsely populated columns and rows from the given table.

    A column is kept only when its share of nulls is strictly below the threshold.
    Afterwards, looking only at the remaining columns, a row is kept only when
    its share of nulls is strictly below the threshold.
    The threshold is a value between 0 and 1.
    Returns the filtered table.'''
    # Column pass: share of nulls per column
    column_null_share = dataset.isnull().mean()
    kept_columns = dataset.columns[column_null_share < threshold]
    without_sparse_columns = dataset[kept_columns]

    # Row pass: share of nulls per row, measured on the surviving columns
    row_null_share = without_sparse_columns.isnull().mean(axis=1)
    return without_sparse_columns.loc[row_null_share < threshold]

In [4]:
def numerical_imputation(dataset, replacer=None):
    '''Fill null values with a replacer, or with the median when none is given.

    dataset  -- table (or single column) whose nulls are filled
    replacer -- value(s) passed to fillna; when None, the median of each
                numeric column is used instead

    Returns a new object; the input is not modified.'''
    # Compare against None explicitly: a replacer of 0 is a legitimate fill
    # value, and a Series/dict replacer has no unambiguous truth value.
    if replacer is None:
        if isinstance(dataset, pd.DataFrame):
            # numeric_only keeps text columns from breaking the median
            replacer = dataset.median(numeric_only=True)
        else:
            replacer = dataset.median()

    return dataset.fillna(replacer)

In [5]:
def categorical_imputation(dataset, column_name, replacer=None):
    '''Replace the missing values of the given column.

    dataset     -- table holding the column; its column is overwritten
    column_name -- name of the column to fill
    replacer    -- fill value; when None, the most frequent value of the
                   column is used

    Returns the same dataset object that was passed in.'''
    column = dataset[column_name]

    # Compare against None explicitly so falsy replacers ('' or 0) are honoured
    if replacer is None:
        frequencies = column.value_counts()
        if frequencies.empty:
            # No observed value to infer a fill from: leave the column as is
            return dataset
        replacer = frequencies.idxmax()

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # dataset[column_name]: the latter acts on an intermediate object and may
    # leave the dataset itself untouched.
    dataset[column_name] = column.fillna(replacer)

    return dataset

## Handling Outliers

In [6]:
def display_to_drop_std(dataset, column, mult_factor=3):
    '''Display the rows that will be dropped using the std approach.

    dataset     -- table to inspect
    column      -- column whose values are tested
    mult_factor -- number of standard deviations around the mean that
                   delimits the accepted range'''
    values = dataset[column]
    center = values.mean()
    upper_lim = center + values.std() * mult_factor
    lower_lim = center - values.std() * mult_factor

    # A row is dropped when it is NOT strictly inside the limits. The previous
    # mask required a value to be above the upper AND below the lower limit at
    # once, which can never hold. Negating the keep-mask also shows rows whose
    # value is null, since those fail both comparisons.
    kept = (values < upper_lim) & (values > lower_lim)
    display(dataset[~kept])

def drop_outliers_std(dataset, column, mult_factor=3):
    '''Keep only the rows whose value in the column lies strictly within
    mult_factor standard deviations of the column mean.'''
    values = dataset[column]
    upper_lim = values.mean() + values.std() * mult_factor
    lower_lim = values.mean() - values.std() * mult_factor

    inside_limits = (values < upper_lim) & (values > lower_lim)
    return dataset[inside_limits]

In [7]:
def display_to_drop_percentile(dataset, column, lower_quantile=.05, upper_quantile=.95):
    '''Display the rows that will be dropped with Percentiles approach.

    dataset        -- table to inspect
    column         -- column whose values are tested
    lower_quantile -- quantile used as the lower limit (default .05)
    upper_quantile -- quantile used as the upper limit (default .95)'''
    values = dataset[column]
    upper_lim = values.quantile(upper_quantile)
    lower_lim = values.quantile(lower_quantile)

    # A row is dropped when it is NOT strictly between the two limits. The
    # previous mask combined ">= upper" and "<= lower" with AND, which never
    # matches. Negating the keep-mask also shows rows whose value is null.
    kept = (values < upper_lim) & (values > lower_lim)
    display(dataset[~kept])

def drop_outliers_percentile(dataset, column, lower_quantile=.05, upper_quantile=.95):
    '''Drop the outlier rows with Percentiles approach.

    dataset        -- table to filter
    column         -- column whose values are tested
    lower_quantile -- quantile used as the lower limit (default .05)
    upper_quantile -- quantile used as the upper limit (default .95)

    Returns the rows whose value lies strictly between the two limits.'''
    values = dataset[column]
    upper_lim = values.quantile(upper_quantile)
    lower_lim = values.quantile(lower_quantile)

    # The filtered table used to be stored in an unused local, so callers
    # received None; return it instead.
    return dataset[(values < upper_lim) & (values > lower_lim)]

## Binning

In [8]:
def numerical_binning(dataset):
    '''Placeholder for numerical binning (not implemented yet).

    The dataset is currently ignored and False is always returned.'''
    # TODO
    implemented = False
    return implemented

In [9]:
def categorical_binning(dataset):
    '''Placeholder for categorical binning (not implemented yet).

    The dataset is currently ignored and False is always returned.'''
    # TODO
    implemented = False
    return implemented

# Predictions

### As a first, simpler approach, we start from the 'loan' table, joined with the 'account' and 'trans' tables

In [10]:
# Train tables: loans, transactions and accounts (all CSVs are ';'-delimited)
loan_df, trans_df, account_df = (
    utils.read_csv_to_df(csv_path, delimiter=';')
    for csv_path in ('competition_dataset/loan_train.csv',
                     'competition_dataset/trans_train.csv',
                     'competition_dataset/account.csv')
)

# Test tables: loans and transactions
loan_test_df, trans_test_df = (
    utils.read_csv_to_df(csv_path, delimiter=';')
    for csv_path in ('competition_dataset/loan_test.csv',
                     'competition_dataset/trans_test.csv')
)

# Preview the first rows of each train table
print(' ::: Tables Scheme :::')
display(loan_df.head(), trans_df.head(), account_df.head())

 ::: Tables Scheme :::


  if (await self.run_code(code, result,  async_=asy)):


Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
0,5314,1787,930705,96396,12,8033,-1
1,5316,1801,930711,165960,36,4610,1
2,6863,9188,930728,127080,60,2118,1
3,5325,1843,930803,105804,36,2939,1
4,7240,11013,930906,274740,60,4579,1


Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,1548749,5270,930113,credit,credit in cash,800.0,800.0,,,
1,1548750,5270,930114,credit,collection from another bank,44749.0,45549.0,,IJ,80269753.0
2,3393738,11265,930114,credit,credit in cash,1000.0,1000.0,,,
3,3122924,10364,930117,credit,credit in cash,1100.0,1100.0,,,
4,1121963,3834,930119,credit,credit in cash,700.0,700.0,,,


Unnamed: 0,account_id,district_id,frequency,date
0,576,55,monthly issuance,930101
1,3818,74,monthly issuance,930101
2,704,55,monthly issuance,930101
3,2378,16,monthly issuance,930101
4,2632,24,monthly issuance,930102


In [11]:
# Joining the different tables
def compose_dataset(loan_df, account_df, trans_df):
    '''Join the loan, account and transaction tables into a single table.

    The account 'date' column is renamed to 'account_date'; the transaction
    'date' and 'amount' columns are renamed to 'trans_date' and 'trans_amount'.
    The transaction table goes through clean_nulls (default threshold) and its
    missing 'k_symbol' values are filled with 'Undefined' before joining.
    Both joins are made on 'account_id'.'''
    accounts = account_df.set_index('account_id')
    accounts = accounts.rename(columns={'date': 'account_date'})

    transactions = trans_df.set_index('account_id')
    transactions = transactions.rename(columns={'date': 'trans_date',
                                                'amount': 'trans_amount'})
    transactions = clean_nulls(transactions)
    transactions = categorical_imputation(transactions, 'k_symbol', 'Undefined')

    loans_with_account = loan_df.join(accounts, on='account_id')
    return loans_with_account.join(transactions, on='account_id')

# Build the joined train table and show it
dataset = compose_dataset(loan_df=loan_df,
                          account_df=account_df,
                          trans_df=trans_df)
display(dataset)

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status,district_id,frequency,account_date,trans_id,trans_date,type,operation,trans_amount,balance,k_symbol
0,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,930322,523621,930322,credit,credit in cash,1100.0,1100.0,Undefined
0,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,930322,524054,930421,credit,credit in cash,9900.0,11000.0,Undefined
0,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,930322,524055,930521,credit,credit in cash,5800.0,16800.0,Undefined
0,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,930322,524056,930620,credit,credit in cash,3300.0,20100.0,Undefined
1,5316,1801,930711,165960,36,4610,1,46,monthly issuance,930213,527445,930213,credit,credit in cash,700.0,700.0,Undefined
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,7308,11362,961227,129408,24,5392,1,67,monthly issuance,951014,3424164,961206,withdrawal,remittance to another bank,129.0,39765.5,
327,7308,11362,961227,129408,24,5392,1,67,monthly issuance,951014,3424358,961207,withdrawal,withdrawal in cash,10400.0,29365.5,Undefined
327,7308,11362,961227,129408,24,5392,1,67,monthly issuance,951014,3424116,961207,withdrawal,remittance to another bank,330.0,29035.5,insurrance payment
327,7308,11362,961227,129408,24,5392,1,67,monthly issuance,951014,3424068,961208,withdrawal,remittance to another bank,56.0,28979.5,


In [12]:
# We should start 'cleansing' the data here.
# For now the cell only inspects the joined table: it shows the table itself
# and then the fraction of null values in each column.
display(dataset)
get_null_summary(dataset)

# Disabled preview of the rows the std-based outlier removal would drop, per column.
# NOTE(review): the two commented calls below are identical; possibly the second
# one was meant to be display_to_drop_percentile - confirm before enabling.
# for column in dataset:
    # display_to_drop_std(dataset, column)
    # display_to_drop_std(dataset, column)

Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status,district_id,frequency,account_date,trans_id,trans_date,type,operation,trans_amount,balance,k_symbol
0,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,930322,523621,930322,credit,credit in cash,1100.0,1100.0,Undefined
0,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,930322,524054,930421,credit,credit in cash,9900.0,11000.0,Undefined
0,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,930322,524055,930521,credit,credit in cash,5800.0,16800.0,Undefined
0,5314,1787,930705,96396,12,8033,-1,30,weekly issuance,930322,524056,930620,credit,credit in cash,3300.0,20100.0,Undefined
1,5316,1801,930711,165960,36,4610,1,46,monthly issuance,930213,527445,930213,credit,credit in cash,700.0,700.0,Undefined
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
327,7308,11362,961227,129408,24,5392,1,67,monthly issuance,951014,3424164,961206,withdrawal,remittance to another bank,129.0,39765.5,
327,7308,11362,961227,129408,24,5392,1,67,monthly issuance,951014,3424358,961207,withdrawal,withdrawal in cash,10400.0,29365.5,Undefined
327,7308,11362,961227,129408,24,5392,1,67,monthly issuance,951014,3424116,961207,withdrawal,remittance to another bank,330.0,29035.5,insurrance payment
327,7308,11362,961227,129408,24,5392,1,67,monthly issuance,951014,3424068,961208,withdrawal,remittance to another bank,56.0,28979.5,


loan_id         0.000000
account_id      0.000000
date            0.000000
amount          0.000000
duration        0.000000
payments        0.000000
status          0.000000
district_id     0.000000
frequency       0.000000
account_date    0.000000
trans_id        0.000000
trans_date      0.000000
type            0.000000
operation       0.176819
trans_amount    0.000000
balance         0.000000
k_symbol        0.000000
dtype: float64

In [13]:
# Good tutorial for feature engineering:
# https://medium.com/datadriveninvestor/a-simple-guide-to-creating-predictive-models-in-python-part-1-8e3ddc3d7008

In [14]:
# Write the resulting train table to its final csv
utils.write_df_to_csv(dataset, 'dataset', 'preprocessed_data.csv')

# Compose the test table the same way (shared account table) and write it too
test_dataset = compose_dataset(loan_test_df, account_df, trans_test_df)
utils.write_df_to_csv(test_dataset, 'dataset', 'test_dataset.csv')