In [1]:
# necessary libraries for pre-processing
import utils
import pandas as pd
import numpy as np
import os
from sklearn import preprocessing

# Useful functions for preprocessing

In [2]:
def convert_date(df, column, date_format='%y%m%d'):
    '''Return a copy of df whose `column` is parsed into pandas datetimes.

    df: source dataframe; it is left untouched.
    column: name of the column containing the dates to parse.
    date_format: strptime-style format of the stored dates (default '%y%m%d').
    '''
    parsed_dates = pd.to_datetime(df[column], format=date_format)

    result = df.copy()
    result[column] = parsed_dates
    return result

In [3]:
def encode_column(df, column, options_list):
    '''Return a copy of df with `column` replaced by integer labels.

    A LabelEncoder is fitted on options_list and then applied to the column,
    so every value found in the column must be present in options_list.
    The input dataframe is not modified.
    '''
    encoder = preprocessing.LabelEncoder()
    encoder.fit(options_list)

    encoded_df = df.copy()
    encoded_df[column] = encoder.transform(encoded_df[column])
    return encoded_df

## Imputation

In [4]:
def get_null_summary(dataset):
    '''Display, for each column, the fraction of its values that are null.'''
    null_fraction = dataset.isnull().mean()
    display(null_fraction)

In [5]:
def clean_nulls(dataset, threshold=0.7):
    '''Remove the columns and rows that contain too many nulls.

    A column is kept only when its fraction of nulls is strictly below the threshold.
    Afterwards, using the remaining columns, a row is kept only when its fraction
    of nulls is strictly below the same threshold.
    The threshold is a value between 0 and 1.
    '''
    # Columns first: the row null rate is computed on the surviving columns only
    column_null_rate = dataset.isnull().mean()
    kept_columns = dataset.columns[column_null_rate < threshold]
    dataset = dataset[kept_columns]

    # Then rows
    row_null_rate = dataset.isnull().mean(axis=1)
    return dataset.loc[row_null_rate < threshold]

In [6]:
def numerical_imputation(dataset, replacer=None):
    '''Fill the null values of the dataset and return the result as a new object.

    dataset: DataFrame (or Series) to impute; it is not modified.
    replacer: value handed to fillna. When None (the default) the median is used;
        for a DataFrame this is the per-column median of the numeric columns.
    '''
    if replacer is None:
        # Test against None rather than truthiness: a replacer of 0 is a valid
        # fill value, and a Series/DataFrame replacer has no truth value at all.
        if isinstance(dataset, pd.DataFrame):
            # numeric_only avoids a TypeError on frames that also hold text columns
            replacer = dataset.median(numeric_only=True)
        else:
            replacer = dataset.median()

    return dataset.fillna(replacer)

In [7]:
def categorical_imputation(dataset, column_name, replacer=None):
    '''Replace the null values of the given column with the given replacer.

    dataset: dataframe to impute. Its column is overwritten, so the caller's
        dataframe is modified; the same object is also returned.
    column_name: name of the column to fill.
    replacer: fill value. When None (the default), the most frequent value of
        the column is used.
    '''
    if replacer is None:
        # Test against None rather than truthiness so that falsy replacers
        # such as 0 or '' are honoured instead of silently ignored.
        replacer = dataset[column_name].value_counts().idxmax()

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # the selected column: the chained in-place form acts on an intermediate
    # object and does not reliably update the dataframe.
    dataset[column_name] = dataset[column_name].fillna(replacer)

    return dataset

## Handling Outliers

In [8]:
def display_to_drop_std(dataset, column, mult_factor=3):
    '''Display the rows that will be dropped using the std approach.

    dataset: dataframe to inspect.
    column: name of the numerical column used to detect outliers.
    mult_factor: number of standard deviations around the mean that is still
        considered normal (default 3).

    The rows shown are those whose value does not lie strictly between
    mean - mult_factor * std and mean + mult_factor * std.
    '''
    values = dataset[column]
    center = values.mean()
    spread = values.std() * mult_factor
    upper_lim = center + spread
    lower_lim = center - spread

    # A value cannot be above the upper limit AND below the lower limit at the
    # same time, so the rows to drop are the negation of the "keep" condition.
    keep_mask = (values < upper_lim) & (values > lower_lim)
    display(dataset[~keep_mask])

def drop_outliers_std(dataset, column, mult_factor=3):
    '''Return the rows whose `column` value is not an outlier (std approach).

    A row is kept when its value lies strictly between
    mean - mult_factor * std and mean + mult_factor * std of the column.
    '''
    values = dataset[column]
    margin = values.std() * mult_factor
    lower_lim = values.mean() - margin
    upper_lim = values.mean() + margin

    inside_limits = (values > lower_lim) & (values < upper_lim)
    return dataset[inside_limits]

In [9]:
def display_to_drop_percentile(dataset, column, lower_quantile=.05, upper_quantile=.95):
    '''Display the rows that will be dropped with the Percentiles approach.

    dataset: dataframe to inspect.
    column: name of the numerical column used to detect outliers.
    lower_quantile: quantile (between 0 and 1) used as the lower limit (default .05).
    upper_quantile: quantile (between 0 and 1) used as the upper limit (default .95).

    The rows shown are those whose value does not lie strictly between the two
    quantile values.
    '''
    values = dataset[column]
    upper_lim = values.quantile(upper_quantile)
    lower_lim = values.quantile(lower_quantile)

    # A value cannot be above the upper limit AND below the lower limit at the
    # same time, so the rows to drop are the negation of the "keep" condition.
    keep_mask = (values < upper_lim) & (values > lower_lim)
    display(dataset[~keep_mask])

def drop_outliers_percentile(dataset, column, lower_quantile=.05, upper_quantile=.95):
    '''Drop the outlier rows with the Percentiles approach.

    dataset: dataframe to filter; it is not modified.
    column: name of the numerical column used to detect outliers.
    lower_quantile: quantile (between 0 and 1) used as the lower limit (default .05).
    upper_quantile: quantile (between 0 and 1) used as the upper limit (default .95).

    Returns the rows whose value lies strictly between the two quantile values.
    '''
    values = dataset[column]
    upper_lim = values.quantile(upper_quantile)
    lower_lim = values.quantile(lower_quantile)

    # The filtered frame must be returned: it used to be stored in a local
    # variable and discarded, which made the function return None.
    return dataset[(values < upper_lim) & (values > lower_lim)]

## Binning

In [10]:
def numerical_binning(dataset):
    '''Placeholder for the binning of numerical columns.

    Not implemented yet: the dataset is ignored and False is always returned.
    '''
    # TODO: implement; reference material:
    # https://towardsdatascience.com/feature-engineering-for-machine-learning-3a5e293a5114
    return False

In [11]:
def categorical_binning(dataset):
    '''Placeholder for the binning of categorical columns.

    Not implemented yet: the dataset is ignored and False is always returned.
    '''
    # TODO: implement
    return False

# Predictions

### For a first, simpler approach, we will only use the 'loan' and 'account' tables

In [12]:
# Reading the different train tables (';'-delimited CSV files)
# NOTE(review): utils.read_csv_to_df is a project helper; presumably it returns a pandas DataFrame — confirm.
loan_df = utils.read_csv_to_df('competition_dataset/loan_train.csv', delimiter=';')
account_df = utils.read_csv_to_df('competition_dataset/account.csv', delimiter=';')

# Loans of the test set; they are joined with the same account table further down
loan_test_df = utils.read_csv_to_df('competition_dataset/loan_test.csv', delimiter=';')

# Show the first rows of each training table to inspect their columns
print(' ::: Tables Scheme :::')
display(loan_df.head())
display(account_df.head())

 ::: Tables Scheme :::


Unnamed: 0,loan_id,account_id,date,amount,duration,payments,status
0,5314,1787,930705,96396,12,8033,-1
1,5316,1801,930711,165960,36,4610,1
2,6863,9188,930728,127080,60,2118,1
3,5325,1843,930803,105804,36,2939,1
4,7240,11013,930906,274740,60,4579,1


Unnamed: 0,account_id,district_id,frequency,date
0,576,55,monthly issuance,930101
1,3818,74,monthly issuance,930101
2,704,55,monthly issuance,930101
3,2378,16,monthly issuance,930101
4,2632,24,monthly issuance,930102


In [13]:
def process_account_df(account_df):
    '''Return a processed copy of the account dataframe.

    The 'date' column is parsed into datetimes and the 'frequency' column is
    label-encoded over its three known values.
    '''
    frequency_options = ['monthly issuance', 'weekly issuance', 'issuance after transaction']

    # Dates first, then the categorical frequency
    dated_accounts = convert_date(account_df, 'date')
    return encode_column(dated_accounts, 'frequency', frequency_options)


# Joining the different tables
def compose_dataset(loan_df, account_df):
    '''Join the loan and account tables and apply feature engineering.

    The result holds one row per loan that has a matching account, with an
    'account_age' feature (days between the account date and the loan date)
    replacing both date columns, and 'status' placed as the last column.
    '''
    loans = convert_date(loan_df, 'date')
    accounts = process_account_df(account_df)

    # TODO: Transactions will be one of the hardest, since aggregations and groupbys are needed

    # Rename the account date so that it does not clash with the loan date
    accounts = accounts.rename(columns={'date': 'account_creation_date'})
    joined = loans.merge(accounts, on='account_id')

    # Age of the account, in days, at the moment the loan was granted
    age = joined['date'] - joined['account_creation_date']
    joined['account_age'] = age.dt.days
    joined = joined.drop(['date', 'account_creation_date'], axis=1)

    # Placing status column as last column
    feature_columns = [col for col in joined if col not in ['status']]
    return joined[feature_columns + ['status']]


# Build the training dataset from the loan and account tables and show it
dataset = compose_dataset(loan_df, account_df)
display(dataset)

Unnamed: 0,loan_id,account_id,amount,duration,payments,district_id,frequency,account_age,status
0,5314,1787,96396,12,8033,30,2,105,-1
1,5316,1801,165960,36,4610,46,1,148,1
2,6863,9188,127080,60,2118,45,1,170,1
3,5325,1843,105804,36,2939,12,1,185,1
4,7240,11013,274740,60,4579,1,2,204,1
...,...,...,...,...,...,...,...,...,...
323,6818,9030,155616,48,3242,72,1,691,1
324,5625,3189,222180,60,3703,29,1,382,-1
325,6805,8972,45024,48,938,70,1,214,1
326,7233,10963,115812,36,3217,16,1,585,1


In [14]:
# We should start 'cleansing' the data here
# First step: show the fraction of null values of each column
get_null_summary(dataset)

# Outlier inspection of every column (currently disabled):
# for column in dataset:
    # display_to_drop_std(dataset, column)
    # display_to_drop_percentile(dataset, column)

loan_id        0.0
account_id     0.0
amount         0.0
duration       0.0
payments       0.0
district_id    0.0
frequency      0.0
account_age    0.0
status         0.0
dtype: float64

In [15]:
# Good tutorial for feature engineering:
# https://medium.com/datadriveninvestor/a-simple-guide-to-creating-predictive-models-in-python-part-1-8e3ddc3d7008

In [16]:
# Outputting the resultant tables to final csv files
# NOTE(review): utils.write_df_to_csv is a project helper; presumably its arguments are
# (dataframe, output directory, file name) — confirm against utils.
utils.write_df_to_csv(dataset, 'dataset', 'preprocessed_data.csv')
# The test loans go through the same compose_dataset pipeline as the training loans
utils.write_df_to_csv(compose_dataset(loan_test_df, account_df),
                      'dataset', 'test_dataset.csv')