# Data Processing

1. Read .csv files
2. Clean tables
3. Merge tables
4. Select features

In [1]:
import pandas as pd
import numpy as np
from datetime import date as d

# Let printed DataFrames show up to 500 columns and use up to 1000 characters of width.
pd.options.display.max_columns = 500
pd.options.display.width = 1000

Utility functions

In [2]:
def read_file(file_name):
    """Load ``data/<file_name>.csv`` into a DataFrame.

    The file is parsed as semicolon-separated, ``'NA'`` and the empty string
    are passed as extra missing-value markers, and surrounding whitespace is
    stripped from every column name.
    """
    csv_path = 'data/' + file_name + '.csv'
    table = pd.read_csv(csv_path, sep=';', na_values=['NA', ''], low_memory=False)
    return table.rename(columns=str.strip)

In [3]:
def parse_date(date):
    """Convert a ``yymmdd`` value into a ``pandas.Timestamp`` in the 1900s.

    Parameters
    ----------
    date : int, float or str
        Date encoded as ``yymmdd``. Numeric inputs that lost leading zeros
        (e.g. ``50101`` for ``050101``) are left-padded back to six digits.
        Only the first six characters are used.

    Returns
    -------
    pandas.Timestamp
        The parsed date, with the century fixed to ``19``.

    Raises
    ------
    ValueError
        If the digits do not form a valid ``%Y%m%d`` date.
    """
    # Numbers read from CSV may arrive as floats (930101.0) or as ints that
    # dropped a leading zero; normalise them to a plain integer first.
    if isinstance(date, (int, float, np.integer, np.floating)):
        date = int(date)
    digits = str(date).strip().zfill(6)[:6]
    # An explicit format avoids any format guessing by pandas.
    return pd.to_datetime('19' + digits, format='%Y%m%d')

def calc_age(date, reference_date=None):
    """Return the age in whole years of a ``yymmdd`` date.

    Parameters
    ----------
    date : int or str
        Date encoded as ``yymmdd``; it is decoded with ``parse_date``.
    reference_date : object with ``year``/``month``/``day``, optional
        Date at which the age is measured. Defaults to ``date.today()``,
        which makes the result depend on the day the notebook is run; pass a
        fixed date for reproducible features.

    Returns
    -------
    int
        Number of full years between ``date`` and the reference date.
    """
    born = parse_date(date)
    today = d.today() if reference_date is None else reference_date
    # Subtract one year when this year's anniversary has not happened yet.
    before_anniversary = (today.month, today.day) < (born.month, born.day)
    return today.year - born.year - before_anniversary

Read files, clean data and select features

In [4]:
def process_account():
    """Load the ``account`` table and prepare it for merging.

    The ``date`` column is replaced by ``age_acc`` (the value returned by
    ``calc_age``) and the verbose ``frequency`` labels are shortened.
    """
    accounts = read_file('account')

    # Turn the yymmdd date into an age and name the column accordingly.
    accounts['date'] = accounts['date'].apply(calc_age)
    accounts = accounts.rename(columns={'date': 'age_acc'})

    # Simpler values for frequency.
    frequency_labels = {
        'monthly issuance': 'monthly',
        'weekly issuance': 'weekly',
        'issuance after transaction': 'pos-transaction',
    }
    for verbose, short in frequency_labels.items():
        accounts.loc[accounts['frequency'] == verbose, 'frequency'] = short

    return accounts

In [5]:
def process_card(type_d):
    """Load ``card_<type_d>`` and keep only the columns used downstream.

    ``type`` is renamed to ``type_card`` and the ``issued`` column is
    dropped (it is discarded rather than parsed).
    """
    cards = read_file('card_' + type_d)
    return cards.rename(columns={'type': 'type_card'}).drop(columns='issued')

In [6]:
def process_client():
    """Load the ``client`` table and derive age and gender.

    ``birth_number`` is decoded into ``age_clt`` and ``gender_clt`` and then
    removed from the returned frame.
    """
    clients = read_file('client')

    def decode(birth_number):
        # A month field above 12 marks a female client; removing the 5000
        # offset restores a plain yymmdd value before computing the age.
        if int(str(birth_number)[2:4]) > 12:
            return calc_age(birth_number - 5000), 'F'
        return calc_age(birth_number), 'M'

    decoded = [decode(bn) for bn in clients['birth_number']]
    clients['age_clt'] = [age for age, _ in decoded]
    clients['gender_clt'] = [gender for _, gender in decoded]

    return clients.drop(columns='birth_number')

In [7]:
def process_disposition():
    """Load the ``disp`` table restricted to account owners.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``type`` is ``'OWNER'``, without the ``type`` column
        (it is constant after the filter).
    """
    dispositions = read_file('disp')

    # Only owners can ask for loans.
    owners = dispositions.loc[dispositions['type'] == 'OWNER']

    # The previous rename of 'type' to 'type_disp' ran after the column had
    # already been dropped, so it never had any effect and is omitted here.
    return owners.drop(columns='type')

In [8]:
def process_district():
    """Load the ``district`` table and backfill missing '95 statistics.

    Cells equal to ``'?'`` in the '95 unemployment and crime columns are
    replaced by the same row's '96 value. Both '95 columns are returned with
    a numeric dtype instead of a mix of strings and numbers. ``code`` is
    renamed to ``district_id``.

    Raises
    ------
    ValueError
        If a '95 column contains a non-numeric value other than ``'?'``.
    """
    district_df = read_file('district')

    backfill_pairs = [
        ("unemploymant rate '95", "unemploymant rate '96"),
        ("no. of commited crimes '95", "no. of commited crimes '96"),
    ]
    for col_95, col_96 in backfill_pairs:
        # Mask the '?' placeholders first so the rest of the column can be
        # converted strictly; then take the '96 figure where '95 is missing.
        known_95 = district_df[col_95].where(district_df[col_95] != '?')
        district_df[col_95] = pd.to_numeric(known_95).fillna(district_df[col_96])

    return district_df.rename(columns={'code': 'district_id'})

In [9]:
def process_loan(type_d):
    """Load ``loan_<type_d>``, rename ``amount`` to ``amount_loan`` and drop ``date``."""
    loans = read_file('loan_' + type_d)
    return loans.rename(columns={'amount': 'amount_loan'}).drop(columns='date')

In [10]:
def process_trans(type_d):
    """Load ``trans_<type_d>`` and simplify its categorical columns.

    Operation descriptions are replaced by single-letter codes, missing
    operations are filled from ``k_symbol``, the ``type`` label
    ``'withdrawal in cash'`` becomes ``'withdrawal'``, ``amount`` is renamed
    to ``amount_trans`` and the ``date``, ``k_symbol``, ``bank`` and
    ``account`` columns are dropped.
    """
    transactions = read_file('trans_' + type_d)

    # Short codes for each operation description.
    operation_codes = {
        'credit in cash': 'A',
        'collection from another bank': 'B',
        'withdrawal in cash': 'C',
        'remittance to another bank': 'D',
        'credit card withdrawal': 'E',
        'interest credited': 'F',
    }
    for description, code in operation_codes.items():
        transactions.loc[transactions['operation'] == description, 'operation'] = code

    # When the operation is null, k_symbol carries the information instead.
    missing_op = transactions['operation'].isna()
    transactions.loc[missing_op, 'operation'] = transactions.loc[missing_op, 'k_symbol']

    # Convert 'withdrawal in cash' to 'withdrawal' in type.
    is_cash_withdrawal = transactions['type'] == 'withdrawal in cash'
    transactions.loc[is_cash_withdrawal, 'type'] = 'withdrawal'

    return (
        transactions
        .rename(columns={'amount': 'amount_trans'})
        .drop(columns=['date', 'k_symbol', 'bank', 'account'])
    )

Merge data

In [11]:
def process_merge_data(type_d):
    """Build the merged table for the ``type_d`` split (e.g. 'train' or 'test').

    Loans are joined to dispositions on ``account_id``, then to clients on
    ``client_id``, then to accounts on ``account_id``. Columns present in
    both the client side and the account table get the ``_clt`` / ``_acc``
    suffixes.
    """
    account_df = process_account()
    card_df = process_card(type_d)
    client_df = process_client()
    disp_df = process_disposition()
    district_df = process_district()
    loan_df = process_loan(type_d)
    trans_df = process_trans(type_d)

    # NOTE: the district, transaction and card tables are loaded above but
    # are not joined yet (merging on 'district_id', 'account_id' and
    # 'disp_id' respectively is still disabled).
    return (
        loan_df
        .merge(disp_df, on='account_id')
        .merge(client_df, on='client_id')
        .merge(account_df, on='account_id', suffixes=('_clt', '_acc'))
    )

Get Train and Test Data

In [12]:
# Build both splits with the same pipeline.
train_data = process_merge_data('train')
test_data = process_merge_data('test')

# Preview the first rows of each split.
for header, frame in (("<Train Data>", train_data), ("\n<Test Data>", test_data)):
    print(header)
    print(frame.head())

<Train Data>
   loan_id  account_id  amount_loan  duration  payments  status  disp_id  client_id  district_id_clt  age_clt gender_clt  district_id_acc frequency  age_acc
0     5314        1787        96396        12      8033      -1     2166       2166               30       74          F               30    weekly       28
1     5316        1801       165960        36      4610       1     2181       2181               46       53          M               46   monthly       28
2     6863        9188       127080        60      2118       1    11006      11314               45       85          M               45   monthly       28
3     5325        1843       105804        36      2939       1     2235       2235               14       81          F               12   monthly       28
4     7240       11013       274740        60      4579       1    13231      13539               63       43          M                1    weekly       28

<Test Data>
   loan_id  account_id  amount_l

**TODO**: Put the files below inside a `model_data` folder

In [13]:
# Persist both splits as pickle files in the working directory.
for frame, target in ((train_data, 'train_data.pkl'), (test_data, 'test_data.pkl')):
    frame.to_pickle(target)