# Data Processing

1. Read the .csv files
2. Clean the tables
3. Merge the tables
4. Select features

In [1]:
import pandas as pd
import numpy as np
from datetime import date as d
from sklearn.preprocessing import LabelEncoder

# Show up to 100 columns and allow 1000-character-wide output when frames are displayed
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)
# Notebook-wide switch: when True the loan status is stored as 0/1 and the final
# frames are one-hot encoded and pickled; when False the status keeps the
# 'No'/'Yes' labels and the final frames are written as CSV files.
ENCODING = False

#### Utility functions

In [2]:
def read_file(file_name, date=False):
    """Load ``data/<file_name>.csv`` into a DataFrame.

    The file is read as semicolon-separated text; the strings 'NA' and '' are
    treated as missing values.  ``date`` is forwarded to ``parse_dates`` of
    ``pd.read_csv`` (``False`` disables date parsing).  Surrounding whitespace
    is stripped from every column name before the frame is returned.
    """
    csv_path = f'data/{file_name}.csv'
    frame = pd.read_csv(
        csv_path,
        sep=';',
        parse_dates=date,
        na_values=['NA', ''],
        low_memory=False,
    )
    return frame.rename(columns=str.strip)

In [3]:
# Convert dates in yymmdd format (20th century assumed) to a pandas Timestamp
def parse_date(date):
    """Parse a ``yymmdd`` value into a ``pd.Timestamp`` in the 1900s.

    Parameters
    ----------
    date : int, float or str
        Date written as ``yymmdd``.  Numeric inputs may have lost leading
        zeros (e.g. ``50102`` for 5 January... written ``050102``); these are
        restored before parsing.

    Returns
    -------
    pd.Timestamp
        The date with ``19`` prefixed to the two-digit year.

    Raises
    ------
    ValueError
        If the six digits do not form a valid ``%Y%m%d`` date.
    """
    # Keep the integer part only (floats render as '450204.0'), restore any
    # leading zeros dropped by a numeric dtype, and cap at six digits.
    digits = str(date).split('.')[0].zfill(6)[:6]
    # An explicit format avoids relying on pandas' format inference.
    return pd.to_datetime('19' + digits, format='%Y%m%d')

# Returns the age in whole years at cur_date (today when cur_date is omitted)
def calc_age(inital_date, cur_date=None):
    """Return the number of completed years between two dates.

    Parameters
    ----------
    inital_date : object with ``year``, ``month`` and ``day`` attributes
        Start date (e.g. a birth date).
    cur_date : object with ``year``, ``month`` and ``day`` attributes, optional
        Reference date.  Defaults to today's date, resolved at call time so a
        long-running kernel never uses a stale "today".

    Returns
    -------
    int
        Year difference, reduced by one when the anniversary has not yet been
        reached in ``cur_date``'s year.
    """
    if cur_date is None:
        cur_date = d.today()
    before_anniversary = (cur_date.month, cur_date.day) < (inital_date.month, inital_date.day)
    return cur_date.year - inital_date.year - int(before_anniversary)

def calc_months(initial_date, cur_date=None):
    """Return the number of calendar months between two dates.

    Only the year and month components are used; the day of the month is
    ignored.

    Parameters
    ----------
    initial_date : object with ``year`` and ``month`` attributes
        Start date.
    cur_date : object with ``year`` and ``month`` attributes, optional
        Reference date.  Defaults to today's date, resolved at call time so a
        long-running kernel never uses a stale "today".

    Returns
    -------
    int
        ``12 * year difference + month difference``.
    """
    if cur_date is None:
        cur_date = d.today()
    year_gap = cur_date.year - initial_date.year
    return year_gap * 12 + cur_date.month - initial_date.month

### Read files, clean data and select features

In [4]:
def process_account():
    """Load the account table and tidy it for merging.

    The ``date`` column (parsed as a date on load) is renamed to ``date_acc``
    and the verbose ``frequency`` labels are shortened.
    """
    accounts = read_file('account', ['date']).rename(columns={'date': 'date_acc'})

    # Long statement-issuance labels -> short codes
    short_labels = {
        'monthly issuance': 'monthly',
        'weekly issuance': 'weekly',
        'issuance after transaction': 'after-transaction',
    }
    for long_label, short_label in short_labels.items():
        accounts.loc[accounts['frequency'] == long_label, 'frequency'] = short_label

    return accounts

In [5]:
def process_card(type_d):
    """Load ``card_<type_d>`` and keep only the columns used downstream.

    ``type`` is renamed to ``type_card``.  The ``issued`` column is parsed as a
    date on load but is not used as a feature, so it is dropped.
    """
    cards = read_file('card_' + type_d, ['issued'])
    return cards.rename(columns={'type': 'type_card'}).drop(columns='issued')

In [6]:
def process_client():
    """Load the client table and decode ``birth_number``.

    A month field (digits 3-4 of the birth number) above 12 marks the client
    as female; in that case 5000 is subtracted to recover the plain ``yymmdd``
    value.  The function replaces ``birth_number`` with the parsed birth date
    and adds ``gender_clt`` ('F'/'M') and ``age_clt`` (age as of
    ``calc_age``'s default reference date).
    """
    clients = read_file('client')

    def decode(birth_number):
        # Returns (birth date, gender, age) for a single raw birth number
        is_female = int(str(birth_number)[2:4]) > 12
        if is_female:
            birth_number -= 5000
        born = parse_date(birth_number)
        return born, ('F' if is_female else 'M'), calc_age(born)

    decoded = [decode(raw) for raw in clients['birth_number']]

    clients['birth_number'] = [born for born, _, _ in decoded]
    clients['gender_clt'] = [gender for _, gender, _ in decoded]
    clients['age_clt'] = [age for _, _, age in decoded]

    return clients

In [7]:
def process_disposition():
    """Load the disposition table restricted to account owners.

    Returns
    -------
    pd.DataFrame
        The rows of ``disp`` whose ``type`` is 'OWNER', without the ``type``
        column (it is constant after the filter).  The original row index is
        kept.
    """
    disp_df = read_file('disp')

    # Only owners can ask for loans
    owners = disp_df.loc[disp_df['type'] == 'OWNER']

    # drop() returns a new frame, so the filtered slice is never modified in
    # place (which would raise SettingWithCopyWarning).
    return owners.drop(columns='type')

In [8]:
def process_district():
    """Load the district table and derive district-level features.

    '?' placeholders in the 1995 crime and unemployment columns are replaced
    by the median of the known values and both columns are converted to
    numeric.  Two ratios and two 1995->1996 growth figures are then computed,
    and the raw columns they were built from (plus ``name``) are dropped.
    """
    districts = read_file('district')
    districts.rename(columns={'code': 'district_id'}, inplace=True)

    crimes_95 = "no. of commited crimes '95"
    crimes_96 = "no. of commited crimes '96"
    jobless_95 = "unemploymant rate '95"
    jobless_96 = "unemploymant rate '96"
    entrepreneurs = 'no. of enterpreneurs per 1000 inhabitants'
    inhabitants = 'no. of inhabitants'

    # Replacing '?' by the median of the known values (crimes as int, rate as float)
    for column, cast in ((crimes_95, int), (jobless_95, float)):
        unknown = districts[column] == '?'
        districts.loc[unknown, column] = districts.loc[~unknown, column].astype(cast).median()

    # The imputed columns were read as text because of the '?' entries
    districts[jobless_95] = pd.to_numeric(districts[jobless_95])
    districts[crimes_95] = pd.to_numeric(districts[crimes_95])

    # Rescale "per 1000" and percentage columns to plain ratios
    districts['ratio enterpreneurs'] = districts[entrepreneurs] / 1000
    districts['ratio of urban inhabitants'] = districts['ratio of urban inhabitants'] / 100

    # Growth between 1995 and 1996 (crime change is taken per inhabitant)
    crime_delta = districts[crimes_96] - districts[crimes_95]
    districts['criminality_growth'] = crime_delta / districts[inhabitants]
    districts['unemploymant_growth'] = districts[jobless_96] - districts[jobless_95]

    return districts.drop(columns=[
        'name', entrepreneurs, inhabitants,
        jobless_96, crimes_96,
        jobless_95, crimes_95,
    ])

In [9]:
def process_loan(type_d):
    """Load ``loan_<type_d>`` and relabel the ``status`` column.

    A raw status of 1 becomes 'Yes' (or 1 when ``ENCODING`` is on) and a raw
    status of -1 becomes 'No' (or 0 when ``ENCODING`` is on); any other value
    is left untouched.  ``amount`` and ``date`` are renamed to ``amount_loan``
    and ``date_loan``.
    """
    # NOTE(review): the business meaning of the raw 1 / -1 codes is not visible
    # in this notebook - confirm against the data description.
    loans = read_file('loan_' + type_d, ['date'])

    label_for_one, label_for_minus_one = (1, 0) if ENCODING else ('Yes', 'No')

    was_one = loans['status'] == 1
    was_minus_one = loans['status'] == -1
    loans.loc[was_one, 'status'] = label_for_one
    loans.loc[was_minus_one, 'status'] = label_for_minus_one

    return loans.rename(columns={'amount': 'amount_loan', 'date': 'date_loan'})

In [10]:
def process_trans(type_d):
    """Load ``trans_<type_d>`` and normalise the transaction columns.

    Steps performed:

    * missing ``operation`` values are treated as 'interest credited';
    * operation labels are replaced by short codes (CC, CAB, WC, RAB, CCW, IC);
    * the ``type`` value 'withdrawal in cash' is folded into 'withdrawal';
    * the amount of every withdrawal is made negative;
    * ``type``, ``amount`` and ``date`` get a ``_trans`` suffix and the
      ``k_symbol``, ``bank`` and ``account`` columns are dropped.

    Returns
    -------
    pd.DataFrame
        One row per transaction with the columns described above.
    """
    trans_df = read_file('trans_' + type_d, ['date'])

    operation_codes = {
        'credit in cash': 'CC',
        'collection from another bank': 'CAB',
        'withdrawal in cash': 'WC',
        'remittance to another bank': 'RAB',
        'credit card withdrawal': 'CCW',
        'interest credited': 'IC',
    }
    # Assign the result back instead of calling fillna(inplace=True) on the
    # selected column: the chained in-place form does not update the frame
    # when pandas Copy-on-Write is active.
    trans_df['operation'] = (
        trans_df['operation']
        .fillna('interest credited')
        .replace(operation_codes)
    )

    # Convert 'withdrawal in cash' to 'withdrawal' in type
    trans_df['type'] = trans_df['type'].replace('withdrawal in cash', 'withdrawal')

    # Making withdrawals' amount negative
    trans_df.loc[trans_df['type'] == 'withdrawal', 'amount'] *= -1

    return (
        trans_df
        .rename(columns={'type': 'type_trans', 'amount': 'amount_trans', 'date': 'date_trans'})
        .drop(columns=['k_symbol', 'bank', 'account'])
    )

### Merge data

In [11]:
def process_merge_data(type_d):
    """Build the merged loan/transaction frame for one data split.

    Parameters
    ----------
    type_d : str
        Suffix of the split-specific files (the notebook uses 'train' and
        'test'); it selects ``loan_<type_d>`` and ``trans_<type_d>``.

    Returns
    -------
    pd.DataFrame
        Loans joined with the owner disposition, the client, the account, the
        client's district and every transaction of the account, so loan-level
        values repeat on each transaction row.  Identifier, date and
        municipality-count columns are removed; ``loan_id`` is kept.
    """
    account_df = process_account()
    client_df = process_client()
    disp_df = process_disposition()
    district_df = process_district()
    loan_df = process_loan(type_d)
    trans_df = process_trans(type_d)
    # The card table is intentionally not loaded: it is not merged into the
    # result.  To use it, load it with process_card(type_d) and merge on 'disp_id'.

    data = loan_df.merge(disp_df, on='account_id')
    data = data.merge(client_df, on='client_id')
    # Both client and account carry a district_id; the suffixes keep them apart
    data = data.merge(account_df, on='account_id', suffixes=('_clt', '_acc'))
    data = data.merge(district_df, left_on='district_id_clt', right_on='district_id')
    data = data.merge(trans_df, on='account_id')

    # Client age at loan request
    data['age_at_loan'] = data.apply(lambda row: calc_age(row['birth_number'], row['date_loan']), axis=1)
    # Account months at loan request
    data['months_acc_at_loan'] = data.apply(lambda row: calc_months(row['date_acc'], row['date_loan']), axis=1)

    # A list (not a set) keeps the dropped labels in a deterministic order
    unused_columns = [
        'account_id', 'disp_id', 'district_id_clt', 'district_id_acc', 'district_id', 'client_id', 'trans_id',
        'birth_number', 'age_clt', 'date_acc', 'date_loan', 'date_trans',
        'no. of municipalities with inhabitants < 499',
        'no. of municipalities with inhabitants 500-1999',
        'no. of municipalities with inhabitants 2000-9999',
        'no. of municipalities with inhabitants >10000',
        'no. of cities',
    ]
    return data.drop(columns=unused_columns)

In [12]:
# Build the merged (transaction-level) frames for both splits
train_data = process_merge_data('train')
test_data = process_merge_data('test')

# Summarise each split under its own header
for header, frame in (('<Train Data>', train_data), ('\n<Test Data>', test_data)):
    print(header)
    print(frame.info())

<Train Data>
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24494 entries, 0 to 24493
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_id                     24494 non-null  int64  
 1   amount_loan                 24494 non-null  int64  
 2   duration                    24494 non-null  int64  
 3   payments                    24494 non-null  int64  
 4   status                      24494 non-null  object 
 5   gender_clt                  24494 non-null  object 
 6   frequency                   24494 non-null  object 
 7   region                      24494 non-null  object 
 8   ratio of urban inhabitants  24494 non-null  float64
 9   average salary              24494 non-null  int64  
 10  ratio enterpreneurs         24494 non-null  float64
 11  criminality_growth          24494 non-null  float64
 12  unemploymant_growth         24494 non-null  float64
 13  type_trans        

In [13]:
# Display the merged frame: one row per transaction, so loan-level columns repeat
train_data

Unnamed: 0,loan_id,amount_loan,duration,payments,status,gender_clt,frequency,region,ratio of urban inhabitants,average salary,ratio enterpreneurs,criminality_growth,unemploymant_growth,type_trans,operation,amount_trans,balance,age_at_loan,months_acc_at_loan
0,5314,96396,12,8033,No,F,weekly,west Bohemia,0.818,9650,0.100,-0.001909,0.29,credit,CC,1100.0,1100.0,45,4
1,5314,96396,12,8033,No,F,weekly,west Bohemia,0.818,9650,0.100,-0.001909,0.29,credit,CC,9900.0,11000.0,45,4
2,5314,96396,12,8033,No,F,weekly,west Bohemia,0.818,9650,0.100,-0.001909,0.29,credit,CC,5800.0,16800.0,45,4
3,5314,96396,12,8033,No,F,weekly,west Bohemia,0.818,9650,0.100,-0.001909,0.29,credit,CC,3300.0,20100.0,45,4
4,5316,165960,36,4610,Yes,M,monthly,east Bohemia,0.735,8369,0.117,-0.002094,0.52,credit,CC,700.0,700.0,24,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24489,6015,110112,24,4588,Yes,M,monthly,south Moravia,0.509,8240,0.099,0.000491,1.03,credit,CC,40890.0,122647.9,33,11
24490,6015,110112,24,4588,Yes,M,monthly,south Moravia,0.509,8240,0.099,0.000491,1.03,credit,IC,299.9,102247.7,33,11
24491,6015,110112,24,4588,Yes,M,monthly,south Moravia,0.509,8240,0.099,0.000491,1.03,withdrawal,WC,-14.6,102233.1,33,11
24492,6015,110112,24,4588,Yes,M,monthly,south Moravia,0.509,8240,0.099,0.000491,1.03,withdrawal,WC,-28800.0,106858.1,33,11


### Data aggregation

In [14]:
def abs_min(x):
    """Smallest absolute value found in ``x``."""
    magnitudes = x.abs()
    return magnitudes.min()


def rangev(x):
    """Spread of ``x``: its maximum minus its minimum."""
    lowest, highest = x.min(), x.max()
    return highest - lowest

# Operations: number of transactions per operation code.
# The comparison result is summed with the vectorised Series.sum() rather than
# the builtin sum(), which would iterate over the values one by one in Python.
def count_CC(x):
    """Number of entries in ``x`` equal to 'CC'."""
    return (x == 'CC').sum()


def count_CAB(x):
    """Number of entries in ``x`` equal to 'CAB'."""
    return (x == 'CAB').sum()


def count_WC(x):
    """Number of entries in ``x`` equal to 'WC'."""
    return (x == 'WC').sum()


def count_RAB(x):
    """Number of entries in ``x`` equal to 'RAB'."""
    return (x == 'RAB').sum()


def count_CCW(x):
    """Number of entries in ``x`` equal to 'CCW'."""
    return (x == 'CCW').sum()


def count_IC(x):
    """Number of entries in ``x`` equal to 'IC'."""
    return (x == 'IC').sum()

def mean_CC(x):
    """Share of entries in ``x`` equal to 'CC'."""
    matches = x == 'CC'
    return np.mean(matches)


def mean_CAB(x):
    """Share of entries in ``x`` equal to 'CAB'."""
    matches = x == 'CAB'
    return np.mean(matches)


def mean_WC(x):
    """Share of entries in ``x`` equal to 'WC'."""
    matches = x == 'WC'
    return np.mean(matches)


def mean_RAB(x):
    """Share of entries in ``x`` equal to 'RAB'."""
    matches = x == 'RAB'
    return np.mean(matches)


def mean_CCW(x):
    """Share of entries in ``x`` equal to 'CCW'."""
    matches = x == 'CCW'
    return np.mean(matches)


def mean_IC(x):
    """Share of entries in ``x`` equal to 'IC'."""
    matches = x == 'IC'
    return np.mean(matches)

# Type Transaction: counts and shares of withdrawals / credits.
# Counts use the vectorised Series.sum() rather than the builtin sum(), which
# would iterate over the values one by one in Python.
def count_w(x):
    """Number of entries in ``x`` equal to 'withdrawal'."""
    return (x == 'withdrawal').sum()


def count_c(x):
    """Number of entries in ``x`` equal to 'credit'."""
    return (x == 'credit').sum()


def mean_w(x):
    """Share of entries in ``x`` equal to 'withdrawal'."""
    return np.mean(x == 'withdrawal')


def mean_c(x):
    """Share of entries in ``x`` equal to 'credit'."""
    return np.mean(x == 'credit')

In [15]:
def aggregate_data(df):
    """Collapse the transaction-level frame into one row per group.

    Every column other than ``type_trans``, ``operation``, ``amount_trans``
    and ``balance`` is used as a grouping key; those four columns are
    summarised instead.  The resulting two-level column index is flattened to
    ``<column>_<statistic>`` (grouping keys keep their plain name).
    """
    # Grouping keys = all columns except the ones being summarised
    group_keys = df.columns.to_list()
    for summarised in ('type_trans', 'operation', 'amount_trans', 'balance'):
        group_keys.remove(summarised)

    # Per-operation counts/shares (count_CC, mean_CC, ...) are available above
    # but only the plain transaction count is taken from 'operation'.
    # NOTE(review): 'last' depends on row order within each group - presumably
    # chronological; confirm against the source files.
    numeric_stats = ['mean', 'min', 'max', 'std', 'last', np.cov, abs_min, rangev]
    grouped = df.groupby(group_keys, as_index=False, group_keys=False)
    df = grouped.agg({
        'operation': ['count'],
        'type_trans': [count_w, count_c, mean_w, mean_c],
        'amount_trans': list(numeric_stats),
        'balance': list(numeric_stats),
    })

    # Flatten (column, statistic) pairs; key columns have an empty statistic
    df.columns = [f'{top}_{stat}' if stat else f'{top}' for top, stat in df.columns]

    return df

In [16]:
train_data = aggregate_data(train_data)

# 'status' is one of the grouping keys in aggregate_data, and groupby discards
# rows whose key is missing, so missing test-set statuses are filled first.
test_data['status'] = test_data['status'].fillna('')
test_data = aggregate_data(test_data)

# Summarise each aggregated split under its own header
for header, frame in (('<Train Data>', train_data), ('\n<Test Data>', test_data)):
    print(header)
    print(frame.info())

<Train Data>
<class 'pandas.core.frame.DataFrame'>
Int64Index: 328 entries, 0 to 327
Data columns (total 36 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_id                     328 non-null    int64  
 1   amount_loan                 328 non-null    int64  
 2   duration                    328 non-null    int64  
 3   payments                    328 non-null    int64  
 4   status                      328 non-null    object 
 5   gender_clt                  328 non-null    object 
 6   frequency                   328 non-null    object 
 7   region                      328 non-null    object 
 8   ratio of urban inhabitants  328 non-null    float64
 9   average salary              328 non-null    int64  
 10  ratio enterpreneurs         328 non-null    float64
 11  criminality_growth          328 non-null    float64
 12  unemploymant_growth         328 non-null    float64
 13  age_at_loan           

In [17]:
# Display the aggregated frame: one row per loan, transaction statistics as columns
train_data

Unnamed: 0,loan_id,amount_loan,duration,payments,status,gender_clt,frequency,region,ratio of urban inhabitants,average salary,ratio enterpreneurs,criminality_growth,unemploymant_growth,age_at_loan,months_acc_at_loan,operation_count,type_trans_count_w,type_trans_count_c,type_trans_mean_w,type_trans_mean_c,amount_trans_mean,amount_trans_min,amount_trans_max,amount_trans_std,amount_trans_last,amount_trans_cov,amount_trans_abs_min,amount_trans_rangev,balance_mean,balance_min,balance_max,balance_std,balance_last,balance_cov,balance_abs_min,balance_rangev
0,4959,80952,24,3373,Yes,M,monthly,Prague,1.000,12541,0.167,0.011146,0.14,48,11,54,32,22,0.592593,0.407407,515.568519,-22400.0,30354.0,11895.998465,138.3,1.415148e+08,13.5,52754.0,32590.624074,1100.0,67529.6,12061.705682,27855.2,1.454847e+08,1100.0,66429.6
1,4961,30276,12,2523,No,F,monthly,south Bohemia,0.670,9104,0.123,0.000532,0.56,57,12,80,34,46,0.425000,0.575000,198.180000,-18200.0,22708.0,9205.689080,15139.0,8.474471e+07,14.6,40908.0,25197.092500,715.0,58157.5,15039.248405,15854.0,2.261790e+08,715.0,57442.5
2,4973,165960,24,6915,Yes,F,monthly,south Bohemia,0.569,8427,0.107,0.000415,0.42,51,19,125,88,37,0.704000,0.296000,189.515200,-41400.0,62235.0,20882.029393,114.1,4.360592e+08,14.6,103635.0,52523.244800,700.0,107069.6,20955.646998,23703.8,4.391391e+08,700.0,106369.6
3,4996,88440,12,7370,Yes,F,after-transaction,north Bohemia,0.853,9317,0.097,-0.000601,0.58,51,6,31,15,16,0.483871,0.516129,2545.412903,-35100.0,47976.0,21020.897593,282.6,4.418781e+08,100.0,83076.0,62778.090323,200.0,103239.0,21638.258870,79007.6,4.682142e+08,200.0,103039.0
4,5002,104808,12,8734,Yes,M,monthly,south Moravia,0.483,8512,0.102,-0.001144,0.61,54,6,30,18,12,0.600000,0.400000,933.846667,-16400.0,25970.0,10052.730130,-3900.0,1.010574e+08,14.6,42370.0,38709.830000,500.0,57865.3,11517.175248,28015.4,1.326453e+08,500.0,57365.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,7271,392460,60,6541,Yes,F,monthly,central Bohemia,0.580,8754,0.137,0.000593,0.48,15,23,143,97,46,0.678322,0.321678,372.344056,-35300.0,58870.0,19837.442360,216.2,3.935241e+08,14.6,94170.0,57131.740559,900.0,115913.4,20293.363480,53259.7,4.118206e+08,900.0,115013.4
324,7284,52788,12,4399,Yes,M,monthly,south Bohemia,0.519,9045,0.124,0.000578,0.47,20,8,43,22,21,0.511628,0.488372,489.046512,-17920.0,11427.0,5366.660291,1900.0,2.880104e+07,14.6,29347.0,22198.179070,1000.0,41469.1,6652.642956,21029.0,4.425766e+07,1000.0,40469.1
325,7304,419880,60,6998,Yes,F,weekly,Prague,1.000,12541,0.167,0.011146,0.14,49,5,18,7,11,0.388889,0.611111,1372.472222,-64800.0,62982.0,33871.421667,300.0,1.147273e+09,39.0,127782.0,59352.833333,200.0,104039.9,27879.396857,24704.4,7.772608e+08,200.0,103839.9
326,7305,54024,12,4502,Yes,M,monthly,south Moravia,0.538,8814,0.107,-0.000450,0.98,28,22,147,102,45,0.693878,0.306122,174.711565,-30100.0,40521.0,13999.218037,109.6,1.959781e+08,14.6,70621.0,36480.185034,1000.0,81705.8,15469.988113,25697.2,2.393205e+08,1000.0,80705.8


### Some more feature engineering

In [18]:
def feature_engineering(df):
    """Add derived loan-level features to ``df`` in place and return it.

    New columns:

    * ``balance_negative`` - 1 when ``balance_min`` is below zero, else 0;
    * ``months_until_bankrupt`` - ``balance_last / payments`` rounded down;
    * ``ops_per_month`` - ``operation_count / months_acc_at_loan``.

    ``operation_count``, ``type_trans_count_w`` and ``type_trans_mean_w`` are
    dropped afterwards as redundant.
    """
    # Flag accounts whose balance ever went below zero
    df['balance_negative'] = (df['balance_min'] < 0).astype(int)

    # Credit/withdrawal ratios are not recomputed here: they would duplicate
    # the type_trans_mean_* columns produced by the aggregation.

    # Whole payments that the last known balance could cover
    payments_covered = np.floor(df['balance_last'] / df['payments'])
    df['months_until_bankrupt'] = payments_covered.astype(int)

    # Transactions per month of account age at loan time
    df['ops_per_month'] = df['operation_count'] / df['months_acc_at_loan']

    redundant = ['operation_count', 'type_trans_count_w', 'type_trans_mean_w']
    df.drop(columns=redundant, inplace=True)

    return df

In [19]:
# Add the derived features to both splits
train_data = feature_engineering(train_data)
test_data = feature_engineering(test_data)

# Summarise each split under its own header
for header, frame in (('<Train Data>', train_data), ('\n<Test Data>', test_data)):
    print(header)
    print(frame.info())

<Train Data>
<class 'pandas.core.frame.DataFrame'>
Int64Index: 328 entries, 0 to 327
Data columns (total 36 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   loan_id                     328 non-null    int64  
 1   amount_loan                 328 non-null    int64  
 2   duration                    328 non-null    int64  
 3   payments                    328 non-null    int64  
 4   status                      328 non-null    object 
 5   gender_clt                  328 non-null    object 
 6   frequency                   328 non-null    object 
 7   region                      328 non-null    object 
 8   ratio of urban inhabitants  328 non-null    float64
 9   average salary              328 non-null    int64  
 10  ratio enterpreneurs         328 non-null    float64
 11  criminality_growth          328 non-null    float64
 12  unemploymant_growth         328 non-null    float64
 13  age_at_loan           

In [20]:
# Display the frame after feature engineering (balance_negative, months_until_bankrupt, ops_per_month added)
train_data

Unnamed: 0,loan_id,amount_loan,duration,payments,status,gender_clt,frequency,region,ratio of urban inhabitants,average salary,ratio enterpreneurs,criminality_growth,unemploymant_growth,age_at_loan,months_acc_at_loan,type_trans_count_c,type_trans_mean_c,amount_trans_mean,amount_trans_min,amount_trans_max,amount_trans_std,amount_trans_last,amount_trans_cov,amount_trans_abs_min,amount_trans_rangev,balance_mean,balance_min,balance_max,balance_std,balance_last,balance_cov,balance_abs_min,balance_rangev,balance_negative,months_until_bankrupt,ops_per_month
0,4959,80952,24,3373,Yes,M,monthly,Prague,1.000,12541,0.167,0.011146,0.14,48,11,22,0.407407,515.568519,-22400.0,30354.0,11895.998465,138.3,1.415148e+08,13.5,52754.0,32590.624074,1100.0,67529.6,12061.705682,27855.2,1.454847e+08,1100.0,66429.6,0,8,4.909091
1,4961,30276,12,2523,No,F,monthly,south Bohemia,0.670,9104,0.123,0.000532,0.56,57,12,46,0.575000,198.180000,-18200.0,22708.0,9205.689080,15139.0,8.474471e+07,14.6,40908.0,25197.092500,715.0,58157.5,15039.248405,15854.0,2.261790e+08,715.0,57442.5,0,6,6.666667
2,4973,165960,24,6915,Yes,F,monthly,south Bohemia,0.569,8427,0.107,0.000415,0.42,51,19,37,0.296000,189.515200,-41400.0,62235.0,20882.029393,114.1,4.360592e+08,14.6,103635.0,52523.244800,700.0,107069.6,20955.646998,23703.8,4.391391e+08,700.0,106369.6,0,3,6.578947
3,4996,88440,12,7370,Yes,F,after-transaction,north Bohemia,0.853,9317,0.097,-0.000601,0.58,51,6,16,0.516129,2545.412903,-35100.0,47976.0,21020.897593,282.6,4.418781e+08,100.0,83076.0,62778.090323,200.0,103239.0,21638.258870,79007.6,4.682142e+08,200.0,103039.0,0,10,5.166667
4,5002,104808,12,8734,Yes,M,monthly,south Moravia,0.483,8512,0.102,-0.001144,0.61,54,6,12,0.400000,933.846667,-16400.0,25970.0,10052.730130,-3900.0,1.010574e+08,14.6,42370.0,38709.830000,500.0,57865.3,11517.175248,28015.4,1.326453e+08,500.0,57365.3,0,3,5.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,7271,392460,60,6541,Yes,F,monthly,central Bohemia,0.580,8754,0.137,0.000593,0.48,15,23,46,0.321678,372.344056,-35300.0,58870.0,19837.442360,216.2,3.935241e+08,14.6,94170.0,57131.740559,900.0,115913.4,20293.363480,53259.7,4.118206e+08,900.0,115013.4,0,8,6.217391
324,7284,52788,12,4399,Yes,M,monthly,south Bohemia,0.519,9045,0.124,0.000578,0.47,20,8,21,0.488372,489.046512,-17920.0,11427.0,5366.660291,1900.0,2.880104e+07,14.6,29347.0,22198.179070,1000.0,41469.1,6652.642956,21029.0,4.425766e+07,1000.0,40469.1,0,4,5.375000
325,7304,419880,60,6998,Yes,F,weekly,Prague,1.000,12541,0.167,0.011146,0.14,49,5,11,0.611111,1372.472222,-64800.0,62982.0,33871.421667,300.0,1.147273e+09,39.0,127782.0,59352.833333,200.0,104039.9,27879.396857,24704.4,7.772608e+08,200.0,103839.9,0,3,3.600000
326,7305,54024,12,4502,Yes,M,monthly,south Moravia,0.538,8814,0.107,-0.000450,0.98,28,22,45,0.306122,174.711565,-30100.0,40521.0,13999.218037,109.6,1.959781e+08,14.6,70621.0,36480.185034,1000.0,81705.8,15469.988113,25697.2,2.393205e+08,1000.0,80705.8,0,5,6.681818


In [21]:
# Save the full engineered training set; this runs before the highly correlated
# columns are dropped further down, so every engineered column is still present.
train_data.to_csv('train_data_all.csv', index=False)

In [22]:
def encode_label(data, cols):
    """Return a copy of ``data`` with the columns ``cols`` label-encoded.

    Each listed column is fitted and transformed on its own, so the integer
    codes of one column are independent of the others.  ``data`` itself is
    left unchanged.
    """
    encoded = data.copy()
    encoder = LabelEncoder()
    # fit_transform refits the encoder on every column it is applied to
    encoded[cols] = encoded[cols].apply(lambda column: encoder.fit_transform(column))

    return encoded

### Cleaning correlated attributes

Drop every column whose absolute correlation with an earlier column is greater than 0.95.

In [23]:
# Label-encode the categorical columns so they take part in the correlation matrix
cols_encode = ['status', 'gender_clt', 'frequency', 'region']
train_data_labeled = encode_label(train_data, cols_encode)

cor_matrix = train_data_labeled.corr().abs()

# Keep only the entries strictly above the diagonal so each pair of columns is
# examined once and a column is only compared with the columns before it.
above_diagonal = np.triu(np.ones(cor_matrix.shape), k=1).astype(bool)
upper_tri = cor_matrix.where(above_diagonal)

to_drop = [column for column in upper_tri.columns if (upper_tri[column] > 0.95).any()]

# The columns are chosen on the training set and removed from both splits
train_data = train_data.drop(columns=to_drop)
test_data = test_data.drop(columns=to_drop)

print("Dropped cols: ", to_drop)

Dropped cols:  ['amount_trans_cov', 'amount_trans_rangev', 'balance_cov', 'balance_rangev']


### Saving files

In [24]:
def encode(data):
    """One-hot encode the categorical columns of ``data``.

    ``gender_clt`` is first replaced in ``data`` by integer labels (this
    modifies the frame passed in); ``gender_clt``, ``frequency`` and ``region``
    are then expanded into boolean indicator columns.  The expanded frame is
    returned.
    """
    data['gender_clt'] = LabelEncoder().fit_transform(data['gender_clt'])

    categorical = ['gender_clt', 'frequency', 'region']
    return pd.get_dummies(data, columns=categorical, dtype=bool)
    

**TODO**: Put the files written below inside a `model_data` folder

In [25]:
# Persist the final frames: CSV for the labelled variant, pickle for the encoded one
if not ENCODING:
    train_data.to_csv('train_data.csv', index=False)
    test_data.to_csv('test_data.csv', index=False)
else:
    train_data = encode(train_data)
    test_data = encode(test_data)
    print('Encoded')

    train_data.to_pickle('train_data.pkl')
    test_data.to_pickle('test_data.pkl')