# Data processing

## Load data

In [192]:
from utils import *

import matplotlib.pyplot as plt
import seaborn as sb
from copy import deepcopy

%matplotlib inline

account_df = read_to_df("account.csv")
card_test_df = read_to_df("card_test.csv")
card_train_df = read_to_df("card_train.csv")
client_df = read_to_df("client.csv")
disp_df = read_to_df("disp.csv")
district_df = read_to_df("district.csv")
loan_test_df = read_to_df("loan_test.csv")
loan_train_df = read_to_df("loan_train.csv")
trans_test_df = read_to_df("trans_test.csv")
trans_train_df = read_to_df("trans_train.csv")

  trans_train_df = read_to_df("trans_train.csv")


## Process data

### Process account data

In [193]:
avg_transactions_per_week = 3
avg_weeks_per_month = (365.25 / 7 / 12)

account_df['frequency'] = account_df['frequency'].apply(lambda x: 1 if x == 'monthly issuance' else avg_weeks_per_month if x == 'weekly issuance' else (365.25 / 7 / 12) * avg_transactions_per_week)
account_df.head()

Unnamed: 0,account_id,district_id,frequency,date
0,576,55,1.0,930101
1,3818,74,1.0,930101
2,704,55,1.0,930101
3,2378,16,1.0,930101
4,2632,24,1.0,930102


### Process client data

In [194]:
client_df["sex"] = client_df["birth_number"].apply(lambda x: 0 if int(str(x)[2:4]) > 50 else 1)
client_df["age"] = client_df["birth_number"].apply(lambda x: calculate_age(read_date(x)))

client_df.drop("birth_number", inplace=True, axis=1)

client_district_df = client_df.join(district_df)
client_district_df.head()

Unnamed: 0,client_id,district_id,sex,age,code,name,region,no. of inhabitants,no. of municipalities with inhabitants < 499,no. of municipalities with inhabitants 500-1999,no. of municipalities with inhabitants 2000-9999,no. of municipalities with inhabitants >10000,no. of cities,ratio of urban inhabitants,average salary,unemploymant rate '95,unemploymant rate '96,no. of enterpreneurs per 1000 inhabitants,no. of commited crimes '95,no. of commited crimes '96
0,1,18,0,28,1.0,Hl.m. Praha,Prague,1204953.0,0.0,0.0,0.0,1.0,1.0,100.0,12541.0,0.29,0.43,167.0,85677,99107.0
1,2,1,1,53,2.0,Benesov,central Bohemia,88884.0,80.0,26.0,6.0,2.0,5.0,46.7,8507.0,1.67,1.85,132.0,2159,2674.0
2,3,1,0,58,3.0,Beroun,central Bohemia,75232.0,55.0,26.0,4.0,1.0,5.0,41.7,8980.0,1.95,2.21,111.0,2824,2813.0
3,4,5,1,42,4.0,Kladno,central Bohemia,149893.0,63.0,29.0,6.0,2.0,6.0,67.4,9753.0,4.64,5.05,109.0,5244,5892.0
4,5,5,0,38,5.0,Kolin,central Bohemia,95616.0,65.0,30.0,4.0,1.0,6.0,51.4,9307.0,3.85,4.43,118.0,2616,3040.0


### Process disposition data

In [195]:
disp_df.rename(columns={"type": "is_owner"}, inplace=True)
disp_df["is_owner"].replace({"OWNER": True, "DISPONENT": False}, inplace=True)

# Count number clients per account
client_count_df = disp_df.groupby("account_id", as_index=False, group_keys=False).agg(client_count=("is_owner", "count"))

disp_df = disp_df.merge(client_count_df, on="account_id")
disp_df = disp_df[disp_df["is_owner"] == True] 
disp_df.drop("is_owner", axis=1, inplace=True)

disp_df.head()

Unnamed: 0,disp_id,client_id,account_id,client_count
0,1,1,1,1
1,2,2,2,2
3,4,4,3,2
5,6,6,4,1
6,7,7,5,1


### Process transaction data

In [196]:
dataframes = [trans_train_df, trans_test_df]

for i in range(len(dataframes)):
    dataframes[i]["operation"].replace(
        {
            "credit in cash": 1,
            "collection from another bank": 2,
            "withdrawal in cash": 3,
            "remittance to another bank": 4,
            "credit card withdrawal": 5,
            "interest credited": 6
        },
        inplace=True
    )

    # Convert "withdrawal in cash" to "withdrawal" in type column
    dataframes[i].loc[dataframes[i]["type"] == "withdrawal in cash", "type"] = "withdrawal"

    # Withdrawal amounts should be negative
    dataframes[i].loc[dataframes[i]["type"] == "withdrawal", "amount"] *= -1

    dataframes[i].drop(["k_symbol", "bank", "account"], axis=1, inplace=True)

trans_train_df, trans_test_df = dataframes

trans_train_df.head()

Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance
0,1548749,5270,930113,credit,1.0,800.0,800.0
1,1548750,5270,930114,credit,2.0,44749.0,45549.0
2,3393738,11265,930114,credit,1.0,1000.0,1000.0
3,3122924,10364,930117,credit,1.0,1100.0,1100.0
4,1121963,3834,930119,credit,1.0,700.0,700.0


### Process card data

In [197]:
dataframes = [card_train_df, card_test_df]

for i in range(len(dataframes)):
    dataframes[i]["type"].replace({"classic": 1, "junior": 2, "gold": 3}, inplace=True)

card_train_df, card_test_df = dataframes

card_train_df.head()

Unnamed: 0,card_id,disp_id,type,issued
0,1005,9285,1,931107
1,104,588,1,940119
2,747,4915,1,940205
3,70,439,1,940208
4,577,3687,1,940215


### Process loan data

In [198]:
dataframes = [loan_train_df, loan_test_df]

for i in range(len(dataframes)):
    # Merge loans with disposition
    dataframes[i] = disp_df.merge(dataframes[i], on="account_id")

    # # Merge loans with cards
    # dataframes[i] = dataframes[i].merge(right=card_train_df, on="disp_id", how="left", suffixes=["_disp", "_card"])

    # # Merge loans with accounts
    # dataframes[i] = dataframes[i].merge(right=account_df, on="account_id", how="left")

loan_train_df, loan_test_df = dataframes

loan_train_df.head()

Unnamed: 0,disp_id,client_id,account_id,client_count,loan_id,date,amount,duration,payments,status
0,2,2,2,2,4959,940105,80952,24,3373,1
1,25,25,19,1,4961,960429,30276,12,2523,-1
2,78,78,67,1,4973,960502,165960,24,6915,1
3,158,158,132,2,4996,961106,88440,12,7370,1
4,210,210,173,2,5002,940531,104808,12,8734,1


## Export preprocessed dataframes

In [199]:
loan_train_df.to_pickle("../out/train.pkl")
loan_test_df.to_pickle("../out/test.pkl")