# Data processing

## Load data

In [None]:
from utils import *

import category_encoders as ce
import matplotlib.pyplot as plt
import seaborn as sb
from copy import deepcopy

%matplotlib inline

# Raw CSV tables, listed in the same order as the names they are bound to below.
csv_files = (
    "account.csv",
    "card_test.csv",
    "card_train.csv",
    "client.csv",
    "disp.csv",
    "district.csv",
    "loan_test.csv",
    "loan_train.csv",
    "trans_test.csv",
    "trans_train.csv",
)

# Read every table through the project helper and bind one DataFrame per file.
(
    account_df,
    card_test_df,
    card_train_df,
    client_df,
    disp_df,
    district_df,
    loan_test_df,
    loan_train_df,
    trans_test_df,
    trans_train_df,
) = map(read_to_df, csv_files)

## Process data

### Process account data

In [None]:
# Constants used to express every issuance frequency as "statements per month".
avg_transactions_per_week = 3
avg_weeks_per_month = 365.25 / 7 / 12


def _issuances_per_month(frequency):
    """Map a textual issuance frequency to a number of issuances per month."""
    if frequency == 'monthly issuance':
        return 1
    if frequency == 'weekly issuance':
        return avg_weeks_per_month
    # Every other value falls through to the per-transaction estimate
    # (presumably "issuance after transaction" -- TODO confirm against the data).
    return (365.25 / 7 / 12) * avg_transactions_per_week


account_df['frequency'] = account_df['frequency'].apply(_issuances_per_month)
account_df["date"] = account_df["date"].apply(read_date)
account_df = account_df.rename(
    columns={
        "date": "creation_date",
        "frequency": "issuance_frequency_per_month",
    }
)

account_df.head()

### Process client data

In [None]:
def _sex_from_birth_number(birth_number):
    """Return 0 when characters 2-3 of the birth number exceed 50, otherwise 1."""
    # NOTE(review): presumably the month field is offset by 50 for women -- confirm.
    month_field = int(str(birth_number)[2:4])
    if month_field > 50:
        return 0
    return 1


def _age_from_birth_number(birth_number):
    """Parse the birth number as a date and turn it into an age via the helpers."""
    return calculate_age(read_date(birth_number))


client_df["sex"] = client_df["birth_number"].apply(_sex_from_birth_number)
client_df["age"] = client_df["birth_number"].apply(_age_from_birth_number)

# The raw birth number is fully represented by the two derived columns.
client_df = client_df.drop(columns="birth_number")

client_df.head()

### Process disposition data

In [None]:
disp_df = disp_df.rename(columns={"type": "is_owner"})

# Assign the result back instead of calling `replace(..., inplace=True)` on the
# selected column: a chained in-place call only modifies a temporary object
# under pandas Copy-on-Write, which would leave the strings untouched and make
# the `== True` filter below return an empty frame.
disp_df["is_owner"] = disp_df["is_owner"].replace({"OWNER": True, "DISPONENT": False})

# Count the number of clients (owners and disponents) attached to each account.
client_count_df = (
    disp_df
    .groupby("account_id", as_index=False, group_keys=False)
    .agg(client_count=("is_owner", "count"))
)

disp_df = disp_df.merge(client_count_df, on="account_id")

# Keep only the owner row of every account; the flag is then redundant.
disp_df = disp_df[disp_df["is_owner"] == True].drop(columns="is_owner")

disp_df.head()

### Process transaction data

In [None]:
dataframes = [trans_train_df, trans_test_df]

# Integer code assigned to each textual transaction operation.
operation_codes = {
    "credit in cash": 1,
    "collection from another bank": 2,
    "withdrawal in cash": 3,
    "remittance to another bank": 4,
    "credit card withdrawal": 5,
    "interest credited": 6
}

for trans_df in dataframes:
    # Assign back rather than `replace(..., inplace=True)` on the selected
    # column: the chained in-place form does not update the frame under pandas
    # Copy-on-Write and emits a FutureWarning in pandas 2.2.
    trans_df["operation"] = trans_df["operation"].replace(operation_codes)

    # Convert "withdrawal in cash" to "withdrawal" in type column
    trans_df.loc[trans_df["type"] == "withdrawal in cash", "type"] = "withdrawal"

    # Withdrawal amounts should be negative
    trans_df.loc[trans_df["type"] == "withdrawal", "amount"] *= -1

    trans_df["date"] = trans_df["date"].apply(read_date)

    # Frame-level in-place calls are kept so the objects bound to
    # trans_train_df / trans_test_df are the ones being updated.
    trans_df.rename(columns={"date": "transaction_date"}, inplace=True)
    trans_df.drop(["k_symbol", "bank", "account"], axis=1, inplace=True)

trans_train_df, trans_test_df = dataframes

trans_train_df.head()

### Process card data

In [None]:
dataframes = [card_train_df, card_test_df]

# Integer code assigned to each card type.
card_type_codes = {"classic": 1, "junior": 2, "gold": 3}

for card_df in dataframes:
    # Assign back instead of the chained `card_df["type"].replace(..., inplace=True)`,
    # which does not modify the frame under pandas Copy-on-Write.
    card_df["type"] = card_df["type"].replace(card_type_codes)
    card_df["issued"] = card_df["issued"].apply(read_date)

card_train_df, card_test_df = dataframes

card_train_df.head()

### Process demographic data

In [None]:

# "?" is treated as a missing-value marker. `np.nan` is used because the
# `np.NaN` alias was removed in NumPy 2.0.
district_df.replace("?", np.nan, inplace=True)

# Convert to numeric *before* imputing: taking the median of a string-valued
# column raises TypeError in pandas 2.x. The imputed column is assigned back
# because `fillna(..., inplace=True)` on a selected column does not update the
# frame under pandas Copy-on-Write.
for column in ("unemploymant rate '95", "no. of commited crimes '95"):
    numeric_values = pd.to_numeric(district_df[column])
    district_df[column] = numeric_values.fillna(numeric_values.median())

# Change in committed crimes between the two years, relative to the population.
district_df["criminality_growth"] = (
    district_df["no. of commited crimes '96"] - district_df["no. of commited crimes '95"]
) / district_df["no. of inhabitants"]

# Change in the unemployment rate between the two years.
district_df["unemployment_growth"] = (
    district_df["unemploymant rate '96"] - district_df["unemploymant rate '95"]
)

# Entrepreneurs per inhabitant instead of per 1000 inhabitants.
district_df["ratio_entrepeneurs"] = district_df["no. of enterpreneurs per 1000 inhabitants"] / 1000

# The raw columns are replaced by the derived features above.
district_df.drop([
    "unemploymant rate '95",
    "unemploymant rate '96",
    "no. of commited crimes '95",
    "no. of commited crimes '96",
    "no. of enterpreneurs per 1000 inhabitants"
], axis=1, inplace=True)

### Process loan data

In [None]:
loan_dfs = [loan_train_df, loan_test_df]

# Column names that would be ambiguous once loans are merged with other tables.
loan_column_names = {"date": "loan_date", "amount": "loan_amount"}

for loan_df in loan_dfs:
    # Parse the raw date values with the project helper.
    loan_df["date"] = loan_df["date"].apply(read_date)
    # In place, so the frames bound to loan_train_df / loan_test_df are updated.
    loan_df.rename(columns=loan_column_names, inplace=True)

## Generate features from transaction data

In [None]:
# transactions = (trans_train_df, trans_test_df)
# account_features = [1, 2]

# for i in range(len(transactions)):
#     # Sorting transactions by date to figure out the most recent balance
#     account_features[i] = transactions[i].sort_values(by="date", axis=0, ascending=False)
#     account_features[i].drop_duplicates(subset='account_id', keep='first', inplace=True)

#     account_features[i].drop(account_features[i].columns.difference(['account_id', 'balance']), axis=1, inplace=True)
#     account_features[i].rename(columns={'balance': 'final_amount'}, inplace=True)

# account_features[0]

## Merge data

In [None]:
loan_dfs = [loan_train_df, loan_test_df]
trans_dfs = (trans_train_df, trans_test_df)
cards_dfs = (card_train_df, card_test_df)


def _merge_loan_tables(loans, cards, transactions):
    """Join one loan table with every related table.

    The shared tables (dispositions, accounts, clients, districts) are read
    from the notebook namespace; `cards` and `transactions` are the split
    specific tables. Dispositions and cards use a left join, every other merge
    uses the pandas default join type.
    """
    return (
        loans
        # Dispositions
        .merge(disp_df, on="account_id", how="left")
        # Accounts
        .merge(account_df, on="account_id")
        # Clients (both tables carry a district_id, hence the suffixes)
        .merge(client_df, on="client_id", suffixes=["_account", "_client"])
        # Districts, matched on the client's district
        .merge(district_df, left_on="district_id_client", right_on="code")
        # Cards
        .merge(cards, on="disp_id", how="left")
        # Transactions (overlapping column names get the suffixes)
        .merge(transactions, on="account_id", suffixes=["_card", "_transaction"])
    )


loan_dfs = [
    _merge_loan_tables(loans, cards, transactions)
    for loans, cards, transactions in zip(loan_dfs, cards_dfs, trans_dfs)
]

loan_train_df, loan_test_df = loan_dfs

## Encode district region (weight of evidence)

In [None]:
columns = ["region"]

# The encoder is fitted against a boolean target: True where status equals 1.
is_positive_status = loan_train_df["status"] == 1

woe_encoder = ce.WOEEncoder(cols=columns)
woe_encoded_train = woe_encoder.fit_transform(loan_train_df[columns], is_positive_status).add_suffix('_woe')
loan_train_df = loan_train_df.join(woe_encoded_train)

# Apply the encoder fitted on the training data to the test data as well.
# Without this the test frame has no `region_woe` column, so the two frames
# end up with different feature sets and later column drops can fail on it.
woe_encoded_test = woe_encoder.transform(loan_test_df[columns]).add_suffix('_woe')
loan_test_df = loan_test_df.join(woe_encoded_test)

# Store the target as 1 / -1, as the original cell did after encoding.
loan_train_df["status"] = is_positive_status.map({True: 1, False: -1})

## Dropping features

- All card columns can be dropped: only 11 of the 328 loans have a card, so the missing values cannot be filled in reliably.
- IDs are no longer needed.

In [None]:
loan_dfs = [loan_train_df, loan_test_df]

# Columns removed from both frames, grouped by origin and dropped in this order.
dropped_column_groups = (
    ["card_id", "type_card", "issued"],              # card data
    ["disp_id", "account_id", "client_id"],          # identifiers
    ["district_id_account", "district_id_client"],   # district identifiers
    ["trans_id"],                                    # transaction identifier
)

for loan_df in loan_dfs:
    for column_group in dropped_column_groups:
        # In place, so loan_train_df / loan_test_df themselves are updated.
        loan_df.drop(columns=column_group, inplace=True)

loan_train_df

## Aggregate data 

In [None]:
# Pair up the train and test frames; the aggregation loop in this cell replaces
# each list element with its aggregated counterpart.
loan_dfs = [loan_train_df, loan_test_df]

def count_withdrawal(x):
    """Count the entries of `x` that are equal to the string "withdrawal"."""
    is_withdrawal = x == "withdrawal"
    return int(is_withdrawal.sum())

def count_credit(x):
    """Count the entries of `x` that are equal to the string "credit"."""
    is_credit = x == "credit"
    return int(is_credit.sum())

# Collapse the per-transaction rows of each frame into one row per group and
# derive summary features from the transaction columns.
for i in range(len(loan_dfs)):
    # Columns that get summarised; every other column is used as a grouping key.
    aggregated_columns = ("transaction_date", "operation", "amount", "balance", "type_transaction", "client_count")
    columns = [x for x in loan_dfs[i].columns.to_list() if x not in aggregated_columns]

    # dropna=False keeps groups whose key columns contain missing values.
    df = loan_dfs[i].groupby(columns, as_index=False, group_keys=False, dropna=False)

    # Despite its name this is a boolean flag per group: True when the balance
    # was below the `payments` value in more than two transactions.
    num_times_under_zero = df.apply(lambda x: pd.Series(dict(
        num_times_under_zero = (x.balance < x.payments).sum() > 2
    )))["num_times_under_zero"]

    # Per-group statistics; the result has two-level (column, statistic) labels.
    df = df.agg({
        "balance": ["mean", "min", "max"],
        "transaction_date": ["max"],
        "client_count": ["mean"],
        "operation": ["count"],
        "amount": ["mean", "min", "max", "std"],
        "type_transaction": [count_withdrawal, count_credit]
    })

    # NOTE(review): this relies on the apply() result and the agg() result
    # sharing the same index and group order -- confirm for the pandas version in use.
    df["num_times_under_zero"] = num_times_under_zero

    # Flatten the two-level labels to "<column>_<statistic>"; labels whose
    # second level is empty keep their plain name.
    df.columns = ['%s%s' % (a, '_%s' % b if b else '') for a, b in df.columns]

    # Account age at time of loan in days
    df["account_age"] = (df['loan_date'] - df['creation_date']).dt.days

    # Number of days since last transaction
    df["days_since_last_transaction"] = (df["loan_date"] - df["transaction_date_max"]).dt.days

    # Whether an account has reached a negative balance
    # df["reached_negative_balance"] = df["balance_min"] < 0

    # Drop non-numeric columns
    loan_dfs[i] = df.select_dtypes(["number", "bool"])

    # loan_dfs[i].drop(["balance_min", "balance_max"], axis=1, inplace=True)

    # Remove and re-add the target so that it becomes the last column.
    status = loan_dfs[i].pop("status")
    loan_dfs[i]["status"] = status

# Rebind the notebook-level names to the aggregated frames.
loan_train_df, loan_test_df = loan_dfs

# Allow up to 500 columns to be shown when a frame is displayed.
pd.set_option('display.max_columns', 500)

loan_train_df


## Visualization

### Age

In [None]:
# Use an explicit Axes so the labels are applied to the figure that is drawn.
fig, ax = plt.subplots()

sb.histplot(data=loan_train_df, x="age", hue="status", bins=20, ax=ax)

# Set the title and labels after plotting. The original cell set the title
# with plt.title() and then erased it again through `.set(title="")`, so the
# rendered figure had no title.
ax.set(
    title="Age distribution by loan request",
    xlabel="Age",
    ylabel="Number of loans",
);

### Loan amount

In [None]:
# Histogram of the loan amounts in 30 bins, coloured by loan status.
sb.histplot(
    data=loan_train_df,
    x="loan_amount",
    hue="status",
    bins=30,
)

### Salary


In [None]:
# Wide figure: loan amount against the average salary of the client's district.
plt.figure(figsize=(18, 8))

sb.scatterplot(
    data=loan_train_df,
    x="average salary",
    y="loan_amount",
    marker="x",
);

### Correlation matrix

In [None]:
# Minimum absolute Spearman correlation with `status` for a feature to be kept.
threshold = 0.05

corr_matrix = loan_train_df.drop("loan_id", axis=1).corr(method='spearman')

# Single-row frame holding the correlation of every feature with the target.
correlation_status = corr_matrix.loc[['status'], :]

# Features above the threshold are kept. Everything else, including features
# whose correlation is NaN, is dropped.
selected_cols = set(correlation_status.loc[:, (abs(correlation_status) > threshold).any()].columns.to_list())
dropped_cols = set.difference(set(correlation_status.columns.to_list()), selected_cols)

# Pass ordered lists rather than a set so the call is deterministic.
loan_train_df.drop(columns=sorted(dropped_cols), inplace=True)

# Only drop what the test frame actually contains: a column that exists in the
# training frame alone (for example an encoded column joined to train only)
# would otherwise raise KeyError here.
loan_test_df.drop(
    columns=[col for col in sorted(dropped_cols) if col in loan_test_df.columns],
    inplace=True,
)

# Recompute the matrix on the reduced feature set for the heatmap.
corr_matrix = loan_train_df.drop("loan_id", axis=1).corr(method='spearman')

# Hide the upper triangle (diagonal included); it mirrors the lower one.
mask = np.zeros(corr_matrix.shape, dtype=bool)
mask[np.triu_indices(len(mask))] = True

plt.figure(figsize=(15, 15))
plt.title('Correlation Heatmap of client Dataset')

sb.heatmap(corr_matrix, square=True, annot=True, fmt='.2f', linecolor='black', mask=mask)

plt.show()

## Export preprocessed dataframes

In [None]:
# Persist both preprocessed frames as pickles, train first and then test.
export_targets = (
    (loan_train_df, "../out/train.pkl"),
    (loan_test_df, "../out/test.pkl"),
)

for frame, pickle_path in export_targets:
    frame.to_pickle(pickle_path)