In [None]:
from utils import *

import matplotlib.pyplot as plt
import seaborn as sb
from utils import *
import numpy as np
from copy import deepcopy

# Load the training and test frames from pickle files under ../out (paths are
# relative to the notebook's working directory).
# NOTE(review): `pd` is never imported explicitly in this cell; presumably it is
# re-exported by `from utils import *` — confirm, or add `import pandas as pd`.
# NOTE: read_pickle unpickles arbitrary Python objects; only load trusted files.
train_df = pd.read_pickle("../out/train.pkl")
test_df = pd.read_pickle("../out/test.pkl")

### Client count.
In both the training and test datasets, each account with a loan has only 1 or 2 clients. Therefore, for this analysis, the client count is treated as a binary (categorical) variable.

In [None]:
# Collect the distinct client-count values seen in each dataset, in the order
# produced by value_counts(), and report both in a single message.
train_client_counts = train_df["client_count_mean"].value_counts().index.to_list()
test_client_counts = test_df["client_count_mean"].value_counts().index.to_list()
print(
    f"Encountered client count values in the training dataset:\n{train_client_counts} "
    f"\nTesting dataset:\n{test_client_counts}"
)

### Remove categorical data

In [None]:
# Build the frame used for the numeric analysis: an independent copy of the
# training data without the columns that are categorical for this analysis.
# "loan_id" and "sex" are intentionally left in place here (their removal was
# commented out in the original cell); the plotting cells skip "loan_id".
train_df_analysis = train_df.drop(
    columns=[
        "status",
        "num_times_under_zero",
        "issuance_frequency_per_month",
        # Also counts as categorical data for this analysis
        "client_count_mean",
    ]
)


### Plot Value distributions

### Histogram

In [None]:
# Histogram of every analysis variable, one subplot per column ("loan_id" is an
# identifier, not a variable, so it is skipped).
nrows = 3
plot_columns = [col_name for col_name in train_df_analysis if col_name != "loan_id"]
# Keep the original 3x4 layout, but add grid columns when there are more than
# 12 variables so that indexing the axes grid can never go out of range.
ncols = max(4, -(-len(plot_columns) // nrows))  # ceil division
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=[3 * ncols, 12])
# Column-major flattening reproduces the original fill order:
# subplot i lands at ax[i % nrows, i // nrows].
grid = ax.flatten(order="F")
for axis, col_name in zip(grid, plot_columns):
    sb.histplot(data=train_df_analysis, x=train_df_analysis[col_name], ax=axis)
# Hide the subplots that received no variable instead of showing empty frames.
for axis in grid[len(plot_columns):]:
    axis.set_visible(False)

plt.show()

Variables that may have outliers from looking at the histogram (by descending order of likelihood): 
balance_min, 
no. of municipalities with inhabitants < 499, 
days_since_last_transaction

### Boxplot

In [None]:
# Boxplot of every analysis variable, one subplot per column ("loan_id" is an
# identifier, not a variable, so it is skipped).
nrows = 3
plot_columns = [col_name for col_name in train_df_analysis if col_name != "loan_id"]
# Keep the original 3x4 layout, but add grid columns when there are more than
# 12 variables so that indexing the axes grid can never go out of range.
ncols = max(4, -(-len(plot_columns) // nrows))  # ceil division
fig, ax = plt.subplots(nrows=nrows, ncols=ncols, figsize=[3 * ncols, 12])
# Column-major flattening reproduces the original fill order:
# subplot i lands at ax[i % nrows, i // nrows].
grid = ax.flatten(order="F")
for axis, col_name in zip(grid, plot_columns):
    sb.boxplot(data=train_df_analysis, x=train_df_analysis[col_name], ax=axis)
# Hide the subplots that received no variable instead of showing empty frames.
for axis in grid[len(plot_columns):]:
    axis.set_visible(False)

plt.show()

Variables that may have outliers from looking at the boxplot (by descending order of likelihood): 
loan_amount, 
operation_count, 
days_since_last_transaction, 
no. of municipalities with inhabitants < 499, 
balance_min

### Using coefficients of variation to compare each variable's deviation.

In [None]:
# Per-column statistics used by this cell and by the outlier-detection cells:
#   standard_deviations[col] - np.std of the column
#   means[col]               - np.average of the column
#   coeficients_of_variation[col] - standard deviation / mean * 100 [%]
coeficients_of_variation = {}
standard_deviations = {}
means = {}
for col_name in train_df_analysis:
    column = train_df_analysis[col_name]
    standard_deviations[col_name] = np.std(column)
    means[col_name] = np.average(column)
    # "loan_id" is an identifier, not a variable (the plotting cells skip it too):
    # its std/mean stay available, but it is left out of the CV comparison so it
    # does not show up as a bar in the chart built from this dict.
    if col_name == "loan_id":
        continue
    coeficients_of_variation[col_name] = standard_deviations[col_name] / (means[col_name] / 100)
coeficients_of_variation

In [None]:
# Bar chart of the coefficients of variation, largest first.
# Rank the variable names by descending coefficient, then rebuild the dict in
# that order so keys and values line up with the bar positions.
ranked_names = sorted(coeficients_of_variation, key=lambda name: -coeficients_of_variation[name])
coeficients_of_variation_sorted = {name: coeficients_of_variation[name] for name in ranked_names}

num_items = len(coeficients_of_variation_sorted)
bar_positions = np.arange(num_items)
plt.bar(bar_positions, list(coeficients_of_variation_sorted.values()))
plt.xticks(bar_positions, list(coeficients_of_variation_sorted.keys()), rotation=90)
plt.tight_layout()
plt.show()

## Outlier detection methods
### Standard deviation method

In [None]:
# Standard-deviation method: flag rows whose value lies more than
# `std_multiplier` standard deviations away from the column mean.
# A multiplier of 2-3 is normal for datasets of this size. This does not work for
# certain data, such as region data, since it has fewer distinct values; for those
# the quartile method is used instead.
# Relies on `means` and `standard_deviations` computed in the coefficient-of-variation cell.
std_multiplier = 3
std_method_columns = ["loan_amount", "operation_count", "type_transaction_count_withdrawal", "days_since_last_transaction"]
# Listed columns that are still present in the analysis frame, sorted by name.
labels = train_df_analysis.columns.intersection(std_method_columns).sort_values()
train_df_std_method = train_df
outliers = {}
for col_name in labels:
    spread = standard_deviations[col_name] * std_multiplier
    low_bound, high_bound = means[col_name] - spread, means[col_name] + spread
    values = train_df_std_method[col_name]
    # Keep the full rows (all columns) so each outlier can be inspected in context.
    outliers[col_name] = train_df_std_method.loc[(values < low_bound) | (values > high_bound)]

for col_name, outlier_rows in outliers.items():
    # `.empty` is the explicit "no outlier rows" check; `.any(axis=None)` tested
    # cell truthiness instead and could skip (or choke on) a non-empty frame.
    if outlier_rows.empty:
        continue
    print("Name = " + col_name + ":")
    display(outlier_rows)
    print("Std: " + str(standard_deviations[col_name]))
    print("Quartile 75s: \n")
    # Separate loop variable so the outer `col_name` is not overwritten.
    for analysis_col in train_df_analysis:
        print(str(analysis_col) + " = " + str(train_df[analysis_col].quantile(.75)))

### Using first/third quartile method

In [None]:
# Quartile (IQR) method: flag rows whose value lies more than `iqr_multiplier`
# interquartile ranges below the first quartile or above the third quartile.
# Applied to the columns for which the standard-deviation method does not work
# (e.g. region data, which has fewer distinct values).
iqr_multiplier = 1.5
iqr_method_columns = ["no. of municipalities with inhabitants < 499", "region_woe"]
# Listed columns that are still present in the analysis frame, sorted by name.
labels = train_df_analysis.columns.intersection(iqr_method_columns).sort_values()
train_df_iqr_method = train_df
outliers = {}
for col_name in labels:
    values = train_df_iqr_method[col_name]
    q1, q3 = values.quantile(.25), values.quantile(.75)
    iqr = q3 - q1
    low_bound, high_bound = q1 - iqr_multiplier * iqr, q3 + iqr_multiplier * iqr
    # Keep the full rows (all columns) so each outlier can be inspected in context.
    outliers[col_name] = train_df_iqr_method.loc[(values < low_bound) | (values > high_bound)]

for col_name, outlier_rows in outliers.items():
    # `.empty` is the explicit "no outlier rows" check; `.any(axis=None)` tested
    # cell truthiness instead and could skip (or choke on) a non-empty frame.
    if outlier_rows.empty:
        continue
    print("Name = " + col_name + ":")
    display(outlier_rows)
    print("Quartile 75s: \n")
    # Separate loop variable so the outer `col_name` is not overwritten.
    for analysis_col in train_df_analysis:
        print(str(analysis_col) + " = " + str(train_df[analysis_col].quantile(.75)))


In [None]:
# Pairwise scatter/histogram matrix of the analysis variables.
# Drop the identifier column first. `errors="ignore"` makes the cell safe to
# re-run: the original `del` raised KeyError once "loan_id" was already gone.
train_df_analysis = train_df_analysis.drop(columns=["loan_id"], errors="ignore")
sb.pairplot(train_df_analysis)
plt.show()