In [None]:
import pandas as pd
import os
import seaborn as sb
import datetime
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sklearn as sk

# Load Data

In [None]:
def _read_dev_table(file_name, **extra_options):
    """Read one ';'-separated CSV from the competition dev folder into a DataFrame."""
    return pd.read_csv('./ficheiros_competicao_dev/' + file_name, sep=';', low_memory=False, **extra_options)


client_df = _read_dev_table('client.csv')

account_df = _read_dev_table('account.csv')

trans_dev_df = _read_dev_table('trans_dev.csv')

loan_dev_df = _read_dev_table('loan_dev.csv')

card_dev_df = _read_dev_table('card_dev.csv')

disp_df = _read_dev_table('disp.csv')

# In the district table a '?' marks a missing value.
district_df = _read_dev_table('district.csv', na_values='?')

# Domain Analysis

## Business Understanding

### End User Requirements

The end user requires a system to determine which clients are, and which are not, capable of paying off the loans they request from the end user.

### Business Goals

In this problem the positive case is a client that cannot pay a loan. The business goal is to create a system to:
- Reduce the amount of loan attributions to clients who will default on the loan
- Maintain the amount of loan attributions to clients who can fulfill the loan

### Data Mining Goals

From the business goals we can determine that the goal of the model is to avoid granting a loan to a client who cannot pay it back. This means minimizing false negatives, so we must optimize for recall.

## Data Understanding

In [None]:
print("account", account_df.describe(include='all'), "\n")
print("client", client_df.describe(include='all'), "\n")
print("disposition", disp_df.describe(include='all'), "\n")
print("district", district_df.describe(include='all'), "\n")
print("card_train", card_dev_df.describe(include='all'), "\n")
print("loan_train", loan_dev_df.describe(include='all'), "\n")
print("trans_train", trans_dev_df.describe(include='all'), "\n")

### Loans

In [None]:
len(loan_dev_df[loan_dev_df['status'] == 1]) / len(loan_dev_df) * 100

Around 86% of loans in the dataset have been paid off, so accuracy is not the best measure to optimize for.

In [None]:
# value_counts() orders its result by frequency, so the bar labels are derived
# from the index instead of assuming which status value comes first.
status_values = loan_dev_df["status"].value_counts()
status_labels = ["yes" if status == 1 else "no" for status in status_values.index]
plt.title("Was the loan given?")
plt.bar(status_labels, status_values)

This bar plot shows the number of loans given and the number of loans not given side by side. From this graph, you can understand that, for most cases, the loan was given. Therefore, this can be considered an unbalanced dataset, making it harder to predict when loans are not given.

In [None]:
sb.violinplot(x='status', y='amount', data=loan_dev_df, hue='status')

We can see that a larger share of the loans over 100,000 are not paid off.

# Data Treatment

## Whitespace Removal

In [None]:
district_df.rename(columns=lambda x: x.strip(), inplace=True)

## Missing Values

### Account

In [None]:
account_df.isnull().sum()

No missing values on **account_df**

### Client

In [None]:
client_df.isnull().sum()

No missing values on **client_df**

### Loans

In [None]:
loan_dev_df.isnull().sum()

No missing values on **loan_dev_df**

### Transactions

In [None]:
trans_dev_df.isnull().sum()

There are missing values on **trans_dev_df**

#### Operation

In [None]:
trans_dev_df['operation'].value_counts()

In [None]:
len(trans_dev_df[trans_dev_df['operation'].isnull()]) / len(trans_dev_df['operation']) * 100

There is a significant number of null values in the operations column. These will be replaced by 'N/A'.

In [None]:
# Assign the result back: an in-place fillna on a selected column acts on a
# temporary object and does not update the frame under pandas Copy-on-Write.
trans_dev_df['operation'] = trans_dev_df['operation'].fillna('N/A')

In [None]:
trans_dev_df['operation'].value_counts()

#### K Symbol

In [None]:
trans_dev_df['k_symbol'].value_counts()

There are many empty string values. It is assumed these mean the transaction type wasn't registered and they will be treated the same as null values.

In [None]:
def remove_empty_values_k_symbol(k):
    """Normalise a k_symbol entry, mapping unregistered values to 'N/A'.

    Any float (which is how a missing entry, NaN, shows up in the column) and
    any string that is empty or only whitespace is replaced by 'N/A'. Every
    other value is returned unchanged.
    """
    is_float_value = isinstance(k, float)
    is_blank_text = isinstance(k, str) and k.strip() == ''
    return 'N/A' if (is_float_value or is_blank_text) else k

trans_dev_df['k_symbol'] = trans_dev_df['k_symbol'].apply(remove_empty_values_k_symbol)

trans_dev_df['k_symbol'].value_counts()

#### Bank

In [None]:
trans_dev_df['bank'].value_counts()

There aren't any empty strings. Null values will be replaced by unknown.

In [None]:
# Assign the result back: an in-place fillna on a selected column acts on a
# temporary object and does not update the frame under pandas Copy-on-Write.
trans_dev_df['bank'] = trans_dev_df['bank'].fillna('Unknown')

In [None]:
trans_dev_df['bank'].value_counts()

#### Account

In [None]:
trans_dev_df['account'].value_counts()

There are many transactions to an account **0**. This will be treated as a transaction to an unknown account.

In [None]:
# Missing counterparty accounts are folded into the existing "account 0" bucket.
# Assign the result back: an in-place fillna on a selected column acts on a
# temporary object and does not update the frame under pandas Copy-on-Write.
trans_dev_df['account'] = trans_dev_df['account'].fillna(0)

In [None]:
trans_dev_df['account'].value_counts()

### Cards

In [None]:
card_dev_df.isnull().sum()

There are no missing values in **card_dev_df**

### Disposition

In [None]:
disp_df.isnull().sum()

There are no missing values in **disp_df**

### District

In [None]:
district_df.isnull().sum()

There are missing values in **district_df**

In [None]:
district_df[district_df['unemploymant rate \'95'].isnull()]

Both null values come from the district of Jesenik.

We will replace the null values by finding the median values for the region in 1995 and 1996 and inferring the 1995 values for Jesenik from its 1996 values, scaled by the ratio between the two regional medians.

In [None]:
def _impute_95_from_96(districts, column_95, column_96):
    """Fill missing `column_95` values in place from the same row's `column_96` value.

    The estimate is ``(regional median of column_95 / regional median of column_96)
    * the row's own column_96 value``, where the medians are taken over the
    row's `region` (missing values are skipped when computing the medians).
    Rows are located by their missing value rather than by a fixed position,
    so the imputation does not depend on the row order of the CSV.
    """
    missing_rows = districts[column_95].isnull()
    regional_medians = districts.groupby('region')[[column_95, column_96]].transform('median')
    year_ratio = regional_medians[column_95] / regional_medians[column_96]
    estimates = year_ratio * districts[column_96]
    districts.loc[missing_rows, column_95] = estimates[missing_rows]


_impute_95_from_96(district_df, "unemploymant rate '95", "unemploymant rate '96")

_impute_95_from_96(district_df, "no. of commited crimes '95", "no. of commited crimes '96")

## Date Functions

In [None]:
# Format a YYMMDD number as an ISO date string (YYYY-MM-DD)
def get_formatted_date(date_number):
    """Convert a YYMMDD number into an unambiguous ISO 'YYYY-MM-DD' string.

    The month field is taken modulo 50, because birth numbers of female
    clients carry the month offset by 50. The century is always assumed to
    be 19xx.

    ISO output is used so that ``pd.to_datetime`` cannot confuse day and
    month: a 'DD/M/YYYY' string such as '05/3/1993' may be read month-first.

    Args:
        date_number: integer or string in YYMMDD form (MM may be offset by 50).

    Returns:
        The date as a 'YYYY-MM-DD' string.
    """
    # zfill restores a leading zero that is lost when YYMMDD is stored as an int.
    digits = str(date_number).zfill(6)
    year_suffix = digits[0:2]
    month = int(digits[2:4]) % 50
    day = digits[4:6]
    return f"19{year_suffix}-{month:02d}-{day}"

# Derive the client's sex from the birth number: a month field of 51 or more means 'F'
def get_client_sex_from_birth_number(date_number):
    """Return 'F' when the MM field of a YYMMDD birth number is >= 51, otherwise 'M'."""
    month_field = int(str(date_number)[2:4])
    if month_field >= 51:
        return 'F'
    return 'M'

def get_season_from_Date(date_string):
    """Return the season ('winter', 'spring', 'summer' or 'autumn') of a date.

    The date is read from the text form of `date_string`, which must start
    with 'YYYY-MM-DD' (true for ISO strings and for datetime/Timestamp
    objects). Each season starts on its boundary day, inclusive:
    spring on 20 March, summer on 21 June, autumn on 22 September and
    winter on 22 December.

    The previous implementation compared month and day separately, so days
    21-31 of January and February were labelled spring, and 22 December
    matched no branch at all.
    """
    date_text = str(date_string)
    year = int(date_text[0:4])
    month = int(date_text[5:7])
    day = int(date_text[8:10])
    current = datetime.datetime(year, month, day)

    spring_start = datetime.datetime(year, 3, 20)
    summer_start = datetime.datetime(year, 6, 21)
    autumn_start = datetime.datetime(year, 9, 22)
    winter_start = datetime.datetime(year, 12, 22)

    # Winter wraps around the year end: it covers both the start and the end of the year.
    if current < spring_start or current >= winter_start:
        return "winter"
    if current < summer_start:
        return "spring"
    if current < autumn_start:
        return "summer"
    return "autumn"

def get_year_from_date(date_string):
    """Return the year as an int, read from the first four characters of the date's text form."""
    return int(str(date_string)[:4])


## Format dates and determine client sex

In [None]:
def _parse_yymmdd_column(raw_dates):
    """Convert a column of YYMMDD numbers into pandas datetimes via get_formatted_date."""
    return pd.to_datetime(raw_dates.apply(get_formatted_date), infer_datetime_format=True)


# Client: the birth number encodes both the birthday and the sex
client_df['birthday'] = _parse_yymmdd_column(client_df['birth_number'])
client_df['sex'] = client_df['birth_number'].apply(get_client_sex_from_birth_number)
client_df = client_df.drop(columns=['birth_number'])

# Account: the raw date becomes the account creation date
account_df['acc_creation_date'] = _parse_yymmdd_column(account_df['date'])
account_df = account_df.drop(columns=['date'])

# Transactions: the raw date becomes the transaction date
trans_dev_df['trans_date'] = _parse_yymmdd_column(trans_dev_df['date'])
trans_dev_df = trans_dev_df.drop(columns=['date'])

# Loans and cards keep their column names; only the values are converted
loan_dev_df['date'] = _parse_yymmdd_column(loan_dev_df['date'])
card_dev_df['issued'] = _parse_yymmdd_column(card_dev_df['issued'])

# Join Data

Data must all be displayed in one dataset.

## Join Account and Disposition

In [None]:
joined_df = account_df.merge(disp_df, on='account_id', how='inner', suffixes=['', '_disp'])

joined_df.rename(columns={
    'type': 'account_type',
    'frequency': 'issuance_freq'
}, inplace=True)

# Determine if the account is shared: more than one disposition row for the same account.
# This must run before the DISPONENT rows are removed, otherwise every count is 1.
owner_number_account = joined_df['account_id'].value_counts()

joined_df['shared'] = joined_df['account_id'].map(owner_number_account).gt(1).astype('int64')

# Drop rows with disponents so there are no duplicated account rows
joined_df.drop(joined_df[joined_df['account_type'] == 'DISPONENT'].index, inplace=True)

# Drop the account_type column (disp_id is kept here and removed later with the other ids)
joined_df.drop(columns=['account_type'], inplace=True)

joined_df.head()

## Join Clients

In [None]:
joined_df = joined_df.merge(client_df, on='client_id', how='left', suffixes=['', '_client'])

joined_df.drop(columns=['client_id'], inplace=True)

joined_df.head()

## Join District

There are currently two district ids in the dataset for each row. We are going to join on the client's district id, since we surmise this information will be more relevant to whether they pay off the loan than the district the account was registered in.

In [None]:
joined_df = joined_df.merge(district_df, left_on='district_id_client', right_on='code', how='inner', suffixes=['', '_district'])

joined_df.rename(columns={
    'code': 'district_code',
    'name': 'district_name'
}, inplace=True)

joined_df.drop(columns=['district_id', 'district_id_client'], inplace=True)

joined_df.head()

## Join Loan

In [None]:
joined_df = joined_df.merge(loan_dev_df, on='account_id', how='right', suffixes=['', '_loan'])

joined_df.rename(columns={
    'date': 'loan_date'
}, inplace=True)

# Add seasons
joined_df['season_on_loan'] = joined_df['loan_date'].apply(get_season_from_Date)
joined_df['loan_year'] = joined_df['loan_date'].apply(get_year_from_date)

joined_df.head(100)

In [None]:
print(joined_df.columns)

## Determine account owner age on loan request

In [None]:
# Age in years at the loan date. Computed from whole days because pandas 2.x
# rejects year-unit timedeltas such as np.timedelta64(1, 'Y'); 365.2425 days is
# the year length that unit stood for, so the resulting ages are unchanged.
joined_df['age_on_loan_request'] = (joined_df['loan_date'] - joined_df['birthday']).dt.days / 365.2425

joined_df.drop(columns=['birthday'], inplace=True)

joined_df.head()

## Join Cards

There are a lot of clients without cards, so simply joining the datasets would result in a large amount of null values. Instead we will record a card score for each account, giving more weight to *classic* and *gold* cards. This is done because, to have access to better credit cards, the client must have a history of taking out loans and paying them off in time.

In [None]:
card_disp_df = card_dev_df.merge(disp_df, on='disp_id', how='inner', suffixes=['_card', '_disp'])

# Weight of each card type in the score: better cards count for more.
card_weights = pd.Series({'junior': 1, 'classic': 3, 'gold': 10})

# Number of cards of each type per account. reindex guarantees all three
# type columns exist even if a card type does not occur in the data.
cards_per_user_df = (
    card_disp_df.groupby(['account_id', 'type_card'])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=card_weights.index, fill_value=0)
)

card_score_by_account = cards_per_user_df.mul(card_weights, axis=1).sum(axis=1)

# Accounts without any card get a score of 0. Only the card score is filled:
# a frame-wide fillna(0) would also hide missing values in unrelated columns.
joined_df['card_score'] = joined_df['account_id'].map(card_score_by_account).fillna(0).astype(int)

joined_df.head()

## Join Transactions

In [None]:
from dateutil.relativedelta import relativedelta

account_balance_df = trans_dev_df[['account_id', 'balance', 'trans_date']]

def _latest_balance_on_or_before(account_balances, cutoff):
    """Return the balance of the most recent transaction dated on or before `cutoff`, or 0 if none."""
    eligible = account_balances[account_balances['trans_date'] <= cutoff]
    if eligible.empty:
        return 0
    # If several transactions share the latest date, the first one in frame order is used.
    latest = eligible[eligible['trans_date'] == eligible['trans_date'].max()]
    return latest.iloc[0]['balance']


def find_balance_at_date(joined_df_row, balances=None):
    """Look up an account's balance at the loan date and 3 and 6 months earlier.

    Args:
        joined_df_row: row holding 'account_id' and 'loan_date'.
        balances: frame with 'account_id', 'balance' and 'trans_date' columns.
            Defaults to the notebook-level ``account_balance_df``.

    Returns:
        A Series with 'balance_at_loan', 'balance_three_months_before' and
        'balance_six_months_before'. Each value is the balance of the latest
        transaction on or before the respective date, or 0 when the account
        has no transaction that early.
    """
    if balances is None:
        balances = account_balance_df

    account_balances = balances[balances['account_id'] == joined_df_row['account_id']]
    loan_date = joined_df_row['loan_date']

    cutoffs = {
        'balance_at_loan': loan_date,
        'balance_three_months_before': loan_date - pd.DateOffset(months=3),
        'balance_six_months_before': loan_date - pd.DateOffset(months=6),
    }

    return pd.Series(
        {column: _latest_balance_on_or_before(account_balances, cutoff) for column, cutoff in cutoffs.items()},
        name=joined_df_row.name,
    )

joined_df[['balance_at_loan', 'balance_three_months_before', 'balance_six_months_before']] = joined_df.apply(find_balance_at_date, axis=1)

joined_df.head()

# Total interest accumulated

In [None]:
# Kept because later cells still read this name.
account_balance_df = trans_dev_df[['account_id', 'k_symbol', 'amount']]

# Mean absolute "interest credited" amount per account. Accounts without any
# such transaction get NaN here and 0.0 after the fill below.
interest_amounts = trans_dev_df['amount'].where(trans_dev_df['k_symbol'] == 'interest credited')

interst_mean_df = (
    interest_amounts.groupby(trans_dev_df['account_id'])
    .mean()
    .abs()
    .rename('interest_mean')
    .reset_index()
)

# Match on account_id. Assigning the column directly would align on the row
# index and attach each value to an unrelated loan.
interest_mean_by_account = interst_mean_df.set_index('account_id')['interest_mean']
joined_df["interest_mean"] = joined_df['account_id'].map(interest_mean_by_account).fillna(0.0)

joined_df.head()

# Number of times balance dropped below 5k

In [None]:
account_balance_df.head()

In [None]:
def get_balance_below_5k(joined_df_row, transactions=None, threshold=5000):
    """Count how many of the account's transactions left the balance below `threshold`.

    Args:
        joined_df_row: row holding 'account_id'.
        transactions: frame with 'account_id' and 'balance' columns. Defaults
            to the notebook-level ``trans_dev_df``.
        threshold: balance limit; defaults to 5000.

    Returns:
        A one-element Series 'balance_below_5k' with the count (0 when the
        account has no transactions).
    """
    if transactions is None:
        transactions = trans_dev_df

    # Filter the frame by its own column, not by a mask built from another frame.
    account_rows = transactions[transactions['account_id'] == joined_df_row['account_id']]

    # NOTE(review): every transaction of the account is counted, including any
    # dated after the loan was granted - confirm this is intended.
    low_balance_count = int((account_rows['balance'] < threshold).sum())

    return pd.Series({'balance_below_5k': low_balance_count}, name=joined_df_row.name)

In [None]:
joined_df[['balance_below_5k']] = joined_df.apply(get_balance_below_5k, axis=1)

In [None]:
joined_df.head()

# Loan Amount / Account Balance

In [None]:
def get_loan_amount_over_balance(row):
    """Return loan amount divided by the balance at loan time, or -1 when that balance is not positive."""
    balance = row['balance_at_loan']
    if balance > 0:
        return row['amount'] / balance
    return -1

joined_df['loan_amount_over_balance'] = joined_df.apply(get_loan_amount_over_balance, axis=1)

### Distributions

In [None]:
sb.displot(joined_df, x="age_on_loan_request", kind="kde")

This is a distribution of the age of the client at the time they asked for the loan. From this image, we can conclude that most people apply for a loan when they are 30-40 years.

### Scatter Plot
For these graphs, if it's green, it means that the loan was given and if it's red, it means that the loan was not given.

In [None]:
ys = joined_df['amount']
x = joined_df['payments']

colors = ['g' if s == 1 else 'r' for s in joined_df['status']]
plt.ylabel('Loan amount')
plt.xlabel('Payment')
plt.scatter(x, ys, color=colors)

This scatter plot shows the payment and the loan amount (besides the status). We can then conclude that the bigger the payment, the bigger the amount.

## Outliers

In [None]:
joined_df.describe()[['no. of commited crimes \'95', 'age_on_loan_request', 'duration', 'payments']]

In [None]:
import plotly.express as px

fig = px.histogram(joined_df, x='no. of commited crimes \'95')
fig.show()

In [None]:
def find_outliers_IQR(df):
    """Return the values lying more than 1.5 * IQR below the first or above the third quartile."""
    lower_quartile, upper_quartile = df.quantile(0.25), df.quantile(0.75)
    spread = upper_quartile - lower_quartile

    too_low = df < (lower_quartile - 1.5 * spread)
    too_high = df > (upper_quartile + 1.5 * spread)

    return df[too_low | too_high]

outliers = find_outliers_IQR(joined_df['no. of commited crimes \'95'])

print("number of outliers: " + str(len(outliers)))

print("max outlier value: " + str(outliers.max()))

print("min outlier value: " + str(outliers.min()))

### Remove underage users

In [None]:
underage = joined_df.loc[joined_df['age_on_loan_request']<18]

joined_df.drop(underage.index, inplace=True)

joined_df.describe()[['age_on_loan_request']]


# Split into age brackets

In [None]:
import plotly.express as px

fig = px.histogram(joined_df, x='age_on_loan_request')
fig.show()

In [None]:
def get_age_bracket(age):
    """Map an age in years (possibly fractional) to its bracket label.

    Brackets are half-open intervals, so fractional ages such as 24.5 fall
    into a bracket ('18-24') instead of into a gap between two brackets.

    Returns:
        One of '18-24', '25-34', '35-44', '45-54', '55-64', '65+', or None
        for ages below 18 and for NaN.
    """
    # "not >=" rather than "<" so that NaN is rejected here as well.
    if not age >= 18:
        return None

    upper_bounds = (
        (25, '18-24'),
        (35, '25-34'),
        (45, '35-44'),
        (55, '45-54'),
        (65, '55-64'),
    )
    for upper_bound, label in upper_bounds:
        if age < upper_bound:
            return label
    return '65+'

joined_df['age_bracket'] = joined_df['age_on_loan_request'].apply(get_age_bracket)

joined_df.head()

## Set Correct Data Types

In [None]:
joined_df['shared'] = joined_df['shared'].astype('bool')

In [None]:
joined_df['status'] = joined_df['status'].apply(lambda x: 1 if x == -1 else 0)
joined_df['status'] = joined_df['status'].astype('bool')

In [None]:
joined_df.drop(columns=['district_name', 'account_id', 'disp_id', 'age_on_loan_request'], inplace=True)

### Remove Categorical Values

In [None]:
cat = ['sex', 'region', 'issuance_freq', 'season_on_loan', 'age_bracket']

joined_df[cat] = joined_df[cat].astype('category')

joined_df[cat] = joined_df[cat].apply(lambda x : x.cat.codes)

# Clustering - Data Description

In [None]:
from sklearn.cluster import KMeans
from matplotlib import pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

In [None]:
print(joined_df.dtypes)

## PCA

In [None]:
X = joined_df.drop(['status'], axis=1)
X.drop(columns=['acc_creation_date', 'loan_date'], inplace=True)
y = joined_df['status']

X = StandardScaler().fit_transform(X, y)
X = PCA(n_components=2, random_state=42).fit_transform(X)

wss = []
silhouette = []

for k in range(2, 10):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    wss.append(kmeans.inertia_)
    silhouette.append(metrics.silhouette_score(X, kmeans.labels_))

fig, ax = plt.subplots(1, 2, figsize=(15, 5))

ax[0].plot(range(2, 10), silhouette)
ax[1].plot(range(2, 10), wss)

plt.grid(True)

ax[0].set_title('Silhouette method')
ax[1].set_title('Elbow curve')

plt.show()

In [None]:
X = joined_df.drop(['status'], axis=1)
X.drop(columns=['acc_creation_date', 'loan_date'], inplace=True)
y = joined_df['status']

X = StandardScaler().fit_transform(X)
X = PCA(n_components=2, random_state=42).fit_transform(X)
db = KMeans(n_clusters=3, random_state=42).fit_predict(X)

# Plot result

fig, axs = plt.subplots(1, 2, figsize=(18, 9))

axs[0].scatter(X[:, 0], X[:, 1], c=db)

scatter = axs[1].scatter(X[:, 0], X[:, 1], c=y)

axs[0].set_title("Kmeans")
axs[1].set_title("Status")

# status was recoded to a boolean earlier (True = original status -1, i.e. not
# paid). Legend handles come in ascending order, so False (paid) is listed first.
axs[1].legend(handles=scatter.legend_elements()[0], labels=['paid (status 1)', 'defaulted (status -1)'])

In [None]:
import sklearn.cluster as cluster
import seaborn as sns
kmeans = cluster.KMeans(n_clusters=4, random_state=42)
X=joined_df.drop(['status'],axis=1)
X.drop(columns=['acc_creation_date', 'loan_date'], inplace=True)
y = joined_df["status"]

X = StandardScaler().fit_transform(X)
X = PCA(n_components=2, random_state=42).fit_transform(X)

pca_df = pd.DataFrame(data = X, columns = ['principal component 1', 'principal component 2'])
kmeans = kmeans.fit(pca_df)

pca_df['Clusters'] = kmeans.labels_

sns.scatterplot(x="principal component 1", y="principal component 2",hue = 'Clusters',  data=pca_df,palette='viridis')

# Feature Selection

In [None]:
features = [
    'shared',
    'no. of municipalities with inhabitants < 499',
    'no. of municipalities with inhabitants 500-1999',
    'no. of municipalities with inhabitants 2000-9999',
    'no. of municipalities with inhabitants >10000', 
    'no. of cities',
    'ratio of urban inhabitants', 
    'average salary', 
    'unemploymant rate \'96', 
    'no. of enterpreneurs per 1000 inhabitants',
    'amount', 
    'duration', 
    'payments',
    'age_bracket',
    'card_score', 
    'balance_at_loan',
    'balance_three_months_before',
    'balance_six_months_before',
    'issuance_freq',
    'sex', 
    'district_code',
    'region',
    'season_on_loan',
    'loan_year',
    'interest_mean',
    'balance_below_5k',
    'loan_amount_over_balance'
    ]
    
target = 'status'

In [None]:
# 'number_of_payed_off_prev_loans' is not created by any cell above, so the
# filter is applied only when that column actually exists.
prev_loans_column = 'number_of_payed_off_prev_loans'

if prev_loans_column in joined_df.columns:
    features_preview = joined_df.loc[joined_df[prev_loans_column] > 0, features].head()
else:
    features_preview = joined_df[features].head()

features_preview

In [None]:
corr_matrix = joined_df[features + [target]].corr(method='spearman')

mask = np.zeros_like(corr_matrix, dtype=bool)
mask[np.triu_indices_from(mask)] = True

plt.figure(figsize=(20, 20))
plt.title('Correlation Matrix', fontsize=25)
_ = sb.heatmap(corr_matrix, mask=mask, cmap='coolwarm', annot=True, fmt='.2f', linewidths=2)

plt.show()

### VIF

We are going to compute the VIF (variance inflation factor) to check for multicollinearity between features.

In [None]:
# load statmodels functions
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# compute the vif for all given features
def compute_vif(considered_features):
    """Return a DataFrame with the variance inflation factor of each given joined_df column."""
    design = joined_df[considered_features].copy()
    # variance_inflation_factor needs a constant column; it is removed from the result below
    design['intercept'] = 1

    vif_values = [variance_inflation_factor(design, position) for position in range(design.shape[1])]
    vif = pd.DataFrame({"Variable": design.columns, "VIF": vif_values})

    return vif[vif['Variable'] != 'intercept']

In [None]:
numeric_features = [
    'no. of municipalities with inhabitants < 499',
    'no. of municipalities with inhabitants 500-1999',
    'no. of municipalities with inhabitants 2000-9999',
    'no. of municipalities with inhabitants >10000', 
    'no. of cities',
    'ratio of urban inhabitants', 
    'average salary', 
    'unemploymant rate \'96', 
    'no. of enterpreneurs per 1000 inhabitants',
    'amount', 
    'duration', 
    'payments',
    'card_score', 
    'balance_at_loan',
    'balance_three_months_before',
    'balance_six_months_before',
    'loan_year',
    'interest_mean',
    'balance_below_5k',
    'loan_amount_over_balance'
    ]

vif = compute_vif(numeric_features).sort_values(by='VIF', ascending=False)

vif

The VIF threshold is 5. Removing `amount` and recalculating.

In [None]:
numeric_features.remove('amount')

vif = compute_vif(numeric_features).sort_values(by='VIF', ascending=False)

vif

VIF on payments improved.

In [None]:
numeric_features.remove('average salary')

vif = compute_vif(numeric_features).sort_values(by='VIF', ascending=False)

vif

All VIF values in threshold

In [None]:
features.remove('average salary')
features.remove('amount')

In [None]:
fig, ax = plt.subplots(2, 2)
fig.set_size_inches(15, 10)

# status was recoded to a boolean earlier: True marks a loan that was not paid off.
defaulted = joined_df["status"] == 1

sb.countplot(x="season_on_loan", data=joined_df[defaulted], ax=ax[0][0])
sb.countplot(x="season_on_loan", data=joined_df[~defaulted], ax=ax[0][1])
sb.violinplot(x="season_on_loan", y="amount", data=joined_df, hue="status", ax=ax[1][0])
sb.violinplot(x="season_on_loan", y="duration", data=joined_df, hue="status", ax=ax[1][1])

# Title every panel so the two count plots can be told apart.
ax[0][0].set_title("Loans per season - not paid off")
ax[0][1].set_title("Loans per season - paid off")
ax[1][0].set_title("Loan amount per season")
ax[1][1].set_title("Loan duration per season")


Throughout the year the number of accepted loans decreases. We can also notice that in autumn the lower bound of the loan amounts is higher.

In [None]:
print(joined_df.columns)
# sb.lineplot(x="age_on_loan_request", y="balance_at_loan", data=joined_df)
# sb.displot(x="age_on_loan_request", data=joined_df, kde=True, hue="status")

### ANOVA

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

def do_anova(cols, k="all"):
    """Rank the given joined_df columns by their ANOVA F-score against 'status'.

    Returns a DataFrame with 'Features' and 'Score' columns sorted by
    descending score; when `k` is not "all" only the first `k` rows are kept.
    """
    candidates = joined_df[cols]
    selector = SelectKBest(score_func=f_classif, k=k).fit(candidates, joined_df["status"])

    # One row per feature with its F-score
    feature_score = pd.DataFrame({"Features": candidates.columns, "Score": selector.scores_})
    ranked = feature_score.sort_values(by="Score", ascending=False)

    if k == "all":
        return ranked
    return ranked.head(k)

In [None]:
do_anova(features)

In [None]:
selected_features = [
    'shared',
    'balance_at_loan',
    'payments',
    'balance_six_months_before',
    'balance_three_months_before',
    'season_on_loan',
    'age_bracket',
    'loan_year',
    'interest_mean',
    'balance_below_5k',
    'loan_amount_over_balance',
    ]

# Algorithms

In [None]:
print(joined_df.columns)

In [None]:
print(joined_df.dtypes)

In [None]:
X = joined_df[selected_features]
y = joined_df[target]

In [None]:
print(X.shape)

In [None]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y) # 70% training and 30% test


### Normalization
Some of the algorithms we plan on using (KNN and SVM) require the data to be standardized. To do so, we used a StandardScaler from SciKit Learn's preprocessing library.

In [None]:
scaler = sk.preprocessing.StandardScaler()

# Learn the mean and variance from the training split only, then apply the same
# transformation to the test split. Re-fitting on the test data would scale the
# two splits differently and leak test statistics into the evaluation.
x_train = scaler.fit_transform(X_train)
x_test = scaler.transform(X_test)

## Decision Tree

In [None]:
# # Create Decision Tree classifer object
# clf = DecisionTreeClassifier(min_samples_leaf=15, max_depth=5)

# # Train Decision Tree Classifer
# clf = clf.fit(X_train,y_train)

# #Predict the response for test dataset
# y_pred = clf.predict(X_test)

In [None]:
# y_pred_proba = clf.predict_proba(X_test)
# print(y_pred_proba)

In [None]:
# from imblearn.over_sampling import SMOTE
# from sklearn.metrics import PrecisionRecallDisplay

# print("Recall:",metrics.recall_score(y_test, y_pred))
# print("Precision:",metrics.precision_score(y_test, y_pred))

# display = PrecisionRecallDisplay.from_estimator(
#     clf, X_test, y_test, name="DecisionTree"
# )
# _ = display.ax_.set_title("2-class Precision-Recall curve")

In [None]:
# confusion_matrix = metrics.confusion_matrix(y_test, y_pred)

# cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True]) 

# cm_display.plot()
# plt.show()

In [None]:
# from sklearn import tree

# fig = plt.figure(figsize=(25,20))
# _ = tree.plot_tree(clf, 
#                    feature_names=selected_features,  
#                    class_names=target,
#                    filled=True)


## SVC

In [None]:
# from sklearn.svm import SVC
# from imblearn.over_sampling import SMOTE
# from sklearn.metrics import PrecisionRecallDisplay

# # lsvc = SVC(C=0.01, cache_size=200, class_weight='balanced', coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.001, kernel='rbf',
# #  max_iter=-1, probability=True, random_state=1234, shrinking=True, tol=0.001, verbose=False)

# lsvc = SVC(verbose=0, C=0.01, kernel='poly')

# lsvc.fit(X_train, y_train)
# y_pred = lsvc.predict(X_test)

# score = lsvc.score(X_train, y_train)
# print(score)
# print("REcall: ", metrics.recall_score(y_test, y_pred))
# print("Precison: ", metrics.precision_score(y_test, y_pred))

# display = PrecisionRecallDisplay.from_estimator(
#     lsvc, X_test, y_test, name="SVC"
# )
# _ = display.ax_.set_title("2-class Precision-Recall curve")


In [None]:
# y_pred_proba = clf.predict_proba(X_test)
# print(y_pred_proba)

In [None]:
# confusion_matrix = metrics.confusion_matrix(y_test, y_pred)

# cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True]) 

# cm_display.plot()
# plt.show()

## Logistic Regression

In [None]:
# from sklearn.linear_model import LogisticRegression
# from imblearn.over_sampling import SMOTE
# from sklearn.metrics import PrecisionRecallDisplay


# logisticRegr = LogisticRegression(C=0.01, class_weight='balanced', dual=False, fit_intercept=True, intercept_scaling=1,
#  l1_ratio=None, max_iter=1000, multi_class='multinomial', n_jobs=None, penalty='l2', random_state=1234, solver='newton-cg',
#   tol=0.0001, verbose=0, warm_start=False)
# #multi_class : 'auto', 'ovr', 'multinomial'
# # solver : {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'}, default='lbfgs'
# logisticRegr.fit(X_train, y_train)
# y_pred = logisticRegr.predict(X_test)

# score = logisticRegr.score(X_train, y_train)
# print(score)
# print("REcall: ", metrics.recall_score(y_test, y_pred))
# print("Precison: ", metrics.precision_score(y_test, y_pred))
# print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))

# display = PrecisionRecallDisplay.from_estimator(
#     logisticRegr, X_test, y_test, name="LogisticRegression"
# )
# _ = display.ax_.set_title("2-class Precision-Recall curve")

# NOT SURE HOW IT WORKS

# from sklearn.model_selection import cross_validate
# scoring = {"accuracy": "accuracy",
#            "precision": "precision",
#            "recall": "recall",
#            "f1": "f1",
#            "roc_auc": "roc_auc"}
# from sklearn.model_selection import RepeatedStratifiedKFold

# k = 5
# kr = 3
# kf = RepeatedStratifiedKFold(n_splits=k, n_repeats=kr, random_state=0)
# info = cross_validate(logisticRegr, X_train, y_train, scoring=scoring, cv=kf, n_jobs=-1)
# pd.DataFrame(info).head()



In [None]:
# y_pred_proba = clf.predict_proba(X_test)
# print(y_pred_proba)

In [None]:
# confusion_matrix = metrics.confusion_matrix(y_test, y_pred)

# cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True]) 

# cm_display.plot()
# plt.show()

## LGBMC

In [None]:
# from lightgbm import LGBMClassifier
# from sklearn.metrics import PrecisionRecallDisplay
# from sklearn.metrics import PrecisionRecallDisplay

# lgbm = LGBMClassifier(objective='binary', random_state=0)
# # lgbm = LGBMClassifier(task = 'train', objective='binary', num_classes = 1, random_state=0, learning_rate = 0.0005, num_iterations = 10000)
# # 'task': 'train',
# #     'boosting_type': 'gbdt',
# #     'objective': 'multiclass',
# #     'num_class':3,
# #     'metric': 'multi_logloss',
# #     'learning_rate': 0.002296,
# #     'max_depth': 7,
# #     'num_leaves': 17,
# #     'feature_fraction': 0.4,
# #     'bagging_fraction': 0.6,
# #     'bagging_freq': 17
# lgbm.fit(X_train, y_train)


# y_pred = lgbm.predict(X_test)   
# # lgbm.f1
# score = lgbm.score(X_train, y_train)
# print(score)
# print("REcall: ", metrics.recall_score(y_test, y_pred))
# print("Precison: ", metrics.precision_score(y_test, y_pred))


# display = PrecisionRecallDisplay.from_estimator(
#     lgbm, X_test, y_test, name="LGBM"
# )
# _ = display.ax_.set_title("2-class Precision-Recall curve")

In [None]:
# y_pred_proba = clf.predict_proba(X_test)
# print(y_pred_proba)

In [None]:
# confusion_matrix = metrics.confusion_matrix(y_test, y_pred)

# cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = [False, True]) 

# cm_display.plot()
# plt.show()

# SMOTE

## Decision Tree

In [None]:
# Resample the training split with SMOTE so the models below are fitted on
# X_train_smote / y_train_smote. X_test / y_test are not touched here.
from imblearn.over_sampling import SMOTE
oversample = SMOTE(random_state=1234)
X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train)

In [None]:
# Show the shape of the resampled training features.
print(X_train_smote.shape)

In [None]:
# Create the Decision Tree classifier. random_state is pinned because scikit-learn
# permutes the candidate features at every split, so an unseeded tree can differ
# between runs when several splits are equally good.
clf = DecisionTreeClassifier(min_samples_leaf=10, max_depth=5, random_state=1234)

# Train the Decision Tree classifier on the SMOTE-resampled training data
clf = clf.fit(X_train_smote, y_train_smote)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

In [None]:
# Per-class probability estimates of the decision tree for every test row
# (one column per entry of clf.classes_).
y_pred_proba = clf.predict_proba(X_test)
print(y_pred_proba)

In [None]:
# Threshold-dependent metrics use the hard predictions in y_pred.
print("Recall: ", metrics.recall_score(y_test, y_pred))
print("Precision: ", metrics.precision_score(y_test, y_pred))
# AUROC is a ranking metric: it must be fed scores (probability of the class with
# the greater label), not 0/1 predictions, otherwise the ROC curve collapses to a
# single operating point.
print("AUROC: ", metrics.roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1]))
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))

In [None]:
# Tabulate actual vs. predicted labels of the decision tree on the test split and plot the table.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix,
    display_labels=[False, True],
)
cm_display.plot()
plt.show()

In [None]:
# Draw the fitted decision tree on a large canvas, colouring nodes by majority class.
# NOTE(review): class_names receives `target`, which elsewhere is used as the column
# selector of the label (joined_df[target]). plot_tree expects one display name per
# class, so this presumably should be a list of class labels -- TODO confirm.
from sklearn import tree
fig = plt.figure(figsize=(25,20))
_ = tree.plot_tree(clf, 
                   feature_names=selected_features,  
                   class_names=target,
                   filled=True)

## SVC

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import PrecisionRecallDisplay

# RBF-kernel SVC; probability=True enables predict_proba, which the AUROC below
# and the following cell rely on.
lsvc = SVC(verbose=0, C=0.5, kernel='rbf', probability=True, random_state = 24)

lsvc.fit(X_train_smote, y_train_smote)
y_pred = lsvc.predict(X_test)

print("Recall: ", metrics.recall_score(y_test, y_pred))
print("Precision: ", metrics.precision_score(y_test, y_pred))
# AUROC needs continuous scores; hard 0/1 predictions reduce the curve to one point.
print("AUROC: ", metrics.roc_auc_score(y_test, lsvc.predict_proba(X_test)[:, 1]))
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))

# The curve belongs to the SVC, so label it accordingly in the legend.
display = PrecisionRecallDisplay.from_estimator(
    lsvc, X_test, y_test, name="SVC"
)
_ = display.ax_.set_title("2-class Precision-Recall curve")

In [None]:
# Per-class probability estimates of the SVC for every test row.
y_pred_proba = lsvc.predict_proba(X_test)
print(y_pred_proba)

In [None]:
# Tabulate actual vs. predicted labels of the SVC on the test split and plot the table.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix,
    display_labels=[False, True],
)
cm_display.plot()
plt.show()

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import PrecisionRecallDisplay


# Strongly regularised (C=0.01) L2 logistic regression fitted with the saga solver.
# logisticRegr = LogisticRegression(solver = 'lbfgs')
logisticRegr = LogisticRegression(C=0.01, class_weight='balanced', dual=False, fit_intercept=True, intercept_scaling=1,
 l1_ratio=None, max_iter=1000, multi_class='multinomial', n_jobs=None, penalty='l2', random_state=1234, solver='saga',
  tol=0.0001, verbose=0, warm_start=False)

logisticRegr.fit(X_train_smote, y_train_smote)
y_pred = logisticRegr.predict(X_test)

print("Recall: ", metrics.recall_score(y_test, y_pred))
print("Precision: ", metrics.precision_score(y_test, y_pred))
# AUROC needs continuous scores; hard 0/1 predictions reduce the curve to one point.
print("AUROC: ", metrics.roc_auc_score(y_test, logisticRegr.predict_proba(X_test)[:, 1]))
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))

display = PrecisionRecallDisplay.from_estimator(
    logisticRegr, X_test, y_test, name="LogisticRegression"
)
_ = display.ax_.set_title("2-class Precision-Recall curve")

In [None]:
# Per-class probability estimates of the logistic regression for every test row.
y_pred_proba = logisticRegr.predict_proba(X_test)
print(y_pred_proba)

In [None]:
# Tabulate actual vs. predicted labels of the logistic regression on the test split and plot the table.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix,
    display_labels=[False, True],
)
cm_display.plot()
plt.show()

## LGBMC

In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import PrecisionRecallDisplay

# Gradient-boosted trees with a binary objective, fitted on the SMOTE-resampled training data.
lgbm = LGBMClassifier(objective='binary', random_state=0)
lgbm.fit(X_train_smote, y_train_smote)

y_pred = lgbm.predict(X_test)

print("Recall: ", metrics.recall_score(y_test, y_pred))
print("Precision: ", metrics.precision_score(y_test, y_pred))
# AUROC needs continuous scores; hard 0/1 predictions reduce the curve to one point.
print("AUROC: ", metrics.roc_auc_score(y_test, lgbm.predict_proba(X_test)[:, 1]))
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))

display = PrecisionRecallDisplay.from_estimator(
    lgbm, X_test, y_test, name="LGBM"
)
_ = display.ax_.set_title("2-class Precision-Recall curve")

In [None]:
# Per-class probability estimates of the LGBM model for every test row.
y_pred_proba = lgbm.predict_proba(X_test)
print(y_pred_proba)

In [None]:
# Tabulate actual vs. predicted labels of the LGBM model on the test split and plot the table.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix,
    display_labels=[False, True],
)
cm_display.plot()
plt.show()

## KNN

In [None]:
# Import the estimator explicitly: `import sklearn as sk` alone does not load the
# `neighbors` submodule, so `sk.neighbors` only resolves if another import pulled it in.
from sklearn.neighbors import KNeighborsClassifier

# Distance-weighted vote over the 50 nearest neighbours, searched with a ball tree.
knn = KNeighborsClassifier(
    n_neighbors=50,
    weights='distance',
    algorithm='ball_tree',
    leaf_size=1,
    n_jobs=-1
    )

knn.fit(X_train_smote, y_train_smote)
y_pred = knn.predict(X_test)
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))
print("Recall: ", metrics.recall_score(y_test, y_pred))
print("Precision: ", metrics.precision_score(y_test, y_pred))
# AUROC needs continuous scores; hard 0/1 predictions reduce the curve to one point.
print("AUROC: ", metrics.roc_auc_score(y_test, knn.predict_proba(X_test)[:, 1]))

In [None]:
# Tabulate actual vs. predicted labels of the KNN model on the test split and plot the table.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred)
cm_display = metrics.ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix,
    display_labels=[False, True],
)
cm_display.plot()
plt.show()

In [None]:
# Precision-recall curve of the fitted KNN model on the test split.
# Relies on PrecisionRecallDisplay having been imported by an earlier cell.
display = PrecisionRecallDisplay.from_estimator(
    knn, X_test, y_test, name="KNN"
)
_ = display.ax_.set_title("2-class Precision-Recall curve")

# Random Forest

In [None]:
from numpy import mean
from numpy import std
from numpy import arange

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold

# Use the whole development set (no train/test split) for the hyper-parameter search.
X_train_comp = joined_df[selected_features]
y_train_comp = joined_df[target]

# NOTE(review): the data is SMOTE-resampled BEFORE being handed to cross-validation
# below, so synthetic rows interpolated from a validation fold's samples can land in
# the training folds and inflate the scores. Resampling inside each fold (e.g. an
# imblearn Pipeline with SMOTE as a step) would avoid that -- TODO consider.
oversample = SMOTE(random_state=1234)
X_train_smote_comp, y_train_smote_comp = oversample.fit_resample(X_train_comp, y_train_comp)

def get_models():
    """Build the grid of random-forest candidates to cross-validate.

    The grid spans the number of trees, `max_features` (1 up to the number of
    selected features) and the bootstrap sample ratio (10% to 100%).

    Returns:
        dict mapping '<n_estimators> <max_features> <ratio>' to an unfitted
        RandomForestClassifier.
    """
    models = dict()
    n_trees = [10, 50, 100, 500, 1000]
    for n in n_trees:
        # explore max_features from 1 up to the number of selected features
        for i in range(1, len(selected_features) + 1):
            # explore ratios from 10% to 100% in 10% increments; iterate over integer
            # tenths so the 100% case is detected without comparing floats for equality
            for tenths in range(1, 11):
                ratio = tenths / 10
                key = str(n) + ' ' + str(i) + ' ' + ('%.1f' % ratio)
                # set max_samples=None to use 100%
                max_samples = None if tenths == 10 else ratio
                # fixed seed so every candidate is scored reproducibly
                models[key] = RandomForestClassifier(
                    max_samples=max_samples, max_features=i, n_estimators=n, random_state=1234
                )
    return models

# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    """Return the per-fold ROC-AUC of `model` under 3x repeated stratified 10-fold CV."""
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring='roc_auc', cv=folds, n_jobs=-1)

# Build every candidate forest, cross-validate it and remember its mean score.
# Despite its name, recall_results holds whatever metric evaluate_model scores with.
models = get_models()
recall_results = {}
for name, model in models.items():
    scores = evaluate_model(model, X_train_smote_comp, y_train_smote_comp)
    recall_results[name] = mean(scores)

# Report the configuration with the highest mean score.
best_model = max(recall_results, key=lambda candidate: recall_results[candidate])
print(best_model, recall_results[best_model])

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 1000 trees, 4 candidate features per split and full-size
# bootstrap samples. random_state makes the bootstrap and feature sampling repeatable.
rf = RandomForestClassifier(n_estimators=1000, max_features=4, max_samples=1.0, random_state=1234)

rf.fit(X_train_smote, y_train_smote)

y_pred = rf.predict(X_test)

print("Recall: ", metrics.recall_score(y_test, y_pred))
print("Precision: ", metrics.precision_score(y_test, y_pred))
# AUROC needs continuous scores; hard 0/1 predictions reduce the curve to one point.
print("AUROC: ", metrics.roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))
print("Accuracy: ", metrics.accuracy_score(y_test, y_pred))

# SVC

In [None]:
def get_models_svc():
    """Build the grid of SVC candidates to cross-validate.

    Returns:
        dict mapping '<kernel> <C>' to an unfitted SVC.
    """
    models = dict()
    for kernel in ['rbf', 'linear', 'poly', 'sigmoid']:
        # explore regularisation strengths C
        for c_value in [0.01, 0.1, 0.5, 1.0]:
            # str() keeps every C distinguishable; '%.1f' printed 0.01 as "0.0"
            key = str(kernel) + ' ' + str(c_value)
            models[key] = SVC(C=c_value, kernel=kernel, probability=True, random_state = 1234)
    return models

# evaluate a given model using cross-validation
# (re-defines evaluate_model from the Random Forest section, this time scoring recall)
def evaluate_model(model, X, y):
    """Return the per-fold recall of `model` under 3x repeated stratified 10-fold CV."""
    folds = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    return cross_val_score(model, X, y, scoring='recall', cv=folds, n_jobs=-1)

# Build every SVC candidate, cross-validate it and remember its mean score.
models = get_models_svc()
recall_results = {}
for name, model in models.items():
    scores = evaluate_model(model, X_train_smote_comp, y_train_smote_comp)
    recall_results[name] = mean(scores)

# Report the configuration with the highest mean score.
best_model = max(recall_results, key=lambda candidate: recall_results[candidate])
print(best_model, recall_results[best_model])

# Test 100 iterations of each algorithm

In [None]:
# recallSum = {"DecisionT":0,"SVC":0,"LogisticReg":0,"lgbm":0,"knn":0,"rf":0}

# for i in range(100):
#     oversample = SMOTE()
#     X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train)

#     clf = clf.fit(X_train_smote,y_train_smote)
#     y_pred = clf.predict(X_test)
#     recallSum["DecisionT"] += metrics.recall_score(y_test, y_pred)

#     lsvc.fit(X_train_smote, y_train_smote)
#     y_pred = lsvc.predict(X_test)
#     recallSum["SVC"] += metrics.recall_score(y_test, y_pred)

#     logisticRegr.fit(X_train_smote, y_train_smote)
#     y_pred = logisticRegr.predict(X_test)
#     recallSum["LogisticReg"] += metrics.recall_score(y_test, y_pred)

#     lgbm.fit(X_train_smote, y_train_smote)
#     y_pred = lgbm.predict(X_test)   
#     recallSum["lgbm"] += metrics.recall_score(y_test, y_pred)

#     knn.fit(X_train_smote, y_train_smote)
#     y_pred = knn.predict(X_test)
#     recallSum["knn"] += metrics.recall_score(y_test, y_pred)

#     rf.fit(X_train_smote, y_train_smote)
#     y_pred = rf.predict(X_test)
#     recallSum["rf"] += metrics.recall_score(y_test, y_pred)

# for key in recallSum :
#     print("Position - "+  str(key) + " -> " + str(recallSum[key]/100))


# Competition

## Prepare competition data

### Load

In [None]:
# Load the competition files. Forward slashes match the dev-file loads at the top of
# the notebook and work on every OS; the previous '.\\' paths only resolved on Windows.
trans_comp_df = pd.read_csv('./ficheiros_competicao_dev/trans_comp.csv', sep=';', low_memory=False)

loan_comp_df = pd.read_csv('./ficheiros_competicao_dev/loan_comp.csv', sep=';', low_memory=False)

card_comp_df = pd.read_csv('./ficheiros_competicao_dev/card_comp.csv', sep=';', low_memory=False)

### Treat data

In [None]:
# Fill missing values in the competition transactions. The result is assigned back
# instead of calling fillna(inplace=True) on a column selection: that chained form
# acts on an intermediate object and stops updating the frame under pandas
# Copy-on-Write.
trans_comp_df['operation'] = trans_comp_df['operation'].fillna('N/A')
trans_comp_df['k_symbol'] = trans_comp_df['k_symbol'].apply(remove_empty_values_k_symbol)
trans_comp_df['bank'] = trans_comp_df['bank'].fillna('Unknown')
trans_comp_df['account'] = trans_comp_df['account'].fillna(0)

In [None]:
# Convert the raw date columns to pandas datetimes. get_formatted_date (defined
# earlier in the notebook) pre-formats each raw value before parsing.
trans_comp_df['trans_date'] = pd.to_datetime(trans_comp_df['date'].apply(get_formatted_date), infer_datetime_format=True)

# The parsed copy lives in 'trans_date'; the raw column is no longer needed.
trans_comp_df = trans_comp_df.drop(columns=['date'])

loan_comp_df['date'] = pd.to_datetime(loan_comp_df['date'].apply(get_formatted_date), infer_datetime_format=True)

card_comp_df['issued'] = pd.to_datetime(card_comp_df['issued'].apply(get_formatted_date), infer_datetime_format=True)

### Join Data

#### Account and Disposition

In [None]:
# One row per (account, disposition) pair.
comp_df = account_df.merge(disp_df, on='account_id', how='inner', suffixes=['', '_disp'])

comp_df.rename(columns={
    'type': 'account_type',
    'frequency': 'issuance_freq'
}, inplace=True)

# Determine if account is shared or not: an account is shared when more than one
# disposition row references it. Mapping the per-account counts back onto the column
# is vectorized and replaces the former row-by-row apply.
owner_number_account = comp_df['account_id'].value_counts()

comp_df['shared'] = (comp_df['account_id'].map(owner_number_account) > 1).astype(int)

# Drop rows with disponents so there are no duplicated account rows
comp_df.drop(comp_df[comp_df['account_type'] == 'DISPONENT'].index, inplace=True)

# Drop the account_type column (disp_id is still kept at this point)
comp_df.drop(columns=['account_type'], inplace=True)

comp_df.head()

#### Competition and Clients

In [None]:
# Attach the client attributes of each remaining disposition row; columns that clash
# with existing ones receive the '_client' suffix.
comp_df = comp_df.merge(client_df, on='client_id', how='left', suffixes=['', '_client'])

# client_id was only needed as the join key.
comp_df.drop(columns=['client_id'], inplace=True)

comp_df.head()

#### Competition and District

In [None]:
# Attach the district data of the client's district (district_id_client -> code).
# The inner join drops rows whose client district has no match in district_df.
comp_df = comp_df.merge(district_df, left_on='district_id_client', right_on='code', how='inner', suffixes=['', '_district'])

comp_df.rename(columns={
    'code': 'district_code',
    'name': 'district_name'
}, inplace=True)

# Both district id columns are dropped; the district stays identified by 'district_code'.
comp_df.drop(columns=['district_id', 'district_id_client'], inplace=True)

comp_df.head()

#### Competition and Loan

In [None]:
# Right join: keep exactly the rows of loan_comp_df (one per competition loan),
# enriched with the account data where available.
comp_df = comp_df.merge(loan_comp_df, on='account_id', how='right', suffixes=['', '_loan'])

comp_df.rename(columns={
    'date': 'loan_date'
}, inplace=True)

# Add seasons
# NOTE(review): 'season_on_loan' is filled with get_year_from_date, the same helper
# later used for 'loan_year'. If the development pipeline derives the season with a
# different helper, this column will not match the training feature -- TODO confirm.
comp_df['season_on_loan'] = comp_df['loan_date'].apply(get_year_from_date)

comp_df.head(100)

In [None]:
# Age of the client, in years, on the day the loan was requested.
# The elapsed time is divided by a 365.2425-day year, which is the duration
# np.timedelta64(1, 'Y') used to stand for; pandas >= 2.0 rejects the ambiguous 'Y'
# unit in timedelta arithmetic. Working on whole columns also avoids a row-wise apply.
comp_df['age_on_loan_request'] = (comp_df['loan_date'] - comp_df['birthday']) / pd.Timedelta(days=365.2425)

comp_df.drop(columns=['birthday'], inplace=True)


comp_df.head()

In [None]:
# Link every competition card to its account through the disposition table.
card_disp_df = card_comp_df.merge(disp_df, on='disp_id', how='inner', suffixes=['_card', '_disp'])

# Count cards per account and card type (one column per type present in the data).
cards_per_user_df = card_disp_df.groupby(['account_id', 'type_card']).size().unstack(fill_value=0)

# The score below reads all three card types; add a zero column for any type that
# does not occur in the competition cards so the lookup cannot raise a KeyError.
for card_type in ('junior', 'classic', 'gold'):
    if card_type not in cards_per_user_df.columns:
        cards_per_user_df[card_type] = 0

comp_df = comp_df.merge(cards_per_user_df, on='account_id', how='left', suffixes=['', ''])

# Accounts without any card have NaN counts after the left join. Note that this
# fills every remaining NaN in the frame with 0, not only the card counts.
comp_df.fillna(0, inplace=True)

# Weighted card score: junior = 1, classic = 3, gold = 10.
comp_df['card_score'] = (comp_df['junior'] + comp_df['classic'] * 3 + comp_df['gold'] * 10).astype(int)

comp_df.drop(columns=['junior', 'classic', 'gold'], inplace=True)

comp_df['loan_year'] = comp_df['loan_date'].apply(get_year_from_date)

comp_df.head()

#### Competition and Transactions

In [None]:
from dateutil.relativedelta import relativedelta

# Balance history of the competition accounts.
# NOTE(review): find_balance_at_date is defined earlier and presumably reads this
# module-level account_balance_df; rebinding it here switches the helper from the
# development to the competition transactions -- TODO confirm.
account_balance_df = trans_comp_df[['account_id', 'balance', 'trans_date']]

comp_df[['balance_at_loan', 'balance_three_months_before', 'balance_six_months_before']] = comp_df.apply(find_balance_at_date, axis=1)

# NOTE(review): interst_mean_df is built outside this section and is aligned to
# comp_df by row index here, not by account_id. If it was computed from the
# development transactions, these values do not belong to the competition accounts
# -- TODO verify its source and join key.
comp_df["interest_mean"] = interst_mean_df["interest_mean"]
# Assign the filled column back; fillna(inplace=True) on a column selection acts on
# an intermediate object and stops updating the frame under pandas Copy-on-Write.
comp_df["interest_mean"] = comp_df["interest_mean"].fillna(0.0)

comp_df.head()

#### Get age bracket

In [None]:
# Bucket the age at loan request with get_age_bracket (defined earlier in the notebook).
comp_df['age_bracket'] = comp_df['age_on_loan_request'].apply(get_age_bracket)

#### Set correct data types

In [None]:
# Store the shared-account flag as a boolean.
comp_df['shared'] = comp_df['shared'].astype('bool')


In [None]:
# Drop the district name, the join keys and the raw age (its bracket was derived
# above). 'loan_id' is kept; it is needed for the submission file.
comp_df.drop(columns=['district_name', 'account_id', 'disp_id', 'age_on_loan_request'], inplace=True)

In [None]:
# Columns encoded as integer category codes.
cat = ['sex', 'region', 'issuance_freq', 'season_on_loan', 'age_bracket']

comp_df[cat] = comp_df[cat].astype('category')

# NOTE(review): the codes are derived from the categories present in comp_df alone.
# They only line up with the training encoding if both frames contain the same set
# of values in each column -- TODO confirm, or reuse the training categories.
comp_df[cat] = comp_df[cat].apply(lambda x : x.cat.codes)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import PrecisionRecallDisplay

# Train on the whole development set: no hold-out is needed for the submission.
X_train_comp = joined_df[selected_features]
y_train_comp = joined_df[target]

oversample = SMOTE(random_state=1234)
X_train_smote_comp, y_train_smote_comp = oversample.fit_resample(X_train_comp, y_train_comp)

lsvc = SVC(verbose=0, C=0.5, kernel='rbf', probability=True, random_state=42)

lsvc.fit(X_train_smote_comp, y_train_smote_comp)

# Training accuracy, measured on the same resampled set the model was fitted on
# (it was previously measured on X_train_smote, the resample of the earlier split).
# This is a sanity check only, not an estimate of generalisation.
score = lsvc.score(X_train_smote_comp, y_train_smote_comp)

In [None]:

# Final model for the submission, fitted on the whole resampled development set.
# random_state makes the bootstrap/feature sampling, and therefore the submitted
# probabilities, reproducible across runs.
rf = RandomForestClassifier(n_estimators=1000, max_features=4, max_samples=1.0, random_state=1234)

rf.fit(X_train_smote_comp, y_train_smote_comp)

In [None]:
# knn = sk.neighbors.KNeighborsClassifier(
#     n_neighbors=50,
#     weights='distance',
#     algorithm='ball_tree',
#     leaf_size=1,
#     n_jobs=-1
#     )

# knn.fit(X_train_smote_comp, y_train_smote_comp)

In [None]:
# Feature matrix of the competition loans, using the same columns the models were trained on.
X_test_comp = comp_df[selected_features]

In [None]:

# Unlike the evaluation cells above, y_pred here holds class probabilities
# (one column per entry of rf.classes_), not hard labels.
y_pred = rf.predict_proba(X_test_comp)
# print("Recall: ",sk.metrics.recall_score(y_test, y_pred))

In [None]:
# Submission frame: the loan id next to the predicted probability of the second
# class (column 1 of predict_proba).
res_df = pd.DataFrame({
    'Id': comp_df['loan_id'].copy(),
    'Predicted': y_pred[:, 1],
})


In [None]:
# Write the submission as a comma-separated file without the index column.
res_df.to_csv('res.csv', index=False, sep=',')