In [None]:
import streamlit as st

# Set the seaborn theme to improve the look of the plots.
# seaborn is imported here because nothing above this point brings ``sns``
# into scope, so the ``set_theme`` call below would raise NameError.
import seaborn as sns

custom_params = {"axes.spines.right": False, "axes.spines.top": False}
sns.set_theme(style="ticks", rc=custom_params)

# Function to convert the df to csv.
# ``st.cache`` is deprecated in favour of ``st.cache_data``; prefer the new
# decorator and fall back to the legacy one only on Streamlit versions that
# do not provide it yet.
_cache_csv = getattr(st, "cache_data", None) or st.cache


@_cache_csv
def convert_df(df):
    """Serialise ``df`` to CSV (index omitted) and return it as UTF-8 bytes.

    The result is cached by Streamlit, so re-running the script with the same
    frame does not repeat the conversion.
    """
    return df.to_csv(index=False).encode('utf-8')

# Main function of the application
def main():
    """Apply the page-level Streamlit configuration for the app."""
    # Initial page settings, collected first and handed over in one call
    page_settings = {
        'page_title': 'Telemarketing analisys',
        'page_icon': 'telmarketing_icon.png',
        'layout': "wide",
        'initial_sidebar_state': 'expanded',
    }
    st.set_page_config(**page_settings)

## Introduction

#### Context
The original dataset contains 32581 entries with 12 variables (numeric and categorical). Each entry represents a person who takes out a loan from a bank, and each person is classified as a good or bad credit risk according to this set of variables. The link to the original dataset can be found below:

https://www.kaggle.com/datasets/laotse/credit-risk-dataset




#### Content
It is almost impossible to understand the original dataset due to its complicated system of categories and symbols. Thus, I wrote a small Python script to convert it into a readable CSV file. Several columns are simply ignored, because in my opinion either they are not important or their descriptions are obscure. The selected attributes are:







| Feature Name         |                  Description                        | Type  |
| -------------------- |:---------------------------------------------------:| -----:|
| person_age    |                Age            |        integer             |
| person_income |             Annual income          |                integer                   |
| person_home_ownership |            Home ownership           |              text                  |
| person_emp_length |         Employment, length (in years)        |        float           |
| loan_intent |          Loan intent                   |              text                     |
| loan_grade |               Loan grade                |                 text                      |
| loan_amnt |                Loan amount                 |                   integer                    |
| loan_int_rate |              Interest rate              |                   float               |
| loan_percent_income |               Percent income            |                  float      |
| cb_person_default_on_file |        Historical default             |            binary            |
| cb_person_cred_hist_length |       Credit history length          |           integer            |
| **loan_status** |          Loan status (0 = non-default, 1 = default)  |     binary                  |

### Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_palette("Paired")


from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
#from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import GridSearchCV

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier

#from xgboost import XGBClassifier







**LOADING DATA**

The CSV file containing the data is located in the same directory as this notebook.

# Read the raw data from the CSV next to the notebook and show its size,
# followed by an empty line, then the column summary and the first rows.
df_o = pd.read_csv("./credit_risk_dataset.csv")
print('Data lines and columns:', df_o.shape, end='\n\n')
df_o.info()

df_o.head()

### Univariate

def _plot_numeric_summary(column, n_bins, precision=0):
    """Plot one numeric column of ``df_o`` as quantile-bin counts plus a box plot.

    column    -- name of the column in ``df_o``
    n_bins    -- number of quantile bins handed to ``pd.qcut``
    precision -- decimal precision of the bin labels
    """
    _, axes = plt.subplots(1, 2, figsize=(10, 5))
    binned = pd.qcut(df_o[column], n_bins, precision=precision)
    binned.value_counts().sort_index().plot.bar(ax=axes[0])
    df_o[column].plot.box(ax=axes[1])


def _plot_category_counts(column, sort_index_desc=False):
    """Print the frequency table of ``df_o[column]`` and draw it as horizontal bars.

    column          -- name of the column in ``df_o``
    sort_index_desc -- when True, the bars are ordered by category label
                       (descending) instead of by frequency; the printed
                       table always keeps the frequency order
    """
    counts = df_o[column].value_counts()
    print(counts)
    if sort_index_desc:
        counts = counts.sort_index(ascending=False)
    # Draw on a fresh, explicit axes: a pandas Series plot without ``ax``
    # reuses the current axes, which would overdraw the previous box plot
    # whenever these steps run in the same session without a figure reset.
    _, axis = plt.subplots()
    chart = counts.plot.barh(ax=axis)
    chart.tick_params(axis='x', labelrotation=0)


# Target variable: class counts and share of defaults
print(df_o['loan_status'].value_counts())
print("\nRate of bad loans:")
print(df_o['loan_status'].mean())

# Same explicit-axes precaution as in _plot_category_counts
_, target_axis = plt.subplots()
df_o['loan_status'].value_counts().plot.barh(ax=target_axis)

# Explanatory variables, in the order of the feature table
_plot_numeric_summary('person_age', 3)
_plot_numeric_summary('person_income', 3)
_plot_category_counts('person_home_ownership')
_plot_numeric_summary('person_emp_length', 4)
_plot_category_counts('loan_intent')
_plot_category_counts('loan_grade', sort_index_desc=True)
_plot_numeric_summary('loan_amnt', 3)
_plot_numeric_summary('loan_int_rate', 5, precision=3)
_plot_numeric_summary('loan_percent_income', 5, precision=3)
_plot_category_counts('cb_person_default_on_file')
_plot_numeric_summary('cb_person_cred_hist_length', 3)

### Bivariate

def _numeric_vs_status(column):
    """Compare a numeric column across loan statuses: KDE (left), box plot (right)."""
    _, panels = plt.subplots(1, 2, figsize=(12, 5))
    sns.kdeplot(data=df_o, hue='loan_status', x=column, ax=panels[0])
    sns.boxplot(data=df_o, x='loan_status', y=column, ax=panels[1])


def _categorical_vs_status(column, start_angles, explodes, count_axis='y'):
    """Compare a categorical column across loan statuses.

    Prints the per-category loan_status counts, draws one pie per status
    (status 0 on the left, status 1 on the right) and finally a count plot
    split into one column per status.

    start_angles -- pair of pie start angles, for status 0 and status 1
    explodes     -- pair of explode lists, for status 0 and status 1
    count_axis   -- axis ('x' or 'y') that carries the categories in the
                    final count plot
    """
    print(df_o.groupby(column)['loan_status'].value_counts())

    _, panels = plt.subplots(1, 2, figsize=(12, 5))
    for status, panel, angle, explode in zip((0, 1), panels, start_angles, explodes):
        shares = df_o[df_o['loan_status'] == status][column].value_counts()
        shares.plot.pie(ax=panel, autopct="%1.0f%%", startangle=angle,
                        wedgeprops={"linewidth": 1, "edgecolor": "white"},
                        explode=explode)

    if count_axis == 'x':
        sns.displot(data=df_o, x=column, col='loan_status')
    else:
        sns.displot(data=df_o, y=column, col='loan_status')


_numeric_vs_status('person_age')

# Income uses two stacked panels and a horizontal box plot instead of the
# side-by-side layout shared by the other numeric variables.
_, income_panels = plt.subplots(2, figsize=(12, 5))
sns.kdeplot(data=df_o, hue='loan_status', x='person_income', ax=income_panels[0])
sns.boxplot(data=df_o, y='loan_status', x='person_income', orient='h',
            ax=income_panels[1])

_categorical_vs_status('person_home_ownership',
                       start_angles=(140, 120),
                       explodes=([.02, .02, .0, .0], [.02, .02, .0, .15]))

_numeric_vs_status('person_emp_length')

_categorical_vs_status('loan_intent',
                       start_angles=(120, 120),
                       explodes=([.03, .03, .0, .0, .0, .0],
                                 [.03, .03, .0, .0, .0, .0]))

_categorical_vs_status('loan_grade',
                       start_angles=(120, 120),
                       explodes=([.05, .05, .0, .0, .0, .15, .3],
                                 [.05, .05, .0, .0, .0, .15, .3]),
                       count_axis='x')

_numeric_vs_status('loan_amnt')
_numeric_vs_status('loan_int_rate')
_numeric_vs_status('loan_percent_income')

_categorical_vs_status('cb_person_default_on_file',
                       start_angles=(90, 120),
                       explodes=([.05, .0], [.05, .0]))

_numeric_vs_status('cb_person_cred_hist_length')



## Data Structuring & Processing

# Report the share of missing values per column, then the share of rows that
# contain at least one missing value.
print()
print('Missing Data %')
z = df_o.isna().mean()
print(z.apply(lambda x: '{:.2%}'.format(x)))
# Test per row: dividing the total number of missing cells by the row count
# would count a row with several missing values more than once.
rows_with_missing = df_o.isna().any(axis=1)
print('\nRatio of lines with missing values: {:.2%}'.format(rows_with_missing.mean()))
print()

# Keep only complete rows; work on a copy so df_o stays untouched
df = df_o.dropna().copy()
print()
print('Data shape, lines and columns:', df.shape)
print()

# Encode the historical-default flag as 0/1. ``map`` produces an integer
# column directly, whereas ``replace`` with a str -> int mapping on an object
# column depends on pandas' deprecated silent downcasting.
# Assumes the column only holds 'Y' / 'N' (any other value would become NaN)
# - TODO confirm against the data.
df['cb_person_default_on_file'] = df['cb_person_default_on_file'].map({'Y': 1, 'N': 0})

# One-hot encode the remaining text columns
df = pd.get_dummies(df)

# Give the correlation heatmap its own axes so it is never drawn on top of
# whatever figure happens to be current.
_, corr_axis = plt.subplots()
sns.heatmap(df.corr(), cmap="RdYlGn", linewidth=1, ax=corr_axis);

# Target vector and predictor matrix
y = df['loan_status']
X = df.drop(columns='loan_status')

# First hold out 30% of the rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=100,
)

# ... then carve 15% of the remaining rows out as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.15, random_state=100,
)




### Model 1


clf = DecisionTreeClassifier(random_state=100)

# Candidate pruning strengths are taken from the cost-complexity pruning path
# computed on the training data.
path = clf.cost_complexity_pruning_path(X_train, y_train)
impurities = path.impurities
# Round-off can leave tiny negative alphas in the path, and the classifier
# only accepts ccp_alpha >= 0: clamp them to zero and drop the duplicates
# that this creates so no fit is wasted or fails.
ccp_alphas = np.unique(np.clip(path.ccp_alphas, 0, None))

grid_param = {'ccp_alpha': ccp_alphas}

# 15-fold cross-validated search over every candidate alpha
grid = GridSearchCV(estimator=clf, param_grid=grid_param, cv=15)
grid.fit(X_train, y_train)

# print the best parameters
print(grid.best_estimator_)


# Show the selected model again, followed by its scores on the validation
# and test splits. No ``scoring`` was given to the grid, so ``grid.score``
# falls back to the estimator's own ``score`` and both lines of each pair
# report the same value.
print(grid.best_estimator_)
print()
print(grid.best_estimator_.score(X_val, y_val))
print(grid.score(X_val, y_val))
print()
print(grid.best_estimator_.score(X_test, y_test))
print(grid.score(X_test, y_test))



### Model 2

# Fixed seed, matching the decision tree and the data splits, so that the
# search result is reproducible from run to run.
clf = RandomForestClassifier(random_state=100)

params = {
    'max_depth': [10],
    'min_samples_leaf': [5],
    # Evaluates forests of 1, 251, 501 and 751 trees.
    # NOTE(review): the single-tree setting looks unintended - confirm.
    'n_estimators': list(range(1, 1001, 250))
}

grid_rf = GridSearchCV(estimator=clf,
                       param_grid=params,
                       scoring='accuracy',
                       cv=5)

grid_rf.fit(X_train, y_train)

# print the best parameters
print(grid_rf.best_estimator_)


# Show the selected model again, followed by its scores on the validation
# and test splits. The grid is scored on accuracy, which is also what the
# classifier's own ``score`` returns, so each pair shows the same value.
print(grid_rf.best_estimator_)
print()
print(grid_rf.best_estimator_.score(X_val, y_val))
print(grid_rf.score(X_val, y_val))
print()
print(grid_rf.best_estimator_.score(X_test, y_test))
print(grid_rf.score(X_test, y_test))

### Model 3

# Fixed seed, matching the decision tree and the data splits, so that the
# search result is reproducible from run to run.
clf = AdaBoostClassifier(random_state=100)

params = {
    'n_estimators': [100, 300, 600],
    'learning_rate': [0.04, 0.06, .1]
}

grid_ab = GridSearchCV(estimator=clf,
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5)

grid_ab.fit(X_train, y_train)


# print the best parameters
print(grid_ab.best_estimator_)


# Show the selected model again, followed by its scores on the validation
# and test splits. Within each pair the two numbers are different metrics:
# ``best_estimator_.score`` is the classifier's mean accuracy, while
# ``grid_ab.score`` uses the grid's ``scoring`` (ROC AUC).
print(grid_ab.best_estimator_)
print()
print(grid_ab.best_estimator_.score(X_val, y_val))
print(grid_ab.score(X_val, y_val))
print()
print(grid_ab.best_estimator_.score(X_test, y_test))
print(grid_ab.score(X_test, y_test))

### Model 4

# Fixed seed, matching the decision tree and the data splits, so that the
# search result is reproducible from run to run.
clf = GradientBoostingClassifier(random_state=100)

params = {
    'n_estimators': [100, 300, 600],
    'min_samples_leaf': [2, 10, 20],
    'learning_rate': [0.04, 0.06, .1]
}


grid_gb = GridSearchCV(estimator=clf,
                       param_grid=params,
                       scoring='roc_auc',
                       cv=5)

grid_gb.fit(X_train, y_train)


# print the best parameters
print(grid_gb.best_estimator_)


# Show the selected model again, followed by its scores on the validation
# and test splits. Within each pair the two numbers are different metrics:
# ``best_estimator_.score`` is the classifier's mean accuracy, while
# ``grid_gb.score`` uses the grid's ``scoring`` (ROC AUC).
print(grid_gb.best_estimator_)
print()
print(grid_gb.best_estimator_.score(X_val, y_val))
print(grid_gb.score(X_val, y_val))
print()
print(grid_gb.best_estimator_.score(X_test, y_test))
print(grid_gb.score(X_test, y_test))

## Assessment



# Mean loan amount over the full, uncleaned data
print('\nLoan amount average')
print('$ {:.2f}'.format(df_o['loan_amnt'].mean()))

# Row masks for the two variables whose missing values are examined below
missing_masks = {
    'person_emp_length': df_o['person_emp_length'].isna(),
    'loan_int_rate': df_o['loan_int_rate'].isna(),
}

# Total and mean loan amount of the rows affected by each variable
for column, is_missing in missing_masks.items():
    affected_amounts = df_o.loc[is_missing, 'loan_amnt']
    print('\nVariable ' + column + ' with missing values')
    print('Loan amount Sum:', '$ {:,.0f}'.format(affected_amounts.sum()))
    print('Loan amount Mean:', '$ {:.2f}'.format(affected_amounts.mean()))


# Total loan amount of the rows where either variable is missing
print('\nTotal loan amount of lines with variables missing values')
filt = missing_masks['person_emp_length'] | missing_masks['loan_int_rate']
print('$ {:,.0f}'.format(df_o.loc[filt, 'loan_amnt'].sum()))
print()



# Class counts and default rate, first for the raw data (df_o) and then for
# the cleaned data (df), each report framed by blank lines.
for frame in (df_o, df):
    status = frame['loan_status']
    print()
    print(status.value_counts())
    print("\nRate of bad loans:")
    print(status.mean())
    print()

## Implementation