# Previsão de aprovação de empréstimo
Objetivo: Prever se o cliente pode ser aprovado ou não para pegar empréstimo

[https://www.kaggle.com/datasets/architsharma01/loan-approval-prediction-dataset](https://www.kaggle.com/datasets/architsharma01/loan-approval-prediction-dataset)

In [1]:
# importar o pandas
import pandas as pd
import numpy as np

In [2]:
# importar os dados
tabela = pd.read_csv('analise_emprestimo.csv')
tabela.head(3)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,2,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected


In [3]:
!pip install ydata-profiling



In [4]:
from ydata_profiling import ProfileReport

In [48]:
profile = ProfileReport(tabela, title="Aprovação de Emprestimo")

In [49]:
profile.to_file("aprovacao.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
# informação da tabela
tabela.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   loan_id                    4269 non-null   int64 
 1    no_of_dependents          4269 non-null   int64 
 2    education                 4269 non-null   object
 3    self_employed             4269 non-null   object
 4    income_annum              4269 non-null   int64 
 5    loan_amount               4269 non-null   int64 
 6    loan_term                 4269 non-null   int64 
 7    cibil_score               4269 non-null   int64 
 8    residential_assets_value  4269 non-null   int64 
 9    commercial_assets_value   4269 non-null   int64 
 10   luxury_assets_value       4269 non-null   int64 
 11   bank_asset_value          4269 non-null   int64 
 12   loan_status               4269 non-null   object
dtypes: int64(10), object(3)
memory usage: 433.7+ KB


In [6]:
tabela.describe()

Unnamed: 0,loan_id,no_of_dependents,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value
count,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0,4269.0
mean,2135.0,2.498712,5059124.0,15133450.0,10.900445,599.936051,7472617.0,4973155.0,15126310.0,4976692.0
std,1232.498479,1.69591,2806840.0,9043363.0,5.709187,172.430401,6503637.0,4388966.0,9103754.0,3250185.0
min,1.0,0.0,200000.0,300000.0,2.0,300.0,-100000.0,0.0,300000.0,0.0
25%,1068.0,1.0,2700000.0,7700000.0,6.0,453.0,2200000.0,1300000.0,7500000.0,2300000.0
50%,2135.0,3.0,5100000.0,14500000.0,10.0,600.0,5600000.0,3700000.0,14600000.0,4600000.0
75%,3202.0,4.0,7500000.0,21500000.0,16.0,748.0,11300000.0,7600000.0,21700000.0,7100000.0
max,4269.0,5.0,9900000.0,39500000.0,20.0,900.0,29100000.0,19400000.0,39200000.0,14700000.0


In [7]:
# analisar valores nulos
tabela.isnull().sum()

loan_id                      0
 no_of_dependents            0
 education                   0
 self_employed               0
 income_annum                0
 loan_amount                 0
 loan_term                   0
 cibil_score                 0
 residential_assets_value    0
 commercial_assets_value     0
 luxury_assets_value         0
 bank_asset_value            0
 loan_status                 0
dtype: int64

In [8]:
tabela.columns

Index(['loan_id', ' no_of_dependents', ' education', ' self_employed',
       ' income_annum', ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [9]:
tabela.columns = ['loan_id', 'no_of_dependents', 'education', 'self_employed',
       'income_annum', 'loan_amount', 'loan_term', 'cibil_score',
       'residential_assets_value', 'commercial_assets_value',
       'luxury_assets_value', 'bank_asset_value', 'loan_status']

## Verificando valores únicos

In [10]:
tabela.education.unique()

array([' Graduate', ' Not Graduate'], dtype=object)

In [11]:
tabela.self_employed.unique()

array([' No', ' Yes'], dtype=object)

In [12]:
tabela.loan_status.unique()

array([' Approved', ' Rejected'], dtype=object)

In [13]:
tabela.education = tabela.education.apply(lambda x: 0 if x == ' Graduate' else 1)

In [14]:
tabela.self_employed = tabela.self_employed.apply(lambda x: 0 if x == ' No' else 1)

In [15]:
tabela.loan_status = tabela.loan_status.apply(lambda x: 0 if x == ' Approved' else 1)

## Legenda
* educação
    * 0 - Graduate
    * 1 - Not Graduate
* trabalho
    * 0 - Não
    * 1 - Sim
* aprovação
    * 0 - aprovado
    * 1 - reprovado

In [16]:
tabela.head(3)

Unnamed: 0,loan_id,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,1,2,0,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,0
1,2,0,1,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,1
2,3,3,0,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,1


In [17]:
tabela.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype
---  ------                    --------------  -----
 0   loan_id                   4269 non-null   int64
 1   no_of_dependents          4269 non-null   int64
 2   education                 4269 non-null   int64
 3   self_employed             4269 non-null   int64
 4   income_annum              4269 non-null   int64
 5   loan_amount               4269 non-null   int64
 6   loan_term                 4269 non-null   int64
 7   cibil_score               4269 non-null   int64
 8   residential_assets_value  4269 non-null   int64
 9   commercial_assets_value   4269 non-null   int64
 10  luxury_assets_value       4269 non-null   int64
 11  bank_asset_value          4269 non-null   int64
 12  loan_status               4269 non-null   int64
dtypes: int64(13)
memory usage: 433.7 KB


In [18]:
tabela2 = tabela.drop('loan_id', axis=1)

In [19]:
tabela2.head(3)

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,0,0,9600000,29900000,12,778,2400000,17600000,22700000,8000000,0
1,0,1,1,4100000,12200000,8,417,2700000,2200000,8800000,3300000,1
2,3,0,0,9100000,29700000,20,506,7100000,4500000,33300000,12800000,1


In [20]:
# separar os previsores e alvo
X = tabela2.drop('loan_status', axis=1)
y = tabela2['loan_status']

In [21]:
# importação
from sklearn.model_selection import train_test_split

In [22]:
# separar a base de treino e teste
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

## KNN

In [23]:
# knn
from sklearn.neighbors import KNeighborsClassifier

In [24]:
knn = KNeighborsClassifier(n_neighbors=6)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=6)

In [25]:
previsoes_knn = knn.predict(X_test)
previsoes_knn

array([0, 1, 1, ..., 0, 1, 1], dtype=int64)

In [26]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [27]:
accuracy_score(y_test, previsoes_knn)

0.6104605776736924

In [28]:
confusion_matrix(y_test, previsoes_knn)

array([[699, 111],
       [388,  83]], dtype=int64)

In [29]:
previsoes_treino = knn.predict(X_train)
previsoes_treino

array([0, 1, 0, ..., 0, 1, 0], dtype=int64)

In [30]:
accuracy_score(y_train, previsoes_treino)

0.6864123159303882

In [31]:
confusion_matrix(y_train, previsoes_treino)

array([[1709,  137],
       [ 800,  342]], dtype=int64)

## Árvore de Decisão

In [32]:
from sklearn import tree

In [33]:
clf_ar = tree.DecisionTreeClassifier(max_depth=6)
clf_ar = clf_ar.fit(X_train, y_train)

In [34]:
previsoes_clf_ar = clf_ar.predict(X_test)
previsoes_clf_ar

array([1, 0, 1, ..., 0, 1, 0], dtype=int64)

In [35]:
accuracy_score(y_test, previsoes_clf_ar)

0.9687743950039032

In [36]:
confusion_matrix(y_test, previsoes_clf_ar)

array([[805,   5],
       [ 35, 436]], dtype=int64)

In [37]:
previsoes_treino = clf_ar.predict(X_train)
previsoes_treino

array([1, 0, 0, ..., 1, 1, 0], dtype=int64)

In [38]:
accuracy_score(y_train, previsoes_treino)

0.9792503346720214

In [39]:
confusion_matrix(y_train, previsoes_treino)

array([[1845,    1],
       [  61, 1081]], dtype=int64)

## Random Forest

In [40]:
from sklearn.ensemble import RandomForestClassifier

In [41]:
clf_rf = RandomForestClassifier(n_estimators=10)
clf_rf = clf_rf.fit(X_train, y_train)

In [42]:
previsoes_clf_rf = clf_rf.predict(X_test)
previsoes_clf_rf

array([1, 0, 1, ..., 0, 1, 0], dtype=int64)

In [43]:
accuracy_score(y_test, previsoes_clf_rf)

0.970335675253708

In [44]:
confusion_matrix(y_test, previsoes_clf_rf)

array([[795,  15],
       [ 23, 448]], dtype=int64)

In [45]:
previsoes_treino = clf_rf.predict(X_train)
previsoes_treino

array([1, 0, 0, ..., 1, 1, 0], dtype=int64)

In [46]:
accuracy_score(y_train, previsoes_treino)

0.9983266398929049

In [47]:
confusion_matrix(y_train, previsoes_treino)

array([[1845,    1],
       [   4, 1138]], dtype=int64)