# Naive Bayes com Python
### Utilizando as ideias do algoritmo naive bayes para fazer a classificação de riscos em pagamentos.

In [1]:
# importando a biblioteca Pandas do Python
import pandas as pd

In [2]:
# passando todos os registros da base de dados para "dataframe"
dataframe = pd.read_csv("dados-credito.csv", encoding = "utf-8", sep = ",")

In [3]:
# observando os registros da base de dados através do objeto "dataframe"
print(dataframe)

      clientid        income        age         loan  default
0            1  66155.925095  59.017015  8106.532131        0
1            2  34415.153966  48.117153  6564.745018        0
2            3  57317.170063  63.108049  8020.953296        0
3            4  42709.534201  45.751972  6103.642260        0
4            5  66952.688845  18.584336  8770.099235        1
...        ...           ...        ...          ...      ...
1995      1996  59221.044874  48.518179  1926.729397        0
1996      1997  69516.127573  23.162104  3503.176156        0
1997      1998  44311.449262  28.017167  5522.786693        1
1998      1999  43756.056605  63.971796  1622.722598        0
1999      2000  69436.579552  56.152617  7378.833599        0

[2000 rows x 5 columns]


|**Meta Classe**|
|:--------------|
|0 - Não vai pagar|
|1 - Vai pagar|

In [4]:
# observando os dados estatísticos dos registros
dataframe.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.807559,4444.369695,0.1415
std,577.494589,14326.327119,13.624469,3045.410024,0.348624
min,1.0,20014.48947,-52.42328,1.37763,0.0
25%,500.75,32796.459717,28.990415,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [5]:
# corrigindo valores inconsistentes (idade com valores negativos)
dataframe["age"][dataframe.age > 0].mean()

40.92770044906149

In [6]:
# atribuindo aos valores inconsistentes a média do campo de dados com os registros consistentes
dataframe.loc[dataframe.age < 0, "age"] = 40.92770044906149

In [7]:
# observando os valores estatísticos para analisar se os registros inconsistentes foram modificados
dataframe.describe()

Unnamed: 0,clientid,income,age,loan,default
count,2000.0,2000.0,1997.0,2000.0,2000.0
mean,1000.5,45331.600018,40.9277,4444.369695,0.1415
std,577.494589,14326.327119,13.261825,3045.410024,0.348624
min,1.0,20014.48947,18.055189,1.37763,0.0
25%,500.75,32796.459717,29.072097,1939.708847,0.0
50%,1000.5,45789.117313,41.317159,3974.719419,0.0
75%,1500.25,57791.281668,52.58704,6432.410625,0.0
max,2000.0,69995.685578,63.971796,13766.051239,1.0


In [8]:
# separando os dados previsores do meta classe
previsores = dataframe.iloc[:, 1:4].values
meta_classe = dataframe.iloc[:, 4].values

In [9]:
# observando os atributos previsores
print(previsores)

[[6.61559251e+04 5.90170151e+01 8.10653213e+03]
 [3.44151540e+04 4.81171531e+01 6.56474502e+03]
 [5.73171701e+04 6.31080495e+01 8.02095330e+03]
 ...
 [4.43114493e+04 2.80171669e+01 5.52278669e+03]
 [4.37560566e+04 6.39717958e+01 1.62272260e+03]
 [6.94365796e+04 5.61526170e+01 7.37883360e+03]]


In [10]:
# observando os atributos meta classe
print(meta_classe)

[0 0 0 ... 1 0 0]


In [11]:
# importandoa  biblioteca sklearn do Python
from sklearn.impute import SimpleImputer
# função "SimpleImputer" responsável por corrigir valores faltantes no banco de dados

In [12]:
# importando a biblioteca numpy do Python
import numpy as np

In [13]:
# criando o objeto "imputer"
imputer = SimpleImputer(missing_values = np.nan, strategy = "mean")

In [14]:
# fazendo o treinamento com a base de dados no objeto "imputer"
imputer = imputer.fit(previsores[:, 0:3])

In [15]:
# fazendo a transformação para correção de valores faltantes nos atributos previsores
previsores[:, 0:3] = imputer.transform(previsores[:, 0:3])

In [16]:
previsores.shape

(2000, 3)

In [17]:
# importando a biblioteca sklearn do Python
from sklearn.preprocessing import StandardScaler

In [18]:
# criando o objeto "scaler" para fazer o escalonamentos dos registros
scaler = StandardScaler()

In [19]:
# aplicando o escalonamento nos atributos previsores
previsores = scaler.fit_transform(previsores)

In [20]:
# observando os registros previsores
print(previsores)

[[ 1.45393393  1.36538005  1.20281942]
 [-0.76217555  0.54265932  0.69642695]
 [ 0.83682073  1.67417101  1.17471147]
 ...
 [-0.07122592 -0.97448606  0.35420081]
 [-0.11000289  1.73936652 -0.92675625]
 [ 1.682986    1.14917551  0.96381038]]


In [21]:
# separando a base de dados para treinamento e outra para teste
from sklearn.model_selection import train_test_split

In [22]:
# separando a base de dados para treinamento e outra para teste
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = train_test_split(previsores, meta_classe, test_size = 0.25,
                                                                                             random_state = 0)

In [23]:
# visualizando a dimensão da base "previsores_treinamento"
previsores_treinamento.shape

(1500, 3)

In [24]:
# visualizando a dimensão da base "previsores_teste"
previsores_teste.shape

(500, 3)

In [25]:
# visualizando a dimensão da base "classe_treinamento"
classe_treinamento.shape

(1500,)

In [26]:
# visualizando a dimensão da base "classe_teste"
classe_teste.shape

(500,)

In [27]:
# importando a biblioteca sklearn do Python
from sklearn.naive_bayes import GaussianNB

In [28]:
# criando o objeto "classificador"
classificador = GaussianNB()

In [29]:
# treinando o objeto "classificador" para gerar a tabelas de probabilidades visto na teoria de aprendizagem bayesiana
classificador.fit(previsores_treinamento, classe_treinamento)

GaussianNB()

In [30]:
# usando a base de dados de teste para analisar o resultado da previsao
previsoes = classificador.predict(previsores_teste)

In [31]:
# observando os resultados do nosso algoritmo IA
print(previsoes)

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0
 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 

In [32]:
# esses seriam os resultados ideias, observe a semelhança entre as duas bases
print(classe_teste)

[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1
 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 0 0 1 0
 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0
 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0 0 1
 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 1 0 0 0 1 0 0 

In [33]:
# importando a biblioteca sklearn do Python
from sklearn.metrics import confusion_matrix, accuracy_score

In [34]:
# compara as duas bases e retorna uma porcentagem de precisão
precisao = accuracy_score(previsoes, classe_teste)

In [35]:
# visualizando a precisão de acerto do algoritmo naive bayes
precisao

0.938

In [36]:
# criando uma matriz que informa os erros e acertos do algoritmo
matriz = confusion_matrix(previsoes, classe_teste)

In [37]:
# visualizando a matriz de confusão
print(matriz)

[[428  23]
 [  8  41]]


### Alguma dúvida? Entre em contato comigo:

- [Me envie um e-mail](mailto:alysson.barbosa@ee.ufcg.edu.br)