## Importação dos dados

In [11]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

In [10]:
!pip install plotly==4.14.3

Collecting plotly==4.14.3
  Using cached plotly-4.14.3-py2.py3-none-any.whl (13.2 MB)
Collecting retrying>=1.3.3
  Using cached retrying-1.3.3.tar.gz (10 kB)
Building wheels for collected packages: retrying
  Building wheel for retrying (setup.py): started
  Building wheel for retrying (setup.py): finished with status 'done'
  Created wheel for retrying: filename=retrying-1.3.3-py3-none-any.whl size=11429 sha256=f3a7307e00f95bde7704b14d452665314d991670f35a422cbf8bcc1694283437
  Stored in directory: c:\users\alece\appdata\local\pip\cache\wheels\f9\8d\8d\f6af3f7f9eea3553bc2fe6d53e4b287dad18b06a861ac56ddf
Successfully built retrying
Installing collected packages: retrying, plotly
Successfully installed plotly-4.14.3 retrying-1.3.3


## Base de dados de crédito

In [None]:
base_credit = pd.read_csv('Database/credit_data.csv')

In [None]:
base_credit

In [None]:
base_credit.describe()

In [None]:
base_credit[base_credit['income'] >= 69995.685578] 

In [None]:
base_credit[base_credit['loan'] <= 1.377630]

### Visualização de dados

In [None]:
np.unique(base_credit['default'], return_counts=True)

In [None]:
sns.countplot(x = base_credit['default']);

In [None]:
plt.hist(x = base_credit['age']);

In [None]:
plt.hist(x = base_credit['income']);

In [None]:
plt.hist(x= base_credit['loan']);

In [None]:
grafico = px.scatter_matrix(base_credit, dimensions=['age', 'income', 'loan'], color='default')
grafico.show()

### Tratamento de valores inconsistentes

In [None]:
base_credit.loc[base_credit['age'] < 0]

In [None]:
base_credit[base_credit['age'] < 0]

In [None]:
# Delete the entire column (from every record in the dataset)
base_credit2 = base_credit.drop('age', axis = 1)
base_credit2

In [None]:
# Delete only the records that hold inconsistent values (negative age)
base_credit3 = base_credit.drop(base_credit[base_credit['age'] < 0].index)
base_credit3

In [None]:
base_credit3.loc[base_credit3['age'] < 0]

In [None]:
# Fill in the inconsistent values manually

In [None]:
# Fill in with the mean
base_credit.mean()

In [None]:
base_credit['age'].mean()

In [None]:
base_credit['age'][base_credit['age'] > 0].mean()

In [None]:
# Replace negative (invalid) ages with the mean of the valid ages, computed
# from the data instead of a hard-coded, rounded constant. The filter
# (age > 0) matches the one used in the exploratory cell above.
media_idade_valida = base_credit.loc[base_credit['age'] > 0, 'age'].mean()
base_credit.loc[base_credit['age'] < 0, 'age'] = media_idade_valida

In [None]:
base_credit.loc[base_credit['age'] < 0]

In [None]:
base_credit.head(27)

### Tratamento de valores faltantes

In [None]:
base_credit.isnull()

In [2]:
base_credit.isnull().sum()

NameError: name 'base_credit' is not defined

In [None]:
base_credit.loc[pd.isnull(base_credit['age'])]

In [None]:
# Fill missing ages with the column mean. The result is assigned back rather
# than calling fillna(inplace=True) on the selected column: that chained
# in-place form is deprecated in pandas 2.x and does not modify base_credit
# when copy-on-write is enabled.
base_credit['age'] = base_credit['age'].fillna(base_credit['age'].mean())

In [None]:
base_credit.loc[pd.isnull(base_credit['age'])]

In [None]:
base_credit.loc[(base_credit['clientid'] == 29) | (base_credit['clientid'] == 31) | (base_credit['clientid'] == 32)]

In [None]:
base_credit.loc[base_credit['clientid'].isin([29, 31, 32])]

### Divisão entre previsores e classes

In [None]:
X_credit = base_credit.iloc[:, 1:4].values

In [None]:
X_credit

In [None]:
type(X_credit)

In [None]:
y_credit = base_credit.iloc[:, 4].values

In [None]:
y_credit

In [None]:
type(y_credit)

### Escalonamento dos valores

In [None]:
X_credit

In [None]:
X_credit[:, 0].min(), X_credit[:, 1].min(), X_credit[:, 2].min()

In [None]:
X_credit[:, 0].max(), X_credit[:, 1].max(), X_credit[:, 2].max()

In [None]:
# Standardization is the more commonly used option when the dataset has outliers
# NOTE(review): the scaler is fitted on the full X_credit, before the
# train/test split performed later in this notebook, so test-set statistics
# influence the scaling — consider fitting on the training split only.
from sklearn.preprocessing import StandardScaler
scaler_credit = StandardScaler()
X_credit = scaler_credit.fit_transform(X_credit)

In [None]:
X_credit[:, 0].min(), X_credit[:, 1].min(), X_credit[:, 2].min()

In [None]:
X_credit[:, 0].max(), X_credit[:, 1].max(), X_credit[:, 2].max()

In [None]:
X_credit

## Base de dados do censo

In [None]:
# Use the same 'Database' directory spelling (capital D) as the other CSV
# loads in this notebook; the lowercase variant only resolves on
# case-insensitive filesystems.
base_census = pd.read_csv('Database/census.csv')

In [None]:
base_census

In [None]:
base_census.describe()

In [None]:
base_census.isnull().sum()

### Visualização dos dados

In [None]:
np.unique(base_census['income'], return_counts=True)

In [None]:
sns.countplot(x = base_census['income']);

In [None]:
plt.hist(x = base_census['age']);

In [None]:
plt.hist(x = base_census['education-num']);

In [None]:
plt.hist(x = base_census['hour-per-week']);

In [None]:
grafico = px.treemap(base_census, path=['workclass', 'age'])
grafico

In [None]:
grafico = px.treemap(base_census, path=['occupation', 'relationship', 'age'])
grafico.show()

In [None]:
grafico = px.parallel_categories(base_census, dimensions=['occupation', 'relationship'])
grafico.show()

In [None]:
grafico = px.parallel_categories(base_census, dimensions=['occupation', 'relationship', 'income'])
grafico.show()

In [None]:
grafico = px.parallel_categories(base_census, dimensions=['education', 'income'])
grafico.show()

### Divisão entre previsores e classe

In [None]:
base_census.columns

In [None]:
X_census = base_census.iloc[:, 0:14].values

In [None]:
X_census

In [None]:
X_census[0]

In [3]:
y_census = base_census.iloc[:, 14].values

NameError: name 'base_census' is not defined

In [None]:
y_census

## Tratamento de atributos categóricos

### Label Encoder

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
label_encoder_teste = LabelEncoder()

In [None]:
X_census[:,1]

In [None]:
teste = label_encoder_teste.fit_transform(X_census[:,1])

In [None]:
teste

In [None]:
X_census[0]

In [None]:
label_encoder_workclass = LabelEncoder()
label_encoder_education = LabelEncoder()
label_encoder_marital = LabelEncoder()
label_encoder_occupation = LabelEncoder()
label_encoder_relationship = LabelEncoder()
label_encoder_race = LabelEncoder()
label_encoder_sex = LabelEncoder()
label_encoder_country = LabelEncoder()

In [None]:
# Encode each categorical column with its own dedicated encoder, so every
# fitted encoder keeps the classes of its column (required for
# inverse_transform or for encoding new records later). Reusing one encoder
# for all columns overwrites its fitted classes on every call and leaves the
# other encoders unfitted.
# NOTE(review): the column/encoder pairing follows the declaration order of
# the encoders — confirm against base_census.columns.
X_census[:,1] = label_encoder_workclass.fit_transform(X_census[:,1])
X_census[:,3] = label_encoder_education.fit_transform(X_census[:,3])
X_census[:,5] = label_encoder_marital.fit_transform(X_census[:,5])
X_census[:,6] = label_encoder_occupation.fit_transform(X_census[:,6])
X_census[:,7] = label_encoder_relationship.fit_transform(X_census[:,7])
X_census[:,8] = label_encoder_race.fit_transform(X_census[:,8])
X_census[:,9] = label_encoder_sex.fit_transform(X_census[:,9])
X_census[:,13] = label_encoder_country.fit_transform(X_census[:,13])

In [None]:
X_census[0]

In [None]:
X_census

### One hot encoder

In [None]:
len(np.unique(base_census['workclass']))

In [None]:
len(np.unique(base_census['occupation']))

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [None]:
onehotencoder_census = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), [1,3,5,6,7,8,9,13])], remainder='passthrough')

In [None]:
X_census = onehotencoder_census.fit_transform(X_census).toarray()

In [None]:
X_census[0]

In [None]:
X_census.shape

### Escalonamento dos valores

In [None]:
from sklearn.preprocessing import StandardScaler
scaler_census = StandardScaler()
X_census = scaler_census.fit_transform(X_census)

In [None]:
X_census[0]

## Divisão das bases em treinamento e teste

In [None]:
from sklearn.model_selection import train_test_split

### Credit data

In [None]:
X_credit_treinamento, X_credit_teste, y_credit_treinamento, y_credit_teste = train_test_split(X_credit, y_credit, test_size=0.25, random_state=0)

In [None]:
X_credit_treinamento.shape

In [None]:
y_credit_treinamento.shape

In [None]:
X_credit_teste.shape, y_credit_teste.shape

### Census

In [None]:
X_census_treinamento, X_census_teste, y_census_treinamento, y_census_teste = train_test_split(X_census, y_census, test_size=0.15, random_state=0)

In [None]:
X_census_treinamento.shape, y_census_treinamento.shape

In [None]:
X_census_teste.shape, y_census_teste.shape

### Salvar as variáveis

In [None]:
import pickle

In [None]:
with open('credit.pkl', mode = 'wb') as f:
    pickle.dump([X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste], f)

In [None]:
with open('census.pkl', mode = 'wb') as f:
    pickle.dump([X_census_treinamento, y_census_treinamento, X_census_teste, y_census_teste], f)

## Naive Bayes

Base risco de crédito

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
base_risco_credito = pd.read_csv('Database/risco_credito.csv')

In [None]:
base_risco_credito

In [None]:
X_risco_credito = base_risco_credito.iloc[:, 0:4].values

In [None]:
X_risco_credito

In [None]:
y_risco_credito = base_risco_credito.iloc[:, 4].values

In [4]:
from sklearn.preprocessing import LabelEncoder
label_encoder_historia = LabelEncoder()
label_encoder_divida = LabelEncoder()
label_encoder_garantia = LabelEncoder()
label_encoder_renda = LabelEncoder()

In [5]:
X_risco_credito[:,0] = label_encoder_historia.fit_transform(X_risco_credito[:,0])
X_risco_credito[:,1] = label_encoder_divida.fit_transform(X_risco_credito[:,1])
X_risco_credito[:,2] = label_encoder_garantia.fit_transform(X_risco_credito[:,2])
X_risco_credito[:,3] = label_encoder_renda.fit_transform(X_risco_credito[:,3])

NameError: name 'X_risco_credito' is not defined

In [None]:
X_risco_credito

In [None]:
import pickle
with open('risco_credito.pkl', 'wb') as f:
    pickle.dump([X_risco_credito, y_risco_credito], f)

In [None]:
naive_risco_credito = GaussianNB()
naive_risco_credito.fit(X_risco_credito, y_risco_credito)

In [None]:
previsao = naive_risco_credito.predict([[0,0,1,2], [2,0,0,0]])

In [None]:
previsao

In [None]:
naive_risco_credito.classes_

In [None]:
naive_risco_credito.class_count_

In [None]:
naive_risco_credito.class_prior_

Base credit data

In [None]:
import pickle
with open('credit.pkl', 'rb') as f:
    X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste = pickle.load(f)

In [None]:
X_credit_treinamento.shape,y_credit_treinamento.shape

In [None]:
X_credit_teste.shape, y_credit_teste.shape

In [None]:
naive_credit_data = GaussianNB()
naive_credit_data.fit(X_credit_treinamento, y_credit_treinamento)

In [None]:
previsoes = naive_credit_data.predict(X_credit_teste)

In [None]:
previsoes

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
accuracy_score(y_credit_teste, previsoes)

In [None]:
confusion_matrix(y_credit_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix

In [None]:
cm = ConfusionMatrix(naive_credit_data)
cm.fit(X_credit_treinamento, y_credit_treinamento)
cm.score(X_credit_teste, y_credit_teste)

In [None]:
print(classification_report(y_credit_teste, previsoes))

Base Census

In [None]:
with open('census.pkl', 'rb') as f:
    X_census_treinamento, y_census_treinamento, X_census_teste, y_census_teste = pickle.load(f)

In [None]:
X_census_treinamento.shape, y_census_treinamento.shape

In [None]:
X_census_teste.shape, y_census_teste.shape

In [None]:
naive_census = GaussianNB()
naive_census.fit(X_census_treinamento, y_census_treinamento)
previsoes = naive_census.predict(X_census_teste)
previsoes

In [None]:
y_census_teste

In [None]:
accuracy_score(y_census_teste, previsoes)

In [None]:
cm = ConfusionMatrix(naive_census)
cm.fit(X_census_treinamento, y_census_treinamento)
cm.score(X_census_teste, y_census_teste)

In [None]:
print(classification_report(y_census_teste, previsoes))

## Árvores de decisão

In [None]:
from sklearn.tree import DecisionTreeClassifier

Risco de crédito

In [None]:
import pickle
with open('risco_credito.pkl', 'rb') as f:
    X_risco_credito, y_risco_credito = pickle.load(f)

In [None]:
y_risco_credito

In [None]:
arvore_risco_credito = DecisionTreeClassifier(criterion='entropy')
arvore_risco_credito.fit(X_risco_credito, y_risco_credito)

In [None]:
arvore_risco_credito.feature_importances_

In [None]:
arvore_risco_credito.classes_

In [None]:
# Draw the fitted credit-risk decision tree with named features and classes.
from sklearn import tree
previsores = ['historia', 'divida', 'garantias', 'renda']
figura, eixos = plt.subplots(nrows=1, ncols=1, figsize=(10, 10))
# classes_ is a NumPy array; parameter validation in some scikit-learn
# releases only accepts a plain list for class_names, so convert it first.
nomes_classes = arvore_risco_credito.classes_.tolist()
tree.plot_tree(arvore_risco_credito, feature_names=previsores, class_names=nomes_classes, filled=True);

In [None]:
previsoes = arvore_risco_credito.predict([[0,0,1,2],[2,0,0,0]])
previsoes

Credit base

In [None]:
import pickle
with open('credit.pkl', 'rb') as f:
    X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste = pickle.load(f)

In [None]:
X_credit_treinamento.shape, y_credit_treinamento.shape

In [None]:
X_credit_teste.shape, y_credit_teste.shape

In [None]:
arvore_credit = DecisionTreeClassifier(criterion='entropy', random_state=0)
arvore_credit.fit(X_credit_treinamento, y_credit_treinamento)

In [None]:
previsoes = arvore_credit.predict(X_credit_teste)
previsoes

In [None]:
y_credit_teste

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
accuracy_score(y_credit_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
cm = ConfusionMatrix(arvore_credit)
cm.fit(X_credit_treinamento, y_credit_treinamento)
cm.score(X_credit_teste, y_credit_teste)

In [None]:
print(classification_report(y_credit_teste, previsoes))

In [None]:
from sklearn import tree
previsores = ['income', 'age', 'loan']
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(20,20))
tree.plot_tree(arvore_credit, feature_names=previsores, class_names=['0','1'], filled=True);
fig.savefig('arvore_credit.png')

Base census

In [None]:
with open('census.pkl', 'rb') as f:
    X_census_treinamento, y_census_treinamento, X_census_teste, y_census_teste = pickle.load(f)

In [None]:
X_census_treinamento.shape, y_census_treinamento.shape

In [None]:
X_census_teste.shape, y_census_teste.shape

In [None]:
arvore_census = DecisionTreeClassifier(criterion='entropy', random_state=0)
arvore_census.fit(X_census_treinamento, y_census_treinamento)

In [None]:
previsoes = arvore_census.predict(X_census_teste)
previsoes

In [None]:
y_census_teste

In [None]:
accuracy_score(y_census_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
cm = ConfusionMatrix(arvore_census)
cm.fit(X_census_treinamento, y_census_treinamento)
cm.score(X_census_teste, y_census_teste)

In [None]:
previsoes

In [None]:
print(classification_report(y_census_teste, previsoes))

In [None]:
# recall - what percentage it identified
# precision - of the ones it identified, what percentage it classified correctly

## Random Forest

In [None]:
# Ensemble learning
# - Consult several decision trees

In [None]:
# K - number of attributes
# randomly picks K attributes for comparing the purity/impurity metric (Gini impurity/entropy)

Base credit data

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
import pickle
with open('credit.pkl', 'rb') as f:
    X_credit_treinamento, y_credit_treinamento, X_credit_teste, y_credit_teste = pickle.load(f)

In [None]:
X_credit_treinamento.shape, y_credit_treinamento.shape

In [None]:
X_credit_teste.shape, y_credit_teste.shape

In [None]:
random_forest_credit = RandomForestClassifier(n_estimators=40, criterion='entropy', random_state=0)
random_forest_credit.fit(X_credit_treinamento, y_credit_treinamento)

In [None]:
previsoes = random_forest_credit.predict(X_credit_teste)
previsoes

In [None]:
y_credit_teste

In [None]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_credit_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
cm = ConfusionMatrix(random_forest_credit)
cm.fit(X_credit_treinamento, y_credit_treinamento)
cm.score(X_credit_teste, y_credit_teste)

In [None]:
print(classification_report(y_credit_teste, previsoes))

Base census

In [None]:
import pickle
with open('census.pkl', 'rb') as f:
    X_census_treinamento, y_census_treinamento, X_census_teste, y_census_teste = pickle.load(f)

In [None]:
X_census_treinamento.shape, y_census_treinamento.shape

In [None]:
X_census_teste.shape, y_census_teste.shape

In [None]:
y_census_treinamento

In [None]:
random_forest_census = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
random_forest_census.fit(X_census_treinamento, y_census_treinamento)

In [None]:
previsoes = random_forest_census.predict(X_census_teste)
previsoes

In [None]:
y_census_teste

In [None]:
from sklearn.metrics import accuracy_score, classification_report
accuracy_score(y_census_teste, previsoes)

In [None]:
from yellowbrick.classifier import ConfusionMatrix
cm = ConfusionMatrix(random_forest_census)
cm.fit(X_census_treinamento, y_census_treinamento)
cm.score(X_census_teste, y_census_teste)

In [None]:
print(classification_report(y_census_teste, previsoes))

## Regras