In [1]:
# -*- coding: utf-8 -*-
########### Banco de dados aleatório usado para prática.
### By: Gabriel Santiago
### Data: 08-07-2020
### Algoritmo: Árvore de Decisão
### Base de dados: bank-full // Base de cadastro dos clientes de um banco.
### Base de dados para estudo fornecida por: archive.ics.uci.edu
### Objetivo: baseado em um histórico de dados, predizer se um cliente irá ou não assinar um depósito a prazo.
### Segue uma explicação sobre cada atributo do banco.

In [2]:
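### Descrição do banco de dados usado:
### ATENÇÃO: a lista abaixo tem 20 atributos de entrada, mas o código deste notebook lê 16 previsores
### (colunas 0-15) e a classe na coluna 16 de 'bank-full.csv'. A descrição parece ser a da variante
### 'bank-additional' do mesmo conjunto de dados -- confirme qual arquivo está realmente em uso.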
#Input variables:
# bank client data:
#1 - age (numeric)
#2 - job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
#3 - marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
#4 - education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
#5 - default: has credit in default? (categorical: 'no','yes','unknown')
#6 - housing: has housing loan? (categorical: 'no','yes','unknown')
#7 - loan: has personal loan? (categorical: 'no','yes','unknown')
## related with the last contact of the current campaign:
#8 - contact: contact communication type (categorical: 'cellular','telephone')
#9 - month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
#10 - day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
#11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
# other attributes:
#12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
#13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
#14 - previous: number of contacts performed before this campaign and for this client (numeric)
#15 - poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
# social and economic context attributes
#16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)
#17 - cons.price.idx: consumer price index - monthly indicator (numeric)
#18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)
#19 - euribor3m: euribor 3 month rate - daily indicator (numeric)
#20 - nr.employed: number of employees - quarterly indicator (numeric)

#Output variable (desired target):
#21 - y - has the client subscribed a term deposit? (binary: 'yes','no')

In [3]:
import pandas as pd

# Load the bank marketing table from the CSV that sits one directory above the notebook.
# read_csv is called with its defaults, so the file is parsed as comma-separated.
# NOTE(review): the copy distributed by UCI is ';'-separated -- confirm this file was converted.
caminho_base = '../bank-full.csv'
base = pd.read_csv(caminho_base)

In [4]:
# Split the table into predictors and target by column position:
# the first 16 columns (indices 0-15) are the inputs, the column at index 16 is the class label.
# `.values` turns each selection into a NumPy array.
classe = base.iloc[:, 16].values
previsores = base.iloc[:, :16].values

In [5]:
"""
## Alternativa desativada (mantida apenas como referência): aplicando o LabelEncoder:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
previsores[:, 1] = label_encoder.fit_transform( previsores[:, 1] )
previsores[:, 2] = label_encoder.fit_transform( previsores[:, 2] )
previsores[:, 3] = label_encoder.fit_transform( previsores[:, 3] )
previsores[:, 4] = label_encoder.fit_transform( previsores[:, 4] )
previsores[:, 6] = label_encoder.fit_transform( previsores[:, 6] )
previsores[:, 7] = label_encoder.fit_transform( previsores[:, 7] )
previsores[:, 8] = label_encoder.fit_transform( previsores[:, 8] )
previsores[:, 10] = label_encoder.fit_transform( previsores[:, 10] )
previsores[:, 15] = label_encoder.fit_transform( previsores[:, 15] )
"""
# One-hot encode the categorical predictor columns; every other column is passed through unchanged.
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer

# Positional indices of the columns handed to OneHotEncoder.
# NOTE(review): presumably job, marital, education, default, housing, loan, contact,
# month and poutcome in the bank-full.csv layout -- confirm against the file.
colunas_categoricas = [1, 2, 3, 4, 6, 7, 8, 10, 15]

onehotencorder = ColumnTransformer(
    transformers=[("OneHot", OneHotEncoder(), colunas_categoricas)],
    remainder='passthrough',
)

# `previsores` is overwritten with its encoded version, so this cell is not idempotent:
# re-run the cell that builds `previsores` before running this one again.
previsores = onehotencorder.fit_transform(previsores)

In [6]:
# Escalonamento (desativado): descomente as linhas abaixo para padronizar os previsores com StandardScaler.
#from sklearn.preprocessing import StandardScaler

#scaler = StandardScaler()
#previsores = scaler.fit_transform( previsores )

In [7]:
# Hold out part of the data for testing.
from sklearn.model_selection import train_test_split

# 15% of the rows go to the test set; random_state=0 fixes the shuffle so the split is reproducible.
divisao = train_test_split(
    previsores,
    classe,
    test_size=0.15,
    random_state=0,
)
previsores_treinamento, previsores_teste, classe_treinamento, classe_teste = divisao

In [8]:
# Train a decision tree on the training split and predict the held-out rows.
from sklearn.tree import DecisionTreeClassifier

# Entropy is the split criterion; random_state=0 keeps the fitted tree reproducible.
classificador = DecisionTreeClassifier(random_state=0, criterion='entropy')
classificador.fit(previsores_treinamento, classe_treinamento)

# Predicted class labels for the test predictors.
previsoes = classificador.predict(previsores_teste)

In [9]:
# Evaluate the classifier on the test split.
from sklearn.metrics import accuracy_score, confusion_matrix

# accuracy_score compares the true test labels with the predictions.
# The variable name and the message say "precisão", but the metric computed here is accuracy.
precisao = accuracy_score(y_true=classe_teste, y_pred=previsoes)
print(f'A precisão do algorítimo foi de: {precisao}')

A precisão do algorítimo foi de: 0.8798289590091418


In [10]:
## Acurácia (accuracy_score) sem escalonamento e com LabelEncoder: 0.8768799764081392
## Acurácia (accuracy_score) com escalonamento e com LabelEncoder: 0.8779121203184901
## Acurácia (accuracy_score) com escalonamento e com OneHotEncoder: 0.8786493659687408
## Acurácia (accuracy_score) sem escalonamento e com OneHotEncoder: 0.8798289590091418
