In [1]:
#Biblioteca para carga dos dados
import pandas as pd 

In [2]:
#Definção dos títulos das colunas
headers = ['ESCT', 'NDEP', 'RENDA', 'TIPOR', 'VBEM', 'NPARC',
           'VPARC', 'TEL', 'IDADE', 'RESMS', 'ENTRADA', 'CLASSE']


#Carga do conjunto de treino
#Conjunto de dados disponível do repositório do Prof. Eduardo Bezerra (CEFET/RJ)
arquivo = 'https://raw.githubusercontent.com/MLRG-CEFET-RJ/ml-class/master/ml-t3/datasets/credtrain.txt'
data_train =pd.read_csv(arquivo, sep='\t', header=None, names=headers)

#Carga do conjunto de teste
#Conjunto de dados disponível do repositório do Prof. Eduardo Bezerra (CEFET/RJ)
arquivo = 'https://raw.githubusercontent.com/MLRG-CEFET-RJ/ml-class/master/ml-t3/datasets/credtest.txt'
data_test = pd.read_csv(arquivo, sep='\t', header=None, names=headers)



In [3]:
data_test.head()

Unnamed: 0,ESCT,NDEP,RENDA,TIPOR,VBEM,NPARC,VPARC,TEL,IDADE,RESMS,ENTRADA,CLASSE
0,0,2,500,1,618,10,85,0,36,6,0,0
1,1,0,813,0,552,4,119,0,43,48,119,1
2,3,0,350,0,488,12,66,0,43,0,0,1
3,1,0,1530,0,381,1,398,0,28,48,0,1
4,0,0,688,1,396,10,60,0,49,72,0,1


In [4]:
#Biblioteca para transformação dos dados em matrizes
import numpy as np

In [5]:
#Transoformação dos atributos e da classe alvo em matrizes
X_train_ = np.array(data_train.iloc[:, 0:11])
y_train_ = np.array(data_train['CLASSE'])

#Transformação dos atributos e da classe alvo em matrizes
X_test = np.array(data_test.iloc[:, 0:11])
y_test = np.array(data_test['CLASSE'])

In [6]:
X_test

array([[   0,    2,  500, ...,   36,    6,    0],
       [   1,    0,  813, ...,   43,   48,  119],
       [   3,    0,  350, ...,   43,    0,    0],
       ...,
       [   0,    3, 1200, ...,   39,    6,    0],
       [   1,    0,  600, ...,   33,    6,    0],
       [   0,    0,  800, ...,   29,    6,    0]], shape=(577, 11))

In [7]:
#Função para fatiamento dos conjuntos de dados
from sklearn.model_selection import train_test_split

In [8]:
#Separação de treino e validação
X_train, X_val, y_train, y_val = train_test_split(X_train_, y_train_, #Conjuntos de dados
                                                  train_size=0.8,     #Tamanho da fatia de treinamento
                                                  random_state=31)

In [9]:
#Checagem rápida de parte dos dados carregados
print(data_train.head(),
      data_test.head(),
      X_train[0],
      y_train[0],
      sep='\n\n')

   ESCT  NDEP  RENDA  TIPOR  VBEM  NPARC  VPARC  TEL  IDADE  RESMS  ENTRADA  \
0     1     0    360      0   313      9     52    0     25     48        0   
1     0     0    350      1   468     10     65    0     33      6        0   
2     0     0   1100      0   829      9    125    0     56     48        0   
3     0     0   3000      0   552     12     76    1     31     60        0   
4     1     0   1000      0   809     12    111    0     24      7        0   

   CLASSE  
0       1  
1       1  
2       1  
3       1  
4       1  

   ESCT  NDEP  RENDA  TIPOR  VBEM  NPARC  VPARC  TEL  IDADE  RESMS  ENTRADA  \
0     0     2    500      1   618     10     85    0     36      6        0   
1     1     0    813      0   552      4    119    0     43     48      119   
2     3     0    350      0   488     12     66    0     43      0        0   
3     1     0   1530      0   381      1    398    0     28     48        0   
4     0     0    688      1   396     10     60    0     

In [10]:
data_train.describe()

Unnamed: 0,ESCT,NDEP,RENDA,TIPOR,VBEM,NPARC,VPARC,TEL,IDADE,RESMS,ENTRADA,CLASSE
count,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0,1500.0
mean,0.512,0.126667,984.512,0.445333,562.031333,8.392667,102.247333,0.134,41.210667,31.528667,32.255333,0.476667
std,0.659909,0.572865,905.53292,0.497168,265.690646,3.609131,62.987325,0.340766,13.202266,52.662099,94.307614,0.499622
min,0.0,0.0,300.0,0.0,300.0,1.0,50.0,0.0,19.0,0.0,0.0,0.0
25%,0.0,0.0,467.5,0.0,410.0,6.0,66.0,0.0,31.0,6.0,0.0,0.0
50%,0.0,0.0,650.0,0.0,490.0,10.0,83.0,0.0,39.0,6.0,0.0,0.0
75%,1.0,0.0,1200.0,1.0,618.0,10.0,118.0,0.0,52.0,48.0,0.0,1.0
max,3.0,7.0,8000.0,1.0,4000.0,24.0,711.0,1.0,70.0,420.0,1300.0,1.0


In [11]:
#Importando o algoritmo que será usado como base
from sklearn.tree import DecisionTreeClassifier

In [12]:
#Criação do modelo
modelo = DecisionTreeClassifier(max_depth=3,
                                random_state=31)

In [13]:
#Treinamento do modelo
modelo.fit(X_train,y_train)

In [14]:
#Visualização gráfica da árvore de decisão
import os
from graphviz import Source
from sklearn.tree import export_graphviz

export_graphviz(modelo,
                out_file='credit_tree.dot', #Arquivo para armazenamento do modelo gráfico
                feature_names=headers[0:11], #Nomes dos atributos
                rounded=True,
                filled=True
                )

ModuleNotFoundError: No module named 'graphviz'