# Árvores de Regressão - (ExtraTrees)

O algoritmo de machine learning ExtraTrees (Extremely Randomized Trees) cria muitas árvores de decisão de maneira aleatória, para então através da combinação dos resultados de cada árvore encontrar a resposta final.
Seu principal difrencial está no fato deste processo ser extremamento aleatóriom, contribuido assim para modelos mais generalizáveis.

In [40]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

In [41]:
# Realizando a leitura da base de Dados contendo os dados de vinhos para analise
df = pd.read_csv('wine_dataset.csv')
# Analisando a estrutura de Base dados que temos
df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,style
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


**Realizado Analise dos dados, para tratamento de Dados em Necessidades** <p>

In [42]:
# Analisa base de dados
def Verificar_DataSet( Base_Dados ):

  # Verificando dimensão
  Dimensao = Base_Dados.shape
  print(f'Base de dados possui { Dimensao[0] } Linhas e {Dimensao[1] } Colunas')
  print('-' * 50 )

  # Campos únicos
  Campos_Unicos = Base_Dados.nunique()
  print(f'Campos únicos')
  print( pd.DataFrame( Campos_Unicos, columns=['Quantidade_Campos'] ) )
  print('-' * 50 )

  # Campos nulos
  Campos_Nulus = Base_Dados.isnull().sum()
  print(f'Campos Nulos')
  print( pd.DataFrame( Campos_Nulus, columns=['Quantidade_Campos'] ) )
  print('-' * 50 )

# Chamando a função
Verificar_DataSet(df)

Base de dados possui 6497 Linhas e 13 Colunas
--------------------------------------------------
Campos únicos
                      Quantidade_Campos
fixed_acidity                       106
volatile_acidity                    187
citric_acid                          89
residual_sugar                      316
chlorides                           214
free_sulfur_dioxide                 135
total_sulfur_dioxide                276
density                             998
pH                                  108
sulphates                           111
alcohol                             111
quality                               7
style                                 2
--------------------------------------------------
Campos Nulos
                      Quantidade_Campos
fixed_acidity                         0
volatile_acidity                      0
citric_acid                           0
residual_sugar                        0
chlorides                             0
free_sulfur_dioxide      

In [44]:
# Estatísticas básicas
df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed_acidity,6497.0,7.215307,1.296434,3.8,6.4,7.0,7.7,15.9
volatile_acidity,6497.0,0.339666,0.164636,0.08,0.23,0.29,0.4,1.58
citric_acid,6497.0,0.318633,0.145318,0.0,0.25,0.31,0.39,1.66
residual_sugar,6497.0,5.443235,4.757804,0.6,1.8,3.0,8.1,65.8
chlorides,6497.0,0.056034,0.035034,0.009,0.038,0.047,0.065,0.611
free_sulfur_dioxide,6497.0,30.525319,17.7494,1.0,17.0,29.0,41.0,289.0
total_sulfur_dioxide,6497.0,115.744574,56.521855,6.0,77.0,118.0,156.0,440.0
density,6497.0,0.994697,0.002999,0.98711,0.99234,0.99489,0.99699,1.03898
pH,6497.0,3.218501,0.160787,2.72,3.11,3.21,3.32,4.01
sulphates,6497.0,0.531268,0.148806,0.22,0.43,0.51,0.6,2.0


In [43]:
# Estatísticas básicas - Categoricos
df.describe( include=['O'] ).transpose()

Unnamed: 0,count,unique,top,freq
style,6497,2,white,4898


In [48]:
# Analisando Diferentes tipos de vilho!!!
df.groupby(['style']).sum()

Unnamed: 0_level_0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,wine_type
style,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
red,13303.1,843.985,433.29,4059.55,139.859,25384.0,74302.0,1593.79794,5294.47,1052.38,16666.35,9012,0
white,33574.75,1362.825,1636.87,31305.15,224.193,172939.0,677690.5,4868.74609,15616.13,2399.27,51498.88,28790,4898


**Criação de Varíaveis e Dados para facilitar criação de aprendizagem de maquina** <p> 

In [45]:
# Converte a coluna 'Style' em valores numéricos e armazenando na coluna 'wine_type'
df['wine_type'] = df['style'].map({'red': 0, 'white': 1})

df.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,style,wine_type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red,0


In [49]:
# Realizado a Armazenamento dos Vinhos para ter como Recursos
y = df['style']
# Criando uma base sem os dados de 'style' para criar previsões como uma base de testes(Rótulos)
X = df.drop('style', axis=1).reset_index()

In [50]:
# Criando variaveis com os dados das Varaiveis X e Y para realizar o teste comparativo e ensinar como funciona os detalhes dos vinhos
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Criando um Modelo para iniciar aprendizagem
modelo = GaussianNB()
# Definindo quais dados deve ser utilizado para realizar a aprendizagem.
modelo.fit(X_train, y_train)

In [53]:
# Definindo as bases para criar predição
y_pred = modelo.predict(X_test)
print("Acurácia:", y_pred)

# Realiza a Validação para identificar qual o % de compatibilidade(Acurácia)
accuracy = accuracy_score(y_test, y_pred)
print("Acurácia:", accuracy)

Acurácia: ['white' 'red' 'white' ... 'white' 'white' 'white']
Acurácia: 1.0


In [54]:
# Exibe o relatório de classificação
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         red       1.00      1.00      1.00       341
       white       1.00      1.00      1.00       959

    accuracy                           1.00      1300
   macro avg       1.00      1.00      1.00      1300
weighted avg       1.00      1.00      1.00      1300

