# Decision Tree  
Decision Tree model:  
http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

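## Install graphviz to draw the tree
Use either pip or conda:  
`pip install graphviz` (Python package only; the Graphviz `dot` executable must be installed separately and be on `PATH`)  
`conda install python-graphviz` (also installs the Graphviz executables)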

In [1]:
! pip install graphviz



In [3]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier #
from sklearn import preprocessing
from sklearn import tree #
import graphviz #
import os

# Windows only: make the Graphviz binaries discoverable by adding their folder
# to PATH (macOS does not need this). The folder is appended only when it
# exists and is not already listed, so other machines are left untouched and
# re-running the cell does not keep growing PATH.
graphviz_bin = 'C:/Users/Student/Desktop/ML/hands-on_0525/example/release/bin'
search_path = os.environ.get("PATH", "")
if os.path.isdir(graphviz_bin) and graphviz_bin not in search_path.split(os.pathsep):
    os.environ["PATH"] = search_path + os.pathsep + graphviz_bin

# Load data
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples for evaluation (the split is random on each run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the scaler on the training split only, then apply it to both splits so
# no statistics from the test samples reach the model.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model = DecisionTreeClassifier(max_depth=4)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# normalize=False makes accuracy_score return the count of correct
# predictions instead of the fraction.
accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('con_matrix: {}'.format(con_matrix))


# Output the tree structure. Feature and class names are passed so the nodes
# show readable labels instead of X[i] and bare count vectors. The split
# thresholds are in standardized units because the model was trained on the
# scaled features.
dot_data = tree.export_graphviz(
    model,
    out_file=None,
    feature_names=list(iris.feature_names),
    class_names=list(iris.target_names),
    filled=True,
)
graph = graphviz.Source(dot_data)
graph.render("iris")  # save the rendered tree to a file named "iris"


number of correct sample: 28
accuracy: 0.9333333333333333
con_matrix: [[10  0  0]
 [ 0  9  0]
 [ 0  2  9]]


'iris.pdf'

# Naive Bayes 
Naive Bayes model:  
http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html  
http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html

In [7]:
from sklearn import datasets
from sklearn.naive_bayes import GaussianNB, MultinomialNB #
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import neighbors, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


# confusion_matrix is used below but was missing from this cell's imports, so
# the cell only worked if the Decision Tree cell had been run first.
from sklearn.metrics import confusion_matrix

iris = datasets.load_iris()

X = iris.data
y = iris.target

# Hold out 20% of the samples for evaluation (the split is random on each run).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

model = GaussianNB()  # p.24
# MultinomialNB() (p.20) cannot be swapped in here: it does not accept
# negative inputs, and standardized features contain them
# ("Negative values in data passed to MultinomialNB (input X)").
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# normalize=False makes accuracy_score return the count of correct
# predictions instead of the fraction.
accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('con_matrix: {}'.format(con_matrix))

number of correct sample: 28
accuracy: 0.9333333333333333
con_matrix: [[12  0  0]
 [ 0  9  2]
 [ 0  0  7]]


可以寫一個迴圈把所有的模型都跑一遍，觀察哪個模型比較適合這個數據集、表現比較好。  
Auto ML（廠商）：適用範圍很廣的通用模型，準確度通常較低。

如果樣本特徵的分佈大部分是連續值，使用GaussianNB會比較好；如果樣本特徵的分佈大部分是多元離散值，使用MultinomialNB比較合適；如果樣本特徵是二元離散值或者很稀疏的多元離散值，應該使用BernoulliNB。