In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import export_text
from sklearn.model_selection import train_test_split

In [58]:
# Toy dataset: one (feature_1, feature_2, label) record per animal.
# The tree rules printed further down name the two features "Size" and "Weight".
# Note: several feature pairs occur twice with different labels
# (e.g. (30, 10) is both "dog" and "cat"), so no classifier can fit this data perfectly.
_labelled_samples = [
    (20, 4, "cat"), (22, 5, "cat"), (25, 6, "cat"),
    (30, 10, "dog"), (32, 12, "dog"), (35, 14, "dog"),
    (40, 18, "dog"), (45, 20, "dog"), (48, 22, "dog"),
    (52, 25, "horse"), (55, 28, "horse"), (60, 30, "horse"),
    (62, 32, "horse"), (65, 35, "horse"), (70, 40, "horse"),
    (30, 10, "cat"), (45, 20, "horse"), (60, 30, "dog"),
    (18, 3, "cat"), (21, 4, "cat"), (24, 5, "cat"),
    (28, 9, "dog"), (33, 11, "dog"), (36, 15, "dog"),
    (25, 6, "dog"), (55, 28, "dog"), (40, 18, "horse"),
    (68, 38, "horse"), (72, 42, "horse"), (66, 36, "horse"),
]

# Feature matrix, one row per sample and one column per feature.
X = np.array([[first, second] for first, second, _ in _labelled_samples])

# Class label of each row of X, in the same order.
y = np.array([label for _, _, label in _labelled_samples])



In [79]:
'''
Perform three train/test splits:

80% / 20%
70% / 30%
60% / 40%
For each split:

Train a DecisionTreeClassifier
Predict on the test set
Print accuracy
Print confusion matrix
Print tree rules using export_text
Compare how the different split ratios affect the results
'''

def nicepersc(perc):
    """Format a test-set fraction as a 'train % / test%' label.

    Parameters
    ----------
    perc : float
        Fraction of the data used as the test set (e.g. 0.2).

    Returns
    -------
    str
        Label such as '80.0 % / 20.0%' (train share first, test share second).
    """
    # Round away binary floating-point noise: 0.07 * 100 evaluates to
    # 7.000000000000001 and 0.29 * 100 to 28.999999999999996, which would
    # otherwise leak into the printed label.
    test_pct = round(perc * 100, 10)
    train_pct = round(100 - test_pct, 10)
    return f'{train_pct} % / {test_pct}%'
    
def DecisionTreesTrainsplit(perc):
    """Train and evaluate a decision tree on one train/test split of X, y.

    Splits the notebook-level arrays ``X`` and ``y`` with ``test_size=perc``,
    fits a gini ``DecisionTreeClassifier``, and prints the split label, the
    test accuracy, the confusion matrix and the learned tree rules.

    Parameters
    ----------
    perc : float
        Value passed to ``train_test_split`` as ``test_size``.

    Returns
    -------
    float
        Accuracy of the fitted tree on the test part of the split.
    """
    print('-'*50)
    print(nicepersc(perc))

    # Fixed random_state so the same perc always yields the same split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=perc, random_state=42)

    # Train (fit); random_state also pins the tree's tie-breaking.
    clf = DecisionTreeClassifier(criterion='gini', random_state=42)
    clf.fit(X_train, y_train)

    # Predict on the held-out samples.
    y_pred = clf.predict(X_test)

    # Accuracy on the test set.
    accuracy = accuracy_score(y_test, y_pred)
    print(f'accuracy is ({accuracy})')

    # Confusion matrix. Pin the label set to every class in the full dataset:
    # without `labels`, the matrix only covers classes that occur in y_test or
    # y_pred, so with small test sets its shape and row order could differ
    # between splits and the matrices would not be comparable.
    class_labels = np.unique(y)
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)
    print(f'cm labels (rows = true, columns = predicted): {class_labels.tolist()}')
    print(f'cm is \n({cm})\n')

    # Human-readable decision rules of the fitted tree.
    rules = export_text(clf, feature_names=["Size", "Weight"])
    print(rules,'-'*50,'\n')
    return accuracy
    
# Test-set fractions to compare: 80/20, 70/30 and 60/40 train/test splits.
splitpesc=[0.2,0.3,0.4]

# One test accuracy per entry of splitpesc, in the same order.
accuracy_list=[DecisionTreesTrainsplit(perc) for perc in splitpesc]

# The most accurate split is the one with the HIGHEST accuracy.
# (The previous version used min() here and therefore reported, and later
# reused via best_split, the least accurate split.)
# On ties, index() keeps the first, i.e. the smallest test fraction.
best_acc_index = accuracy_list.index(max(accuracy_list))
best_split=splitpesc[best_acc_index]

print(f'the most accurate train split was {nicepersc(best_split)} \n'
      f'with the accuracy of {accuracy_list[best_acc_index]}\n')


--------------------------------------------------
80.0 % / 20.0%
accuracy is (0.5)
cm is 
([[0 1 0]
 [0 2 1]
 [0 1 1]])

|--- Size <= 26.50
|   |--- Weight <= 5.50
|   |   |--- class: cat
|   |--- Weight >  5.50
|   |   |--- class: cat
|--- Size >  26.50
|   |--- Weight <= 29.00
|   |   |--- Size <= 37.50
|   |   |   |--- class: dog
|   |   |--- Size >  37.50
|   |   |   |--- Weight <= 19.00
|   |   |   |   |--- class: dog
|   |   |   |--- Weight >  19.00
|   |   |   |   |--- Size <= 50.00
|   |   |   |   |   |--- class: dog
|   |   |   |   |--- Size >  50.00
|   |   |   |   |   |--- class: dog
|   |--- Weight >  29.00
|   |   |--- class: horse
 -------------------------------------------------- 

--------------------------------------------------
70.0 % / 30.0%
accuracy is (0.5555555555555556)
cm is 
([[0 1 0]
 [1 2 1]
 [0 1 3]])

|--- Size <= 26.50
|   |--- class: cat
|--- Size >  26.50
|   |--- Weight <= 16.00
|   |   |--- class: dog
|   |--- Weight >  16.00
|   |   |--- Weight <= 

In [82]:
# Classify a previously unseen sample with features [34, 17].
# The tree is refit on the training part of the split selected above (best_split),
# using the same seeds as during the comparison so the model is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=best_split,
    random_state=42,
)

# fit() returns the estimator itself, so construction and training can be chained.
clf = DecisionTreeClassifier(criterion='gini', random_state=42).fit(X_train, y_train)

# predict() expects a 2-D array: one row per sample, one column per feature.
newx = np.array([[34, 17]])
y_pred = clf.predict(newx)
print(f' the class prediction for the features {newx} is {y_pred}')

 the class prediction for the features [[34 17]] is ['dog']
