Decision Tree Classifier


In [1]:
###### Set Up #####
# Make sure the ds-assets folder (data and module assets) is available locally.
# If the folder already exists, bring it up to date with `git pull`.
!test -e ds-assets && cd ds-assets && git pull && cd ..
# If the folder does not exist yet, clone it from GitHub.
!test ! -e ds-assets && git clone https://github.com/lutzhamel/ds-assets.git
# Relative path of the assets folder inside the cloned repository.
home = "ds-assets/assets/" 
import sys
sys.path.append(home)      # add the assets folder to the module search path

Cloning into 'ds-assets'...
remote: Enumerating objects: 168, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 168 (delta 0), reused 2 (delta 0), pack-reused 164[K
Receiving objects: 100% (168/168), 7.40 MiB | 30.07 MiB/s, done.
Resolving deltas: 100% (60/60), done.


In [2]:
import pandas as pd
from treeviz import tree_print
from sklearn import tree
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from confint import classification_confint
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV


In [11]:
pip install dtreeviz




In [12]:
from dtreeviz.trees import dtreeviz # remember to load the package



In [3]:
# load the cow data set from the working directory and preview the first five rows
df = pd.read_csv("CowData_new.csv")
df.head()

Unnamed: 0,Fat...D,CU.Protein,Tot.Solids,Denovo.FA,Mixed.FA,Performed.FA,DeNovo.Cal.Rel.,Mixed.Cal.Rel..,Preformed.Cal.Rel..,BHB,Blood.NEFA..ueq.L.,preg
0,2.9387,2.9093,11.457,0.69,1.15,0.94,24.77,41.48,33.74,0.11,21.29,False
1,3.2224,2.8329,11.508,0.81,1.33,0.97,25.94,42.83,31.22,0.11,-79.51,False
2,2.4844,2.7936,10.819,0.61,1.06,0.7,25.62,44.85,29.52,0.09,9.6,False
3,2.9304,2.5808,11.294,0.71,1.22,0.84,25.78,43.94,30.28,0.08,105.36,False
4,3.7864,2.7025,12.235,0.92,1.39,1.37,24.99,37.82,37.18,0.14,234.24,False


In [4]:
# (rows, columns) of the raw data frame as loaded
df.shape

(2153, 12)

In [5]:
# discard every row that contains at least one missing value (rebinds df to the cleaned frame)
df = df.dropna()

In [6]:
# list the column names; 'preg' is used as the target in the cells below
df.columns

Index(['Fat...D', 'CU.Protein', 'Tot.Solids', 'Denovo.FA', 'Mixed.FA',
       'Performed.FA', 'DeNovo.Cal.Rel.', 'Mixed.Cal.Rel..',
       'Preformed.Cal.Rel..', 'BHB', 'Blood.NEFA..ueq.L.', 'preg'],
      dtype='object')

In [7]:
# separate the target column ('preg') from the predictor columns
y = df['preg']
X = df.drop(columns=['preg'])

In [8]:
# class balance of the target column
df['preg'].value_counts()

False    1305
True      847
Name: preg, dtype: int64

In [9]:
# split the data: 75% of the rows for training, 25% held out for testing;
# the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    test_size=0.25,
    random_state=2,
)

In [10]:

# set up the tree model object - no complexity limit is set on the estimator itself;
# only the random seed is fixed. The grid search cells below try different max_depth values.
model = tree.DecisionTreeClassifier(random_state=1)

In [11]:
# hyper-parameter candidates explored by the grid search
param_grid = dict(
    max_depth=[*range(3, 20)],       # tree depths 3..19 (inclusive)
    criterion=['entropy', 'gini'],   # split-quality measures to compare
)
    

In [12]:
# wrap the (still unfitted) tree in a 5-fold cross-validated grid search over param_grid;
# the actual fitting happens when grid.fit is called
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)

In [13]:
# run the cross-validated grid search on the training split
grid.fit(X_train, y_train)

# report the winning hyper-parameter combination
print(f"Best parameters: {grid.best_params_}")

# show the structure of the best tree found by the search
print("Best tree:")
tree_print(grid.best_estimator_, X_train)

# accuracy of the best tree on the same training rows it was fitted on
predict_y = grid.best_estimator_.predict(X_train)
acc = accuracy_score(y_train, predict_y)
print(f"Accuracy of optimal classifier: {acc:3.2f}")

Best parameters: {'criterion': 'gini', 'max_depth': 6}
Best tree:
if CU.Protein =< 3.1266000270843506: 
  |then if Denovo.FA =< 0.5149999856948853: 
  |  |then if BHB =< 0.06249999813735485: 
  |  |  |then if Mixed.Cal.Rel.. =< 43.454999923706055: 
  |  |  |  |then if Mixed.FA =< 0.42500001192092896: 
  |  |  |  |  |then True
  |  |  |  |  |else if CU.Protein =< 2.5539000034332275: 
  |  |  |  |  |  |then True
  |  |  |  |  |  |else False
  |  |  |  |else True
  |  |  |else if CU.Protein =< 2.590000033378601: 
  |  |  |  |then if Fat...D =< 3.4764000177383423: 
  |  |  |  |  |then False
  |  |  |  |  |else True
  |  |  |  |else if BHB =< 0.1850000023841858: 
  |  |  |  |  |then if Mixed.Cal.Rel.. =< 33.26250076293945: 
  |  |  |  |  |  |then False
  |  |  |  |  |  |else True
  |  |  |  |  |else False
  |  |else if CU.Protein =< 2.362850069999695: 
  |  |  |then if DeNovo.Cal.Rel. =< 16.914999961853027: 
  |  |  |  |then False
  |  |  |  |else if Mixed.Cal.Rel.. =< 36.114999771118164: 


In [None]:
# Visualise the tuned decision tree with dtreeviz.
# The tree, feature names and class names all come from this notebook's own
# objects: the names `clf` and `iris` from the dtreeviz iris example are not
# defined here and would raise a NameError.
viz = dtreeviz(grid.best_estimator_, X_train, y_train,
               target_name="preg",
               feature_names=list(X_train.columns),
               # class labels as learned by the fitted tree, rendered as strings
               class_names=[str(label) for label in grid.best_estimator_.classes_])

viz


In [None]:
#tree_print(model,X_train)

In [14]:
# predict on the training rows; predict() returns an array of labels
predict_array = grid.best_estimator_.predict(X_train)
# turn the array into a one-column DataFrame whose column carries the target's
# actual name (taken from y_train) and whose index matches y_train row for row
predicted_labels = pd.DataFrame(predict_array, index=y_train.index, columns=[y_train.name])
#predicted_labels

In [15]:
from sklearn.metrics import accuracy_score

# accuracy of the tuned tree's predictions measured against the training labels
print(f"Our model accuracy is: {accuracy_score(y_train, predicted_labels)}")

Our model accuracy is: 0.6604708798017348


In [16]:
# class balance of the full target (before the train/test split)
y.value_counts()

False    1305
True      847
Name: preg, dtype: int64

In [17]:
# how often each label was predicted on the training rows
predicted_labels.value_counts()

Outcome
False      1506
True        108
dtype: int64

In [None]:
#predicted_labels.head()

In [None]:
# NOTE(review): despite its name, `train` holds the full target `y` (not y_train) as a NumPy array;
# it is not used by any other cell shown here - confirm whether it is still needed.
train=y.to_numpy()

In [None]:
# number of predictions produced for the training rows
len(predict_array)

1614

In [18]:
 #build and print the confusion matrix 
# rows are the true training labels, columns the predicted labels;
# the boolean target values compare equal to the integer labels 0 and 1
labels = [0, 1]
cm = confusion_matrix(y_train, predict_array, labels=labels)
cm_df = pd.DataFrame(data=cm, index=labels, columns=labels)
print(f"Confusion Matrix:\n{cm_df}")

Confusion Matrix:
     0    1
0  964    6
1  542  102


In [None]:
#cm

Test: evaluating the tuned tree on the held-out test set

In [None]:
#dft = pd.read_csv("DT_test.csv")
#dft.head(5)

In [None]:
#Xt  = dft.drop(['Outcome'],axis=1)
#yt = dft['Outcome']

In [19]:
# predict labels for the held-out test split with the tuned tree
y_test_model = grid.best_estimator_.predict(X_test)
# accuracy of those predictions against the true test labels
acc = accuracy_score(y_test, y_test_model)
acc

0.6133828996282528

In [20]:
# lower/upper bounds for the test accuracy, as returned by classification_confint
# for the given accuracy and number of test rows
observations = len(X_test)
lb, ub = classification_confint(acc, observations)
print(f"Accuracy: {acc:3.2f} ({lb:3.2f},{ub:3.2f})")

Accuracy: 0.61 (0.57,0.65)


In [21]:
# class balance of the true labels in the test split
y_test.value_counts()

False    335
True     203
Name: preg, dtype: int64

In [22]:
# wrap the predicted test labels in a DataFrame so they can be tallied
yt_m = pd.DataFrame(y_test_model)
# how often each label was predicted on the test split
yt_m.value_counts()

False    497
True      41
dtype: int64

In [23]:
# build and print the confusion matrix 
# rows are the true test labels, columns the predicted labels;
# the boolean target values compare equal to the integer labels 0 and 1
labels = [0, 1]
cm = confusion_matrix(y_test, y_test_model, labels=labels)
cm_df = pd.DataFrame(data=cm, index=labels, columns=labels)
print(f"Confusion Matrix:\n{cm_df}")

Confusion Matrix:
     0   1
0  312  23
1  185  18


In [25]:
from sklearn.metrics import classification_report

# per-class precision / recall / f1 on the test split; the two classes are displayed as '0' and '1'
target_names = ['0', '1']
report = classification_report(y_test, y_test_model, target_names=target_names)
print(report)

              precision    recall  f1-score   support

           0       0.63      0.93      0.75       335
           1       0.44      0.09      0.15       203

    accuracy                           0.61       538
   macro avg       0.53      0.51      0.45       538
weighted avg       0.56      0.61      0.52       538

