# Decision Trees Classifiers with Scikit-Learn using Gini Index and Entropy functions

## 1. Importing Libraries

In [8]:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn import tree

## 2. Importing and Analyzing Dataset

In [9]:
data_set = pd.read_csv('Datasets/balance+scale/balance-scale.data', sep=',', header=None)

print("Dataset Length: ", len(data_set))
print("Dataset Shape: ", data_set.shape)
print("Dataset: ", data_set.head())

Dataset Length:  625
Dataset Shape:  (625, 5)
Dataset:     0  1  2  3  4
0  B  1  1  1  1
1  R  1  1  1  2
2  R  1  1  1  3
3  R  1  1  1  4
4  R  1  1  1  5


## 3. Splitting the Data

In [10]:
X = data_set.values[:, 1:5] #gets all the values from all the rows and the columns from index 1 to 4
y = data_set.values[:, 0]   #gets all the values from all the rows and the first column (index 0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 100)


## 4. Training with Gini Index
* Gini Index: metric to measuer how often a randomly chosen element would be incorreclty indentified. A lower gini index is preferred
* Forumal:  Gini Index =1−∑(*pi*)^2 


In [11]:
dtc_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100, max_depth = 3, min_samples_leaf = 5)   
# max_depth limits the maximum depth of the tree to 3 levels
# min_samples_leaf means that each node must have at least 5 data points to be considered a leaf node

#Fitting the data
dtc_gini.fit(X_train, y_train)


## 5. Training with Entropy

In [12]:
dtc_entropy = DecisionTreeClassifier(criterion = 'entropy', random_state = 100, max_depth = 3, min_samples_leaf = 5)
#Parameters same as gini index

#Fitting the data
dtc_entropy.fit(X_train, y_train)

## 6. Prediction and Evaluation

In [13]:
#Predictions for gini index
y_pred_gini = dtc_gini.predict(X_test)

#Predicitions for entropy
y_pred_entropy = dtc_entropy.predict(X_test)


#Calculations for gini predictions
conf_matrix_gini = confusion_matrix(y_test, y_pred_gini)
accuracy_gini = accuracy_score(y_test, y_pred_gini)
classification_rep_gini = classification_report(y_test, y_pred_gini)

print("Gini Index Confusion Matrix: ", conf_matrix_gini)
print("Gini Index Accuracy Score: ", accuracy_gini)
print("Gini Index Classification Report: ", classification_rep_gini)


#Calculations for entropy
conf_matrix_entropy = confusion_matrix(y_test, y_pred_entropy)
accuracy_entropy = accuracy_score(y_test, y_pred_entropy)
classification_rep_entropy = classification_report(y_test, y_pred_entropy)

print("Entropy Confusion Matrix: ", conf_matrix_entropy)
print("Entropy Accuracy Score: ", accuracy_entropy)
print("Classification Report: ", classification_rep_entropy)



Gini Index Confusion Matrix:  [[ 0  6  7]
 [ 0 67 18]
 [ 0 19 71]]
Gini Index Accuracy Score:  0.7340425531914894
Gini Index Classification Report:                precision    recall  f1-score   support

           B       0.00      0.00      0.00        13
           L       0.73      0.79      0.76        85
           R       0.74      0.79      0.76        90

    accuracy                           0.73       188
   macro avg       0.49      0.53      0.51       188
weighted avg       0.68      0.73      0.71       188

Entropy Confusion Matrix:  [[ 0  6  7]
 [ 0 63 22]
 [ 0 20 70]]
Entropy Accuracy Score:  0.7074468085106383
Classification Report:                precision    recall  f1-score   support

           B       0.00      0.00      0.00        13
           L       0.71      0.74      0.72        85
           R       0.71      0.78      0.74        90

    accuracy                           0.71       188
   macro avg       0.47      0.51      0.49       188
weighted avg

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## 7. Plotting Decision Tree



In [14]:
#Plot for Gini Index
plt.figure(figsize = (15,10))
plot_tree(dtc_gini, filled = True, feature_names = ['X1', 'X2', 'X3', 'X4'], class_names = ['L', 'B', 'R'], rounded = True)
plt.show()

#Plot for Entropy
plt.figure(figsize = (15,10))
plot_tree(dtc_entropy, filled = True, feature_names = ['X1', 'X2', 'X3', 'X4'], class_names = ['L', 'B', 'R'], rounded = True)
plt.show()

NameError: name 'plot_tree' is not defined

<Figure size 1500x1000 with 0 Axes>