<a href="https://colab.research.google.com/github/Edison1847/Decision-tree-simple-tutorial/blob/main/Decision%20tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Import libraries**

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # data visualization
import seaborn as sns # statistical data visualization
import graphviz
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn import tree
from sklearn.metrics import classification_report
%matplotlib


Ignore warnings

In [None]:
import warnings
warnings.filterwarnings('ignore')

# **2. Import the training dataset**

In [None]:
# Mount Google Drive into the Colab filesystem (Colab-only; relies on the `drive` import in the first cell).
drive.mount('/content/drive/')
# Switch the working directory to the Drive folder that holds the dataset.
# Every later relative path in this notebook resolves against this folder as well.
%cd /content/drive/My Drive/Colab Notebooks/Decision Tree/
# File name of the dataset, relative to the working directory set by %cd above.
data = 'datasheetHD.csv'
# Load the CSV into a DataFrame and preview its first rows as the cell output.
df = pd.read_csv(data)
df.head()

**Summary of the data set**

In [None]:
# Print each column's name, non-null count and dtype, plus the total row count.
df.info()

1. There are 11 features (age through ST slope); the last column (Ischemic Heart Disease) is the target variable to predict.
2. The dataset contains 1190 rows, one per individual.

---
### **To keep the tutorial simple, we skip data preprocessing**
---

# **3. Declare feature vectors (training data) and target variable (prediction)**

In [None]:
# Target: the column the model should learn to predict.
y = df['Ischemic Heart Disease']

# Features: every remaining column of the DataFrame.
X = df.drop(columns=['Ischemic Heart Disease'])

# **4. Split data into separate training and test set**

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=0,
)

In [None]:
# Show (rows, columns) for the training and test feature sets to confirm the split sizes.
X_train.shape, X_test.shape

In [None]:
# Preview the first five rows of the training features.
X_train.head()

In [None]:
# Preview the first five rows of the test features.
X_test.head()

**-- We now have training and test set ready for model building. --**

# **5. Instantiate and fit the Decision Tree**

In [None]:
# Build a decision tree that chooses splits by Gini impurity.
# max_depth=3 limits the tree to three levels of splits; random_state=0 makes the fit reproducible.
clf_gini = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=0)

# Fit on the training split only; fit() returns the estimator, which is shown as the cell output.
clf_gini.fit(X_train, y_train)

# **6. Prediction**

In [None]:
# Predict a label for every held-out row, then report the fraction predicted correctly.
y_pred_gini = clf_gini.predict(X_test)
print(
    'Model accuracy score with criterion gini index: {0:0.4f}'.format(
        accuracy_score(y_test, y_pred_gini)
    )
)

**Comparing the train-set and test-set accuracy to check for overfitting.**

In [None]:
# Predict on the rows the model was trained on, to compare against the test-set accuracy.
y_pred_train_gini = clf_gini.predict(X_train)
print(
    'Training-set accuracy score: {0:0.4f}'.format(
        accuracy_score(y_train, y_pred_train_gini)
    )
)

**Check for overfitting and underfitting**

In [None]:
# Print the classifier's score() (mean accuracy) on each split side by side.
# A training score far above the test score suggests overfitting; two low scores suggest underfitting.
print('Training set score: {:.4f}'.format(clf_gini.score(X_train, y_train)))
print('Test set score: {:.4f}'.format(clf_gini.score(X_test, y_test)))

# **7. Visualize decision-trees**

In [None]:
# Draw the tree that was fitted in section 5. The already-fitted estimator is
# passed as-is: calling fit() again inside a plotting cell would retrain the
# model as a hidden side effect of a visualization step.
plt.figure(figsize=(12,8))
tree.plot_tree(
    clf_gini,
    feature_names=list(X_train.columns),  # label splits with column names instead of x[i]
    filled=True,   # shade each node by its majority class
    rounded=True,
)
# Render the figure and keep the list of annotation objects out of the cell output.
plt.show()

In [None]:
# Export the fitted tree as Graphviz DOT source (out_file=None returns it as a string).
dot_data = tree.export_graphviz(clf_gini, out_file=None,
                              # Pass a plain list: some scikit-learn releases validate this
                              # parameter strictly and reject a pandas Index.
                              feature_names=list(X_train.columns),
                              class_names=True,  # True = symbolic class names in each node
                              filled=True, rounded=True,
                              special_characters=True)
graph = graphviz.Source(dot_data)
# Save the rendering as Decision_Tree_Graph.png in the current working directory.
graph.render("Decision_Tree_Graph", format="png")
# Leave the graph as the last expression so the notebook displays it inline.
graph

# **8. Check the Accuracy**

In [None]:
# Compute the confusion matrix for the test split (rows = true labels, columns = predicted labels)
# and print it; the individual cells are unpacked later, in the Specificity cell.
cm = confusion_matrix(y_test, y_pred_gini)
print('Confusion matrix\n\n', cm)

**Get the classification report**

In [None]:
# Print per-class precision, recall, F1-score and support for the test-set predictions.
print(classification_report(y_test, y_pred_gini))

**Specificity**

In [None]:
# Flatten the confusion matrix into its four cells; for two classes scikit-learn's
# row-major order is tn, fp, fn, tp.
# NOTE(review): the 4-way unpack assumes exactly two classes in y_test / y_pred_gini — confirm.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_gini).ravel()
# Specificity (true-negative rate): share of actual negatives that were predicted negative.
specificity = tn / (tn+fp)
specificity

**Sensitivity**

In [None]:
# Sensitivity (recall / true-positive rate): share of actual positives that were predicted positive.
# Uses tp and fn unpacked in the Specificity cell, so that cell must run first.
sensitivity = tp / (tp + fn)
sensitivity

# **9. ROC & AUC**

In [None]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

# A ROC curve needs a continuous score per row. Hard class predictions yield a
# single operating point, so the "curve" degenerates into two straight
# segments and the AUC is understated. Use the predicted probability of the
# positive class instead.
# NOTE(review): assumes a binary target with clf_gini.classes_[1] as the positive class — confirm.
positive_class = clf_gini.classes_[1]
y_score_gini = clf_gini.predict_proba(X_test)[:, 1]

# Get the false positive rate, true positive rate, and thresholds from the ROC curve.
# (The thresholds get a real name so IPython's `_` last-output variable is not clobbered.)
fpr, tpr, roc_thresholds = roc_curve(y_test, y_score_gini, pos_label=positive_class)

# Calculate the area under the ROC curve.
roc_auc = auc(fpr, tpr)

# Plot the ROC curve.
plt.figure()
plt.plot(fpr, tpr, label="ROC curve (area = %0.2f)" % roc_auc)

# Plot a dashed diagonal from (0, 0) to (1, 1): the expected curve of a random classifier.
plt.plot([0, 1], [0, 1], "k--")

# Set the limits of the plot; the small headroom above 1.0 keeps the top of the curve visible.
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])

# Set the labels of the axes.
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")

# Set the title of the plot.
plt.title("Receiver operating characteristic")

# Add a legend.
plt.legend(loc="lower right")

# Show the plot.
plt.show()