In [None]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the diabetes dataset; read_csv already uses a comma separator by default.
df = pd.read_csv('Diabetes.csv')

In [None]:
# 'Outcome' is the target column; every remaining column is used as a feature.
y = df['Outcome']
x = df.drop(columns=['Outcome'])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler()
scaler = sc.fit(x_train)  # fit() returns the scaler itself, so `scaler` is `sc`

x_train_scaled, x_test_scaled = (
    scaler.transform(split) for split in (x_train, x_test)
)

In [None]:
# Create the Decision Tree classifier object.
# A fixed random_state makes the fitted tree (and the accuracy reported below)
# repeatable from run to run; the seed matches the one used for the
# cost-complexity-pruned trees later in the notebook.
dec = DecisionTreeClassifier(random_state=0)

In [None]:
# Train the Decision Tree classifier on the scaled training split.
# fit() hands back the estimator, so `history` refers to the same object as `dec`.
history = dec.fit(X=x_train_scaled, y=y_train)

In [None]:
# Predict the response for the scaled test split.
y_pred = dec.predict(X=x_test_scaled)

In [None]:
# Fraction of test rows whose predicted label matches the true label.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

In [None]:
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image  
import pydotplus

# Take the feature names from the feature frame itself instead of a hand-typed
# list, so the labels in the plot cannot drift from the columns the tree saw.
feature_cols = list(x.columns)

# With out_file=None, export_graphviz returns the DOT source as a string;
# pydotplus then renders that DOT source to a PNG.
dot_data = export_graphviz(
    dec,
    out_file=None,
    filled=True,
    rounded=True,
    feature_names=feature_cols,
    # dec.classes_ lists the target labels in the order the fitted tree uses them.
    class_names=[str(label) for label in dec.classes_],
)
graph = pydotplus.graph_from_dot_data(dot_data)
Image(graph.create_png())

## Pre-pruning

Pre-pruning limits the growth of the tree during training, using hyperparameters such as **max_depth** and **max_leaf_nodes**.

In [None]:
# Create a depth-limited Decision Tree classifier (pre-pruning).
# random_state is fixed so the tree and its accuracy are repeatable across runs.
dec = DecisionTreeClassifier(max_depth=2, random_state=0)

# Train the Decision Tree classifier
history = dec.fit(x_train_scaled, y_train)

# Predict the response for the test dataset
y_pred = dec.predict(x_test_scaled)

print("Accuracy:", accuracy_score(y_test, y_pred))

## Post-pruning

Using the cost-complexity pruning (CCP) technique, controlled by the `ccp_alpha` parameter.

In [None]:
# Compute the pruning path from an unconstrained tree configured like the trees
# that are trained for each alpha (random_state=0, no depth cap).
# Do not reuse `dec` here: it carries max_depth=2 from the pre-pruning section,
# so its path would describe the depth-limited tree instead of the full one.
path = DecisionTreeClassifier(random_state=0).cost_complexity_pruning_path(
    x_train_scaled, y_train
)
# Clamp at zero: floating-point round-off can produce tiny negative alphas,
# which are not valid ccp_alpha values when the trees are fitted.
ccp_alphas = path.ccp_alphas.clip(min=0)
print(ccp_alphas)

In [None]:
# Fit one tree per candidate alpha and collect the fitted models, in the same
# order as ccp_alphas.
# A comprehension keeps the per-alpha model local, so the notebook-level `dec`
# is not overwritten by the last (most heavily pruned) model of the sweep.
decs = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=alpha).fit(x_train_scaled, y_train)
    for alpha in ccp_alphas
]

In [None]:
import matplotlib.pyplot as plt

# Accuracy of every pruned tree on both splits, in the same order as ccp_alphas.
# accuracy_score takes (y_true, y_pred), matching the other calls in this notebook.
train_acc = [accuracy_score(y_train, d.predict(x_train_scaled)) for d in decs]
test_acc = [accuracy_score(y_test, d.predict(x_test_scaled)) for d in decs]

# One line per split, with a marker at every evaluated alpha.
fig, ax = plt.subplots()
ax.plot(ccp_alphas, train_acc, marker='o', label='train_accuracy')
ax.plot(ccp_alphas, test_acc, marker='o', label='test_accuracy')
ax.set_xlabel('alpha')
ax.set_ylabel('accuracy')
ax.set_title('Accuracy vs alpha')
ax.legend()
plt.show()

In [None]:
# Final post-pruned tree with a fixed pruning strength.
# NOTE(review): 0.02 is presumably read off the accuracy-vs-alpha plot — confirm,
# and consider selecting it on a validation split rather than the test split.
dec_ = DecisionTreeClassifier(ccp_alpha=0.02, random_state=0)
dec_.fit(X=x_train_scaled, y=y_train)

In [None]:
# Score the post-pruned tree on the held-out test split.
y_test_pred = dec_.predict(X=x_test_scaled)

print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_test_pred))