In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import tree
from PIL import Image
import matplotlib.pyplot as plt

# To make nice plots
!pip install six
!pip install pydotplus
!pip install graphviz
from sklearn.tree import export_graphviz
from sklearn import metrics
from six import StringIO
import pydotplus



You should consider upgrading via the 'c:\users\vio_g\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'c:\users\vio_g\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.




You should consider upgrading via the 'c:\users\vio_g\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


# Pre-Processing data

Get Dataset

In [2]:
# Load the cleaned 2015 dataset from a path relative to this notebook.
# NOTE(review): the file name suggests rows with NaN were already dropped — the
# missing-value check further down confirms this.
df = pd.read_csv(r'../../data/2015_cleaned_droppedNaN.csv')

In [3]:
# Preview the first rows to sanity-check column names and value encodings.
df.head()

Unnamed: 0,HeartDiseaseorAttack,HighBP,HighChol,BMI,Smoker,Diabetes,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,SexIsMale,AgeGroup
0,0.0,1.0,1.0,4018.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,9.0
1,0.0,0.0,0.0,2509.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,7.0
2,0.0,1.0,1.0,2819.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,9.0
3,0.0,1.0,0.0,2652.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,11.0
4,0.0,1.0,1.0,2389.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,11.0


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(292745, 12)

**Check empty values**

In [5]:
# Percentage of missing cells across the whole frame (0.0 means no NaN at all).
# df.size (rows * columns) replaces np.product(df.shape): the np.product alias
# was deprecated in NumPy 1.25 and removed in NumPy 2.0.
missing_pct = df.isnull().sum().sum() / df.size * 100
print(round(missing_pct, 2))

0.0


We separate the features from the target and hold out part of the data for testing purposes

In [6]:
# Features: every column except the target.
X = df.drop(columns=['HeartDiseaseorAttack'])
# 'Unnamed: 0' is the row index that was written into the CSV (0, 1, 2, ... in
# df.head()). It carries no information about the patient, and a decision tree
# will happily split on it and overfit, so it must not be used as a feature.
# errors='ignore' keeps the cell working if the CSV is re-exported without it.
X = X.drop(columns=['Unnamed: 0'], errors='ignore')

# Target: 1.0 if the respondent reported heart disease or a heart attack.
y = np.array(df['HeartDiseaseorAttack'])

In [7]:
# Hold out 20% of the rows for testing.
# random_state makes the split (and every score computed below) reproducible
# across kernel restarts; stratify=y keeps the class proportions of the target
# identical in the training and test sets.
# train_test_split is already imported in the first cell.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [8]:
# Number of training labels; must match the number of rows in X_train.
y_train.shape

(234196,)

In [9]:
# (training rows, number of features).
X_train.shape

(234196, 11)

# Gini vs Entropy

In this section we analyze if the method to calculate impurities (gini or entropy) affects the accuracy score.

In [None]:

# Gini tree: fit on the training split, predict the held-out split, score it.
gini_model = DecisionTreeClassifier(criterion='gini', random_state=42).fit(X_train, y_train)
gini_predictions = gini_model.predict(X_test)
gini_score = accuracy_score(y_test, gini_predictions)

# Entropy tree: identical pipeline, only the impurity criterion differs.
entropy_model = DecisionTreeClassifier(criterion='entropy', random_state=42).fit(X_train, y_train)
entropy_prediction = entropy_model.predict(X_test)
entropy_score = accuracy_score(y_test, entropy_prediction)

print(f"Average accuracy score for gini {gini_score}")
print(f"Average accuracy score for entropy {entropy_score}")

In [None]:
from sklearn.model_selection import cross_val_score

print("Statistical Analysis for entropy")

# `entropy_scores` was never defined anywhere in the notebook (only the single
# value `entropy_score` exists), so this cell raised a NameError on a fresh
# kernel. Build the score sample here with 5-fold cross-validation on the
# training split, which yields one accuracy per fold to summarize.
entropy_scores = cross_val_score(
    DecisionTreeClassifier(criterion='entropy', random_state=42),
    X_train,
    y_train,
    cv=5,
)
entropy_statistics = pd.Series(entropy_scores)
entropy_statistics.describe()

Tree plot for gini

In [None]:
# Matplotlib rendering of the fitted gini tree.
tree.plot_tree(gini_model)

# Labels for the Graphviz nodes: feature names are the columns of X, class
# names are the string form of the labels the model was fitted on.
column_names = list(X.columns)
clases_names = [str(label) for label in gini_model.classes_]

# Write the tree as DOT text into an in-memory buffer, then render it to PNG.
dot_data = StringIO()
export_graphviz(
    gini_model,
    out_file=dot_data,
    feature_names=column_names,
    class_names=clases_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png("tree_gini_not_optimized.png")

Tree plot for entropy

In [None]:
# Matplotlib rendering of the fitted entropy tree.
tree.plot_tree(entropy_model)

# Class labels for the Graphviz nodes, taken from the fitted entropy model.
# Feature labels reuse `column_names` built in the gini plotting cell.
clases_names = [str(label) for label in entropy_model.classes_]

# Write the tree as DOT text into an in-memory buffer, then render it to PNG.
dot_data = StringIO()
export_graphviz(
    entropy_model,
    out_file=dot_data,
    feature_names=column_names,
    class_names=clases_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png("tree_entropy_not_optimized.png")

**Conclusion**

Entropy seems to have a better performance for this particular dataset. However, the gini criterion is faster because it is less computationally expensive. With a really big dataset (like our dataset) it might not be worth the time invested in training when using the entropy criterion. 

Moreover both trees seem overfitted, therefore we will analyze pruning in the next section. 

# Pruning the tree

Pruning means limiting the growth of a tree with the purpose of avoiding overfitting. 

Decision-trees classifiers in sklearn use the following parameters for pruning: 
* max_depth
* max_leaf_nodes
* min_samples_split
* min_samples_leaf
* min_impurity_decrease

For our tree, we will directly use ccp (cost complexity pruning), which is a post-pruning technique. The subtree with the largest cost complexity that is smaller than ccp_alpha will be chosen. 

The higher alpha is, the more the tree is pruned. An alpha of 0 performs no pruning (the full tree is kept), whereas the largest alpha in the pruning path prunes the tree down to a single node. 

In [None]:
# Unfitted entropy tree; it is only used to compute the cost-complexity
# pruning path below.
model = DecisionTreeClassifier(criterion='entropy', random_state=42)

cost_complexity_pruning_path returns the effective alphas and the corresponding total leaf impurities at each step of the pruning process. 

In [None]:
# Compute the pruning path on the training split and keep its effective alphas,
# dropping the last (maximum) alpha.
path = model.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas[:-1]

In [None]:
# Candidate alphas extracted from the pruning path (maximum alpha excluded).
ccp_alphas

Now we create a new decision tree for each value of alpha and store the fitted models in a list.

In [None]:
# One fitted tree per candidate alpha.
# The alphas were taken from the pruning path of an *entropy* tree seeded with
# random_state=42, so the trees built here use the same criterion and seed;
# with the default gini criterion the alphas would not correspond to the
# pruning steps of the trees being fitted.
model_alphas = [
  DecisionTreeClassifier(criterion='entropy', ccp_alpha=alpha, random_state=42).fit(X_train, y_train)
  for alpha in ccp_alphas
]

Now I graph the accuracy of each tree using the Training Dataset and the Testing Dataset as a function of alpha. 

The blue line is the accuracy for the training dataset. 
The orange line is the accuracy for the test dataset. 

As we prune (alpha gets bigger), we see that the training accuracy decreases while the test accuracy increases. 

In [None]:
import matplotlib.pyplot as plt

# Accuracy of every pruned tree on the training split and on the held-out split.
train_scores = [fitted.score(X_train, y_train) for fitted in model_alphas]
test_scores = [fitted.score(X_test, y_test) for fitted in model_alphas]

# Step plot of both curves against alpha on a single axis.
fig, ax = plt.subplots()
ax.set(
    xlabel='alpha',
    ylabel='accuracy',
    title='Accuracy vs alpha for training and testing sets',
)
for curve_label, curve in (('train', train_scores), ('test', test_scores)):
  ax.plot(ccp_alphas, curve, marker='o', label=curve_label, drawstyle='steps-post')
ax.legend()
plt.show()

The obtained values for alpha seem to be too low. 

Therefore we will try to find a good value for alpha using cross validation. 

We first create a list with possible alpha values. 

In [None]:
# Evenly spaced candidate alphas starting at 0 with a step of 0.005; the stop
# value 0.1 is the (exclusive) upper bound passed to np.arange.
ccp_alpha_list = np.arange(0, 0.1, 0.005)
ccp_alpha_list

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy (mean and standard deviation) per candidate alpha.
# The final tree is built with criterion='entropy', so alpha has to be tuned
# with that same criterion; with the default gini criterion the selected alpha
# would be tuned for a different model than the one it is applied to.
stat_values = []
for alpha in ccp_alpha_list:
  model = DecisionTreeClassifier(criterion='entropy', ccp_alpha=alpha, random_state=42)
  scores = cross_val_score(model, X_train, y_train, cv=5)
  stat_values.append([alpha, np.mean(scores), np.std(scores)])

Now we draw a graph for the means and standard deviation of the accuracy scores calculated for each candidate. 

In [None]:
# Tabulate the cross-validation statistics, one row per candidate alpha.
result_columns = ['alpha', 'mean_accuracy', 'std']
alpha_results = pd.DataFrame(stat_values, columns=result_columns)

# Mean accuracy against alpha, with the standard deviation as error bars.
alpha_results.plot(
    x='alpha',
    y='mean_accuracy',
    yerr='std',
    marker='o',
    linestyle='--',
)

In [None]:
# Full table of cross-validation results (one row per candidate alpha).
alpha_results

In [None]:
# Row label of the best mean cross-validated accuracy, then the alpha on that row.
max_index = alpha_results['mean_accuracy'].idxmax()
ideal_ccp_alpha = alpha_results.loc[max_index, 'alpha']
ideal_ccp_alpha

# Build and evaluate classification tree

In [None]:
# Evaluate the pruned entropy tree on the held-out test set.
#
# The previous loop re-fitted exactly the same tree (same data, same
# random_state=42) 100 times, so all 100 scores were identical and the reported
# statistics were degenerate (spread of 0). Each run now uses its own seed so
# the summary reflects the variability of the fitting procedure.
# NOTE(review): random_state only changes how the tree breaks ties between
# equally good splits, so the spread may still be small.
N_RUNS = 100

scores = []
for seed in range(N_RUNS):
  candidate = DecisionTreeClassifier(criterion='entropy', ccp_alpha=ideal_ccp_alpha, random_state=seed)
  candidate.fit(X_train, y_train)
  scores.append(accuracy_score(y_test, candidate.predict(X_test)))

average_score = sum(scores) / len(scores)
print(f'average score is {average_score} and its statistic is: ')
print('difference between max and min value is ', max(scores) - min(scores))
scores = pd.Series(scores)
print(scores.describe())

# Final model used by the following cells (confusion matrix, tree export):
# fitted once with the fixed seed so those outputs are reproducible.
model_pruned = DecisionTreeClassifier(criterion='entropy', ccp_alpha=ideal_ccp_alpha, random_state=42)
model_pruned.fit(X_train, y_train)
predictions = model_pruned.predict(X_test)


In [None]:
# Confusion matrix of the pruned tree on the held-out test set.
# sklearn.metrics.plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2, so the import failed on current versions. Computing the
# matrix explicitly and wrapping it in ConfusionMatrixDisplay works on both old
# and new releases. `metrics` is the sklearn.metrics module imported in the
# first cell.
cm = metrics.confusion_matrix(
    y_test,
    model_pruned.predict(X_test),
    labels=model_pruned.classes_,
)
metrics.ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=model_pruned.classes_,
).plot()

In [None]:
# Export the pruned tree to PNG via Graphviz and draw it with matplotlib.
#
# Node labels are derived from the model being exported and from the data it
# was trained on, instead of relying on `clases_names` / `column_names` left
# behind by the plotting cells of *other* models. Those cells may have been
# skipped or edited, which would raise a NameError here or mislabel the nodes.
pruned_feature_names = list(X_train.columns)
pruned_class_names = [str(label) for label in model_pruned.classes_]
print(pruned_feature_names)

# Write the tree as DOT text into an in-memory buffer, then render it to PNG.
dot_data = StringIO()
export_graphviz(
    model_pruned,
    out_file=dot_data,
    feature_names=pruned_feature_names,
    class_names=pruned_class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png("tree_optimized_prunning.png")

# Trailing semicolon suppresses the list of matplotlib annotations that
# plot_tree returns; the figure itself is still displayed.
tree.plot_tree(model_pruned);