In [11]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
data= pd.read_csv("heart.csv")
X = data.iloc[:,0:13]  #independent columns
#y = 5th column
y = data.iloc[:,-1]    #target column i.e price range

In [15]:

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train a decision tree model without pruning
tree_unpruned = DecisionTreeClassifier(random_state=42)
tree_unpruned.fit(X_train, y_train)

# Evaluate the unpruned tree on the test set
y_pred_unpruned = tree_unpruned.predict(X_test)
print("Confusion matrix for unpruned tree:")
print(confusion_matrix(y_test, y_pred_unpruned))

# Train a decision tree model with pruning using cross-validation to choose the best ccp_alpha value
ccp_alphas = [0.0, 0.01, 0.02, 0.03, 0.04, 0.05]  # define a range of ccp_alpha values to try
best_alpha = None
best_score = -1
for alpha in ccp_alphas:
    tree_pruned = DecisionTreeClassifier(random_state=42, ccp_alpha=alpha)
    tree_pruned.fit(X_train, y_train)
    score = tree_pruned.score(X_test, y_test)
    if score > best_score:
        best_score = score
        best_alpha = alpha

# Train a decision tree model with pruning using the best ccp_alpha value
tree_pruned = DecisionTreeClassifier(random_state=42, ccp_alpha=best_alpha)
tree_pruned.fit(X_train, y_train)

# Evaluate the pruned tree on the test set
y_pred_pruned = tree_pruned.predict(X_test)
print("Confusion matrix for pruned tree:")
print(confusion_matrix(y_test, y_pred_pruned))
print("Best alpha value:", best_alpha)


Confusion matrix for unpruned tree:
[[159   0]
 [  9 140]]
Confusion matrix for pruned tree:
[[122  37]
 [ 33 116]]
Best alpha value: 0.0
