In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import plot_confusion_matrix

In [None]:
# Load the data set from the local CSV file and preview the first rows.
# NOTE(review): read_csv treats the first line of the file as the header by default.
# The column labels are replaced in the next cell, so if this file has no header row
# its first record is being consumed as labels -- confirm, and pass header=None if so.
data_path = "C:/Users/HP/Desktop/Anusha- Python Data sets/processed.cleveland.data"
df = pd.read_csv(data_path)
df.head()

In [None]:
# Replace the column labels (positionally) with short names, then preview the frame.
column_names = [
    'age', 'sex', 'cp', 'restbp', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'hd',
]
df.columns = column_names
df.head()

In [None]:
# Check for missing data: list the distinct values stored in the 'ca' column.
df.loc[:, 'ca'].unique()

In [None]:
# Same check for the 'thal' column: list its distinct values.
df.loc[:, 'thal'].unique()

In [None]:
# Number of rows in which 'ca' or 'thal' holds the '?' placeholder.
int(df[['ca', 'thal']].eq('?').any(axis=1).sum())

In [None]:
# Display the rows in which 'ca' or 'thal' holds the '?' placeholder.
df.loc[df[['ca', 'thal']].eq('?').any(axis=1)]

In [None]:
# Total number of rows before any are removed.
df.shape[0]

In [None]:
# Delete rows with missing values: keep only rows where neither 'ca' nor 'thal' is '?'.
has_placeholder = df[['ca', 'thal']].eq('?').any(axis=1)
df_no_missing = df.loc[~has_placeholder]
len(df_no_missing)

In [None]:
# Distinct 'ca' values remaining after the filter.
df_no_missing.loc[:, 'ca'].unique()

In [None]:
# Distinct 'thal' values remaining after the filter.
df_no_missing.loc[:, 'thal'].unique()

In [None]:
# Features: every column except 'hd'. Target: the 'hd' column.
# Both are copies, so later edits to X or y leave df_no_missing untouched.
X = df_no_missing.drop(columns='hd').copy()  # Alternative: X= df_no_missing.iloc[:,:-1]
y = df_no_missing.loc[:, 'hd'].copy()
X, y

In [None]:
# Inspect the dtype of each feature column.
# The variables sex, cp, fbs, restecg, exang, slope and thal should be categorical.
X.dtypes

In [None]:
# One-hot encode the categorical features; columns not listed here are left as they are.
categorical_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']
X_encoded = pd.get_dummies(X, columns=categorical_columns)
X_encoded.head()

In [None]:
# Distinct values currently present in the target.
y.unique()

In [None]:
# Collapse the target to two classes: every value greater than 0 becomes 1.
y_not_zero_index = y.gt(0)
y.loc[y_not_zero_index] = 1
y.unique()

In [None]:
# Split into train/test sets with a fixed seed, then fit an unpruned tree on the training part.
X_encoded_train, X_encoded_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    random_state=42,
)
clf_dt = DecisionTreeClassifier(random_state=42).fit(X_encoded_train, y_train)

In [None]:
# Draw the fitted, unpruned tree on an explicit Axes.
tree_fig, tree_ax = plt.subplots(figsize=(15, 7.5))
plot_tree(
    clf_dt,
    filled=True,
    rounded=True,
    class_names=["No HD", "Yes HD"],
    # Pass a plain list: some scikit-learn releases validate this argument and
    # reject a pandas Index.
    feature_names=list(X_encoded.columns),
    ax=tree_ax,
);

In [None]:
# Confusion matrix of the unpruned tree on the test split.
# Accuracy= 0.7733 (figure noted in the original notebook; not recomputed here)
# NOTE(review): plot_confusion_matrix appears to be unavailable in recent scikit-learn
# releases (ConfusionMatrixDisplay.from_estimator is the replacement) -- confirm
# against the installed version.
plot_confusion_matrix(
    clf_dt,
    X_encoded_test,
    y_test,
    display_labels=["Does not have HD", "Has HD"],
)

In [None]:
# Cost-complexity pruning path on the training data: determines the candidate alphas.
path = clf_dt.cost_complexity_pruning_path(X_encoded_train, y_train)
# Take the alphas from the path and drop the last one (the maximum value).
ccp_alphas = path.ccp_alphas[:-1]
ccp_alphas

In [None]:
# Fit one decision tree per candidate alpha and collect them in clf_dts.
# The candidates are built inside a comprehension so the name `clf_dt` (the fitted,
# unpruned tree from the earlier cell) is not rebound by this cell; re-running the
# earlier plotting/pruning-path cells therefore keeps using the unpruned tree.
clf_dts = [
    DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha).fit(X_encoded_train, y_train)
    for ccp_alpha in ccp_alphas
]

In [None]:
# Accuracy of every candidate tree on the training split and on the test split.
train_scores = [tree.score(X_encoded_train, y_train) for tree in clf_dts]
test_scores = [tree.score(X_encoded_test, y_test) for tree in clf_dts]

# Plot both accuracy curves against alpha.
fig, ax = plt.subplots()
ax.set_xlabel("alpha")
# Fix: this was a second set_xlabel call, which replaced "alpha" on the x-axis
# and left the y-axis without a label.
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs Alpha for training & test data sets")
ax.plot(ccp_alphas, train_scores, marker='o', label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker='o', label="test", drawstyle="steps-post")
ax.legend()
plt.show()

In [None]:
# Perform cross validation: 5-fold scores on the training split for every candidate alpha.
alpha_loop_values = []
for ccp_alpha in ccp_alphas:
    # A separate name is used so the fitted `clf_dt` from the earlier cell is not
    # replaced by an unfitted estimator.
    candidate_tree = DecisionTreeClassifier(random_state=0, ccp_alpha=ccp_alpha)
    scores = cross_val_score(candidate_tree, X_encoded_train, y_train, cv=5)
    alpha_loop_values.append([ccp_alpha, np.mean(scores), np.std(scores)])

# One row per alpha: mean and standard deviation of the fold scores.
alpha_results = pd.DataFrame(alpha_loop_values, columns=['alpha', 'mean_accuracy', 'std'])
cv_ax = alpha_results.plot(x='alpha', y='mean_accuracy', yerr='std', marker='o', linestyle='--')
cv_ax.set_ylabel("mean accuracy");

In [None]:
# Select the alpha value(s) lying strictly between 0.014 and 0.015.
# NOTE(review): this window is hard-coded -- presumably read off the cross-validation
# plot; confirm it still isolates a single alpha if the data or the split changes.
in_window = alpha_results['alpha'].gt(0.014) & alpha_results['alpha'].lt(0.015)
ideal_ccp_alpha = alpha_results.loc[in_window, 'alpha']
ideal_ccp_alpha

In [None]:
# Convert the selected alpha to a plain Python float.
# Calling float() directly on a Series is deprecated in recent pandas and fails with an
# unhelpful error unless exactly one row matched, so the value is extracted explicitly.
# np.atleast_1d also accepts a value that is already a float, which keeps this cell
# safe to re-run.
alpha_candidates = np.atleast_1d(ideal_ccp_alpha)
if alpha_candidates.size != 1:
    raise ValueError(
        "expected exactly one candidate alpha, found {}".format(alpha_candidates.size)
    )
ideal_ccp_alpha = float(alpha_candidates[0])
ideal_ccp_alpha

In [None]:
# Build a tree with the chosen alpha and fit it on the training split.
clf_dt_pruned = DecisionTreeClassifier(
    random_state=42,
    ccp_alpha=ideal_ccp_alpha,
).fit(X_encoded_train, y_train)

In [None]:
# Confusion matrix of the pruned tree on the test split.
# accuracy= 0.826 (figure noted in the original notebook; not recomputed here)
# NOTE(review): plot_confusion_matrix appears to be unavailable in recent scikit-learn
# releases (ConfusionMatrixDisplay.from_estimator is the replacement) -- confirm
# against the installed version.
plot_confusion_matrix(
    clf_dt_pruned,
    X_encoded_test,
    y_test,
    display_labels=["Does not have HD", "Has HD"],
)

In [None]:
# Draw the pruned tree on an explicit Axes.
pruned_fig, pruned_ax = plt.subplots(figsize=(15, 7.5))
plot_tree(
    clf_dt_pruned,
    filled=True,
    rounded=True,
    class_names=["No HD", "Yes HD"],
    # Pass a plain list: some scikit-learn releases validate this argument and
    # reject a pandas Index.
    feature_names=list(X_encoded.columns),
    ax=pruned_ax,
);