In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Read the heart-disease CSV into the working frame used by every later cell.
DATA_PATH = "/content/heart.csv"
df = pd.read_csv(DATA_PATH)

# Show the first five rows as the cell's output.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [None]:
# Print the frame's column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 5   fbs       1025 non-null   int64  
 6   restecg   1025 non-null   int64  
 7   thalach   1025 non-null   int64  
 8   exang     1025 non-null   int64  
 9   oldpeak   1025 non-null   float64
 10  slope     1025 non-null   int64  
 11  ca        1025 non-null   int64  
 12  thal      1025 non-null   int64  
 13  target    1025 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 112.2 KB


In [None]:
# Number of missing entries in each column (isnull is the pandas alias of isna).
missing_per_column = df.isnull().sum()
missing_per_column

Unnamed: 0,0
age,0
sex,0
cp,0
trestbps,0
chol,0
fbs,0
restecg,0
thalach,0
exang,0
oldpeak,0


In [None]:
# Class balance: how many rows carry each value of the label column,
# most frequent value first (the value_counts defaults, spelled out).
target_count = df["target"].value_counts(sort=True, ascending=False)
print(target_count)

target
1    526
0    499
Name: count, dtype: int64


In [None]:
from scipy.stats import shapiro

# Every feature column (everything except the "target" label).
columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal",
]

# Run a Shapiro-Wilk normality test per feature and report its p-value.
# NOTE(review): several of these columns look like 0/1 flags or small integer
# codes in the preview above; a normality test says little for such columns --
# confirm which features are genuinely continuous.
for col in columns:
    sample = df[col].dropna()  # the test is run on non-missing values only
    stat, p_value = shapiro(sample)
    print(f'{col} Shapiro-Wilk p-value: {p_value}')

age Shapiro-Wilk p-value: 5.038685420061504e-09
sex Shapiro-Wilk p-value: 3.354400376014356e-44
cp Shapiro-Wilk p-value: 1.0574907536704914e-34
trestbps Shapiro-Wilk p-value: 2.194234143572828e-15
chol Shapiro-Wilk p-value: 4.000062978092988e-18
fbs Shapiro-Wilk p-value: 5.468937347498388e-49
restecg Shapiro-Wilk p-value: 4.592291924455292e-40
thalach Shapiro-Wilk p-value: 1.550333343988222e-11
exang Shapiro-Wilk p-value: 1.5303974419519303e-43
oldpeak Shapiro-Wilk p-value: 4.6685467599581855e-30
slope Shapiro-Wilk p-value: 8.52670235827709e-37
ca Shapiro-Wilk p-value: 2.450533614934568e-37
thal Shapiro-Wilk p-value: 1.6948949161818206e-36


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column to the [0, 1] range (MinMaxScaler defaults) and
# write the scaled values back into the working frame.
# NOTE(review): the scaler is fit on all rows, before the train/test split that
# happens later in the notebook, so held-out rows contribute to the min/max.
scaler = MinMaxScaler()
scaler.fit(df[columns])
scaled_features = scaler.transform(df[columns])
df[columns] = scaled_features

print(df.head(n=5))

        age  sex   cp  trestbps      chol  fbs  restecg   thalach  exang  \
0  0.479167  1.0  0.0  0.292453  0.196347  0.0      0.5  0.740458    0.0   
1  0.500000  1.0  0.0  0.433962  0.175799  1.0      0.0  0.641221    1.0   
2  0.854167  1.0  0.0  0.481132  0.109589  0.0      0.5  0.412214    1.0   
3  0.666667  1.0  0.0  0.509434  0.175799  0.0      0.5  0.687023    0.0   
4  0.687500  0.0  0.0  0.415094  0.383562  1.0      0.5  0.267176    0.0   

    oldpeak  slope    ca      thal  target  
0  0.161290    1.0  0.50  1.000000       0  
1  0.500000    0.0  0.00  1.000000       0  
2  0.419355    0.0  0.00  1.000000       0  
3  0.000000    1.0  0.25  1.000000       0  
4  0.306452    0.5  0.75  0.666667       0  


In [None]:
from scipy.stats import zscore

# Feature columns to screen (same list as used for the normality check).
columns = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal",
]

# Count, per feature, the values lying more than 3 standard deviations from
# the column mean. z-scores are unchanged by the earlier min-max rescaling,
# because that is a linear transform applied per column.
for col in columns:
    z_values = zscore(df[col].dropna())
    outliers = (np.abs(z_values) > 3).sum()
    print(f'{col} Number of outliers: {outliers}')

age Number of outliers: 0
sex Number of outliers: 0
cp Number of outliers: 0
trestbps Number of outliers: 7
chol Number of outliers: 13
fbs Number of outliers: 0
restecg Number of outliers: 0
thalach Number of outliers: 4
exang Number of outliers: 0
oldpeak Number of outliers: 7
slope Number of outliers: 0
ca Number of outliers: 18
thal Number of outliers: 7


In [None]:
# Split the frame into the feature matrix (all columns but the label)
# and the label vector.
y = df["target"]
X = df.drop(columns=["target"])

In [None]:
from sklearn.tree import DecisionTreeClassifier

# One seed for both the split and the tree, so Restart & Run All reproduces
# the same model and the same metrics.
RANDOM_STATE = 42

# Hold out 20% of the rows for evaluation (same split as before).
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=RANDOM_STATE)

# A decision tree shuffles candidate features while searching for splits, so
# without random_state equally good splits may be chosen differently between
# runs; seeding it makes the fitted tree deterministic.
classifier = DecisionTreeClassifier(random_state=RANDOM_STATE)
classifier.fit(X_train,y_train)

# Predictions on the held-out rows, consumed by the metrics cell.
# NOTE(review): if the CSV contains duplicated rows, copies can land on both
# sides of the split and inflate test scores -- check df.duplicated().sum().
y_pred = classifier.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score

# Score the held-out predictions against the true labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)

# Report each metric on its own line, in a fixed order.
metric_report = (
    ("Accuracy: ", accuracy),
    ("Precision: ", precision),
    ("Recall: ", recall),
    ("F1 Score: ", f1),
)
for label, value in metric_report:
    print(label, value)

Accuracy:  0.9853658536585366
Precision:  1.0
Recall:  0.970873786407767
F1 Score:  0.9852216748768473
