In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [53]:
df = pd.read_json("heart.json") # Reading Dataset

df.head() # Displays first 5 columns 

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [54]:
df.columns # Name of columns

Index(['Age', 'Sex', 'ChestPainType', 'RestingBP', 'Cholesterol', 'FastingBS',
       'RestingECG', 'MaxHR', 'ExerciseAngina', 'Oldpeak', 'ST_Slope',
       'HeartDisease'],
      dtype='object')

In [55]:
df.shape # shape - (no. of rows, no. of columns)

(918, 12)

In [56]:
df.info() # basic info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [57]:
df.describe().T  # Description/summary of the dataframe

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,918.0,53.510893,9.432617,28.0,47.0,54.0,60.0,77.0
RestingBP,918.0,132.396514,18.514154,0.0,120.0,130.0,140.0,200.0
Cholesterol,918.0,198.799564,109.384145,0.0,173.25,223.0,267.0,603.0
FastingBS,918.0,0.233115,0.423046,0.0,0.0,0.0,0.0,1.0
MaxHR,918.0,136.809368,25.460334,60.0,120.0,138.0,156.0,202.0
Oldpeak,918.0,0.887364,1.06657,-2.6,0.0,0.6,1.5,6.2
HeartDisease,918.0,0.553377,0.497414,0.0,0.0,1.0,1.0,1.0


In [58]:
df.isna().sum() #counting the number of null values in each column

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [59]:
df.HeartDisease.replace(to_replace = { 0 : "NO", 1 : "YES" }, inplace = True) # Replacing 0,1 with NO,YES for easy understanding

df.sample()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
68,52,M,ASY,160,246,0,ST,82,Y,4.0,Flat,YES


In [60]:
df.FastingBS.replace(to_replace={0: "No", 1: "Yes"}, inplace=True) # Counting the number of 'NO' and 'YES' in outcome column

df.sample()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
563,55,M,ASY,135,204,Yes,ST,126,Y,1.1,Flat,YES


In [61]:
df.Sex.value_counts() # count values of 'Sex' column 

M    725
F    193
Name: Sex, dtype: int64

In [62]:
df.ChestPainType.value_counts() # count values of 'ChestPainType' column

ASY    496
NAP    203
ATA    173
TA      46
Name: ChestPainType, dtype: int64

In [63]:
df.RestingECG.value_counts() # count values of 'RestingECG' column

Normal    552
LVH       188
ST        178
Name: RestingECG, dtype: int64

In [64]:
df.ST_Slope.value_counts() # count values of 'ST_Slope' column

Flat    460
Up      395
Down     63
Name: ST_Slope, dtype: int64

In [65]:
df.FastingBS.value_counts() # count values of 'FastingBS' column

No     704
Yes    214
Name: FastingBS, dtype: int64

In [66]:
df.ExerciseAngina.value_counts() # count values of 'ExerciseAngina' column

N    547
Y    371
Name: ExerciseAngina, dtype: int64

In [67]:
df.HeartDisease.value_counts() # count values of 'HeartDisease' column

YES    508
NO     410
Name: HeartDisease, dtype: int64

In [68]:
df.HeartDisease.value_counts(normalize = True) # percentage of split between the categorical values

# About 55% of the people have heart disease (Class 1) 
# and remaining 45% of the people didn't have heart disease (Class 0)

YES    0.553377
NO     0.446623
Name: HeartDisease, dtype: float64

In [69]:
def frequency_encoding(df, var):
    encoded_feature = df[var].value_counts().to_dict()
    df[var] = df[var].map(encoded_feature)

for col in ['ChestPainType', 'RestingECG', 'ST_Slope']:
    frequency_encoding(df, col)

df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,173,140,289,No,552,172,N,0.0,395,NO
1,49,F,203,160,180,No,552,156,N,1.0,460,YES
2,37,M,173,130,283,No,178,98,N,0.0,395,NO
3,48,F,496,138,214,No,552,108,Y,1.5,460,YES
4,54,M,203,150,195,No,552,122,N,0.0,395,NO


In [70]:
# replacing 'No' as '0' & 'Yes' as 1
df.Sex.replace(to_replace={"F": 0, "M": 1}, inplace=True)
df.FastingBS.replace(to_replace={"No": 0, "Yes": 1}, inplace=True)
df.ExerciseAngina.replace(to_replace={"N": 0, "Y": 1}, inplace=True)
df.HeartDisease.replace(to_replace={"NO": 0, "YES": 1}, inplace=True)
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,173,140,289,0,552,172,0,0.0,395,0
1,49,0,203,160,180,0,552,156,0,1.0,460,1
2,37,1,173,130,283,0,178,98,0,0.0,395,0
3,48,0,496,138,214,0,552,108,1,1.5,460,1
4,54,1,203,150,195,0,552,122,0,0.0,395,0


In [71]:
# Split the dataset into features (X) and target (y)
X = df.drop(["HeartDisease"],axis = 1)
y = df["HeartDisease"]

In [72]:
X[:5]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,1,173,140,289,0,552,172,0,0.0,395
1,49,0,203,160,180,0,552,156,0,1.0,460
2,37,1,173,130,283,0,178,98,0,0.0,395
3,48,0,496,138,214,0,552,108,1,1.5,460
4,54,1,203,150,195,0,552,122,0,0.0,395


In [73]:
y[:5]

0    0
1    1
2    0
3    1
4    0
Name: HeartDisease, dtype: int64

In [74]:
from imblearn import combine

smt = combine.SMOTETomek(random_state=14) 
X, y = smt.fit_resample(X, y)

In [75]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [84]:
# Define the pipeline
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

print("accuracy","","precision","","recall","","f1_score")

# Fit the pipeline to the training data
pipe.fit(X_train, y_train)

# Predict the target using the test data
y_pred = pipe.predict(X_test)

# Evaluate the accuracy of the pipeline
print("LogisticRegression:")
print(round(accuracy_score(y_test, y_pred),2),"  ",round(precision_score(y_test,y_pred),2),"  ",round(recall_score(y_test,y_pred),2),"  ",round(f1_score(y_test,y_pred),2))

# Change the classifier to SVM
pipe.steps[1] = ('classifier', SVC())

# Fit the pipeline to the training data
pipe.fit(X_train, y_train)

# Predict the target using the test data
y_pred = pipe.predict(X_test)

# Evaluate the accuracy of the pipeline
print("SVC:")
print(round(accuracy_score(y_test, y_pred),2),"  ",round(precision_score(y_test,y_pred),2),"  ",round(recall_score(y_test,y_pred),2),"  ",round(f1_score(y_test,y_pred),2))

# Change the classifier to KNeighbours
pipe.steps[1] = ('classifier',KNeighborsClassifier())

# Fit the pipeline to the training data
pipe.fit(X_train, y_train)

# Predict the target using the test data
y_pred = pipe.predict(X_test)

# Evaluate the accuracy of the pipeline
print("KNeighbors:")
print(round(accuracy_score(y_test, y_pred),2),"  ",round(precision_score(y_test,y_pred),2),"  ",round(recall_score(y_test,y_pred),2),"  ",round(f1_score(y_test,y_pred),2))

# Change the classifier to Decision Tree
pipe.steps[1] = ('classifier', DecisionTreeClassifier())

# Fit the pipeline to the training data
pipe.fit(X_train, y_train)

# Predict the target using the test data
y_pred = pipe.predict(X_test)

# Evaluate the accuracy of the pipeline
print("DecisionTree:")
print(round(accuracy_score(y_test, y_pred),2),"  ",round(precision_score(y_test,y_pred),2),"  ",round(recall_score(y_test,y_pred),2),"  ",round(f1_score(y_test,y_pred),2))

# Change the classifier to Random Forest
pipe.steps[1] = ('classifier', RandomForestClassifier())

# Fit the pipeline to the training data
pipe.fit(X_train, y_train)

# Predict the target using the test data
y_pred = pipe.predict(X_test)

# Evaluate the accuracy of the pipeline
print("RandomForest:")
print(round(accuracy_score(y_test, y_pred),2),"  ",round(precision_score(y_test,y_pred),2),"  ",round(recall_score(y_test,y_pred),2),"  ",round(f1_score(y_test,y_pred),2))

# Change the classifier to GaussianNB
pipe.steps[1] = ('classifier',GaussianNB())

# Fit the pipeline to the training data
pipe.fit(X_train, y_train)

# Predict the target using the test data
y_pred = pipe.predict(X_test)

# Evaluate the accuracy of the pipeline
print("GaussianNB:")
print(round(accuracy_score(y_test, y_pred),2),"  ",round(precision_score(y_test,y_pred),2),"  ",round(recall_score(y_test,y_pred),2),"  ",round(f1_score(y_test,y_pred),2))

accuracy  precision  recall  f1_score
LogisticRegression:
0.88    0.85    0.9    0.87
SVC:
0.91    0.88    0.93    0.9
KNeighbors:
0.84    0.83    0.81    0.82
DecisionTree:
0.87    0.86    0.85    0.85
RandomForest:
0.91    0.88    0.93    0.9
GaussianNB:
0.87    0.84    0.88    0.86
