In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from plotly.offline import iplot
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the heart-disease dataset from the CSV file in the working directory.
df = pd.read_csv("heart.csv")

In [3]:
# Report the dataset dimensions (rows = records, columns = features).
n_records, n_features = df.shape
print(f"• Number of Records: {n_records:,.0f}")
print(f"• Number of Features: {n_features}")

• Number of Records: 918
• Number of Features: 12


In [4]:
# Show each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [5]:
# Preview 10 random rows; the seed keeps the preview identical across re-runs.
df.sample(10, random_state=42)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
58,54,M,ASY,150,365,0,ST,134,N,1.0,Up,0
611,62,M,TA,135,139,0,ST,137,N,0.2,Up,0
659,59,M,NAP,126,218,1,Normal,134,N,2.2,Flat,1
587,37,M,NAP,118,240,0,LVH,165,N,1.0,Flat,0
727,60,F,ASY,158,305,0,LVH,161,N,0.0,Up,1
307,53,M,ATA,130,0,0,ST,120,N,0.7,Down,0
508,67,M,ASY,120,0,1,Normal,150,N,1.5,Down,1
86,65,M,ASY,170,263,1,Normal,112,Y,2.0,Flat,1
917,38,M,NAP,138,175,0,Normal,173,N,0.0,Up,0
585,57,M,ATA,180,285,1,ST,120,N,0.8,Flat,1


In [6]:
# Summary statistics of the numeric columns, transposed so each row is one column.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,918.0,53.510893,9.432617,28.0,47.0,54.0,60.0,77.0
RestingBP,918.0,132.396514,18.514154,0.0,120.0,130.0,140.0,200.0
Cholesterol,918.0,198.799564,109.384145,0.0,173.25,223.0,267.0,603.0
FastingBS,918.0,0.233115,0.423046,0.0,0.0,0.0,0.0,1.0
MaxHR,918.0,136.809368,25.460334,60.0,120.0,138.0,156.0,202.0
Oldpeak,918.0,0.887364,1.06657,-2.6,0.0,0.6,1.5,6.2
HeartDisease,918.0,0.553377,0.497414,0.0,0.0,1.0,1.0,1.0


In [7]:
# Summary of the object (string) columns: count, number of unique values, most frequent value.
df.describe(include="object").T

Unnamed: 0,count,unique,top,freq
Sex,918,2,M,725
ChestPainType,918,4,ASY,496
RestingECG,918,3,Normal,552
ExerciseAngina,918,2,N,547
ST_Slope,918,3,Flat,460


In [8]:
# Count missing (NaN) values in each column.
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [9]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [10]:
# Preview the first rows of the dataset.
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [11]:
# Mean age over all records.
df["Age"].mean()

53.510893246187365

In [12]:
# Median age over all records.
df["Age"].median()

54.0

In [13]:
# Box plot of the Age column.
px.box( x=df["Age"], labels={"x" :"Age"})

In [14]:
# Histogram of the Age column.
px.histogram(df, x="Age")

In [15]:
# Number of records per sex.
df["Sex"].value_counts()

M    725
F    193
Name: Sex, dtype: int64

In [16]:
# Share of each sex as a percentage of all records.
gender = df["Sex"].value_counts(normalize=True).mul(100)
gender

M    78.976035
F    21.023965
Name: Sex, dtype: float64

In [17]:
# Bar chart of the sex distribution (percent), with M/F spelled out on the x-axis.
gender_names = ["Male" if code == "M" else "Female" for code in gender.index]
px.bar(
    gender,
    x=gender_names,
    y=gender,
    color=gender,
    labels={"x": "gender", "y": "Frequency in PCT(%)"},
)

In [18]:
# Number of records per chest pain type.
df["ChestPainType"].value_counts()

ASY    496
NAP    203
ATA    173
TA      46
Name: ChestPainType, dtype: int64

In [19]:
# Share of each chest pain type as a percentage of all records.
ChestPainType = df["ChestPainType"].value_counts(normalize=True).mul(100)
ChestPainType

ASY    54.030501
NAP    22.113290
ATA    18.845316
TA      5.010893
Name: ChestPainType, dtype: float64

In [20]:
# Bar chart of the chest-pain-type shares (percent of all records).
px.bar(
    ChestPainType,
    x=ChestPainType.index,
    y=ChestPainType,
    color=ChestPainType,
    title="Chest Pain Type Frequency",
    labels={"x": "Chest Pain Type", "y": "Frequency in PCT(%)"},
)

In [21]:
# Box plot of resting blood pressure (axis label typo "Bllod" corrected).
px.box(
    x=df["RestingBP"],
    labels={"x": "Resting Blood Pressure"},
)

The box plot shows several outliers.

High blood pressure is a known risk factor for heart disease.
So, let's check how many of the patients with a resting blood pressure above 170 mmHg
(the upper outliers) have heart disease.

In [22]:
# Records whose resting blood pressure exceeds 170.
HighBP = df.loc[df["RestingBP"] > 170]
HighBP

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
109,39,M,ATA,190,241,0,Normal,106,N,0.0,Up,0
123,58,F,ATA,180,393,0,Normal,110,Y,1.0,Flat,1
189,53,M,ASY,180,285,0,ST,120,Y,1.5,Flat,1
190,46,M,ASY,180,280,0,ST,120,N,0.0,Up,0
241,54,M,ASY,200,198,0,Normal,142,Y,2.0,Flat,1
274,45,F,ATA,180,295,0,Normal,180,N,0.0,Up,0
275,59,M,NAP,180,213,0,Normal,100,N,0.0,Up,0
278,57,F,ASY,180,347,0,ST,126,Y,0.8,Flat,0
365,64,F,ASY,200,0,0,Normal,140,Y,1.0,Flat,1
372,63,M,ASY,185,0,0,Normal,98,Y,0.0,Up,1


In [23]:
# Of the high-blood-pressure records, keep those flagged with heart disease.
HighBP_AND_HeartDisease = HighBP.loc[HighBP["HeartDisease"] == 1]
HighBP_AND_HeartDisease

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
123,58,F,ATA,180,393,0,Normal,110,Y,1.0,Flat,1
189,53,M,ASY,180,285,0,ST,120,Y,1.5,Flat,1
241,54,M,ASY,200,198,0,Normal,142,Y,2.0,Flat,1
365,64,F,ASY,200,0,0,Normal,140,Y,1.0,Flat,1
372,63,M,ASY,185,0,0,Normal,98,Y,0.0,Up,1
399,61,M,NAP,200,0,1,ST,70,N,0.0,Flat,1
411,54,M,ASY,180,0,1,Normal,150,N,1.5,Flat,1
475,59,M,ASY,178,0,1,LVH,120,Y,0.0,Flat,1
550,55,M,ASY,172,260,0,Normal,73,N,2.0,Flat,1
585,57,M,ATA,180,285,1,ST,120,N,0.8,Flat,1


In [24]:
# Share (percent) of each heart-disease outcome among patients with RestingBP > 170.
# The values are normalised so they match the "Frequency in PCT(%)" axis label;
# raw counts were plotted under that label before.
high_blood = df.loc[df["RestingBP"] > 170, "HeartDisease"].value_counts(normalize=True) * 100
high_blood_names = ["No Heart Disease" if i == 0 else "Heart Disease" for i in high_blood.index]

px.bar(
    high_blood,
    x=high_blood_names,
    y=high_blood,
    color=high_blood_names,
    title="Frequency of Heart Disease With Blood Pressure > 170",
    labels={"x": "Heart Disease", "y": "Frequency in PCT(%)"},
)

In [25]:
# Box plot of the Cholesterol column.
px.box(x=df["Cholesterol"], labels={"x": "Cholesterol"})

In [26]:
# Histogram of the Cholesterol column.
px.histogram(df, x="Cholesterol")

After a little research, we found that
very high or very low levels of cholesterol in the blood can increase the risk of heart disease.
So, let's check the heart disease status of patients whose cholesterol is greater than 400 mg/dl
or equal to 0.

In [27]:
# Records with a cholesterol reading of 0.
# NOTE(review): a value of 0 is presumably a placeholder for a missing measurement — confirm with the data source.
df.loc[df["Cholesterol"] == 0]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
293,65,M,ASY,115,0,0,Normal,93,Y,0.0,Flat,1
294,32,M,TA,95,0,1,Normal,127,N,0.7,Up,1
295,61,M,ASY,105,0,1,Normal,110,Y,1.5,Up,1
296,50,M,ASY,145,0,1,Normal,139,Y,0.7,Flat,1
297,57,M,ASY,110,0,1,ST,131,Y,1.4,Up,1
...,...,...,...,...,...,...,...,...,...,...,...,...
514,43,M,ASY,122,0,0,Normal,120,N,0.5,Up,1
515,63,M,NAP,130,0,1,ST,160,N,3.0,Flat,0
518,48,M,NAP,102,0,1,ST,110,Y,1.0,Down,1
535,56,M,ASY,130,0,0,LVH,122,Y,1.0,Flat,1


In [28]:
# Heart-disease outcome counts among the records with a cholesterol reading of 0.
df.loc[df["Cholesterol"] == 0, "HeartDisease"].value_counts()

1    152
0     20
Name: HeartDisease, dtype: int64

In [29]:
# Records with cholesterol above 400.
df[df["Cholesterol"] > 400]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
28,53,F,ATA,113,468,0,Normal,127,N,0.0,Up,0
30,53,M,NAP,145,518,0,Normal,130,N,0.0,Flat,1
69,44,M,ASY,150,412,0,Normal,170,N,0.0,Up,0
76,32,M,ASY,118,529,0,Normal,130,N,0.0,Flat,1
103,40,M,ASY,120,466,1,Normal,152,Y,1.0,Flat,1
149,54,M,ASY,130,603,1,Normal,125,Y,1.0,Flat,1
182,52,M,ASY,140,404,0,Normal,124,Y,2.0,Flat,1
250,44,M,ASY,135,491,0,Normal,135,N,0.0,Flat,1
496,58,M,ASY,132,458,1,Normal,69,N,1.0,Down,0
616,67,F,NAP,115,564,0,LVH,160,N,1.6,Flat,0


In [30]:
# Heart-disease flag of every record with cholesterol above 400.
df.loc[df["Cholesterol"] > 400, "HeartDisease"]

28     0
30     1
69     0
76     1
103    1
149    1
182    1
250    1
496    0
616    0
624    1
667    0
796    1
Name: HeartDisease, dtype: int64

In [31]:
# Heart-disease outcome counts among the records with cholesterol above 400.
chol = df.loc[df["Cholesterol"] > 400, "HeartDisease"].value_counts()
chol

1    8
0    5
Name: HeartDisease, dtype: int64

In [32]:
# FastingBS: fasting blood sugar
# 1: if FastingBS > 120 mg/dl,
# 0: otherwise
# The seed keeps the sampled preview identical across re-runs.
df["FastingBS"].sample(20, random_state=42)

176    0
729    0
168    0
596    0
844    0
191    0
656    0
310    0
614    0
771    0
847    0
223    0
803    0
473    1
602    0
821    0
757    0
604    1
475    1
484    0
Name: FastingBS, dtype: int64

In [33]:
# Share of each FastingBS value as a percentage of all records
# (see the column details for what fasting blood sugar means).
fasting_blood_sugar = df["FastingBS"].value_counts(normalize=True).mul(100)
fasting_blood_sugar

0    76.688453
1    23.311547
Name: FastingBS, dtype: float64

In [34]:
# Bar chart of the fasting-blood-sugar shares (percent).
# The per-category arrays are passed directly: the full `df` has one row per
# record and is unrelated to these two-element arrays, so it is no longer
# passed as data_frame. The "Greate" typo in the category label is fixed too.
fasting_names = ["Greater than 120" if i == 1 else "Less Than 120" for i in fasting_blood_sugar.index]
px.bar(
    x=fasting_names,
    y=fasting_blood_sugar,
    color=fasting_names,
    title="Fasting Blood Sugar Frequency",
    labels={"x": "Fasting Blood Sugar", "y": "Frequency in PCT(%)"},
)

In [35]:
# Preview 10 random RestingECG values (see the column details for their meaning).
# The seed keeps the sampled preview identical across re-runs.
df["RestingECG"].sample(10, random_state=42)

164    Normal
246    Normal
368    Normal
882       LVH
830       LVH
513        ST
855       LVH
235    Normal
255    Normal
212    Normal
Name: RestingECG, dtype: object

In [36]:
# Number of records per resting ECG result.
df["RestingECG"].value_counts()

Normal    552
LVH       188
ST        178
Name: RestingECG, dtype: int64

In [37]:
# Share of each resting ECG result as a percentage of all records.
ECG = df["RestingECG"].value_counts(normalize=True).mul(100)
ECG

Normal    60.130719
LVH       20.479303
ST        19.389978
Name: RestingECG, dtype: float64

In [38]:
# Bar chart of the resting ECG shares (percent).
# x is passed explicitly (as a plain list) so the "x" entry in `labels` names
# the axis; without an x argument the axis fell back to the raw index.
px.bar(
    ECG,
    x=ECG.index.tolist(),
    y=ECG,
    color=ECG,
    title="ECG Results Frequency",
    labels={"x": "ECG Results", "y": "Frequency in PCT(%)"},
)


Max Heart Rate

In [39]:
# Preview 10 random MaxHR values; the seed keeps the preview identical across re-runs.
df["MaxHR"].sample(10, random_state=42)

560    137
775    182
308    138
81     154
454    123
382    145
778    130
641    152
892    152
863    155
Name: MaxHR, dtype: int64

In [40]:
# Box plot of the maximum heart rate.
px.box(
    x=df["MaxHR"],
    labels={"x": "Max Heart Rate"},
    title="Max Heart Rate Distribution",
)

In [41]:
# Histogram of the maximum heart rate.
px.histogram(df, x="MaxHR")

After some research, we found that
    lower values might indicate heart disease or poor heart health.
    So, let's check the heart disease status of patients whose max heart rate is below 70.

In [42]:
# Records with a maximum heart rate below 70.
df.loc[df["MaxHR"] < 70]

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
370,60,M,ASY,135,0,0,Normal,63,Y,0.5,Up,1
390,51,M,ASY,140,0,0,Normal,60,N,0.0,Flat,1
402,65,M,ASY,145,0,1,ST,67,N,0.7,Flat,1
496,58,M,ASY,132,458,1,Normal,69,N,1.0,Down,0


Exercise Angina Column 
[Y: Yes, N: No]

In [43]:
# Display the ExerciseAngina column (Y / N values).
df["ExerciseAngina"]

0      N
1      N
2      N
3      Y
4      N
      ..
913    N
914    N
915    Y
916    N
917    N
Name: ExerciseAngina, Length: 918, dtype: object

In [44]:
# Number of records with (Y) and without (N) exercise angina.
ExerciseAngina = df["ExerciseAngina"].value_counts()
ExerciseAngina

N    547
Y    371
Name: ExerciseAngina, dtype: int64

In [45]:
# Percent of records with / without exercise angina, plotted with Yes/No labels.
ExerciseAngina = df["ExerciseAngina"].value_counts(normalize=True).mul(100)
angina_names = ["Yes" if flag == 'Y' else "No" for flag in ExerciseAngina.index]
px.bar(
    ExerciseAngina,
    x=angina_names,
    y=ExerciseAngina,
    color=ExerciseAngina,
    labels={"x": "Exercise Angina", "y": "Frequency in PCT(%)"},
)

In [46]:
# Heart-disease outcome counts among patients with exercise angina.
df.loc[df["ExerciseAngina"] == 'Y', "HeartDisease"].value_counts()

1    316
0     55
Name: HeartDisease, dtype: int64

In [47]:
# Percent of each heart-disease outcome among patients with exercise angina.
# The 0/1 outcome codes are mapped to readable names and passed as x, so the
# bars get categorical labels instead of a numeric axis built from the raw index.
ExerciseAngina_and_HeartDisease = (
    df.loc[df["ExerciseAngina"] == 'Y', "HeartDisease"].value_counts(normalize=True) * 100
)
angina_outcome_names = [
    "Heart Disease" if i == 1 else "No Heart Disease"
    for i in ExerciseAngina_and_HeartDisease.index
]
px.bar(
    ExerciseAngina_and_HeartDisease,
    x=angina_outcome_names,
    y=ExerciseAngina_and_HeartDisease,
    color=ExerciseAngina_and_HeartDisease,
    title="Heart Disease after Exercise Angina",
    labels={"x": "Heart Disease", "y": "Frequency in PCT(%)"},
)

Oldpeak Column
Numeric value of the ST depression measurement

In [48]:
# Display the Oldpeak column.
df["Oldpeak"]

0      0.0
1      1.0
2      0.0
3      1.5
4      0.0
      ... 
913    1.2
914    3.4
915    1.2
916    0.0
917    0.0
Name: Oldpeak, Length: 918, dtype: float64

In [49]:
# Box plot of the Oldpeak values on the white plotly template.
px.box(
    x=df["Oldpeak"],
    labels={"x": "Oldpeak"},
    title="Oldpeak Distribution",
    template="plotly_white",
)

In [50]:
# Histogram of the Oldpeak column.
px.histogram(df, x = "Oldpeak")

In [51]:
# Heart-disease outcome counts among records with Oldpeak above 3.7, shown as a pie chart.
high_oldpeak = df.loc[df["Oldpeak"] > 3.7, "HeartDisease"].value_counts()
oldpeak_outcome_names = [
    "Heart Disease" if code == 1 else "No Heart Disease" for code in high_oldpeak.index
]

px.pie(
    high_oldpeak,
    names=oldpeak_outcome_names,
    values=high_oldpeak,
    title="Frequency of Heart Disease With Oldpeak > 3.7",
)

ST Slope Column 
the slope of the peak exercise ST segment [Up: upsloping, Flat: flat, Down: downsloping]

In [52]:
# Preview 10 random ST_Slope values; the seed keeps the preview identical across re-runs.
df["ST_Slope"].sample(10, random_state=42)

88     Flat
247    Down
753      Up
681      Up
828    Flat
540    Flat
894    Flat
234      Up
883    Flat
92       Up
Name: ST_Slope, dtype: object

In [53]:
# Share of each ST slope category as a percentage of all records.
# x is passed explicitly with its own label so the category axis is titled
# "ST Slope" instead of falling back to the raw index.
s = df["ST_Slope"].value_counts(normalize=True) * 100
px.bar(
    s,
    x=s.index.tolist(),
    y=s,
    color=s,
    labels={"x": "ST Slope", "y": "Frequency in PCT(%)"},
)

In [54]:
# Share of each target class (1 = heart disease, 0 = none) as a percentage of all records.
target = df["HeartDisease"].value_counts(normalize=True).mul(100)
target

1    55.337691
0    44.662309
Name: HeartDisease, dtype: float64

In [55]:
# Pie chart of the target class shares.
# Only the two-element percentage series is plotted, so the 918-row `df` is no
# longer passed as data_frame; the 0/1 codes are shown with readable names.
target_names = ["Heart Disease" if i == 1 else "No Heart Disease" for i in target.index]
px.pie(
    names=target_names,
    values=target,
    title="Heart Disease Frequency in percentage",
)

What is the ratio of male to female patients with heart disease?

In [56]:
# Ratio of male to female patients with heart disease
groupped = df.groupby("Sex", as_index=False)["HeartDisease"].count()
groupped["proportion"] = np.round(groupped["HeartDisease"] / groupped["HeartDisease"].sum() * 100, 2)
groupped.rename(columns={"HeartDisease": "count"}, inplace=True)



In [57]:
# Grouped bar chart of the per-sex proportions computed in `groupped`.
sex_names = ["Male" if code == "M" else "Female" for code in groupped["Sex"]]
sex_figure = px.bar(
    groupped,
    x=sex_names,
    y="proportion",
    color="Sex",
    barmode="group",
    labels={"x": "Gender", "proportion": "Frequency in %"},
)
sex_figure.show()

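Men account for most of the patients shown here, which suggests men are more susceptible to heart disease than women. Note, however, that about 79% of the records in this dataset are male, so the heart-disease rate within each sex should be compared before drawing that conclusion.

Now, what is the frequency of each chest pain type among patients with heart disease?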

In [58]:
# Restrict to patients with heart disease, then compute each chest pain type's share (percent).
f = df["HeartDisease"].eq(1)
dff = df.loc[f].copy()
chest_pain_with_disease = dff["ChestPainType"].value_counts(normalize=True).mul(100)
chest_pain_with_disease

ASY    77.165354
NAP    14.173228
ATA     4.724409
TA      3.937008
Name: ChestPainType, dtype: float64

In [59]:
# Bar chart of chest pain type shares among heart-disease patients, one colour per type.
chest_pain_types = chest_pain_with_disease.index
px.bar(
    chest_pain_with_disease,
    x=chest_pain_types,
    y=chest_pain_with_disease,
    color=chest_pain_types,
    title="Chest Pain Type Via Heart patients",
    labels={"x": "ChestPainType", "y": "Frequency in PCT(%)"},
)

About 77% of the patients with heart disease have the "asymptomatic" (ASY) chest pain type,
which means they do not show any noticeable chest-pain symptoms of the disease.

Among those who have heart disease, what is the frequency of patients with exercise angina?

In [60]:
# Restrict to patients with heart disease, then compute the exercise-angina shares (percent).
f = df["HeartDisease"].eq(1)
dff = df.loc[f].copy()
angina_with_disease = dff["ExerciseAngina"].value_counts(normalize=True).mul(100)
angina_with_disease

Y    62.204724
N    37.795276
Name: ExerciseAngina, dtype: float64

In [61]:
# Bar chart of exercise-angina shares among heart-disease patients, with Yes/No labels.
angina_answer_names = ["Yes" if flag == "Y" else "No" for flag in angina_with_disease.index]
px.bar(
    angina_with_disease,
    x=angina_answer_names,
    y=angina_with_disease,
    color=angina_with_disease.index,
    title="Exercise Angina Via Heart Patients",
    labels={"x": "ExerciseAngina", "y": "Frequency in PCT(%)"},
)

Correlations Between Data 

In [62]:
# Correlations between the numeric columns.
# The frame still contains string columns (Sex, ChestPainType, ...) at this
# point; recent pandas versions raise on them in DataFrame.corr(), so the
# numeric columns are selected explicitly first.
correlations = df.select_dtypes(include="number").corr()
px.imshow(correlations, aspect=True, text_auto="0.3f")


Relation Between Blood Pressure & Age

In [63]:
# Scatter of resting blood pressure against age, coloured by the HeartDisease flag,
# with an ordinary-least-squares trendline.
# NOTE: plotly's "ols" trendline relies on the statsmodels package being installed.
px.scatter(
    df,
    x = "Age",
    y = "RestingBP",
    trendline="ols",
    color = "HeartDisease",
    color_continuous_scale="RdBu",
    title="Blood Pressure Via Age"
)

Relation Between Age & Max Heart Rate

In [64]:
# Scatter of maximum heart rate against age, coloured by the HeartDisease flag,
# with an ordinary-least-squares trendline.
# NOTE: plotly's "ols" trendline relies on the statsmodels package being installed.
px.scatter(
    df,
    x = "Age",
    y = "MaxHR",
    trendline="ols",
    color = "HeartDisease",
    color_continuous_scale="RdBu",
    title="Max Heart Rate Via Age"
)

Categorical Data Encoding

In [65]:
# List the object (string) columns that need encoding.
df.select_dtypes(include="object").columns

Index(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='object')

In [66]:
# One-hot encode the categorical columns, dropping the first level of each.
# drop_first is now the boolean True (the string "True" only worked because any
# non-empty string is truthy), and dtype=int produces 0/1 columns directly
# instead of multiplying the whole frame by 1.
categorical_columns = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
df_encodded = pd.get_dummies(data=df, columns=categorical_columns, drop_first=True, dtype=int)

df_encodded

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
0,40,140,289,0,172,0.0,0,1,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,1,1,0,0,0,1,0,0,1
3,48,138,214,0,108,1.5,1,0,0,0,0,1,0,1,1,0
4,54,150,195,0,122,0.0,0,1,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,1,0,0,1,1,0,0,1,0
914,68,144,193,1,141,3.4,1,1,0,0,0,1,0,0,1,0
915,57,130,131,0,115,1.2,1,1,0,0,0,1,0,1,1,0
916,57,130,236,0,174,0.0,1,0,1,0,0,0,0,0,1,0


Splitting Data Into Train & Test

In [67]:
# Separate the feature matrix from the HeartDisease target column.
y = df_encodded.loc[:, "HeartDisease"]
X = df_encodded.drop("HeartDisease", axis=1)

In [68]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=3)

In [69]:
# XGBoost classifier with fixed hyperparameters, collected in one dict for readability.
xgb_params = {
    "objective": "binary:logistic",
    "subsample": 0.7,
    "min_child_weight": 7,
    "max_depth": 3,
    "learning_rate": 0.1,
    "gamma": 0.0,
    "colsample_bytree": 0.7,
}
model = XGBClassifier(**xgb_params)


In [70]:
# 10-fold shuffled cross-validation of the model on the full encoded dataset.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=kf)

mean_cv_score = scores.mean() * 100
print(f"CROSS VALIDATION SCORE: {mean_cv_score:0.2f}%")

CROSS VALIDATION SCORE: 87.69%


In [71]:
# Fit on the training split and report the accuracy on that same split.
model.fit(X_train, y_train)
train_score = 100 * model.score(X_train, y_train)
print(f"TRAIN SCORE {train_score:0.2f}%")

TRAIN SCORE 89.65%


In [72]:
# Predict the held-out test split and report its accuracy.
predictions = model.predict(X_test)
test_score = 100 * accuracy_score(y_test, predictions)
print(f"TEST SCORE {test_score:0.2f}%")

TEST SCORE 94.02%


In [73]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94        86
           1       0.95      0.94      0.94        98

    accuracy                           0.94       184
   macro avg       0.94      0.94      0.94       184
weighted avg       0.94      0.94      0.94       184



In [74]:
# Confusion matrix on the test split.
# The class order is fixed explicitly and the tick names are listed in that same
# order. Deriving the names from df["HeartDisease"].unique() depended on which
# class happens to appear first in the data, which could silently swap the labels.
class_order = [0, 1]
ticks = ["No Heart Disease", "Heart Disease"]
cm = confusion_matrix(y_test, predictions, labels=class_order)

# Rows of a scikit-learn confusion matrix are the true classes, columns the predicted ones.
px.imshow(cm, aspect=True, text_auto=True, x=ticks, y=ticks,
          labels={"x": "Predicted", "y": "Actual"},
          color_continuous_scale="RdBu", title="Confusion Matrix")

In [75]:
import pickle

# Persist the fitted model and the training column order for the Streamlit app.
for artifact_path, artifact in (("model.pkl", model), ("Inputs.pkl", X_train.columns)):
    with open(artifact_path, "wb") as handle:
        pickle.dump(artifact, handle)

In [76]:
%%writefile streamlit_app.py
import pickle
import pandas as pd
import numpy as np
import streamlit as st

# Load the trained model and the training column order written by the notebook.
# NOTE: pickle.load can execute arbitrary code from the file; only load
# artifacts you produced yourself.
with open('model.pkl', 'rb') as f:
    Model = pickle.load(f)
with open('Inputs.pkl', 'rb') as f:
    Inputs = pickle.load(f)
    
def prediction(Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope):
    """Predict heart disease for one patient described by the raw form values.

    The model was trained on a one-hot encoded frame whose column order is
    stored in ``Inputs``. Running ``pd.get_dummies(..., drop_first=True)`` on a
    single row drops the only level of every categorical column, so all dummy
    features reached the model as NaN and the categorical answers were
    ignored. The row is therefore encoded by hand against ``Inputs``.

    Parameters
    ----------
    Age, RestingBP, Cholesterol, MaxHR, Oldpeak : numeric
        Passed to the model unchanged.
    Sex : str
        "Male" is encoded as level "M"; any other value as "F".
    FastingBS : str
        "Yes" is encoded as 1; any other value as 0.
    ExerciseAngina : str
        "Yes" is encoded as level "Y"; any other value as "N".
    ChestPainType, RestingECG, ST_Slope : str
        Category level as it appears in the training data (e.g. "ATA").

    Returns
    -------
    The first element of ``Model.predict`` for the encoded row.
    """
    # Start with every training column at 0 so each dummy column exists.
    row = dict.fromkeys(Inputs, 0)
    row.update({
        "Age": Age,
        "RestingBP": RestingBP,
        "Cholesterol": Cholesterol,
        "FastingBS": 1 if FastingBS == "Yes" else 0,
        "MaxHR": MaxHR,
        "Oldpeak": Oldpeak,
    })

    categorical_levels = {
        "Sex": "M" if Sex == "Male" else "F",
        "ChestPainType": ChestPainType,
        "RestingECG": RestingECG,
        "ExerciseAngina": "Y" if ExerciseAngina == "Yes" else "N",
        "ST_Slope": ST_Slope,
    }
    for feature, level in categorical_levels.items():
        dummy_column = f"{feature}_{level}"
        # The baseline level of each feature has no dummy column (it was
        # dropped during training) and is represented by all zeros.
        if dummy_column in row:
            row[dummy_column] = 1

    # Build the single-row frame in exactly the training column order.
    features = pd.DataFrame([row], columns=list(Inputs))

    result = Model.predict(features)[0]

    return result

def Main():
    """Render the Streamlit input form and display the model's prediction.

    Collects the eleven patient attributes through select boxes and sliders,
    and on "Predict" passes them to ``prediction`` and prints the class name.
    """
    st.title("Heart Failure Prediction")
    Sex = st.selectbox("Gender",['Male', 'Female'])
    ChestPainType = st.selectbox("Chest Pain Type",['ASY', 'NAP', 'ATA', 'TA'])
    # Every slider default must lie inside [min_value, max_value]; the previous
    # default of 1.0 was below the minimum for Age and MaxHR, which Streamlit
    # rejects with an exception when the page renders.
    Age = st.slider("Age", min_value=15.0, max_value=100.0, step=1.0, value=50.0)
    RestingBP = st.slider("Resting Blood Pressure", min_value=0.0, max_value=200.0, step=1.0, value=130.0)
    Cholesterol = st.slider("Cholesterol Level", min_value=0.0, max_value=600.0, step=1.0, value=220.0)
    FastingBS = st.selectbox("Fasting Blood Sugar > 120 mg/dl",['Yes','No'])
    RestingECG = st.selectbox("Resting Electrocardiogram Result",['Normal', 'LVH','ST'])
    MaxHR = st.slider("MaxHR", min_value=60.0, max_value=202.0, step=1.0, value=140.0)
    ExerciseAngina = st.selectbox("Exercise Angina",['Yes','No'])
    # Oldpeak is recorded to one decimal place in the training data, so a step
    # of 0.1 lets the user enter realistic values.
    Oldpeak = st.slider("Old Peak", min_value=-2.6, max_value=6.2, step=0.1, value=1.0)
    ST_Slope = st.selectbox("ST Slope",['Flat','Up','Down'])

    if st.button("Predict"):
        result = prediction(Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope)
        list_result = ["No Heart Disease" , "Heart Disease"]
        # int() makes the list lookup independent of the numeric type the model returns.
        st.text(list_result[int(result)])
Main()


Overwriting streamlit_app.py
