In [1]:
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
from zipfile import ZipFile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix, ConfusionMatrixDisplay
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

## Data Ingestion

In [2]:
# Build the archive location from the current user's home directory instead of
# hard-coding one machine's absolute path (which also exposed a username).
# Kept as a plain string so later cells that pass `path` around are unaffected.
# NOTE(review): assumes the archive lives in ~/Downloads — adjust if it is stored elsewhere.
path = str(Path.home() / "Downloads" / "heart_failure_dataset.zip")

# List the archive's members to confirm which file holds the data.
with ZipFile(path , 'r') as zippath:
    zippath.printdir()

   

File Name                                             Modified             Size
heart.csv                                      2025-10-19 16:54:20        35921


In [3]:
# Read heart.csv straight out of the zip archive (no extraction to disk) into
# a DataFrame; both the archive and the member handle are closed on exit.
with ZipFile(path, 'r') as archive, archive.open('heart.csv') as csv_member:
    data = pd.read_csv(csv_member)

## Preliminary Data Analysis

In [4]:
# Work on an independent (deep) copy so the loaded `data` frame stays untouched.
df = data.copy(deep=True)
# Preview the first five rows.
df.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [5]:
# (rows, columns) of the working frame.
df.shape

(918, 12)

In [7]:
# Column names, non-null counts, dtypes, and memory usage of the working frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


In [16]:
# Per-column count of missing entries (kept under its original name).
missing_values = df.isna().sum()
# Grand total across all columns; the message below depends on this value
# instead of asserting "no missing values" unconditionally.
total_missing = int(missing_values.sum())

if total_missing == 0:
    print('There are no missing values in this dataset')
else:
    print(f'There are {total_missing} missing values in this dataset')
    # Show only the columns that actually contain gaps.
    print(missing_values[missing_values > 0])

There are no missing values in this dataset


In [None]:
# Number of rows that are exact repeats of an earlier row.
duplicates = df.duplicated().sum()

# Report either the absence of duplicates or how many were found.
print(
    f'There are {duplicates} duplicates in this dataset'
    if duplicates != 0
    else f'There are no duplicates in this dataset'
)

There are no duplicates in this dataset


## Exploratory Data Analysis

In [17]:
# Summary statistics for every column: include='all' adds the object columns
# (unique/top/freq) alongside the numeric ones (mean/std/quantiles).
df.describe(include='all')

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,918.0,918,918,918.0,918.0,918.0,918,918.0,918,918.0,918,918.0
unique,,2,4,,,,3,,2,,3,
top,,M,ASY,,,,Normal,,N,,Flat,
freq,,725,496,,,,552,,547,,460,
mean,53.510893,,,132.396514,198.799564,0.233115,,136.809368,,0.887364,,0.553377
std,9.432617,,,18.514154,109.384145,0.423046,,25.460334,,1.06657,,0.497414
min,28.0,,,0.0,0.0,0.0,,60.0,,-2.6,,0.0
25%,47.0,,,120.0,173.25,0.0,,120.0,,0.0,,0.0
50%,54.0,,,130.0,223.0,0.0,,138.0,,0.6,,1.0
75%,60.0,,,140.0,267.0,0.0,,156.0,,1.5,,1.0


From the description above, it is observed that:

|Feature|Observation|Inference/Implication|
|---------|----------|------------|
|Age|About half of the patients are between 47 and 60 years old (the interquartile range)|This feature might prove to be a useful indicator|
|Sex|The majority of the patients in the dataset are male (725 of 918)|This indicates a possibly imbalanced feature, or that males are more likely to be affected than females|
|ChestPainType|Most cases are asymptomatic|It should be carefully watched|
|RestingBP|The lowest value is 0 and the highest is 200, with the majority lying between 120 and 140|This indicates possible outliers (a resting blood pressure of 0 is not physiologically plausible)|
|Cholesterol|The lowest value is 0 and the highest is 603|This indicates possible outliers and skewness|
|FastingBS|