In [2]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

from sklearn.impute import KNNImputer
import plotly.express as px

## Data Ingestion

In [None]:
file = pd.read_csv('heart.csv')

## Preliminary Data Analysis

In [4]:

df = file.copy()
df.head()   

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [5]:
df.shape

(918, 12)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int64  
 1   Sex             918 non-null    object 
 2   ChestPainType   918 non-null    object 
 3   RestingBP       918 non-null    int64  
 4   Cholesterol     918 non-null    int64  
 5   FastingBS       918 non-null    int64  
 6   RestingECG      918 non-null    object 
 7   MaxHR           918 non-null    int64  
 8   ExerciseAngina  918 non-null    object 
 9   Oldpeak         918 non-null    float64
 10  ST_Slope        918 non-null    object 
 11  HeartDisease    918 non-null    int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 86.2+ KB


From the basic infofrmation displayed above, th e dataset appears to have no missing columns

In [7]:
missing_values = df.isna().sum()

print(f'There are no missing values in this dataset')

There are no missing values in this dataset


In [8]:
duplicates = df.duplicated().sum()

if duplicates == 0:
    print(f'There are no duplicates in this dataset')
else:
    print(f'There are {duplicates} duplicates in this dataset')

There are no duplicates in this dataset


## Exploratory Data Analysis

In [9]:
df.describe(include='all')

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
count,918.0,918,918,918.0,918.0,918.0,918,918.0,918,918.0,918,918.0
unique,,2,4,,,,3,,2,,3,
top,,M,ASY,,,,Normal,,N,,Flat,
freq,,725,496,,,,552,,547,,460,
mean,53.510893,,,132.396514,198.799564,0.233115,,136.809368,,0.887364,,0.553377
std,9.432617,,,18.514154,109.384145,0.423046,,25.460334,,1.06657,,0.497414
min,28.0,,,0.0,0.0,0.0,,60.0,,-2.6,,0.0
25%,47.0,,,120.0,173.25,0.0,,120.0,,0.0,,0.0
50%,54.0,,,130.0,223.0,0.0,,138.0,,0.6,,1.0
75%,60.0,,,140.0,267.0,0.0,,156.0,,1.5,,1.0


From the desciption above, it is observed that

|Feature|Observation|Inference/Implication|
|---------|----------|------------|
|Age|Majority of the patients 47 and 60 years|This feature might prove to be a useful indicator|
|Sex|Majority of the patients affected are males|This indicates a possibly imbalanced class or that males are likely more affected than females|
|ChestPainType|Most cases are asymptomatic|It should be carefully watched|
|RestingBP|The least value is 0 and the highest is 200, with majority lying between 120 and 140|This indicates an anomaly beacuse a living human can't have a resting BP of 0; so the column should be adjusted|
|Cholesterol|The least value is 0 and the highest is 60|This also indicates an anomaly because a cholesterol level of 0 is not achievable for any human being; so the column has to be adjusted|
|FastingBS|

In [10]:
df.nunique()

Age                50
Sex                 2
ChestPainType       4
RestingBP          67
Cholesterol       222
FastingBS           2
RestingECG          3
MaxHR             119
ExerciseAngina      2
Oldpeak            53
ST_Slope            3
HeartDisease        2
dtype: int64

## Exploratory Data Analysis

In [11]:
# Divide the dataset into numerical and categorical column

num_cols = df.select_dtypes(include='number').columns
cat_cols = df.select_dtypes(include='object').columns

## Encoding Categorical Variables

In [12]:
# df.Sex = df['Sex'].map(M=0, F=1)
# df.ChestPainType = df['ChestPainType'].map(ATA=0, NAP=1, ASY =2, TA=3)
# df.RestingECG = df['RestingECG'].map(Normal=0, ST=1, LVH=2)

In [13]:
for col in cat_cols:
    print((df[col].unique()), list(range(df[col].nunique())))
    df[col].replace((df[col].unique()), range(df[col].nunique()), inplace=True)
    print('='*50)
    print()

['M' 'F'] [0, 1]

['ATA' 'NAP' 'ASY' 'TA'] [0, 1, 2, 3]

['Normal' 'ST' 'LVH'] [0, 1, 2]

['N' 'Y'] [0, 1]

['Up' 'Flat' 'Down'] [0, 1, 2]



In [14]:
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,0,0,140,289,0,0,172,0,0.0,0,0
1,49,1,1,160,180,0,0,156,0,1.0,1,1
2,37,0,0,130,283,0,1,98,0,0.0,0,0
3,48,1,2,138,214,0,0,108,1,1.5,1,1
4,54,0,1,150,195,0,0,122,0,0.0,0,0


### Handle Anomalies in Cholesterol and RestingBP


For Cholesterol

In [15]:
df.Cholesterol.value_counts()

Cholesterol
0      172
254     11
220     10
223     10
204      9
      ... 
353      1
278      1
157      1
176      1
131      1
Name: count, Length: 222, dtype: int64

It is observed from the info above that there 172 patients recorded to have 0 gram of Chelesterol. This is outrageous, and will  be adjusted with the KNN Imputer

In [16]:
# First convert 0 to null values, and the compute the mean to replace it
df.Cholesterol.replace(0, np.nan, inplace=True)


In [17]:
imputer = KNNImputer(n_neighbors=3)
fitting = imputer.fit_transform(df)
df = pd.DataFrame(fitting, columns=df.columns)

In [18]:
df.Cholesterol.isna().sum()

np.int64(0)

For RestingBP


In [19]:
df.RestingBP.unique()

array([140., 160., 130., 138., 150., 120., 110., 136., 115., 100., 124.,
       113., 125., 145., 112., 132., 118., 170., 142., 190., 135., 180.,
       108., 155., 128., 106.,  92., 200., 122.,  98., 105., 133.,  95.,
        80., 137., 185., 165., 126., 152., 116.,   0., 144., 154., 134.,
       104., 139., 131., 141., 178., 146., 158., 123., 102.,  96., 143.,
       172., 156., 114., 127., 101., 174.,  94., 148., 117., 192., 129.,
       164.])

In [20]:
# First convert 0 to null values, and the compute the mean to replace it
df.RestingBP.replace(0, np.nan, inplace=True)

In [21]:
# Use the KNN
imputer_method = KNNImputer(n_neighbors=3)
fitting = imputer.fit_transform(df)
df = pd.DataFrame(fitting, columns=df.columns)

In [22]:
df.RestingBP.unique()

array([140., 160., 130., 138., 150., 120., 110., 136., 115., 100., 124.,
       113., 125., 145., 112., 132., 118., 170., 142., 190., 135., 180.,
       108., 155., 128., 106.,  92., 200., 122.,  98., 105., 133.,  95.,
        80., 137., 185., 165., 126., 152., 116., 144., 154., 134., 104.,
       139., 131., 141., 178., 146., 158., 123., 102.,  96., 143., 172.,
       156., 114., 127., 101., 174.,  94., 148., 117., 192., 129., 164.])

In [23]:
df.RestingBP.isna().sum()

np.int64(0)

## Convert all Columns to Integer Datatype

In [24]:
all_columns = df.columns
all_columns =  all_columns.drop('HeartDisease')
df[all_columns] = df[all_columns].astype('int32')

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Age             918 non-null    int32  
 1   Sex             918 non-null    int32  
 2   ChestPainType   918 non-null    int32  
 3   RestingBP       918 non-null    int32  
 4   Cholesterol     918 non-null    int32  
 5   FastingBS       918 non-null    int32  
 6   RestingECG      918 non-null    int32  
 7   MaxHR           918 non-null    int32  
 8   ExerciseAngina  918 non-null    int32  
 9   Oldpeak         918 non-null    int32  
 10  ST_Slope        918 non-null    int32  
 11  HeartDisease    918 non-null    float64
dtypes: float64(1), int32(11)
memory usage: 46.7 KB


# Visualization

In [26]:
correlation_matrix = df.corr()
correlation_matrix

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
Age,1.0,-0.05575,0.214164,0.263128,0.057108,0.198039,0.213152,-0.382045,0.215793,0.222111,0.268264,0.282039
Sex,-0.05575,1.0,-0.179766,-0.009658,0.080395,-0.120076,0.018343,0.189186,-0.190664,-0.107969,-0.150693,-0.305445
ChestPainType,0.214164,-0.179766,1.0,0.073016,0.038855,0.155111,0.112067,-0.273076,0.324049,0.278211,0.352364,0.459017
RestingBP,0.263128,-0.009658,0.073016,1.0,0.113194,0.067556,0.094144,-0.109329,0.152621,0.165713,0.082401,0.118333
Cholesterol,0.057108,0.080395,0.038855,0.113194,1.0,0.058798,0.074925,-0.028855,0.083761,0.063703,0.071345,0.101263
FastingBS,0.198039,-0.120076,0.155111,0.067556,0.058798,1.0,0.050707,-0.131438,0.060451,0.041421,0.175774,0.267291
RestingECG,0.213152,0.018343,0.112067,0.094144,0.074925,0.050707,1.0,0.048552,0.036119,0.084853,0.078807,0.061011
MaxHR,-0.382045,0.189186,-0.273076,-0.109329,-0.028855,-0.131438,0.048552,1.0,-0.370425,-0.165156,-0.343419,-0.400421
ExerciseAngina,0.215793,-0.190664,0.324049,0.152621,0.083761,0.060451,0.036119,-0.370425,1.0,0.396723,0.428706,0.494282
Oldpeak,0.222111,-0.107969,0.278211,0.165713,0.063703,0.041421,0.084853,-0.165156,0.396723,1.0,0.488129,0.392385


|Observation|Positive/Negative|Inference|
|Sex vs Age|Negative|Age is not correlated with sex|

In [27]:
# # Heatmap Visualization
# fig = px.imshow(correlation_matrix,
#                 text_auto=True,  
#                 color_continuous_scale='Viridis', 
#                 title='Correlation Heatmap of Dataset Features')

# fig.show()

In [28]:
# import plotly.graph_objects as go

# # Create a heatmap trace
# heatmap_trace = go.Heatmap(z=correlation_matrix.values,
#                            x=correlation_matrix.columns,
#                            y=correlation_matrix.index,
#                            colorscale='Viridis',
#                            colorbar=dict(title='Correlation Coefficient'))

# # Create a figure and add the trace
# fig = go.Figure(data=[heatmap_trace])

# # Update layout for title and axis labels
# fig.update_layout(title='Correlation Heatmap of Dataset Features',
#                   xaxis_title='Features',
#                   yaxis_title='Features')

# fig.show()

In [29]:
df.corr()['HeartDisease'].sort_values()

MaxHR            -0.400421
Sex              -0.305445
RestingECG        0.061011
Cholesterol       0.101263
RestingBP         0.118333
FastingBS         0.267291
Age               0.282039
Oldpeak           0.392385
ChestPainType     0.459017
ExerciseAngina    0.494282
ST_Slope          0.558771
HeartDisease      1.000000
Name: HeartDisease, dtype: float64

In [30]:
# Visualizing the correlation of all the features with respect to the target
px.line(df.corr()['HeartDisease'][:-1].sort_values())

|Observation|Inference|
|----------|---------|
| | |

In [31]:
# Age and HeartDisease Distribution
px.sunburst(df, path=['HeartDisease', 'Age'])

In [32]:
df.to_csv('cleaned_data.csv', index=False)