<a href="https://colab.research.google.com/github/Dilipd0812/Objective-Learn-how-to-clean-and-prepare-raw-data-for-ML./blob/main/Heart_Disease_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Bootstrap cell: pulls the Kaggle data source this notebook reads.
# Run it once before anything else; it may be deleted afterwards.
# This environment is not the same as Kaggle's Python image, so some
# libraries the notebook expects may be missing here.
import kagglehub

johnsmith88_heart_disease_dataset_path = kagglehub.dataset_download(
    'johnsmith88/heart-disease-dataset'
)

print('Data source import complete.')


<div style="background-color: #f0f0f0; color: #333; text-align: center; padding: 15px; border-radius: 12px; font-size: 22px; font-weight: bold; width: 60%; margin: 20px auto; box-shadow: 2px 2px 10px rgba(0,0,0,0.1);">
    📂 Import libraries
</div>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, HTML
import warnings
warnings.filterwarnings('ignore')

<div style="background-color: #e6f2ff; color: #003366; text-align: center; padding: 15px; border-radius: 10px; font-size: 20px; font-weight: bold; width: 70%; margin: 20px auto; box-shadow: 2px 2px 8px rgba(0,0,0,0.1);">
    🔍 Load the Dataset and Initial Data Exploration
</div>

In [None]:
# Read heart.csv from the directory returned by kagglehub.dataset_download()
# in the first cell rather than from a hard-coded '/kaggle/input/...' path,
# so the load works wherever kagglehub placed the files.
Data = pd.read_csv(f"{johnsmith88_heart_disease_dataset_path}/heart.csv")

# Work on an independent copy: the relabelling cells below assign into `df`,
# and the raw frame `Data` should stay exactly as loaded.
df = Data.copy()

In [None]:
# First five rows of the loaded frame.
df.head(n=5)

In [None]:
# Last five rows of the loaded frame.
df.tail(n=5)

<div style="background-color: #e0f7fa; color: #006064; text-align: center; padding: 10px; border-radius: 10px; font-size: 18px; font-weight: bold; width: 60%; margin: 10px auto; box-shadow: 1px 1px 5px rgba(0,0,0,0.1);">
    🧾 Get Some Information About Dataset
</div>

In [None]:
# Report the number of rows and columns of the raw frame.
rows = df.shape[0]
columns = df.shape[1]
print(
    f"📊 Dataset Dimensions:\n------------------------\n🧾 Rows:    {rows}\n📁 Columns: {columns}"
)
print(
    f" ✅ The dataset consists of {rows} patients with {columns} features."
)

<div style="background-color: #e0f7fa; color: #006064; text-align: center; padding: 10px; border-radius: 10px; font-size: 18px; font-weight: bold; width: 60%; margin: 10px auto; box-shadow: 1px 1px 5px rgba(0,0,0,0.1);">
    ✅ Check the data type
</div>

In [None]:
# Print pandas' summary of the frame: column names, non-null counts,
# the dtype of each column, and memory usage.
df.info()

<div style="background-color: #e0f7fa; color: #006064; text-align: center; padding: 10px; border-radius: 10px; font-size: 18px; font-weight: bold; width: 50%; margin: 20px auto; box-shadow: 1px 1px 5px rgba(0,0,0,0.0);">
    ✅ Statistics summary
</div>

In [None]:
# Summary statistics for every column, whatever its dtype.
df.describe(include="all")

In [None]:
# How many distinct values each column holds.
unique_per_feature = df.nunique()
print("🔢 Number of Unique Values for Each Feature:\n")
print(unique_per_feature)

In [None]:
# Frequency table of every column, separated by a dashed rule.
separator = '-' * 40
for col in df.columns:
    counts = df[col].value_counts()
    print('---🔢', counts)
    print(separator)

<div style="background-color: #f0f0f0; color: #333; text-align: center; padding: 15px; border-radius: 12px; font-size: 22px; font-weight: bold; width: 60%; margin: 20px auto; box-shadow: 2px 2px 10px rgba(0,0,0,0.1);">
    🔍 Checking Missing (NaN) Values in Each Column
</div>

In [None]:
# Count of missing (NaN) entries per column.
df.isnull().sum()

In [None]:
# Rows containing at least one missing value (empty when the data is complete).
# `display` is already imported in the notebook's import cell, so it is not
# re-imported here.
rows_with_missing = df[df.isna().any(axis=1)]

display(rows_with_missing)

<div style="background-color: #f0f0f0; color: #333; text-align: left; padding: 10px; border-radius: 12px; font-size: 22px; font-weight: bold; width: 60%; margin: 5px auto; box-shadow: 2px 2px 10px rgba(0,0,0,0.1);">
    <span style="font-size: 20px;">✨</span> There are no missing values in the dataset.
</div>

<div style="text-align: left; font-size: 22px; font-weight: bold; padding: 10px; color: #333; background-color: #f0f0f0;">
    <h6>✔️Transform data into Categorical and Continuous features:</h6>
</div>

### - Change the labels and columns' name for better visualisation and interpretation

In [None]:
# Replace the integer codes of the categorical columns with readable labels.
# The heart-disease dataset codes sex as 1 = male and 0 = female; the previous
# mapping had the two labels swapped, which mislabelled every sex-based plot.
# NOTE(review): narrative cells that compare males and females were written
# against the swapped labels and should be re-read after re-running.
df['sex'] = df['sex'].replace({0:'female',1:'male'})
df['cp'] = df['cp'].replace({0:'Typical angina',1:'Atypical angina',2:'Non-anginal pain',3:'Asymptomatic'})
df['fbs'] = df['fbs'].replace({0:'False',1:'True'})
df['slope'] = df['slope'].replace({0:'Upsloping',1:'Flat',2:'Downsloping'})
# NOTE(review): the thal code-to-label mapping is kept as written — confirm
# it against the dataset documentation.
df['thal'] = df['thal'].replace({0:'Normal',1:'Fixeddefect',2:'Reversible defect',3:'Not described'})
# exang and restecg are converted to strings first, then relabelled.
df['exang'] = df['exang'].astype(str).replace({'0': 'No', '1': 'Yes'})
df['restecg'] = df['restecg'].astype(str).replace({'0': 'Normal', '1': 'ST_T_Wave_Abnormality','2':'probable_or_Definite_LVH'})

In [None]:
# Give the abbreviated columns descriptive names (in place).
# Columns not listed here ('age', 'sex') keep their original names.
df.rename(
    columns=dict(
        cp='chest_pain_type',
        trestbps='resting_bp',
        chol='cholesterol',
        fbs='fasting_blood_sugar',
        restecg='rest_ecg_result',
        thalach='max_heart_rate',
        exang='exercise_induced_angina',
        oldpeak='st_depression',
        slope='st_slop',
        ca='num_major_vessels',
        thal='thalium_result',
        target='heart_disease',
    ),
    inplace=True,
)

<div style="text-align: left; font-size: 18px; font-weight: bold; padding: 10px 20px; color: #3e4a5b; background-color: #e1f5fe; border-radius: 10px; box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1); width: fit-content; margin: 15px auto;">
    <h3 style="margin: 0;">Dataset Features Breakdown</h3>
</div>

<div style="text-align: left; font-size: 16px; font-weight: 500; padding: 10px; background-color: #fff9c4; border-radius: 8px; margin: 10px 0; box-shadow: 0 2px 5px rgba(0,0,0,0.1);">
    <h4 style="color: #388e3c; margin: 0;">🟢 Categorical Features:</h4>
    <ul style="font-size: 14px; color: #333;">
        <li>Sex</li>
        <li>ChestPain type(cp)</li>
        <li>FastingBS level (fbs)</li>
        <li>RestingECG result (restecg)</li>
        <li>Exercise_induced_Angina (exang)</li>
        <li>ST_Slope (slope)</li>
        <li>num_major_vessels (ca)</li>
        <li>Thalium result(thal)</li>
        <li>HeartDisease (target)</li>
    </ul>
</div>

<div style="text-align: left; font-size: 16px; font-weight: 500; padding: 10px; background-color: #e8f5e9; border-radius: 8px; margin: 10px 0; box-shadow: 0 2px 5px rgba(0,0,0,0.1);">
    <h4 style="color: #388e3c; margin: 0;">🟢 Continuous Features:</h4>
    <ul style="font-size: 14px; color: #333;">
        <li>Age</li>
        <li>RestingBP (trestbps)</li>
        <li>Cholesterol (chol)</li>
        <li>st_depression (oldpeak)</li>
        <li>Maximum heart rate achieved during a stress test (thalach)</li>
    </ul>
</div>

In [None]:
# Display the column index to confirm the names produced by the rename cell.
df.columns

In [None]:
# Columns treated as categorical; the target 'heart_disease' is kept last
# because later cells slice it off with [:-1].
col_categorical = [
    'sex',
    'chest_pain_type',
    'fasting_blood_sugar',
    'rest_ecg_result',
    'exercise_induced_angina',
    'st_slop',
    'num_major_vessels',
    'thalium_result',
    'heart_disease',
]
df_categorical = df.loc[:, col_categorical].copy()

In [None]:
# Columns treated as continuous; 'max_heart_rate' is kept last because the
# scatter cells below drop it with [:-1] and use it as the y-axis.
col_continuous = [
    'age',
    'resting_bp',
    'cholesterol',
    'st_depression',
    'max_heart_rate',
]
df_continuous = df.loc[:, col_continuous].copy()

<div style="text-align: left; font-size: 22px; font-weight: bold; padding: 10px; color: #333; background-color: #fff9c4;">
    <h6>✔️Now Check dataset for Noise and Outlier:</h6>
</div>

In [None]:
# One colour per panel; also reused by the post-cleaning scatter cell below.
colors = ['#4db6ac', '#64b5f6', '#ba68c8', '#f06292', '#ffd54f']

# Every continuous column except the last one ('max_heart_rate'), which is the
# shared y-axis. This relies on it being the final entry of col_continuous.
cols = col_continuous[:-1]

fig, axes = plt.subplots(ncols=len(cols), figsize=(16, 5), dpi=120)

# One scatter panel per feature, each plotted against max heart rate.
for idx, (ax, col) in enumerate(zip(axes, cols)):
    shade = colors[idx % len(colors)]
    ax.scatter(
        df_continuous[col],
        df_continuous['max_heart_rate'],
        color=shade,
        alpha=0.7,
        edgecolors='k',
        s=40,
    )
    ax.set_title(f"{col} vs max heart rate", fontsize=15, color=shade)
    ax.set_xlabel(col, fontsize=9)
    if idx == 0:
        # Only the left-most panel carries the y-axis label.
        ax.set_ylabel('Max Heart Rate', fontsize=9)
    ax.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)

plt.tight_layout()
plt.show()

In [None]:
# Show the four most extreme values of each continuous feature so that
# candidate outliers can be inspected by eye.
lowest_heart_rate = df['max_heart_rate'].nsmallest(4)
lowest_age = df['age'].nsmallest(4)
highest_cholesterol = df['cholesterol'].nlargest(4)
highest_st_depression = df['st_depression'].nlargest(4)
highest_resting_bp = df['resting_bp'].nlargest(4)

print('4 lowest max_heart_rate:\n', lowest_heart_rate)
print('------------------\n 4 lowest age:\n', lowest_age)
print('------------------\n 4 Highest Cholestrol:\n', highest_cholesterol)
print('------------------\n 4 Highest st_depression:\n', highest_st_depression)
print('------------------\n 4 Highest resting_bp:\n', highest_resting_bp)

### After identifying outliers in the data, we can remove or otherwise handle them to improve the analysis and model performance.

In [None]:
# Keep rows whose max heart rate is strictly above 71.
# NOTE(review): threshold presumably read off the nsmallest output above — confirm.
df0 = df[df['max_heart_rate'].gt(71)]

In [None]:
# Keep rows whose cholesterol is strictly below 564.
# NOTE(review): threshold presumably read off the nlargest output above — confirm.
df1 = df0[df0['cholesterol'].lt(564)]

In [None]:
# Keep rows whose ST depression is strictly below 5.6.
# NOTE(review): threshold presumably read off the nlargest output above — confirm.
df2 = df1[df1['st_depression'].lt(5.6)]

In [None]:
# Keep rows whose resting blood pressure is strictly below 192.
# NOTE(review): threshold presumably read off the nlargest output above — confirm.
df3 = df2[df2['resting_bp'].lt(192)]

In [None]:
# Rebuild the categorical view from the outlier-filtered frame.
df_categorical = df3.loc[:, col_categorical].copy()

In [None]:
# Rebuild the continuous view from the outlier-filtered frame.
df_continuous = df3.loc[:, col_continuous].copy()

In [None]:
# Same scatter grid as before the outlier filter, now on the filtered data.
# Uses `cols` and `colors` as defined in the earlier scatter cell.
fig, axes = plt.subplots(ncols=len(cols), figsize=(16, 5), dpi=120)

for position, column in enumerate(cols):
    panel = axes[position]
    tint = colors[position % len(colors)]
    panel.scatter(
        df_continuous[column],
        df_continuous['max_heart_rate'],
        color=tint,
        alpha=0.7,
        edgecolors='k',
        s=40,
    )
    panel.set_title(f"{column} vs max heart rate", fontsize=15, color=tint)
    panel.set_xlabel(column, fontsize=9)
    if position == 0:
        # Only the left-most panel carries the y-axis label.
        panel.set_ylabel('Max Heart Rate', fontsize=9)
    panel.grid(True, linestyle='--', linewidth=0.5, alpha=0.6)

plt.tight_layout()
plt.show()

<div style="text-align: left; font-size: 22px; font-weight: bold; padding: 10px; color: #333; background-color: #fff9c4;">
    <h4>🧹Check for Duplicate rows:</h4>
</div>

In [None]:
# Number of rows that exactly repeat an earlier row.
df3.duplicated(keep='first').sum()

In [None]:
# Drop exact duplicate rows. The trailing .copy() makes df3 an independent
# frame, so later column assignments on it are unambiguous.
df3 = df3.drop_duplicates().copy()

# df_categorical / df_continuous were built from df3 before this step; rebuild
# them so the statistics, histograms and correlations computed from them below
# describe the de-duplicated data rather than the frame that still had repeats.
df_categorical = df3[col_categorical].copy()
df_continuous = df3[col_continuous].copy()

Rows,Columns = df3.shape
print('Rows:',Rows,'\nColumns:',Columns)
print(f" ✅ Now,the dataset consists of {Rows} patients with {Columns} features.")

<div style="text-align: left; font-size: 24px; font-weight: 600; padding: 15px 25px; color: #3e4a5b; background-color: #fff9c4; border-radius: 50px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); width: fit-content; margin: 20px auto;">
    <h4 style="margin: 0;">✨Dataset is Clean and Reliable for Analysis✨</h4>
</div>

In [None]:
# Frequency table of every categorical column in the cleaned frame.
rule = '-' * 40
for col in col_categorical:
    frequencies = df3[col].value_counts()
    print('🔢', frequencies)
    print(rule)

### ✨The proportion of females doing the test is higher than that of males✨

<div style="background-color: #f0f0f0; color: #333; text-align: left; padding: 10px; border-radius: 12px; font-size: 22px; font-weight: bold; width: 60%; margin: 5px auto; box-shadow: 2px 2px 10px rgba(0,0,0,0.1);">
    <span style="font-size: 20px;">✨</span>Analyse Categorical features✨
</div>

In [None]:
# Count / unique / top / freq summary of the object- and category-typed columns.
df_categorical.describe(include=["object", "category"])

In [None]:
# Palettes: pastel for the bars, husl for the pie slices.
bar_palette = sns.color_palette("pastel")
pie_palette = sns.color_palette("husl", n_colors=10)

# One figure per categorical feature: counts as bars (left) and shares as a pie (right).
for feature in col_categorical:
    fig, axes = plt.subplots(1, 2, figsize=(20, 6), dpi=130)
    bar_ax, pie_ax = axes[0], axes[1]
    fig.suptitle(feature, fontsize=16, fontweight='bold', color='#333')

    # Left panel: bar chart with the count written on top of each bar.
    sns.countplot(data=df3, x=feature, ax=bar_ax, palette=bar_palette, width=0.4)
    for container in bar_ax.containers:
        bar_ax.bar_label(container, fontsize=10)
    bar_ax.set_title('Bar Chart', fontsize=13, fontweight='semibold')
    bar_ax.set_xlabel(feature, fontsize=11)
    bar_ax.set_ylabel('Count', fontsize=11)
    bar_ax.tick_params(axis='x', rotation=45)

    # Right panel: pie chart of the same value counts.
    shares = df3[feature].value_counts()
    shares.plot.pie(
        autopct='%1.1f%%',
        startangle=90,
        ax=pie_ax,
        colors=pie_palette,
        textprops={'fontsize': 10},
    )
    pie_ax.set_ylabel('')
    pie_ax.set_title('Pie Chart', fontsize=13, fontweight='semibold')

    # Leave room for the figure-level title.
    plt.tight_layout(pad=3)
    plt.subplots_adjust(top=0.85)
    plt.show()

In [None]:
# Split the cleaned frame on the target: rows with heart_disease == 1
# and rows with heart_disease == 0.
has_disease = df3['heart_disease'] == 1
lacks_disease = df3['heart_disease'] == 0
disease = df3[has_disease]
no_disease = df3[lacks_disease]

In [None]:
# For every categorical feature except the target itself, draw four panels:
# bar + pie of the feature's value counts among patients with heart disease
# (panels 0-1) and among patients without it (panels 2-3).
for feature in col_categorical[:-1]:

    fig, ax = plt.subplots(1, 4, figsize=(20, 6), dpi=130)
    fig.suptitle(f" Distribution of '{feature.upper()}' in Heart Disease vs No Disease", fontsize=20, fontweight='bold', color='#2c3e50', y=1.12)

    disease_counts = disease[feature].value_counts()
    no_disease_counts = no_disease[feature].value_counts()

    # (value counts, axes for the bar chart, axes for the pie chart)
    panels = (
        (disease_counts, ax[0], ax[1]),
        (no_disease_counts, ax[2], ax[3]),
    )

    for counts, bar_ax, pie_ax in panels:
        # Bar chart of the counts.
        counts.plot(kind='bar', color=sns.color_palette("tab10"), ax=bar_ax, edgecolor='black')

        # Write each count just above its bar.
        for position, count in enumerate(counts):
            bar_ax.text(position, count + 0.5, str(count), ha='center', fontsize=12, fontweight='bold')

        # Pie chart of the same counts.
        counts.plot(kind='pie', autopct='%.1f%%', ax=pie_ax, startangle=90, textprops={'fontsize': 12, 'weight': 'bold'})
        pie_ax.set_ylabel('')

        # Axis cosmetics for the bar chart.
        bar_ax.set_xlabel(feature, fontsize=13, fontweight='bold')
        bar_ax.set_ylabel('Count', fontsize=13, fontweight='bold')
        bar_ax.tick_params(axis='x', labelrotation=45, labelsize=11)
        bar_ax.tick_params(axis='y', labelsize=11)

    # Section titles above the left and right pair of panels.
    fig.text(0.23, 0.97, f"Patients with Heart Disease by {feature}", ha='center', fontsize=15, fontweight='bold', color='crimson')
    fig.text(0.73, 0.97, f"Patients without Heart Disease by {feature}", ha='center', fontsize=15, fontweight='bold', color='crimson')

    plt.tight_layout()
    plt.show()

<div style="background-color: #f0f0f0; color: #333; text-align: left; padding: 10px; border-radius: 12px; font-size: 22px; font-weight: bold; width: 60%; margin: 5px auto; box-shadow: 2px 2px 10px rgba(0,0,0,0.1);">
    <span style="font-size: 20px;">✨</span>Analyse continuous features✨
</div>

In [None]:
# Summary statistics of the continuous features, one feature per row.
df_continuous.describe().transpose()

<div style="background-color: pink; color: #333; text-align: left; padding: 10px; border-radius: 12px; font-size: 20px; font-weight: bold; width: 60%; margin: 5px auto; box-shadow: 2px 2px 10px rgba(0,0,0,0.1);">
    <span style="font-size: 20px;"></span>✨Statistical summary of the numerical features✨
</div>

<div style="text-align: left; font-size: 16px; font-weight: bold; padding: 10px; color: #333; background-color: #fafad2;">
    <ul>
        <li><strong>Age:</strong>
            <div style="font-weight: normal; margin-left: 20px;">
                The average age in the dataset is 54 years.
            </div>
            <div style="font-weight: normal; margin-left: 20px;">
                The oldest patient is 77 years old and the youngest is 29 years old.
            </div>
        </li>
        <li><strong>Resting blood pressure:</strong>
            <div style="font-weight: normal; margin-left: 20px;">
                The average is 131, the maximum is 180 and the minimum is 94.
            </div>
        </li>
        <li><strong>Cholesterol:</strong>
            <div style="font-weight: normal; margin-left: 20px;">
                The average recorded cholesterol level is 245.34.
            </div>
            <div style="font-weight: normal; margin-left: 20px;">
                The maximum level is 417 and the minimum level is 126.
            </div>
            <div style="font-weight: normal; margin-left: 20px;">
                Note: According to research, a healthy cholesterol level is &lt;200 mg/dl, and a high cholesterol level is usually associated with heart disease.
            </div>
        </li>
        <li><strong>St_depression:</strong>
            <div style="font-weight: normal; margin-left: 20px;">
                The average value of st_depression is 0.999. The maximum is 4.4 and the minimum is 0.
            </div>
        </li>
        <li><strong>Max heart rate achieved:</strong>
            <div style="font-weight: normal; margin-left: 20px;">
                The average max heart rate recorded is 149.93 bpm. The maximum and the minimum are 202 and 88 bpm respectively.
            </div>
        </li>
    </ul>
</div>

In [None]:
# Histogram of every continuous feature, ten bins each.
df_continuous.hist(figsize=(12, 10), bins=10)
plt.show()

In [None]:
# Pairwise scatter/density grid of the cleaned frame, coloured by the target
# (blue for heart_disease == 0, red for heart_disease == 1).
g = sns.pairplot(
    df3,
    hue='heart_disease',
    palette={0: 'blue', 1: 'red'},
)

plt.suptitle(
    "Feature distributions and relatioships by heart disease",
    fontsize=16,
    y=1.02,
)
plt.show()

In [None]:
# Correlation matrix of the continuous features, drawn as an annotated
# heatmap on a fixed -1..1 colour scale centred at 0.
corr = df_continuous.corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    corr,
    annot=True,
    cmap='RdBu_r',
    center=0,
    vmin=-1,
    vmax=1,
    square=True,
    fmt=".1f",
    ax=ax,
)
ax.set_title('Correlation of continnous features')
plt.show()

<div style="text-align: left; font-size: 16px; font-weight: bold; padding: 10px; color: #333; background-color: #fff9c4;">
    ✔️ This plot shows the correlation between the continuous features (age, resting_bp, cholesterol, st_depression and max heart rate):
    <ul>
        <li><strong>Age and Max Heart Rate:</strong>
            <div style="font-weight: normal; margin-left: 20px;">
                Moderate negative correlation (-0.4), indicating that max heart rate tends to decrease as age increases. As people get older, their physical health and max heart rate may slowly decrease.
            </div>
        </li>
        <li><strong>ST Depression and Max Heart Rate:</strong>
            <div style="font-weight: normal; margin-left: 20px;">
                Negative correlation (-0.34). Higher ST depression is linked to lower max heart rate. When we start to do more exercise, our heart rate will tend to increase and the ST depression will decrease.
            </div>
        </li>
        <li><strong>Age and Resting Blood Pressure:</strong>
            <div style="font-weight: normal; margin-left: 20px;">
                Weak positive correlation (0.3). Older patients tend to have slightly higher blood pressure, but not by much.
            </div>
        </li>
        <li><strong>Cholesterol:</strong>
            <div style="font-weight: normal; margin-left: 20px;">
                Very weak correlation with the other features (&lt;0.2), so it is of limited use as a predictive feature.
            </div>
        </li>
        <li><strong>Some other feature pairs show very low or no significant correlation, indicating limited to no linear relationship.</strong></li>
    </ul>
</div>


In [None]:
# Lower-triangle pair grid of the continuous features only.
scatter_style = {'alpha': 0.6, 's': 40, 'edgecolor': 'k'}
sns.pairplot(df_continuous, corner=True, plot_kws=scatter_style)

plt.suptitle(
    "Continuous Feature distributions and relatioships",
    fontsize=16,
    y=1.02,
)
plt.tight_layout()
plt.show()

In [None]:
# Use seaborn's whitegrid style from here on.
sns.set(style="whitegrid")

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (x column, colour, panel title, x-axis label) for each regression panel;
# both panels plot against max heart rate.
panels = [
    ('age', 'teal', "Age vs Max Heart Rate", "Age"),
    ('st_depression', 'coral', "ST Depression vs Max Heart Rate", "ST Depression"),
]

for panel, (x_col, tint, title, x_label) in zip(axes, panels):
    # Scatter plus fitted regression line.
    sns.regplot(data=df3, x=x_col, y='max_heart_rate', ax=panel, color=tint)
    panel.set_title(title, fontsize=14)
    panel.set_xlabel(x_label, fontsize=12)
    panel.set_ylabel("Max Heart Rate", fontsize=12)

plt.tight_layout()
plt.show()


##### - Max heart rate shows a downward trend with age: older people usually have lower maximum heart rates.
##### - There is a weak negative relationship between ST depression and max heart rate: when ST depression is higher, the maximum heart rate is often lower.

In [None]:
# Box plot of each continuous feature split by the target, on a 2x3 grid.
plt.figure(figsize=(15, 10))

for slot, feature in enumerate(df_continuous, start=1):
    plt.subplot(2, 3, slot)
    sns.boxplot(data=df3, x='heart_disease', y=feature, palette='husl')
    plt.title(f'{feature} vs Heart Disease')
    plt.xlabel('Heart Disease')
    plt.ylabel(feature)
    # Replace the 0/1 tick labels with readable names.
    plt.xticks([0, 1], ['No Disease', 'Disease'])

plt.tight_layout()
plt.show()

<div style="text-align: left; font-size: 16px; font-weight: bold; padding: 10px; color: #333; background-color: #fff9c4;">
    ✔️ These plots indicate differences in several health parameters between patients with and without heart disease.
    <ul>
        <li>Age, cholesterol, and blood pressure don’t differ much between groups — they are not strong indicators.</li>
        <li>ST depression is noticeably lower in heart disease patients — it could be a useful feature.</li>
        <li>Max heart rate is higher in heart disease patients — it is also potentially useful for diagnosis.</li>
    </ul>
</div>

In [None]:
# Stacked histogram of each continuous feature, coloured by the target.
for i in df_continuous:
    fig, ax = plt.subplots(1, 1, figsize=(7, 4))

    sns.histplot(data=df3, x=i, hue='heart_disease', bins=10, multiple='stack', ax=ax,palette={0:'#D12C7A',1:'blue'})

    ax.set_title(f"{i} distribution by heart disease status", fontsize=13)
    ax.set_xlabel(i)
    ax.set_ylabel("Count")

    # Relabel the legend seaborn already built instead of calling
    # ax.legend(labels=...): that call re-binds the labels to the bar
    # containers in drawing order, which histplot reverses relative to the hue
    # levels, so "Negative"/"Positive" would be attached to the wrong colours.
    # Seaborn's own legend lists the hue levels in order (0, then 1).
    legend = ax.get_legend()
    if legend is not None:
        legend.set_title("Heart Disease")
        for text, label in zip(legend.get_texts(), ["Negative", "Positive"]):
            text.set_text(label)

    plt.tight_layout()
    plt.show()

In [None]:
# Every continuous column except the first one ('age'), which is the x-axis.
cols = col_continuous[1:]
fig, ax = plt.subplots(1, len(cols), figsize=(5 * len(cols), 4))

# Flatten the axes array when plt.subplots returned one.
ax = ax.flatten() if hasattr(ax, 'flatten') else ax

# One panel per feature: regression of the feature on age, fitted separately
# for patients with (red) and without (blue) heart disease.
for panel_index in range(len(cols)):
    col = cols[panel_index]
    panel = ax[panel_index]
    sns.regplot(data=df3[df3['heart_disease'] == 1], x='age', y=col, ax=panel, color='red', label='Heart disease')
    sns.regplot(data=df3[df3['heart_disease'] == 0], x='age', y=col, ax=panel, color='blue', label='No disease')
    panel.legend()
    panel.set_title(f'Age vs {col}')

<div style="background-color: #ffffcc; padding: 10px;">🌟  Resting blood pressure, cholesterol, and ST depression are positively and linearly related to age, although the relationship with ST depression is weaker.</div>
<div style="background-color: #ffffcc; padding: 10px;">🌟  Lower st_depression, regardless of age, is also likely an indication of heart disease.</div>
<div style="background-color: #ffffcc; padding: 10px;">🌟  Max heart rate shows a downward trend with age: older people usually have lower maximum heart rates.</div>
<div style="background-color: #ffffcc; padding: 10px;">🌟  Younger patients with a higher max_heart_rate are more likely to have a heart condition.</div>

<div style="text-align: left; font-size: 16px; font-weight: 600; padding: 15px 25px; color: #3e4a5b; background-color: #f0f0f0; border-radius: 50px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); width: fit-content; margin: 20px auto;">
    <h4 style="margin: 0;"> Who is more likely to have heart disease: males or females❓</h4>
</div>

In [None]:
# Patients per sex label, and the labels themselves in the same order.
count_value = df3['sex'].value_counts()
sex = count_value.index

print(count_value)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: number of patients per gender.
ax1.bar(sex, count_value, label=sex, color=['pink', 'skyblue'], width=0.5)
ax1.set_title("Distribution of Patient Gender")
ax1.set_xlabel('Gender')
ax1.set_ylabel('Count')
ax1.grid(linestyle='--', axis='y', dashes=(1, 5))

# Right panel: gender x target contingency table as grouped bars.
gender_target = pd.crosstab(df3['sex'], df3['heart_disease'])
gender_target.plot(kind='bar', color=['blue', 'red'], width=0.6, ax=ax2)
ax2.set_title("Heart Disease Distribution by Gender")
ax2.set_xlabel("Gender")
ax2.set_ylabel("Count")
ax2.grid(linestyle='--', axis='y', dashes=(1, 5))
ax2.legend(title='Heart disease', labels=['negetive', 'positive'])

plt.tight_layout()
plt.show()

In [None]:
# Patient counts per sex, each as [without heart disease, with heart disease].
male_counts = [
    len(no_disease[no_disease['sex'] == 'male']),
    len(disease[disease['sex'] == 'male']),
]

female_counts = [
    len(no_disease[no_disease['sex'] == 'female']),
    len(disease[disease['sex'] == 'female']),
]

# Wedge colours, in the same [no disease, disease] order.
female_colors = ['lightgreen', 'orange']
male_colors = ['blue', 'yellow']

def make_autopct(values):
    """Build a pie-chart label formatter showing both share and raw count.

    Args:
        values: The numbers the pie chart is drawn from; their sum is taken
            each time the formatter is called.

    Returns:
        A function that takes a percentage and returns the text
        ``"<pct to one decimal>%"``, a newline, then ``"(<count>)"``, where
        the count is that percentage of the total, rounded to an integer.
    """
    def my_autopct(pct):
        # Recover the raw count behind this percentage.
        count = int(round(pct * sum(values) / 100.0))
        return f'{pct:.1f}%\n({count})'
    return my_autopct

# Two panels: nested donut of counts (left), percentage bars (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Outer ring: female patients, split into no disease / disease.
female_pie = ax1.pie(
    female_counts,
    labels=None,
    autopct=make_autopct(female_counts),
    startangle=90,
    radius=1.0,
    colors=female_colors,
    wedgeprops={'width': 0.3},
    pctdistance=0.85,
)

# Inner ring: male patients, split the same way.
male_pie = ax1.pie(
    male_counts,
    labels=None,
    autopct=make_autopct(male_counts),
    startangle=90,
    radius=0.7,
    colors=male_colors,
    wedgeprops={'width': 0.3},
    pctdistance=0.75,
)

ax1.set_title('Patient Distribution by Gender and Disease', fontsize=16, weight='bold')

# Legend entries follow the order in which the wedges were drawn.
custom_labels = ['Female - No Disease', 'Female - Heart Disease','Male - No Disease', 'Male - Heart Disease']
ax1.legend(custom_labels, loc='upper right', bbox_to_anchor=(1.4, 1))

# Share of each sex within each heart-disease group, in percent.
sex_counts = df3.groupby(['heart_disease', 'sex']).size().reset_index(name='count')
group_totals = sex_counts.groupby('heart_disease')['count'].transform('sum')
sex_counts['percentage'] = sex_counts['count'] / group_totals * 100

# Right panel: percentage bars grouped by disease status.
barplot2 = sns.barplot(data=sex_counts, x='heart_disease', y='percentage', hue='sex', palette='Set2', ax=ax2)
ax2.set_title("sex by Heart Disease Status", fontsize=14)
ax2.set_xlabel("Heart Disease", fontsize=12)
ax2.set_ylabel("Percentage", fontsize=12)
ax2.set_xticks([0, 1])
ax2.set_xticklabels(['No Disease', 'Disease'])
ax2.legend(title=' sex')

# Write the percentage on top of every bar.
for bars in barplot2.containers:
    barplot2.bar_label(bars, fmt='%.1f%%', fontsize=10)

plt.tight_layout()
plt.show()


<div style="text-align: left; font-size: 22px; font-weight: bold; padding: 10px; color: #333; background-color: #fff9c4;">
    <h6>✔️Plots indicate that heart disease is more common in male than female and male patients have higher chance of heart attack, even though female patients make up a larger portion of the dataset.</h6>
</div>

<div style="text-align: left; font-size: 16px; font-weight: 600; padding: 15px 25px; color: #3e4a5b; background-color: #f0f0f0; border-radius: 50px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); width: fit-content; margin: 20px auto;">
    <h4 style="margin: 0;"> Which age group has the highest risk of heart disease❓</h4>
</div>

In [None]:
# Age range of the cleaned data; used to pick the bin edges in the next cell.
youngest = df3['age'].min()
oldest = df3['age'].max()
print('Minimum of age:', youngest)
print('Maximum of age:', oldest)

In [None]:
# Bucket ages into ten-year groups. pd.cut's default intervals are
# right-closed, so the edges 20/30 give (20, 30], labelled '21-30', and so on.
bins = [20, 30, 40, 50, 60, 70, 80]
labels = ['21-30', '31-40', '41-50', '51-60', '61-70', '71-80']
age_groups = pd.cut(df3['age'], bins=bins, labels=labels)
df3['age_group'] = age_groups
df3.head()

In [None]:
fig, (ax0, ax1) = plt.subplots(ncols=2, figsize=(12, 5))

# Left panel: patients per age group, split by the target.
sns.countplot(
    data=df3,
    x='age_group',
    hue='heart_disease',
    palette=['lightgreen', 'red'],
    ax=ax0,
    width=0.5,
)
ax0.set_title("Age Distribution")
ax0.set_xlabel('Age')
ax0.set_ylabel('Count')
ax0.grid(linestyle='--', axis='y', dashes=(1, 5))
ax0.legend(title='Heart Disease', labels=['No Disease', 'Heart Disease'])

# Patients per exact age, ordered by age, for each target group.
disease_age_counts = disease['age'].value_counts().sort_index()
no_disease_age_counts = no_disease['age'].value_counts().sort_index()

age_disease, count_disease = disease_age_counts.index, disease_age_counts.values
age_no_disease, count_no_disease = no_disease_age_counts.index, no_disease_age_counts.values

# Right panel: one dot per (age, number of patients) for each group.
ax1.scatter(age_disease, count_disease, color='red', s=150, alpha=0.6, label='Heart Disease')
ax1.scatter(age_no_disease, count_no_disease, color='blue', s=150, alpha=0.6, label='No Disease')

ax1.set_title('Age Distribution by Heart Disease Status')
ax1.set_xlabel('Age')
ax1.set_ylabel('Number of Patients')
ax1.grid(linestyle='--', axis='y', dashes=(1, 5))
ax1.legend()

plt.tight_layout()
plt.show()

<div style="text-align: left; font-size: 20px; font-weight: bold; padding: 10px; color: #333; background-color: #fff9c4;">
    <h6>✔️The plots show that the 51-60 age group has the highest risk of heart disease, and that the risk starts to increase from age 40.</h6>
        <h6>✔️Patients aged 40-60 have a high chance of heart attack.</h6>
</div>

<div style="text-align: left; font-size: 16px; font-weight: 600; padding: 15px 25px; color: #3e4a5b; background-color: #f0f0f0; border-radius: 50px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); width: fit-content; margin: 20px auto;">
    <h4 style="margin: 0;"> Is there a relation between exercise-induced angina and heart disease❓</h4>
</div>

In [None]:
# Share of each exercise-induced-angina answer within each heart-disease group.
ecg_counts = df3.groupby(['heart_disease', 'exercise_induced_angina']).size().reset_index(name='count')
group_totals = ecg_counts.groupby('heart_disease')['count'].transform('sum')
ecg_counts['percentage'] = ecg_counts['count'] / group_totals * 100

fig, axes = plt.subplots(1, 2, figsize=(16, 6), dpi=120)
count_ax, pct_ax = axes[0], axes[1]

# Left panel: raw counts per angina answer, split by the target.
# Note: this rebinds the notebook-level name `colors`.
colors = ['blue', 'red']
barplot1 = sns.countplot(
    data=df3,
    x='exercise_induced_angina',
    hue='heart_disease',
    palette=colors,
    ax=count_ax,
    width=0.5,
)
count_ax.set_title("Exercise Induced Angina vs Heart Disease", fontsize=14)
count_ax.set_xlabel('Exercise Induced Angina', fontsize=12)
count_ax.set_ylabel('Count', fontsize=12)
count_ax.legend(title='Heart Disease', labels=['No Disease', 'Heart Disease'])
count_ax.grid(True, linestyle='--', axis='y', dashes=(1, 5))
count_ax.invert_xaxis()
count_ax.tick_params(axis='x', rotation=45)

# Write the count on top of every bar.
for bars in barplot1.containers:
    barplot1.bar_label(bars, fontsize=10)

# Right panel: the percentages computed above, grouped by disease status.
barplot2 = sns.barplot(
    data=ecg_counts,
    x='heart_disease',
    y='percentage',
    hue='exercise_induced_angina',
    palette='Set2',
    ax=pct_ax,
)
pct_ax.set_title("Exercise Induced Angina by Heart Disease Status", fontsize=14)
pct_ax.set_xlabel("Heart Disease", fontsize=12)
pct_ax.set_ylabel("Percentage", fontsize=12)
pct_ax.set_xticks([0, 1])
pct_ax.set_xticklabels(['No Disease', 'Disease'])
pct_ax.legend(title='Exercise Induced Angina')

# Write the percentage on top of every bar.
for bars in barplot2.containers:
    barplot2.bar_label(bars, fmt='%.1f%%', fontsize=10)

plt.tight_layout()
plt.show()


<div style="text-align: left; font-size: 20px; font-weight: bold; padding: 10px; color: #333; background-color: #fff9c4;">
    <h6>✔️These plots display that people with no exercise induced angina have higher chance of heart attack</h6>
</div>

<div style="text-align: left; font-size: 16px; font-weight: 600; padding: 15px 25px; color: #3e4a5b; background-color: #f0f0f0; border-radius: 50px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); width: fit-content; margin: 20px auto;">
    <h4 style="margin: 0;"> What is the distribution of thalium stress test results by disease status❓</h4>
</div>

In [None]:
# Share of each thalium result within each heart-disease group.
thal_counts = df3.groupby(['heart_disease', 'thalium_result']).size().reset_index(name='count')
group_totals = thal_counts.groupby('heart_disease')['count'].transform('sum')
thal_counts['percentage'] = thal_counts['count'] / group_totals * 100

fig, axes = plt.subplots(1, 2, figsize=(16, 6), dpi=120)
share_ax, pct_ax = axes[0], axes[1]

# Within each thalium result, the fraction of patients per target value
# (combinations that never occur are filled with 0).
proportions = (
    df3.groupby('thalium_result')['heart_disease']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)

# Left panel: stacked bars of those fractions.
proportions.plot(kind='bar', stacked=True, colormap='Set1', edgecolor='black', ax=share_ax)
share_ax.set_title('thalium stress test result vs Heart Disease', fontsize=14)
share_ax.set_xlabel('thalium stress test result', fontsize=12)
share_ax.set_ylabel('possibility', fontsize=12)
share_ax.legend(title='heart disease', labels=['Negative', 'positive'], loc='upper right', bbox_to_anchor=(1.2, 1))
share_ax.grid(True, linestyle='--', axis='y', dashes=(1, 5))
share_ax.tick_params(axis='x', rotation=45)

# Right panel: the percentages computed above, grouped by disease status.
barplot2 = sns.barplot(
    data=thal_counts,
    x='heart_disease',
    y='percentage',
    hue='thalium_result',
    palette='Set2',
    ax=pct_ax,
)
pct_ax.set_title("thalium stress test result by Heart Disease Status", fontsize=14)
pct_ax.set_xlabel("Heart Disease", fontsize=12)
pct_ax.set_ylabel("Percentage", fontsize=12)
pct_ax.set_xticks([0, 1])
pct_ax.set_xticklabels(['No Disease', 'Disease'])
pct_ax.legend(title='thalium stress test result')

# Write the percentage on top of every bar.
for bars in barplot2.containers:
    barplot2.bar_label(bars, fmt='%.1f%%', fontsize=10)

plt.tight_layout()
plt.show()

<div style="text-align: left; font-size: 20px; font-weight: bold; padding: 10px; color: #333; background-color: #fff9c4;">
    <h6>✔️These plots show that patients with a 'Reversible Defect' result on the thallium stress test have a higher chance of having a heart attack</h6>
</div>

<div style="text-align: left; font-size: 16px; font-weight: 600; padding: 15px 25px; color: #3e4a5b; background-color: #f0f0f0; border-radius: 50px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); width: fit-content; margin: 20px auto;">
    <h4 style="margin: 0;"> Does Chest Pain Type Affect the Likelihood of Heart Disease❓</h4>
</div>

In [None]:
# Prepare data: count patients for every (heart_disease, chest_pain_type) pair
# and convert each count to a percentage of its heart-disease class total.
cp_counts = df3.groupby(['heart_disease', 'chest_pain_type']).size().reset_index(name='count')
cp_counts['percentage'] = cp_counts['count'] / cp_counts.groupby('heart_disease')['count'].transform('sum') * 100

# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(16, 6), dpi=120)

# Calculate proportions of each heart-disease class within every chest pain type.
# fillna(0) keeps the stacked bars well-defined if a chest pain type has no
# patients in one of the classes (unstack would otherwise leave NaN there).
proportions = (
    df3.groupby('chest_pain_type')['heart_disease']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
)

# Plot 1: stacked proportions per chest pain type
proportions.plot(kind='bar', stacked=True, colormap='Set1', edgecolor='black', ax=axes[0])
axes[0].set_xlabel('Chest Pain Type', fontsize=12)
axes[0].set_ylabel('possibility', fontsize=12)
axes[0].set_title('Chest Pain type vs Heart Disease', fontsize=14)
axes[0].legend(title='heart disease', labels=['Negative', 'positive'], loc='upper right', bbox_to_anchor=(1.2, 1))
axes[0].grid(True, linestyle='--', axis='y', dashes=(1, 5))
axes[0].tick_params(axis='x', rotation=45)

# Plot 2: Percentage barplot
barplot2 = sns.barplot(data=cp_counts, x='heart_disease', y='percentage', hue='chest_pain_type', palette='Set2', ax=axes[1])
axes[1].set_title("Chest Pain Types by Heart Disease Status", fontsize=14)
axes[1].set_xlabel("Heart Disease", fontsize=12)
axes[1].set_ylabel("Percentage", fontsize=12)
axes[1].set_xticks([0, 1])
axes[1].set_xticklabels(['No Disease', 'Disease'])
axes[1].legend(title='Chest Pain Type')

# Add percentage labels
for container in barplot2.containers:
    barplot2.bar_label(container, fmt='%.1f%%', fontsize=10)

# Lay out once, after both panels are fully drawn
plt.tight_layout()
plt.show()

<div style="text-align: left; font-size: 20px; font-weight: bold; padding: 10px; color: #333; background-color: #fff9c4;">
    <h6>✔️These plots show that people with non-anginal chest pain have a higher chance of heart attack than those with other chest pain types. For patients with typical angina, the likelihood of heart disease is much lower than for those with atypical angina, non-anginal pain, or an asymptomatic condition.</h6>
</div>

<div style="text-align: left; font-size: 16px; font-weight: 600; padding: 15px 25px; color: #3e4a5b; background-color: #f0f0f0; border-radius: 50px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); width: fit-content; margin: 20px auto;">
    <h4 style="margin: 0;"> What kind of ECG results (restecg) are more common in patients with heart disease❓</h4>
</div>

In [None]:
# Prepare data: count patients for every (heart_disease, rest_ecg_result) pair
# and convert each count to a percentage of its heart-disease class total.
restecg_counts = df3.groupby(['heart_disease', 'rest_ecg_result']).size().reset_index(name='count')
restecg_counts['percentage'] = restecg_counts['count'] / restecg_counts.groupby('heart_disease')['count'].transform('sum') * 100

# Create subplots
fig, axes = plt.subplots(1, 2, figsize=(16, 6), dpi=120)

# Calculate proportions of each heart-disease class within every ECG result;
# combinations that never occur are filled with 0.
proportions = df3.groupby('rest_ecg_result')['heart_disease'].value_counts(normalize=True).unstack().fillna(0)

# Plot 1: stacked proportions per resting ECG result
proportions.plot(kind='bar', stacked=True, colormap='Set1', edgecolor='black', ax=axes[0])
axes[0].set_xlabel('RestEcg result', fontsize=12)
axes[0].set_ylabel('possibility', fontsize=12)
axes[0].set_title('RestEcg result vs Heart Disease', fontsize=14)
axes[0].legend(title='heart disease', labels=['Negative', 'positive'], loc='upper right', bbox_to_anchor=(1.2, 1))
axes[0].grid(True, linestyle='--', axis='y', dashes=(1, 5))
axes[0].tick_params(axis='x', rotation=45)

# Plot 2: Percentage barplot
barplot2 = sns.barplot(data=restecg_counts, x='heart_disease', y='percentage', hue='rest_ecg_result', palette='Set2', ax=axes[1])
axes[1].set_title("RestEcg result by Heart Disease Status", fontsize=14)
axes[1].set_xlabel("Heart Disease", fontsize=12)
axes[1].set_ylabel("Percentage", fontsize=12)
axes[1].set_xticks([0, 1])
axes[1].set_xticklabels(['No Disease', 'Disease'])
axes[1].legend(title='RestEcg result')

# Add percentage labels
for container in barplot2.containers:
    barplot2.bar_label(container, fmt='%.1f%%', fontsize=10)

# Lay out once, after both panels are fully drawn
plt.tight_layout()
plt.show()

<div style="text-align: left; font-size: 20px; font-weight: bold; padding: 10px; color: #333; background-color: #fff9c4;">
    <h6>✔️These plots show that people with an ST-T wave abnormality have a higher chance of heart attack than those with other resting ECG results. The prevalence among people with an ST-T wave abnormality appears to be much higher than among those whose result is normal or shows probable or definite left ventricular hypertrophy.</h6>
</div>

<div style="text-align: left; font-size: 16px; font-weight: 600; padding: 15px 25px; color: #3e4a5b; background-color: #f0f0f0; border-radius: 50px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); width: fit-content; margin: 20px auto;">
    <h4 style="margin: 0;"> Does a higher resting blood pressure increase the chance of heart disease❓</h4>
</div>

In [None]:
# Compare resting blood pressure between patients with and without heart
# disease: overlaid density curves (left) and violin plots (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))  # Adjust width as needed

# --- KDE Plot ---
# `fill=True` is the supported replacement for the deprecated `shade=True`.
# The deprecation warning is not visible in this notebook because warnings are
# silenced in the import cell.
sns.kdeplot(data=df3[df3['heart_disease'] == 0]['resting_bp'], label='No Disease', fill=True, ax=axes[0])
sns.kdeplot(data=df3[df3['heart_disease'] == 1]['resting_bp'], label='Heart Disease', fill=True, ax=axes[0])
axes[0].set_title("Resting Blood Pressure by Heart Disease status")
axes[0].set_xlabel("Resting Blood Pressure")
axes[0].legend()

# --- Violin Plot ---
sns.violinplot(data=df3, x='heart_disease', y='resting_bp', ax=axes[1])
axes[1].set_xticks([0, 1])
axes[1].set_xticklabels(['No Disease', 'Heart Disease'])
axes[1].set_title("Distribution of Resting Blood Pressure by Heart Disease Status")
axes[1].set_xlabel("Heart Disease")
axes[1].set_ylabel("Resting Blood Pressure")

plt.tight_layout()
plt.show()


<div style="text-align: left; font-size: 20px; font-weight: bold; padding: 10px; color: #333; background-color: #fff9c4;">
    <h6>✔️The plots show that people with heart disease tend to have a slightly lower resting blood pressure compared to those without heart disease.</h6>
    <h6>✔️As we saw in the boxplot, blood pressure doesn’t differ much between people with and without heart disease — the plots here support that it is not a strong indicator of heart disease.</h6>
</div>

<div style="text-align: left; font-size: 16px; font-weight: 600; padding: 15px 25px; color: #3e4a5b; background-color: #f0f0f0; border-radius: 50px; box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1); width: fit-content; margin: 20px auto;">
    <h4 style="margin: 0;"> What is the impact of cholesterol level on heart disease risk❓</h4>
</div>

In [None]:
# Four views of cholesterol against heart-disease status:
# boxplot, age scatter, density curves and violin plot.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))  # (rows, columns)

# First plot (boxplot)
sns.boxplot(data=df3, x='heart_disease', y='cholesterol', palette='husl', ax=axes[0, 0])
axes[0, 0].set_title('Cholesterol vs Heart Disease')
axes[0, 0].set_xlabel('Heart Disease')
axes[0, 0].set_ylabel('Cholesterol')
axes[0, 0].set_xticks([0, 1])
axes[0, 0].set_xticklabels(['No Disease', 'Disease'])

# Second plot (scatter)
sns.scatterplot(data=df3, x='age', y='cholesterol', hue='heart_disease', palette='husl', ax=axes[0, 1])
axes[0, 1].set_title('Cholesterol vs Age')
axes[0, 1].set_xlabel('Age')
axes[0, 1].set_ylabel('Cholesterol')

# Third plot (KDE)
# `fill=True` is the supported replacement for the deprecated `shade=True`.
# The deprecation warning is not visible in this notebook because warnings are
# silenced in the import cell.
sns.kdeplot(data=df3[df3['heart_disease'] == 0]['cholesterol'], label='No Disease', fill=True, ax=axes[1, 0])
sns.kdeplot(data=df3[df3['heart_disease'] == 1]['cholesterol'], label='Heart Disease', fill=True, ax=axes[1, 0])
axes[1, 0].set_title("Cholesterol Distribution by Heart Disease")
axes[1, 0].set_xlabel("Cholesterol")
axes[1, 0].legend()

# Fourth plot (Violin)
sns.violinplot(data=df3, x='heart_disease', y='cholesterol', palette='Set2', ax=axes[1, 1])
axes[1, 1].set_xticks([0, 1])
axes[1, 1].set_xticklabels(['No Disease', 'Heart Disease'])
axes[1, 1].set_title("Violin Plot of Cholesterol")
axes[1, 1].set_xlabel("Heart Disease")
axes[1, 1].set_ylabel("Cholesterol")

plt.tight_layout()
plt.show()

<div style="text-align: left; font-size: 20px; font-weight: bold; padding: 10px; color: #333; background-color: #fff9c4;">
    <h6>✔️High cholesterol leads to a high chance of heart attack.</h6>
    <h6>✔️However, as we can see in these plots, cholesterol levels don't differ much between people with and without heart disease.</h6>
</div>

In [None]:
# Count of patients per age group, split by chest pain type (hue), with one
# panel per heart-disease class (col).
# catplot is a figure-level function that creates its own figure, so it takes
# no target axes; the previous `ax=ax` argument referred to a name that is not
# defined in this cell and has been removed.
sns.catplot(
    data=df3,
    x='age_group',
    hue='chest_pain_type',
    col='heart_disease',
    kind='count',
    palette=['lightgreen', 'red', 'yellow', 'brown'],
    height=5,
    aspect=1,
)
# Leave headroom above the panels for the figure-level title
plt.subplots_adjust(top=0.8)
# Title now names the variables actually plotted (gender is not shown here)
plt.suptitle('Chest Pain Type by Age Group and Heart Disease Status')
plt.show()

#### 💡People aged between 40 and 60 with non-anginal chest pain are more likely to have heart disease.