In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Notebook-wide display configuration: never truncate columns, and use a wide
# console so printed frames are not wrapped.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

sns.set_style("whitegrid")

# Single place to change where the CSV is read from.
# NOTE(review): this is an absolute, machine-specific path; point it at your own
# copy of the dataset (ideally a path relative to the project) before running.
DATA_PATH = r'C:\Users\chand\OneDrive\Desktop\pjt_datasets\heart_failure_clinical_records_dataset.csv'

try:
    df = pd.read_csv(DATA_PATH)
    print("Dataset loaded successfully!")
except FileNotFoundError:
    # Report the path that was actually tried, then re-raise: without the raise,
    # execution would fall through to `df.head()` below and either fail with an
    # unrelated NameError or silently reuse a stale `df` left in the kernel.
    print(f"Error: '{DATA_PATH}' not found. Please check the file path.")
    raise

# Quick structural overview of the freshly loaded frame.
print("First 5 rows of the dataset:")
print(df.head())

print("\nDataset Info:")
df.info()  # info() prints directly, so it is not wrapped in print()

print("\nDataset Shape:")
print(df.shape)

print("\nDescriptive Statistics:")
print(df.describe())

print("\nMissing Values:")
print(df.isnull().sum())

In [3]:
# Report how many rows are exact duplicates and, if any exist, drop them from
# `df` in place and show the resulting shape.
print("\n--- Duplicate Rows Check ---")
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows found: {duplicate_count}")
if duplicate_count:
    df.drop_duplicates(inplace=True)
    print(f"Shape after dropping duplicates: {df.shape}")


--- Duplicate Rows Check ---
Number of duplicate rows found: 0


In [None]:
# Target balance: absolute counts and proportions of DEATH_EVENT, followed by
# a count plot of the same column.
target = df['DEATH_EVENT']

print("\n--- Distribution of DEATH_EVENT ---")
print(target.value_counts())
print("\nProportion of DEATH_EVENT:")
print(target.value_counts(normalize=True))

fig, ax = plt.subplots(figsize=(7, 5))
sns.countplot(
    data=df,
    x='DEATH_EVENT',
    hue='DEATH_EVENT',
    palette='viridis',
    legend=False,
    ax=ax,
)
ax.set_title('Distribution of DEATH_EVENT (0=Survived, 1=Died)')
ax.set_xlabel('DEATH_EVENT')
ax.set_ylabel('Count')
# Replace the 0/1 tick text with readable outcome names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['Survived', 'Died'])
plt.show()

In [None]:
# For each binary column: print counts and proportions, then draw a count plot
# whose 0/1 ticks are relabelled with the names given below.
binary_cols = ['sex', 'anaemia', 'diabetes', 'high_blood_pressure', 'smoking']

# Tick text for positions 0 and 1, keyed by column name.
binary_tick_labels = {
    'sex': ['Female', 'Male'],
    'anaemia': ['No', 'Yes'],
    'diabetes': ['No', 'Yes'],
    'high_blood_pressure': ['No', 'Yes'],
    'smoking': ['No', 'Yes'],
}

for col in binary_cols:
    pretty_name = col.replace('_', ' ').title()

    print(f"\n--- Distribution of {pretty_name} ---")
    print(df[col].value_counts())
    print(df[col].value_counts(normalize=True))

    fig, ax = plt.subplots(figsize=(7, 5))
    sns.countplot(data=df, x=col, hue=col, palette='cividis', legend=False, ax=ax)
    ax.set_title(f'Distribution of {pretty_name}')
    ax.set_xlabel(pretty_name)
    ax.set_ylabel('Count')
    if col in binary_tick_labels:
        ax.set_xticks([0, 1])
        ax.set_xticklabels(binary_tick_labels[col])
    plt.show()

In [None]:
# For each numerical column: print summary statistics, then show a histogram
# with a KDE overlay followed by a compact horizontal box plot.
numerical_cols = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'time',
]

for col in numerical_cols:
    pretty_name = col.replace('_', ' ').title()
    values = df[col]

    print(f"\n--- Distribution of {pretty_name} ---")
    print(values.describe())

    # Histogram + KDE
    hist_fig, hist_ax = plt.subplots(figsize=(10, 6))
    sns.histplot(values, kde=True, bins=30, ax=hist_ax)
    hist_ax.set_title(f'Distribution of {pretty_name}')
    hist_ax.set_xlabel(pretty_name)
    hist_ax.set_ylabel('Count')
    plt.show()

    # Box plot in a short, wide figure
    box_fig, box_ax = plt.subplots(figsize=(10, 2))
    sns.boxplot(x=values, ax=box_ax)
    box_ax.set_title(f'Box Plot of {pretty_name}')
    box_ax.set_xlabel(pretty_name)
    plt.show()

In [None]:
# Pairwise correlations across every column of `df`, drawn as an annotated heatmap.
print("\n--- Correlation Matrix of All Features ---")
correlation_matrix = df.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of All Features')
plt.show()

# Header and column list for the per-outcome box plots; the list is consumed by
# the following cell.
print("\n--- Numerical Features vs. DEATH_EVENT (Box Plots) ---")
numerical_features_for_comparison = [
    'age',
    'creatinine_phosphokinase',
    'ejection_fraction',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
]


In [None]:

# Box plots of each numerical feature split by DEATH_EVENT, laid out on a 3x2 grid.
# Reads `numerical_features_for_comparison`, which is defined in the preceding cell.
plt.figure(figsize=(15, 12))
for i, col in enumerate(numerical_features_for_comparison):
    plt.subplot(3, 2, i + 1)
    # `palette` is paired with `hue` (and the redundant legend suppressed), the
    # same way the count plots in this notebook do it; passing `palette` without
    # `hue` is deprecated in seaborn 0.13+ and emits a FutureWarning.
    sns.boxplot(
        x='DEATH_EVENT', y=col, data=df,
        hue='DEATH_EVENT', palette='viridis', legend=False,
    )
    plt.title(f'{col.replace("_", " ").title()} by Death Event')
    plt.xlabel('Death Event')
    plt.ylabel(f'{col.replace("_", " ").title()}')
    plt.xticks(ticks=[0, 1], labels=['Survived', 'Died'])
plt.tight_layout()
plt.show()

# `time` gets its own, larger figure rather than a slot on the grid above.
plt.figure(figsize=(8, 6))
sns.boxplot(
    x='DEATH_EVENT', y='time', data=df,
    hue='DEATH_EVENT', palette='viridis', legend=False,
)
plt.title('Time (Follow-up) by Death Event')
plt.xlabel('Death Event')
plt.ylabel('Time (Days)')
plt.xticks(ticks=[0, 1], labels=['Survived', 'Died'])
plt.show()

In [None]:
# 100%-stacked bars: for each binary feature, the share of each DEATH_EVENT
# value within each feature level (rows of the crosstab are normalised to 100).
print("\n--- Categorical/Binary Features vs. DEATH_EVENT (Stacked Bar Charts) ---")
binary_features_for_comparison = ['sex', 'anaemia', 'diabetes', 'high_blood_pressure', 'smoking']

# Tick text for bar positions 0 and 1, keyed by column name.
level_names = {
    'sex': ['Female', 'Male'],
    'anaemia': ['No', 'Yes'],
    'diabetes': ['No', 'Yes'],
    'high_blood_pressure': ['No', 'Yes'],
    'smoking': ['No', 'Yes'],
}

plt.figure(figsize=(15, 12))
for i, col in enumerate(binary_features_for_comparison):
    pretty_name = col.replace('_', ' ').title()
    crosstab_norm = pd.crosstab(df[col], df['DEATH_EVENT'], normalize='index') * 100

    ax = plt.subplot(3, 2, i + 1)
    crosstab_norm.plot(kind='bar', stacked=True, ax=ax, cmap='coolwarm')
    ax.set_title(f'Death Event Proportion by {pretty_name}')
    ax.set_xlabel(pretty_name)
    ax.set_ylabel('Proportion (%)')
    if col in level_names:
        ax.set_xticks([0, 1])
        ax.set_xticklabels(level_names[col], rotation=0)
    ax.legend(title='Death Event', labels=['Survived', 'Died'])
plt.tight_layout()
plt.show()


In [None]:
# Faceted scatter plots coloured by DEATH_EVENT. Each entry pairs the relplot
# arguments that vary between views with the figure-level title to put on top.
print("\n--- Multivariate Plots ---")

multivariate_views = [
    (
        {
            'x': 'ejection_fraction',
            'y': 'serum_creatinine',
            'col': 'high_blood_pressure',
            'palette': 'coolwarm',
        },
        'Ejection Fraction vs. Serum Creatinine by Death Event & High Blood Pressure',
    ),
    (
        {
            'x': 'time',
            'y': 'age',
            'col': 'smoking',
            'palette': 'viridis',
        },
        'Age vs. Time by Death Event & Smoking Status',
    ),
]

for view_kwargs, heading in multivariate_views:
    sns.relplot(data=df, kind='scatter', hue='DEATH_EVENT', **view_kwargs)
    # The title goes on the current figure (the one relplot just created),
    # nudged above the facets with y=1.02.
    plt.suptitle(heading, y=1.02)
    plt.show()

print("\nAnalysis Done. Proceed to documenting observations and insights in Markdown cells.")