# Heart Disease Data Cleaning and Exploration

## 1. Load the Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the heart CSV into the dataframe that all later cells operate on.
heart_csv_path = 'Assignment/heart.csv'
df_heart = pd.read_csv(heart_csv_path)

## 2. Initial Data Exploration

In [None]:
# 1. Preview the top of the dataframe (head() returns five rows by default)
print("First 5 rows:", df_heart.head(), sep="\n")

In [None]:
# 2. Preview the bottom of the dataframe (tail() returns five rows by default)
print("Last 5 rows:", df_heart.tail(), sep="\n")

In [None]:
# 3. Get a summary of the dataframe (column names, non-null counts, dtypes)
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own; wrapping it in print() would emit a stray "None".
print("Dataframe Info:")
df_heart.info()

In [None]:
# 4. Summary statistics as computed by DataFrame.describe()
summary_stats = df_heart.describe()
print("Descriptive Statistics:")
print(summary_stats)

In [None]:
# 5. Report the dataframe's (rows, columns) shape
print("Dataframe Shape:", df_heart.shape, sep="\n")

## 3. Handling Missing Data

In [None]:
# 6. Count the missing entries in every column
missing_per_column = df_heart.isna().sum()
print("Missing Values per Column:")
print(missing_per_column)

## 4. Data Cleaning and Transformation

In [None]:
# 7. Replace the column labels, by position, with descriptive names.
# The list must hold exactly one name per existing column, in column order.
readable_column_names = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'cholesterol',
    'fasting_blood_sugar',
    'rest_ecg',
    'max_heart_rate_achieved',
    'exercise_induced_angina',
    'st_depression',
    'st_slope',
    'num_major_vessels',
    'thalassemia',
    'target',
]
df_heart.columns = readable_column_names

In [None]:
# 8. Cast the columns treated as categorical to pandas' 'category' dtype
categorical_cols = [
    'sex',
    'chest_pain_type',
    'fasting_blood_sugar',
    'rest_ecg',
    'exercise_induced_angina',
    'st_slope',
    'num_major_vessels',
    'thalassemia',
    'target',
]
# One astype call with a per-column dtype mapping converts all of them at once.
df_heart = df_heart.astype({name: 'category' for name in categorical_cols})

## 5. Handling Duplicates

In [None]:
# 9. Count the rows that are exact repeats of an earlier row
duplicate_count = df_heart.duplicated().sum()
print(f'Number of duplicate rows: {duplicate_count}')

In [None]:
# 10. Drop repeated rows in place, keeping the first occurrence of each
df_heart.drop_duplicates(keep='first', inplace=True)

## 6. Handling Outliers

In [None]:
# 11. Draw one boxplot per numeric column to make outliers visible
numerical_cols = list(df_heart.select_dtypes(include=np.number).columns)
for column_name in numerical_cols:
    # A fresh figure per column keeps the plots from overlapping.
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(y=df_heart[column_name], ax=ax)
    ax.set_title(f'{column_name} Boxplot')
    plt.show()

In [None]:
# 12. Drop 'resting_blood_pressure' rows lying outside the 1.5 * IQR fences
Q1, Q3 = df_heart['resting_blood_pressure'].quantile([0.25, 0.75])
IQR = Q3 - Q1
bp_lower_fence = Q1 - 1.5 * IQR
bp_upper_fence = Q3 + 1.5 * IQR
bp_values = df_heart['resting_blood_pressure']
# Keep rows that are neither below the lower fence nor above the upper one.
# Written as two negated comparisons so rows with a missing value are kept.
df_heart = df_heart[~(bp_values < bp_lower_fence) & ~(bp_values > bp_upper_fence)]

In [None]:
# 13. Drop 'cholesterol' rows lying outside the 1.5 * IQR fences
Q1, Q3 = df_heart['cholesterol'].quantile([0.25, 0.75])
IQR = Q3 - Q1
chol_lower_fence = Q1 - 1.5 * IQR
chol_upper_fence = Q3 + 1.5 * IQR
chol_values = df_heart['cholesterol']
# Keep rows that are neither below the lower fence nor above the upper one.
# Written as two negated comparisons so rows with a missing value are kept.
df_heart = df_heart[~(chol_values < chol_lower_fence) & ~(chol_values > chol_upper_fence)]

In [None]:
# 14. Remove outliers from 'max_heart_rate_achieved' using IQR
Q1 = df_heart['max_heart_rate_achieved'].quantile(0.25)
Q3 = df_heart['max_heart_rate_achieved'].quantile(0.75)
IQR = Q3 - Q1
hr_lower_fence = Q1 - 1.5 * IQR
hr_upper_fence = Q3 + 1.5 * IQR
heart_rate = df_heart['max_heart_rate_achieved']
is_outlier = (heart_rate < hr_lower_fence) | (heart_rate > hr_upper_fence)
# .copy() makes the filtered result an independent dataframe. New columns are
# assigned to df_heart right after this step; without the copy, pandas flags
# those assignments with SettingWithCopyWarning because the frame is still
# tracked as a selection from the pre-filter frame.
df_heart = df_heart[~is_outlier].copy()

## 7. Feature Engineering

In [None]:
# 15. Bucket 'age' into four labelled bands.
# pd.cut closes each bin on the right by default: (0, 40], (40, 50], (50, 60], (60, 100].
age_bin_edges = [0, 40, 50, 60, 100]
age_bin_labels = ['Young', 'Middle-aged', 'Senior', 'Elderly']
df_heart['age_group'] = pd.cut(
    df_heart['age'],
    bins=age_bin_edges,
    labels=age_bin_labels,
)

In [None]:
# 16. Create cholesterol levels
# Bins are left-closed (right=False) so a boundary reading falls in the higher
# band: below 200 -> Normal, 200-239 -> Borderline High, 240 and above -> High.
# (Thresholds assume the column is in mg/dL -- TODO confirm against the data source.)
# The open-ended top bin keeps very high readings from being labelled NaN.
df_heart['cholesterol_level'] = pd.cut(
    df_heart['cholesterol'],
    bins=[0, 200, 240, np.inf],
    labels=['Normal', 'Borderline High', 'High'],
    right=False,
)

In [None]:
# 17. Create blood pressure categories
# Bins are left-closed (right=False) so a boundary reading falls in the higher
# band: below 120 -> Normal, 120-129 -> Elevated, 130-139 -> High BP Stage 1,
# 140 and above -> High BP Stage 2.
# (Thresholds assume the column is in mm Hg -- TODO confirm against the data source.)
# The open-ended top bin keeps very high readings from being labelled NaN.
df_heart['bp_category'] = pd.cut(
    df_heart['resting_blood_pressure'],
    bins=[0, 120, 130, 140, np.inf],
    labels=['Normal', 'Elevated', 'High BP Stage 1', 'High BP Stage 2'],
    right=False,
)

## 8. Final Exploration

In [None]:
# 18. Preview the first five rows after cleaning and feature engineering
print("Cleaned Data Head:", df_heart.head(), sep="\n")

In [None]:
# 19. Show the cleaned data's info (column names, non-null counts, dtypes)
# DataFrame.info() writes its report straight to stdout and returns None, so
# it is called on its own; wrapping it in print() would emit a stray "None".
print("Cleaned Data Info:")
df_heart.info()

In [None]:
# 20. Summary statistics of the cleaned dataframe
cleaned_summary = df_heart.describe()
print("Cleaned Data Description:")
print(cleaned_summary)

In [None]:
# 21. Correlation heatmap
# Only numeric columns go into the correlation matrix. By this point the frame
# also holds 'category' columns, some with string labels such as 'Young', and
# DataFrame.corr() raises on those in recent pandas versions instead of
# silently skipping them.
numeric_features = df_heart.select_dtypes(include=np.number)
plt.figure(figsize=(15, 12))
sns.heatmap(numeric_features.corr(), annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# 22. Bar chart of how many rows fall under each 'target' value
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df_heart, x='target', ax=ax)
ax.set_title('Target Distribution')
plt.show()

In [None]:
# 23. Stacked histogram of 'age', coloured by 'target', with KDE curves overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df_heart,
    x='age',
    hue='target',
    multiple='stack',
    kde=True,
    ax=ax,
)
ax.set_title('Age Distribution by Target')
plt.show()

In [None]:
# 24. Row counts per 'chest_pain_type', split by 'target'
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df_heart, x='chest_pain_type', hue='target', ax=ax)
ax.set_title('Chest Pain Type vs. Target')
plt.show()

In [None]:
# 25. Row counts per 'sex', split by 'target'
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df_heart, x='sex', hue='target', ax=ax)
ax.set_title('Sex vs. Target')
plt.show()