# Heart Stroke Risk Prediction
## Notebook 1: Exploratory Data Analysis (EDA)
**Author:** Dev Kapania | UPES B.Tech CSE (Big Data)
**Date:** January 2024

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

plt.style.use('seaborn')
%matplotlib inline

print('Libraries loaded successfully!')

## 1. Load Dataset

In [None]:
# Read the raw CSV into the DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer='../data/raw/heart.csv')
print(f'Dataset shape: {df.shape}')
# Preview the first five rows as the cell's rich output.
df.head(n=5)

In [None]:
print('Dataset Info:')
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print('\nBasic Statistics:')
# Summary statistics of the numeric columns, shown as the cell's rich output.
df.describe()

## 2. Missing Values & Duplicates

In [None]:
# Number of null entries in each column.
missing_per_column = df.isna().sum()
print('Missing values per column:')
print(missing_per_column)

# Number of rows flagged by DataFrame.duplicated() as repeats of an earlier row.
duplicate_rows = df.duplicated().sum()
print(f'\nDuplicate rows: {duplicate_rows}')

## 3. Target Distribution

In [None]:
# Count each class once and order the result by class label (0, then 1).
# value_counts() on its own orders by frequency, so the positional labels and
# colours below would be attached to the wrong class whenever class 1 is the
# majority.
class_counts = df['target'].value_counts().sort_index()

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Count plot
class_counts.plot(kind='bar', ax=axes[0], color=['#2ecc71', '#e74c3c'])
axes[0].set_title('Target Class Distribution')
axes[0].set_xlabel('Heart Disease (0=No, 1=Yes)')
axes[0].set_ylabel('Count')
axes[0].tick_params(rotation=0)

# Pie chart (labels/colours are positional: first entry -> class 0, second -> class 1)
class_counts.plot(kind='pie', ax=axes[1],
    labels=['No Disease', 'Heart Disease'],
    colors=['#2ecc71', '#e74c3c'],
    autopct='%1.1f%%')
axes[1].set_title('Target Class Proportion')
axes[1].set_ylabel('')

plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
os.makedirs('../data/processed', exist_ok=True)
plt.savefig('../data/processed/target_distribution.png', dpi=150, bbox_inches='tight')
plt.show()
print(f'Class distribution:\n{class_counts}')

## 4. Feature Distributions

In [None]:
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()

for ax, col in zip(axes, numerical_cols):
    # Compute one set of bin edges from the whole column and reuse it for both
    # classes. Passing bins=20 to each hist() call separately would give each
    # class its own edges, so the overlaid bars would not be comparable.
    bin_edges = np.histogram_bin_edges(df[col].dropna(), bins=20)
    ax.hist(df.loc[df['target'] == 0, col], bins=bin_edges, alpha=0.6,
            label='No Disease', color='#2ecc71')
    ax.hist(df.loc[df['target'] == 1, col], bins=bin_edges, alpha=0.6,
            label='Heart Disease', color='#e74c3c')
    ax.set_title(f'{col} Distribution')
    ax.set_xlabel(col)
    ax.set_ylabel('Count')
    ax.legend()

# Hide every panel that did not receive a feature (one, with five columns on a
# 2x3 grid) instead of hard-coding the last index.
for unused_ax in axes[len(numerical_cols):]:
    unused_ax.axis('off')

plt.suptitle('Feature Distributions by Target Class', fontsize=14, fontweight='bold')
plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
os.makedirs('../data/processed', exist_ok=True)
plt.savefig('../data/processed/feature_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

## 5. Correlation Heatmap

In [None]:
# Pairwise correlations between the numeric columns only.
corr_matrix = df.corr(numeric_only=True)
# True marks the cells to hide: the upper triangle and the diagonal, which
# only repeat the lower triangle / show each column against itself.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f',
            cmap='RdYlGn', center=0, vmin=-1, vmax=1,
            square=True, linewidths=0.5, ax=ax)
ax.set_title('Feature Correlation Heatmap', fontsize=14, fontweight='bold')
plt.tight_layout()
# savefig does not create missing directories; make sure the target exists.
os.makedirs('../data/processed', exist_ok=True)
plt.savefig('../data/processed/correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

## 6. Key Insights Summary

In [None]:
print('='*50)
print('KEY EDA INSIGHTS')
print('='*50)
print(f'Total samples: {len(df)}')
# Every column except the 'target' label counts as a feature.
print(f'Features: {df.shape[1]-1}')
# sort_index() fixes the order by class label; to_dict() is used instead of
# dict(...) so the printed dict is not cluttered with numpy scalar reprs
# (e.g. "np.int64(…)").
print(f'Class balance: {df["target"].value_counts().sort_index().to_dict()}')
print(f'Missing values: {df.isnull().sum().sum()}')
print('\nTop correlated features with target:')
# Rank features by the absolute value of their correlation with the target,
# so strong negative relationships rank alongside strong positive ones.
target_corr = df.corr(numeric_only=True)["target"].drop("target")
print(target_corr.abs().sort_values(ascending=False).head(5))