In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Silence every warning category for the rest of the session so that library
# notices do not clutter cell output (note: this also hides deprecation warnings).
warnings.filterwarnings('ignore')

# Plot defaults shared by all later figures: seaborn grid theme, then the
# default figure size and base font size.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Notebook banner
print("=" * 70, "HEART DISEASE DATASET - EXPLORATORY DATA ANALYSIS", "=" * 70, sep="\n")
print("\n Libraries imported successfully!")

In [None]:
# Load the dataset (path is relative to the notebook's working directory)
df = pd.read_csv('../data/heart.csv')

print("\n" + "="*70)
print("DATASET LOADED")
print("="*70)

# Every column except the single label column is counted as a feature
n_instances = df.shape[0]
n_features = df.shape[1] - 1

print(f"\n Dataset Shape: {df.shape}")
print(f"   - Total Rows: {n_instances:,}")
print(f"   - Total Columns: {df.shape[1]}")
print(f"   - Features: {n_features}")
print("   - Target: 1 (binary classification)")

# Minimum dataset size requirements. The checks are evaluated against the
# loaded data instead of printing a success mark unconditionally.
MIN_FEATURES = 12
MIN_INSTANCES = 500
features_ok = n_features >= MIN_FEATURES
instances_ok = n_instances >= MIN_INSTANCES

if features_ok and instances_ok:
    print("\n Dataset meets requirements:")
else:
    print("\n Dataset does NOT meet requirements:")
print(f"   {'✓' if features_ok else '✗'} Features ≥ {MIN_FEATURES}: {n_features} features")
print(f"   {'✓' if instances_ok else '✗'} Instances ≥ {MIN_INSTANCES}: {n_instances} instances")

In [None]:
# Preview the top of the frame: first ten records
print("\n" + "=" * 70, "FIRST 10 ROWS OF DATASET", "=" * 70, sep="\n")
display(df.head(n=10))

# Preview the bottom of the frame: last five records
print("\n" + "=" * 70, "LAST 5 ROWS OF DATASET", "=" * 70, sep="\n")
display(df.tail(n=5))

In [None]:
print("\n" + "="*70)
print("DATASET INFORMATION")
print("="*70)

# Column names, non-null counts and dtypes (df.info() prints directly to stdout)
print("\n📋 Column Details:")
df.info()

print("\n" + "="*70)
print("COLUMN NAMES AND DESCRIPTIONS")
print("="*70)

# Human-readable description for each expected column name.
# NOTE(review): these descriptions are hand-written and are not validated
# against df.columns or the value ranges actually present in the data.
feature_descriptions = {
    'age': 'Age of the patient (years)',
    'sex': 'Gender (1 = male, 0 = female)',
    'cp': 'Chest pain type (0-3)',
    'trestbps': 'Resting blood pressure (mm Hg)',
    'chol': 'Serum cholesterol (mg/dl)',
    'fbs': 'Fasting blood sugar > 120 mg/dl (1 = true, 0 = false)',
    'restecg': 'Resting electrocardiographic results (0-2)',
    'thalach': 'Maximum heart rate achieved',
    'exang': 'Exercise induced angina (1 = yes, 0 = no)',
    'oldpeak': 'ST depression induced by exercise',
    'slope': 'Slope of peak exercise ST segment (0-2)',
    'ca': 'Number of major vessels (0-4) colored by fluoroscopy',
    'thal': 'Thalassemia (0 = normal; 1 = fixed defect; 2 = reversable defect)',
    'target': 'Heart disease (0 = no disease, 1 = disease)'
}

# Numbered listing, column name padded to 12 characters for alignment
for i, (col, desc) in enumerate(feature_descriptions.items(), 1):
    print(f"{i:2d}. {col:12s} : {desc}")

In [None]:
print("\n" + "=" * 70, "STATISTICAL SUMMARY", "=" * 70, sep="\n")

# Descriptive statistics for every numeric column, rounded to two decimals
display(df.describe().round(decimals=2))

# Headline figures for age, maximum heart rate and cholesterol
print(
    "\n Key Statistics:",
    f"   - Average Age: {df['age'].mean():.1f} years",
    f"   - Age Range: {df['age'].min():.0f} - {df['age'].max():.0f} years",
    f"   - Average Max Heart Rate: {df['thalach'].mean():.1f} bpm",
    f"   - Average Cholesterol: {df['chol'].mean():.1f} mg/dl",
    sep="\n",
)

In [None]:
print("\n" + "=" * 70, "DATA QUALITY CHECK", "=" * 70, sep="\n")

# --- Missing values: null count per column; list only the affected columns ---
print("\n Missing Values:")
missing_values = df.isna().sum()
if missing_values.any():
    print(missing_values[missing_values > 0])
else:
    print("No missing values found in any column!")

# --- Duplicate rows: rows that exactly repeat an earlier row ---
duplicates = df.duplicated().sum()
print(f"\n Duplicate Rows: {duplicates}")
print("No duplicate rows found!" if duplicates == 0 else f"Found {duplicates} duplicate rows")

# --- Data types: how many columns there are of each dtype ---
print("\n Data Types Distribution:")
print(df.dtypes.value_counts())

# --- Validity: the only rule applied here is "no negative values" per numeric column ---
print("\n Data Validity Check:")
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    print(f" {col} has negative values" if df[col].lt(0).any() else f"{col} - all values valid")

In [None]:
print("\n" + "="*70)
print("TARGET VARIABLE ANALYSIS")
print("="*70)

# Per-class counts and percentages, both ordered by class value
target_counts = df['target'].value_counts().sort_index()
target_pct = df['target'].value_counts(normalize=True).sort_index() * 100

print("\n Target Distribution:")
print(f"\n{'Class':<15} {'Count':<10} {'Percentage'}")
print("-" * 40)
for cls in sorted(df['target'].unique()):
    count = target_counts[cls]
    pct = target_pct[cls]
    label = "No Disease" if cls == 0 else "Disease"
    # Pad the whole "<class> (<label>)" cell to the same 15-character width as
    # the header. The previous padding, ' ' * (7 - len(label)), was never
    # positive, so the Count/Percentage columns did not line up with the header.
    class_cell = f"{cls} ({label})"
    print(f"{class_cell:<15} {count:<10} {pct:.2f}%")

# Classes are called balanced when their shares differ by less than 10 percentage points
print(f"\n Dataset is {'balanced' if abs(target_pct[0] - target_pct[1]) < 10 else 'imbalanced'}")

# Visualization: count plot (left) and pie chart (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Count plot
colors = ['#90EE90', '#FFB6C6']
sns.countplot(data=df, x='target', ax=axes[0], palette=colors)
axes[0].set_title('Target Distribution (Count)', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Target (0 = No Disease, 1 = Disease)', fontsize=11)
axes[0].set_ylabel('Count', fontsize=11)
axes[0].set_xticklabels(['No Disease (0)', 'Disease (1)'])

# Add value labels on bars
for container in axes[0].containers:
    axes[0].bar_label(container, fmt='%d', fontsize=12)

# Pie chart (labels follow the sorted class order of target_counts: 0, then 1)
target_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    ax=axes[1],
    colors=colors,
    startangle=90,
    labels=['No Disease', 'Disease'],
    textprops={'fontsize': 12}
)
axes[1].set_title('Target Distribution (Percentage)', fontsize=14, fontweight='bold')
axes[1].set_ylabel('')

plt.tight_layout()
plt.savefig('target_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n Plot saved as 'target_distribution.png'")

In [None]:
print("\n" + "=" * 70, "FEATURE DISTRIBUTIONS", "=" * 70, sep="\n")

# Every column other than the label is plotted
features = [name for name in df.columns if name != 'target']

# Grid layout: three panels per row, with as many rows as needed (ceiling division)
n_cols = 3
n_rows = -(-len(features) // n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 4*n_rows))
axes = axes.flatten()

# One histogram (with KDE overlay) per feature, filling the grid in row-major order
for ax, feature in zip(axes, features):
    sns.histplot(data=df, x=feature, kde=True, ax=ax, color='skyblue', bins=20)
    ax.set_title(f'Distribution of {feature}', fontweight='bold', fontsize=11)
    ax.set_xlabel(feature, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)
    ax.grid(axis='y', alpha=0.3)

# Blank out any grid cells left over after the last feature
for ax in axes[len(features):]:
    ax.axis('off')

plt.tight_layout()
plt.savefig('feature_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n Plot saved as 'feature_distributions.png'")

In [None]:
print("\n" + "="*70)
print("CORRELATION ANALYSIS")
print("="*70)

# Pairwise correlation matrix (pandas default method), restricted to numeric
# columns so that a stray non-numeric column cannot break the computation.
correlation_matrix = df.select_dtypes(include=[np.number]).corr()

# Heatmap on a fixed [-1, 1] colour scale centred at 0
plt.figure(figsize=(14, 12))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1,
    cbar_kws={"shrink": 0.8},
    vmin=-1,
    vmax=1
)

plt.title('Feature Correlation Matrix', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n Plot saved as 'correlation_matrix.png'")

# Correlation of every column with the target, sorted from most positive to
# most negative. The 'target' entry itself is kept in target_corr.
print("\n Correlations with Target Variable:")
target_corr = correlation_matrix['target'].sort_values(ascending=False)

# Drop the target's self-correlation before ranking the features
feature_corr = target_corr.drop('target')

print("\nTop Positive Correlations:")
print(feature_corr[feature_corr > 0].head(5))

# The series is sorted descending, so the strongest negative correlations sit
# at the END. Re-sort ascending to list the most negative values first; taking
# the first five of the descending order would show the weakest ones.
print("\nTop Negative Correlations:")
print(feature_corr[feature_corr < 0].sort_values().head(5))

In [None]:
print("\n" + "=" * 70, "FEATURE COMPARISON BY TARGET CLASS", "=" * 70, sep="\n")

# Continuous features compared across the two target classes
key_features = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# 2 x 3 grid: five box plots, sixth panel left unused
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.flatten()

# One box plot per feature, split by target class
for ax, feature in zip(axes, key_features):
    sns.boxplot(data=df, x='target', y=feature, ax=ax, palette=['#90EE90', '#FFB6C6'])
    ax.set_title(f'{feature} by Target', fontweight='bold', fontsize=12)
    ax.set_xlabel('Target (0 = No Disease, 1 = Disease)', fontsize=10)
    ax.set_ylabel(feature, fontsize=10)
    ax.set_xticklabels(['No Disease', 'Disease'])

# The final grid cell has no feature assigned; hide it
axes[-1].axis('off')

plt.tight_layout()
plt.savefig('feature_by_target.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n Plot saved as 'feature_by_target.png'")

In [None]:
print("\n" + "="*70)
print("CATEGORICAL FEATURES ANALYSIS")
print("="*70)

# Features treated as categorical in this analysis
categorical_features = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']

# 3 x 3 grid: eight bar charts, ninth panel left unused
fig, axes = plt.subplots(3, 3, figsize=(18, 12))
axes = axes.flatten()

for idx, feature in enumerate(categorical_features):
    # Count rows per (feature value, target) pair, then pivot so each target
    # class becomes its own column; combinations that never occur become 0.
    df_temp = df.groupby([feature, 'target']).size().reset_index(name='count')
    pivot_df = df_temp.pivot(index=feature, columns='target', values='count').fillna(0)

    # Grouped bars: one bar per target class for every feature value
    pivot_df.plot(kind='bar', ax=axes[idx], color=['#90EE90', '#FFB6C6'], width=0.7)
    axes[idx].set_title(f'{feature} Distribution by Target', fontweight='bold', fontsize=11)
    axes[idx].set_xlabel(feature, fontsize=10)
    axes[idx].set_ylabel('Count', fontsize=10)
    axes[idx].legend(['No Disease', 'Disease'], fontsize=9)
    axes[idx].tick_params(axis='x', rotation=0)

# Hide last subplot (previously an unterminated string literal, which made the
# whole cell fail with a SyntaxError)
axes[-1].axis('off')

plt.tight_layout()
plt.savefig('categorical_features.png', dpi=300, bbox_inches='tight')
plt.show()

print("\n Plot saved as 'categorical_features.png'")

In [None]:
print("\n" + "="*70)
print("EXPLORATORY DATA ANALYSIS SUMMARY")
print("="*70)

# Recompute the data-quality figures from the frame instead of reporting
# hard-coded zeros, so the summary always reflects the data actually loaded.
n_missing = int(df.isnull().sum().sum())
n_duplicates = int(df.duplicated().sum())

print("\n Dataset Overview:")
print(f"   • Total samples: {df.shape[0]:,}")
print(f"   • Total features: {df.shape[1] - 1}")
print("   • Target variable: Binary (0/1)")
print(f"   • Missing values: {n_missing}")
print(f"   • Duplicate rows: {n_duplicates}")

# target_counts / target_pct are produced by the target-analysis cell
print("\n Target Distribution:")
print(f"   • Class 0 (No Disease): {target_counts[0]} ({target_pct[0]:.1f}%)")
print(f"   • Class 1 (Disease): {target_counts[1]} ({target_pct[1]:.1f}%)")
print(f"   • Balance: {'Good' if abs(target_pct[0] - target_pct[1]) < 10 else 'Moderate'}")

# target_corr is produced by the correlation cell. Drop the target's
# self-correlation and select the extremes explicitly with idxmax / idxmin,
# so the result does not depend on the series' sort order. The previous
# `.index[0]` of the negatives picked the WEAKEST negative correlation.
feature_corr = target_corr.drop('target')
positive_corr = feature_corr[feature_corr > 0]
negative_corr = feature_corr[feature_corr < 0]
strongest_positive = positive_corr.idxmax() if not positive_corr.empty else 'none'
strongest_negative = negative_corr.idxmin() if not negative_corr.empty else 'none'

print("\n Key Findings:")
print(f"   • Age range: {df['age'].min():.0f} - {df['age'].max():.0f} years")
print(f"   • Average age: {df['age'].mean():.1f} years")
print(f"   • Gender split: {(df['sex']==1).sum()} males, {(df['sex']==0).sum()} females")
print(f"   • Strongest positive correlation with target: {strongest_positive}")
print(f"   • Strongest negative correlation with target: {strongest_negative}")

print("\n Visualizations Created:")
print("   1. target_distribution.png")
print("   2. feature_distributions.png")
print("   3. correlation_matrix.png")
print("   4. feature_by_target.png")
print("   5. categorical_features.png")

# Only claim the data is clean when the checks above actually found nothing
if n_missing == 0 and n_duplicates == 0:
    print("\n Dataset is clean and ready for modeling!")
else:
    print("\n Dataset needs cleaning before modeling (see missing values / duplicate rows above)")
print("\n Next Step: Run '02_train_models.ipynb' to train ML models")
print("="*70)