# Heart Disease Logistic Regression Analysis

This notebook performs:
1. Data loading and target binarization
2. Exploratory Data Analysis (EDA)
3. Data preparation (train/test split, normalization)

**Note:** No high-level ML libraries (scikit-learn, statsmodels, TensorFlow, PyTorch) are used.

## 1. Import Libraries and Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the heart-disease records; the relative path is resolved against
# the kernel's current working directory.
df = pd.read_csv('Heart_Disease_Prediction.csv')

# Report the frame's dimensions, then leave a five-row preview as the
# cell's displayed value (bare last expression -> rich table output).
print("Dataset Shape:", df.shape)
print("\nFirst 5 rows:")
df.head(n=5)

In [None]:
# Binarize the target column: 1 = Presence (disease), 0 = Absence (no disease)
target_mapping = {'Presence': 1, 'Absence': 0}
df['Heart Disease Binary'] = df['Heart Disease'].map(target_mapping)

# Series.map() yields NaN for every value that is not a key of the mapping.
# Such rows would match neither class 0 nor class 1 further down, so fail
# loudly here instead of letting them vanish silently.
unmapped_labels = df.loc[df['Heart Disease Binary'].isna(), 'Heart Disease'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'Heart Disease': {list(unmapped_labels)}; "
        f"expected one of {list(target_mapping)}"
    )

# With no NaN left the column can be held as plain integers (a no-op when
# the mapping already produced int64).
df['Heart Disease Binary'] = df['Heart Disease Binary'].astype('int64')

# Verify the mapping
print("Target column mapping:")
print(df[['Heart Disease', 'Heart Disease Binary']].drop_duplicates())
print("\nTarget distribution:")
print(df['Heart Disease Binary'].value_counts())

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Summary statistics
print("="*60)
print("SUMMARY STATISTICS")
print("="*60)
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# it is called directly; wrapping it in print() appended a stray "None".
df.info()
print("\n" + "="*60)
print("Descriptive Statistics:")
print("="*60)
# Bare last expression: the descriptive-statistics table becomes the cell output.
df.describe()

In [None]:
# Check for missing values
print("="*60, "MISSING VALUES CHECK", "="*60, sep="\n")

# Number of null entries in each column of the loaded frame
missing_values = df.isnull().sum()

print("\nMissing values per column:", missing_values, sep="\n")
print(f"\nTotal missing values: {missing_values.sum()}")

In [None]:
# Outlier detection using IQR method for numerical columns
print("="*60, "OUTLIER DETECTION (IQR Method)", "="*60, sep="\n")

# Columns that are screened with the IQR rule
numerical_cols = [
    'Age',
    'BP',
    'Cholesterol',
    'Max HR',
    'ST depression',
    'Number of vessels fluro',
]

def detect_outliers_iqr(data, column):
    """
    Count the rows whose value in ``column`` lies outside the IQR fences.

    The fences are placed 1.5 * IQR below the first quartile and
    1.5 * IQR above the third quartile of ``data[column]``.

    Parameters
    ----------
    data : table-like object indexable by column name and by boolean mask
    column : name of the column to examine

    Returns
    -------
    tuple ``(n_outliers, lower_bound, upper_bound)``
    """
    values = data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)

    # Distance from each quartile to its fence
    whisker = 1.5 * (third_quartile - first_quartile)
    lower_bound = first_quartile - whisker
    upper_bound = third_quartile + whisker

    # Strict comparisons: a value sitting exactly on a fence is not counted
    beyond_fences = (values < lower_bound) | (values > upper_bound)
    return len(data[beyond_fences]), lower_bound, upper_bound

print("\nOutliers per column:")

# One summary record per screened column, in the order the columns are listed
outlier_summary = []
for col in numerical_cols:
    n_outliers, lb, ub = detect_outliers_iqr(df, col)
    print(f"  {col}: {n_outliers} outliers (bounds: [{lb:.2f}, {ub:.2f}])")
    outlier_summary.append({
        'Column': col,
        'Outliers': n_outliers,
        'Lower Bound': lb,
        'Upper Bound': ub,
    })

# Tabulate the records; the bare name on the last line renders the table
outlier_df = pd.DataFrame(outlier_summary)
outlier_df

In [None]:
# Handle outliers by capping (winsorizing) at IQR bounds
print("="*60, "HANDLING OUTLIERS (Capping at IQR bounds)", "="*60, sep="\n")

# Work on a copy so the frame loaded earlier keeps its original values
df_clean = df.copy()

# NOTE(review): the bounds below are derived from every row, i.e. before the
# train/test split performed later in this notebook - confirm that using
# test rows to set the caps is acceptable for this analysis.
for col in numerical_cols:
    Q1, Q3 = (df_clean[col].quantile(q) for q in (0.25, 0.75))
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    # Count the out-of-range values first: after clipping none remain
    too_low = df_clean[col] < lower_bound
    too_high = df_clean[col] > upper_bound
    original_outliers = (too_low | too_high).sum()

    # Pull every value back inside [lower_bound, upper_bound]
    df_clean[col] = df_clean[col].clip(lower=lower_bound, upper=upper_bound)
    print(f"  {col}: Capped {original_outliers} outliers")

print("\nOutliers handled successfully!")

In [None]:
# Plot class distribution
print("="*60, "CLASS DISTRIBUTION", "="*60, sep="\n")

# Per-class row counts, looked up by label (0 = absence, 1 = presence)
class_counts = df_clean['Heart Disease Binary'].value_counts()
colors = ['#2ecc71', '#e74c3c']
class_labels = ['Absence (0)', 'Presence (1)']
class_sizes = [class_counts[0], class_counts[1]]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
bar_ax, pie_ax = axes[0], axes[1]

# Left panel: bar chart with the count written above each bar
bar_ax.bar(class_labels, class_sizes, color=colors)
bar_ax.set_xlabel('Heart Disease')
bar_ax.set_ylabel('Count')
bar_ax.set_title('Class Distribution (Bar Chart)')
for bar_position, bar_height in enumerate(class_sizes):
    # +2 lifts the label slightly above the top of the bar
    bar_ax.text(bar_position, bar_height + 2, str(bar_height), ha='center', fontweight='bold')

# Right panel: pie chart of the same counts, slices pulled slightly apart
pie_ax.pie(
    class_sizes,
    labels=class_labels,
    autopct='%1.1f%%',
    colors=colors,
    explode=(0.02, 0.02),
)
pie_ax.set_title('Class Distribution (Pie Chart)')

plt.tight_layout()
plt.show()

# Same numbers in text form, as counts and as a share of all rows
print(f"\nClass 0 (Absence): {class_counts[0]} ({class_counts[0]/len(df_clean)*100:.1f}%)")
print(f"Class 1 (Presence): {class_counts[1]} ({class_counts[1]/len(df_clean)*100:.1f}%)")

## 3. Data Preparation

### Feature Selection (≥6 features)
Selected features:
1. **Age** - Patient age
2. **BP** - Blood Pressure
3. **Cholesterol** - Cholesterol level
4. **Max HR** - Maximum Heart Rate
5. **ST depression** - ST depression induced by exercise
6. **Number of vessels fluro** - Number of major vessels colored by fluoroscopy

In [None]:
# Select features and target
selected_features = [
    'Age', 'BP', 'Cholesterol',
    'Max HR', 'ST depression', 'Number of vessels fluro',
]

# Pull the label column and the feature columns out of the cleaned frame
# as NumPy arrays
y = df_clean['Heart Disease Binary'].values
X = df_clean[selected_features].values

print("="*60, "FEATURE SELECTION", "="*60, sep="\n")
print(f"\nSelected {len(selected_features)} features: {selected_features}")
print(f"\nFeature matrix shape: {X.shape}", f"Target vector shape: {y.shape}", sep="\n")

In [None]:
# Stratified Train/Test Split (70/30) - Manual implementation without sklearn
def stratified_train_test_split(X, y, test_size=0.3, random_seed=42):
    """
    Split features and labels into train/test sets with matching class mix.

    Every distinct label found in ``y`` is shuffled and split on its own, so
    each class contributes (about) the same fraction of its rows to the test
    set. The per-class pieces are then merged and shuffled once more so the
    returned rows are not grouped by class.

    Parameters
    ----------
    X : array-like, first axis of length n_samples
        Feature rows; converted with ``np.asarray``.
    y : array-like of length n_samples
        Class labels; any number of distinct labels is supported. A NaN
        label equals no class and is therefore left out of both sets.
    test_size : float in [0, 1], default 0.3
        Fraction of each class assigned to the test set. The per-class test
        count is ``int(n_class * test_size)``, i.e. it is truncated.
    random_seed : int or None, default 42
        Seed for a private ``np.random.RandomState``. The global NumPy
        random state is not modified.

    Returns
    -------
    X_train, X_test, y_train, y_test : np.ndarray

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length or ``test_size`` is outside [0, 1].
    """
    X = np.asarray(X)
    y = np.asarray(y)

    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
        )
    if not 0.0 <= test_size <= 1.0:
        raise ValueError(f"test_size must be within [0, 1], got {test_size}")

    # A private RandomState produces the same stream as np.random.seed(seed)
    # followed by np.random.shuffle, but leaves the global generator alone.
    rng = np.random.RandomState(random_seed)

    train_parts = []
    test_parts = []
    # np.unique returns the labels sorted, so classes are always processed
    # in the same order (0 before 1 for a binary target).
    for label in np.unique(y):
        class_indices = np.where(y == label)[0]
        rng.shuffle(class_indices)

        # The first n_test shuffled rows of this class go to the test set
        n_test = int(len(class_indices) * test_size)
        test_parts.append(class_indices[:n_test])
        train_parts.append(class_indices[n_test:])

    # np.concatenate rejects an empty list, which happens when y is empty
    no_rows = np.empty(0, dtype=int)
    train_indices = np.concatenate(train_parts) if train_parts else no_rows
    test_indices = np.concatenate(test_parts) if test_parts else no_rows

    # Mix the classes so neither set is ordered by label
    rng.shuffle(train_indices)
    rng.shuffle(test_indices)

    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

# Perform stratified split
X_train, X_test, y_train, y_test = stratified_train_test_split(
    X, y, test_size=0.3, random_seed=42
)

print("="*60, "STRATIFIED TRAIN/TEST SPLIT (70/30)", "="*60, sep="\n")

# Size of each partition, absolute and as a share of all samples
n_total = len(X)
print(f"\nTraining set: {len(X_train)} samples ({len(X_train)/n_total*100:.1f}%)")
print(f"Test set: {len(X_test)} samples ({len(X_test)/n_total*100:.1f}%)")

# Per-class counts inside the training partition
train_class_0 = np.sum(y_train == 0)
train_class_1 = np.sum(y_train == 1)
print("\nClass distribution in training set:")
print(f"  Class 0: {train_class_0} ({train_class_0/len(y_train)*100:.1f}%)")
print(f"  Class 1: {train_class_1} ({train_class_1/len(y_train)*100:.1f}%)")

# Per-class counts inside the test partition
test_class_0 = np.sum(y_test == 0)
test_class_1 = np.sum(y_test == 1)
print("\nClass distribution in test set:")
print(f"  Class 0: {test_class_0} ({test_class_0/len(y_test)*100:.1f}%)")
print(f"  Class 1: {test_class_1} ({test_class_1/len(y_test)*100:.1f}%)")