In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%writefile notebooks/01_data_preprocessing.ipynb

#
# =================================================================================================
# COMPREHENSIVE MACHINE LEARNING FULL PIPELINE ON HEART DISEASE UCI DATASET
# =================================================================================================
#
# Notebook: 01_data_preprocessing.ipynb
#
# Description:
# This notebook covers the initial phase of the project: Data Preprocessing and Cleaning.
# It includes loading the dataset, handling missing values, encoding categorical features,
# scaling numerical features, and performing exploratory data analysis (EDA).
#
# =================================================================================================

# Required Library Imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# =================================================================================================
# Sprint 2.1: Data Preprocessing & Cleaning
# =================================================================================================

# -------------------------------------------------------------------------------------------------
# Step 1: Load the Heart Disease UCI dataset into a Pandas DataFrame.
# -------------------------------------------------------------------------------------------------
import os

print("Step 1: Loading the dataset...")

# Remote fallback source. The file at this URL is read with header=None, so the column
# names are supplied explicitly. Both names are bound here, outside the try/except, so
# they exist no matter which branch ends up loading the data.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

try:
    # Prefer the CSV attached under /kaggle/input/heart-disease-uci/ when it is available.
    df = pd.read_csv('/kaggle/input/heart-disease-uci/heart.csv')
except FileNotFoundError:
    print("Local file not found. Loading from a remote URL...")
    # Fall back to the remote URL if the local file isn't found.
    df = pd.read_csv(url, header=None, names=column_names)
    print("Dataset loaded successfully from remote URL.")
else:
    print("Dataset loaded successfully from local path.")

# Keep a snapshot of the data exactly as loaded. DataFrame.to_csv does not create missing
# parent directories, so make sure 'data/' exists before writing into it.
os.makedirs('data', exist_ok=True)
df.to_csv('data/heart_disease.csv', index=False)

print("\nInitial Dataset Info:")
df.info()
print("\nFirst 5 rows of the dataset:")
print(df.head())


# -------------------------------------------------------------------------------------------------
# Step 2: Handle missing values (imputation or removal).
# -------------------------------------------------------------------------------------------------
print("\nStep 2: Handling missing values...")
# Missing entries may be encoded as the literal string '?'.
# Replace them with NaN so pandas' missing-value tooling can see them.
df.replace('?', np.nan, inplace=True)

print("\nMissing values before handling:")
print(df.isnull().sum())

for col in ['ca', 'thal']:
    # A column that contained '?' is read as object dtype; coerce it to numeric first.
    df[col] = pd.to_numeric(df[col])
    # Impute with the column median. The result is assigned back to the frame rather than
    # calling fillna(inplace=True) on df[col]: that chained in-place form operates on an
    # intermediate object, emits a FutureWarning in pandas 2.x and leaves df unchanged
    # once Copy-on-Write is enabled.
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)

print("\nMissing values after handling:")
print(df.isnull().sum())


# -------------------------------------------------------------------------------------------------
# Step 3: Perform data encoding (one-hot encoding for categorical variables).
# -------------------------------------------------------------------------------------------------
print("\nStep 3: Performing one-hot encoding for categorical variables...")

# These columns hold numeric codes, but the codes stand for categories, so each is expanded
# into indicator columns. drop_first=True omits the first level of every column.
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
encoded_df = pd.get_dummies(data=df, columns=categorical_cols, drop_first=True)
df = encoded_df

print("\nDataset shape after one-hot encoding:", df.shape)
print("\nFirst 5 rows of the dataset after encoding:")
print(df.head())


# -------------------------------------------------------------------------------------------------
# Step 4: Standardize numerical features using StandardScaler.
# -------------------------------------------------------------------------------------------------
print("\nStep 4: Standardizing numerical features...")
from sklearn.preprocessing import StandardScaler

# Continuous columns that get rescaled; every other column is left untouched.
numerical_cols = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']

# Fit the scaler on the selected columns, then overwrite them with the transformed values.
scaler = StandardScaler()
scaler.fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

print("\nFirst 5 rows of the dataset after scaling:")
print(df.head())


# -------------------------------------------------------------------------------------------------
# Step 5: Conduct Exploratory Data Analysis (EDA).
# -------------------------------------------------------------------------------------------------
print("\nStep 5: Conducting Exploratory Data Analysis (EDA)...")

# --- Correlation Heatmap ---
# Pairwise correlations between all columns of the current (processed) frame.
plt.figure(figsize=(18, 15))
sns.heatmap(df.corr(), annot=True, cmap='viridis', fmt='.2f')
plt.title('Correlation Heatmap of Features', fontsize=20)
plt.show()

# --- Histograms for Numerical Features ---
print("\nDisplaying histograms for original numerical features (before scaling)...")
# For more interpretable plots, work from the data as it was before scaling and encoding.
# The snapshot saved to 'data/heart_disease.csv' during loading holds exactly that, so read
# it back instead of fetching the dataset a second time. This also avoids depending on the
# remote-URL variables, which are only guaranteed to exist if the remote fallback ran.
df_eda = pd.read_csv('data/heart_disease.csv')
df_eda.replace('?', np.nan, inplace=True)
df_eda.dropna(inplace=True)  # Drop rows with missing values for simplicity in EDA plots
for col in ['ca', 'thal']:
    df_eda[col] = pd.to_numeric(df_eda[col])

df_eda[numerical_cols].hist(bins=20, figsize=(15, 10), layout=(2, 3))
plt.suptitle('Histograms of Numerical Features')
plt.show()

# --- Boxplots for Numerical Features vs. Target ---
# One panel per numerical feature on a 2x3 grid, grouped by the target value.
plt.figure(figsize=(15, 10))
for position, col in enumerate(numerical_cols, start=1):
    plt.subplot(2, 3, position)
    sns.boxplot(x='target', y=col, data=df_eda)
    plt.title(f'{col} vs. Target')
plt.tight_layout()
plt.show()

print("\nEDA visualizations have been generated.")


# -------------------------------------------------------------------------------------------------
# Convert to Binary Classification Target
# -------------------------------------------------------------------------------------------------
print("\nConverting target to binary classification (0: No Disease, 1: Disease)...")
# Collapse the target to two classes: strictly positive values map to 1, everything else to 0.
df['target'] = df['target'].gt(0).astype(int)

print("\nValue counts of the new binary target:")
print(df['target'].value_counts())

# Class balance of the binary target, expressed as percentages.
distribution = df['target'].value_counts(normalize=True).mul(100)
print("Target variable distribution:")
print(distribution)

# -------------------------------------------------------------------------------------------------
# Deliverable: Cleaned dataset ready for modeling
# -------------------------------------------------------------------------------------------------
print("\nData preprocessing and cleaning complete.")
# Persist the processed frame (with the binary target) to the working directory, without
# writing the index as a column.
cleaned_csv_path = 'cleaned_heart_disease.csv'
df.to_csv(cleaned_csv_path, index=False)
print("Cleaned dataset saved to 'cleaned_heart_disease.csv'.")