#  01 - EDA & Preprocessing

This notebook performs exploratory data analysis (EDA) and preprocessing on a healthcare dataset — here, the diabetes data loaded from `../data/diabetes.csv`.

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's default theme with the 'whitegrid' axes style to subsequent plots.
sns.set(style='whitegrid')

: 

In [None]:
# Load Dataset
# The CSV location is relative to this notebook's working directory;
# edit the string below if the file lives elsewhere.
df = pd.read_csv(filepath_or_buffer='../data/diabetes.csv')
# Preview the first five rows as the cell output.
df.head(n=5)

In [None]:
# Basic Info
# Print a concise summary of the frame: column names, non-null counts,
# dtypes and memory usage.
df.info()

In [None]:
# Missing Values
# Count NaN/None entries per column (one total for each column of df).
# NOTE(review): this only detects NaN/None; if the dataset encodes missing
# measurements with a placeholder such as 0, they will not show up here --
# confirm against the data dictionary.
print(df.isna().sum(axis=0))

In [None]:
# Descriptive Statistics
# Summary statistics per column (by default pandas reports count, mean, std,
# min, quartiles and max for the numeric columns).
df.describe()

In [None]:
# Target Distribution
# Bar chart of how many rows fall into each value of the 'Outcome' column.
# countplot returns the Axes it drew on, so the title is set directly on it.
sns.countplot(x='Outcome', data=df).set_title('Class Distribution')
plt.show()

In [None]:
# Feature Distributions
# Collect the labels of every int64/float64 column ...
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# ... and draw one 20-bin histogram per selected column on a shared figure.
df.loc[:, num_cols].hist(bins=20, color='skyblue', figsize=(15, 10))
# Re-pack the subplots so titles and tick labels do not overlap.
plt.gcf().tight_layout()
plt.show()

In [None]:
# Correlation Matrix
# Pairwise correlation is only defined for numeric data. Since pandas 2.0,
# DataFrame.corr() defaults to numeric_only=False and raises if the frame
# holds any non-numeric column, so restrict the frame explicitly first.
# Numeric and boolean columns are kept, matching what numeric_only=True
# would select, while staying compatible with older pandas releases.
corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Annotated heatmap: each cell shows the coefficient rounded to 2 decimals.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Preprocessing
# Standardize every feature column (zero mean, unit variance with the default
# StandardScaler settings) and persist the result, together with the target
# column, for reuse outside this notebook.
from sklearn.preprocessing import StandardScaler

# Split the frame into the feature matrix and the 'Outcome' target.
X = df.drop(columns='Outcome')
y = df['Outcome']

# NOTE(review): the scaler is fit on every row of df. If a later step holds
# out a test split from features.csv, those rows have already influenced the
# scaling statistics -- confirm whether the scaler should instead be fit on
# the training rows only.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# By default fit_transform returns a plain NumPy array, so rebuild the
# DataFrame with the original column names AND the original row index.
# Without the index the scaled X would be renumbered 0..n-1 and could
# silently misalign with y in any index-based operation.
X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

# Save preprocessed features for reuse (index=False: row labels are not written)
X.to_csv('../data/features.csv', index=False)
y.to_csv('../data/labels.csv', index=False)

✅ **Next step:** Proceed to `02_model_training.ipynb` to build and evaluate the predictive model.