In [None]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve

# 2. Load Dataset
# Read the CSV from its relative path into `df`, then preview the first rows
# as the cell's output.
csv_path = '../data/diabetes.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# 3. Basic Info
# df.info() prints its report itself. A notebook cell only renders its LAST
# bare expression, so the summary statistics are handed to display() explicitly;
# otherwise their result would be computed and silently dropped.
df.info()
display(df.describe())

# Per-column count of missing values (rendered as the cell output).
df.isnull().sum()

In [None]:
# 4. Check for Invalid Zeroes
# Count, for each listed column, how many entries are exactly 0 and print one
# line per column.
cols_with_zero_issues = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
zero_counts = df[cols_with_zero_issues].isin([0]).sum()
for col, n_zero in zero_counts.items():
    print(f"{col} - zero values: {n_zero}")

In [None]:
# 5. Replace 0 with NaN, Then Impute (Median)
# Step 1: in the flagged columns, turn every 0 into NaN so it counts as missing.
zeros_as_missing = df[cols_with_zero_issues].replace(0, np.nan)
df[cols_with_zero_issues] = zeros_as_missing

# Step 2: fill every NaN in df with the median of its own (numeric) column.
# NOTE(review): the medians are taken over all rows of df, i.e. before any
# train/test split performed later in the notebook.
column_medians = df.median(numeric_only=True)
df.fillna(column_medians, inplace=True)

In [None]:
# 6. EDA – Correlation Matrix
# Draw an annotated heatmap of the pairwise correlations between df's columns.
fig, ax = plt.subplots(figsize=(10, 6))
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# 7. Train-Test Split
# Separate the feature columns from the 'Outcome' target.
X = df.drop('Outcome', axis=1)
y = df['Outcome']

# Split the UNSCALED features first so that the held-out rows cannot influence
# the scaler's mean/variance (fitting the scaler on all rows before splitting
# leaks test-set statistics into the training features).
# test_size, random_state and stratify are unchanged from the previous version.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 8. Feature Scaling
# Learn the scaling parameters from the training rows only, then apply that
# same fitted transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any later cell referring to `X_scaled` still runs; it is now all
# rows of X transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)