# process_data package demo
This notebook imports the local package and tests all functions on a small sample dataset.

In [1]:
# Import the package (add src to sys.path if not installed)
from pathlib import Path
import sys
try:
    import process_data as pdlib
except ModuleNotFoundError:
    sys.path.append(str(Path.cwd() / "src"))
    import process_data as pdlib

import pandas as pd
import numpy as np
import inspect

print("process_data version:", getattr(pdlib, "__version__", "unknown"))
print("exports:", [n for n in dir(pdlib) if not n.startswith("_")])

process_data version: unknown
exports: ['BMICalculator', 'CSVDataLoader', 'DataPreprocessor', 'DiabetesModel', 'EthnicityEncoder', 'GenderBinaryEncoder', 'data', 'features', 'model']


In [2]:
# Prepare a sample CSV (use existing one if present)
data_dir = Path.cwd() / "data"
csv_path = data_dir / "sample_diabetes_mellitus_data.csv"
data_dir.mkdir(parents=True, exist_ok=True)

if csv_path.exists():
    print("Found existing sample CSV:", csv_path)
    df = pdlib.data_loader(csv_path)
else:
    rng = np.random.default_rng(42)
    n = 50
    df = pd.DataFrame({
        "age": rng.integers(18, 85, size=n).astype(float),
        "height": rng.normal(170, 10, size=n),
        "weight": rng.normal(75, 15, size=n),
        "aids": rng.integers(0, 2, size=n),
        "cirrhosis": rng.integers(0, 2, size=n),
        "hepatic_failure": rng.integers(0, 2, size=n),
        "immunosuppression": rng.integers(0, 2, size=n),
        "leukemia": rng.integers(0, 2, size=n),
        "lymphoma": rng.integers(0, 2, size=n),
        "solid_tumor_with_metastasis": rng.integers(0, 2, size=n),
        "gender": rng.choice(["M", "F", "Male", "Female", None], size=n, p=[0.35,0.35,0.1,0.1,0.1]),
        "ethnicity": rng.choice(["A","B","C", None], size=n, p=[0.4,0.35,0.2,0.05]),
        "diabetes_mellitus": rng.integers(0, 2, size=n)
    })
    df.loc[[1, 7], "age"] = np.nan
    df.loc[[2, 9], "gender"] = None
    df.loc[[3], "ethnicity"] = None
    df.to_csv(csv_path, index=False)
    print("Created sample CSV:", csv_path)
    print(df.head())

print("Data shape:", df.shape)

Found existing sample CSV: c:\EC\BSE\DSDM\Term 1\21DM004 Computing for Data Science\hw4\process_data\data\sample_diabetes_mellitus_data.csv


AttributeError: module 'process_data' has no attribute 'data_loader'

In [None]:
# Test data_loader and data_split
df_loaded = pdlib.data_loader(csv_path)
train_df, test_df = pdlib.data_split(csv_path, test_size=0.3, random_state=42)
print("Loaded:", df_loaded.shape, "Train:", train_df.shape, "Test:", test_df.shape)

In [None]:
# Remove rows with NaNs in specific columns (age, gender, ethnicity)
cols_nan = ["age", "gender", "ethnicity"]
train_df = pdlib.data_remove_nans(train_df, columns=cols_nan)
test_df = pdlib.data_remove_nans(test_df, columns=cols_nan)
print("After remove_nans -> Train/Test:", train_df.shape, test_df.shape)
assert train_df[cols_nan].isna().sum().sum() == 0
assert test_df[cols_nan].isna().sum().sum() == 0

In [None]:
# Fill NaNs in numeric columns height, weight with means
train_df = pdlib.data_fill_nans(train_df, columns=["height", "weight"])
test_df = pdlib.data_fill_nans(test_df, columns=["height", "weight"])
print("Height/Weight NaNs after fill -> Train/Test:",
      train_df[["height","weight"]].isna().sum().to_dict(),
      test_df[["height","weight"]].isna().sum().to_dict())

In [None]:
# One-hot encode ethnicity and make gender binary
train_df = pdlib.data_encoding(train_df, columns=["ethnicity"])
test_df = pdlib.data_encoding(test_df, columns=["ethnicity"])
train_df = pdlib.data_binary(train_df, column="gender")
test_df = pdlib.data_binary(test_df, column="gender")
print("After encoding/binary -> Train/Test:", train_df.shape, test_df.shape)
print("gender dtype:", train_df["gender"].dtype)
train_df.head(2)

In [None]:
# Define features and target for model training
FEATURES = [
    "age", "height", "weight",
    "aids", "cirrhosis", "hepatic_failure",
    "immunosuppression", "leukemia", "lymphoma",
    "solid_tumor_with_metastasis",
]
TARGET = "diabetes_mellitus"

missing_tr = [c for c in FEATURES if c not in train_df.columns]
missing_te = [c for c in FEATURES if c not in test_df.columns]
if missing_tr or missing_te:
    raise KeyError(f"Missing required features. Train: {missing_tr}, Test: {missing_te}")

# Train models
X_train = train_df[FEATURES]
y_train = train_df[TARGET]
model_lr = pdlib.data_train_models(X_train, y_train, model_type="logreg")
model_rf = pdlib.data_train_models(X_train, y_train, model_type="rf")

print(type(model_lr).__name__, type(model_rf).__name__)

In [None]:
# Add prediction probabilities
try:
    add_predictions = pdlib.add_predictions
except AttributeError:
    from process_data.pred_auc_score import add_predictions

train_with_pred, test_with_pred = add_predictions(
    model_lr, train_df, test_df, FEATURES, pred_col="predictions", inplace=False
)
print(train_with_pred[[TARGET, "predictions"]].head())
print(test_with_pred[[TARGET, "predictions"]].head())
assert "predictions" in train_with_pred.columns and "predictions" in test_with_pred.columns

In [None]:
# Compute ROC AUC
auc_train = pdlib.pred_auc_score(train_with_pred[TARGET], train_with_pred["predictions"]) 
auc_test = pdlib.pred_auc_score(test_with_pred[TARGET], test_with_pred["predictions"]) 
print(f"AUC (train): {auc_train:.3f} | AUC (test): {auc_test:.3f}")

In [None]:
# Verify data_predict matches add_predictions
from numpy.testing import assert_allclose

proba_chk = pdlib.data_predict(model_lr, test_df[FEATURES], proba=True)
assert_allclose(proba_chk, test_with_pred["predictions"].to_numpy(), atol=1e-9)
print("data_predict(proba=True) matches add_predictions output on test set.")