In [1]:
from pathlib import Path
import pandas as pd
from data_ingestion import load_all_raw
from feature_engineering import build_subject_snapshot
from scoring import compute_clean_patient_flags, compute_dqi
import warnings
import pandas as pd

# Suppress specific pandas warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")
pd.set_option('future.no_silent_downcasting', True)


# Ask for the data root interactively. Surrounding whitespace is removed first
# so that quotes pasted together with the path are stripped even when the
# pasted text carries stray leading/trailing spaces.
_root_answer = input("Enter root directory path: ").strip().strip('"\'')
if not _root_answer:
    # An empty answer would otherwise become Path(""), i.e. the current
    # working directory, and silently pass the check below.
    raise ValueError("No root directory path was entered")

ROOT_DIR = Path(_root_answer)
# iterdir() below requires a directory, so reject missing paths and plain files.
if not ROOT_DIR.is_dir():
    raise ValueError(f"Path not found or not a directory: {ROOT_DIR}")

# Verify before loading: collect the sub-folders whose name contains "study"
# (case-insensitive), sorted by name so the preview is stable across runs.
study_folders = sorted(
    (p for p in ROOT_DIR.iterdir() if p.is_dir() and "study" in p.name.lower()),
    key=lambda p: p.name,
)
_preview = [p.name for p in study_folders[:3]]
# Only append the ellipsis when the preview really is truncated.
_suffix = "..." if len(study_folders) > len(_preview) else ""
print(f"Found {len(study_folders)} study folders: {_preview}{_suffix}")

# 1. Load all studies and build snapshot
# Pipeline: raw tables under ROOT_DIR -> subject-level snapshot ->
# clean-patient flags -> DQI, each step feeding its result to the next.
raw_all = load_all_raw(ROOT_DIR)
subject_df = compute_dqi(
    compute_clean_patient_flags(
        build_subject_snapshot(raw_all)
    )
)

# Create the output folder (and any missing parents) before writing to it.
Path("data/processed").mkdir(parents=True, exist_ok=True)

# Make every non-numeric column explicitly pandas "string" dtype.
text_columns = [
    name
    for name in subject_df.columns
    if not pd.api.types.is_numeric_dtype(subject_df[name])
]
for name in text_columns:
    subject_df[name] = subject_df[name].astype("string")

# Persist the subject-level snapshot for the dashboard / further analysis.
subject_df.to_parquet("data/processed/subject_site_snapshot.parquet", index=False)

# 2. Site-level aggregation and simple metrics
def _count_red(band):
    """Return how many values in one group's dqi_band column equal "Red"."""
    return (band == "Red").sum()


# Output column -> (source column, aggregation); one row per (study_id, site_id).
site_metrics = {
    "mean_dqi": ("dqi", "mean"),              # mean of dqi within the site
    "pct_clean": ("clean_patient", "mean"),   # mean of the clean_patient flag
    "n_subjects": ("subject_id", "nunique"),  # distinct subject_id values
    "n_red": ("dqi_band", _count_red),        # rows whose dqi_band is "Red"
}
site_groups = subject_df.groupby(["study_id", "site_id"], as_index=False)
site_df = site_groups.agg(**site_metrics)

# Persist the site-level snapshot for dashboard use.
site_df.to_parquet("data/processed/site_snapshot.parquet", index=False)




Found 23 study folders: ['Study 10_CPID_Input Files - Anonymization', 'Study 11_CPID_Input Files - Anonymization', 'Study 13_CPID_Input Files - Anonymization']...


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Subject-level columns used as model inputs.
features = [
    "n_missing_visits", "n_missing_pages", "n_open_queries",
    "n_nonconformant_pages", "n_lab_issues", "n_uncoded_terms",
    "n_open_edrr_issues", "n_sae_pending_actions",
    "pct_crfs_verified", "pct_crfs_signed", "pct_crfs_overdue"
]

# One seed shared by the split and the forest so the cell is reproducible.
RANDOM_STATE = 42

# Missing feature values are replaced with 0 before modelling.
X = subject_df[features].fillna(0)
y = subject_df["clean_patient"]

# Guard against a degenerate target: when every subject carries the same
# clean_patient value the classifier has nothing to learn and the report
# below is trivially perfect, which is easy to misread as a good model.
class_counts = y.value_counts()
if class_counts.size < 2:
    warnings.warn(
        "clean_patient has fewer than two classes "
        f"({class_counts.to_dict()}); the classification report below is not "
        "meaningful. Check how the clean_patient flag is computed."
    )

# NOTE(review): clean_patient is produced upstream by
# compute_clean_patient_flags; if it is a deterministic function of the
# features above, these scores measure rule recovery rather than
# predictive skill — confirm before relying on them.

# Hold out 20% of subjects, keeping the class proportions of y in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=RANDOM_STATE
)

clf = RandomForestClassifier(
    n_estimators=200,
    max_depth=5,
    random_state=RANDOM_STATE,
    class_weight="balanced"
)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     11614

    accuracy                           1.00     11614
   macro avg       1.00      1.00      1.00     11614
weighted avg       1.00      1.00      1.00     11614

