# Importing Datasets

In [1]:
from os import read
import pandas as pd 
# Locations of the two raw CSV inputs.
# NOTE(review): these are absolute paths under one user's home directory, so the
# notebook only runs as-is on that machine — consider a configurable data dir.
nhanes_csv_path = "/Users/arjunanand/Documents/SE_Project/ClinicConnect/AI-Model/data/Nhanes_2013_2014.csv"
convo_csv_path = "/Users/arjunanand/Documents/SE_Project/ClinicConnect/AI-Model/data/Doctor-HealthCare-100k.csv"

# Load each CSV into its own DataFrame (NHANES first, then the conversation set).
df_nhanes = pd.read_csv(nhanes_csv_path)
df_convo = pd.read_csv(convo_csv_path)

# Cleaning Datasets

In [2]:
from pdfminer.high_level import extract_text
import re

# Pull the raw text out of the OASIS PDF.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
text = extract_text("/Users/arjunanand/Documents/SE_Project/ClinicConnect/AI-Model/data/oasis.pdf")

# Remove common patterns (boilerplate that is deleted outright)
patterns = [
    r"PRA Disclosure Statement.*?Baltimore, Maryland 21244-1850\.",
    r"Page \d+ of \d+",
    r"\*{5}CMS Disclaimer\*{5}.*?retained\.",  # Disclaimer block
    r"OASIS-E1 All Items Effective.*?Centers for Medicare & Medicaid Services",
]

for pattern in patterns:
    # DOTALL lets the lazy ".*?" spans run across line breaks inside a block.
    text = re.sub(pattern, "", text, flags=re.DOTALL)

# Empty lines: collapse each run of blank/whitespace-only lines to a single
# newline. Replacing the match with "" (as before) also removed the line break
# itself and glued the neighbouring lines together ("a\n\nb" -> "ab").
# This runs last so blank lines left behind by the removals above are cleaned too.
text = re.sub(r"\n\s*\n", "\n", text)

# Save cleaned text (explicit encoding so the result does not depend on the
# platform default when the PDF contains non-ASCII characters)
with open("clean_oasis.txt", "w", encoding="utf-8") as f:
    f.write(text)

In [2]:
import pandas as pd

# Text appended to every retained response.
disclaimer = "\n\nDISCLAIMER: This is simulated advice. Consult a real doctor."

# Build the cleaned frame in a single chain. Each step returns a new DataFrame,
# so no column is ever assigned on a filtered slice (the pattern that triggers
# pandas' SettingWithCopyWarning). The original index labels are kept.
# NOTE(review): absolute, machine-specific path — consider a configurable data dir.
df = (
    pd.read_csv("/Users/arjunanand/Documents/SE_Project/ClinicConnect/AI-Model/data/Doctor-HealthCare-100k.csv")
    # Remove duplicates: keep the first row for each distinct "input"
    .drop_duplicates(subset=["input"])
    # Trim whitespace around the response text
    .assign(output=lambda frame: frame["output"].str.strip())
    # Filter invalid responses: keep only rows whose output matches one of these
    # phrases (regex alternation); rows with a missing output are dropped (na=False)
    .loc[lambda frame: frame["output"].str.contains("Chat Doctor|ENT evaluation|I hope", na=False)]
    # Add safety disclaimers
    .assign(output=lambda frame: frame["output"] + disclaimer)
)

# Generating Synthetic Data
Since few public datasets of healthcare care plans are available online, I chose to generate synthetic data. I first wrote care plan templates (following health institutions' guidelines) for a small set of specific diseases, to narrow our focus in the initial phases.
The Faker library is then used to generate the synthetic care plans, and each generated plan is validated by comparing it with the care plan conditions set initially according to medical guidelines.

In [4]:
import pandas as pd
from faker import Faker
import random
import re

# Shared Faker instance; generate_synthetic_plan uses it for fake.uuid4() patient ids.
fake = Faker()

# Enhanced disease list with subtypes
# Maps each disease key to:
#   - its subtype options, stored under exactly one of "type", "stage" or "severity"
#     (generate_synthetic_plan checks the keys in that order), and
#   - "comorbidities": the pool that generate_synthetic_plan samples from.
# The top-level keys must match the keys of care_plan_templates.
diseases = {
    "diabetes": {
        "type": ["Type 2", "Type 1", "Gestational"],
        # NOTE(review): "ckd" is not in this pool, yet generate_synthetic_plan has a
        # diabetes + "ckd" adjustment — confirm whether it should be listed here.
        "comorbidities": ["hypertension", "obesity", "neuropathy", "retinopathy"]
    },
    "hypertension": {
        "stage": ["Stage 1", "Stage 2", "Resistant"],
        "comorbidities": ["ckd", "heart_failure", "diabetes"]
    },
    "heart_disease": {
        "type": ["CAD", "HFrEF", "Arrhythmia"],
        "comorbidities": ["diabetes", "hypertension", "hyperlipidemia"]
    },
    "asthma": {
        "severity": ["Intermittent", "Persistent Mild", "Persistent Moderate", "Persistent Severe"],
        "comorbidities": ["GERD", "allergic_rhinitis", "eczema"]
    },
    # Chronic Kidney disease
    "ckd": {
        "stage": ["Stage 1", "Stage 2", "Stage 3a", "Stage 3b", "Stage 4", "Stage 5"],
        "comorbidities": ["hypertension", "anemia", "mineral_bone_disorder"]
    },
    "depression": {
        "type": ["MDD", "Persistent", "Seasonal"],
        "comorbidities": ["anxiety", "PTSD", "substance_use"]
    }
}

# Care plan templates, keyed by the same disease names as `diseases`.
# Each entry has:
#   "monitoring":  list of monitoring items (copied whole into every generated plan)
#   "medications": {"oral": [...], "injectable": [...]} pools that
#                  generate_synthetic_plan samples from ("injectable" may be empty)
#   "lifestyle":   pool of lifestyle items that is sampled from
# NOTE(review): the drug names, doses and intervals below are free-text strings and
# are not checked anywhere in this notebook — confirm against current guidelines
# before treating them as clinically accurate.
care_plan_templates = {
    "diabetes": {
        "monitoring": [
            "Daily fasting and postprandial glucose checks",
            "HbA1c every 3 months if >6.5%",
            "Annual comprehensive foot exam",
            "Annual lipid profile",
            "Annual urinary albumin-to-creatinine ratio"
        ],
        "medications": {
            "oral": [
                "Metformin 500-2000mg daily (unless eGFR <30)",
                "SGLT2 inhibitor (empagliflozin 10-25mg daily)",
                "DPP-4 inhibitor (sitagliptin 100mg daily)"
            ],
            "injectable": [
                "GLP-1 RA (semaglutide 0.5-1mg weekly) for ASCVD risk",
                "Basal insulin (glargine 0.2-0.5 units/kg)"
            ]
        },
        "lifestyle": [
            "ADA diet: 45-60g carbs per meal, 15-20g snacks",
            "150 mins/week moderate exercise (spread over ≥3 days)",
            "Continuous glucose monitoring for Type 1 DM",
            "Annual dilated eye exam",
            "Foot care education"
        ]
    },
    "hypertension": {
        "monitoring": [
            "Home BP monitoring 2x daily (AM/PM)",
            "Basic metabolic panel at 1 month then q6-12mo",
            "Urinalysis annually",
            "24h ambulatory BP monitoring for resistant HTN"
        ],
        "medications": {
            "oral": [
                "ACE inhibitor (lisinopril 5-40mg daily)",
                "Thiazide diuretic (chlorthalidone 12.5-25mg daily)",
                "CCB (amlodipine 5-10mg daily)",
                "ARB (losartan 50-100mg daily) if ACE-I intolerance"
            ],
            "injectable": []
        },
        "lifestyle": [
            "DASH diet: <1500mg Na+/day, 8-10 servings fruits/vegetables",
            "Aerobic exercise 30 mins 5x/week",
            "Alcohol restriction (<1 drink/day women, <2 men)",
            "Weight loss goal 5-10% body weight"
        ]
    },
    "heart_disease": {
        "monitoring": [
            "Echocardiogram annually for EF <40%",
            "Stress test q1-2 years",
            "Lipid panel q6 months",
            "NT-proBNP every 3-6 months in HF"
        ],
        "medications": {
            "oral": [
                "Beta-blocker (carvedilol 3.125-25mg bid)",
                "ARNI (sacubitril/valsartan 49/51mg bid)",
                "MRA (spironolactone 25-50mg daily)",
                "Antiplatelet (aspirin 81mg daily)"
            ],
            "injectable": [
                "SC anticoagulation (enoxaparin 1mg/kg bid) for ACS"
            ]
        },
        "lifestyle": [
            "Cardiac rehab program: 36 sessions over 12 weeks",
            "Mediterranean diet rich in omega-3s",
            "Smoking cessation counseling",
            "Fluid restriction <2L/day in HF"
        ]
    },
    "asthma": {
        "monitoring": [
            "Peak flow monitoring AM/PM during exacerbations",
            "ACT (Asthma Control Test) monthly",
            "FeNO q3-6 months for biologic therapy",
            "PFTs annually"
        ],
        "medications": {
            "oral": [
                "Leukotriene modifier (montelukast 10mg nightly)",
                "Theophylline 200-400mg bid (level 5-15 mcg/mL)"
            ],
            # NOTE(review): the ICS-formoterol entry is dosed in "puffs" (an inhaler)
            # but is filed under "injectable"; the schema has no inhaled category.
            # It is the only entry containing "ICS" — confirm the intended placement.
            "injectable": [
                "ICS-formoterol (Symbicort 160/4.5 2 puffs bid)",
                "Biologic (omalizumab 150-375mg SC q4wk)"
            ]
        },
        "lifestyle": [
            "Trigger avoidance: HEPA filter, dust mite covers",
            "Inhaler technique training q6 months",
            "Action plan for PEF <80% personal best",
            "Annual influenza vaccination"
        ]
    },
    "ckd": {
        "monitoring": [
            "eGFR q3-6 months based on stage",
            "Urine ACR q6 months",
            "CBC q3 months for anemia",
            "Bone mineral panel (Ca, Phos, PTH) q3 months"
        ],
        "medications": {
            "oral": [
                "ACEI/ARB (ramipril 2.5-10mg daily)",
                "SGLT2 inhibitor (dapagliflozin 10mg daily if eGFR ≥25)",
                "Phosphate binder (sevelamer 800mg TAC)"
            ],
            "injectable": [
                "ESA (darbepoetin 0.45mcg/kg weekly)",
                "Iron sucrose 100mg IV q2wk if TSAT <20%"
            ]
        },
        "lifestyle": [
            "Renal diet: Protein 0.6-0.8g/kg/day",
            "Fluid restriction 1-1.5L/day in volume overload",
            "AV fistula evaluation for Stage 4 CKD",
            "Potassium restriction <2g/day"
        ]
    },
    "depression": {
        "monitoring": [
            "PHQ-9 every 4-6 weeks",
            "Suicide risk assessment at each visit",
            "Thyroid function tests annually",
            "Medication adherence checks"
        ],
        "medications": {
            "oral": [
                "SSRI (escitalopram 10-20mg daily)",
                "SNRI (venlafaxine XR 75-225mg daily)",
                "Atypical (bupropion XL 150-450mg daily)"
            ],
            "injectable": []
        },
        "lifestyle": [
            "CBT program: 16 sessions over 20 weeks",
            "Sleep hygiene: Fixed bed/wake times",
            "Behavioral activation scheduling",
            "Mindfulness-based stress reduction"
        ]
    }
}

def generate_synthetic_plan():
    """Build one randomly generated care plan for a randomly chosen disease.

    Reads the module-level ``diseases`` and ``care_plan_templates`` tables and
    the shared ``fake`` Faker instance. All sampling uses the global ``random``
    state, except the patient id, which comes from ``fake.uuid4()``.

    Returns:
        dict with keys:
            "patient_id": UUID string from Faker.
            "primary_condition": {"name": disease key, "subtype": str}.
            "demographics": {"age": int 18-90, "sex": "M"/"F", "bmi": float}.
            "comorbidities": list of 0-2 strings; ``["none"]`` or ``[]`` both
                mean no comorbidity, and "none" never appears next to a real one.
            "care_plan": {"monitoring": list, "medications": {"oral": list,
                "injectable": list}, "lifestyle": list}.
    """
    disease = random.choice(list(diseases.keys()))
    details = diseases[disease]

    # Choose the subtype from whichever of these keys the disease defines,
    # checked in this priority order; "N/A" when it defines none of them.
    subtype = "N/A"
    for subtype_key in ("type", "stage", "severity"):
        if subtype_key in details:
            subtype = random.choice(details[subtype_key])
            break

    # Sample 0-2 comorbidities from the disease's pool plus a "none" sentinel.
    comorbidity_list = details["comorbidities"] + ["none"]
    comorbidity_k = random.randint(0, min(2, len(comorbidity_list)))
    comorbidities = random.sample(comorbidity_list, k=comorbidity_k)
    # "none" drawn together with a real comorbidity is contradictory; keep the
    # real one. A lone ["none"] is left untouched.
    if len(comorbidities) > 1:
        comorbidities = [item for item in comorbidities if item != "none"]

    # Sample sizes are clamped to the pool sizes so a short (or empty) template
    # list can never make randint's lower bound exceed its upper bound.
    template = care_plan_templates[disease]
    oral_meds = template["medications"]["oral"]
    oral_sample_size = random.randint(min(1, len(oral_meds)), min(3, len(oral_meds)))
    injectable_meds = template["medications"]["injectable"]
    injectable_sample_size = random.randint(0, min(1, len(injectable_meds))) if injectable_meds else 0
    lifestyle_items = template["lifestyle"]
    lifestyle_sample_size = random.randint(min(2, len(lifestyle_items)), min(4, len(lifestyle_items)))

    plan = {
        "patient_id": fake.uuid4(),
        "primary_condition": {
            "name": disease,
            "subtype": subtype
        },
        "demographics": {
            "age": random.randint(18, 90),
            "sex": random.choice(["M", "F"]),
            "bmi": round(random.uniform(18.5, 45.1), 1)
        },
        "comorbidities": comorbidities,
        "care_plan": {
            # Copy so that editing one plan's monitoring list cannot change the
            # shared template (and, through it, every other plan).
            "monitoring": list(template["monitoring"]),
            "medications": {
                "oral": random.sample(oral_meds, k=oral_sample_size),
                "injectable": random.sample(injectable_meds, k=injectable_sample_size) if injectable_meds else []
            },
            "lifestyle": random.sample(lifestyle_items, k=lifestyle_sample_size)
        }
    }

    medications = plan["care_plan"]["medications"]

    # Adding condition-specific adjustments for diabetes, ckd, and hypertension.
    # NOTE(review): with the `diseases` table as written, "ckd" is not among the
    # diabetes comorbidities, so this branch cannot currently fire — confirm
    # whether "ckd" should be added there.
    if disease == "diabetes" and "ckd" in plan["comorbidities"]:
        # Drop Metformin and make sure an SGLT2 inhibitor is present instead,
        # without adding a second one if the sample already contains one.
        medications["oral"] = [med for med in medications["oral"] if "Metformin" not in med]
        if not any("SGLT2" in med for med in medications["oral"]):
            medications["oral"].append("SGLT2 inhibitor (dapagliflozin 10mg daily)")

    if disease == "hypertension" and plan["demographics"]["age"] > 65:
        # Add a thiazide for older patients, unless one (e.g. chlorthalidone from
        # the template) was already sampled — avoids two thiazides in one plan.
        if not any("thiazide" in med.lower() for med in medications["oral"]):
            medications["oral"].append("Thiazide diuretic (hydrochlorothiazide 12.5mg daily)")

    return plan

# Function for validating the generated care plans
def validate_plan(plan):
    """Check a generated care plan against a few safety and completeness rules.

    Args:
        plan: dict shaped like the output of generate_synthetic_plan. It must
            contain plan["primary_condition"]["name"],
            plan["care_plan"]["monitoring"] and
            plan["care_plan"]["medications"]["oral" / "injectable"].
            plan["comorbidities"] is optional.

    Returns:
        list of error strings, in check order; an empty list means the plan
        passed every check.
    """
    errors = []

    condition = plan["primary_condition"]["name"]
    oral_meds = plan["care_plan"]["medications"]["oral"]
    injectable_meds = plan["care_plan"]["medications"]["injectable"]
    all_meds = list(oral_meds) + list(injectable_meds)

    # Medication safety checks
    # CKD counts whether it is the primary condition or a listed comorbidity;
    # checking only the primary condition missed e.g. a diabetic with CKD.
    has_ckd = condition == "ckd" or "ckd" in plan.get("comorbidities", [])
    if has_ckd and any("Metformin" in med for med in oral_meds):
        errors.append("Metformin in CKD")

    if condition == "asthma":
        # Look for an ICS entry in any medication list. It is matched as a whole
        # word so names that merely contain "ics" (e.g. "Biologics") don't count.
        if not any(re.search(r"\bICS\b", med, re.I) for med in all_meds):
            errors.append("Missing controller therapy in asthma")

    # Monitoring frequency validation
    required_monitoring = {
        "diabetes": ["HbA1c", "foot exam"],
        "hypertension": ["BP monitoring"],
        "heart_disease": ["echocardiogram"],
        "asthma": ["peak flow"],
        "ckd": ["eGFR"],
        "depression": ["PHQ-9"]
    }

    for key in required_monitoring.get(condition, []):
        # Keys are literal phrases, so escape them before the case-insensitive search.
        if not any(re.search(re.escape(key), m, re.I) for m in plan["care_plan"]["monitoring"]):
            errors.append(f"Missing {key} monitoring")

    # Medication count validation
    if len(all_meds) == 0:
        errors.append("No medications prescribed")

    return errors

# Generate and validate data
SEED = 42       # fixed seed so a fresh "Restart & Run All" reproduces the same plans
N_PLANS = 1000  # number of candidate plans to generate

random.seed(SEED)  # drives the disease / medication / demographic sampling
Faker.seed(SEED)   # drives fake.uuid4(), i.e. the patient ids

synthetic_data = []      # plans that passed validation
validation_errors = []   # one record per rejected plan

for _ in range(N_PLANS):
    plan = generate_synthetic_plan()
    errors = validate_plan(plan)
    if errors:
        validation_errors.append({
            "patient_id": plan["patient_id"],
            "errors": errors
        })
    else:
        synthetic_data.append(plan)

# Save outputs to CSV files for both care plans and validation errors.
# NOTE(review): nested dict/list values are written as their Python repr strings.
pd.DataFrame(synthetic_data).to_csv("clinical_care_plans.csv", index=False)
# Explicit columns keep the header row even when no plan was rejected.
pd.DataFrame(validation_errors, columns=["patient_id", "errors"]).to_csv("validation_errors.csv", index=False)

print(f"Generated {len(synthetic_data)} valid care plans")
print(f"Found {len(validation_errors)} invalid entries")


Generated 878 valid care plans
Found 122 invalid entries
