# Importing Datasets

In [2]:
from os import read
import pandas as pd 
# Load the two raw CSV inputs into DataFrames.
# NOTE(review): both paths are absolute and specific to one machine; they must
# be adjusted before this cell can run anywhere else.
df_nhanes = pd.read_csv("/Users/arjunanand/Documents/SE_Project/ClinicConnect/AI-Model/data/Nhanes_2013_2014.csv")
df_convo = pd.read_csv("/Users/arjunanand/Documents/SE_Project/ClinicConnect/AI-Model/data/Doctor-HealthCare-100k.csv")

# Cleaning Datasets

In [2]:
from pdfminer.high_level import extract_text
import re

# Extract the text layer of the OASIS PDF as one string.
text = extract_text("/Users/arjunanand/Documents/SE_Project/ClinicConnect/AI-Model/data/oasis.pdf")

# Boilerplate spans that are deleted outright. They are applied with re.DOTALL
# so that ".*?" can run across line breaks inside the multi-line blocks.
patterns = [
    r"PRA Disclosure Statement.*?Baltimore, Maryland 21244-1850\.",
    r"Page \d+ of \d+",
    r"\*{5}CMS Disclaimer\*{5}.*?retained\.",  # Disclaimer block
    r"OASIS-E1 All Items Effective.*?Centers for Medicare & Medicaid Services",
]

for pattern in patterns:
    text = re.sub(pattern, "", text, flags=re.DOTALL)

# Remove empty lines. A run of blank lines is collapsed to ONE newline, not to
# nothing: substituting "" would also delete the line break itself and glue the
# text before and after the gap together ("a\n\nb" -> "ab").
# This runs last so it also cleans up the gaps left by the removals above.
text = re.sub(r"\n\s*\n", "\n", text)

# Save cleaned text. The encoding is explicit so that non-ASCII characters from
# the PDF are written the same way regardless of the platform's default.
with open("clean_oasis.txt", "w", encoding="utf-8") as f:
    f.write(text)

In [3]:
import pandas as pd

# Load the doctor/patient conversation pairs.
df = pd.read_csv("/Users/arjunanand/Documents/SE_Project/ClinicConnect/AI-Model/data/Doctor-HealthCare-100k.csv")

# Remove duplicates: keep the first row for every distinct "input" value.
# .copy() makes the result an independent frame, so the column assignment below
# is not a chained assignment on a possible view (SettingWithCopyWarning).
df = df.drop_duplicates(subset=["input"]).copy()

# Trim whitespace
df["output"] = df["output"].str.strip()

# Filter invalid responses: keep only rows whose output contains at least one of
# the marker phrases (the "|" is regex alternation). Rows with a missing output
# are dropped because na=False treats them as non-matching.
has_marker = df["output"].str.contains("Chat Doctor|ENT evaluation|I hope", na=False)
df = df[has_marker].copy()  # independent frame again, for the assignment below

# Add safety disclaimers
df["output"] = df["output"] + "\n\nDISCLAIMER: This is simulated advice. Consult a real doctor."

# Generating Synthetic Data
Since few public healthcare care-plan datasets are available online, I chose to generate synthetic data by creating care plan templates (following health institutions' guidelines) for a specific set of diseases, narrowing our focus in the initial phases.
The Faker library is then used to generate this synthetic data, and the generated care plans are validated by comparing them with the care plan conditions set initially according to medical guidelines.

In [None]:
# Faker is the library for generating synthetic data
from faker import Faker
import random

# Initialize Faker for synthetic data generation
# No seed is set in this cell, so generated values are expected to differ
# between runs.
fake = Faker()

def _care_plan_template(condition, monitoring, oral, injectable, lifestyle):
    """Assemble one care plan template in the nested layout used by this notebook.

    The key order ("condition", "components"; then "monitoring", "medications",
    "lifestyle"; then "oral", "injectable") is fixed here for every template.
    """
    return {
        "condition": condition,
        "components": {
            "monitoring": monitoring,
            "medications": {
                "oral": oral,
                "injectable": injectable,
            },
            "lifestyle": lifestyle,
        },
    }


# Conditions the generator can pick from; each has a template below.
diseases = ["diabetes", "hypertension", "heart_disease", "asthma", "ckd", "depression"]

# One fixed care plan per disease, keyed by the disease name.
care_plan_templates = {
    "diabetes": _care_plan_template(
        "diabetes",
        monitoring=["Daily fasting blood glucose checks", "HbA1c every 3 months"],
        oral=["Metformin 500mg bid"],
        injectable=["Insulin glargine 10 units nightly"],
        lifestyle=["30 mins aerobic exercise 5x/week", "Carbohydrate-controlled diet"],
    ),
    "hypertension": _care_plan_template(
        "hypertension",
        monitoring=["Weekly blood pressure checks"],
        oral=["Lisinopril 10mg daily"],
        injectable=[],  # hypertension typically has no injectables
        lifestyle=["Low-sodium diet", "Regular exercise"],
    ),
    # The remaining templates carry placeholder content.
    "heart_disease": _care_plan_template(
        "heart_disease",
        monitoring=["Regular ECG monitoring"],
        oral=["Aspirin 81mg daily"],
        injectable=[],
        lifestyle=["Heart-healthy diet", "Moderate exercise"],
    ),
    "asthma": _care_plan_template(
        "asthma",
        monitoring=["Peak flow monitoring"],
        oral=[],
        # NOTE(review): an inhaler is filed under "injectable" as a placeholder;
        # confirm the intended medication category.
        injectable=["Albuterol inhaler as needed"],
        lifestyle=["Avoid triggers", "Breathing exercises"],
    ),
    "ckd": _care_plan_template(
        "ckd",
        monitoring=["Quarterly kidney function tests"],
        oral=["ACE inhibitors as prescribed"],
        injectable=[],
        lifestyle=["Low-protein diet", "Hydration management"],
    ),
    "depression": _care_plan_template(
        "depression",
        monitoring=["Monthly mood assessments"],
        oral=["Sertraline 50mg daily"],
        injectable=[],
        lifestyle=["Counseling sessions", "Regular physical activity"],
    ),
}

def generate_synthetic_plan():
    """Create one synthetic patient record with a disease-specific care plan.

    Reads the module-level ``diseases``, ``care_plan_templates`` and ``fake``.

    Returns:
        A dict with the keys:
        - "patient_id": a UUID4 produced by Faker
        - "primary_condition": one entry of ``diseases``
        - "age": int in 18..90
        - "bmi": float in 18.5..45.1, rounded to one decimal
        - "comorbidities": list of up to two condition names that never
          includes the primary condition; ``["none"]`` on its own when the
          "none" marker is drawn; possibly empty
        - "care_plan": an independent deep copy of the template for the
          primary condition
    """
    import copy  # function-scope import keeps this cell self-contained

    # Randomly select a disease
    disease = random.choice(diseases)

    # A condition cannot be its own comorbidity, so the primary condition is
    # taken out of the pool before sampling.
    comorbidity_pool = [
        name
        for name in ["hypertension", "obesity", "ckd", "depression", "none"]
        if name != disease
    ]

    patient_id = fake.uuid4()
    age = random.randint(18, 90)
    bmi = round(random.uniform(18.5, 45.1), 1)
    comorbidities = random.sample(comorbidity_pool, k=random.randint(0, 2))
    # "none" is an exclusive marker: it must not sit next to a real condition.
    if "none" in comorbidities:
        comorbidities = ["none"]

    return {
        "patient_id": patient_id,
        "primary_condition": disease,
        "age": age,
        "bmi": bmi,
        "comorbidities": comorbidities,
        # Deep copy so editing one patient's plan cannot change the shared
        # template or the plans of other patients.
        "care_plan": copy.deepcopy(care_plan_templates[disease]),
    }

# Generate 1000 synthetic plans
# Each list element is the dict returned by one generate_synthetic_plan() call.
synthetic_data = [generate_synthetic_plan() for _ in range(1000)]

# Validation function
def validate_plan(plan):
    """
    Validate a synthetic care plan for consistency and realism.

    The checks run in this order and stop at the first failure:
    condition match, non-empty monitoring, at least one medication (oral or
    injectable), non-empty lifestyle, age within 18..90, BMI within 18.5..45.1.

    Args:
        plan: dict with "primary_condition", "age", "bmi" and a nested
            "care_plan" holding "condition" and "components"
            ("monitoring", "medications" -> "oral"/"injectable", "lifestyle").

    Returns:
        None when every check passes.

    Raises:
        AssertionError: for the first failed check, with a short message
            naming it. It is raised explicitly instead of through ``assert``
            statements, which are removed when Python runs with ``-O`` and
            would turn this validation into a silent no-op.
    """
    # Check that primary condition matches care plan condition
    if plan["primary_condition"] != plan["care_plan"]["condition"]:
        raise AssertionError("Condition mismatch")

    components = plan["care_plan"]["components"]

    # Ensure monitoring section is not empty
    if not components["monitoring"]:
        raise AssertionError("Monitoring is empty")

    # Ensure at least one medication (oral or injectable) is present
    medications = components["medications"]
    if not medications["oral"] and not medications["injectable"]:
        raise AssertionError("No medications")

    # Ensure lifestyle recommendations are not empty
    if not components["lifestyle"]:
        raise AssertionError("Lifestyle is empty")

    # Check age and BMI ranges
    if not 18 <= plan["age"] <= 90:
        raise AssertionError("Age out of range")
    if not 18.5 <= plan["bmi"] <= 45.1:
        raise AssertionError("BMI out of range")

# Validate all generated plans. A plan that fails is reported and left out of
# the saved file, so the CSV only ever contains plans that passed validation.
valid_plans = []
for plan in synthetic_data:
    try:
        validate_plan(plan)
    except AssertionError as e:
        print(f"Invalid plan for patient {plan['patient_id']}: {e}")
    else:
        valid_plans.append(plan)

# Save to CSV
# NOTE(review): the "comorbidities" lists and "care_plan" dicts are written as
# their Python string form; confirm that downstream readers expect that format.
df = pd.DataFrame(valid_plans)
df.to_csv("care_plans.csv", index=False)
print("Synthetic care plans generated and saved to 'care_plans.csv'")