# Testing

In [1]:
# import libraries

import json
import math
import os

import pandas as pd

from dotenv import load_dotenv
from openai import AzureOpenAI

from rag import load_embedding_model
from applicant import Applicant
from agent import Agent

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load applicants data

data_folder = '../../../data/applicants'

# Both JSON files are decoded explicitly as UTF-8 so the parsed content does
# not depend on the platform's default text encoding.
# `applicants` and `answers` are later indexed by applicant id.
with open(f"{data_folder}/applicants.json", 'r', encoding='utf-8') as file:
    applicants = json.load(file)

with open(f"{data_folder}/answers.json", 'r', encoding='utf-8') as file:
    answers = json.load(file)

In [3]:
# regional indicators

# Each name is the stem of a CSV file read by collect_geo_indicators
# (f"{indicator}.csv" in the processed demographics folder).
indicators = [
    'mortality', 'morbidity', 'healthcare',
    'lifestyle', 'education', 'socioeconomic',
    'environment', 'infrastructure', 'security',
]

In [4]:
def collect_geo_indicators(
    municipality : str,
    demographics_folder : str = "../../../data/demographics/processed"
) -> dict[str, str]:
    '''
    Fetches the main risk indicators from a municipality

    Reads one CSV per entry of the module-level `indicators` list from
    `demographics_folder`. In each file, the first row whose 'municipality'
    column equals `municipality` is inspected and every column after the
    first is kept when its value is one of 'very low', 'low', 'high' or
    'very high'.

    Parameters:
        municipality: value looked up in each file's 'municipality' column
        demographics_folder: directory containing the `<indicator>.csv` files

    Returns:
        dict mapping column name to its level; empty when the municipality
        is not present in any file
    '''
    levels = ('very low', 'low', 'high', 'very high')
    demographics = {}
    for indicator in indicators:
        content = pd.read_csv(f"{demographics_folder}/{indicator}.csv", encoding='utf-8')
        # filter once and reuse the result for both the presence check and the row
        matches = content[content['municipality'] == municipality]
        if matches.empty:
            continue
        municipality_data = matches.iloc[0]
        # index[1:] skips the first column (assumed to be 'municipality' - TODO confirm)
        for column in municipality_data.index[1:]:
            value = municipality_data[column]
            if value in levels:
                demographics[column] = value
    return demographics

In [5]:
# load electronic health records data

# NOTE: `data_folder` is reassigned here; it pointed at the applicants folder
# in the earlier loading cell.
data_folder = '../../../data/health_records'

# patient table, filtered by its 'Id' column in get_health_records
patients = pd.read_csv(f"{data_folder}/patients.csv")

# per-patient record tables, filtered by their 'PATIENT' column in get_health_records
allergies = pd.read_csv(f"{data_folder}/allergies.csv")
careplans = pd.read_csv(f"{data_folder}/careplans.csv")
conditions = pd.read_csv(f"{data_folder}/conditions.csv")
devices = pd.read_csv(f"{data_folder}/devices.csv")
encounters = pd.read_csv(f"{data_folder}/encounters.csv")
imagings = pd.read_csv(f"{data_folder}/imagings.csv")
immunizations = pd.read_csv(f"{data_folder}/immunizations.csv")
medications = pd.read_csv(f"{data_folder}/medications.csv")
observations = pd.read_csv(f"{data_folder}/observations.csv")
procedures = pd.read_csv(f"{data_folder}/procedures.csv")

In [6]:
def recent_observations(observations : list[list]) -> list[list]:
    '''
    Keeps a single entry per observation name, sorted by that name

    The name is the first element of each entry. When a name occurs more
    than once, the entry appearing last in `observations` is the one kept,
    so the input is presumably ordered oldest to newest (verify against
    the caller).
    '''
    # later entries overwrite earlier ones sharing the same name
    latest_by_name = {row[0]: row for row in observations}
    return sorted(latest_by_name.values(), key=lambda row: row[0])

In [7]:
def clean_nans(obj : dict | list | float) -> dict | list | str:
    '''
    Replaces all NaN values with an empty string
    '''
    if isinstance(obj, dict):
        return {k: clean_nans(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [clean_nans(item) for item in obj]
    elif isinstance(obj, float) and math.isnan(obj):
        return ''
    else:
        return obj

In [8]:
def get_health_records(patient_id : str) -> dict:
    '''
    Assemble every health-record section available for one patient

    Reads the module-level tables loaded from the health records CSV files.
    The returned dict holds 'patient', one key per record section, 'records'
    (all section entries concatenated in section order) and 'observations'
    (reduced by recent_observations). NaN values are replaced with empty
    strings by clean_nans before returning.

    Raises IndexError when no row of `patients` has this Id, because the
    first element of an empty record list is requested.
    '''
    def rows_of(table):
        # rows of a record table that belong to this patient
        return table[table['PATIENT'] == patient_id]

    patient_columns = ['Id', 'BIRTHDATE', 'MARITAL', 'RACE', 'ETHNICITY', 'GENDER']
    patient_rows = patients[patients['Id'] == patient_id][patient_columns]
    health_record = {'patient': patient_rows.to_dict(orient='records')[0]}

    # sections reported as distinct descriptions
    distinct_sections = (
        ('allergies', allergies),
        ('careplans', careplans),
        ('conditions', conditions),
        ('devices', devices),
        ('encounters', encounters),
    )
    for section, table in distinct_sections:
        health_record[section] = rows_of(table)['DESCRIPTION'].unique().tolist()

    pat_imagings = rows_of(imagings)[['BODYSITE_DESCRIPTION', 'MODALITY_DESCRIPTION']]
    health_record['imagings'] = [
        f"{modality}: {bodysite}"
        for modality, bodysite in zip(pat_imagings['MODALITY_DESCRIPTION'], pat_imagings['BODYSITE_DESCRIPTION'])
    ]

    # sections kept as-is, repeated descriptions included
    repeated_sections = (
        ('immunizations', immunizations),
        ('medications', medications),
        ('procedures', procedures),
    )
    for section, table in repeated_sections:
        health_record[section] = rows_of(table)['DESCRIPTION'].tolist()

    sections = ['allergies', 'careplans', 'conditions', 'devices', 'encounters', 'imagings', 'immunizations', 'medications', 'procedures']
    health_record['records'] = [entry for section in sections for entry in health_record[section]]

    observation_rows = rows_of(observations)[['DESCRIPTION', 'VALUE', 'UNITS']]
    health_record['observations'] = recent_observations(observation_rows.values.tolist())
    return clean_nans(health_record)

In [9]:
# set up GPT client and model for embeddings

# load_dotenv() runs before the os.getenv lookups below, presumably so that
# values supplied through a .env file are visible to them.
load_dotenv()
instance = AzureOpenAI(
    azure_endpoint=os.getenv("ENDPOINT_URL"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-05-01-preview",
    # alternative version, currently disabled: api_version="2025-01-01-preview"
)
# deployment name read from the environment; os.getenv yields None when unset
model = os.getenv("DEPLOYMENT_NAME")
embeddings = load_embedding_model()

[AGENT] Embedding model loaded


In [10]:
# create agent

# The applicant id has to be bound before the agent is built: on a fresh
# kernel the bare name `id` still refers to Python's builtin id() function,
# which would otherwise be passed to Agent in place of the identifier.
# This is the same value the applicant cell assigns.
id = 'c8403116-d069-4130-aa02-39954fc971b5'
agent = Agent(id, instance, model, embeddings)

In [11]:
# create applicant

# NOTE: `id` shadows Python's builtin id() for the rest of the notebook; later
# cells use it as the key into `applicants` and `answers`.
id = 'c8403116-d069-4130-aa02-39954fc971b5'
applicant = Applicant(id)

In [12]:
# applicant inputs

# Each data source is used only when the applicant's 'access' flag for it is
# truthy; otherwise an empty placeholder is used instead.
if applicants[id]['access']['health']:
    health_records = get_health_records(id)
else:
    health_records = {}

if applicants[id]['access']['steps']:
    steps = applicants[id]['steps']
else:
    steps = None

if applicants[id]['access']['posts']:
    posts = applicants[id]['posts']
else:
    posts = []

# seed the applicant with the first five stored answers
applicant.answers = dict(list(answers[id].items())[:5])
# the stored location answer is an index into the question's options
options = agent.questions['personal']['personal_location']['options']
municipality = options[applicant.answers['personal_location']]

In [13]:
# process user info

# The calls below run in a fixed order on the same `applicant`; each
# collect_* call is followed by the matching generate_*_indices call.
# NOTE(review): the order is presumably significant - confirm in Agent.
agent.collect_basic_info(applicant)

# regional data for the municipality selected in the applicant inputs cell
agent.collect_geo_indicators(applicant, municipality)
agent.generate_geo_indices(applicant)

# social media posts (empty list when access was not granted)
agent.collect_social_media_posts(applicant, posts)
agent.generate_posts_indices(applicant)

# health records and step data (empty dict / None when access was not granted)
agent.collect_health_records(applicant, health_records, steps)
agent.generate_health_indices(applicant)

In [14]:
# start conversation
# (agent.conversation is indexed by the cells that follow)

agent.start_conversation()

In [15]:
# show the first conversation message (the factor-identification instructions in the output below)
print(agent.conversation[0]['content'])

You are a life insurance expert in mortality risk assessment.
You will be provided with user insights.
Your task is to identify all risk factors present in a negative or risky way.
Find as many factors as you can, even if they are not so obvious.
You are only allowed to select from this exact list of risk factors (delimited by triple backticks).
```
lifestyle_job_hazards
lifestyle_smoking
lifestyle_diet
lifestyle_exercise
lifestyle_alcohol
lifestyle_drugs
lifestyle_sleep
lifestyle_stress
lifestyle_healthcare_access
lifestyle_health_checkups
lifestyle_socioeconomic_status
lifestyle_air_pollution
lifestyle_unsafe_sex
lifestyle_loneliness
lifestyle_crime
lifestyle_natural_disasters
lifestyle_driving
lifestyle_dangerous_hobbies
lifestyle_housing
lifestyle_green_spaces
family_cancer
family_heart
family_stroke
family_diabetes
family_hypertension
family_kidney
family_neurological
family_epilepsy
family_longevity
family_death_cause
family_mental_illness
family_bones
family_obesity
family_chole

In [16]:
# identify factors
# (the next two cells print conversation messages 1 and 2)

agent.identify_factors(applicant)

In [None]:
# show the second conversation message (the user insights in the output below)
print(agent.conversation[1]['content'])

Basic user information:
- age: 40-59
- gender: Female
- marital: Single
- location: Gondomar
- occupation: Tradesperson

Indicators from the user's city:
- road accidents with victims: very low
- alcohol-sensitive mortality: very low
- diabetes mortality: very low
- distance to green spaces: very low
- urban green space: very low
- mortality sensitive to tobacco consumption: very low
- mortality from suicide and self-inflicted injuries: very low
- users without a family doctor: very low
- unsanitary housing: very low
- safe water: very high
- mortality from healthcare-sensitive causes: very low
- elderly population living alone: very low
- distance to primary health care: very low
- social response capacity for seniors: very low
- illiteracy rate: very low

Data from the user's health records:
- Normal pregnancy
- Creatinine: 2.7 mg/dL
- Colonoscopy
- DALY: 2.5 a
- Respiratory rate: 12.0 /min
- Diabetes self management plan
- Chloride: 104.3 mmol/L
- zoster
- Encounter for symptom
- So

In [None]:
# show the third conversation message (the list of factor names in the output below)
print(agent.conversation[2]['content'])

[
    "lifestyle_green_spaces",
    "lifestyle_loneliness",
    "health_diabetes",
    "health_cholesterol",
    "health_anemia",
    "health_kidney",
    "health_pregnancy",
    "health_infections",
    "health_weight",
    "health_mental_illness",
    "health_digestive_condition"
]


In [21]:
# factor predictions
# NOTE(review): `predictions` is not read by any later cell visible in this notebook

predictions = agent.make_factor_predictions(applicant)

In [22]:
# change conversation
# (the next cell prints the new first message of agent.conversation)

agent.change_conversation()

In [23]:
# show the first message after the change (the question-selection instructions in the output below)
print(agent.conversation[0]['content'])

You are assisting with selecting questions for a life insurance applicant.

You will receive:
- a partially completed questionnaire
- a list of remaining risk factors

Select a risk factor only if at least one of the following is true:
- the factor is impactful for assessing mortality risk
- there is a realistic chance the applicant's response may be negative or unhealthy

Then, take one of the following actions:
1. If at least one factor meets the above conditions:
   - reply with the factor name only (exactly as listed)

2. If none of the remaining factors meet the conditions:
   - reply with 'exit' only


In [24]:
# dynamic questions

# Request at most `max_questions` follow-up questions. The answer to each one
# is looked up in the stored answers and saved before the next request.
max_questions = 15
for index in range(max_questions):
    content, _ = agent.produce_next_question(applicant, index)
    if not content:
        # a falsy result ends the questioning early
        break
    factor = content['factor']
    answer = answers[id][factor]
    agent.save_answer(applicant, factor, answer)

In [25]:
# build the applicant's record (its text is printed in the next cell)
agent.build_record(applicant)

In [None]:
# show the record as text (questions with the chosen answers in the output below)
print(agent.record_text(applicant))

Do you often visit parks, nature trails, or other green spaces that encourage outdoor activities and relaxation?
- No

Do you regularly feel socially isolated to a degree that affects your well-being?
- No

Have you been diagnosed with diabetes?
- Yes, Type 2

Have you been diagnosed with high cholesterol?
- Yes

Have you been diagnosed with anemia or a blood disorder?
- Yes

Have you ever been diagnosed with kidney disease or kidney failure?
- Yes

Are you currently pregnant, or have you been pregnant within the last 12 months?
- No

Do you have a history of recurrent infections requiring medical treatment?
- Yes

How would you describe your current weight status?
- Healthy weight

Have you ever been diagnosed with a mental health condition (e.g., depression, anxiety, bipolar disorder)?
- No

Do you have any digestive conditions (e.g., GERD, ulcers, Crohn's disease, IBS)?
- No

How many cigarettes (or equivalent) do you smoke per day?
- I do not smoke

How much alcohol do you consume 

In [None]:
# display all saved answers (question key -> integer value in the output below)
applicant.answers

{'personal_age': 2,
 'personal_gender': 1,
 'personal_marital': 0,
 'personal_location': 4,
 'personal_occupation': 56,
 'lifestyle_green_spaces': 1,
 'lifestyle_loneliness': 1,
 'health_diabetes': 1,
 'health_cholesterol': 0,
 'health_anemia': 0,
 'health_kidney': 0,
 'health_pregnancy': 2,
 'health_infections': 0,
 'health_weight': 1,
 'health_mental_illness': 1,
 'health_digestive_condition': 1,
 'lifestyle_smoking': 4,
 'lifestyle_alcohol': 2,
 'lifestyle_drugs': 1,
 'lifestyle_diet': 1,
 'lifestyle_job_hazards': 5,
 'lifestyle_exercise': 0,
 'lifestyle_sleep': 2,
 'lifestyle_stress': 1,
 'family_cancer': 5,
 'family_heart': 2,
 'family_stroke': 2}
