# Generation

In [None]:
# import libraries

import json
import math
import os
import time

import pandas as pd

from dotenv import load_dotenv
from openai import AzureOpenAI

from rag import load_embedding_model
from applicant import Applicant
from agent import Agent

In [2]:
# load applicants data

data_folder = '../../../data/applicants'

# both files are decoded explicitly as UTF-8 so that loading does not depend on
# the platform's default text encoding
with open(f"{data_folder}/applicants.json", 'r', encoding='utf-8') as file:
    applicants = json.load(file)

with open(f"{data_folder}/answers.json", 'r', encoding='utf-8') as file:
    answers = json.load(file)

In [3]:
# regional indicators
# (collect_geo_indicators reads one "<name>.csv" file per entry of this list)

indicators = [
    "mortality", "morbidity", "healthcare",
    "lifestyle", "education", "socioeconomic",
    "environment", "infrastructure", "security",
]

In [4]:
def collect_geo_indicators(municipality : str) -> list[str]:
    '''
    Fetches the main risk indicators from a municipality
    '''
    demographics = {}
    for indicator in indicators:
        content = pd.read_csv(f"../../../data/demographics/processed/{indicator}.csv", encoding='utf-8')
        if municipality in content['municipality'].values:
            municipality_data = content[content['municipality'] == municipality].iloc[0]
            for column in municipality_data.index[1:]:
                value = municipality_data[column]
                if value in ['very low', 'low', 'high', 'very high']:
                    demographics[column] = value
    return demographics

In [5]:
# load electronic health records data

data_folder = '../../../data/health_records'


def _read_table(name):
    # every table is stored as "<name>.csv" inside the health records folder
    return pd.read_csv(f"{data_folder}/{name}.csv")


patients = _read_table('patients')

allergies = _read_table('allergies')
careplans = _read_table('careplans')
conditions = _read_table('conditions')
devices = _read_table('devices')
encounters = _read_table('encounters')
imagings = _read_table('imagings')
immunizations = _read_table('immunizations')
medications = _read_table('medications')
observations = _read_table('observations')
procedures = _read_table('procedures')

In [6]:
def recent_observations(observations : list[list]) -> list[list]:
    '''
    Keeps a single observation per key, where the key is the first element of each entry

    When a key appears several times, the entry that comes last in the input wins
    (so the input is expected to be ordered from oldest to newest).
    The surviving entries are returned sorted by their key.
    '''
    last_seen = {row[0]: row for row in observations}
    return sorted(last_seen.values(), key=lambda row: row[0])

In [7]:
def clean_nans(obj : dict | list | float) -> dict | list | str:
    '''
    Replaces all NaN values with an empty string
    '''
    if isinstance(obj, dict):
        return {k: clean_nans(v) for k, v in obj.items()}
    elif isinstance(obj, list):
        return [clean_nans(item) for item in obj]
    elif isinstance(obj, float) and math.isnan(obj):
        return ''
    else:
        return obj

In [8]:
def get_health_records(patient_id : str) -> dict:
    '''
    Builds one dictionary with everything the health record tables hold for a patient

    The result contains the patient's demographic row ('patient'), one list of
    descriptions per record table, a combined 'records' list that concatenates
    those lists, and the 'observations' selected by recent_observations.
    NaN values are replaced by empty strings through clean_nans.

    Raises IndexError when no row of the patients table has the given Id.
    '''
    def described(table, distinct):
        # DESCRIPTION values of the rows that belong to this patient
        column = table[table['PATIENT'] == patient_id]['DESCRIPTION']
        return column.unique().tolist() if distinct else column.tolist()

    demographic_fields = ['Id', 'BIRTHDATE', 'MARITAL', 'RACE', 'ETHNICITY', 'GENDER']
    patient_rows = patients[patients['Id'] == patient_id][demographic_fields]
    health_record = {'patient': patient_rows.to_dict(orient='records')[0]}

    # repeated descriptions are collapsed for these tables...
    for name, table in (
        ('allergies', allergies),
        ('careplans', careplans),
        ('conditions', conditions),
        ('devices', devices),
        ('encounters', encounters),
    ):
        health_record[name] = described(table, distinct=True)

    scans = imagings[imagings['PATIENT'] == patient_id]
    health_record['imagings'] = [
        f"{modality}: {bodysite}"
        for modality, bodysite in zip(scans['MODALITY_DESCRIPTION'], scans['BODYSITE_DESCRIPTION'])
    ]

    # ...while these keep one entry per row, repetitions included
    for name, table in (
        ('immunizations', immunizations),
        ('medications', medications),
        ('procedures', procedures),
    ):
        health_record[name] = described(table, distinct=False)

    sections = ['allergies', 'careplans', 'conditions', 'devices', 'encounters', 'imagings', 'immunizations', 'medications', 'procedures']
    health_record['records'] = [entry for section in sections for entry in health_record[section]]

    patient_observations = observations[observations['PATIENT'] == patient_id]
    raw_observations = patient_observations[['DESCRIPTION', 'VALUE', 'UNITS']].values.tolist()
    health_record['observations'] = recent_observations(raw_observations)
    return clean_nans(health_record)

In [None]:
# set up GPT client and model for embeddings

# the endpoint, key and deployment name are all taken from environment variables
load_dotenv()
instance = AzureOpenAI(
    api_version="2025-01-01-preview",
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    azure_endpoint=os.getenv("ENDPOINT_URL"),
)
model = os.getenv("DEPLOYMENT_NAME")
embeddings = load_embedding_model()

In [11]:
# create agent
# NOTE(review): nothing earlier in the visible file assigns a variable named `id`
# (the loops that bind `id` come after this statement), so on a fresh top-to-bottom
# run the first argument is Python's builtin `id` function — confirm what Agent
# expects as its first parameter

agent = Agent(id, instance, model, embeddings)

In [12]:
# generate indices for user data
# builds one Applicant per entry of `applicants`, lets the agent collect and index
# its geographic, health and social media data, and stores it in `users` by id

users = {}

for id, applicant in applicants.items():
    user = Applicant(id)

    # seed the applicant with the first five stored answers
    # (presumably the 'personal' section of the questionnaire — TODO confirm)
    user.answers = {key: answers[id][key] for key in list(answers[id])[:5]}

    agent.collect_basic_info(user)

    # the municipality comes straight from the applicant record; the former lookup
    # through the 'personal_location' question options produced a value that was
    # never used, so it was dropped
    agent.collect_geo_indicators(user, applicant['municipality'])
    agent.generate_geo_indices(user)

    # each data source is only handed to the agent when the applicant granted access to it
    health_records = get_health_records(id) if applicant['access']['health'] else {}
    posts = applicant['posts'] if applicant['access']['posts'] else []
    steps = applicant['steps'] if applicant['access']['steps'] else None

    agent.collect_health_records(user, health_records, steps)
    agent.generate_health_indices(user)

    agent.collect_social_media_posts(user, posts)
    agent.generate_posts_indices(user)

    users[id] = user

In [13]:
# questionnaire sections

categories = ["personal", "lifestyle", "family", "health"]

In [14]:
# variable to store statistics

stats = {'questionnaires': {}}

# every category except the first one gets an entry per factor holding
# a counter, one tally slot per answer option and a running average time
for category in categories[1:]:
    stats[category] = {
        factor: {
            'count': 0,
            'answers': [0] * len(content['options']),
            'avg_time': 0.0
        }
        for factor, content in agent.questions[category].items()
    }

In [15]:
# variable to store questionnaire records
# filled by the generation loop: applicant id -> list of questionnaire entries

dataset: dict[str, list] = {}

In [None]:
# generate dynamic questionnaires filled with answers
# for every user: the agent first predicts a set of factors, then produces up to
# 15 questions that are answered from the stored `answers`; timing and answer
# statistics are accumulated in `stats`, the questionnaire itself in `dataset`

for id, user in users.items():
    questionnaire = []
    quest_stats = {
        'pred_factors': 0,
        'pred_time': 0,
        'count': 0,
        'avg_time': 0.0,
        'errors': 0
    }

    agent.start_conversation()
    agent.identify_factors(user)
    quest_stats['pred_factors'] = len(agent.chosen_factors)

    # durations are taken with the monotonic perf_counter clock, which (unlike
    # time.time) is not affected by system clock adjustments during the run
    start_time = time.perf_counter()
    predictions = agent.make_factor_predictions(user)
    elapsed_time = time.perf_counter() - start_time
    questionnaire += predictions
    quest_stats['pred_time'] = elapsed_time

    agent.change_conversation()
    for index in range(15):
        start_time = time.perf_counter()
        content, errors = agent.produce_next_question(user, index)
        elapsed_time = time.perf_counter() - start_time

        # a falsy result ends this user's questionnaire
        if not content:
            break

        factor = content['factor']
        answer = answers[id][factor]
        agent.save_answer(user, factor, answer)
        content['answer'] = answer
        questionnaire.append(content)

        # the category is the part of the factor name before the first underscore
        category = factor.split('_', 1)[0]
        factor_stats = stats[category][factor]
        # `answer` is used as the position of the chosen option in the tally
        factor_stats['answers'][answer] += 1
        # incremental mean: fold the new duration into the average of the previous ones
        factor_count = factor_stats['count']
        factor_stats['avg_time'] = (factor_stats['avg_time'] * factor_count + elapsed_time) / (factor_count + 1)
        factor_stats['count'] += 1

        quest_stats['avg_time'] = (quest_stats['avg_time'] * quest_stats['count'] + elapsed_time) / (quest_stats['count'] + 1)
        quest_stats['errors'] += errors
        quest_stats['count'] += 1

    dataset[id] = questionnaire
    stats['questionnaires'][id] = quest_stats
    total_quests = len(dataset)
    print(f"{total_quests} questionnaires generated")

In [17]:
# save all dynamic questionnaires and respective statistics

data_folder = '../../../data/questionnaires'

# make sure both output folders exist, so that a missing directory cannot make
# the save fail after the whole generation run has finished
for subfolder in ('records', 'stats'):
    os.makedirs(f"{data_folder}/{subfolder}", exist_ok=True)

with open(f"{data_folder}/records/dynamic_2.json", 'w', encoding='utf-8') as file:
    json.dump(dataset, file, indent=4, ensure_ascii=False)

with open(f"{data_folder}/stats/dynamic_2.json", 'w', encoding='utf-8') as file:
    json.dump(stats, file, indent=4, ensure_ascii=False)