In [2]:
import json
import os
from pprint import pprint
from datetime import datetime
from dateutil import parser

In [3]:
# methods for parsing information

def getInformation(entries):
    basicInfo, condInfo, visitsInfo, docInfo, procInfo, orgInfo = {}, {}, {}, {}, {}, {}

    for entry in entries:
        lookupKey = entry["fullUrl"]
        if entry["resource"]["resourceType"] == "Patient":
            basicInfo = getBasicInformation(entry)
        if entry["resource"]["resourceType"] == "Condition":
            conditionName, visitID, date = getConditionsInformation(entry)
            condInfo[lookupKey] = [date, conditionName, visitID]
        if entry["resource"]["resourceType"] == "Encounter":
            notes, practitionerID, startEnd, serviceProviderID = getVisitsInformation(entry)
            visitsInfo[lookupKey] = {"notes":notes, "practitionerID":practitionerID, 
                                     "start_and_end_time":startEnd, "serviceProviderID":serviceProviderID}
        if entry["resource"]["resourceType"] == "Practitioner":
            docInfo[lookupKey] = {"name": getPractitionerInformation(entry)}
        if entry["resource"]["resourceType"] == "Procedure":
            start, end, status, description, visitID = getProceduresInformation(entry)
            procInfo[lookupKey] = [start, end, status, description, visitID]
        if entry["resource"]["resourceType"] == "Organization":
            name, phone, address, postalCode, typeOrg = getOrganizationInformation(entry)
            orgInfo[lookupKey] = {"name":name, "phone":phone, "address":address, "postalCode":postalCode, "type_of_organization":typeOrg}
    
    # idx + 1 for itm
    sortedItems = sorted(condInfo.items(), key=lambda item: item[1], reverse=True)
    sortedConditions = {key: value for key, value in sortedItems}

    sortedProcItems = sorted(procInfo.items(), key=lambda item: item[1])
    sortedProcedures = {key: value for key, value in sortedProcItems}

    return basicInfo, sortedConditions, visitsInfo, docInfo, sortedProcedures, orgInfo

def calculateAge(birthdate_str):
    birthdate = datetime.strptime(birthdate_str, "%Y-%m-%d")
    
    today = datetime.today()
    
    age = today.year - birthdate.year
    
    if (today.month, today.day) < (birthdate.month, birthdate.day):
        age -= 1

    return age

def getOrganizationInformation(entry):
    name, phone, address, postalCode, typeOrg = "", "", "", "", ""

    try:
        name = entry["resource"]["name"]
        if not "telecom" in entry["resource"]:
            phone = ""
        else:
            phone = entry["resource"]["telecom"][0]["value"]
        addyInfo = entry["resource"]["address"][0]
        address = " ".join([addyInfo["line"][0], addyInfo["city"], addyInfo["state"]])
        postalCode = addyInfo["postalCode"]
        typeOrg = entry["resource"]["type"][0]["text"]
    except Exception as e:
        print(e)
        
    return name, phone, address, postalCode, typeOrg

def getBasicInformation(entry):
    parsedData = {}

    parsedData["system_id"] = entry["fullUrl"]

    try:
        firstName = "".join([char for char in entry["resource"]["name"][0]["given"][0] if not char.isdigit()])
        lastName = "".join([char for char in entry["resource"]["name"][0]["family"] if not char.isdigit()])
        gender = entry["resource"]["gender"]
        languageSpoken = entry["resource"]["communication"][0]["language"]["coding"][0]["display"]
        if not "telecom" in entry["resource"]:
            phoneNumber = ""
        else:
            phoneNumber = entry["resource"]["telecom"][0]["value"]
        age = calculateAge(entry["resource"]["birthDate"])
        addyInfo = entry["resource"]["address"][0]
        address = " ".join([addyInfo["line"][0], addyInfo["city"], addyInfo["state"]])

        parsedData["firstName"], parsedData["lastName"], parsedData["gender"], parsedData["languageSpoken"], parsedData["phoneNumber"], parsedData["age"], parsedData["address"] = firstName, lastName, gender, languageSpoken, phoneNumber, str(age), address

        return parsedData
    except Exception as e:
        print(e)
        return {}

def getConditionsInformation(entry):
    conditionName, visitID, date = "", "", ""

    try:
        conditionName = entry["resource"]["code"]["text"]
        visitID = entry["resource"]["encounter"]["reference"]
        date = entry["resource"]["onsetDateTime"]
        date = parser.isoparse(date)

    except Exception as e:
        print(e)
        
    return conditionName, visitID, date
    
def getVisitsInformation(entry):
    notes, practitionerID, start, end, serviceProviderID = "", "", "", "", ""

    try:
        notes = entry["resource"]["type"][0]["text"]
        practitionerID = entry["resource"]["participant"][0]["individual"]["reference"]
        start, end = entry["resource"]["period"]["start"], entry["resource"]["period"]["end"]
        serviceProviderID = entry["resource"]["serviceProvider"]["reference"]

    except Exception as e:
        print(e)
    
    return notes, practitionerID, (start, end), serviceProviderID

def getPractitionerInformation(entry):
    try:
        firstName = "".join([char for char in entry["resource"]["name"][0]["given"][0] if not char.isdigit()])
        lastName = "".join([char for char in entry["resource"]["name"][0]["family"] if not char.isdigit()])
        prefix = entry["resource"]["name"][0]["prefix"][0]

        fullName = " ".join([prefix, firstName, lastName])

        return fullName
    except Exception as e:
        print(e)
        return ""
    

def getProceduresInformation(entry):
    start, end, status, description, visitID = "", "", "", "", ""

    try:
        status = entry["resource"]["status"]
        description = entry["resource"]["code"]["text"]
        visitID = entry["resource"]["encounter"]["reference"]
        start, end = entry["resource"]["performedPeriod"]["start"], entry["resource"]["performedPeriod"]["end"]

    except Exception as e:
        print(e)
    
    return start, end, status, description, visitID





In [4]:
from ehrSystem import patient, practitioner, serviceProvider
from tqdm import tqdm
import time

file_path = "./syntheticmedicare10k/"
patient_files = os.listdir(file_path)
resource_types = set()

patients, practitioners, serviceProviders = {}, {}, {}

# keep track of which conditions each doctor diagnosed and which procedures they were a part of, as a list
# do the same for hospital
for pfile in tqdm(patient_files, desc="Processing"):
    with open(file_path + pfile, 'r') as file:
        # Load the JSON data into a dictionary
        data = json.load(file)
        parsedData = {}
        entries = data["entry"]
        basicInformation, conditionsLookupTable, visitsLookup, practitionerLookup, proceduresLookup, serviceProviderLookup = getInformation(entries)

        currPatient = patient(basicInfo=basicInformation)
        patients[basicInformation["system_id"]] = currPatient

        for visit in visitsLookup:
            currInfo = visitsLookup[visit]

            visitPractitioners = practitionerLookup[currInfo["practitionerID"]]
            visitServiceProviders = serviceProviderLookup[currInfo["serviceProviderID"]]

            if not currInfo["practitionerID"] in practitioners:
                currPractitioner = practitioner(basicInfo=visitPractitioners)
                practitioners[currInfo["practitionerID"]] = currPractitioner
            
            if not currInfo["serviceProviderID"] in serviceProviders:
                currServiceProvider = serviceProvider(basicInfo=visitServiceProviders)
                serviceProviders[currInfo["serviceProviderID"]] = currServiceProvider
            
            practitioners[currInfo["practitionerID"]].patients.add(patients[basicInformation["system_id"]])
            serviceProviders[currInfo["serviceProviderID"]].patients.add(patients[basicInformation["system_id"]])

            patients[basicInformation["system_id"]].practitioners.add(practitioners[currInfo["practitionerID"]])
            patients[basicInformation["system_id"]].serviceProviders.add(serviceProviders[currInfo["serviceProviderID"]])

            practitioners[currInfo["practitionerID"]].serviceProviders.add(serviceProviders[currInfo["serviceProviderID"]])
            serviceProviders[currInfo["serviceProviderID"]].practitioners.add(practitioners[currInfo["practitionerID"]])
            
            currInfo["practitioners"] = practitioners[currInfo["practitionerID"]]
            currInfo["serviceProviders"] = serviceProviders[currInfo["serviceProviderID"]]

            visitsLookup[visit] = currInfo
            patients[basicInformation["system_id"]].visits.append(currInfo)
    
        for key in conditionsLookupTable:
            date, name, visitID = conditionsLookupTable[key]
            dateStr = str(date)

            condInfo = {"name":name, "date":date, "dateStr":dateStr}

            currVisit = visitsLookup[visitID]
            visitPractitioner = practitioners[currVisit["practitionerID"]]
            visitServiceProvider = serviceProviders[currVisit["serviceProviderID"]]

            visitPractitioner.conditions.append(condInfo)
            visitServiceProvider.conditions.append(condInfo)
            patients[basicInformation["system_id"]].conditions.append(condInfo)

        for key in proceduresLookup:
            start, end, status, description, visitID = proceduresLookup[key]

            procInfo = {"start":start, "end":end, "status":status, "description":description}

            currVisit = visitsLookup[visitID]
            visitPractitioner = practitioners[currVisit["practitionerID"]]
            visitServiceProvider = serviceProviders[currVisit["serviceProviderID"]]

            visitPractitioner.procedures.append(procInfo)
            visitServiceProvider.procedures.append(procInfo)
            patients[basicInformation["system_id"]].procedures.append(procInfo)
        
        time.sleep(0.1)

Processing:  41%|████      | 4080/10000 [10:27<15:34,  6.33it/s]

'value'


Processing:  90%|████████▉ | 8958/10000 [24:11<02:18,  7.51it/s]

'value'


Processing: 100%|██████████| 10000/10000 [27:07<00:00,  6.15it/s]


In [5]:
patientsKeys = set(patients.keys())
practitionersKeys = set(practitioners.keys())
serviceProvidersKeys = set(serviceProviders.keys())

print("Num Patients:", len(patientsKeys))
print("Num Practitioners:", len(practitionersKeys))
print("Num Service Providers:", len(serviceProvidersKeys))

Num Patients: 10000
Num Practitioners: 14191
Num Service Providers: 14183


In [6]:
patientDct = {patients[p].my_id:{"basicInfo":patients[p].basicInfo, "embeddingString":patients[p].generateEmbeddingString(), "userID":patients[p].my_id} for p in patients}
practitionerDct = {practitioners[p].my_id:{"basicInfo":practitioners[p].basicInfo, "embeddingString":practitioners[p].generateEmbeddingString(), "userID":practitioners[p].my_id} for p in practitioners}
serviceProviderDct = {serviceProviders[p].my_id:{"basicInfo":serviceProviders[p].basicInfo, "embeddingString":serviceProviders[p].generateEmbeddingString(), "userID":serviceProviders[p].my_id} for p in serviceProviders}

with open("parsedEhrData/patientsInfo.json", "w") as json_file:
    json.dump(patientDct, json_file, indent=4)

with open("parsedEhrData/practitionerInfo.json", "w") as json_file:
    json.dump(practitionerDct, json_file, indent=4)

with open("parsedEhrData/serviceProviderInfo.json", "w") as json_file:
    json.dump(serviceProviderDct, json_file, indent=4)