In [None]:
# Mount Google Drive at /content/drive so later cells can read files stored there.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import json
import glob
from tqdm import tqdm
import time
import os
import gc

In [None]:
import os
import shutil
import zipfile

from google.colab import drive

# Step 1: mount Google Drive.
drive.mount('/content/drive')

# Step 2: copy the ZIP archives from Google Drive into a local ./zips folder.
drive_folder = '/content/drive/MyDrive/vm'  # <-- change if your folder name is different
zip_files = ['aspirus.zip', 'bcmh.zip', 'mgh.zip', 'portage.zip']

os.makedirs('zips', exist_ok=True)
for archive_name in zip_files:
    shutil.copy(f'{drive_folder}/{archive_name}', f'zips/{archive_name}')

# Step 3: unpack every archive into ./fhir, the folder the rest of the notebook reads from.
extract_path = './fhir'
os.makedirs(extract_path, exist_ok=True)
for archive_name in zip_files:
    with zipfile.ZipFile(f'zips/{archive_name}', 'r') as archive:
        archive.extractall(extract_path)

print("✅ All JSONs are ready in ./fhir")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ All JSONs are ready in ./fhir


In [None]:
# Platform flag; its value is left unchanged in case other cells read it.
OPERATING_SYS = 'Win'
# OPERATING_SYS = 'Linux'

# Path separator used when building file paths in later cells.
# The former `if OPERATING_SYS != 'Win'` branch assigned this same '/' value,
# so it had no effect and has been removed.
delim = '/'

In [None]:
# Root folder that is searched recursively for the FHIR JSON bundles.
# NOTE(review): the archives are extracted to './fhir'; this absolute path
# presumably relies on the working directory being /content — confirm.
# Windows-style alternative kept for reference:
#input_root_folder_path = '.\\fhir\'
input_root_folder_path = '/content/fhir/'

In [None]:
# Destination folder for the exported CSV files.
output_folder_path = '/content/output/'

# Create the folder only when it is missing, and report that it was created.
output_dir_missing = not os.path.exists(output_folder_path)
if output_dir_missing:
    os.makedirs(output_folder_path)
    print("The new Output directory is created!")
    print("The new Output directory is created!")

In [None]:
# Recursively collect every *.json file below the input root folder.
json_glob_pattern = input_root_folder_path + delim + '**' + delim + '*.json'
files = glob.glob(json_glob_pattern, recursive=True)

print('---Found '+str(len(files))+' Json Files---')

---Found 1785 Json Files---


In [None]:
def filter_resource(data, resource_type):
    """Return the bundle entries whose resource has the requested type.

    Args:
        data: Parsed bundle; ``data['entry']`` is iterated and each entry is
            read as ``entry['resource']['resourceType']``.
        resource_type: Resource type name to keep; surrounding whitespace is
            stripped before comparing.

    Returns:
        A list of the matching entries, in their original order.
    """
    return [
        entry
        for entry in data['entry']
        if entry['resource']['resourceType'] == resource_type.strip()
    ]

## Patient

In [None]:
# Build one row per bundle from its first Patient resource.
cols = ['id','gender','birthDate','maritalStatus','city','state','postalCode','country','deceased','deceasedDateTime']
arr = []
failed_files = []  # (path, error) for every bundle that was skipped
start = time.time()
f_count = 0

for filess in tqdm(files):
    try:
        # `with` closes the handle even when json.load raises.
        with open(filess) as f:
            data = json.load(f)

        # Only the first Patient entry of the bundle is used.
        patient = filter_resource(data, 'Patient')[0]
        resource = patient['resource']
        address = resource['address'][0]

        ar = [
            resource['id'],
            resource['gender'],
            resource['birthDate'],
            resource['maritalStatus']['text'],
            address['city'],
            address['state'],
            address['postalCode'],
            address['country'],
        ]

        # 'deceased' is derived from the presence of deceasedDateTime.
        if 'deceasedDateTime' in resource:
            ar.extend([True, resource['deceasedDateTime']])
        else:
            ar.extend([False, np.nan])

        arr.append(ar)

    except Exception as e:
        # Best effort: skip the bundle, but keep the path and error so the
        # failures can be inspected afterwards via `failed_files`.
        f_count += 1
        failed_files.append((filess, repr(e)))
        continue

end = time.time()

print(str(f_count)+' Files Failed...')
print(str(len(arr))+' Patient bundles extracted as DataFrame in '+str(end-start)+ 'Seconds')

df_patient = pd.DataFrame(arr, columns = cols)

100%|██████████| 1785/1785 [03:45<00:00,  7.90it/s]

8 Files Failed...
1777 Patient bundles extracted as DataFrame in 225.8826620578766Seconds





In [None]:
# Number of patient rows per city.
df_patient['city'].value_counts()

Unnamed: 0_level_0,count
city,Unnamed: 1_level_1
Marquette,1452
Baraga,245
Houghton,57
Calumet,23


In [None]:
# Count of each 'deceased' value divided by the total number of patient rows.
df_patient['deceased'].value_counts()/len(df_patient)

Unnamed: 0_level_0,count
deceased,Unnamed: 1_level_1
False,0.830051
True,0.169949


In [None]:
# Keep one row per patient id and renumber the index from zero.
df_patient = df_patient.drop_duplicates(subset='id', ignore_index=True)

In [None]:
# Preview the first rows of the patient table.
df_patient.head()

  cast_date_col = pd.to_datetime(column, errors="coerce")


Unnamed: 0,id,gender,birthDate,maritalStatus,city,state,postalCode,country,deceased,deceasedDateTime
0,897ac583-e0bb-d807-3a91-04da0fa8b0c8,male,1963-03-08,Married,Marquette,MI,49855,US,False,
1,e7569220-c5b7-3b9e-c125-8ebf36ce83c1,female,1941-03-17,Divorced,Marquette,MI,49855,US,True,1995-08-08T23:53:03-04:00
2,f69c49ac-ae51-561e-8fdb-5ccc32339474,female,2012-10-03,Never Married,Marquette,MI,49855,US,False,
3,23413039-8f86-5480-23e0-dc1019ec6d55,male,2022-06-04,Never Married,Marquette,MI,49855,US,False,
4,de8b0c82-23d6-c1d2-6517-1fb4ce1bd2ec,female,1982-05-10,Married,Marquette,MI,49855,US,False,


In [None]:
# Write the patient table to Patient.csv in the output folder.
patient_csv_path = output_folder_path + delim + 'Patient.csv'
df_patient.to_csv(patient_csv_path)

# Drop the frame and run a garbage-collection pass to free memory.
del df_patient
gc.collect()

410

In [None]:
# Import the Colab helper under an alias. A plain `from google.colab import files`
# rebinds the notebook-level `files` list of JSON paths, which the extraction
# loops in later cells iterate over.
from google.colab import files as colab_files
colab_files.download('/content/output/Patient.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Conditions

In [None]:
# Build one row per Condition resource found in each bundle.
cols = ['code','codeText','patientId','encounterId','onsetDateTime','recordedDate','clinicalStatusCode']

arr = []
failed_files = []  # (path, error) for every bundle that raised
start = time.time()
f_count = 0

for filess in tqdm(files):
    try:
        # `with` closes the handle even when json.load raises.
        with open(filess) as f:
            data = json.load(f)

        conditions = filter_resource(data, 'Condition')
        for cond in conditions:
            resource = cond['resource']
            ar = [
                resource['code']['coding'][0]['code'],
                resource['code']['text'],
                # References look like 'urn:uuid:<id>'; keep only the id part.
                resource['subject']['reference'].strip().split('urn:uuid:')[1],
                resource['encounter']['reference'].strip().split('urn:uuid:')[1],
                resource['onsetDateTime'],
                resource['recordedDate'],
                resource['clinicalStatus']['coding'][0]['code'],
            ]

            arr.append(ar)
    except Exception as e:
        # Best effort: stop processing this bundle (rows appended before the
        # error are kept) and record the path and error in `failed_files`.
        f_count += 1
        failed_files.append((filess, repr(e)))
        continue

end = time.time()
print(str(f_count)+' Files Failed...')
print(str(len(arr))+' Patient condition bundles extracted as DataFrame in '+str(end-start)+ 'Seconds')

df_condition = pd.DataFrame(arr, columns = cols)

100%|██████████| 1785/1785 [04:21<00:00,  6.83it/s]

0 Files Failed...
82995 Patient condition bundles extracted as DataFrame in 261.1928975582123Seconds





In [None]:
# Preview the first rows of the raw condition table.
df_condition.head()

Unnamed: 0,code,codeText,patientId,encounterId,onsetDateTime,recordedDate,clinicalStatusCode
0,473461003,Educated to high school level (finding),897ac583-e0bb-d807-3a91-04da0fa8b0c8,7773818f-f83d-a87c-9020-786c660e7aae,1981-05-01T23:44:35-04:00,1981-05-01T23:44:35-04:00,active
1,714628002,Prediabetes (finding),897ac583-e0bb-d807-3a91-04da0fa8b0c8,69bffef6-dbcd-b7b7-6d45-565f1405cabb,1982-05-07T23:07:19-04:00,1982-05-07T23:07:19-04:00,active
2,271737000,Anemia (disorder),897ac583-e0bb-d807-3a91-04da0fa8b0c8,69bffef6-dbcd-b7b7-6d45-565f1405cabb,1982-05-07T23:07:19-04:00,1982-05-07T23:07:19-04:00,active
3,124171000119105,Chronic intractable migraine without aura (dis...,897ac583-e0bb-d807-3a91-04da0fa8b0c8,86e93012-15ab-8d4c-25e6-509c3215df4e,1990-02-04T22:07:19-05:00,1990-02-04T22:07:19-05:00,active
4,82423001,Chronic pain (finding),897ac583-e0bb-d807-3a91-04da0fa8b0c8,a806224d-d46b-414e-3ffc-822bd9985422,1991-03-25T22:07:19-05:00,1991-03-25T22:07:19-05:00,active


In [None]:
# Parse both timestamp columns into timezone-aware UTC datetimes.
for date_col in ('onsetDateTime', 'recordedDate'):
    df_condition[date_col] = pd.to_datetime(df_condition[date_col], format="%Y-%m-%dT%H:%M:%S%z", utc=True)

In [None]:
# Derive resolvedDateTime from the Conditions DataFrame: one output row per
# (patientId, encounterId, onsetDateTime, code, codeText) combination.

cols = ['patientId','code','encounterId','onsetDateTime','resolvedDateTime','codeText']
arr = []
outer_keys = ['patientId','encounterId','onsetDateTime']
for (patient_id, encounter_id, onset), per_encounter in tqdm(df_condition.groupby(outer_keys)):
    # Group again by condition code within the encounter.
    for (code, code_text), per_code in per_encounter.groupby(['code','codeText']):
        # Prefer the rows whose clinicalStatusCode is "resolved"; when there
        # are none, fall back to all rows of this condition.
        resolved_rows = per_code[per_code['clinicalStatusCode'] == "resolved"]
        date_source = resolved_rows if len(resolved_rows) > 0 else per_code

        arr.append([
            patient_id,
            code,
            encounter_id,
            onset,
            date_source['recordedDate'].max(),  # latest recordedDate of the chosen rows
            code_text,
        ])

df_condition_new = pd.DataFrame(arr, columns = cols)

100%|██████████| 68939/68939 [04:15<00:00, 270.31it/s]


In [None]:
# Make sure both timestamp columns are timezone-aware UTC datetimes, then preview.
for date_col in ('onsetDateTime', 'resolvedDateTime'):
    df_condition_new[date_col] = pd.to_datetime(df_condition_new[date_col],
                                                format="%Y-%m-%dT%H:%M:%S%z", utc=True)
df_condition_new.head()

Unnamed: 0,patientId,code,encounterId,onsetDateTime,resolvedDateTime,codeText
0,003766de-aeea-a288-4211-67a5d217d8e5,18718003,183ddc4a-431c-f9e1-26f0-3ab186112068,2019-02-11 09:37:06+00:00,2019-02-11 09:37:06+00:00,Gingival disease (disorder)
1,003766de-aeea-a288-4211-67a5d217d8e5,314529007,222bacd4-bcb9-ae95-7024-9cb28e538078,2020-02-03 06:27:11+00:00,2020-02-03 06:27:11+00:00,Medication review due (situation)
2,003766de-aeea-a288-4211-67a5d217d8e5,444814009,2342595d-a2e8-47e4-6b59-bd3a241c8d8d,2024-03-18 20:27:11+00:00,2024-03-18 20:27:11+00:00,Viral sinusitis (disorder)
3,003766de-aeea-a288-4211-67a5d217d8e5,314529007,28c8b0b3-1333-55b7-2a1a-78b643d2a0b4,2019-02-04 06:27:11+00:00,2019-02-04 06:27:11+00:00,Medication review due (situation)
4,003766de-aeea-a288-4211-67a5d217d8e5,195662009,3bfcc655-da74-80f7-59b5-4f580b72c916,2022-03-17 20:27:11+00:00,2022-03-17 20:27:11+00:00,Acute viral pharyngitis (disorder)


In [None]:
# Show the condition rows whose code equals "840539006".
df_condition_new.query('code == "840539006"')

Unnamed: 0,patientId,code,encounterId,onsetDateTime,resolvedDateTime,codeText
1493,0398d83d-cbf6-6b80-6755-b4dfeb642ef1,840539006,e62dd93a-d6a5-7457-4836-3463205757f1,2020-11-19 01:30:56+00:00,2020-11-19 01:30:56+00:00,Disease caused by severe acute respiratory syn...
2211,05ed6f9b-ed5c-0f5b-e6e1-195f2095438d,840539006,5d7b8e96-ae76-6501-bc4f-a259937ac891,2021-07-31 20:54:26+00:00,2021-07-31 20:54:26+00:00,Disease caused by severe acute respiratory syn...
3520,08d1c854-d31a-2c43-a4d7-7e374fa02f6f,840539006,4e28d76e-1dbd-a837-c28b-18696cb69596,2020-11-21 05:20:57+00:00,2020-11-21 05:20:57+00:00,Disease caused by severe acute respiratory syn...
3679,09647056-4cc0-e56b-683a-40dd4a8550fb,840539006,8201db62-9294-6710-9dfb-a9a6da5ca3b1,2020-12-11 20:52:40+00:00,2020-12-11 20:52:40+00:00,Disease caused by severe acute respiratory syn...
3917,0a1b91f2-531a-4120-d118-0867bfcd35f4,840539006,09f870ca-c71d-8c46-4180-33005a67ca33,2021-03-11 19:02:46+00:00,2021-03-11 19:02:46+00:00,Disease caused by severe acute respiratory syn...
...,...,...,...,...,...,...
80678,f74a9152-00eb-864f-5ff2-9df19bf9381f,840539006,415459d4-9a94-ecd4-4c32-7c243100a6c7,2020-12-11 10:31:17+00:00,2020-12-11 10:31:17+00:00,Disease caused by severe acute respiratory syn...
81639,fb16e30b-1aad-9d1f-45da-8716177b993f,840539006,aa29d959-72f9-15e2-f14b-d2cdf7b7f999,2021-01-08 17:22:43+00:00,2021-01-08 17:22:43+00:00,Disease caused by severe acute respiratory syn...
81744,fbbeac64-84f6-2935-0b22-3aa0492b3d68,840539006,10cc8026-3b0c-bcc0-7f87-83f5d657b4b3,2020-05-16 11:23:49+00:00,2020-05-16 11:23:49+00:00,Disease caused by severe acute respiratory syn...
82238,fcca75fd-5bf9-21fb-c906-4394b29d5c8e,840539006,bcc480ac-6905-4942-42dc-577da4bdb1ec,2020-12-18 10:36:19+00:00,2020-12-18 10:36:19+00:00,Disease caused by severe acute respiratory syn...


840539006 is the SNOMED CT code for COVID-19.

In [None]:
# Write Condition.csv to the output folder. This export was commented out,
# which left the following download of /content/output/Condition.csv without
# a file to fetch on a fresh run.
df_condition_new.to_csv(output_folder_path+delim+'Condition.csv')
# Drop the frame and run a garbage-collection pass to free memory.
del df_condition_new
gc.collect()

0

In [None]:
# Import the Colab helper under an alias. A plain `from google.colab import files`
# rebinds the notebook-level `files` list of JSON paths, which the extraction
# loops in later cells iterate over.
from google.colab import files as colab_files
colab_files.download('/content/output/Condition.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Encounters

In [None]:
# Build one row per Encounter resource found in each bundle.
cols = ['id','status','code','codeText','start','end','patientId','location','serviceProvider','encounterClass']

arr = []
failed_files = []  # (path, error) for every bundle that raised
start = time.time()
f_count = 0

for filess in tqdm(files):
    try:
        # `with` closes the handle even when json.load raises.
        with open(filess) as f:
            data = json.load(f)

        encounters = filter_resource(data, 'Encounter')
        for encounter in encounters:
            resource = encounter['resource']
            encounter_type = resource['type'][0]
            ar = [
                resource['id'],
                resource['status'],
                encounter_type['coding'][0]['code'],
                encounter_type['text'],
                resource['period']['start'],
                resource['period']['end'],
                # References look like 'urn:uuid:<id>'; keep only the id part.
                resource['subject']['reference'].strip().split('urn:uuid:')[1],
                resource['location'][0]['location']['display'],
                resource['serviceProvider']['display'],
                resource['class']['code'],
            ]

            arr.append(ar)
    except Exception as e:
        # Best effort: stop processing this bundle (rows appended before the
        # error are kept) and record the path and error in `failed_files`.
        f_count += 1
        failed_files.append((filess, repr(e)))
        continue

end = time.time()
print(str(f_count)+' Files Failed...')
print(str(len(arr))+' Patient encounter bundles extracted as DataFrame in '+str(end-start)+ 'Seconds')

df_encounter = pd.DataFrame(arr, columns = cols)

100%|██████████| 1785/1785 [04:45<00:00,  6.25it/s]


0 Files Failed...
143880 Patient encounter bundles extracted as DataFrame in 285.75977182388306Seconds


In [None]:
# Preview the first rows of the encounter table.
df_encounter.head()

Unnamed: 0,id,status,code,codeText,start,end,patientId,location,serviceProvider,encounterClass
0,7773818f-f83d-a87c-9020-786c660e7aae,finished,162673000,General examination of patient (procedure),1981-05-01T23:07:19-04:00,1981-05-01T23:44:35-04:00,897ac583-e0bb-d807-3a91-04da0fa8b0c8,"MARQUETTE INTERNAL MEDICINE ASSOCIATES, P.C.","MARQUETTE INTERNAL MEDICINE ASSOCIATES, P.C.",AMB
1,69bffef6-dbcd-b7b7-6d45-565f1405cabb,finished,162673000,General examination of patient (procedure),1982-05-07T23:07:19-04:00,1982-05-08T00:07:14-04:00,897ac583-e0bb-d807-3a91-04da0fa8b0c8,"MARQUETTE INTERNAL MEDICINE ASSOCIATES, P.C.","MARQUETTE INTERNAL MEDICINE ASSOCIATES, P.C.",AMB
2,86e93012-15ab-8d4c-25e6-509c3215df4e,finished,182813001,Emergency treatment (procedure),1990-02-04T22:07:19-05:00,1990-02-04T23:07:19-05:00,897ac583-e0bb-d807-3a91-04da0fa8b0c8,UPPER PENINSULA HOME HEALTH & HOSPICE,UPPER PENINSULA HOME HEALTH & HOSPICE,EMER
3,a806224d-d46b-414e-3ffc-822bd9985422,finished,183452005,Emergency hospital admission (procedure),1991-03-25T22:07:19-05:00,1991-03-25T23:07:19-05:00,897ac583-e0bb-d807-3a91-04da0fa8b0c8,UPPER PENINSULA HOME HEALTH & HOSPICE,UPPER PENINSULA HOME HEALTH & HOSPICE,EMER
4,71b3743b-7dee-7ecb-252b-56b1b0c7626b,finished,162673000,General examination of patient (procedure),1991-05-17T23:07:19-04:00,1991-05-18T00:01:10-04:00,897ac583-e0bb-d807-3a91-04da0fa8b0c8,"MARQUETTE INTERNAL MEDICINE ASSOCIATES, P.C.","MARQUETTE INTERNAL MEDICINE ASSOCIATES, P.C.",AMB


In [None]:
# Number of encounter rows per encounter class code.
df_encounter['encounterClass'].value_counts()

Unnamed: 0_level_0,count
encounterClass,Unnamed: 1_level_1
AMB,134880
EMER,5342
IMP,2347
HH,1125
VR,186


In [None]:
# Write Encounter.csv to the output folder. This export was commented out,
# which left the following download of /content/output/Encounter.csv without
# a file to fetch on a fresh run.
df_encounter.to_csv(output_folder_path+delim+'Encounter.csv')
# Drop the frame and run a garbage-collection pass to free memory.
del df_encounter
gc.collect()

48

In [None]:
# Import the Colab helper under an alias. A plain `from google.colab import files`
# rebinds the notebook-level `files` list of JSON paths, which the extraction
# loops in later cells iterate over.
from google.colab import files as colab_files
colab_files.download('/content/output/Encounter.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Observations

In [None]:
# Build one row per Observation, or one row per component when the
# Observation carries a 'component' list.
cols = ['id','patientId','issuedDate','effectiveDateTime','category','encounter','code','codeText','value','units','snomedCode','observationType']


def _observation_value_fields(node):
    """Return [value, units, snomedCode, observationType] for an Observation or component.

    Checks valueCodeableConcept, then valueQuantity, then falls back to
    valueString (raising KeyError when none of them is present).
    NOTE(review): the original top-level branch tested valueString before
    valueQuantity; the result only differs for a node carrying both, which
    FHIR's single value[x] rule should rule out — confirm against the data.
    """
    if 'valueCodeableConcept' in node:
        coding = node['valueCodeableConcept']['coding'][0]
        return [coding['display'], np.nan, coding['code'], 'text']
    if 'valueQuantity' in node:
        quantity = node['valueQuantity']
        return [quantity['value'], quantity['unit'], np.nan, 'numeric']
    return [node['valueString'], np.nan, np.nan, 'text']


arr = []
failed_files = []  # (path, error) for every bundle that raised
start = time.time()
f_count = 0

for filess in tqdm(files):
    try:
        # `with` closes the handle even when json.load raises.
        with open(filess) as f:
            data = json.load(f)

        observations = filter_resource(data, 'Observation')
        for observation in observations:
            resource = observation['resource']

            # Components each become a row; otherwise the observation itself
            # is the single node that supplies code and value.
            if 'component' in resource.keys():
                nodes = resource['component']
            else:
                nodes = [resource]

            for node in nodes:
                node_coding = node['code']['coding'][0]
                ar = [
                    resource['id'],
                    # References look like 'urn:uuid:<id>'; keep only the id part.
                    resource['subject']['reference'].strip().split('urn:uuid:')[1],
                    resource['issued'],
                    resource['effectiveDateTime'],
                    resource['category'][0]['coding'][0]['code'],
                    resource['encounter']['reference'].strip().split('urn:uuid:')[1],
                    node_coding['code'],
                    node_coding['display'],
                ]
                ar.extend(_observation_value_fields(node))

                arr.append(ar)
    except Exception as e:
        # Best effort: stop processing this bundle (rows appended before the
        # error are kept) and record the path and error in `failed_files`.
        f_count += 1
        failed_files.append((filess, repr(e)))
        continue

end = time.time()
print(str(f_count)+' Files Failed...')
print(str(len(arr))+' Patient observation bundles extracted as DataFrame in '+str(end-start)+ 'Seconds')

df_observation = pd.DataFrame(arr, columns = cols)

100%|██████████| 1785/1785 [05:51<00:00,  5.08it/s]


0 Files Failed...
1927976 Patient observation bundles extracted as DataFrame in 351.53541469573975Seconds


In [None]:
# Display the observation table.
df_observation

Unnamed: 0,id,patientId,issuedDate,effectiveDateTime,category,encounter,code,codeText,value,units,snomedCode,observationType
0,3c8ae378-f736-773e-4faf-00407ef48303,897ac583-e0bb-d807-3a91-04da0fa8b0c8,2015-10-02T23:07:19.355-04:00,2015-10-02T23:07:19-04:00,laboratory,1d4b3443-33fe-8c93-2ec4-76e07d16593d,4548-4,Hemoglobin A1c/Hemoglobin.total in Blood,6.89,%,,numeric
1,58a06e53-f9e1-4e6a-27b5-ab2d753d2707,897ac583-e0bb-d807-3a91-04da0fa8b0c8,2015-10-02T23:07:19.355-04:00,2015-10-02T23:07:19-04:00,vital-signs,1d4b3443-33fe-8c93-2ec4-76e07d16593d,8302-2,Body Height,166.8,cm,,numeric
2,4e8413b9-a0f4-d074-6684-549b7ce2e6c0,897ac583-e0bb-d807-3a91-04da0fa8b0c8,2015-10-02T23:07:19.355-04:00,2015-10-02T23:07:19-04:00,vital-signs,1d4b3443-33fe-8c93-2ec4-76e07d16593d,72514-3,Pain severity - 0-10 verbal numeric rating [Sc...,3,{score},,numeric
3,5f1fa90c-79fa-0a9b-09f2-14b2109d8d1f,897ac583-e0bb-d807-3a91-04da0fa8b0c8,2015-10-02T23:07:19.355-04:00,2015-10-02T23:07:19-04:00,vital-signs,1d4b3443-33fe-8c93-2ec4-76e07d16593d,29463-7,Body Weight,76.7,kg,,numeric
4,bca9794c-abe9-f353-fb2e-67aa5274b5ee,897ac583-e0bb-d807-3a91-04da0fa8b0c8,2015-10-02T23:07:19.355-04:00,2015-10-02T23:07:19-04:00,vital-signs,1d4b3443-33fe-8c93-2ec4-76e07d16593d,39156-5,Body mass index (BMI) [Ratio],27.57,kg/m2,,numeric
...,...,...,...,...,...,...,...,...,...,...,...,...
1927971,3018c094-3193-06c7-a548-050479bcf954,800ff6a4-f0da-affd-c69f-1d616ec69170,2024-04-11T18:23:30.686-04:00,2024-04-11T18:23:30-04:00,survey,3a741ee8-995a-921e-1ff1-29d93b04555d,93034-7,Have you been discharged from the armed forces...,No,,LA32-8,text
1927972,3018c094-3193-06c7-a548-050479bcf954,800ff6a4-f0da-affd-c69f-1d616ec69170,2024-04-11T18:23:30.686-04:00,2024-04-11T18:23:30-04:00,survey,3a741ee8-995a-921e-1ff1-29d93b04555d,93035-4,"At any point in the past 2 years, has season o...",No,,LA32-8,text
1927973,3018c094-3193-06c7-a548-050479bcf954,800ff6a4-f0da-affd-c69f-1d616ec69170,2024-04-11T18:23:30.686-04:00,2024-04-11T18:23:30-04:00,survey,3a741ee8-995a-921e-1ff1-29d93b04555d,32624-9,Race,White,,LA4457-3,text
1927974,3018c094-3193-06c7-a548-050479bcf954,800ff6a4-f0da-affd-c69f-1d616ec69170,2024-04-11T18:23:30.686-04:00,2024-04-11T18:23:30-04:00,survey,3a741ee8-995a-921e-1ff1-29d93b04555d,56051-6,Do you consider yourself Hispanic/Latino?,No,,LA32-8,text


In [None]:
# Write Observation.csv to the output folder (this export was commented out,
# leaving the following download without a file on a fresh run), then free
# memory. Every step is guarded so that re-running the cell after the names
# are gone no longer raises NameError.
if 'df_observation' in globals():
    df_observation.to_csv(output_folder_path+delim+'Observation.csv')
    del df_observation
for _stale_name in ('ar', 'arr'):
    globals().pop(_stale_name, None)
gc.collect()

NameError: name 'df_observation' is not defined

In [None]:
# Import the Colab helper under an alias. A plain `from google.colab import files`
# rebinds the notebook-level `files` list of JSON paths, which the extraction
# loops in later cells iterate over.
from google.colab import files as colab_files
colab_files.download('/content/output/Observation.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Care Plan

In [None]:
# Build one row per CarePlan activity; a CarePlan without an 'activity' key
# yields a single row whose four activity columns are NaN.
cols = ['id','status','patientId','start','end','category','code','codeText','intent','encounter','careTeam','activityCode','activityCodeText','activityStatus','activityLocation']

arr = []
failed_files = []  # (path, error) for every bundle that raised
start = time.time()
f_count = 0

for filess in tqdm(files):
    try:
        # `with` closes the handle even when json.load raises.
        with open(filess) as f:
            data = json.load(f)

        cps = filter_resource(data, 'CarePlan')
        for cp in cps:
            resource = cp['resource']
            has_activities = 'activity' in resource.keys()
            # A one-element placeholder list makes the loop below run once
            # for a plan without activities.
            activities = resource['activity'] if has_activities else [None]

            for activity in activities:
                period = resource['period']
                ar = [
                    resource['id'],
                    resource['status'],
                    # References look like 'urn:uuid:<id>'; keep only the id part.
                    resource['subject']['reference'].strip().split('urn:uuid:')[1],
                    period['start'],
                    period['end'] if 'end' in period.keys() else np.nan,
                    # 'category' comes from the first category entry; 'code'
                    # and 'codeText' come from the second one.
                    resource['category'][0]['coding'][0]['code'],
                    resource['category'][1]['coding'][0]['code'],
                    resource['category'][1]['coding'][0]['display'],
                    resource['intent'],
                    resource['encounter']['reference'].strip().split('urn:uuid:')[1],
                    resource['careTeam'][0]['reference'].strip().split('urn:uuid:')[1],
                ]

                if has_activities:
                    detail = activity['detail']
                    ar.extend([
                        detail['code']['coding'][0]['code'],
                        detail['code']['coding'][0]['display'],
                        detail['status'],
                        detail['location']['display'],
                    ])
                else:
                    ar.extend([np.nan, np.nan, np.nan, np.nan])

                arr.append(ar)
    except Exception as e:
        # Best effort: stop processing this bundle (rows appended before the
        # error are kept) and record the path and error in `failed_files`.
        f_count += 1
        failed_files.append((filess, repr(e)))
        continue

end = time.time()
print(str(f_count)+' Files Failed...')
print(str(len(arr))+' Patient CarePlan bundles extracted as DataFrame in '+str(end-start)+ 'Seconds')

df_cp = pd.DataFrame(arr, columns = cols)

100%|██████████| 1785/1785 [05:00<00:00,  5.94it/s]

0 Files Failed...
16494 Patient CarePlan bundles extracted as DataFrame in 300.35617303848267Seconds





In [None]:
# Display the care plan table.
df_cp

  cast_date_col = pd.to_datetime(column, errors="coerce")


Unnamed: 0,id,status,patientId,start,end,category,code,codeText,intent,encounter,careTeam,activityCode,activityCodeText,activityStatus,activityLocation
0,5cb621c4-c9e7-bb70-704a-6ffe3d9ca12b,active,897ac583-e0bb-d807-3a91-04da0fa8b0c8,1982-05-07T23:07:19-04:00,,assess-plan,735985000,Diabetes self management plan (record artifact),order,69bffef6-dbcd-b7b7-6d45-565f1405cabb,cb7362e3-599b-0ae5-1c01-c06904c4b3e2,160670007,Diabetic diet (finding),in-progress,"MARQUETTE INTERNAL MEDICINE ASSOCIATES, P.C."
1,5cb621c4-c9e7-bb70-704a-6ffe3d9ca12b,active,897ac583-e0bb-d807-3a91-04da0fa8b0c8,1982-05-07T23:07:19-04:00,,assess-plan,735985000,Diabetes self management plan (record artifact),order,69bffef6-dbcd-b7b7-6d45-565f1405cabb,cb7362e3-599b-0ae5-1c01-c06904c4b3e2,229065009,Exercise therapy (regime/therapy),in-progress,"MARQUETTE INTERNAL MEDICINE ASSOCIATES, P.C."
2,1dac7aab-a270-a4bf-a6ec-68df3ac7317d,active,897ac583-e0bb-d807-3a91-04da0fa8b0c8,1991-05-17T23:07:19-04:00,,assess-plan,443402002,Lifestyle education regarding hypertension (pr...,order,71b3743b-7dee-7ecb-252b-56b1b0c7626b,c1da9251-ab1a-f015-7b51-2216b4010ac3,386463000,Prescribed activity/exercise education (proced...,in-progress,"MARQUETTE INTERNAL MEDICINE ASSOCIATES, P.C."
3,1dac7aab-a270-a4bf-a6ec-68df3ac7317d,active,897ac583-e0bb-d807-3a91-04da0fa8b0c8,1991-05-17T23:07:19-04:00,,assess-plan,443402002,Lifestyle education regarding hypertension (pr...,order,71b3743b-7dee-7ecb-252b-56b1b0c7626b,c1da9251-ab1a-f015-7b51-2216b4010ac3,413473000,Counseling about alcohol consumption (procedure),in-progress,"MARQUETTE INTERNAL MEDICINE ASSOCIATES, P.C."
4,1dac7aab-a270-a4bf-a6ec-68df3ac7317d,active,897ac583-e0bb-d807-3a91-04da0fa8b0c8,1991-05-17T23:07:19-04:00,,assess-plan,443402002,Lifestyle education regarding hypertension (pr...,order,71b3743b-7dee-7ecb-252b-56b1b0c7626b,c1da9251-ab1a-f015-7b51-2216b4010ac3,1151000175103,Dietary Approaches to Stop Hypertension diet (...,in-progress,"MARQUETTE INTERNAL MEDICINE ASSOCIATES, P.C."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16489,4fc132aa-e5b0-4aa7-ed5d-4be1c9b3de67,active,800ff6a4-f0da-affd-c69f-1d616ec69170,2014-04-29T14:52:19-04:00,,assess-plan,276239002,Therapy (regime/therapy),order,53a16ac0-0c34-847d-31a0-7c5e80768ed6,144ab46d-6440-bf85-dc00-ce201f1a39f1,91251008,Physical therapy procedure (regime/therapy),in-progress,DLP MARQUETTE GENERAL HOSPITAL LLC
16490,4fc132aa-e5b0-4aa7-ed5d-4be1c9b3de67,active,800ff6a4-f0da-affd-c69f-1d616ec69170,2014-04-29T14:52:19-04:00,,assess-plan,276239002,Therapy (regime/therapy),order,53a16ac0-0c34-847d-31a0-7c5e80768ed6,144ab46d-6440-bf85-dc00-ce201f1a39f1,228557008,Cognitive and behavioral therapy (regime/therapy),in-progress,DLP MARQUETTE GENERAL HOSPITAL LLC
16491,51e198c2-35a2-c3a9-95a9-5aae82611ca8,completed,800ff6a4-f0da-affd-c69f-1d616ec69170,2014-08-21T17:52:19-04:00,2015-04-02T17:52:19-04:00,assess-plan,134435003,Routine antenatal care (regime/therapy),order,f08a59ad-2a53-fbb2-a5ae-915f09f206dd,6f1400eb-faed-0ff7-06ba-cd708b6f49a1,135892000,Antenatal education (procedure),completed,DLP MARQUETTE GENERAL HOSPITAL LLC
16492,51e198c2-35a2-c3a9-95a9-5aae82611ca8,completed,800ff6a4-f0da-affd-c69f-1d616ec69170,2014-08-21T17:52:19-04:00,2015-04-02T17:52:19-04:00,assess-plan,134435003,Routine antenatal care (regime/therapy),order,f08a59ad-2a53-fbb2-a5ae-915f09f206dd,6f1400eb-faed-0ff7-06ba-cd708b6f49a1,713076009,Antenatal risk assessment (procedure),completed,DLP MARQUETTE GENERAL HOSPITAL LLC


In [None]:
# Write the care plan table to CarePlan.csv in the output folder.
careplan_csv_path = output_folder_path + delim + 'CarePlan.csv'
df_cp.to_csv(careplan_csv_path)

# Drop the frame and run a garbage-collection pass to free memory.
del df_cp
gc.collect()

22

In [None]:
# Import the Colab helper under an alias. A plain `from google.colab import files`
# rebinds the notebook-level `files` list of JSON paths, which the extraction
# loops iterate over.
from google.colab import files as colab_files
colab_files.download('/content/output/CarePlan.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>