In [9]:
import pandas as pd
import glob
import fastparquet

In [10]:
# Directory holding the exported JSON files -- adjust to your environment.
path = r'C:\Users\altz7\Desktop\med_data' # use your path

# The order in which glob.glob returns matches depends on the file system;
# sorting keeps the row order (and the index produced by ignore_index=True)
# reproducible between runs and machines.
all_files = sorted(glob.glob(path + "/*.json"))

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so fail early with a message that names the directory that was searched.
if not all_files:
    raise ValueError(f"No *.json files found in {path!r}")

# Read every file into its own frame, then stack them in a single concat call.
frames = [pd.read_json(filename) for filename in all_files]

raw_frame = pd.concat(frames, axis=0, ignore_index=True)

# Unpack nested data for one of the three available dictionary keys
# ("resourceType", "type", "entry"): here the "entry" column is flattened.
df_all_data = pd.json_normalize(raw_frame["entry"])

In [11]:
# Example of data selection for resourceType = "Patient".
# Profile reference:
# http://hl7.org/fhir/us/core/STU5/StructureDefinition-us-core-patient.html

required_columns = ["resource.resourceType", "resource.id", "resource.extension",
                    "resource.gender", "resource.birthDate", "resource.address"]

# Select rows and columns in one .loc call and take an explicit copy, so that
# adding columns to `patient` afterwards writes into an independent frame
# instead of a slice of df_all_data (which can raise SettingWithCopyWarning).
is_patient = df_all_data["resource.resourceType"] == "Patient"
patient = df_all_data.loc[is_patient, required_columns].copy()

In [12]:
# After the first json_normalize pass the remaining nested structures are kept
# as Python lists of dicts. Only the first item ([0]) of "resource.extension"
# and of "resource.address" is used below.

# Iterate over the column values directly instead of doing a chained label
# lookup (patient[col][i]) for every row.
first_address = [addresses[0] for addresses in patient["resource.address"]]

race = [extensions[0]['extension'][0]['valueCoding']['display']
        for extensions in patient["resource.extension"]]

# Address lines are a list; it is kept as "street_raw" because the following
# clean-up step drops that column by name.
street_raw = [address.get('line', []) for address in first_address]

# assign() returns a new frame, so the new columns are never written into a
# slice of df_all_data (avoids SettingWithCopyWarning).
# A key missing from the address dict falls back to '' for every field, the
# same protection that was previously applied to "postalCode" only.
patient = patient.assign(
    race=race,
    street_raw=street_raw,
    # Join the address lines with a space so that multi-line addresses do not
    # run together ("12 Main St" + "Apt 4" -> "12 Main St Apt 4").
    street=[' '.join(map(str, lines)) for lines in street_raw],
    city=[address.get('city', '') for address in first_address],
    state=[address.get('state', '') for address in first_address],
    country=[address.get('country', '') for address in first_address],
    postalCode=[address.get('postalCode', '') for address in first_address],
)

In [13]:
# Remove the helper/nested columns, then give the remaining FHIR fields flat names.

flat_names = {
    "resource.resourceType": "resource_type",
    "resource.id": "id",
    "resource.gender": "gender",
    "resource.birthDate": "date_of_birth",
}

patient = (
    patient
    .drop(columns=['street_raw', 'resource.extension', 'resource.address'])
    .rename(columns=flat_names)
)

In [14]:
# Display the flattened patient table as the cell output.
patient

Unnamed: 0,resourceType,id,gender,date_of_birth,race,street,city,state,country,postalCode
0,Patient,8c95253e-8ee8-9ae8-6d40-021d702dc78e,male,1944-08-28,White,859 Altenwerth Run Unit 88,Charlton,MA,US,
619,Patient,6fa23508-960e-ff22-c3d0-0519a036543b,male,2001-06-28,White,736 Dare Mission,Boston,MA,US,2129.0
1552,Patient,83524678-9bff-93b7-ef89-d7f5390072ff,female,1980-06-10,Black or African American,210 Nitzsche Vale Apt 73,Springfield,MA,US,
2255,Patient,4e343b0a-8698-b6dd-64c6-c2d2d0959e6e,female,1946-05-21,White,606 Reichert Camp,Fall River,MA,US,
3216,Patient,6e4ac285-2a8d-a30d-5ecb-e32cb595a876,female,1968-07-07,White,502 Langosh Neck Unit 20,Dracut,MA,US,
3777,Patient,1029f880-d3db-f477-9da3-f59c14ed22c6,male,1998-04-30,White,1029 Johnson Manor,Devens,MA,US,1434.0
4012,Patient,a18edb30-e93c-8e9b-8e6a-95a651a24a36,female,1943-04-23,White,654 Metz Path,Sandwich,MA,US,
4679,Patient,f406a4e8-821b-0c9a-c8ec-09ad0f1fe9c6,male,1990-10-27,White,531 Dare Tunnel,Arlington,MA,US,2476.0
4848,Patient,b8c71fe0-e911-205e-19c3-b92e88e5b5a6,female,1941-09-27,White,151 Crist Vista,Tewksbury,MA,US,


In [15]:
# Persist both frames to disk: first as Parquet, then as CSV.

# Parquet
for frame, target in ((patient, 'patient_profile.parquet'),
                      (df_all_data, 'exa_all_data.parquet')):
    frame.to_parquet(target)

# CSV (the index is not written)
for frame, target in ((patient, 'patient_profile.csv'),
                      (df_all_data, 'exa_all_data.csv')):
    frame.to_csv(target, index=False)

In [None]:
import json

# Load one file with the standard json module to inspect its raw structure.
# JSON text is UTF-8, so the encoding is passed explicitly instead of relying
# on the platform default; the with-block closes the file even if parsing fails.
with open(r'C:\Users\altz7\Desktop\med_data\Aaron697_Dickens475_8c95253e-8ee8-9ae8-6d40-021d702dc78e.json', encoding='utf-8') as file_read:
    data = json.load(file_read)

# Number of top-level items in the document (keys, if it is a JSON object).
print(len(data))