# Load all data

In [1]:
import pandas as pd
# hide warnings
import warnings
warnings.filterwarnings('ignore')

# Directory holding the Synthea CSV export. Every table path below is derived
# from it, so pointing the notebook at another export is a one-line change.
_DATA_DIR = "data/Synthea Sample Data Latest"

# One path per table in the export. Only some of these are read in this
# notebook; the others are kept so further tables can be loaded without
# re-deriving their locations.
_allergies = f"{_DATA_DIR}/allergies.csv"
_careplans = f"{_DATA_DIR}/careplans.csv"
_claims = f"{_DATA_DIR}/claims.csv"
_claims_transactions = f"{_DATA_DIR}/claims_transactions.csv"
_conditions = f"{_DATA_DIR}/conditions.csv"
_devices = f"{_DATA_DIR}/devices.csv"
_encounters = f"{_DATA_DIR}/encounters.csv"
_imaging_studies = f"{_DATA_DIR}/imaging_studies.csv"
_immunizations = f"{_DATA_DIR}/immunizations.csv"
_medications = f"{_DATA_DIR}/medications.csv"
_observations = f"{_DATA_DIR}/observations.csv"
_organizations = f"{_DATA_DIR}/organizations.csv"
_patients = f"{_DATA_DIR}/patients.csv"
_payer_transitions = f"{_DATA_DIR}/payer_transitions.csv"
_payers = f"{_DATA_DIR}/payers.csv"
_procedures = f"{_DATA_DIR}/procedures.csv"
_providers = f"{_DATA_DIR}/providers.csv"
_supplies = f"{_DATA_DIR}/supplies.csv"

# Backward-compatible aliases for the original misspelled names, so existing
# references (e.g. the observations load) keep working.
_obervations = _observations
_provicers = _providers

# Tables the cleaning and merge steps rely on, read in this order:
# allergies, patients, observations, immunizations.
allergies_df, patients_df, observations_df, immunizations_df = (
    pd.read_csv(csv_path)
    for csv_path in (_allergies, _patients, _obervations, _immunizations)
)

# Tables that may be needed later: conditions, encounters, supplies, procedures.
conditions_df, encounters_df, supplies_df, procedures_df = (
    pd.read_csv(csv_path)
    for csv_path in (_conditions, _encounters, _supplies, _procedures)
)

# Clean up columns and data

In [2]:
# Normalise the header of every frame handled in this cell: trim surrounding
# whitespace first, then lower-case, so later column lookups can use plain
# lower-case names.
for _frame in (
    patients_df,
    allergies_df,
    observations_df,
    immunizations_df,
    encounters_df,
    procedures_df,
):
    _frame.columns = _frame.columns.str.strip().str.lower()

# Give the generic column names of each table a table-specific name so they
# remain distinguishable once the tables are merged. Names missing from a
# frame are silently skipped by rename().

# The patient table's "id" becomes "patient", the key the other tables share.
patients = patients_df.rename(columns={"id": "patient"})

allergies_df = allergies_df.rename(
    columns={
        "code": "allergy_code",
        "description": "allergy",
        "description2": "allergy2",
        "type": "allergy_type",
        "category": "allergy_category",
        "system": "allergy_system",
    }
)

immunizations_df = immunizations_df.rename(
    columns={
        "code": "immunization_code",
        "description": "immunization_description",
        "date": "immunization_date",
    }
)

observations_df = observations_df.rename(
    columns={
        "code": "observation_code",
        "description": "observation_description",
        "value": "observation_value",
        "units": "observation_units",
        "date": "observation_date",
        "type": "observation_type",
        "category": "observation_category",
    }
)

# Derive birth year and month from "birthdate" by splitting on "-" once and
# taking the first two parts; both new columns hold strings.
_birth_parts = patients["birthdate"].str.split("-", expand=True)
patients["birthyear"] = _birth_parts[0]
patients["birthmonth"] = _birth_parts[1]

# Drop the columns that are not carried into the merged table, including the
# raw "birthdate" now that year and month have been extracted.
patients = patients.drop(
    columns=[
        "ssn", "drivers", "passport", "fips", "lat", "lon", "deathdate",
        "prefix", "first", "middle", "last", "suffix", "maiden",
        "city", "state", "address", "birthplace", "zip", "birthdate", "county",
    ]
)
allergies = allergies_df.drop(columns=["encounter", "allergy_type", "stop", "start"])
immunization = immunizations_df.drop(columns=["encounter", "base_cost"])
observations = observations_df.drop(columns=["encounter", "observation_type"])

# Reduced encounters table; each label is listed once (the original list
# repeated "total_claim_cost").
encounters = encounters_df.drop(
    columns=[
        "id", "start", "stop", "organization", "provider", "payer", "code",
        "base_encounter_cost", "total_claim_cost", "payer_coverage",
    ]
)

# Replace missing values with explicit placeholders.
#
# The filled column is assigned back instead of calling
# `frame[col].fillna(..., inplace=True)`: that form mutates a temporary column
# selection, which pandas deprecates and which leaves the frame unchanged once
# Copy-on-Write is active (the warning about it is hidden by the notebook's
# warnings filter).
patients["marital"] = patients["marital"].fillna("Unmarried")
# Possible later step (not applied): encode marital as M=1, D=-1, NaN=0 and
# gender as M=1, F=0.

# Every allergy text column gets the same "none" placeholder. "severity1" and
# "description1" were previously filled with "none " (trailing space), which
# did not match the placeholder used for the other columns.
for _col in (
    "allergy",
    "allergy2",
    "reaction1",
    "reaction2",
    "severity1",
    "severity2",
    "description1",
):
    allergies[_col] = allergies[_col].fillna("none")

# Join the tables on the shared "patient" key, one table at a time:
# patients + allergies -> pa, + immunizations -> pai, + observations -> paio.
# merge() defaults to an inner join, so only patients present in every table
# remain. NOTE(review): a patient with several rows in more than one table
# yields every combination of those rows - confirm that is intended.
pa = patients.merge(allergies, on="patient")
pai = pa.merge(immunization, on="patient")
paio = pai.merge(observations, on="patient")

# Label column: 1 when either reaction column holds something other than the
# "none" placeholder, otherwise 0. Computed column-wise rather than with a
# row-by-row apply(), which is slower and misbehaves on an empty frame.
_has_reaction = (paio["reaction1"] != "none") | (paio["reaction2"] != "none")
paio["is_allergy"] = _has_reaction.astype(int)

# Allergy cleaning: remove parenthesised qualifiers (a space followed by text
# in parentheses) from the allergy name.
# The source must be paio's own column. Assigning allergies_df["allergy"]
# aligns on row labels of the un-merged table, so merged rows would receive
# the allergy text of unrelated rows, and NaN beyond that table's length.
paio["allergy"] = paio["allergy"].str.replace(r" \(.*?\)", "", regex=True)

In [3]:
# Preview the first rows of the encounters table (headers already stripped
# and lower-cased by the cleaning cell).
encounters_df.head()
# Alternative preview, currently disabled: procedures_df.head()

Unnamed: 0,id,start,stop,patient,organization,provider,payer,encounterclass,code,description,base_encounter_cost,total_claim_cost,payer_coverage,reasoncode,reasondescription
0,294d0dab-907e-8fce-7a47-0c0d322a5734,2012-04-01T09:04:48Z,2012-04-01T10:02:47Z,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,1567.0,87.2,,
1,2ccec874-cbaa-e280-7abb-f2bc2b603961,2013-04-07T09:04:48Z,2013-04-07T09:55:49Z,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,704.2,0.0,,
2,953c5138-ce17-4084-3432-1ac23f184528,2015-09-28T09:04:48Z,2015-09-28T11:02:48Z,30a6452c-4297-a1ac-977a-6a23237c7b46,db106514-f254-3402-b6a4-6d210c78c7e2,2c4b7d17-0ded-3e16-b5eb-6dda1d6a81bb,d31fccc3-1767-390d-966a-22a5156f4219,emergency,50849002,Emergency room admission (procedure),146.18,1008.98,0.0,125605004.0,Fracture of bone (disorder)
3,17dd3b88-0b85-2b6f-c342-c9d6cf5315cb,2015-10-31T11:02:48Z,2015-10-31T11:17:48Z,30a6452c-4297-a1ac-977a-6a23237c7b46,f8918a95-31e8-3ac4-8d12-29ca6080ebda,b4d9fbc9-fdca-369d-bbba-019479923f08,d31fccc3-1767-390d-966a-22a5156f4219,ambulatory,185349003,Encounter for check up (procedure),85.55,85.55,3.95,359817006.0,Closed fracture of hip (disorder)
4,0b03e41b-06a6-66fa-b972-acc5a83b134a,2016-04-10T09:04:48Z,2016-04-10T10:00:45Z,30a6452c-4297-a1ac-977a-6a23237c7b46,f2068cee-c75c-321d-9b2c-c33535db89c9,c3d07214-c20f-3f33-ad41-0e55adf5b024,d31fccc3-1767-390d-966a-22a5156f4219,wellness,162673000,General examination of patient (procedure),136.8,2039.18,464.94,,


# Save cleaned data to CSV

In [4]:
# Persist the merged, labelled frame: header row included, no index column.
paio.to_csv('data/allergy_patients.csv', index=False, header=True)

# Preview what was written. A notebook cell only renders its last expression,
# so the preview comes after the write (placed first, its result was discarded).
paio.head()

In [6]:
# Show the first merged row as a Series (one entry per column) to sanity-check
# the column set and the placeholder values.
paio.iloc[0]

patient                     4569671e-ed39-055f-8e78-422b96c9896b
marital                                                Unmarried
race                                                       white
ethnicity                                            nonhispanic
gender                                                         F
healthcare_expenses                                      9821.14
healthcare_coverage                                     27142.51
income                                                     58294
birthyear                                                   2013
birthmonth                                                    08
allergy_code                                           419199007
allergy_system                                         SNOMED-CT
allergy                                     Allergy to substance
allergy_category                                     environment
reaction1                                                   none
description1             