In [None]:
# Libraries
import json
import uuid
from datetime import datetime
from pprint import pprint
from typing import Any, Dict, List

import pandas as pd
from pydantic import BaseModel

In [None]:
# Path of the input JSON document (left empty here; fill it in before running).
# The encoding is given explicitly so that decoding does not depend on the
# platform's locale default.
with open('', encoding='utf-8') as f:
    data = json.load(f)

#pprint(data)
#pprint(data.keys())



{}
dict_keys(['resourceType', 'type', 'entry'])


In [None]:
# Pydantic models for the structured claim data

class Address(BaseModel):
    """Postal address used by the clinic, subscriber, patient and payer models.

    Every field is a required string (no defaults are declared).
    """
    billing_address_line: str  # street line
    billing_city: str
    billing_state: str
    billing_zip: str  # postal / ZIP code


class ClinicInfo(BaseModel):
    """Identifying details of a clinic together with its address.

    Every field is required.
    """
    clinic_id: str
    clinic_name: str
    clinic_NPI_number: str  # presumably the National Provider Identifier -- TODO confirm
    clinic_ein: str  # presumably the Employer Identification Number -- TODO confirm
    clinic_address: Address

class SubscriberInfo(BaseModel):
    """Insurance subscriber details together with the subscriber's address.

    Every field is required and, apart from the nested address, stored as a
    string.
    """
    subscriber_responsibility: str
    relationship_code: str
    claim_filing_code: str
    subscriber_first_name: str
    subscriber_last_name: str
    subscriber_is_patient: str  # kept as text, not as a bool
    subscriber_primary_ID: str
    subscriber_dob: str  # kept as text; no date parsing is declared here
    subscriber_gender: str
    subscriber_address: Address


class PatientInfo(BaseModel):
    """Patient demographics together with the patient's address.

    Every field is required.
    """
    patient_first_name: str
    patient_last_name: str
    patient_primary_ID: str
    patient_dob: str  # kept as text; no date parsing is declared here
    patient_gender: str
    patient_address: Address


class PayerInfo(BaseModel):
    """Payer name and identifier together with the payer's address.

    Every field is required.
    """
    payer_name: str
    payer_identifier: str
    payer_address: Address


class ClaimInfo(BaseModel):
    """Claim-level details: identifiers, charge, diagnosis and procedure codes.

    Every field is required.
    """
    claim_number: str
    claim_reference_number: str
    total_charge: str  # kept as text, not as a number
    place_of_service_code: str
    facility_code: str
    claim_frequency: str
    benefits_assignment: str
    assignment_code: str
    patient_signature: str
    release_info_code: str
    diagnosis_code_abk: List[str]
    diagnosis_code_abf: List[str]
    # Keyed by procedure id; each value is a list of
    # {"procedure": <code>, "units": <quantity>} dicts, which is the shape
    # extract_ClaimInfo builds.  The earlier annotation required every inner
    # value to be a Dict[str, int], so the string stored under "procedure"
    # could never validate.
    procedure_codes: Dict[str, List[Dict[str, Any]]]
    units: str
    diagnosis_code_pointer: str


class ClaimEnvelope(BaseModel):
    """Top-level container bundling the raw input with the extracted models.

    Every field is required.
    """
    # Raw dict as loaded from the input JSON (extract_ClaimEnvelope passes the
    # unmodified input here).  The earlier annotation named ``ClaimData``,
    # which is not defined in this file, so creating the class raised
    # NameError.
    claim_data: Dict[str, Any]
    clinic_info: ClinicInfo
    subscriber_info: SubscriberInfo
    patient_info: PatientInfo
    payer_info: PayerInfo
    claim_info: ClaimInfo




In [None]:
# Event Data Extraction

def extract_patient_address(patient_data):
    """Build an Address from the first entry of ``patient_data['address']``.

    Args:
        patient_data: Dict describing a patient; the 'address' key may be
            missing or hold an empty list.

    Returns:
        Address whose missing components are set to 'unknown'.
    """
    # `or` also covers a present-but-empty (or null) list, where indexing the
    # raw value with [0] would raise.
    address_data = (patient_data.get('address') or [{}])[0]
    # Address requires a string for the street line, so fall back to
    # 'unknown' instead of None when no line is present.
    address_lines = address_data.get('line') or ['unknown']
    return Address(
        billing_address_line=address_lines[0],
        billing_city=address_data.get('city', 'unknown'),
        billing_state=address_data.get('state', 'unknown'),
        billing_zip=address_data.get('postalCode', 'unknown')
    )


def extract_clinic_info(data):
    """Build a ClinicInfo from the 'custodian' section of *data*.

    Args:
        data: Dict loaded from the source JSON; 'custodian' may be absent.

    Returns:
        ClinicInfo whose missing values (including address components) are
        set to 'unknown'.
    """
    clinic_data = data.get('custodian') or {}
    # `or` also covers a present-but-empty (or null) list, where indexing the
    # raw value with [0] would raise.
    address_data = (clinic_data.get('address') or [{}])[0]
    # Address requires a string for the street line, so fall back to
    # 'unknown' instead of None when no line is present.
    address_lines = address_data.get('line') or ['unknown']
    clinic_address = Address(
        billing_address_line=address_lines[0],
        billing_city=address_data.get('city', 'unknown'),
        billing_state=address_data.get('state', 'unknown'),
        billing_zip=address_data.get('postalCode', 'unknown')
    )
    return ClinicInfo(
        clinic_id=clinic_data.get('id', 'unknown'),
        clinic_name=clinic_data.get('display', 'unknown'),
        clinic_NPI_number=clinic_data.get('NPI', 'unknown'),
        clinic_ein=clinic_data.get('ein', 'unknown'),
        clinic_address=clinic_address
    )


def extract_SubscriberInfo(data):
    """Build a SubscriberInfo from the first entry of ``data['insurance']``.

    Only the 'focal' value and the relationship code are read from the input;
    every other field, including the address, is filled with the placeholder
    'unknown'.

    Args:
        data: Dict loaded from the source JSON; 'insurance' may be absent.

    Returns:
        SubscriberInfo populated as described above.
    """
    # `or` also covers present-but-empty (or null) lists, where indexing the
    # raw value with [0] would raise.
    insurance_data = (data.get('insurance') or [{}])[0]
    # The model field is a string while 'focal' can hold a non-string value
    # (e.g. a JSON boolean), so convert explicitly.
    subscriber_responsibility = str(insurance_data.get('focal', 'unknown'))
    relationship = insurance_data.get('relationship') or {}
    relationship_coding = (relationship.get('coding') or [{}])[0]
    relationship_code = relationship_coding.get('code', 'unknown')
    # No subscriber address is read from the input; use a placeholder.
    subscriber_address = Address(
        billing_address_line="unknown",
        billing_city="unknown",
        billing_state="unknown",
        billing_zip="unknown"
    )
    return SubscriberInfo(
        subscriber_responsibility=subscriber_responsibility,
        relationship_code=relationship_code,
        claim_filing_code="unknown",
        subscriber_first_name="unknown",
        subscriber_last_name="unknown",
        subscriber_is_patient="unknown",
        subscriber_primary_ID="unknown",
        subscriber_dob="unknown",
        subscriber_gender="unknown",
        subscriber_address=subscriber_address
    )


def extract_PatientInfo(data):
    """Build a PatientInfo from the first 'Patient' resource in ``data['entry']``.

    Args:
        data: Dict loaded from the source JSON, with resources under 'entry'.

    Returns:
        PatientInfo whose missing values (including address components) are
        set to 'unknown'.  When no Patient resource exists, every value is
        'unknown'.
    """
    patient_data = next(
        (entry.get('resource', {}) for entry in data.get('entry', [])
         if entry.get('resource', {}).get('resourceType') == 'Patient'), {}
    )
    # `or` also covers present-but-empty (or null) lists, where indexing the
    # raw value with [0] would raise.
    address_data = (patient_data.get('address') or [{}])[0]
    # Address requires a string for the street line, so fall back to
    # 'unknown' instead of None when no line is present.
    address_lines = address_data.get('line') or ['unknown']
    patient_address = Address(
        billing_address_line=address_lines[0],
        billing_city=address_data.get('city', 'unknown'),
        billing_state=address_data.get('state', 'unknown'),
        billing_zip=address_data.get('postalCode', 'unknown')
    )
    name_data = (patient_data.get('name') or [{}])[0]
    given_names = name_data.get('given') or ['unknown']
    return PatientInfo(
        patient_first_name=given_names[0],
        patient_last_name=name_data.get('family', 'unknown'),
        patient_primary_ID=patient_data.get('id', 'unknown'),
        patient_dob=patient_data.get('birthDate', 'unknown'),
        patient_gender=patient_data.get('gender', 'unknown'),
        patient_address=patient_address
    )


def extract_PayerInfo(data):
    """Build a PayerInfo from the first entry of ``data['insurance']``.

    The payer name comes from the first 'payor' entry and the identifier from
    the coverage reference; the payer address is a placeholder.

    Args:
        data: Dict loaded from the source JSON; 'insurance' may be absent.

    Returns:
        PayerInfo whose missing values are set to 'unknown'.
    """
    # `or` also covers present-but-empty (or null) lists, where indexing the
    # raw value with [0] would raise.
    insurance_data = (data.get('insurance') or [{}])[0]
    payer_data = (insurance_data.get('payor') or [{}])[0]
    coverage = insurance_data.get('coverage') or {}
    # No payer address is read from the input; use a placeholder.
    payer_address = Address(
        billing_address_line="unknown",
        billing_city="unknown",
        billing_state="unknown",
        billing_zip="unknown"
    )
    return PayerInfo(
        payer_name=payer_data.get('display', 'unknown'),
        payer_identifier=coverage.get('reference', 'unknown'),
        payer_address=payer_address
    )


def extract_ClaimInfo(data):
    """Build a ClaimInfo from the first resource in ``data['entry']``.

    Reads the total charge, diagnosis codes/displays and procedures from that
    resource; fields that are not read from the input are filled with fixed
    placeholders.

    Args:
        data: Dict loaded from the source JSON, with resources under 'entry'.

    Returns:
        ClaimInfo whose missing values are set to 'unknown' (total charge
        defaults to '0.0').
    """
    # `or` also covers present-but-empty (or null) lists, where indexing the
    # raw value with [0] would raise.
    encounter_data = (data.get('entry') or [{}])[0].get('resource', {})
    # total_charge is a string field while the source value may be numeric,
    # so convert explicitly.
    total_charge = str(encounter_data.get('total', {}).get('value', '0.0'))
    diagnosis = encounter_data.get('diagnosis', [{}])
    procedures = encounter_data.get('procedures', [{}])
    # First coding entry of each diagnosis ({} when it has none)
    diagnosis_codings = [(diag.get('coding') or [{}])[0] for diag in diagnosis]
    diagnosis_abk = [coding.get('code', 'unknown') for coding in diagnosis_codings]
    diagnosis_abf = [coding.get('display', 'unknown') for coding in diagnosis_codings]
    # One single-element list per procedure id; a repeated id keeps the last
    # procedure seen.
    procedure_codes = {
        proc.get('id', 'unknown'): [
            {
                "procedure": (proc.get('coding') or [{}])[0].get('code', 'unknown'),
                "units": proc.get('quantity', {}).get('value', 1)
            }
        ]
        for proc in procedures
    }
    return ClaimInfo(
        claim_number=encounter_data.get('id', 'unknown'),
        claim_reference_number=encounter_data.get('reference', 'unknown'),
        total_charge=total_charge,
        place_of_service_code=encounter_data.get('class', {}).get('code', 'unknown'),  # not expected to be present
        facility_code=encounter_data.get('location', {}).get('id', 'unknown'),  # not expected to be present
        claim_frequency="unknown",  # not expected to be present
        benefits_assignment="unknown",
        assignment_code="unknown",
        patient_signature="unknown",
        release_info_code="unknown",
        diagnosis_code_abk=diagnosis_abk,
        diagnosis_code_abf=diagnosis_abf,
        procedure_codes=procedure_codes,
        units="1",
        diagnosis_code_pointer="unknown"
    )


def extract_ClaimEnvelope(data):
    """Assemble a ClaimEnvelope from *data*.

    Runs each component extractor on the same input (clinic, subscriber,
    patient, payer, claim -- in that order) and wraps the results, together
    with the raw input itself, in a ClaimEnvelope.
    """
    # Dict literals evaluate their values top to bottom, so the extractors
    # run in the listed order before the envelope is constructed.
    sections = {
        "clinic_info": extract_clinic_info(data),
        "subscriber_info": extract_SubscriberInfo(data),
        "patient_info": extract_PatientInfo(data),
        "payer_info": extract_PayerInfo(data),
        "claim_info": extract_ClaimInfo(data),
    }
    return ClaimEnvelope(claim_data=data, **sections)



In [None]:
#Extract Event Data (Encounters, Immunizations, Conditions, Procedures) with One Row per Event
def _first_item(items):
    """Return the first element of *items*, or {} when it is missing or empty."""
    return items[0] if items else {}


def _first_coding(concept):
    """Return the first 'coding' entry of *concept*, or {} when there is none."""
    return _first_item((concept or {}).get('coding'))


def extract_events(data):
    """Flatten the resources under ``data['entry']`` into one dict per event.

    Only 'Encounter', 'Immunization', 'Condition' and 'Procedure' resources
    produce a row; every other resource type is skipped.  Missing values are
    reported as 'unknown', and missing or empty 'type' / 'coding' lists are
    treated the same way instead of raising IndexError.

    Args:
        data: Dict loaded from the source JSON, with resources under 'entry'.

    Returns:
        List of dicts in input order.  Each has 'patient_id', 'event_type'
        and 'event_id' plus the fields specific to its event type.
    """
    events = []
    for entry in data.get('entry', []):
        resource = entry.get('resource', {})
        resource_type = resource.get('resourceType')
        # The patient reference has its 'urn:uuid:' prefix stripped
        patient_id = resource.get('subject', {}).get('reference', '').replace('urn:uuid:', '')

        if resource_type == 'Encounter':
            type_coding = _first_coding(_first_item(resource.get('type')))
            events.append({
                "patient_id": patient_id,
                "event_type": "Encounter",
                # NOTE: unlike the other event types, a missing id stays None here
                "event_id": resource.get('id'),
                "status": resource.get('status', 'unknown'),
                "class_code": resource.get('class', {}).get('code', 'unknown'),
                "period_start": resource.get('period', {}).get('start', 'unknown'),
                "period_end": resource.get('period', {}).get('end', 'unknown'),
                "type_code": type_coding.get('code', 'unknown'),
                "type_display": type_coding.get('display', 'unknown')
            })
        elif resource_type == 'Immunization':
            vaccine_coding = _first_coding(resource.get('vaccineCode'))
            events.append({
                # Immunizations reference the patient under 'patient', not 'subject'
                "patient_id": resource.get('patient', {}).get('reference', '').replace('urn:uuid:', ''),
                "event_type": "Immunization",
                "event_id": resource.get('id', 'unknown'),
                "status": resource.get('status', 'unknown'),
                "vaccine_code": vaccine_coding.get('code', 'unknown'),
                "vaccine_display": vaccine_coding.get('display', 'unknown'),
                "occurrence_date": resource.get('occurrenceDateTime', 'unknown')
            })
        elif resource_type == 'Condition':
            condition_coding = _first_coding(resource.get('code'))
            events.append({
                "patient_id": patient_id,
                "event_type": "Condition",
                "event_id": resource.get('id', 'unknown'),
                "clinical_status": _first_coding(resource.get('clinicalStatus')).get('code', 'unknown'),
                "condition_code": condition_coding.get('code', 'unknown'),
                "condition_display": condition_coding.get('display', 'unknown'),
                "onset_date": resource.get('onsetDateTime', 'unknown')
            })
        elif resource_type == 'Procedure':
            procedure_coding = _first_coding(resource.get('code'))
            events.append({
                "patient_id": patient_id,
                "event_type": "Procedure",
                "event_id": resource.get('id', 'unknown'),
                "status": resource.get('status', 'unknown'),
                "procedure_code": procedure_coding.get('code', 'unknown'),
                "procedure_display": procedure_coding.get('display', 'unknown'),
                "performed_date": resource.get('performedDateTime', 'unknown')
            })

    return events


'''
# Convert encounters to a DataFrame
encounters_df = pd.DataFrame(encounters)

# Summarize encounter data for each patient
encounter_summary = encounters_df.groupby('patient_id').agg({
    'encounter_id': 'count',             # Total encounters per patient
    'period_start': 'min',               # Earliest encounter start
    'period_end': 'max'                  # Latest encounter end
}).rename(columns={'encounter_id': 'total_encounters'}).reset_index()
'''



"\n# Convert encounters to a DataFrame\nencounters_df = pd.DataFrame(encounters)\n\n# Summarize encounter data for each patient\nencounter_summary = encounters_df.groupby('patient_id').agg({\n    'encounter_id': 'count',             # Total encounters per patient\n    'period_start': 'min',               # Earliest encounter start\n    'period_end': 'max'                  # Latest encounter end\n}).rename(columns={'encounter_id': 'total_encounters'}).reset_index()\n"

In [None]:
#Testing the data
'''
# Step 2: Extract Patient Data (One Row per Patient)
patient_data = data['entry'][0]['resource']
patient_info = {
    "id": patient_data.get('id', 'N/A'),
    "gender": patient_data.get('gender', 'unknown'),
    "birthDate": patient_data.get('birthDate', None),
    "maritalStatus": patient_data.get('maritalStatus', {}).get('text', 'unknown'),
    "multipleBirth": patient_data.get('multipleBirthBoolean', False),
    "fullUrl": data['entry'][0].get('fullUrl', 'N/A'),
    "communication_language": patient_data.get('communication', [{}])[0].get('language', {}).get('text', 'unknown'),
    "phone": patient_data.get('telecom', [{}])[0].get('value', 'N/A'),
    "race": patient_data.get('extension', [])[0]['extension'][1].get('valueString', 'unknown') if 'extension' in patient_data and len(patient_data['extension']) > 0 else 'unknown',
    "ethnicity": patient_data.get('extension', [])[1]['extension'][1].get('valueString', 'unknown') if 'extension' in patient_data and len(patient_data['extension']) > 1 else 'unknown',
    "address_line": patient_data.get('address', [{}])[0].get('line', [None])[0],
    "City": patient_data.get('address', [{}])[0].get('city', 'unknown'),
    "state": patient_data.get('address', [{}])[0].get('state', 'unknown'),
    "postal_code": patient_data.get('address', [{}])[0].get('postalCode', 'unknown'),
    "country": patient_data.get('address', [{}])[0].get('country', 'unknown'),
    "disability_adjusted_life_years": next((ext['valueDecimal'] for ext in patient_data.get('extension', []) if ext['url'] == "http://synthetichealth.github.io/synthea/disability-adjusted-life-years"), None),
    "quality_adjusted_life_years": next((ext['valueDecimal'] for ext in patient_data.get('extension', []) if ext['url'] == "http://synthetichealth.github.io/synthea/quality-adjusted-life-years"), None),
    "medical_record_number": next((iden['value'] for iden in patient_data.get('identifier', []) if iden.get('type', {}).get('coding', [{}])[0].get('code') == "MR"), 'unknown'),
    "social_security_number": next((iden['value'] for iden in patient_data.get('identifier', []) if iden.get('type', {}).get('coding', [{}])[0].get('code') == "SS"), 'unknown')
}

# Convert extracted patient data to DataFrame
patient_df = pd.DataFrame([patient_info])

# Step 3: Merge Patient Data with Event Data for Detailed Row Structure
# This will keep multiple rows per patient, one for each event
merged_df = events_df.merge(patient_df, left_on='patient_id', right_on='id', how='left')

# Data Cleaning: Set Data Types and Handle Missing Values
merged_df['birthDate'] = pd.to_datetime(merged_df['birthDate'], errors='coerce')
merged_df['gender'] = merged_df['gender'].str.lower()
merged_df['communication_language'] = merged_df['communication_language'].str.title()
merged_df['race'] = merged_df['race'].str.title()
merged_df['ethnicity'] = merged_df['ethnicity'].str.title()
merged_df['disability_adjusted_life_years'].fillna(0, inplace=True)
merged_df['quality_adjusted_life_years'].fillna(0, inplace=True)
merged_df['multipleBirth'] = merged_df['multipleBirth'].astype(bool, errors='ignore')

# Fill text columns with 'unknown' if missing
text_columns = ['gender', 'maritalStatus', 'communication_language', 'phone', 'race', 'ethnicity', 'address_line', 'City', 'state', 'country']
merged_df[text_columns] = merged_df[text_columns].fillna('unknown')

# UUID validation
def is_valid_uuid(val):
    try:
        uuid.UUID(val, version=4)
        return True
    except ValueError:
        return False

merged_df['is_valid_uuid'] = merged_df['id'].apply(is_valid_uuid)

# Display and save the merged DataFrame with multiple rows per patient (one row per event)
print(merged_df.head())
merged_df.to_csv('/content/All_synthetic_data/final_cleaned_multiple_rows_data.csv', index=False)

                             patient_id    event_type  \
0  af08479b-d904-74fc-d8fc-0e32731ffc2d     Encounter   
1  af08479b-d904-74fc-d8fc-0e32731ffc2d     Condition   
2  af08479b-d904-74fc-d8fc-0e32731ffc2d     Procedure   
3  af08479b-d904-74fc-d8fc-0e32731ffc2d  Immunization   
4  af08479b-d904-74fc-d8fc-0e32731ffc2d     Encounter   

                               event_id     status class_code  \
0  b6e44183-49ed-4afc-f119-f2abe911cff2   finished        AMB   
1  00ec0bf2-b1b5-8cfb-4844-d0874bd22c13        NaN        NaN   
2  76402ea4-b492-dca5-d153-8efee881fa61  completed        NaN   
3  974cfd30-fa3d-9678-5720-7473c8c787bc  completed        NaN   
4  9d8ad14b-19b9-5422-780e-6253df923e3c   finished        AMB   

                period_start                 period_end  type_code  \
0  2015-04-07T05:52:55-04:00  2015-04-07T06:07:55-04:00  410620009   
1                        NaN                        NaN        NaN   
2                        NaN                        NaN 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['disability_adjusted_life_years'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['quality_adjusted_life_years'].fillna(0, inplace=True)
