In [1]:
import pandas as pd
from transformers import pipeline
import json
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('DataRaw1.csv')

In [3]:
df.head()

Unnamed: 0,Free_Text
0,"Pasien Rahmat Widodo, KTP 3276010101010004, 52..."
1,"Pasien Wawan Purnomo, KTP 3276010101010009, 62..."
2,"Pasien Lestari Nugroho, KTP 3276010101010007, ..."
3,"Pasien Yudi Raharjo, KTP 3276010101010004, 20 ..."
4,"Pasien Wawan Purnomo, KTP 3276010101010004, 45..."


## Step 1: Preprocessing

In [4]:
# Check for empty cells
empty_cells = df.isnull().sum()
print(empty_cells)

Free_Text    0
dtype: int64


In [5]:
# Check for Duplicates
duplicates = df.duplicated().sum()
print("Number of duplicates: ", duplicates)

Number of duplicates:  0


## Step 2: Extracting entities using NER to identify and extract key pieces of information from the dataset

Initialize NER pipeline with the Indobert pre-trained model

In [6]:
# Initialize NER pipeline
ner_pipeline = pipeline("ner", model="indolem/indobert-base-uncased", token="indolem/indobert-base-uncased")




Some weights of BertForTokenClassification were not initialized from the model checkpoint at indolem/indobert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


We are only considering the data available in the dataset. The rest will be filled with placeholders.

For example, in the first row of DataRaw1.csv, we are able to extract:

1. **Patient Name (PERSON)**:
   - "Pasien Rahmat Widodo"
2. **Patient KTP Number (KTP)**:
   - "KTP 3276010101010004"
3. **Patient Age (AGE)**:
   - "52 tahun"
4. **Patient Gender (GENDER)**:
   - "perempuan"
5. **Patient Address (ADDRESS)**:
   - "alamat Jl. Imam Bonjol No. 78"
6. **Patient City (CITY)**:
   - "Surabaya"
7. **Patient Phone Number (PHONE)**:
   - "nomor telepon 081987654321"
8. **Patient Blood Type (BLOOD TYPE)**:
   - "golongan darah A"
9. **Patient Medical Records ID (MEDICAL RECORD NUMBER)**:
   - "nomor rekam medis 727106"
10. **Patient Date of Entry to the Hospital (DATE)**:
    - "datang pada 10/09/2023"
11. **Doctor Name (DOCTOR_NAME)**:
    - "Dr. Farida Lestari"
12. **Doctor KTP Number (DOCTOR_KTP)**:
    - "3173010202020004"
13. **Patient Primary Complaint (PRIMARY_COMPLAINT)**:
    - "Keluhan: hipertensi"
14. **Patient Additional Complaint (ADDITIONAL COMPLAINT)**:
    - "keluhan tambahan: sesak napas"
15. **Doctor's Recommendation (RECOMMENDATION)**:
    - "disarankan mengubah pola makan"
16. **Doctor's Referral (REFERRAL)**:
    - "Dokter rujukan: Dr. Hasan"
17. **Additional Notes (NOTES)**:
    - "keluhan masih tetap dirasakan, perlu observasi lebih lanjut"

Initializing an empty dictionary, and then filling it with data

In [7]:
def extract_entities(text):
    entities = {
        'PERSON': [],
        'KTP': [],
        'AGE': [],
        'GENDER': [],
        'ADDRESS': [],
        'CITY': [],
        'PHONE': [],
        'BLOOD_TYPE': [],
        'MEDICAL_RECORD_NUMBER': [],
        'DATE': [],
        'DOCTOR_NAME': [],
        'DOCTOR_KTP': [],
        'PRIMARY_COMPLAINT': [],
        'ADDITIONAL_COMPLAINT': [],
        'RECOMMENDATION': [],
        'REFERRAL': [],
        'NOTES': []
    }
    
    # Define regular expressions for each entity type
    patterns = {
        'PERSON': r'Pasien\s([A-Za-z\s]+),',
        'KTP': r'KTP\s(\d+),',
        'AGE': r'(\d+)\stahun',
        'GENDER': r'\d+\stahun,\s(perempuan|laki-laki),',
        'ADDRESS': r'alamat\s([A-Za-z\s\.0-9]+),',
        'CITY': r'alamat\s[A-Za-z\s\.0-9]+,\s([A-Za-z]+),',
        'PHONE': r'nomor telepon\s(\d+),',
        'BLOOD_TYPE': r'golongan darah\s([A-Za-z]+),',
        'MEDICAL_RECORD_NUMBER': r'nomor rekam medis\s(\d+),',
        'DATE': r'datang pada\s([\d/]+)',
        'DOCTOR_NAME': r'Diperiksa oleh\s([A-Za-z\s\.]+),',
        'DOCTOR_KTP': r'Diperiksa oleh\s[A-Za-z\s\.]+,\sKTP\s(\d+)\.',
        'PRIMARY_COMPLAINT': r'Keluhan:\s([A-Za-z\s]+),',
        'ADDITIONAL_COMPLAINT': r'keluhan tambahan:\s([A-Za-z\s]+)\.',
        'RECOMMENDATION': r'(disarankan|diberikan|direkomendasikan)\s([A-Za-z\s]+)\.',
        'REFERRAL': r'Dokter rujukan:\s([A-Za-z\s\.]+)\.',
        'NOTES': r'Dokter rujukan:\s[A-Za-z\s\.]+\. (.*)'
    }
    
    # Extract entities using regular expressions
    for entity_type, pattern in patterns.items():
        match = re.search(pattern, text)
        if match:
            if entity_type == 'RECOMMENDATION':
                entities[entity_type].append(match.group(2))
            else:
                entities[entity_type].append(match.group(1))
        else:
            entities[entity_type].append('Unknown')
    
    return entities

Plugging in the dataset into the function

In [8]:
# Apply the extract_entities function to each row in the 'Free_Text' column
df['Extracted_Entities'] = df['Free_Text'].apply(extract_entities)

In [9]:
df.head()

Unnamed: 0,Free_Text,Extracted_Entities
0,"Pasien Rahmat Widodo, KTP 3276010101010004, 52...","{'PERSON': ['Rahmat Widodo'], 'KTP': ['3276010..."
1,"Pasien Wawan Purnomo, KTP 3276010101010009, 62...","{'PERSON': ['Wawan Purnomo'], 'KTP': ['3276010..."
2,"Pasien Lestari Nugroho, KTP 3276010101010007, ...","{'PERSON': ['Lestari Nugroho'], 'KTP': ['32760..."
3,"Pasien Yudi Raharjo, KTP 3276010101010004, 20 ...","{'PERSON': ['Yudi Raharjo'], 'KTP': ['32760101..."
4,"Pasien Wawan Purnomo, KTP 3276010101010004, 45...","{'PERSON': ['Wawan Purnomo'], 'KTP': ['3276010..."


In [10]:
# Normalize the extracted entities into a new DataFrame
extracted_df = pd.json_normalize(df['Extracted_Entities'])

# Concatenate the original DataFrame with the extracted entities DataFrame
result_df = pd.concat([df, extracted_df], axis=1)

In [11]:
# Display the head of the new DataFrame
result_df.head()

Unnamed: 0,Free_Text,Extracted_Entities,PERSON,KTP,AGE,GENDER,ADDRESS,CITY,PHONE,BLOOD_TYPE,MEDICAL_RECORD_NUMBER,DATE,DOCTOR_NAME,DOCTOR_KTP,PRIMARY_COMPLAINT,ADDITIONAL_COMPLAINT,RECOMMENDATION,REFERRAL,NOTES
0,"Pasien Rahmat Widodo, KTP 3276010101010004, 52...","{'PERSON': ['Rahmat Widodo'], 'KTP': ['3276010...",[Rahmat Widodo],[3276010101010004],[52],[perempuan],[Jl. Imam Bonjol No. 78],[Surabaya],[081987654321],[A],[727106],[10/09/2023],[Dr. Farida Lestari],[3173010202020004],[hipertensi],[sesak napas],[mengubah pola makan],[Dr. Hasan],"[keluhan masih tetap dirasakan, perlu observas..."
1,"Pasien Wawan Purnomo, KTP 3276010101010009, 62...","{'PERSON': ['Wawan Purnomo'], 'KTP': ['3276010...",[Wawan Purnomo],[3276010101010009],[62],[perempuan],[Jl. Sudirman No. 56],[Bandung],[081234567890],[B],[870412],[11/09/2023],[Dr. Andika Pratama],[3173010202020003],[migrain],[nyeri kepala],[suplemen zat besi],[Dr. Hasan],[direkomendasikan untuk menjaga pola makan dan...
2,"Pasien Lestari Nugroho, KTP 3276010101010007, ...","{'PERSON': ['Lestari Nugroho'], 'KTP': ['32760...",[Lestari Nugroho],[3276010101010007],[41],[perempuan],[Jl. Imam Bonjol No. 78],[Surabaya],[081987654321],[AB],[151697],[27/09/2023],[Dr. Sumarno Hadi],[3173010202020007],[hipertensi],[pusing],[obat antihipertensi],[Dr. Hasan],[direkomendasikan untuk menjaga pola makan dan...
3,"Pasien Yudi Raharjo, KTP 3276010101010004, 20 ...","{'PERSON': ['Yudi Raharjo'], 'KTP': ['32760101...",[Yudi Raharjo],[3276010101010004],[20],[laki-laki],[Jl. Sudirman No. 56],[Bandung],[081987654321],[A],[583165],[3/08/2023],[Dr. Wahyu Nugraha],[3173010202020010],[infeksi saluran pernapasan],[nyeri kepala],[obat antihipertensi],[Dr. Dian],[perlu dilakukan tes darah ulang]
4,"Pasien Wawan Purnomo, KTP 3276010101010004, 45...","{'PERSON': ['Wawan Purnomo'], 'KTP': ['3276010...",[Wawan Purnomo],[3276010101010004],[45],[perempuan],[Jl. Diponegoro No. 12],[Medan],[081234567890],[A],[522131],[25/08/2023],[Dr. Sari Utami],[3173010202020009],[migrain],[sakit pinggang],[Unknown],[Dr. Dian],[diminta mengurangi konsumsi garam dan lemak]


Save into a .csv file

In [12]:
result_df.to_csv('DataProcessed1.csv', index=False)

## Step 3: Organizing the extracted information into a structured format, including placeholders where data is missing

Loading CSV file

In [13]:
df = pd.read_csv('DataProcessed1.csv')
df.head()

Unnamed: 0,Free_Text,Extracted_Entities,PERSON,KTP,AGE,GENDER,ADDRESS,CITY,PHONE,BLOOD_TYPE,MEDICAL_RECORD_NUMBER,DATE,DOCTOR_NAME,DOCTOR_KTP,PRIMARY_COMPLAINT,ADDITIONAL_COMPLAINT,RECOMMENDATION,REFERRAL,NOTES
0,"Pasien Rahmat Widodo, KTP 3276010101010004, 52...","{'PERSON': ['Rahmat Widodo'], 'KTP': ['3276010...",['Rahmat Widodo'],['3276010101010004'],['52'],['perempuan'],['Jl. Imam Bonjol No. 78'],['Surabaya'],['081987654321'],['A'],['727106'],['10/09/2023'],['Dr. Farida Lestari'],['3173010202020004'],['hipertensi'],['sesak napas'],['mengubah pola makan'],['Dr. Hasan'],"['keluhan masih tetap dirasakan, perlu observa..."
1,"Pasien Wawan Purnomo, KTP 3276010101010009, 62...","{'PERSON': ['Wawan Purnomo'], 'KTP': ['3276010...",['Wawan Purnomo'],['3276010101010009'],['62'],['perempuan'],['Jl. Sudirman No. 56'],['Bandung'],['081234567890'],['B'],['870412'],['11/09/2023'],['Dr. Andika Pratama'],['3173010202020003'],['migrain'],['nyeri kepala'],['suplemen zat besi'],['Dr. Hasan'],['direkomendasikan untuk menjaga pola makan da...
2,"Pasien Lestari Nugroho, KTP 3276010101010007, ...","{'PERSON': ['Lestari Nugroho'], 'KTP': ['32760...",['Lestari Nugroho'],['3276010101010007'],['41'],['perempuan'],['Jl. Imam Bonjol No. 78'],['Surabaya'],['081987654321'],['AB'],['151697'],['27/09/2023'],['Dr. Sumarno Hadi'],['3173010202020007'],['hipertensi'],['pusing'],['obat antihipertensi'],['Dr. Hasan'],['direkomendasikan untuk menjaga pola makan da...
3,"Pasien Yudi Raharjo, KTP 3276010101010004, 20 ...","{'PERSON': ['Yudi Raharjo'], 'KTP': ['32760101...",['Yudi Raharjo'],['3276010101010004'],['20'],['laki-laki'],['Jl. Sudirman No. 56'],['Bandung'],['081987654321'],['A'],['583165'],['3/08/2023'],['Dr. Wahyu Nugraha'],['3173010202020010'],['infeksi saluran pernapasan'],['nyeri kepala'],['obat antihipertensi'],['Dr. Dian'],['perlu dilakukan tes darah ulang']
4,"Pasien Wawan Purnomo, KTP 3276010101010004, 45...","{'PERSON': ['Wawan Purnomo'], 'KTP': ['3276010...",['Wawan Purnomo'],['3276010101010004'],['45'],['perempuan'],['Jl. Diponegoro No. 12'],['Medan'],['081234567890'],['A'],['522131'],['25/08/2023'],['Dr. Sari Utami'],['3173010202020009'],['migrain'],['sakit pinggang'],['Unknown'],['Dr. Dian'],['diminta mengurangi konsumsi garam dan lemak']


Entering in currently available data

Patient

In [14]:
def create_fhir_patient_resource(info):
    patient = {
        "resourceType": "Patient",
        "identifier": [
            {
                "use": "official",
                "system": "https://fhir.kemkes.go.id/id/nik",
                "value": info.get('patient_ktp', 'unknown')
            }
        ],
        "active": True,
        "name": [
            {
                "use": "official",
                "text": info.get('patient_name', 'unknown')
            }
        ],
        "telecom": [
            {
                "system": "phone",
                "value": info.get('patient_phone', 'unknown'),
                "use": "mobile"
            }
        ],
        "gender": "male" if info.get('patient_gender') == 'laki-laki' else "female",
        "birthDate": f"{2024 - int(info.get('patient_age', 0))}-01-01" if info.get('patient_age') else "unknown",
        "address": [
            {
                "use": "home",
                "line": [info.get('patient_address', 'unknown')],
                "city": info.get('patient_city', 'unknown'),
                "state": "unknown",  # Placeholder, update if available
                "postalCode": "unknown",  # Placeholder, update if available
                "country": "Indonesia"  # Placeholder, update if available
            }
        ],
        "maritalStatus": {
            "text": "unknown"
        },
        "communication": [
            {
                "language": {
                    "text": "unknown"
                }
            }
        ]
    }
    return patient

Practitioner

In [15]:
def create_fhir_practitioner_resource(info):
    practitioner = {
        "resourceType": "Practitioner",
        "identifier": [
            {
                "use": "official",
                "system": "https://fhir.kemkes.go.id/id/nik",
                "value": info.get('doctor_ktp', 'unknown')
            }
        ],
        "active": True,
        "name": [
            {
                "use": "official",
                "text": info.get('doctor_name', 'unknown')
            }
        ],
        "telecom": [
            {
                "system": "phone",
                "value": "unknown",
                "use": "work"
            },
            {
                "system": "email",
                "value": "unknown",
                "use": "work"
            }
        ],
        "address": [
            {
                "use": "work",
                "line": ["unknown"],
                "city": "unknown",
                "state": "unknown",
                "postalCode": "unknown",
                "country": "unknown"
            }
        ],
        "qualification": [
            {
                "identifier": [
                    {
                        "system": "https://fhir.kemkes.go.id/id/qualification",
                        "value": "unknown"
                    }
                ],
                "code": {
                    "coding": [
                        {
                            "system": "http://terminology.hl7.org/CodeSystem/v2-0360",
                            "code": "MD",
                            "display": "Doctor of Medicine"
                        }
                    ]
                },
                "period": {
                    "start": "unknown",
                    "end": "unknown"
                },
                "issuer": {
                    "display": "unknown"
                }
            }
        ]
    }
    return practitioner

Encounter

In [16]:
def create_fhir_encounter_resource(info):
    encounter = {
        "resourceType": "Encounter",
        "status": "finished",
        "class": {
            "system": "http://terminology.hl7.org/CodeSystem/v3-ActCode",
            "code": "AMB",
            "display": "ambulatory"
        },
        "subject": {
            "reference": f"Patient/{info.get('patient_ktp', 'unknown')}"
        },
        "participant": [
            {
                "individual": {
                    "reference": f"Practitioner/{info.get('doctor_ktp', 'unknown')}"
                }
            }
        ],
        "period": {
            "start": info.get('patient_date_of_entry', 'unknown')
        },
        "reasonCode": [
            {
                "text": info.get('patient_primary_complaint', 'unknown')
            }
        ],
        "diagnosis": [
            {
                "condition": {
                    "reference": "Condition/unknown"
                },
                "use": {
                    "coding": [
                        {
                            "system": "http://terminology.hl7.org/CodeSystem/diagnosis-role",
                            "code": "AD",
                            "display": "Admission diagnosis"
                        }
                    ]
                }
            }
        ],
        "note": [
            {
                "text": info.get('additional_notes', 'unknown')
            }
        ]
    }
    return encounter

Observation

In [17]:
def create_fhir_observation_resource(info):
    observation = {
        "resourceType": "Observation",
        "status": "final",
        "category": [
            {
                "coding": [
                    {
                        "system": "http://terminology.hl7.org/CodeSystem/observation-category",
                        "code": "vital-signs",
                        "display": "Vital Signs"
                    }
                ]
            }
        ],
        "code": {
            "coding": [
                {
                    "system": "http://loinc.org",
                    "code": "85354-9",
                    "display": "Blood pressure panel with all children optional"
                }
            ],
            "text": "Blood Pressure"
        },
        "subject": {
            "reference": f"Patient/{info.get('patient_ktp', 'unknown')}"
        },
        "effectiveDateTime": info.get('patient_date_of_entry', 'unknown'),
        "performer": [
            {
                "reference": f"Practitioner/{info.get('doctor_ktp', 'unknown')}"
            }
        ],
        "valueString": info.get('patient_primary_complaint', 'unknown'),
        "component": [
            {
                "code": {
                    "coding": [
                        {
                            "system": "http://loinc.org",
                            "code": "55284-4",
                            "display": "Blood pressure systolic & diastolic"
                        }
                    ]
                },
                "valueQuantity": {
                    "value": 120,
                    "unit": "mmHg",
                    "system": "http://unitsofmeasure.org",
                    "code": "mm[Hg]"
                }
            }
        ],
        "interpretation": [
            {
                "coding": [
                    {
                        "system": "http://terminology.hl7.org/CodeSystem/v3-ObservationInterpretation",
                        "code": "N",
                        "display": "Normal"
                    }
                ]
            }
        ],
        "note": [
            {
                "text": info.get('additional_notes', 'unknown')
            }
        ]
    }
    return observation

Applying functions to data

In [18]:
def create_fhir_resources(info):
    patient = create_fhir_patient_resource(info)
    practitioner = create_fhir_practitioner_resource(info)
    encounter = create_fhir_encounter_resource(info)
    observation = create_fhir_observation_resource(info)
    
    fhir_resources = {
        "Patient": patient,
        "Practitioner": practitioner,
        "Encounter": encounter,
        "Observation": observation
    }
    
    return json.dumps(fhir_resources, indent=2)

In [None]:
# Example usage
info = {
    "patient_name": "Rahmat Widodo",
    "patient_ktp": "3276010101010004",
    "patient_age": "52",
    "patient_gender": "perempuan",
    "patient_address": "Jl. Imam Bonjol No. 78",
    "patient_city": "Surabaya",
    "patient_phone": "081987654321",
    "patient_medical_records_id": "MRN123456",
    "patient_date_of_entry": "2023-10-01T08:00:00+07:00",
    "doctor_name": "Dr. Siti Nurhaliza",
    "doctor_ktp": "3276010101010005",
    "patient_primary_complaint": "Headache",
    "additional_notes": "Patient reported severe headache for the past 3 days."
}

fhir_json = create_fhir_resources(info)
print(fhir_json)

ValueError: invalid literal for int() with base 10: "['52']"

## Step 4: Creating FHIR compliant JSON representations for Patient, Practitioner, Encounter, and Condition

## Step 5: Printing FHIR Resources