This file contains the preprocessing and cleaning steps for inpatient and outpatient records. The goal is to transform raw data from multiple source files into a consistent, clean format ready for further analysis.

In [1]:
import pandas as pd
import numpy as np
import os
from glob import glob

In [None]:
# --- IMPORTANT NOTE ---
# Raw data files are not included in this repository due to patient privacy.
# The code below only demonstrates the data loading pipeline.

def load_data(file_paths):
    """Load every Excel file in ``file_paths`` and stack them into one DataFrame.

    Args:
        file_paths: Iterable of paths to Excel workbooks. Each one is read
            with ``pd.read_excel`` using the ``openpyxl`` engine.

    Returns:
        A single DataFrame holding the rows of all files, concatenated in the
        order the paths were given, with a fresh 0..n-1 index.

    Raises:
        ValueError: If ``file_paths`` is empty. This usually means the glob
            pattern or base folder did not match any file.
    """
    # Materialize once so generators are accepted and emptiness can be checked.
    file_paths = list(file_paths)
    if not file_paths:
        # pd.concat([]) would also raise ValueError, but with the opaque
        # message "No objects to concatenate"; point at the likely cause.
        raise ValueError(
            "No input files to load: check the base path and the glob pattern."
        )

    frames = [pd.read_excel(path, engine='openpyxl') for path in file_paths]
    return pd.concat(frames, ignore_index=True)

# glob() returns matches in arbitrary order. The paths are sorted so the row
# order of each combined frame is reproducible across runs and machines.

# Load inpatient data (2021-2024)
base_path_inap = 'path/to/your/rawat_inap_folder' # Change to your absolute path
files_inap = sorted(glob(os.path.join(base_path_inap, 'RwtInap*.xlsx')))
df_rawatinap = load_data(files_inap)

# Load outpatient data (2021-2023)
base_path_jalan = 'path/to/your/rawat_jalan_folder' # Change to your absolute path
files_jalan = sorted(glob(os.path.join(base_path_jalan, 'Rawat_jalan_*.xlsx')))
df_rawatjalan = load_data(files_jalan)

# Report how many rows each combined frame contains.
print(f"Inpatient data loaded with {df_rawatinap.shape[0]} rows.")
print(f"Outpatient data loaded with {df_rawatjalan.shape[0]} rows.")

In [None]:
# Rename the raw column headers to readable snake_case names.
rename_inap = {
    'Sex': 'gender',
    'RM': 'rm_id',
    'Tgl. Admisi': 'admission_date',
    'Tgl. Keluar': 'discharge_date',
    'TINDAKAN': 'procedure',
    'Diagnosis Utama': 'main_diagnosis',
    'ICD-10': 'icd_10',
    'BAYAR': 'payment_method',
    'DISPOSISI': 'disposition_status',
    'LOS': 'los',
    'Arti ICD': 'icd_desc',
    'Umur': 'age',
    'DOKTER': 'doctor',
}
df_rawatinap = df_rawatinap.rename(columns=rename_inap)

# Drop columns that are not needed downstream. errors='ignore' tolerates
# files in which some of these columns are absent.
columns_to_drop_inap = [
    'Urut', 'Klp Umur', 'Klp Umur.1', 'ICD 9', 'SEBAB LUAR', 'DTD', 'DTD+',
    'Srvlc', 'KELURAHAN', 'KOTA/KAB', 'Tahun', 'Bulan', 'Hari',
]
df_rawatinap = df_rawatinap.drop(columns=columns_to_drop_inap, errors='ignore')

# Standardize gender codes. Any value other than 'L' or 'P' (including
# missing values) becomes NaN after the mapping.
df_rawatinap['gender'] = df_rawatinap['gender'].map({'L': 'Male', 'P': 'Female'})

# Fill missing values by assigning the result back to the column.
# `df[col].fillna(..., inplace=True)` operates on an intermediate object:
# it emits a FutureWarning on pandas 2.x and leaves the frame unchanged
# once Copy-on-Write is active.
df_rawatinap['doctor'] = df_rawatinap['doctor'].fillna('Unknown Doctor')
df_rawatinap['procedure'] = df_rawatinap['procedure'].fillna('No Procedure')
# Create 'age_group' column
def categorize_age(age):
    """Map an age to a coarse age-group label.

    Args:
        age: Age as a number, or a missing value (``None`` / NaN).

    Returns:
        ``'Baby'`` for age < 1, ``'Toddler'`` for 1 <= age < 5, ``'Child'``
        for 5 <= age < 12, ``'Adolescent'`` for 12 <= age < 18, ``'Adult'``
        for 18 <= age < 60, ``'Elderly'`` for age >= 60, and ``'Unknown'``
        when the age is missing.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # age would fall through the whole chain and be labelled 'Elderly'.
    if pd.isna(age):
        return 'Unknown'
    if age < 1:
        return 'Baby'
    if age < 5:
        return 'Toddler'
    if age < 12:
        return 'Child'
    if age < 18:
        return 'Adolescent'
    if age < 60:
        return 'Adult'
    return 'Elderly'
# Add both derived columns in one step: the age-group label computed per row
# from 'age', and a constant tag marking these rows as inpatient records.
df_rawatinap = df_rawatinap.assign(
    age_group=df_rawatinap['age'].apply(categorize_age),
    patient_type='Inpatient',
)

In [None]:
# Standardize column names. errors='ignore' leaves the frame untouched for
# any source header that is not present.
rename_jalan = {
    'TANGGAL': 'date',
    'JK': 'gender',
    'KODE x': 'code1',
    'KODE TDK1': 'procedure1',
    'NO MR': 'rm_id',
    'CR BYR': 'payment_method',
    'POLI': 'poly',
    'DPJP': 'doctor',
}
df_rawatjalan = df_rawatjalan.rename(columns=rename_jalan, errors='ignore')

# Drop columns that are not needed downstream (absent ones are skipped).
columns_to_drop_jalan = [
    'NO', 'UMUR_TAHUN', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22',
    'dokter2', 'BULAN', 'KODE GABUNG', 'TDK GABUNG',
]
df_rawatjalan = df_rawatjalan.drop(columns=columns_to_drop_jalan, errors='ignore')

# Fill missing diagnosis codes and procedures with explicit placeholders.
kode_cols = ['code1', 'code2', 'code3', 'code4', 'code5', 'code6']
procedure_cols = ['procedure1', 'procedure2', 'procedure3', 'procedure4', 'procedure5', 'procedure6']

# Only 'code1' and 'procedure1' are created by the rename above; the other
# columns must already exist under these names in the raw files. Selecting a
# missing label with df[list] raises KeyError, so restrict each fill to the
# columns that are actually present.
# NOTE(review): confirm the raw headers for code2..code6 / procedure2..procedure6.
present_kode_cols = [col for col in kode_cols if col in df_rawatjalan.columns]
if present_kode_cols:
    df_rawatjalan[present_kode_cols] = (
        df_rawatjalan[present_kode_cols].fillna('No Diagnosis Code')
    )

present_procedure_cols = [col for col in procedure_cols if col in df_rawatjalan.columns]
if present_procedure_cols:
    df_rawatjalan[present_procedure_cols] = (
        df_rawatjalan[present_procedure_cols].fillna('No Procedure')
    )

# Standardize gender codes. Any value other than 'LK' or 'PR' (including
# missing values) becomes NaN after the mapping.
df_rawatjalan['gender'] = df_rawatjalan['gender'].map({'LK': 'Male', 'PR': 'Female'})
df_rawatjalan['patient_type'] = 'Outpatient'

In [None]:
# Preview the first 5 rows of each cleaned frame, each under its own heading.
print("Clean Inpatient Data:", df_rawatinap.head(), sep="\n")
print("\nClean Outpatient Data:", df_rawatjalan.head(), sep="\n")

# Saving clean dataframes for subsequent notebooks