In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Load the raw healthcare records from the CSV export into a DataFrame.

dataset = "../datasets/healthcare_dataset.csv"
df = pd.read_csv(filepath_or_buffer=dataset)

#### Cleaning

In [3]:
# Normalise the casing of patient names, e.g. "LesLie TErRy" becomes "Leslie Terry".

title_cased_names = df["Name"].str.title()
df["Name"] = title_cased_names

In [4]:
# Strip surrounding whitespace from the column names and from every
# text (object-dtype) cell.

df.columns = df.columns.str.strip()

for col in df.select_dtypes(include="object").columns:
    # astype(str) turns a missing value into the literal string "nan", which
    # the fillna("Unknown") imputation that follows can no longer detect.
    # Remember which cells were present, strip the stringified values, then
    # blank the originally-missing cells back to NaN.
    is_present = df[col].notna()
    df[col] = df[col].astype(str).str.strip().where(is_present)

In [5]:
# Impute missing values one column at a time: float32/64 and int32/64 columns
# receive their own median, every other column receives the placeholder "Unknown".
numeric_dtype_names = ("float32", "int32", "float64", "int64")

for column in df.columns:
    values = df[column]
    is_numeric = any(values.dtype == dtype_name for dtype_name in numeric_dtype_names)
    replacement = values.median() if is_numeric else "Unknown"
    df[column] = values.fillna(replacement)

In [6]:
# Parse both date columns into datetime64 values. errors="coerce" turns any
# value that cannot be parsed as a date into NaT instead of raising.

for date_column in ("Date of Admission", "Discharge Date"):
    df[date_column] = pd.to_datetime(df[date_column], errors="coerce")

#### Dataset Normalization

In [7]:
def _with_surrogate_key(frame, key_columns, id_column):
    """Return the distinct rows of `key_columns` with a 1-based integer ID appended.

    Args:
        frame: DataFrame to take the columns from.
        key_columns: list of column names that identify one entity.
        id_column: name of the new ID column (values 1..number of distinct rows).

    Returns:
        A new DataFrame with a fresh 0..n-1 index, the key columns, and `id_column`.
    """
    distinct = frame[key_columns].drop_duplicates().reset_index(drop=True)
    distinct[id_column] = range(1, len(distinct) + 1)
    return distinct


# 1. Patients table
patients = _with_surrogate_key(
    df, ["Name", "Age", "Gender", "Blood Type", "Medical Condition"], "PatientID"
)

# 2. Doctors table
doctors = _with_surrogate_key(df, ["Doctor"], "DoctorID")

# 3. Hospitals table
hospitals = _with_surrogate_key(df, ["Hospital"], "HospitalID")

# 4. InsuranceProviders table
insurers = _with_surrogate_key(df, ["Insurance Provider"], "InsuranceID")

# Remove ID columns left over from an earlier run of this cell; errors="ignore"
# makes this a no-op on a fresh run where none of them exist yet.
stale_id_columns = ["PatientID", "DoctorID", "HospitalID", "InsuranceID", "AdmissionID"]
df = df.drop(columns=stale_id_columns, errors="ignore")

# 5. Admissions table
# Map IDs: left-join each lookup table onto df using its natural key column(s),
# which adds that table's ID column to every row of df.
id_lookups = [
    (patients, ["Name", "Age", "Gender", "Blood Type", "Medical Condition"]),
    (doctors, "Doctor"),
    (hospitals, "Hospital"),
    (insurers, "Insurance Provider"),
]
for lookup_table, join_keys in id_lookups:
    df = df.merge(lookup_table, on=join_keys, how="left")

# Columns that together describe one admission record.
admissions_cols = [
    "PatientID", "DoctorID", "HospitalID", "InsuranceID",
    "Date of Admission", "Discharge Date", "Room Number", "Admission Type", "Billing Amount",
]

# Keep one row per distinct combination of those columns and number them from 1.
unique_admissions = df[admissions_cols].drop_duplicates()
admissions = unique_admissions.reset_index(drop=True)
admissions["AdmissionID"] = range(1, len(admissions) + 1)

# Merge AdmissionID back to df to use for medications and tests
df = df.merge(admissions, how="left", on=admissions_cols)

# 6. Medications table: distinct (admission, medication) pairs, numbered from 1.
medications = (
    df[["AdmissionID", "Medication"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(MedicationID=lambda table: range(1, len(table) + 1))
)

# 7. TestResults table: distinct (admission, test result) pairs, numbered from 1.
test_results = (
    df[["AdmissionID", "Test Results"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(TestResultID=lambda table: range(1, len(table) + 1))
)

#### Loading Data

In [8]:
# Normalized tables to export, keyed by the output file stem.
tables = {
    "patients": patients,
    "doctors": doctors,
    "hospitals": hospitals,
    "insurers": insurers,
    "admissions": admissions,
    "medications": medications,
    "test_results": test_results
}

table_output_path = "../output"

# DataFrame.to_csv does not create missing directories and raises OSError if
# the target folder is absent, so create it first (no-op when it already exists).
output_dir = Path(table_output_path)
output_dir.mkdir(parents=True, exist_ok=True)

# Write each table to <output_dir>/<table name>.csv without the index column.
for tname, data in tables.items():
    data.to_csv(output_dir / f"{tname}.csv", index=False)