In [1]:
import pandas as pd
from faker import Faker
from tqdm import tqdm
import random

In [2]:
# Dataset dimensions: how many hospitals and patients to synthesize.
num_hospitals = 5
num_patients = 100_000

# Seed both random sources used in this notebook -- the stdlib `random`
# module and Faker -- so repeated runs draw the same sequences.
random.seed(42)
Faker.seed(42)

# Single Faker instance shared by all of the table-generation cells.
fake = Faker()


Hospital Table  
<small><i>Columns: hospital_id, hospital_name</i></small>

In [3]:
# Hospital table: sequential ids 1..num_hospitals, each paired with a
# Faker-generated company name used as the hospital name.
hospital_ids = list(range(1, num_hospitals + 1))
hospital_names = [fake.company() for _ in hospital_ids]

hospital_data = {"hospital_id": hospital_ids, "hospital_name": hospital_names}
hospitals_df = pd.DataFrame(hospital_data)

In [4]:
# Bare last expression: Jupyter renders the frame with its rich table repr
# instead of the plain-text dump produced by print().
hospitals_df

   hospital_id                    hospital_name
0            1  Rodriguez, Figueroa and Sanchez
1            2                        Doyle Ltd
2            3    Mcclain, Miller and Henderson
3            4                   Davis and Sons
4            5      Guzman, Hoffman and Baldwin


Patient Table  
<small><i>Columns: patient_id, hospital_id, patient_name, dob, admission_datetime,
discharge_datetime</i></small>

In [5]:
PATIENT_COLUMNS = [
    "patient_id", "hospital_id", "patient_name",
    "dob", "admission_datetime", "discharge_datetime",
]


def _generate_patient_row(patient_id):
    """Build one raw patient record as a list ordered like PATIENT_COLUMNS.

    The hospital is picked uniformly from 1..num_hospitals. Admission is drawn
    between two years ago and one day ago, and discharge between the admission
    timestamp and now, so discharge never precedes admission.

    NOTE: the '-2y' / '-1d' / 'now' offsets and the age bounds are relative to
    the time the cell runs, so exact dates shift between runs on different
    days even though the random seeds are fixed.
    """
    assigned_hospital = random.randint(1, num_hospitals)
    full_name = fake.name()
    birth_date = fake.date_of_birth(minimum_age=1, maximum_age=99)
    admitted_at = fake.date_time_between(start_date='-2y', end_date='-1d')
    discharged_at = fake.date_time_between(start_date=admitted_at, end_date='now')
    return [patient_id, assigned_hospital, full_name, birth_date, admitted_at, discharged_at]


# Patient ids are sequential, starting at 1; tqdm reports generation progress.
patients = [
    _generate_patient_row(patient_id)
    for patient_id in tqdm(range(1, num_patients + 1))
]

patients_df = pd.DataFrame(patients, columns=PATIENT_COLUMNS)

100%|██████████| 100000/100000 [00:12<00:00, 7956.59it/s]


In [6]:
# Spot-check the first raw record (a plain list, before DataFrame conversion).
print(patients[0])

[1, 1, 'Jennifer Cole', datetime.date(1963, 5, 9), datetime.datetime(2025, 3, 19, 7, 2, 57), datetime.datetime(2025, 3, 20, 13, 20, 8)]


In [7]:
# Bare last expression: Jupyter renders a truncated, rich table view of the
# frame instead of the wrapped plain-text dump produced by print().
patients_df

       patient_id  hospital_id       patient_name         dob  \
0               1            1      Jennifer Cole  1963-05-09   
1               2            1  Caitlin Henderson  1963-04-24   
2               3            3    Gabrielle Davis  1938-08-27   
3               4            2        Renee Blair  1998-06-22   
4               5            2      Edward Fuller  1974-09-16   
...           ...          ...                ...         ...   
99995       99996            5       Brian Snyder  1936-04-12   
99996       99997            1      Cheryl Murray  1934-02-02   
99997       99998            4      Nicole Fowler  1986-05-29   
99998       99999            5    Stephen Pittman  1980-03-13   
99999      100000            2    Peggy Morgan MD  1982-03-25   

       admission_datetime  discharge_datetime  
0     2025-03-19 07:02:57 2025-03-20 13:20:08  
1     2023-10-28 08:31:11 2024-04-12 14:15:32  
2     2024-04-03 10:01:57 2024-12-26 13:49:54  
3     2023-10-04 13:06:00 2

Diagnosis Table  
<small><i>Columns: diagnosis_id, patient_id, diagnosis_name</i></small>

In [8]:
diagnoses_list = ["Flu", "Diabetes", "Hypertension", "Asthma", "Covid-19", "Migraine"]

# Every patient gets exactly two diagnosis rows, each with a fresh UUID.
# The condition is drawn independently per row, so the same patient can
# receive the same diagnosis name twice.
diagnoses = [
    [fake.uuid4(), patient_id, random.choice(diagnoses_list)]
    for patient_id in tqdm(patients_df['patient_id'])
    for _ in range(2)
]

diagnoses_df = pd.DataFrame(
    diagnoses,
    columns=['diagnosis_id', 'patient_id', 'diagnosis_name'],
)

100%|██████████| 100000/100000 [00:00<00:00, 140244.75it/s]


In [9]:
# Bare last expression: Jupyter renders a truncated, rich table view of the
# frame instead of the plain-text dump produced by print().
diagnoses_df

                                diagnosis_id  patient_id diagnosis_name
0       9b524d60-68c1-4521-98bd-63be25fc386b           1   Hypertension
1       a4de63ed-a8c5-4d47-bde5-ebca183645ce           1       Diabetes
2       9b1b4720-ddc0-447b-b6c3-72967d14c162           2       Diabetes
3       e9c7f5d1-8bd8-4fb4-a4a1-da5a3ffbbb5b           2         Asthma
4       86f3dd82-cf14-46d5-906a-146341555699           3            Flu
...                                      ...         ...            ...
199995  98f8567f-9516-4496-8d24-a50590cc3e98       99998       Migraine
199996  b81233d8-fd85-4ece-a171-21b7e98b55e3       99999            Flu
199997  b4777d31-e662-437f-835f-019c88c0797f       99999            Flu
199998  1e2b423c-7ba0-4292-a106-8d0838cdb48e      100000       Covid-19
199999  c6673946-7f3e-43f3-9ef9-7f795ed5770e      100000   Hypertension

[200000 rows x 3 columns]


Treatment Table  
<small><i>Columns: treatment_id, patient_id, medicine_name, dose_time, duration</i></small>

In [10]:
medicines = ["Paracetamol", "Ibuprofen", "Aspirin", "Amoxicillin", "Metformin", "Atorvastatin"]


def _treatment_rows(patient_ids, per_patient=5):
    """Yield raw treatment records, `per_patient` rows for each patient id.

    Each row is [treatment_id, patient_id, medicine_name, dose_time, duration]:
    a fresh UUID, the patient id, a medicine drawn at random from `medicines`,
    a Faker time-of-day string, and an integer duration in 1..30.
    """
    for patient_id in patient_ids:
        for _ in range(per_patient):
            yield [
                fake.uuid4(),
                patient_id,
                random.choice(medicines),
                fake.time(),
                random.randint(1, 30),
            ]


# tqdm wraps the id column, so progress is reported per patient.
treatments = list(_treatment_rows(tqdm(patients_df['patient_id'])))

treatments_df = pd.DataFrame(
    treatments,
    columns=['treatment_id', 'patient_id', 'medicine_name', 'dose_time', 'duration'],
)



100%|██████████| 100000/100000 [00:06<00:00, 15409.86it/s]


In [11]:
# Bare last expression: Jupyter renders a truncated, rich table view of the
# frame instead of the wrapped plain-text dump produced by print().
treatments_df

                                treatment_id  patient_id medicine_name  \
0       441f3cad-cdb7-4466-8c87-343004384fc3           1   Paracetamol   
1       af34270a-1c6c-4d7d-9e33-fcc1219fb2b4           1       Aspirin   
2       c45e68b8-64ae-45c6-bc01-2b0c760f9b06           1       Aspirin   
3       06218d83-4c03-4c96-bfd4-a051bb668b7b           1     Metformin   
4       357527c0-3599-4ba1-aa4d-24572c643aaa           1   Paracetamol   
...                                      ...         ...           ...   
499995  f6cef84c-a81b-40f5-a304-aa1fb3a8bfae      100000     Ibuprofen   
499996  ae2a4ae1-26b0-4f7b-a655-7d26bdb08a88      100000   Amoxicillin   
499997  3d49e6fd-7f0d-43ec-bae5-543cb9bf70df      100000     Metformin   
499998  b7261a5b-e90c-4b58-a1fe-1835f0a2d4a1      100000   Paracetamol   
499999  1cfc5208-11f5-4712-a153-7cfcd2c9882e      100000   Paracetamol   

       dose_time  duration  
0       15:35:21         9  
1       21:35:56        26  
2       21:45:07        

Billing Table  
<small><i>Columns: bill_id, patient_id, bill_amount, payment_mode (cash/credit)</i></small>

In [12]:
def _make_bill(patient_id):
    """Build one raw billing record: [bill_id, patient_id, bill_amount, payment_mode].

    The amount is drawn uniformly from 1000..5000 and rounded to 2 decimals;
    the payment mode is either "cash" or "credit".
    """
    bill_id = fake.uuid4()
    amount = round(random.uniform(1000, 5000), 2)
    mode = random.choice(["cash", "credit"])
    return [bill_id, patient_id, amount, mode]


# Exactly one bill per patient.
billing = [_make_bill(patient_id) for patient_id in tqdm(patients_df['patient_id'])]

billing_df = pd.DataFrame(
    billing,
    columns=['bill_id', 'patient_id', 'bill_amount', 'payment_mode'],
)

100%|██████████| 100000/100000 [00:00<00:00, 150657.20it/s]


In [13]:
# Bare last expression: Jupyter renders a truncated, rich table view of the
# frame instead of the wrapped plain-text dump produced by print().
billing_df

                                    bill_id  patient_id  bill_amount  \
0      4c3f55f4-10e1-4ef9-86a2-41af029d5292           1      3671.99   
1      a37192f5-ed71-43f6-bb44-ebd443fe47d8           2      2331.78   
2      a8598dbe-d685-460b-bc3a-896e4ace792d           3      2221.35   
3      d52667ce-39b2-45f5-87e8-1ac8426d430b           4      1740.24   
4      73747ca3-592f-4fa1-8e44-2e57e1e2746a           5      1237.87   
...                                     ...         ...          ...   
99995  5f041181-a879-4cbf-a9cd-405183afb991       99996      3366.48   
99996  c1ca1c79-09dd-47d3-90d8-5465bf8b2970       99997      1071.78   
99997  df856f16-d2fb-4b01-8e16-eb15ffbe1c6b       99998      3897.63   
99998  0b4a8c71-7222-4447-8c3a-bc9aec559326       99999      3120.30   
99999  8cf0faf4-acbb-4bea-82ec-c68e7fafe1d4      100000      4982.36   

      payment_mode  
0           credit  
1           credit  
2             cash  
3           credit  
4             cash  
...      

In [None]:
# Write each generated table to its own CSV file in the working directory;
# index=False keeps the pandas row index out of the files.
csv_exports = {
    "hospitals.csv": hospitals_df,
    "patients.csv": patients_df,
    "diagnoses.csv": diagnoses_df,
    "treatments.csv": treatments_df,
    "billing.csv": billing_df,
}

for csv_name, frame in csv_exports.items():
    frame.to_csv(csv_name, index=False)