# Data Preparation

<hr>


In [1]:
# Imports
import ast
import json
import logging
import random
import re
import sqlite3
import time
import uuid
from datetime import date, datetime, timedelta
from sqlite3 import OperationalError

import numpy as np
import pandas as pd
import requests
from faker import Faker
from IPython.display import display
from passlib.context import CryptContext

In [2]:
# Connect to (or create) the SQLite database file
conn = sqlite3.connect("../data/vaccination_db.sqlite")

try:
    # Read the schema file; it is executed below as one SQL script
    with open("../data/schema.sql", "r") as f:
        sql_script = f.read()

    cursor = conn.cursor()
    try:
        cursor.executescript(sql_script)
    except OperationalError as exc:
        # Best-effort: the notebook keeps going, but the reason is reported
        # instead of being silently discarded.
        # NOTE(review): presumably this fires when the tables already exist
        # from an earlier run; confirm against schema.sql.
        print(f"Schema script was not fully applied: {exc}")
    conn.commit()
finally:
    # Always release the connection, even if the schema file cannot be read
    conn.close()

In [3]:
# Seed every random source used in this notebook so reruns are reproducible
fake = Faker()
fake.seed_instance(42)
np.random.seed(42)
# The stdlib `random` module is used further down (NRIC serials, candidate
# postal codes, booking-slot sampling), so it needs its own seed as well.
random.seed(42)

<hr>

### **OneMap API**

API Document Link: https://www.onemap.gov.sg/apidocs/search


In [4]:
def get_address_info(text_input: str) -> dict | None:
    """Return the first OneMap search hit for a postal code or place name.

    Args:
        text_input: Search value sent to OneMap (this notebook passes
            6-digit postal codes and clinic names).

    Returns:
        A dict with the keys ``postal_code``, ``address``, ``latitude`` and
        ``longitude`` taken from the first search result, or ``None`` when
        the response status is not 200 or the search found nothing.

    Raises:
        requests.exceptions.RequestException: On network failures, including
            the request exceeding the timeout.
    """
    url = "https://www.onemap.gov.sg/api/common/elastic/search"
    # Let requests URL-encode the query so spaces, '&', '@' or '+' in clinic
    # names cannot corrupt the query string.
    params = {
        "searchVal": text_input,
        "returnGeom": "Y",
        "getAddrDetails": "Y",
        "pageNum": 1,
    }
    # NOTE(review): the token is redacted here; load the real one from an
    # environment variable rather than committing it to the notebook.
    headers = {"Authorization": "Bearer **********************"}
    # A timeout keeps one stalled connection from hanging the whole notebook
    response = requests.get(url, params=params, headers=headers, timeout=10)

    if response.status_code != 200:
        return None

    data = response.json()
    results = data.get("results") or []
    if data.get("found", 0) > 0 and results:  # Search value exists in OneMap
        result = results[0]
        return {
            "postal_code": result["POSTAL"],
            "address": result["ADDRESS"],
            "latitude": result["LATITUDE"],
            "longitude": result["LONGITUDE"],
        }
    return None  # Nothing found for this search value

<hr>

## Clinics Table

Note: The column `postal_code` will be later mapped to its corresponding `address_id`. It is stored first in order to populate the `Addresses` table.


### Polyclinics Data


In [5]:
# -----------------------------
# Read the saved institutions response (source of the polyclinic data)
# -----------------------------
with open("../data/get_instituitions_response.json", "r") as f:
    raw_institutions_text = f.read()

# Parse the JSON text into Python objects
institutions_data = json.loads(raw_institutions_text)

In [6]:
# -----------------------------
# Process JSON Polyclinic data
# -----------------------------
# Extract the list of health organizations
health_organizations = institutions_data["Result"]["HealthOrganizations"]

# Flatten any nested fields into "_"-separated columns
institutions_df = pd.json_normalize(health_organizations, sep="_")

# Sort the DataFrame by the 'Id' column
institutions_df = institutions_df.sort_values(by="Id").reset_index(drop=True)

# Filter for polyclinics (Count: 26); drop=True avoids a stray "index" column
polyclinics_df = institutions_df[
    institutions_df["GroupName"] == "Polyclinics"
].reset_index(drop=True)

# Data Check
print(f"Polyclinic Count: {len(polyclinics_df)}")
print(f"Unique Polyclinic Name Count: {len(polyclinics_df['Name'].unique())}")

# Keep only the name column, renamed to "name". `rename` returns a new frame,
# so the column assignments below never write into a slice of institutions_df.
include_columns = ["Name"]
polyclinics_df = polyclinics_df[include_columns].rename(columns={"Name": "name"})

# Get Postal Code using OneMap: exactly one API call per polyclinic, and a
# failed lookup becomes None instead of raising.
polyclinic_lookups = [get_address_info(name) for name in polyclinics_df["name"]]
polyclinics_df["postal_code"] = [
    lookup["postal_code"] if lookup else None for lookup in polyclinic_lookups
]

# Update incorrectly identified postal code(s)
update_postal_code = {"Sembawang Polyclinic": "756973"}
for polyclinic, postal_code in update_postal_code.items():
    polyclinics_df.loc[polyclinics_df["name"] == polyclinic, "postal_code"] = (
        postal_code
    )

# Add new column "type"
polyclinics_df["type"] = "polyclinic"

display(polyclinics_df)

Polyclinic Count: 26
Unique Polyclinic Name Count: 26


Unnamed: 0,name,postal_code,type
0,Ang Mo Kio Polyclinic,569666,polyclinic
1,Geylang Polyclinic,389707,polyclinic
2,Hougang Polyclinic,538829,polyclinic
3,Kallang Polyclinic,328263,polyclinic
4,Khatib Polyclinic,769567,polyclinic
5,Toa Payoh Polyclinic,319260,polyclinic
6,Sembawang Polyclinic,756973,polyclinic
7,Woodlands Polyclinic,738579,polyclinic
8,Yishun Polyclinic,768898,polyclinic
9,Bukit Batok Polyclinic,659164,polyclinic


### General Practitioners Data


In [7]:
# -------------------------------------
# Load clinic dataset from `data.gov.sg`
# -------------------------------------
# Define dataset ID and URL
dataset_id = "d_3cd840069e95b6a521aa5301a084b25a"
url = "https://data.gov.sg/api/action/datastore_search?resource_id=" + dataset_id

# Initialize an empty list to hold all records
all_clinic_records = []

# Set the initial offset and limit
limit = 100  # Max records per request
offset = 0

# Loop through the paginated data as the API only returns 100 records per request
while True:
    # The timeout stops a stalled connection from blocking the notebook forever
    response = requests.get(f"{url}&limit={limit}&offset={offset}", timeout=30)

    if response.status_code != 200:
        # Stop paging; whatever was collected so far is kept
        print(f"Request failed with status code {response.status_code}")
        break

    data = response.json()
    records = data["result"]["records"]
    all_clinic_records.extend(records)

    # A short page means this was the last one
    if len(records) < limit:
        break

    # Update offset for the next batch of records
    offset += limit

data_gov_clinic_df = pd.DataFrame(all_clinic_records)

In [8]:
# ------------
# Process data
# ------------
# Keep only the rows whose category marks a General Practitioner clinic
is_gp_row = data_gov_clinic_df["category"].isin(["gp", "non-chas-gp"])
gp_api_df = data_gov_clinic_df[is_gp_row]

# Sanity counts: names and postal codes repeat across rows
# (a clinic name may have several outlets across Singapore)
unique_gp_names = gp_api_df["name"].unique()
unique_gp_postal_codes = gp_api_df["postal_code"].unique()
print(f"GP Count: {len(gp_api_df)}")
print(f"Unique GP Name Count: {len(unique_gp_names)}")
print(f"Unique GP Postal Code Count: {len(unique_gp_postal_codes)}")

# Work on a copy restricted to the required columns, with a fresh 0..n-1 index
columns_to_extract = ["name", "postal_code"]
gps_df = gp_api_df.copy()[columns_to_extract].reset_index(drop=True)

# Tag every row with its clinic type
gps_df["type"] = "gp"

display(gps_df)

GP Count: 1532
Unique GP Name Count: 1486
Unique GP Postal Code Count: 1074


Unnamed: 0,name,postal_code,type
0,Cavenagh Medical Clinic And Home Care,269695,gp
1,Mei Ling Clinic,140158,gp
2,Rcmc Rivervale Crescent Medical Centre,541182,gp
3,360 Clinic,560407,gp
4,SKY Medical,079027,gp
...,...,...,...
1527,Trinity Medical@Shenton,068908,gp
1528,Saudara Clinic By A+J General Physicians,419741,gp
1529,Zara Clinic,680026,gp
1530,360 Clinic,640221,gp


### Combined Clinics Data


In [9]:
# Clinics Table: polyclinic rows first, GP rows after
total_clinic_count = len(gps_df) + len(polyclinics_df)
clinics_data = {
    # One UUID v4 (as a string) per clinic row
    "id": [str(uuid.uuid4()) for _ in range(total_clinic_count)],
}
for clinic_column in ("name", "postal_code", "type"):
    clinics_data[clinic_column] = (
        polyclinics_df[clinic_column].tolist() + gps_df[clinic_column].tolist()
    )
clinics_df = pd.DataFrame(clinics_data)

# Unique postal codes across all clinics, in order of first appearance
clinic_postal_code = clinics_df["postal_code"].drop_duplicates(keep="first").to_list()
print(f"Clinics `postal_code` Unique Count: {len(clinic_postal_code)}")
display(clinics_df)

Clinics `postal_code` Unique Count: 1096


Unnamed: 0,id,name,postal_code,type
0,1d2e196d-af66-48f1-8f2c-0e23ad0ecde3,Ang Mo Kio Polyclinic,569666,polyclinic
1,859fe92f-9c4b-4f3d-beda-b6c96e67887f,Geylang Polyclinic,389707,polyclinic
2,76da5477-df98-49d2-99d5-ba6d3900c3b7,Hougang Polyclinic,538829,polyclinic
3,1489b9d7-5c8b-4df8-a614-70e76eb35722,Kallang Polyclinic,328263,polyclinic
4,84b1aa83-156c-4eca-a9bf-ddc72424de1a,Khatib Polyclinic,769567,polyclinic
...,...,...,...,...
1553,1e618339-2b5e-4386-9696-d4b27874f4be,Trinity Medical@Shenton,068908,gp
1554,d3931e52-a261-4d65-a91a-9ecfcad14ee0,Saudara Clinic By A+J General Physicians,419741,gp
1555,7d44d74b-ef2b-40d0-a18d-8f5487e28371,Zara Clinic,680026,gp
1556,1e346932-a436-432a-b2c4-1d77b25df77a,360 Clinic,640221,gp


<hr>

## Users Table


### Functions


In [10]:
# NRIC shape: one prefix letter, seven digits, one trailing uppercase letter
nric_regex = r"[STFGM]\d{7}[A-Z]"


def generate_nric(dob: date) -> str:
    """Build a random NRIC-shaped string for someone born on ``dob``.

    The seven digits are the two-digit birth year followed by a random
    five-digit serial. The trailing letter is drawn at random; it is not a
    computed checksum. The prefix is "S" for birth years before 2000 and
    "T" otherwise.

    Args:
        dob: Date of birth; only its year is used.

    Returns:
        The generated string, which fully matches ``nric_regex``.

    Raises:
        ValueError: If the assembled string does not match ``nric_regex``.
    """
    birth_year = dob.year
    century_prefix = "T" if birth_year >= 2000 else "S"
    two_digit_year = f"{birth_year % 100:02d}"

    # Draw order (serial first, then letter) matters for seeded reproducibility
    serial_digits = f"{random.randint(10000, 99999)}"
    trailing_letter = random.choice("ABCDEFGHIJKLMNOPQRSTUVWXYZ")

    candidate = "".join(
        [century_prefix, two_digit_year, serial_digits, trailing_letter]
    )

    # Return only strings that pass the format check
    if re.fullmatch(nric_regex, candidate):
        return candidate

    raise ValueError(f"Generated NRIC '{candidate}' does not match expected pattern.")


def generate_email(first_name, last_name):
    """Compose ``<first>.<last>@<domain>`` with both name parts lowercased.

    The domain comes from the module-level Faker instance's
    ``free_email_domain()``.
    """
    local_part = ".".join((first_name.lower(), last_name.lower()))
    return "@".join((local_part, fake.free_email_domain()))


# Password hashing configuration
# Only let passlib log records of level ERROR or higher through
passlib_logger = logging.getLogger("passlib")
passlib_logger.setLevel(logging.ERROR)
pwd_context = CryptContext(schemes=["bcrypt"], deprecated="auto")


def hash_password(password: str) -> str:
    """Hash ``password`` with the module-level ``pwd_context`` (bcrypt scheme)."""
    hashed_password = pwd_context.hash(password)
    return hashed_password

### Generate Users Data


In [11]:
N = 5  # Number of sample users to generate

ids = []
enrolled_clinic_ids = []
nrics = []
first_names = []
last_names = []
emails = []
date_of_births = []
genders = []
# Sample plaintext passwords; reused cyclically if N exceeds the list length
passwords = ["password123", "abc123", "123", "admin", "guest"]

polyclinic_id_list = clinics_df[clinics_df["type"] == "polyclinic"]["id"].tolist()
gp_id_list = clinics_df[clinics_df["type"] == "gp"]["id"].tolist()

for _ in range(N):
    # Generate data
    gender = np.random.choice(["M", "F"], p=[0.5, 0.5])
    first_name = fake.first_name_male() if gender == "M" else fake.first_name_female()
    last_name = fake.last_name_male() if gender == "M" else fake.last_name_female()
    email = (
        first_name.lower() + "." + last_name.lower() + "@" + fake.email().split("@")[1]
    )
    date_of_birth = fake.date_of_birth(minimum_age=18, maximum_age=80)
    nric = generate_nric(date_of_birth)

    # Draw ONCE and compare against cumulative thresholds. Drawing a second
    # number in the elif would give GPs only 0.3 * 0.2 = 6% instead of 20%.
    enrolment_draw = np.random.rand()
    if enrolment_draw < 0.7:  # 70% chance for polyclinics
        enrolled_clinic_id = np.random.choice(polyclinic_id_list)
    elif enrolment_draw < 0.9:  # 20% chance for GPs
        enrolled_clinic_id = np.random.choice(gp_id_list)
    else:  # 10% chance for None (not enrolled)
        enrolled_clinic_id = None

    # Append to list
    ids.append(str(uuid.uuid4()))
    nrics.append(nric)
    first_names.append(first_name)
    last_names.append(last_name)
    emails.append(email)
    date_of_births.append(date_of_birth)
    genders.append(gender)
    enrolled_clinic_ids.append(enrolled_clinic_id)

# Users Table
users_data = {
    "id": ids,
    "enrolled_clinic_id": enrolled_clinic_ids,
    "nric": nrics,
    "first_name": first_names,
    "last_name": last_names,
    "email": emails,
    "date_of_birth": date_of_births,
    "gender": genders,
    # Exactly one hash per user, so every column has length N for any N
    "password": [
        hash_password(passwords[user_index % len(passwords)])
        for user_index in range(N)
    ],
}

users_df = pd.DataFrame(users_data)
display(users_df)

Unnamed: 0,id,enrolled_clinic_id,nric,first_name,last_name,email,date_of_birth,gender,password
0,f2967c0a-5121-4c46-a4dc-2b7979cfb421,,S9071950Q,Mark,Johnson,mark.johnson@example.net,1990-11-21,M,$2b$12$gf1LfqFawqeRB1xqfYzITulmwjcAhOUdsP09FqZ...
1,e5723722-2395-447f-9512-ea0f17fac47f,983a9652-d9a5-4560-a6b8-d7befb3fef45,S8257099H,Kimberly,Garza,kimberly.garza@example.net,1982-02-26,F,$2b$12$ITAS7rMGe475rz.1x53sPe8sxMEYl9xbDh53Q3o...
2,fd20c08f-8b5b-4dc0-b110-5e4decd0aea9,f53f8540-5d7b-4dde-b806-b638ae0615f4,S9550684U,Justin,Baldwin,justin.baldwin@example.com,1995-03-22,M,$2b$12$atm0zwhIWEUQgpOq/oIUc.1TGMOogrMAZnvhp/c...
3,f5b0dfd9-4a27-49c2-aa6a-c05163bfbeaa,9c84ac40-ac28-4cd7-8fb5-516473c65719,T0478320K,Abigail,Shaffer,abigail.shaffer@example.org,2004-07-13,F,$2b$12$/az9tuFgGPDzR7BkkiPZwO8n1m00CiMsLdnhYEX...
4,ed5d95e8-0eaf-4101-92da-8149d99b53f5,66648c66-52cd-48f4-8198-cd7d45919430,S9511354W,Gabrielle,Davis,gabrielle.davis@example.com,1995-01-29,F,$2b$12$clgwLm467Hn2N40vlzBegOMQGCITn8jzU7CU0ge...


### Generate `postal_code` for `Users` table

The column `postal_code` will be later mapped to its corresponding `address_id`. It is generated first in order to populate the `Addresses` table.

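To generate the `postal_code` for `Users`, run the two cells below in order: the first loads the postal codes from `../data/users_postal_codes.txt` or, if that file is not found, generates them with the OneMap API; the second adds them to `users_df`.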


In [12]:
valid_postal_codes = []
checked_codes = set()
postal_code_filepath = "../data/users_postal_codes.txt"

try:
    # (1) Try to load postal codes from .txt file; every non-blank line is a
    #     dict literal written by an earlier run of this cell
    with open(postal_code_filepath, "r") as file:
        valid_postal_codes = [
            ast.literal_eval(line.strip()) for line in file if line.strip()
        ]
    print("Loaded users' postal codes from txt file")
except FileNotFoundError:
    print(f"{postal_code_filepath} not found. Generating postal codes...")
else:
    if len(valid_postal_codes) < N:
        print(
            f"Only {len(valid_postal_codes)} cached postal code(s) for {N} users. Generating the rest..."
        )

cached_count = len(valid_postal_codes)
# Cached codes must not be drawn a second time
checked_codes.update(entry["postal_code"] for entry in valid_postal_codes)
clinic_postal_code_set = set(clinic_postal_code)  # O(1) membership checks

# (2) Generate postal codes until there is one per user. With a complete
#     cache this loop does not run at all.
while len(valid_postal_codes) < N:
    random_code = str(
        random.randint(100000, 999999)
    )  # Generate random 6-digit postal code

    if random_code in clinic_postal_code_set or random_code in checked_codes:
        continue  # Skip if already checked or is a clinic postal code

    checked_codes.add(random_code)
    geo_data = get_address_info(random_code)

    if geo_data:
        valid_postal_codes.append(geo_data)
        print(
            f"Valid: {random_code} (Lat: {geo_data['latitude']}, Lon: {geo_data['longitude']}), Address: {geo_data['address']}"
        )

    time.sleep(0.5)  # Sleep to prevent hitting API rate limits

# Write postal codes to a .txt file for faster setup next time
# (only when something new was generated)
if len(valid_postal_codes) > cached_count:
    with open(postal_code_filepath, "w") as file:
        for code in valid_postal_codes:
            file.write(f"{code}\n")

../data/users_postal_codes.txt not found. Generating postal codes...
Valid: 359130 (Lat: 1.34529638664758, Lon: 103.869196813411), Address: 30 JALAN LATEH SINGAPORE 359130
Valid: 279121 (Lat: 1.31153641997817, Lon: 103.784768446309), Address: 82 GROVE DRIVE HENRY PARK SINGAPORE 279121
Valid: 545411 (Lat: 1.3683394040113, Lon: 103.879974433288), Address: 44 PARK VILLAS GREEN PARK VILLAS SINGAPORE 545411
Valid: 598309 (Lat: 1.33339215301366, Lon: 103.779092104114), Address: 25 KING ALBERT PARK KING ALBERT PARK SINGAPORE 598309
Valid: 229182 (Lat: 1.31149426333879, Lon: 103.843411659057), Address: 1 HOOPER ROAD SINGAPORE 229182


In [13]:
# Pull the postal code out of every lookup result, then keep the first N
# (one per user)
all_user_postal_codes = list(
    map(lambda geo_entry: geo_entry["postal_code"], valid_postal_codes)
)
users_postal_codes = all_user_postal_codes[:N]
users_df["postal_code"] = users_postal_codes

display(users_df)

Unnamed: 0,id,enrolled_clinic_id,nric,first_name,last_name,email,date_of_birth,gender,password,postal_code
0,f2967c0a-5121-4c46-a4dc-2b7979cfb421,,S9071950Q,Mark,Johnson,mark.johnson@example.net,1990-11-21,M,$2b$12$gf1LfqFawqeRB1xqfYzITulmwjcAhOUdsP09FqZ...,359130
1,e5723722-2395-447f-9512-ea0f17fac47f,983a9652-d9a5-4560-a6b8-d7befb3fef45,S8257099H,Kimberly,Garza,kimberly.garza@example.net,1982-02-26,F,$2b$12$ITAS7rMGe475rz.1x53sPe8sxMEYl9xbDh53Q3o...,279121
2,fd20c08f-8b5b-4dc0-b110-5e4decd0aea9,f53f8540-5d7b-4dde-b806-b638ae0615f4,S9550684U,Justin,Baldwin,justin.baldwin@example.com,1995-03-22,M,$2b$12$atm0zwhIWEUQgpOq/oIUc.1TGMOogrMAZnvhp/c...,545411
3,f5b0dfd9-4a27-49c2-aa6a-c05163bfbeaa,9c84ac40-ac28-4cd7-8fb5-516473c65719,T0478320K,Abigail,Shaffer,abigail.shaffer@example.org,2004-07-13,F,$2b$12$/az9tuFgGPDzR7BkkiPZwO8n1m00CiMsLdnhYEX...,598309
4,ed5d95e8-0eaf-4101-92da-8149d99b53f5,66648c66-52cd-48f4-8198-cd7d45919430,S9511354W,Gabrielle,Davis,gabrielle.davis@example.com,1995-01-29,F,$2b$12$clgwLm467Hn2N40vlzBegOMQGCITn8jzU7CU0ge...,229182


<hr>

## Addresses Table


In [None]:
# List all unique postal codes (clinics first, then users), keeping first-seen order
# 5 (users) + 26 (polyclinics) + 1074 (gps) = 1105 postal_codes (expected) - 4 (duplicates) = 1101 postal_codes
all_postal_code = [
    postal_code
    for postal_code in dict.fromkeys(clinic_postal_code + users_postal_codes)
    if pd.notna(postal_code)  # a failed clinic lookup leaves no code to query
]

address_data_filepath = "../data/addresses_data.json"
try:
    # (1) Try to load address data from the JSON-lines file (blank lines skipped)
    with open(address_data_filepath, "r") as file:
        addresses_data = [json.loads(line) for line in file if line.strip()]
    print("Loaded addresses_data from JSON file")
except FileNotFoundError:
    print(f"{address_data_filepath} not found. Generating addresses_data...")
    # (2) Get the geo data from OneMap API, runtime: ~2mins
    addresses_data = [get_address_info(postal_code) for postal_code in all_postal_code]
    # Save data as JSON lines to preserve the structure; failed lookups
    # (None) are not written.
    with open(address_data_filepath, "w") as file:
        for entry in addresses_data:
            if entry is not None:
                file.write(json.dumps(entry) + "\n")  # Each line is a JSON object

# Failed lookups return None, and a file written earlier may hold "null"
# lines; pd.DataFrame cannot mix those with dict rows, so drop them here.
addresses_data = [entry for entry in addresses_data if entry is not None]

# Surface postal codes without an address row (e.g. a stale cache file);
# they would otherwise become missing address_id values later.
known_postal_codes = {entry["postal_code"] for entry in addresses_data}
missing_postal_codes = [
    postal_code
    for postal_code in all_postal_code
    if postal_code not in known_postal_codes
]
if missing_postal_codes:
    print(
        f"Warning: no address data for {len(missing_postal_codes)} postal code(s), e.g. {missing_postal_codes[:10]}"
    )

addresses_df = pd.DataFrame(addresses_data)

# Create the 'id' column directly as the first column
addresses_df.insert(0, "id", [str(uuid.uuid4()) for _ in range(len(addresses_df))])

display(addresses_df)

../data/addresses_data.json not found. Generating addresses_data...


Unnamed: 0,id,postal_code,address,latitude,longitude
0,fa03e01a-219b-4ed1-91e6-14238e533df0,569666,21 ANG MO KIO CENTRAL 2 ANG MO KIO POLYCLINIC ...,1.3743245905856,103.845677779279
1,73532a10-1eff-4b9c-a36b-d5f7523bf760,389707,21 GEYLANG EAST CENTRAL GEYLANG POLYCLINIC SIN...,1.31949365581957,103.887166041622
2,77ac2094-04eb-483e-927d-076602cde3b7,538829,89 HOUGANG AVENUE 4 HOUGANG POLYCLINIC SINGAPO...,1.3699068381066,103.88900146446
3,bffe5d37-8def-4b9b-80e3-10e30a6734f8,328263,701 SERANGOON ROAD KALLANG POLYCLINIC AND LONG...,1.31678496747374,103.858752270079
4,02931488-c56f-4c2f-8695-af6d7c28e1ba,769567,690 YISHUN RING ROAD KHATIB POLYCLINIC SINGAPO...,1.41820402220914,103.834428037791
...,...,...,...,...,...
1096,918a15ec-2333-4ed3-a4b7-39a66cc92ffa,359130,30 JALAN LATEH SINGAPORE 359130,1.34529638664758,103.869196813411
1097,797a0d10-0e78-4426-856c-0823a957a090,279121,82 GROVE DRIVE HENRY PARK SINGAPORE 279121,1.31153641997817,103.784768446309
1098,44851715-59cb-4ee3-81c3-11ced10d5fdc,545411,44 PARK VILLAS GREEN PARK VILLAS SINGAPORE 545411,1.3683394040113,103.879974433288
1099,7b9479b8-4384-48d1-b6d6-8e2d1ee2545b,598309,25 KING ALBERT PARK KING ALBERT PARK SINGAPORE...,1.33339215301366,103.779092104114


<hr>

## Replace `postal_code` column with `address_id` in `Clinics` and `Users` tables


In [15]:
# Lookup table built from addresses_df: postal_code -> address id
postal_to_address_id_mapping = {
    postal_code: address_id
    for postal_code, address_id in zip(addresses_df["postal_code"], addresses_df["id"])
}

# Final column layouts (postal_code is replaced by address_id)
clinic_columns = ["id", "address_id", "name", "type"]
user_columns = [
    "id",
    "address_id",
    "enrolled_clinic_id",
    "nric",
    "first_name",
    "last_name",
    "email",
    "date_of_birth",
    "gender",
    "password",
]

# Translate each postal_code into its address_id, then keep the final columns
clinics_df = clinics_df.assign(
    address_id=clinics_df["postal_code"].map(postal_to_address_id_mapping)
)[clinic_columns]
users_df = users_df.assign(
    address_id=users_df["postal_code"].map(postal_to_address_id_mapping)
)[user_columns]

display(clinics_df, users_df)

Unnamed: 0,id,address_id,name,type
0,1d2e196d-af66-48f1-8f2c-0e23ad0ecde3,fa03e01a-219b-4ed1-91e6-14238e533df0,Ang Mo Kio Polyclinic,polyclinic
1,859fe92f-9c4b-4f3d-beda-b6c96e67887f,73532a10-1eff-4b9c-a36b-d5f7523bf760,Geylang Polyclinic,polyclinic
2,76da5477-df98-49d2-99d5-ba6d3900c3b7,77ac2094-04eb-483e-927d-076602cde3b7,Hougang Polyclinic,polyclinic
3,1489b9d7-5c8b-4df8-a614-70e76eb35722,bffe5d37-8def-4b9b-80e3-10e30a6734f8,Kallang Polyclinic,polyclinic
4,84b1aa83-156c-4eca-a9bf-ddc72424de1a,02931488-c56f-4c2f-8695-af6d7c28e1ba,Khatib Polyclinic,polyclinic
...,...,...,...,...
1553,1e618339-2b5e-4386-9696-d4b27874f4be,b45669cd-00d4-42bf-b94f-15d759fb0978,Trinity Medical@Shenton,gp
1554,d3931e52-a261-4d65-a91a-9ecfcad14ee0,f435aa5a-56b0-46d8-a073-d7d6986fd7ae,Saudara Clinic By A+J General Physicians,gp
1555,7d44d74b-ef2b-40d0-a18d-8f5487e28371,40d044c7-e40b-4d39-ad1f-321fdc0895dc,Zara Clinic,gp
1556,1e346932-a436-432a-b2c4-1d77b25df77a,21822a2b-6b6d-43ad-a2a9-93cb1411a01c,360 Clinic,gp


Unnamed: 0,id,address_id,enrolled_clinic_id,nric,first_name,last_name,email,date_of_birth,gender,password
0,f2967c0a-5121-4c46-a4dc-2b7979cfb421,918a15ec-2333-4ed3-a4b7-39a66cc92ffa,,S9071950Q,Mark,Johnson,mark.johnson@example.net,1990-11-21,M,$2b$12$gf1LfqFawqeRB1xqfYzITulmwjcAhOUdsP09FqZ...
1,e5723722-2395-447f-9512-ea0f17fac47f,797a0d10-0e78-4426-856c-0823a957a090,983a9652-d9a5-4560-a6b8-d7befb3fef45,S8257099H,Kimberly,Garza,kimberly.garza@example.net,1982-02-26,F,$2b$12$ITAS7rMGe475rz.1x53sPe8sxMEYl9xbDh53Q3o...
2,fd20c08f-8b5b-4dc0-b110-5e4decd0aea9,44851715-59cb-4ee3-81c3-11ced10d5fdc,f53f8540-5d7b-4dde-b806-b638ae0615f4,S9550684U,Justin,Baldwin,justin.baldwin@example.com,1995-03-22,M,$2b$12$atm0zwhIWEUQgpOq/oIUc.1TGMOogrMAZnvhp/c...
3,f5b0dfd9-4a27-49c2-aa6a-c05163bfbeaa,7b9479b8-4384-48d1-b6d6-8e2d1ee2545b,9c84ac40-ac28-4cd7-8fb5-516473c65719,T0478320K,Abigail,Shaffer,abigail.shaffer@example.org,2004-07-13,F,$2b$12$/az9tuFgGPDzR7BkkiPZwO8n1m00CiMsLdnhYEX...
4,ed5d95e8-0eaf-4101-92da-8149d99b53f5,172f4d51-5c70-4f32-8633-87aaf0eb4847,66648c66-52cd-48f4-8198-cd7d45919430,S9511354W,Gabrielle,Davis,gabrielle.davis@example.com,1995-01-29,F,$2b$12$clgwLm467Hn2N40vlzBegOMQGCITn8jzU7CU0ge...


<hr>

## Vaccines Table

Singapore has two national immunisation schedules: the National Childhood Immunisation Schedule and the National Adult Immunisation Schedule. The table below lists adult vaccines.

Reference: https://www.moh.gov.sg/seeking-healthcare/overview-of-diseases/communicable-diseases/nationally-recommended-vaccines


In [16]:
# Vaccine Table
# One tuple per adult vaccine, in the order given by `vaccine_fields`
vaccine_fields = ("name", "price", "doses_required", "age_criteria", "gender_criteria")
adult_vaccines = [
    ("Influenza (INF)", 9.0, 1, "18+ years old", "None"),
    ("Pneumococcal Conjugate (PCV13)", 16.0, 1, "65+ years old", "None"),
    ("Human Papillomavirus (HPV)", 23.0, 3, "18-26 years old", "F"),
    ("Tetanus, Diphtheria, Pertussis (Tdap)", 10.0, 1, "18+ years old", "F"),
    ("Hepatitis B (HepB)", 9.0, 3, "18+ years old", "None"),
    ("Measles, Mumps, Rubella (MMR)", 9.0, 2, "18+ years old", "None"),
    ("Varicella (VAR)", 11.0, 2, "18+ years old", "None"),
]

# Column-oriented dict: a fresh UUID per vaccine, then one list per field
# (no condition criteria column is defined)
vaccines_data = {"id": [str(uuid.uuid4()) for _ in adult_vaccines]}
for field_position, field_name in enumerate(vaccine_fields):
    vaccines_data[field_name] = [vaccine[field_position] for vaccine in adult_vaccines]

vaccines_df = pd.DataFrame(vaccines_data)
display(vaccines_df)

Unnamed: 0,id,name,price,doses_required,age_criteria,gender_criteria
0,8c99160b-aed4-46fe-b6c8-f25aacfc6e0d,Influenza (INF),9.0,1,18+ years old,
1,3149b2d6-ccd9-4107-b656-4209ed8eca1c,Pneumococcal Conjugate (PCV13),16.0,1,65+ years old,
2,100f8392-e99a-4036-a45b-8914f0e522d8,Human Papillomavirus (HPV),23.0,3,18-26 years old,F
3,0ab9b4ed-e58d-4019-84e0-7e0a6c5df7ad,"Tetanus, Diphtheria, Pertussis (Tdap)",10.0,1,18+ years old,F
4,3c3cdfbc-e67e-4ba1-b831-a1ffaab56302,Hepatitis B (HepB),9.0,3,18+ years old,
5,3460d1d7-fe6e-4f55-a00e-a35e817770bb,"Measles, Mumps, Rubella (MMR)",9.0,2,18+ years old,
6,03aba890-a9bc-4a31-98a3-7cb2ea6ca5e3,Varicella (VAR),11.0,2,18+ years old,


<hr>

## BookingSlots Table


In [17]:
# Define start and end dates for March and April 2025
start_date = datetime(2025, 3, 1)
end_date = datetime(2025, 4, 30)

# Slot grid for weekdays only (Monday to Friday): start times every 30 minutes
# from 08:00 up to and including 17:30.
# NOTE(review): the earlier comment said "8 AM to 5 PM, 18 slots", but the loop
# has always produced these 20 start times; confirm whether 17:00 and 17:30
# should exist.
slot_hours = range(8, 18)
slot_minutes = (0, 30)
weekday_slots_per_day = len(slot_hours) * len(slot_minutes)  # 20 per clinic per day

ids = []
polyclinic_ids = []
vaccine_ids = []
datetimes = []

# BookingSlots will only be for Polyclinics
polyclinic_id_list = clinics_df[clinics_df["type"] == "polyclinic"]["id"].tolist()
# Built once instead of once per slot
vaccine_id_list = vaccines_df["id"].tolist()

# Loop through each day in March and April 2025
for day in range((end_date - start_date).days + 1):
    current_day = start_date + timedelta(days=day)

    # Skip weekends (weekday(): 0 = Monday ... 4 = Friday, 5/6 = weekend)
    if current_day.weekday() >= 5:
        continue

    # Randomly select half of the polyclinics for this day
    selected_polyclinics = random.sample(
        polyclinic_id_list, len(polyclinic_id_list) // 2
    )

    for polyclinic_id in selected_polyclinics:
        for hour in slot_hours:
            for minute in slot_minutes:  # on the hour and on the half hour
                slot_time = current_day.replace(
                    hour=hour, minute=minute, second=0, microsecond=0
                )

                # Each slot offers one randomly chosen vaccine
                vaccine_id = random.choice(vaccine_id_list)

                # Append data for this slot
                ids.append(str(uuid.uuid4()))
                polyclinic_ids.append(polyclinic_id)
                vaccine_ids.append(vaccine_id)
                datetimes.append(slot_time)

# Number of generated slots. This gets its own name so the global `N` (the
# user count read by the postal-code cells) is no longer overwritten.
booking_slot_count = len(ids)

# Booking Slots Table
booking_slots_data = {
    "id": ids,
    "polyclinic_id": polyclinic_ids,
    "vaccine_id": vaccine_ids,
    "datetime": datetimes,
}
booking_slots_df = pd.DataFrame(booking_slots_data)
# Sort DataFrame by the 'datetime' column
booking_slots_df = booking_slots_df.sort_values(by="datetime").reset_index(drop=True)

display(booking_slots_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,datetime
0,c3bcbe9a-0e6d-4d5e-9f11-21677e3661b4,d04e2d6c-9ee8-4871-b79e-35d8e587cda9,100f8392-e99a-4036-a45b-8914f0e522d8,2025-03-03 08:00:00
1,5a949a2d-74bc-4a8d-966e-06299aea086e,4de9e253-08ac-445f-95a6-3d086068c49c,0ab9b4ed-e58d-4019-84e0-7e0a6c5df7ad,2025-03-03 08:00:00
2,443d8791-9d16-4666-873c-cfeffd378602,015991c6-8565-4f5b-a162-abb083bf6d58,3149b2d6-ccd9-4107-b656-4209ed8eca1c,2025-03-03 08:00:00
3,3f7f75c0-b28c-4bb7-8c9a-991e5d150bc3,2017474b-c611-42a9-be8c-3fa14457f8b6,8c99160b-aed4-46fe-b6c8-f25aacfc6e0d,2025-03-03 08:00:00
4,1c62eeac-2c5d-4600-8a2e-eed78d88356c,859fe92f-9c4b-4f3d-beda-b6c96e67887f,8c99160b-aed4-46fe-b6c8-f25aacfc6e0d,2025-03-03 08:00:00
...,...,...,...,...
11175,3b004cb5-0c8a-4483-9420-7bda8cdf49b9,f53f8540-5d7b-4dde-b806-b638ae0615f4,8c99160b-aed4-46fe-b6c8-f25aacfc6e0d,2025-04-30 17:30:00
11176,793bd4bb-cc1b-4fd0-bb30-721ebf0faa0d,84b1aa83-156c-4eca-a9bf-ddc72424de1a,3c3cdfbc-e67e-4ba1-b831-a1ffaab56302,2025-04-30 17:30:00
11177,c9d71d4d-3355-44f9-a0ae-27f5f77588ec,76da5477-df98-49d2-99d5-ba6d3900c3b7,3149b2d6-ccd9-4107-b656-4209ed8eca1c,2025-04-30 17:30:00
11178,9b801873-2ced-49a2-a203-d8191af48eae,14398891-c7a3-4425-9d6d-63a5c8cc4ac8,3149b2d6-ccd9-4107-b656-4209ed8eca1c,2025-04-30 17:30:00


<hr>

## VaccineRecords Table

This table is created empty: no vaccine records are generated (`vaccine_records_count = 0`).


In [18]:
# Number of sample vaccine records to generate (0 = start with an empty table)
vaccine_records_count = 0

# Vaccine Records Table
vaccine_records_data = {
    "id": [str(uuid.uuid4()) for _ in range(vaccine_records_count)],
    "user_id": [random.choice(users_data["id"]) for _ in range(vaccine_records_count)],
    # One sample of distinct booking slots. Drawing them one at a time (even
    # with replace=False) could assign the same slot to several records.
    "booking_slot_id": random.sample(booking_slots_data["id"], vaccine_records_count),
    "status": [
        random.choice(["booked", "completed"]) for _ in range(vaccine_records_count)
    ],
}
vaccine_records_df = pd.DataFrame(vaccine_records_data)

display(vaccine_records_df)

Unnamed: 0,id,user_id,booking_slot_id,status


<hr>

## Insert Vaccination Data into SQLite Database

To avoid appending to an old SQLite database, delete the existing SQLite file and re-run the schema cell at the top of the notebook before running the cell below.


In [19]:
# Define SQLite database file path again
sqlite_db_path = "../data/vaccination_db.sqlite"


# Function to insert DataFrame data into SQLite
def insert_csv_to_sqlite(
    csv_data: pd.DataFrame, table_name: str, conn: sqlite3.Connection
):
    """Append every row of ``csv_data`` to the SQLite table ``table_name``.

    Args:
        csv_data: DataFrame whose columns are written as table columns
            (the index is not written).
        table_name: Target table; rows are appended if it already exists.
        conn: Open SQLite connection to write through.
    """
    csv_data.to_sql(table_name, conn, if_exists="append", index=False)


# Reconnect to SQLite database
conn = sqlite3.connect(sqlite_db_path)
try:
    cursor = conn.cursor()

    # Insert data into SQLite tables
    insert_csv_to_sqlite(users_df, "Users", conn)
    insert_csv_to_sqlite(clinics_df, "Clinics", conn)
    insert_csv_to_sqlite(addresses_df, "Addresses", conn)
    insert_csv_to_sqlite(vaccines_df, "Vaccines", conn)
    insert_csv_to_sqlite(booking_slots_df, "BookingSlots", conn)
    insert_csv_to_sqlite(vaccine_records_df, "VaccineRecords", conn)

    # Commit changes
    conn.commit()
finally:
    # Close the connection even when one of the inserts fails
    conn.close()

<hr>

## Query SQLite Database


In [20]:
conn = sqlite3.connect("../data/vaccination_db.sqlite")
try:
    cursor = conn.cursor()

    # Example: Fetch all users
    cursor.execute("SELECT * FROM Users")
    rows = cursor.fetchall()
    for row in rows:
        print(row)
finally:
    # Release the connection even if the query fails (e.g. table missing)
    conn.close()

('f2967c0a-5121-4c46-a4dc-2b7979cfb421', '918a15ec-2333-4ed3-a4b7-39a66cc92ffa', None, 'S9071950Q', 'Mark', 'Johnson', 'mark.johnson@example.net', '1990-11-21', 'M', '$2b$12$gf1LfqFawqeRB1xqfYzITulmwjcAhOUdsP09FqZWeKh87LRwNdebG', '2025-03-25 07:55:49', '2025-03-25 07:55:49')
('e5723722-2395-447f-9512-ea0f17fac47f', '797a0d10-0e78-4426-856c-0823a957a090', '983a9652-d9a5-4560-a6b8-d7befb3fef45', 'S8257099H', 'Kimberly', 'Garza', 'kimberly.garza@example.net', '1982-02-26', 'F', '$2b$12$ITAS7rMGe475rz.1x53sPe8sxMEYl9xbDh53Q3oZMqM4n9VM.R.uG', '2025-03-25 07:55:49', '2025-03-25 07:55:49')
('fd20c08f-8b5b-4dc0-b110-5e4decd0aea9', '44851715-59cb-4ee3-81c3-11ced10d5fdc', 'f53f8540-5d7b-4dde-b806-b638ae0615f4', 'S9550684U', 'Justin', 'Baldwin', 'justin.baldwin@example.com', '1995-03-22', 'M', '$2b$12$atm0zwhIWEUQgpOq/oIUc.1TGMOogrMAZnvhp/cB7DWkOOPzfl5HW', '2025-03-25 07:55:49', '2025-03-25 07:55:49')
('f5b0dfd9-4a27-49c2-aa6a-c05163bfbeaa', '7b9479b8-4384-48d1-b6d6-8e2d1ee2545b', '9c84ac40-ac28-

In [21]:
db_filename = "../data/vaccination_db.sqlite"

# Start from None so the `finally` block never touches a connection left over
# from an earlier cell when sqlite3.connect itself fails.
conn = None
try:
    conn = sqlite3.connect(db_filename)
    cursor = conn.cursor()

    # sqlite_master lists every schema object; keep only the tables
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    table_names = [row[0] for row in cursor.fetchall()]

    print("Tables in the database:")
    for table_name in table_names:
        print(table_name)

except sqlite3.Error as e:
    print(f"An error occurred: {e}")

finally:
    if conn is not None:
        conn.close()

Tables in the database:
Users
Clinics
Addresses
Vaccines
BookingSlots
VaccineRecords
