# Dummy Data Preparation


In [393]:
import random
import re
import sqlite3
from datetime import datetime, timedelta

import pandas as pd
from faker import Faker
from IPython.display import display

In [394]:
fake = Faker()

Note:

The `id` column will be in string. This is to mimic the real life scenario, whereby the ids may consist of characters.

## Users Table


In [395]:
# Regex pattern for nric
nric_regex = r"[STFGM]\d{7}[A-Z]"

# Generate nric based on birthdate


def generate_nric(dob):
    year = dob.year
    year_suffix = str(year % 100).zfill(2)  # Extract last two digits of the year
    serial_number = str(random.randint(10000, 99999))  # Ensure exactly 7 digits
    checksum = random.choice("ABCDEFGHIJKLMNOPQRSTUVWXYZ")  # Random letter for checksum

    # Determine prefix based on birth year
    prefix = "S" if year < 2000 else "T"

    sample_nric = f"{prefix}{year_suffix}{serial_number}{checksum}"

    # Validate against regex before returning
    if not re.fullmatch(nric_regex, sample_nric):
        raise ValueError(
            f"Generated nric '{sample_nric}' does not match expected pattern."
        )

    return sample_nric


# Generate emails based on names


def generate_email(name):
    name_parts = name.lower().split()
    email_username = ".".join(
        name_parts
    )  # Convert name to lowercase and join with dots
    email_domain = fake.free_email_domain()  # Use a real email domain
    return f"{email_username}@{email_domain}"

In [396]:
# Users Table
users_data = {
    "name": [fake.name() for _ in range(200)],
}

# Generate other attributes
users_data["date_of_birth"] = [
    fake.date_of_birth(minimum_age=18, maximum_age=80) for _ in range(200)
]
users_data["gender"] = [
    fake.random_element(elements=["Male", "Female"]) for _ in range(200)
]

# Generate nric based on date of birth
dob_list = users_data["date_of_birth"]
users_data["nric"] = [generate_nric(dob) for dob in dob_list]  # Fixed function call

users_data["email"] = [generate_email(name) for name in users_data["name"]]

users_df = pd.DataFrame(users_data)
# Reorder columns
column_order = ["nric", "name", "email", "date_of_birth", "gender"]
users_df = users_df[column_order]

# Verify all nrics match the expected pattern
if all(re.fullmatch(nric_regex, nric) for nric in users_df["nric"]):
    print("All nrics match the expected format.")
else:
    print("Some nrics do not match the expected format.")
    print(
        users_df.loc[~users_df["nric"].str.match(nric_regex), ["nric"]]
    )  # Show incorrect nrics

display(users_df)

All nrics match the expected format.


Unnamed: 0,nric,name,email,date_of_birth,gender
0,S9023750P,Jason Hull,jason.hull@gmail.com,1990-12-18,Male
1,S6452972K,Jessica Weiss,jessica.weiss@yahoo.com,1964-06-24,Male
2,S9225451F,Tracy Hobbs,tracy.hobbs@yahoo.com,1992-11-29,Female
3,S9029526C,Julie Keith,julie.keith@gmail.com,1990-01-20,Male
4,T0470925A,Patrick Watts,patrick.watts@hotmail.com,2004-01-30,Male
...,...,...,...,...,...
195,S4787101O,Christopher Wilson,christopher.wilson@yahoo.com,1947-07-03,Female
196,S8042180E,Vanessa Brooks,vanessa.brooks@hotmail.com,1980-06-16,Male
197,S8460015L,April Bennett,april.bennett@hotmail.com,1984-03-12,Male
198,S4970882Y,Christy Cervantes,christy.cervantes@hotmail.com,1949-10-28,Male


In [397]:
# Check that the names are all unique
print(users_df["name"].nunique())

# Check that dates are in range
start_date = "1920-01-01"
end_date = "2025-03-01"

users_df["date_of_birth"] = pd.to_datetime(users_df["date_of_birth"])
start_date = pd.to_datetime(start_date)
end_date = pd.to_datetime(end_date)

is_in_range = (users_df["date_of_birth"] >= start_date) & (
    users_df["date_of_birth"] <= end_date
)

# check if the date is under range
if is_in_range.all():
    print("All date are in range")
else:
    print("Some dates are not in range")

200
All date are in range


## Vaccines Table


In [398]:
# Vaccine Table
vaccines_data = {
    "id": [1, 2, 3, 4, 5, 6, 7, 8],
    "name": [
        "Influenza (INF)",
        "Human Papillomavirus (HPV)",
        "Pneumococcal",
        "Human Immunodeficiency Viruses (HIV)",
        "Tetanus, Diphtheria, Pertussis (Tdap)",
        "Hepatitis B (HepB)",
        "Measles, Mumps, Rubella (MMR)",
        "Varicella (VAR)",
    ],
    "price": [9.0, 0.0, 16.0, 23.0, 10.0, 9.0, 9.0, 11.0],
    "doses_required": [2, 4, 1, 2, 1, 2, 1, 3],
    "age_criteria": [
        "18+ years old",
        "18+ years old",
        "65+ years old",
        "18-26 years old",
        "27-64 years old",
        "18+ years old",
        "18+ years old",
        "18+ years old",
    ],
    "gender_criteria": [
        "None",
        "None",
        "None",
        "Female",
        "Female",
        "None",
        "None",
        "None",
    ],
    # condition_crietria
}
vaccines_df = pd.DataFrame(vaccines_data)
display(vaccines_df)

Unnamed: 0,id,name,price,doses_required,age_criteria,gender_criteria
0,1,Influenza (INF),9.0,2,18+ years old,
1,2,Human Papillomavirus (HPV),0.0,4,18+ years old,
2,3,Pneumococcal,16.0,1,65+ years old,
3,4,Human Immunodeficiency Viruses (HIV),23.0,2,18-26 years old,Female
4,5,"Tetanus, Diphtheria, Pertussis (Tdap)",10.0,1,27-64 years old,Female
5,6,Hepatitis B (HepB),9.0,2,18+ years old,
6,7,"Measles, Mumps, Rubella (MMR)",9.0,1,18+ years old,
7,8,Varicella (VAR),11.0,3,18+ years old,


## Polyclinics Table


In [399]:
# Polyclinic Table
polyclinics_data = {
    "id": [i for i in range(1, 31)],  # 30 polyclinics
    "name": [fake.company() + " Polyclinic" for _ in range(30)],
    "address": [fake.address() for _ in range(30)],
}
polyclinics_df = pd.DataFrame(polyclinics_data)
display(polyclinics_df)

Unnamed: 0,id,name,address
0,1,"Simon, Taylor and Cunningham Polyclinic","259 Tamara Keys\nMasonfurt, IL 82540"
1,2,Shelton and Sons Polyclinic,"PSC 7274, Box 2164\nAPO AE 94733"
2,3,"Mahoney, Soto and Clarke Polyclinic","38967 Nguyen Courts\nNew Barbaraland, AL 60240"
3,4,Simmons-Frank Polyclinic,"595 Julie Avenue Apt. 648\nMichellestad, OR 32398"
4,5,Carter-Benjamin Polyclinic,"55259 Hannah Center\nAbbottburgh, VI 22321"
5,6,Gould Ltd Polyclinic,"83557 Newton Haven Apt. 334\nJustinfort, MN 15909"
6,7,Bailey and Sons Polyclinic,"123 Andrew Lock Apt. 769\nCraigshire, MT 67128"
7,8,"Richmond, Cannon and Pittman Polyclinic","99604 Knight Spurs Suite 484\nHamptonchester, ..."
8,9,Torres-Bell Polyclinic,"49910 Stacey Island\nWest Juliefort, OR 89338"
9,10,"Sanchez, Spencer and Hurst Polyclinic","7635 Willis Avenue Suite 183\nJoannafurt, NV 9..."


In [400]:
# Check that the names are all unique
print(polyclinics_df["name"].nunique())

30


## Booking Slots Table


In [401]:
# 1 day: 5 slots
# 1 week (5 working days): 25 slots
# 30 polyclincs: 750
# Booking Slots Table

# Define the start and end dates for the range
start_date = datetime(2025, 3, 17)
end_date = datetime(2025, 3, 21)

# Booking Slots Table
booking_slots_data = {
    "id": [i for i in range(1, 751)],
    "polyclinic_id": [random.choice(polyclinics_data["id"]) for _ in range(750)],
    "vaccine_id": [random.choice(vaccines_data["id"]) for _ in range(750)],
    "datetime": [
        # Generate random date between 17th March and 21st March
        (
            start_date + timedelta(days=random.randint(0, (end_date - start_date).days))
        ).replace(
            hour=random.randint(9, 18),  # Random hour between 9 AM and 6 PM
            minute=0,  # Set minute to 0 for hourly slots
            second=0,  # Set second to 0
            microsecond=0,  # Set microsecond to 0
        )
        for _ in range(750)
    ],
    "is_booked": [random.choice([True, False]) for _ in range(750)],
}

booking_slots_df = pd.DataFrame(booking_slots_data)

# Assign user_id only for booked slots
unique_bookings = set()  # To ensure uniqueness
user_nrics = users_df["nric"]


def assign_user_id(row):
    if row["is_booked"]:
        random.shuffle(user_nrics)  # Shuffle to randomize selection
        for user_id in user_nrics:
            key = (row["polyclinic_id"], row["datetime"], user_id)
            if key not in unique_bookings:
                unique_bookings.add(key)
                return user_id
    return None


booking_slots_df["user_nric"] = booking_slots_df.apply(assign_user_id, axis=1)

# Display the DataFrame
display(booking_slots_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,datetime,is_booked,user_nric
0,1,5,4,2025-03-17 09:00:00,True,T0195932K
1,2,10,2,2025-03-19 10:00:00,False,
2,3,13,2,2025-03-20 17:00:00,False,
3,4,23,8,2025-03-19 16:00:00,True,S5857431S
4,5,9,7,2025-03-20 15:00:00,False,
...,...,...,...,...,...,...
745,746,28,6,2025-03-21 18:00:00,True,S8477164C
746,747,29,7,2025-03-18 18:00:00,False,
747,748,16,2,2025-03-19 11:00:00,True,S6452972K
748,749,22,5,2025-03-21 13:00:00,True,S6883705H


In [403]:
# Check balance
booking_slots_df["is_booked"].value_counts()

is_booked
True     398
False    352
Name: count, dtype: int64

## Vaccine Records Table


In [404]:
# Vaccine Records Table
vaccine_records_data = {
    "id": [i for i in range(1, 501)],
    "user_nric": [random.choice(users_data["nric"]) for _ in range(500)],
    "vaccine_id": [random.choice(vaccines_data["id"]) for _ in range(500)],
    "polyclinic_id": [random.choice(polyclinics_data["id"]) for _ in range(500)],
    "vaccination_date": [fake.date_this_year() for _ in range(500)],
}
vaccine_records_df = pd.DataFrame(vaccine_records_data)
display(vaccine_records_df)

Unnamed: 0,id,user_nric,vaccine_id,polyclinic_id,vaccination_date
0,1,S4596847Y,2,20,2025-02-26
1,2,T0165401U,7,5,2025-01-03
2,3,S9813264L,2,5,2025-01-07
3,4,S6082123Z,7,1,2025-01-21
4,5,S4897648J,1,12,2025-01-15
...,...,...,...,...,...
495,496,S9230948P,7,27,2025-02-18
496,497,S7630569X,3,10,2025-01-15
497,498,S6638882M,5,27,2025-03-16
498,499,S8650967W,4,21,2025-01-30


## Vaccine Stock Inventory Table


In [406]:
# 30 polyclinics
# 8 vaccines
# Vaccine Stock Inventory Table
# Number of records
total_records = 200

# Initialize a list to hold the vaccine stock data
vaccine_stock_data = {
    "id": [],
    "polyclinic_id": [],
    "vaccine_id": [],
    "stock_quantity": [],
}

# Initialize a set to track unique (polyclinic_id, vaccine_id) combinations
existing_combinations = set()

id_counter = 1

# Loop to create 200 records with random vaccine availability
while len(vaccine_stock_data["polyclinic_id"]) < total_records:
    polyclinic = random.choice(polyclinics_df["id"])  # Randomly choose a polyclinic
    available_vaccines = random.sample(
        vaccines_df["id"].tolist(), random.randint(1, len(vaccines_df))
    )  # Randomly pick some vaccines for the polyclinic

    for vaccine in available_vaccines:
        # Ensure the (polyclinic_id, vaccine_id) combination is unique
        combination = (polyclinic, vaccine)
        if combination not in existing_combinations:
            vaccine_stock_data["id"].append(id_counter)
            vaccine_stock_data["polyclinic_id"].append(polyclinic)  # Polyclinic ID
            vaccine_stock_data["vaccine_id"].append(vaccine)  # Vaccine ID
            vaccine_stock_data["stock_quantity"].append(
                random.randint(50, 200)
            )  # Random stock quantity
            existing_combinations.add(combination)  # Track the combination
            id_counter += 1  # Increment ID

# Convert to DataFrame
vaccine_stock_df = pd.DataFrame(vaccine_stock_data)
# Display the resulting DataFrame
display(vaccine_stock_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,stock_quantity
0,1,5,5,86
1,2,5,6,135
2,3,5,8,107
3,4,5,7,132
4,5,5,1,144
...,...,...,...,...
196,197,13,6,77
197,198,25,1,116
198,199,25,2,162
199,200,25,3,154


In [407]:
# Group the dataframe by polyclinic_id and vaccine_id and count occurrences
grouped = (
    vaccine_stock_df.groupby(["polyclinic_id", "vaccine_id"])
    .size()
    .reset_index(name="count")
)

# Check if any combination appears more than once
duplicates = grouped[grouped["count"] > 1]

# Display the duplicates if any
if not duplicates.empty:
    print("Found duplicate entries:")
    display(duplicates)
else:
    print("All polyclinic_id and vaccine_id combinations are unique.")

All polyclinic_id and vaccine_id combinations are unique.


## Insert Vaccination Data into SQLite Database

Remember to delete the existing SQLite file before running the cell below.

In [408]:
# Define SQLite database file path again
sqlite_db_path = "../data/vaccination_db.sqlite"

# Reconnect to SQLite database
conn = sqlite3.connect(sqlite_db_path)
cursor = conn.cursor()


# Function to insert DataFrame data into SQLite
def insert_csv_to_sqlite(
    csv_data: pd.DataFrame, table_name: str, conn: sqlite3.Connection
):
    csv_data.to_sql(table_name, conn, if_exists="append", index=False)


# Insert data into SQLite tables
insert_csv_to_sqlite(users_df, "Users", conn)
insert_csv_to_sqlite(vaccines_df, "Vaccines", conn)
insert_csv_to_sqlite(polyclinics_df, "Polyclinics", conn)
insert_csv_to_sqlite(booking_slots_df, "BookingSlots", conn)
insert_csv_to_sqlite(vaccine_records_df, "VaccineRecords", conn)
insert_csv_to_sqlite(vaccine_stock_df, "VaccineStockInventory", conn)

# Commit changes and close connection
conn.commit()
conn.close()

## Query SQLite Database

In [409]:
conn = sqlite3.connect("../data/vaccination_db.sqlite")
cursor = conn.cursor()

In [410]:
# Example: Fetch all users
cursor.execute("SELECT * FROM Users")
rows = cursor.fetchall()
for row in rows:
    print(row)

conn.close()

('S4953513M', 'Jason Hull', 'jason.hull@gmail.com', '1990-12-18 00:00:00', 'Male')
('S9959458B', 'Jessica Weiss', 'jessica.weiss@yahoo.com', '1964-06-24 00:00:00', 'Male')
('S8275482H', 'Tracy Hobbs', 'tracy.hobbs@yahoo.com', '1992-11-29 00:00:00', 'Female')
('S5557889B', 'Julie Keith', 'julie.keith@gmail.com', '1990-01-20 00:00:00', 'Male')
('T0560939K', 'Patrick Watts', 'patrick.watts@hotmail.com', '2004-01-30 00:00:00', 'Male')
('S4527596P', 'Alexander Williams', 'alexander.williams@gmail.com', '1990-09-01 00:00:00', 'Female')
('S6269040P', 'Michael Graham', 'michael.graham@yahoo.com', '1962-11-10 00:00:00', 'Male')
('S9630089U', 'Cindy Holden', 'cindy.holden@gmail.com', '2000-08-31 00:00:00', 'Female')
('T0053896Q', 'William Galloway', 'william.galloway@hotmail.com', '1984-05-22 00:00:00', 'Female')
('S5739991M', 'Yvonne Williams', 'yvonne.williams@hotmail.com', '1954-01-18 00:00:00', 'Male')
('S6172221A', 'Melissa Anderson', 'melissa.anderson@hotmail.com', '1968-03-29 00:00:00', '

In [411]:
db_filename = "../data/vaccination_db.sqlite"

try:
    conn = sqlite3.connect(db_filename)
    cursor = conn.cursor()

    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    table_names = [row[0] for row in cursor.fetchall()]

    print("Tables in the database:")
    for table_name in table_names:
        print(table_name)

except sqlite3.Error as e:
    print(f"An error occurred: {e}")

finally:
    if conn:
        conn.close()

Tables in the database:
Users
Vaccines
Polyclinics
BookingSlots
VaccineRecords
VaccineStockInventory
