# Dummy Data Preparation


In [61]:
import random
import re
import sqlite3
from datetime import datetime, timedelta

import pandas as pd
from faker import Faker
from IPython.display import display

In [62]:
fake = Faker()

Note:

The `id` column will be in string. This is to mimic the real life scenario, whereby the ids may consist of characters.

## Users Table


In [63]:
# Regex pattern for nric
nric_regex = r"[STFGM]\d{7}[A-Z]"


def generate_nric(dob):
    year = dob.year
    year_suffix = str(year % 100).zfill(2)  # Extract last two digits of the year
    serial_number = str(random.randint(10000, 99999))  # Ensure exactly 7 digits
    checksum = random.choice("ABCDEFGHIJKLMNOPQRSTUVWXYZ")  # Random letter for checksum

    # Determine prefix based on birth year
    prefix = "S" if year < 2000 else "T"

    sample_nric = f"{prefix}{year_suffix}{serial_number}{checksum}"

    # Validate against regex before returning
    if not re.fullmatch(nric_regex, sample_nric):
        raise ValueError(
            f"Generated nric '{sample_nric}' does not match expected pattern."
        )

    return sample_nric


def generate_email(first_name, last_name):
    email_username = f"{first_name.lower()}.{last_name.lower()}"
    email_domain = fake.free_email_domain()  # Use a real email domain
    return f"{email_username}@{email_domain}"

In [64]:
# Users Table
users_data = {
    "nric": [],
    "first_name": [],
    "last_name": [],
    "email": [],
    "date_of_birth": [],
    "gender": [],
}

# Gender (50% Male and 50% Female)
users_data["gender"] = ["Male" if i % 2 == 0 else "Female" for i in range(200)]
# Gender-specific names based on the gender
for gender in users_data["gender"]:
    if gender == "Male":
        first_name = fake.first_name_male()
        last_name = fake.last_name()  # Last name is gender-neutral
    else:
        first_name = fake.first_name_female()
        last_name = fake.last_name()  # Last name is gender-neutral

    # Append to the lists
    users_data["first_name"].append(first_name)
    users_data["last_name"].append(last_name)

# Date of Birth
users_data["date_of_birth"] = [
    fake.date_of_birth(minimum_age=18, maximum_age=80) for _ in range(200)
]
# NRIC based on date of birth
dob_list = users_data["date_of_birth"]
users_data["nric"] = [generate_nric(dob) for dob in dob_list]

# Emails based on first and last name
users_data["email"] = [
    generate_email(first_name, last_name)
    for first_name, last_name in zip(users_data["first_name"], users_data["last_name"])
]

# Create DataFrame
users_df = pd.DataFrame(users_data)

# Verify all nrics match the expected pattern
if all(re.fullmatch(nric_regex, nric) for nric in users_df["nric"]):
    print("All nrics match the expected format.")
else:
    print("Some nrics do not match the expected format.")
    print(
        users_df.loc[~users_df["nric"].str.match(nric_regex), ["nric"]]
    )  # Show incorrect nrics

display(users_df)

All nrics match the expected format.


Unnamed: 0,nric,first_name,last_name,email,date_of_birth,gender
0,T0559380W,Jordan,Johnson,jordan.johnson@hotmail.com,2005-11-21,Male
1,S7835832Z,Michelle,Price,michelle.price@gmail.com,1978-03-31,Female
2,S5149482R,William,Murphy,william.murphy@gmail.com,1951-08-10,Male
3,S8833428Z,Dawn,Robinson,dawn.robinson@gmail.com,1988-06-19,Female
4,S5289849B,Troy,Mcintyre,troy.mcintyre@yahoo.com,1952-03-11,Male
...,...,...,...,...,...,...
195,S6543701W,Judy,Meyer,judy.meyer@yahoo.com,1965-02-06,Female
196,S6071393C,Nathaniel,Ballard,nathaniel.ballard@hotmail.com,1960-01-10,Male
197,S9978308E,Anna,Brock,anna.brock@hotmail.com,1999-02-26,Female
198,T0343443A,Kevin,Whitehead,kevin.whitehead@gmail.com,2003-04-12,Male


In [None]:
# Combine first name and last name into a full name for checking uniqueness
check_fullName = users_df["first_name"] + " " + users_df["last_name"]
# Check that the full names are all unique
print(f"Number of unique full names: {check_fullName.nunique()}")

# Check that dates are in range
start_date = "1920-01-01"
end_date = "2025-03-01"

users_df["date_of_birth"] = pd.to_datetime(users_df["date_of_birth"])
start_date = pd.to_datetime(start_date)
end_date = pd.to_datetime(end_date)

is_in_range = (users_df["date_of_birth"] >= start_date) & (
    users_df["date_of_birth"] <= end_date
)

# check if the date is under range
if is_in_range.all():
    print("All date are in range")
else:
    print("Some dates are not in range")

Number of unique full names: 200
All date are in range


## Vaccines Table


In [66]:
# Vaccine Table
vaccines_data = {
    "id": [1, 2, 3, 4, 5, 6, 7, 8],
    "name": [
        "Influenza (INF)",
        "Human Papillomavirus (HPV)",
        "Pneumococcal",
        "Human Immunodeficiency Viruses (HIV)",
        "Tetanus, Diphtheria, Pertussis (Tdap)",
        "Hepatitis B (HepB)",
        "Measles, Mumps, Rubella (MMR)",
        "Varicella (VAR)",
    ],
    "price": [9.0, 0.0, 16.0, 23.0, 10.0, 9.0, 9.0, 11.0],
    "doses_required": [2, 4, 1, 2, 1, 2, 1, 3],
    "age_criteria": [
        "18+ years old",
        "18+ years old",
        "65+ years old",
        "18-26 years old",
        "27-64 years old",
        "18+ years old",
        "18+ years old",
        "18+ years old",
    ],
    "gender_criteria": [
        "None",
        "None",
        "None",
        "Female",
        "Female",
        "None",
        "None",
        "None",
    ],
    # condition_crietria
}
vaccines_df = pd.DataFrame(vaccines_data)
display(vaccines_df)

Unnamed: 0,id,name,price,doses_required,age_criteria,gender_criteria
0,1,Influenza (INF),9.0,2,18+ years old,
1,2,Human Papillomavirus (HPV),0.0,4,18+ years old,
2,3,Pneumococcal,16.0,1,65+ years old,
3,4,Human Immunodeficiency Viruses (HIV),23.0,2,18-26 years old,Female
4,5,"Tetanus, Diphtheria, Pertussis (Tdap)",10.0,1,27-64 years old,Female
5,6,Hepatitis B (HepB),9.0,2,18+ years old,
6,7,"Measles, Mumps, Rubella (MMR)",9.0,1,18+ years old,
7,8,Varicella (VAR),11.0,3,18+ years old,


## Polyclinics Table


In [67]:
# Polyclinic Table
polyclinics_data = {
    "id": [i for i in range(1, 31)],  # 30 polyclinics
    "name": [fake.company() + " Polyclinic" for _ in range(30)],
    "address": [fake.address() for _ in range(30)],
}
polyclinics_df = pd.DataFrame(polyclinics_data)
display(polyclinics_df)

Unnamed: 0,id,name,address
0,1,Sutton-Fox Polyclinic,"0419 Jeremy Islands\nAnthonyborough, AR 35395"
1,2,Romero-Henderson Polyclinic,Unit 8408 Box 0111\nDPO AE 46596
2,3,Lee Group Polyclinic,"4129 Tiffany Shoal Suite 129\nPort Shawn, IL 8..."
3,4,Carter-Sparks Polyclinic,"431 Castillo Hill Suite 272\nJoannborough, NC ..."
4,5,Silva-Avila Polyclinic,"186 Meghan View Apt. 062\nNew Ninamouth, SC 38674"
5,6,Michael Inc Polyclinic,"101 Morse Estates\nLake Kurtmouth, MH 09042"
6,7,Salazar-Salinas Polyclinic,"6614 Chelsea Valley\nNielsenfort, AL 39533"
7,8,Greer-Liu Polyclinic,Unit 2876 Box 5544\nDPO AP 53595
8,9,"Russo, Collins and Cooper Polyclinic","02236 Charles Pass\nWest Brandon, SD 44613"
9,10,Williams-Mcguire Polyclinic,"53946 Mcclure Rapids\nEmilyfort, KY 56712"


In [68]:
# Check that the names are all unique
print(polyclinics_df["name"].nunique())

30


## Booking Slots Table


In [69]:
# 1 day: 5 slots
# 1 week (5 working days): 25 slots
# 30 polyclincs: 750
# Booking Slots Table

# Define the start and end dates for the range
start_date = datetime(2025, 3, 17)
end_date = datetime(2025, 3, 21)

# Booking Slots Table
booking_slots_data = {
    "id": [i for i in range(1, 751)],
    "polyclinic_id": [random.choice(polyclinics_data["id"]) for _ in range(750)],
    "vaccine_id": [random.choice(vaccines_data["id"]) for _ in range(750)],
    "datetime": [
        # Generate random date between 17th March and 21st March
        (
            start_date + timedelta(days=random.randint(0, (end_date - start_date).days))
        ).replace(
            hour=random.randint(9, 18),  # Random hour between 9 AM and 6 PM
            minute=0,  # Set minute to 0 for hourly slots
            second=0,  # Set second to 0
            microsecond=0,  # Set microsecond to 0
        )
        for _ in range(750)
    ],
    "is_booked": [random.choice([True, False]) for _ in range(750)],
}

booking_slots_df = pd.DataFrame(booking_slots_data)

# Assign user_id only for booked slots
unique_bookings = set()  # To ensure uniqueness
user_nrics = users_df["nric"]


def assign_user_id(row):
    if row["is_booked"]:
        random.shuffle(user_nrics)  # Shuffle to randomize selection
        for user_id in user_nrics:
            key = (row["polyclinic_id"], row["datetime"], user_id)
            if key not in unique_bookings:
                unique_bookings.add(key)
                return user_id
    return None


booking_slots_df["user_nric"] = booking_slots_df.apply(assign_user_id, axis=1)

# Display the DataFrame
display(booking_slots_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,datetime,is_booked,user_nric
0,1,2,5,2025-03-18 09:00:00,True,S4656854K
1,2,14,1,2025-03-18 15:00:00,True,S9671626H
2,3,21,2,2025-03-17 10:00:00,False,
3,4,9,8,2025-03-18 16:00:00,False,
4,5,28,6,2025-03-18 16:00:00,True,S9671626H
...,...,...,...,...,...,...
745,746,16,4,2025-03-19 09:00:00,True,S5018775A
746,747,20,8,2025-03-19 11:00:00,True,S7258804H
747,748,15,6,2025-03-17 09:00:00,True,S5343224D
748,749,12,4,2025-03-18 09:00:00,False,


In [70]:
# Check balance
booking_slots_df["is_booked"].value_counts()

is_booked
False    395
True     355
Name: count, dtype: int64

## Vaccine Records Table


In [71]:
# Vaccine Records Table
vaccine_records_data = {
    "id": [i for i in range(1, 501)],
    "user_nric": [random.choice(users_data["nric"]) for _ in range(500)],
    "vaccine_id": [random.choice(vaccines_data["id"]) for _ in range(500)],
    "polyclinic_id": [random.choice(polyclinics_data["id"]) for _ in range(500)],
    "vaccination_date": [fake.date_this_year() for _ in range(500)],
}
vaccine_records_df = pd.DataFrame(vaccine_records_data)
display(vaccine_records_df)

Unnamed: 0,id,user_nric,vaccine_id,polyclinic_id,vaccination_date
0,1,S7546104H,3,27,2025-01-23
1,2,S4857156I,8,8,2025-02-11
2,3,S7352939I,5,3,2025-02-01
3,4,S6054081N,7,4,2025-01-10
4,5,S7546104H,3,26,2025-02-15
...,...,...,...,...,...
495,496,S5018775A,8,14,2025-03-18
496,497,S4857156I,2,13,2025-03-03
497,498,S5149482R,6,12,2025-01-03
498,499,S7659285Q,1,28,2025-02-02


## Vaccine Stock Inventory Table


In [72]:
# 30 polyclinics
# 8 vaccines
# Vaccine Stock Inventory Table
# Number of records
total_records = 200

# Initialize a list to hold the vaccine stock data
vaccine_stock_data = {
    "id": [],
    "polyclinic_id": [],
    "vaccine_id": [],
    "stock_quantity": [],
}

# Initialize a set to track unique (polyclinic_id, vaccine_id) combinations
existing_combinations = set()

id_counter = 1

# Loop to create 200 records with random vaccine availability
while len(vaccine_stock_data["polyclinic_id"]) < total_records:
    polyclinic = random.choice(polyclinics_df["id"])  # Randomly choose a polyclinic
    available_vaccines = random.sample(
        vaccines_df["id"].tolist(), random.randint(1, len(vaccines_df))
    )  # Randomly pick some vaccines for the polyclinic

    for vaccine in available_vaccines:
        # Ensure the (polyclinic_id, vaccine_id) combination is unique
        combination = (polyclinic, vaccine)
        if combination not in existing_combinations:
            vaccine_stock_data["id"].append(id_counter)
            vaccine_stock_data["polyclinic_id"].append(polyclinic)  # Polyclinic ID
            vaccine_stock_data["vaccine_id"].append(vaccine)  # Vaccine ID
            vaccine_stock_data["stock_quantity"].append(
                random.randint(50, 200)
            )  # Random stock quantity
            existing_combinations.add(combination)  # Track the combination
            id_counter += 1  # Increment ID

# Convert to DataFrame
vaccine_stock_df = pd.DataFrame(vaccine_stock_data)
# Display the resulting DataFrame
display(vaccine_stock_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,stock_quantity
0,1,4,6,177
1,2,4,3,161
2,3,4,7,164
3,4,4,2,98
4,5,4,5,96
...,...,...,...,...
197,198,23,2,196
198,199,24,5,91
199,200,24,6,157
200,201,24,3,89


In [73]:
# Group the dataframe by polyclinic_id and vaccine_id and count occurrences
grouped = (
    vaccine_stock_df.groupby(["polyclinic_id", "vaccine_id"])
    .size()
    .reset_index(name="count")
)

# Check if any combination appears more than once
duplicates = grouped[grouped["count"] > 1]

# Display the duplicates if any
if not duplicates.empty:
    print("Found duplicate entries:")
    display(duplicates)
else:
    print("All polyclinic_id and vaccine_id combinations are unique.")

All polyclinic_id and vaccine_id combinations are unique.


## Insert Vaccination Data into SQLite Database

Remember to delete the existing SQLite file before running the cell below.

In [74]:
# Define SQLite database file path again
sqlite_db_path = "../data/vaccination_db.sqlite"

# Reconnect to SQLite database
conn = sqlite3.connect(sqlite_db_path)
cursor = conn.cursor()


# Function to insert DataFrame data into SQLite
def insert_csv_to_sqlite(
    csv_data: pd.DataFrame, table_name: str, conn: sqlite3.Connection
):
    csv_data.to_sql(table_name, conn, if_exists="append", index=False)


# Insert data into SQLite tables
insert_csv_to_sqlite(users_df, "Users", conn)
insert_csv_to_sqlite(vaccines_df, "Vaccines", conn)
insert_csv_to_sqlite(polyclinics_df, "Polyclinics", conn)
insert_csv_to_sqlite(booking_slots_df, "BookingSlots", conn)
insert_csv_to_sqlite(vaccine_records_df, "VaccineRecords", conn)
insert_csv_to_sqlite(vaccine_stock_df, "VaccineStockInventory", conn)

# Commit changes and close connection
conn.commit()
conn.close()

## Query SQLite Database

In [75]:
conn = sqlite3.connect("../data/vaccination_db.sqlite")
cursor = conn.cursor()

In [76]:
# Example: Fetch all users
cursor.execute("SELECT * FROM Users")
rows = cursor.fetchall()
for row in rows:
    print(row)

conn.close()

('S8938686B', 'Jordan', 'Johnson', 'jordan.johnson@hotmail.com', '2005-11-21 00:00:00', 'Male')
('S8236775O', 'Michelle', 'Price', 'michelle.price@gmail.com', '1978-03-31 00:00:00', 'Female')
('S9968741O', 'William', 'Murphy', 'william.murphy@gmail.com', '1951-08-10 00:00:00', 'Male')
('S6795239E', 'Dawn', 'Robinson', 'dawn.robinson@gmail.com', '1988-06-19 00:00:00', 'Female')
('S4879016H', 'Troy', 'Mcintyre', 'troy.mcintyre@yahoo.com', '1952-03-11 00:00:00', 'Male')
('S6913264W', 'Maria', 'Estes', 'maria.estes@yahoo.com', '1996-03-27 00:00:00', 'Female')
('S8921285X', 'Bobby', 'Allen', 'bobby.allen@gmail.com', '1976-04-08 00:00:00', 'Male')
('S5227967Y', 'Kristi', 'Randall', 'kristi.randall@yahoo.com', '2003-11-20 00:00:00', 'Female')
('S8847851P', 'Joshua', 'Castillo', 'joshua.castillo@yahoo.com', '1987-01-16 00:00:00', 'Male')
('S9129141X', 'Ashley', 'Wong', 'ashley.wong@yahoo.com', '1969-07-11 00:00:00', 'Female')
('S9747465H', 'Joel', 'Warren', 'joel.warren@hotmail.com', '1987-03-

In [77]:
db_filename = "../data/vaccination_db.sqlite"

try:
    conn = sqlite3.connect(db_filename)
    cursor = conn.cursor()

    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    table_names = [row[0] for row in cursor.fetchall()]

    print("Tables in the database:")
    for table_name in table_names:
        print(table_name)

except sqlite3.Error as e:
    print(f"An error occurred: {e}")

finally:
    if conn:
        conn.close()

Tables in the database:
Users
Vaccines
Polyclinics
BookingSlots
VaccineRecords
VaccineStockInventory
