# Dummy Data Preparation


In [22]:
import random
import sqlite3
from datetime import datetime, timedelta

import pandas as pd
from faker import Faker
from IPython.display import display

In [23]:
# Seed both random sources so that a fresh "Restart & Run All" regenerates the
# same dummy data. The date-based Faker providers used further down
# (date_of_birth, date_this_year) work relative to the run date, so those
# values can still shift when the notebook is run on a different day.
SEED = 42
random.seed(SEED)

fake = Faker()
Faker.seed(SEED)

Note:

IDs such as `user_id`, `polyclinic_id`, and `vaccine_id` are stored as strings.

This mimics real-life scenarios, where IDs may contain non-numeric characters.

## Users Table


In [24]:
# Users Table
# Number of dummy users to generate; user ids run from 1 to NUM_USERS.
NUM_USERS = 200

users_data = {
    # Kept as integers here; the DataFrame column is converted to str below.
    "user_id": list(range(1, NUM_USERS + 1)),
    # `fake.unique` never returns the same value twice, so names and emails
    # are guaranteed to be distinct instead of being distinct only by chance.
    "name": [fake.unique.name() for _ in range(NUM_USERS)],
    "email": [fake.unique.email() for _ in range(NUM_USERS)],
    "date_of_birth": [
        fake.date_of_birth(minimum_age=18, maximum_age=80) for _ in range(NUM_USERS)
    ],
    "gender": [
        fake.random_element(elements=["Male", "Female"]) for _ in range(NUM_USERS)
    ],
}
users_df = pd.DataFrame(users_data)
users_df["user_id"] = users_df["user_id"].astype(str)
display(users_df)

Unnamed: 0,user_id,name,email,date_of_birth,gender
0,1,Amanda Coleman,vincentjohnson@example.com,1946-11-23,Female
1,2,Joshua Jackson,allenevan@example.com,1974-03-22,Female
2,3,Colin Pierce,davidjames@example.net,1991-05-13,Male
3,4,Craig Mills,william17@example.org,1991-06-03,Female
4,5,James Reid,bennettshirley@example.com,1950-11-14,Female
...,...,...,...,...,...
195,196,Brian Meadows,slloyd@example.com,2005-12-07,Male
196,197,Daniel Wang,dickersonbrittany@example.org,1983-02-18,Male
197,198,Rachel Jimenez,allison64@example.com,1966-02-08,Male
198,199,Jennifer Curry,jlopez@example.com,1959-11-28,Male


In [25]:
# Report how many distinct names were generated
distinct_name_count = users_df["name"].nunique()
print(distinct_name_count)

# Inclusive window that every date of birth is expected to fall into
start_date = pd.to_datetime("1920-01-01")
end_date = pd.to_datetime("2025-03-01")

# Store the birth dates as pandas datetimes so they can be compared
users_df["date_of_birth"] = pd.to_datetime(users_df["date_of_birth"])

# True for each row whose birth date lies inside the window (both ends included)
is_in_range = users_df["date_of_birth"].between(start_date, end_date)

# Summarise the range check in a single line
if not is_in_range.all():
    print("Some dates are not in range")
else:
    print("All date are in range")

200
All date are in range


## Vaccines Table


In [26]:
# Vaccine Table
# Column names, in the order used by every tuple in `vaccine_rows`.
vaccine_columns = (
    "vaccine_id",
    "vaccine_name",
    "vaccine_price",
    "doses_required",
    "age_criteria",
    "gender_criteria",
    # condition_criteria is not populated yet
)

# One tuple per vaccine.
vaccine_rows = [
    (1, "Influenza (INF)", "$9", 2, "18+ years old", "All"),
    (2, "Human Papillomavirus (HPV)", "$0", 4, "18+ years old", "All"),
    (3, "Pneumococcal", "$16", 1, "65+ years old", "All"),
    (4, "Human Immunodeficiency Viruses (HIV)", "$23", 2, "18-26 years old", "Female"),
    (5, "Tetanus, Diphtheria, Pertussis (Tdap)", "$10", 1, "27-64 years old", "Female"),
    (6, "Hepatitis B (HepB)", "$9", 2, "18+ years old", "All"),
    (7, "Measles, Mumps, Rubella (MMR)", "$9", 1, "18+ years old", "All"),
    (8, "Varicella (VAR)", "$11", 3, "18+ years old", "All"),
]

# Transpose the rows into a column -> list-of-values dict; later cells read
# the integer ids from vaccines_data["vaccine_id"].
vaccines_data = {
    column: list(values)
    for column, values in zip(vaccine_columns, zip(*vaccine_rows))
}

# Only the DataFrame copy of the ids is converted to strings.
vaccines_df = pd.DataFrame(vaccines_data).astype({"vaccine_id": str})
display(vaccines_df)

Unnamed: 0,vaccine_id,vaccine_name,vaccine_price,doses_required,age_criteria,gender_criteria
0,1,Influenza (INF),$9,2,18+ years old,All
1,2,Human Papillomavirus (HPV),$0,4,18+ years old,All
2,3,Pneumococcal,$16,1,65+ years old,All
3,4,Human Immunodeficiency Viruses (HIV),$23,2,18-26 years old,Female
4,5,"Tetanus, Diphtheria, Pertussis (Tdap)",$10,1,27-64 years old,Female
5,6,Hepatitis B (HepB),$9,2,18+ years old,All
6,7,"Measles, Mumps, Rubella (MMR)",$9,1,18+ years old,All
7,8,Varicella (VAR),$11,3,18+ years old,All


## Polyclinics Table


In [27]:
# Polyclinic Table
# Number of dummy polyclinics; ids run from 1 to NUM_POLYCLINICS.
NUM_POLYCLINICS = 30

polyclinics_data = {
    # Kept as integers here; the DataFrame column is converted to str below.
    "polyclinic_id": list(range(1, NUM_POLYCLINICS + 1)),
    # `fake.unique` guarantees that no company name (and therefore no
    # polyclinic name) is generated twice.
    "polyclinic_name": [
        fake.unique.company() + " Polyclinic" for _ in range(NUM_POLYCLINICS)
    ],
    "address": [fake.address() for _ in range(NUM_POLYCLINICS)],
}
polyclinics_df = pd.DataFrame(polyclinics_data)
polyclinics_df["polyclinic_id"] = polyclinics_df["polyclinic_id"].astype(str)
display(polyclinics_df)

Unnamed: 0,polyclinic_id,polyclinic_name,address
0,1,Nelson-Martinez Polyclinic,"01643 Thomas Street\nNew Lisatown, AK 83943"
1,2,Smith-Ward Polyclinic,"4024 Robert Forges Suite 051\nKimtown, FM 52686"
2,3,Robertson-Pierce Polyclinic,USNV Anderson\nFPO AP 99047
3,4,Bauer-Mcfarland Polyclinic,"91249 Sergio Tunnel\nPort Drew, WA 39202"
4,5,"Alexander, Yates and Carter Polyclinic","49930 Jennifer Mission\nWest Marc, MI 53089"
5,6,"Long, Roberts and Gilbert Polyclinic","82584 Castro Landing\nSouth Michael, DC 62417"
6,7,Henry Ltd Polyclinic,"18085 Michele Green\nJoshuaburgh, SC 90579"
7,8,"Blair, Miranda and Rhodes Polyclinic","231 Carl Ford\nKevinberg, AR 41949"
8,9,West-Reid Polyclinic,"9318 Smith Club Apt. 123\nWest Michaelshire, K..."
9,10,Ray Group Polyclinic,"123 Katherine Burg Apt. 317\nWest James, NE 88150"


In [28]:
# Report the number of distinct polyclinic names
polyclinic_names = polyclinics_df["polyclinic_name"]
print(polyclinic_names.nunique())

30


## Booking Slots Table


In [29]:
# Booking Slots Table
# 750 rows, sized as 5 slots/day x 5 working days x 30 polyclinics.
# NOTE(review): polyclinic, day and hour are each drawn independently at
# random, so rows are not guaranteed to be spread evenly (or uniquely) across
# polyclinics and times - confirm this is intended.
slot_count = 750

# First and last day (both inclusive) on which a slot may fall
start_date = datetime(2025, 3, 17)
end_date = datetime(2025, 3, 21)

# A random polyclinic (integer id) for every slot
slot_polyclinic_ids = [
    random.choice(polyclinics_data["polyclinic_id"]) for _ in range(slot_count)
]

# A random day in the window at a whole hour from 09:00 to 18:00 inclusive
slot_datetimes = []
for _ in range(slot_count):
    slot_day = start_date + timedelta(
        days=random.randint(0, (end_date - start_date).days)
    )
    slot_datetimes.append(
        slot_day.replace(
            hour=random.randint(9, 18),
            minute=0,
            second=0,
            microsecond=0,
        )
    )

# Mark each slot as booked or free at random
slot_booked_flags = [random.choice([True, False]) for _ in range(slot_count)]

booking_slots_data = {
    "slot_id": list(range(1, slot_count + 1)),
    "polyclinic_id": slot_polyclinic_ids,
    "slot_datetime": slot_datetimes,
    "is_booked": slot_booked_flags,
}

# Only slot_id is converted to str here; polyclinic_id stays an integer.
booking_slots_df = pd.DataFrame(booking_slots_data).astype({"slot_id": str})
display(booking_slots_df)

Unnamed: 0,slot_id,polyclinic_id,slot_datetime,is_booked
0,1,6,2025-03-21 09:00:00,True
1,2,1,2025-03-17 17:00:00,False
2,3,6,2025-03-20 18:00:00,False
3,4,1,2025-03-19 10:00:00,True
4,5,22,2025-03-20 13:00:00,True
...,...,...,...,...
745,746,29,2025-03-18 09:00:00,False
746,747,24,2025-03-17 13:00:00,False
747,748,14,2025-03-19 10:00:00,False
748,749,6,2025-03-18 17:00:00,True


In [30]:
# Attach a user to every booked slot; free slots get the "None" placeholder.
user_ids = list(range(1, 201))  # Candidate user ids, 1 to 200
unique_bookings = set()  # (polyclinic_id, slot_datetime, user_id) keys already used


def assign_user_id(row):
    """Pick a user for one booking-slot row.

    Args:
        row: A row of ``booking_slots_df``; ``is_booked``, ``polyclinic_id``
            and ``slot_datetime`` are read from it.

    Returns:
        The chosen user id as a string, or the literal string ``"None"`` when
        the slot is not booked (or every user already holds this exact
        polyclinic/datetime combination).
    """
    # NOTE(review): the placeholder is the text "None", not a real null -
    # confirm that consumers of this column expect the string.
    if not row["is_booked"]:
        return "None"

    # Shuffle in place so the first free candidate is a random user
    random.shuffle(user_ids)
    for candidate in user_ids:
        booking_key = (row["polyclinic_id"], row["slot_datetime"], candidate)
        if booking_key in unique_bookings:
            continue
        unique_bookings.add(booking_key)
        return str(candidate)
    return "None"


booking_slots_df["user_id"] = booking_slots_df.apply(assign_user_id, axis=1)
# The polyclinic ids are converted to strings only after the users are assigned
booking_slots_df["polyclinic_id"] = booking_slots_df["polyclinic_id"].astype(str)
# Show the resulting table
display(booking_slots_df)

Unnamed: 0,slot_id,polyclinic_id,slot_datetime,is_booked,user_id
0,1,6,2025-03-21 09:00:00,True,196
1,2,1,2025-03-17 17:00:00,False,
2,3,6,2025-03-20 18:00:00,False,
3,4,1,2025-03-19 10:00:00,True,62
4,5,22,2025-03-20 13:00:00,True,194
...,...,...,...,...,...
745,746,29,2025-03-18 09:00:00,False,
746,747,24,2025-03-17 13:00:00,False,
747,748,14,2025-03-19 10:00:00,False,
748,749,6,2025-03-18 17:00:00,True,94


In [31]:
# Compare how many generated slots are booked versus free
booked_flags = booking_slots_df.loc[:, "is_booked"]
booked_flags.value_counts()

is_booked
True     394
False    356
Name: count, dtype: int64

## Vaccine Records Table


In [32]:
# Vaccine Records Table
record_count = 500

# Each record links a random user, vaccine and polyclinic to a date in the
# current year.
vaccine_records_data = {
    "record_id": list(range(1, record_count + 1)),
    "user_id": [random.choice(users_data["user_id"]) for _ in range(record_count)],
    "vaccine_id": [
        random.choice(vaccines_data["vaccine_id"]) for _ in range(record_count)
    ],
    "polyclinic_id": [
        random.choice(polyclinics_data["polyclinic_id"]) for _ in range(record_count)
    ],
    "vaccination_date": [fake.date_this_year() for _ in range(record_count)],
}

# All id columns are stored as strings
id_columns = ["record_id", "user_id", "vaccine_id", "polyclinic_id"]
vaccine_records_df = pd.DataFrame(vaccine_records_data).astype(
    dict.fromkeys(id_columns, str)
)
display(vaccine_records_df)

Unnamed: 0,record_id,user_id,vaccine_id,polyclinic_id,vaccination_date
0,1,164,5,27,2025-02-18
1,2,171,3,19,2025-03-04
2,3,63,6,17,2025-02-05
3,4,103,2,19,2025-02-03
4,5,181,5,26,2025-02-18
...,...,...,...,...,...
495,496,40,2,29,2025-01-20
496,497,38,5,11,2025-02-01
497,498,102,1,29,2025-03-11
498,499,160,2,5,2025-02-11


In [33]:
# Print the Python type of each id value in the first record
first_record = vaccine_records_df.iloc[0]
for id_column in ("record_id", "user_id", "vaccine_id", "polyclinic_id"):
    print(type(first_record[id_column]))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


## Vaccine Stock Inventory Table


In [34]:
# 30 polyclinics
# 8 vaccines
# Vaccine Stock Inventory Table
# Number of records
total_records = 200

# Work on plain lists: random.choice / random.sample index by position, and a
# pandas Series would be indexed by label instead.
polyclinic_ids = polyclinics_df["polyclinic_id"].tolist()
vaccine_ids = vaccines_df["vaccine_id"].tolist()

# Fail fast when the target cannot be reached; otherwise the loop below would
# never terminate.
max_combinations = len(set(polyclinic_ids)) * len(set(vaccine_ids))
if total_records > max_combinations:
    raise ValueError(
        f"Cannot create {total_records} unique stock records from "
        f"{max_combinations} polyclinic/vaccine combinations."
    )

# Column-oriented storage for the generated stock rows
vaccine_stock_data = {
    "polyclinic_id": [],
    "vaccine_id": [],
    "stock_quantity": [],
}

# Unique (polyclinic_id, vaccine_id) combinations generated so far; one entry
# per stock row
existing_combinations = set()

# Keep drawing random polyclinic/vaccine batches until exactly total_records
# rows exist
while len(existing_combinations) < total_records:
    polyclinic = random.choice(polyclinic_ids)  # Randomly choose a polyclinic
    available_vaccines = random.sample(
        vaccine_ids, random.randint(1, len(vaccine_ids))
    )  # Randomly pick some vaccines for the polyclinic

    for vaccine in available_vaccines:
        # Stop inside the batch as well, so the last batch cannot push the
        # row count past total_records
        if len(existing_combinations) >= total_records:
            break

        # Ensure the (polyclinic_id, vaccine_id) combination is unique
        combination = (polyclinic, vaccine)
        if combination in existing_combinations:
            continue

        vaccine_stock_data["polyclinic_id"].append(polyclinic)  # Polyclinic ID
        vaccine_stock_data["vaccine_id"].append(vaccine)  # Vaccine ID
        vaccine_stock_data["stock_quantity"].append(
            random.randint(50, 200)
        )  # Random stock quantity
        existing_combinations.add(combination)  # Track the combination

# Convert to DataFrame
vaccine_stock_df = pd.DataFrame(vaccine_stock_data)

# Display the resulting DataFrame
display(vaccine_stock_df)

Unnamed: 0,polyclinic_id,vaccine_id,stock_quantity
0,21,5,52
1,21,4,114
2,21,6,171
3,15,6,91
4,15,4,81
...,...,...,...
195,23,7,166
196,5,1,75
197,5,6,83
198,5,8,70


In [35]:
# Count how many stock rows exist for each (polyclinic_id, vaccine_id) pair
pair_sizes = vaccine_stock_df.groupby(["polyclinic_id", "vaccine_id"]).size()
grouped = pair_sizes.rename("count").reset_index()

# Keep only the pairs that occur more than once
duplicates = grouped.loc[lambda frame: frame["count"] > 1]

# Report the outcome, listing the offending pairs when there are any
if duplicates.empty:
    print("All polyclinic_id and vaccine_id combinations are unique.")
else:
    print("Found duplicate entries:")
    display(duplicates)

All polyclinic_id and vaccine_id combinations are unique.


## Insert Vaccination Data into SQLite Database

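Remember to delete the existing SQLite file before running the cell below: the cell appends to any tables that already exist, so re-running it against an old file would duplicate every row.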

In [36]:
# Location of the SQLite file that receives the generated tables
sqlite_db_path = "../data/vaccination_db.sqlite"

# Open the database connection and obtain a cursor on it
conn = sqlite3.connect(database=sqlite_db_path)
cursor = conn.cursor()


def insert_csv_to_sqlite(
    csv_data: pd.DataFrame, table_name: str, conn: sqlite3.Connection
):
    """Write the rows of a DataFrame into a SQLite table.

    Args:
        csv_data: The DataFrame whose rows are written (despite the name, it
            is a DataFrame, not a CSV file).
        table_name: Name of the target table.
        conn: Open SQLite connection to write through.

    Rows are appended (``if_exists="append"``) and the DataFrame index is not
    written as a column.
    """
    write_options = {"if_exists": "append", "index": False}
    csv_data.to_sql(name=table_name, con=conn, **write_options)


# Insert data into SQLite tables
# Each entry pairs a generated DataFrame with the table it is written to.
tables_to_insert = [
    (users_df, "Users"),
    (vaccines_df, "Vaccines"),
    (polyclinics_df, "Polyclinics"),
    (booking_slots_df, "BookingSlots"),
    (vaccine_records_df, "VaccineRecords"),
    (vaccine_stock_df, "VaccineStockInventory"),
]

# Close the connection even when an insert fails part-way through, so the
# database file is not left open by the kernel.
try:
    for table_df, table_name in tables_to_insert:
        insert_csv_to_sqlite(table_df, table_name, conn)

    # Commit changes
    conn.commit()
finally:
    conn.close()