# Dummy Data Preparation


In [187]:
import random
import sqlite3
from datetime import datetime, timedelta

import pandas as pd
from faker import Faker
from IPython.display import display

In [188]:
fake = Faker()

Note:

The `id` column will be in string. This is to mimic the real life scenario, whereby the ids may consist of characters.

## Users Table


In [189]:
# Function to generate a name-based email
def generate_email(name):
    name_parts = name.lower().split()
    email_username = ".".join(
        name_parts
    )  # Convert name to lowercase and join with dots
    email_domain = fake.free_email_domain()  # Use a real email domain
    return f"{email_username}@{email_domain}"


# Users Table
users_data = {
    "id": [str(i) for i in range(1, 201)],
    "name": [fake.name() for _ in range(200)],
}

# Generate emails based on names
users_data["email"] = [generate_email(name) for name in users_data["name"]]

# Generate other attributes
users_data["date_of_birth"] = [
    fake.date_of_birth(minimum_age=18, maximum_age=80) for _ in range(200)
]
users_data["gender"] = [
    fake.random_element(elements=["Male", "Female"]) for _ in range(200)
]

users_df = pd.DataFrame(users_data)

display(users_df)

Unnamed: 0,id,name,email,date_of_birth,gender
0,1,Olivia Sullivan,olivia.sullivan@yahoo.com,1999-06-14,Male
1,2,Lauren Sloan,lauren.sloan@yahoo.com,1974-04-30,Female
2,3,Tiffany Thompson,tiffany.thompson@hotmail.com,1964-09-02,Male
3,4,Sherry Foster,sherry.foster@gmail.com,1986-04-14,Female
4,5,Brandi Robbins,brandi.robbins@hotmail.com,2000-08-10,Male
...,...,...,...,...,...
195,196,Lori Reynolds,lori.reynolds@hotmail.com,1964-03-12,Female
196,197,Alicia Swanson,alicia.swanson@gmail.com,1953-07-28,Male
197,198,Alexis Malone,alexis.malone@yahoo.com,1982-12-13,Male
198,199,Tyler Lopez,tyler.lopez@gmail.com,2004-08-17,Female


In [190]:
# Check that the names are all unique
print(users_df["name"].nunique())

# Check that dates are in range
start_date = "1920-01-01"
end_date = "2025-03-01"

users_df["date_of_birth"] = pd.to_datetime(users_df["date_of_birth"])
start_date = pd.to_datetime(start_date)
end_date = pd.to_datetime(end_date)

is_in_range = (users_df["date_of_birth"] >= start_date) & (
    users_df["date_of_birth"] <= end_date
)

# check if the date is under range
if is_in_range.all():
    print("All date are in range")
else:
    print("Some dates are not in range")

200
All date are in range


## Vaccines Table


In [191]:
# Vaccine Table
vaccines_data = {
    "id": [1, 2, 3, 4, 5, 6, 7, 8],
    "name": [
        "Influenza (INF)",
        "Human Papillomavirus (HPV)",
        "Pneumococcal",
        "Human Immunodeficiency Viruses (HIV)",
        "Tetanus, Diphtheria, Pertussis (Tdap)",
        "Hepatitis B (HepB)",
        "Measles, Mumps, Rubella (MMR)",
        "Varicella (VAR)",
    ],
    "price": [9.0, 0.0, 16.0, 23.0, 10.0, 9.0, 9.0, 11.0],
    "doses_required": [2, 4, 1, 2, 1, 2, 1, 3],
    "age_criteria": [
        "18+ years old",
        "18+ years old",
        "65+ years old",
        "18-26 years old",
        "27-64 years old",
        "18+ years old",
        "18+ years old",
        "18+ years old",
    ],
    "gender_criteria": [
        "None",
        "None",
        "None",
        "Female",
        "Female",
        "None",
        "None",
        "None",
    ],
    # condition_crietria
}
vaccines_df = pd.DataFrame(vaccines_data)
vaccines_df["id"] = vaccines_df["id"].astype(str)
display(vaccines_df)

Unnamed: 0,id,name,price,doses_required,age_criteria,gender_criteria
0,1,Influenza (INF),9.0,2,18+ years old,
1,2,Human Papillomavirus (HPV),0.0,4,18+ years old,
2,3,Pneumococcal,16.0,1,65+ years old,
3,4,Human Immunodeficiency Viruses (HIV),23.0,2,18-26 years old,Female
4,5,"Tetanus, Diphtheria, Pertussis (Tdap)",10.0,1,27-64 years old,Female
5,6,Hepatitis B (HepB),9.0,2,18+ years old,
6,7,"Measles, Mumps, Rubella (MMR)",9.0,1,18+ years old,
7,8,Varicella (VAR),11.0,3,18+ years old,


## Polyclinics Table


In [192]:
# Polyclinic Table
polyclinics_data = {
    "id": [i for i in range(1, 31)],  # 30 polyclinics
    "name": [fake.company() + " Polyclinic" for _ in range(30)],
    "address": [fake.address() for _ in range(30)],
}
polyclinics_df = pd.DataFrame(polyclinics_data)
polyclinics_df["id"] = polyclinics_df["id"].astype(str)
display(polyclinics_df)

Unnamed: 0,id,name,address
0,1,"Santiago, Medina and Wright Polyclinic",3934 Delgado Shoals Apt. 944\nNorth Samanthabo...
1,2,Huber and Sons Polyclinic,"PSC 8694, Box 2014\nAPO AP 08638"
2,3,Hernandez and Sons Polyclinic,"104 Anna Isle Apt. 017\nPort Codyfurt, FL 22360"
3,4,"Weeks, Jensen and Phelps Polyclinic",5478 Stevens Junction Suite 574\nNorth Lancemo...
4,5,Moody-Wood Polyclinic,"5103 Stanley Parkway Suite 168\nWest Robert, N..."
5,6,Williams-Gonzalez Polyclinic,024 Bennett Prairie Suite 445\nChristopherbury...
6,7,Dodson Ltd Polyclinic,"9551 Sanchez Union Suite 508\nPerkinsmouth, AS..."
7,8,Mccormick Inc Polyclinic,"878 Brian Fork Suite 124\nThompsonfurt, ME 42573"
8,9,Meyers-Paul Polyclinic,"5661 Mitchell Ridges Apt. 145\nMillermouth, NE..."
9,10,Smith Group Polyclinic,"824 Greer Street Apt. 038\nWilcoxland, MN 99377"


In [193]:
# Check that the names are all unique
print(polyclinics_df["name"].nunique())

30


## Booking Slots Table


In [194]:
# 1 day: 5 slots
# 1 week (5 working days): 25 slots
# 30 polyclincs: 750
# Booking Slots Table

# Define the start and end dates for the range
start_date = datetime(2025, 3, 17)
end_date = datetime(2025, 3, 21)

# Booking Slots Table
booking_slots_data = {
    "id": [i for i in range(1, 751)],
    "polyclinic_id": [random.choice(polyclinics_data["id"]) for _ in range(750)],
    "vaccine_id": [random.choice(vaccines_data["id"]) for _ in range(750)],
    "slot_datetime": [
        # Generate random date between 17th March and 21st March
        (
            start_date + timedelta(days=random.randint(0, (end_date - start_date).days))
        ).replace(
            hour=random.randint(9, 18),  # Random hour between 9 AM and 6 PM
            minute=0,  # Set minute to 0 for hourly slots
            second=0,  # Set second to 0
            microsecond=0,  # Set microsecond to 0
        )
        for _ in range(750)
    ],
    "is_booked": [random.choice([True, False]) for _ in range(750)],
}

booking_slots_df = pd.DataFrame(booking_slots_data)
booking_slots_df["id"] = booking_slots_df["id"].astype(str)
display(booking_slots_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,slot_datetime,is_booked
0,1,28,3,2025-03-21 17:00:00,True
1,2,5,7,2025-03-17 15:00:00,True
2,3,30,1,2025-03-21 16:00:00,False
3,4,30,1,2025-03-19 18:00:00,True
4,5,16,2,2025-03-21 16:00:00,False
...,...,...,...,...,...
745,746,7,2,2025-03-20 15:00:00,False
746,747,11,7,2025-03-19 09:00:00,True
747,748,24,2,2025-03-21 15:00:00,False
748,749,27,2,2025-03-18 11:00:00,True


In [195]:
# Assign user_id only for booked slots
unique_bookings = set()  # To ensure uniqueness
user_ids = list(range(1, 201))  # User IDs from 1 to 200


def assign_user_id(row):
    if row["is_booked"]:
        random.shuffle(user_ids)  # Shuffle to randomize selection
        for user_id in user_ids:
            key = (row["polyclinic_id"], row["slot_datetime"], user_id)
            if key not in unique_bookings:
                unique_bookings.add(key)
                return str(user_id)
    return "None"


booking_slots_df["user_id"] = booking_slots_df.apply(assign_user_id, axis=1)
booking_slots_df["polyclinic_id"] = booking_slots_df["polyclinic_id"].astype(str)
booking_slots_df["vaccine_id"] = booking_slots_df["vaccine_id"].astype(str)
# Display the DataFrame
display(booking_slots_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,slot_datetime,is_booked,user_id
0,1,28,3,2025-03-21 17:00:00,True,30
1,2,5,7,2025-03-17 15:00:00,True,99
2,3,30,1,2025-03-21 16:00:00,False,
3,4,30,1,2025-03-19 18:00:00,True,40
4,5,16,2,2025-03-21 16:00:00,False,
...,...,...,...,...,...,...
745,746,7,2,2025-03-20 15:00:00,False,
746,747,11,7,2025-03-19 09:00:00,True,137
747,748,24,2,2025-03-21 15:00:00,False,
748,749,27,2,2025-03-18 11:00:00,True,57


In [196]:
# Check balance
booking_slots_df["is_booked"].value_counts()

is_booked
True     376
False    374
Name: count, dtype: int64

## Vaccine Records Table


In [197]:
# Vaccine Records Table
vaccine_records_data = {
    "id": [i for i in range(1, 501)],
    "user_id": [random.choice(users_data["id"]) for _ in range(500)],
    "vaccine_id": [random.choice(vaccines_data["id"]) for _ in range(500)],
    "polyclinic_id": [random.choice(polyclinics_data["id"]) for _ in range(500)],
    "vaccination_date": [fake.date_this_year() for _ in range(500)],
}
vaccine_records_df = pd.DataFrame(vaccine_records_data)
vaccine_records_df["id"] = vaccine_records_df["id"].astype(str)
vaccine_records_df["user_id"] = vaccine_records_df["user_id"].astype(str)
vaccine_records_df["vaccine_id"] = vaccine_records_df["vaccine_id"].astype(str)
vaccine_records_df["polyclinic_id"] = vaccine_records_df["polyclinic_id"].astype(str)
display(vaccine_records_df)

Unnamed: 0,id,user_id,vaccine_id,polyclinic_id,vaccination_date
0,1,48,8,20,2025-02-05
1,2,116,8,16,2025-01-03
2,3,117,2,29,2025-03-13
3,4,53,2,28,2025-02-18
4,5,88,7,13,2025-01-13
...,...,...,...,...,...
495,496,144,5,21,2025-03-11
496,497,152,4,30,2025-02-03
497,498,182,3,22,2025-03-11
498,499,87,8,7,2025-03-03


In [198]:
print(type(vaccine_records_df.iloc[0]["id"]))
print(type(vaccine_records_df.iloc[0]["user_id"]))
print(type(vaccine_records_df.iloc[0]["vaccine_id"]))
print(type(vaccine_records_df.iloc[0]["polyclinic_id"]))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


## Vaccine Stock Inventory Table


In [199]:
# 30 polyclinics
# 8 vaccines
# Vaccine Stock Inventory Table
# Number of records
total_records = 200

# Initialize a list to hold the vaccine stock data
vaccine_stock_data = {
    "id": [],
    "polyclinic_id": [],
    "vaccine_id": [],
    "stock_quantity": [],
}

# Initialize a set to track unique (polyclinic_id, vaccine_id) combinations
existing_combinations = set()

id_counter = 1

# Loop to create 200 records with random vaccine availability
while len(vaccine_stock_data["polyclinic_id"]) < total_records:
    polyclinic = random.choice(polyclinics_df["id"])  # Randomly choose a polyclinic
    available_vaccines = random.sample(
        vaccines_df["id"].tolist(), random.randint(1, len(vaccines_df))
    )  # Randomly pick some vaccines for the polyclinic

    for vaccine in available_vaccines:
        # Ensure the (polyclinic_id, vaccine_id) combination is unique
        combination = (polyclinic, vaccine)
        if combination not in existing_combinations:
            vaccine_stock_data["id"].append(id_counter)
            vaccine_stock_data["polyclinic_id"].append(polyclinic)  # Polyclinic ID
            vaccine_stock_data["vaccine_id"].append(vaccine)  # Vaccine ID
            vaccine_stock_data["stock_quantity"].append(
                random.randint(50, 200)
            )  # Random stock quantity
            existing_combinations.add(combination)  # Track the combination
            id_counter += 1  # Increment ID

# Convert to DataFrame
vaccine_stock_df = pd.DataFrame(vaccine_stock_data)
vaccine_stock_df["id"] = vaccine_stock_df["id"].astype(str)
# Display the resulting DataFrame
display(vaccine_stock_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,stock_quantity
0,1,27,3,179
1,2,27,8,154
2,3,27,1,53
3,4,17,1,129
4,5,17,3,93
...,...,...,...,...
195,196,13,8,125
196,197,24,7,188
197,198,24,5,65
198,199,24,6,185


In [200]:
# Group the dataframe by polyclinic_id and vaccine_id and count occurrences
grouped = (
    vaccine_stock_df.groupby(["polyclinic_id", "vaccine_id"])
    .size()
    .reset_index(name="count")
)

# Check if any combination appears more than once
duplicates = grouped[grouped["count"] > 1]

# Display the duplicates if any
if not duplicates.empty:
    print("Found duplicate entries:")
    display(duplicates)
else:
    print("All polyclinic_id and vaccine_id combinations are unique.")

All polyclinic_id and vaccine_id combinations are unique.


## Insert Vaccination Data into SQLite Database

Remember to delete the existing SQLite file before running the cell below.

In [201]:
# Define SQLite database file path again
sqlite_db_path = "../data/vaccination_db.sqlite"

# Reconnect to SQLite database
conn = sqlite3.connect(sqlite_db_path)
cursor = conn.cursor()


# Function to insert DataFrame data into SQLite
def insert_csv_to_sqlite(
    csv_data: pd.DataFrame, table_name: str, conn: sqlite3.Connection
):
    csv_data.to_sql(table_name, conn, if_exists="append", index=False)


# Insert data into SQLite tables
insert_csv_to_sqlite(users_df, "Users", conn)
insert_csv_to_sqlite(vaccines_df, "Vaccines", conn)
insert_csv_to_sqlite(polyclinics_df, "Polyclinics", conn)
insert_csv_to_sqlite(booking_slots_df, "BookingSlots", conn)
insert_csv_to_sqlite(vaccine_records_df, "VaccineRecords", conn)
insert_csv_to_sqlite(vaccine_stock_df, "VaccineStockInventory", conn)

# Commit changes and close connection
conn.commit()
conn.close()

## Query SQLite Database

In [202]:
conn = sqlite3.connect("../data/vaccination_db.sqlite")
cursor = conn.cursor()

In [203]:
# Example: Fetch all users
cursor.execute("SELECT * FROM Users")
rows = cursor.fetchall()
for row in rows:
    print(row)

conn.close()

('1', 'Olivia Sullivan', 'olivia.sullivan@yahoo.com', '1999-06-14 00:00:00', 'Male')
('2', 'Lauren Sloan', 'lauren.sloan@yahoo.com', '1974-04-30 00:00:00', 'Female')
('3', 'Tiffany Thompson', 'tiffany.thompson@hotmail.com', '1964-09-02 00:00:00', 'Male')
('4', 'Sherry Foster', 'sherry.foster@gmail.com', '1986-04-14 00:00:00', 'Female')
('5', 'Brandi Robbins', 'brandi.robbins@hotmail.com', '2000-08-10 00:00:00', 'Male')
('6', 'Mr. Nicholas Nelson', 'mr..nicholas.nelson@yahoo.com', '1966-11-01 00:00:00', 'Female')
('7', 'April Knight', 'april.knight@hotmail.com', '1961-11-15 00:00:00', 'Male')
('8', 'Sheila West', 'sheila.west@yahoo.com', '1953-05-15 00:00:00', 'Male')
('9', 'Lisa Oliver', 'lisa.oliver@yahoo.com', '1993-02-06 00:00:00', 'Male')
('10', 'Gerald Perkins', 'gerald.perkins@gmail.com', '1954-12-12 00:00:00', 'Female')
('11', 'Anita Reed', 'anita.reed@yahoo.com', '1995-12-07 00:00:00', 'Male')
('12', 'Kristin Griffin', 'kristin.griffin@hotmail.com', '1960-06-07 00:00:00', 'Male

In [204]:
db_filename = "../data/vaccination_db.sqlite"

try:
    conn = sqlite3.connect(db_filename)
    cursor = conn.cursor()

    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    table_names = [row[0] for row in cursor.fetchall()]

    print("Tables in the database:")
    for table_name in table_names:
        print(table_name)

except sqlite3.Error as e:
    print(f"An error occurred: {e}")

finally:
    if conn:
        conn.close()

Tables in the database:
Users
Vaccines
Polyclinics
BookingSlots
VaccineRecords
VaccineStockInventory
