# Dummy Data Preparation


In [216]:
import random
import sqlite3
from datetime import datetime, timedelta

import pandas as pd
from faker import Faker
from IPython.display import display

In [217]:
fake = Faker()

Note:

The `id` column will be in string. This is to mimic the real life scenario, whereby the ids may consist of characters.

## Users Table


In [218]:
# Function to generate a name-based email
def generate_email(name):
    name_parts = name.lower().split()
    email_username = ".".join(
        name_parts
    )  # Convert name to lowercase and join with dots
    email_domain = fake.free_email_domain()  # Use a real email domain
    return f"{email_username}@{email_domain}"


# Users Table
users_data = {
    "id": [str(i) for i in range(1, 201)],
    "name": [fake.name() for _ in range(200)],
}

# Generate emails based on names
users_data["email"] = [generate_email(name) for name in users_data["name"]]

# Generate other attributes
users_data["date_of_birth"] = [
    fake.date_of_birth(minimum_age=18, maximum_age=80) for _ in range(200)
]
users_data["gender"] = [
    fake.random_element(elements=["Male", "Female"]) for _ in range(200)
]

users_df = pd.DataFrame(users_data)

display(users_df)

Unnamed: 0,id,name,email,date_of_birth,gender
0,1,Richard Ruiz,richard.ruiz@yahoo.com,1958-11-12,Female
1,2,Kristen Howell,kristen.howell@gmail.com,1987-02-17,Male
2,3,Breanna Arnold,breanna.arnold@gmail.com,1971-11-27,Male
3,4,Ivan Simon,ivan.simon@yahoo.com,1991-12-23,Male
4,5,Dr. Louis Burton,dr..louis.burton@hotmail.com,1955-09-25,Male
...,...,...,...,...,...
195,196,Diane Gonzalez,diane.gonzalez@hotmail.com,1994-12-15,Female
196,197,Alejandro Martin,alejandro.martin@yahoo.com,1989-09-12,Male
197,198,Anne Jarvis,anne.jarvis@yahoo.com,1982-12-11,Female
198,199,Benjamin Garcia,benjamin.garcia@hotmail.com,1996-11-20,Female


In [219]:
# Check that the names are all unique
print(users_df["name"].nunique())

# Check that dates are in range
start_date = "1920-01-01"
end_date = "2025-03-01"

users_df["date_of_birth"] = pd.to_datetime(users_df["date_of_birth"])
start_date = pd.to_datetime(start_date)
end_date = pd.to_datetime(end_date)

is_in_range = (users_df["date_of_birth"] >= start_date) & (
    users_df["date_of_birth"] <= end_date
)

# check if the date is under range
if is_in_range.all():
    print("All date are in range")
else:
    print("Some dates are not in range")

200
All date are in range


## Vaccines Table


In [220]:
# Vaccine Table
vaccines_data = {
    "id": [1, 2, 3, 4, 5, 6, 7, 8],
    "name": [
        "Influenza (INF)",
        "Human Papillomavirus (HPV)",
        "Pneumococcal",
        "Human Immunodeficiency Viruses (HIV)",
        "Tetanus, Diphtheria, Pertussis (Tdap)",
        "Hepatitis B (HepB)",
        "Measles, Mumps, Rubella (MMR)",
        "Varicella (VAR)",
    ],
    "price": [9.0, 0.0, 16.0, 23.0, 10.0, 9.0, 9.0, 11.0],
    "doses_required": [2, 4, 1, 2, 1, 2, 1, 3],
    "age_criteria": [
        "18+ years old",
        "18+ years old",
        "65+ years old",
        "18-26 years old",
        "27-64 years old",
        "18+ years old",
        "18+ years old",
        "18+ years old",
    ],
    "gender_criteria": [
        "None",
        "None",
        "None",
        "Female",
        "Female",
        "None",
        "None",
        "None",
    ],
    # condition_crietria
}
vaccines_df = pd.DataFrame(vaccines_data)
vaccines_df["id"] = vaccines_df["id"].astype(str)
display(vaccines_df)

Unnamed: 0,id,name,price,doses_required,age_criteria,gender_criteria
0,1,Influenza (INF),9.0,2,18+ years old,
1,2,Human Papillomavirus (HPV),0.0,4,18+ years old,
2,3,Pneumococcal,16.0,1,65+ years old,
3,4,Human Immunodeficiency Viruses (HIV),23.0,2,18-26 years old,Female
4,5,"Tetanus, Diphtheria, Pertussis (Tdap)",10.0,1,27-64 years old,Female
5,6,Hepatitis B (HepB),9.0,2,18+ years old,
6,7,"Measles, Mumps, Rubella (MMR)",9.0,1,18+ years old,
7,8,Varicella (VAR),11.0,3,18+ years old,


## Polyclinics Table


In [221]:
# Polyclinic Table
polyclinics_data = {
    "id": [i for i in range(1, 31)],  # 30 polyclinics
    "name": [fake.company() + " Polyclinic" for _ in range(30)],
    "address": [fake.address() for _ in range(30)],
}
polyclinics_df = pd.DataFrame(polyclinics_data)
polyclinics_df["id"] = polyclinics_df["id"].astype(str)
display(polyclinics_df)

Unnamed: 0,id,name,address
0,1,Quinn Group Polyclinic,"0977 Francisco Ridges\nJillshire, UT 20751"
1,2,Campbell-Johnson Polyclinic,"299 Brown Roads Suite 243\nWest Nicole, UT 36884"
2,3,Pierce-Garcia Polyclinic,"1026 Regina Road\nJuliemouth, OR 63304"
3,4,"Baker, Simmons and Vincent Polyclinic","51789 Adkins Mews\nNew Kimberly, SD 87832"
4,5,Esparza Ltd Polyclinic,"18885 John Way\nConnieton, ME 31348"
5,6,Coleman Ltd Polyclinic,"PSC 8231, Box 0303\nAPO AP 45811"
6,7,"Collins, Khan and Martinez Polyclinic","8470 Perry Station\nEast Edward, AL 17611"
7,8,"Yang, Castillo and Sanchez Polyclinic","0814 Steele Radial\nPort Davidside, NY 79097"
8,9,Brown-Strickland Polyclinic,"3015 Sandra Greens Apt. 732\nAndreaberg, PA 27220"
9,10,Williams and Sons Polyclinic,"5705 Jackson Estates Apt. 276\nAnnabury, FM 83503"


In [222]:
# Check that the names are all unique
print(polyclinics_df["name"].nunique())

30


## Booking Slots Table


In [223]:
# 1 day: 5 slots
# 1 week (5 working days): 25 slots
# 30 polyclincs: 750
# Booking Slots Table

# Define the start and end dates for the range
start_date = datetime(2025, 3, 17)
end_date = datetime(2025, 3, 21)

# Booking Slots Table
booking_slots_data = {
    "id": [i for i in range(1, 751)],
    "polyclinic_id": [random.choice(polyclinics_data["id"]) for _ in range(750)],
    "vaccine_id": [random.choice(vaccines_data["id"]) for _ in range(750)],
    "datetime": [
        # Generate random date between 17th March and 21st March
        (
            start_date + timedelta(days=random.randint(0, (end_date - start_date).days))
        ).replace(
            hour=random.randint(9, 18),  # Random hour between 9 AM and 6 PM
            minute=0,  # Set minute to 0 for hourly slots
            second=0,  # Set second to 0
            microsecond=0,  # Set microsecond to 0
        )
        for _ in range(750)
    ],
    "is_booked": [random.choice([True, False]) for _ in range(750)],
}

booking_slots_df = pd.DataFrame(booking_slots_data)
booking_slots_df["id"] = booking_slots_df["id"].astype(str)
display(booking_slots_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,datetime,is_booked
0,1,28,6,2025-03-21 16:00:00,True
1,2,25,4,2025-03-20 17:00:00,True
2,3,19,3,2025-03-20 16:00:00,True
3,4,14,1,2025-03-19 18:00:00,False
4,5,8,3,2025-03-20 11:00:00,True
...,...,...,...,...,...
745,746,26,3,2025-03-17 09:00:00,False
746,747,19,3,2025-03-19 16:00:00,False
747,748,1,4,2025-03-17 18:00:00,False
748,749,4,7,2025-03-20 15:00:00,True


In [224]:
# Assign user_id only for booked slots
unique_bookings = set()  # To ensure uniqueness
user_ids = list(range(1, 201))  # User IDs from 1 to 200


def assign_user_id(row):
    if row["is_booked"]:
        random.shuffle(user_ids)  # Shuffle to randomize selection
        for user_id in user_ids:
            key = (row["polyclinic_id"], row["datetime"], user_id)
            if key not in unique_bookings:
                unique_bookings.add(key)
                return str(user_id)
    return "None"


booking_slots_df["user_id"] = booking_slots_df.apply(assign_user_id, axis=1)
booking_slots_df["polyclinic_id"] = booking_slots_df["polyclinic_id"].astype(str)
booking_slots_df["vaccine_id"] = booking_slots_df["vaccine_id"].astype(str)
# Display the DataFrame
display(booking_slots_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,datetime,is_booked,user_id
0,1,28,6,2025-03-21 16:00:00,True,87
1,2,25,4,2025-03-20 17:00:00,True,183
2,3,19,3,2025-03-20 16:00:00,True,198
3,4,14,1,2025-03-19 18:00:00,False,
4,5,8,3,2025-03-20 11:00:00,True,47
...,...,...,...,...,...,...
745,746,26,3,2025-03-17 09:00:00,False,
746,747,19,3,2025-03-19 16:00:00,False,
747,748,1,4,2025-03-17 18:00:00,False,
748,749,4,7,2025-03-20 15:00:00,True,53


In [225]:
# Check balance
booking_slots_df["is_booked"].value_counts()

is_booked
True     398
False    352
Name: count, dtype: int64

## Vaccine Records Table


In [226]:
# Vaccine Records Table
vaccine_records_data = {
    "id": [i for i in range(1, 501)],
    "user_id": [random.choice(users_data["id"]) for _ in range(500)],
    "vaccine_id": [random.choice(vaccines_data["id"]) for _ in range(500)],
    "polyclinic_id": [random.choice(polyclinics_data["id"]) for _ in range(500)],
    "vaccination_date": [fake.date_this_year() for _ in range(500)],
}
vaccine_records_df = pd.DataFrame(vaccine_records_data)
vaccine_records_df["id"] = vaccine_records_df["id"].astype(str)
vaccine_records_df["user_id"] = vaccine_records_df["user_id"].astype(str)
vaccine_records_df["vaccine_id"] = vaccine_records_df["vaccine_id"].astype(str)
vaccine_records_df["polyclinic_id"] = vaccine_records_df["polyclinic_id"].astype(str)
display(vaccine_records_df)

Unnamed: 0,id,user_id,vaccine_id,polyclinic_id,vaccination_date
0,1,86,5,10,2025-01-02
1,2,181,7,17,2025-01-26
2,3,95,1,22,2025-03-08
3,4,130,2,7,2025-03-07
4,5,81,4,6,2025-02-19
...,...,...,...,...,...
495,496,84,3,19,2025-01-14
496,497,200,2,25,2025-01-28
497,498,30,3,25,2025-02-18
498,499,148,6,7,2025-01-14


In [227]:
print(type(vaccine_records_df.iloc[0]["id"]))
print(type(vaccine_records_df.iloc[0]["user_id"]))
print(type(vaccine_records_df.iloc[0]["vaccine_id"]))
print(type(vaccine_records_df.iloc[0]["polyclinic_id"]))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


## Vaccine Stock Inventory Table


In [228]:
# 30 polyclinics
# 8 vaccines
# Vaccine Stock Inventory Table
# Number of records
total_records = 200

# Initialize a list to hold the vaccine stock data
vaccine_stock_data = {
    "id": [],
    "polyclinic_id": [],
    "vaccine_id": [],
    "stock_quantity": [],
}

# Initialize a set to track unique (polyclinic_id, vaccine_id) combinations
existing_combinations = set()

id_counter = 1

# Loop to create 200 records with random vaccine availability
while len(vaccine_stock_data["polyclinic_id"]) < total_records:
    polyclinic = random.choice(polyclinics_df["id"])  # Randomly choose a polyclinic
    available_vaccines = random.sample(
        vaccines_df["id"].tolist(), random.randint(1, len(vaccines_df))
    )  # Randomly pick some vaccines for the polyclinic

    for vaccine in available_vaccines:
        # Ensure the (polyclinic_id, vaccine_id) combination is unique
        combination = (polyclinic, vaccine)
        if combination not in existing_combinations:
            vaccine_stock_data["id"].append(id_counter)
            vaccine_stock_data["polyclinic_id"].append(polyclinic)  # Polyclinic ID
            vaccine_stock_data["vaccine_id"].append(vaccine)  # Vaccine ID
            vaccine_stock_data["stock_quantity"].append(
                random.randint(50, 200)
            )  # Random stock quantity
            existing_combinations.add(combination)  # Track the combination
            id_counter += 1  # Increment ID

# Convert to DataFrame
vaccine_stock_df = pd.DataFrame(vaccine_stock_data)
vaccine_stock_df["id"] = vaccine_stock_df["id"].astype(str)
# Display the resulting DataFrame
display(vaccine_stock_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,stock_quantity
0,1,3,2,65
1,2,3,6,155
2,3,3,5,73
3,4,3,4,126
4,5,3,8,122
...,...,...,...,...
195,196,16,8,188
196,197,16,3,101
197,198,25,4,62
198,199,16,5,126


In [229]:
# Group the dataframe by polyclinic_id and vaccine_id and count occurrences
grouped = (
    vaccine_stock_df.groupby(["polyclinic_id", "vaccine_id"])
    .size()
    .reset_index(name="count")
)

# Check if any combination appears more than once
duplicates = grouped[grouped["count"] > 1]

# Display the duplicates if any
if not duplicates.empty:
    print("Found duplicate entries:")
    display(duplicates)
else:
    print("All polyclinic_id and vaccine_id combinations are unique.")

All polyclinic_id and vaccine_id combinations are unique.


## Insert Vaccination Data into SQLite Database

Remember to delete the existing SQLite file before running the cell below.

In [230]:
# Define SQLite database file path again
sqlite_db_path = "../data/vaccination_db.sqlite"

# Reconnect to SQLite database
conn = sqlite3.connect(sqlite_db_path)
cursor = conn.cursor()


# Function to insert DataFrame data into SQLite
def insert_csv_to_sqlite(
    csv_data: pd.DataFrame, table_name: str, conn: sqlite3.Connection
):
    csv_data.to_sql(table_name, conn, if_exists="append", index=False)


# Insert data into SQLite tables
insert_csv_to_sqlite(users_df, "Users", conn)
insert_csv_to_sqlite(vaccines_df, "Vaccines", conn)
insert_csv_to_sqlite(polyclinics_df, "Polyclinics", conn)
insert_csv_to_sqlite(booking_slots_df, "BookingSlots", conn)
insert_csv_to_sqlite(vaccine_records_df, "VaccineRecords", conn)
insert_csv_to_sqlite(vaccine_stock_df, "VaccineStockInventory", conn)

# Commit changes and close connection
conn.commit()
conn.close()

## Query SQLite Database

In [231]:
conn = sqlite3.connect("../data/vaccination_db.sqlite")
cursor = conn.cursor()

In [232]:
# Example: Fetch all users
cursor.execute("SELECT * FROM Users")
rows = cursor.fetchall()
for row in rows:
    print(row)

conn.close()

('1', 'Richard Ruiz', 'richard.ruiz@yahoo.com', '1958-11-12 00:00:00', 'Female')
('2', 'Kristen Howell', 'kristen.howell@gmail.com', '1987-02-17 00:00:00', 'Male')
('3', 'Breanna Arnold', 'breanna.arnold@gmail.com', '1971-11-27 00:00:00', 'Male')
('4', 'Ivan Simon', 'ivan.simon@yahoo.com', '1991-12-23 00:00:00', 'Male')
('5', 'Dr. Louis Burton', 'dr..louis.burton@hotmail.com', '1955-09-25 00:00:00', 'Male')
('6', 'Colleen Mercer', 'colleen.mercer@yahoo.com', '1959-02-01 00:00:00', 'Male')
('7', 'William Boyd', 'william.boyd@hotmail.com', '1946-11-17 00:00:00', 'Female')
('8', 'Meagan Lopez', 'meagan.lopez@yahoo.com', '1984-01-14 00:00:00', 'Female')
('9', 'Eric Mejia', 'eric.mejia@hotmail.com', '1966-03-08 00:00:00', 'Female')
('10', 'Michelle Porter', 'michelle.porter@gmail.com', '1957-12-27 00:00:00', 'Female')
('11', 'Joshua Leach', 'joshua.leach@yahoo.com', '1986-11-25 00:00:00', 'Male')
('12', 'Holly Gonzalez', 'holly.gonzalez@yahoo.com', '1956-03-12 00:00:00', 'Male')
('13', 'Jer

In [233]:
db_filename = "../data/vaccination_db.sqlite"

try:
    conn = sqlite3.connect(db_filename)
    cursor = conn.cursor()

    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    table_names = [row[0] for row in cursor.fetchall()]

    print("Tables in the database:")
    for table_name in table_names:
        print(table_name)

except sqlite3.Error as e:
    print(f"An error occurred: {e}")

finally:
    if conn:
        conn.close()

Tables in the database:
Users
Vaccines
Polyclinics
BookingSlots
VaccineRecords
VaccineStockInventory
