# Dummy Data Preparation


In [151]:
import random
import sqlite3
from datetime import datetime, timedelta

import pandas as pd
from faker import Faker
from IPython.display import display

In [152]:
# Seed both random sources so that Restart & Run All regenerates the same
# dummy data: Faker produces the names, emails, addresses and dates, while the
# stdlib `random` module drives ID sampling, slot times and stock quantities.
# NOTE(review): date-based Faker providers are presumably still relative to
# the current date, so those columns may shift between calendar days.
SEED = 42
fake = Faker()
Faker.seed(SEED)
random.seed(SEED)

Note:

All `id` columns are stored as strings. This mimics real-life data, where IDs may contain non-numeric characters.

## Users Table


In [153]:
# Users Table
# Synthetic adult users (ages 18-80). Each column is generated as a complete
# list before the next one starts, i.e. Faker is called column by column.
user_count = 200
users_data = {
    "id": list(range(1, user_count + 1)),
    "name": [fake.name() for _ in range(user_count)],
    "email": [fake.email() for _ in range(user_count)],
    "date_of_birth": [
        fake.date_of_birth(minimum_age=18, maximum_age=80)
        for _ in range(user_count)
    ],
    "gender": [
        fake.random_element(elements=["Male", "Female"]) for _ in range(user_count)
    ],
}
# `users_data["id"]` stays a list of ints (later cells sample from it); only
# the DataFrame column is cast to string.
users_df = pd.DataFrame(users_data).astype({"id": str})
display(users_df)

Unnamed: 0,id,name,email,date_of_birth,gender
0,1,Alicia Phillips,richard15@example.org,1968-10-07,Male
1,2,Clayton Duarte,vanessa30@example.org,1960-08-15,Male
2,3,Cassandra Young,laurenhill@example.com,1946-09-12,Male
3,4,Scott Hill,bgriffin@example.com,1946-05-24,Male
4,5,Katie Davis,nnielsen@example.org,1990-11-29,Female
...,...,...,...,...,...
195,196,Margaret Gould,bryan12@example.net,1988-03-20,Male
196,197,Anthony Brown,michaelolsen@example.org,1973-09-18,Male
197,198,Julie Charles,andreacrawford@example.net,1951-06-26,Female
198,199,Ryan Thomas,reynoldschristopher@example.com,1985-10-05,Male


In [154]:
# Name uniqueness: print the number of distinct names so it can be compared
# with the number of users.
print(users_df["name"].nunique())

# Date-of-birth sanity check against a fixed, inclusive window.
# NOTE: the parsed datetimes are written back into `users_df`, so from this
# cell onwards `date_of_birth` is a datetime column rather than plain dates.
users_df["date_of_birth"] = pd.to_datetime(users_df["date_of_birth"])
start_date = pd.to_datetime("1920-01-01")
end_date = pd.to_datetime("2025-03-01")

# `between` includes both endpoints by default.
is_in_range = users_df["date_of_birth"].between(start_date, end_date)

print("All date are in range" if is_in_range.all() else "Some dates are not in range")

200
All date are in range


## Vaccines Table


In [155]:
# Vaccine Table
# One tuple per vaccine, written row-wise so each record can be read in one
# glance. The tuples are transposed back into a column -> list dict because
# later cells sample IDs from `vaccines_data["id"]`.
# The string "None" in gender_criteria is stored literally, not as a null.
# TODO(review): the original cell hinted at a future "condition criteria"
# column; nothing is populated for it yet.
vaccine_columns = (
    "id",
    "vaccine",
    "price",
    "doses_required",
    "age_criteria",
    "gender_criteria",
)
vaccine_rows = [
    (1, "Influenza (INF)", 9.0, 2, "18+ years old", "None"),
    (2, "Human Papillomavirus (HPV)", 0.0, 4, "18+ years old", "None"),
    (3, "Pneumococcal", 16.0, 1, "65+ years old", "None"),
    (4, "Human Immunodeficiency Viruses (HIV)", 23.0, 2, "18-26 years old", "Female"),
    (5, "Tetanus, Diphtheria, Pertussis (Tdap)", 10.0, 1, "27-64 years old", "Female"),
    (6, "Hepatitis B (HepB)", 9.0, 2, "18+ years old", "None"),
    (7, "Measles, Mumps, Rubella (MMR)", 9.0, 1, "18+ years old", "None"),
    (8, "Varicella (VAR)", 11.0, 3, "18+ years old", "None"),
]
vaccines_data = {
    column: list(values)
    for column, values in zip(vaccine_columns, zip(*vaccine_rows))
}
# IDs are ints in `vaccines_data` and strings in the DataFrame.
vaccines_df = pd.DataFrame(vaccines_data).astype({"id": str})
display(vaccines_df)

Unnamed: 0,id,vaccine,price,doses_required,age_criteria,gender_criteria
0,1,Influenza (INF),9.0,2,18+ years old,
1,2,Human Papillomavirus (HPV),0.0,4,18+ years old,
2,3,Pneumococcal,16.0,1,65+ years old,
3,4,Human Immunodeficiency Viruses (HIV),23.0,2,18-26 years old,Female
4,5,"Tetanus, Diphtheria, Pertussis (Tdap)",10.0,1,27-64 years old,Female
5,6,Hepatitis B (HepB),9.0,2,18+ years old,
6,7,"Measles, Mumps, Rubella (MMR)",9.0,1,18+ years old,
7,8,Varicella (VAR),11.0,3,18+ years old,


## Polyclinics Table


In [156]:
# Polyclinic Table
# Each polyclinic is named after a fake company with a " Polyclinic" suffix
# and gets a fake address.
polyclinic_count = 30
polyclinics_data = {
    "id": list(range(1, polyclinic_count + 1)),
    "name": [f"{fake.company()} Polyclinic" for _ in range(polyclinic_count)],
    "address": [fake.address() for _ in range(polyclinic_count)],
}
# `polyclinics_data["id"]` stays a list of ints; only the DataFrame column is
# cast to string.
polyclinics_df = pd.DataFrame(polyclinics_data).astype({"id": str})
display(polyclinics_df)

Unnamed: 0,id,name,address
0,1,Riley Ltd Polyclinic,95509 Chavez Mission Apt. 305\nEast Matthewsid...
1,2,Williams-Lee Polyclinic,"10208 Jennifer Underpass\nEast Raymond, AZ 45168"
2,3,"Phillips, Larsen and Anderson Polyclinic","9210 Davis Wall\nKrystalport, VT 22598"
3,4,Dorsey-Ruiz Polyclinic,"959 Eaton Loaf\nWest Jennashire, AK 83915"
4,5,Cohen Ltd Polyclinic,"1724 Cowan Throughway\nGeorgeport, PW 46408"
5,6,Sawyer LLC Polyclinic,"241 Brown Village\nMarkhaven, NM 53988"
6,7,Anderson and Sons Polyclinic,"13457 William Garden\nLake Kristen, AR 59309"
7,8,Odom Inc Polyclinic,"943 Avery Forks Suite 506\nPort Brenda, MO 61245"
8,9,"Grant, Johnson and Thomas Polyclinic","360 Jimenez Knolls Suite 324\nJamesland, PW 15865"
9,10,Barker LLC Polyclinic,"483 Lisa Shoal\nGinafort, MA 92919"


In [157]:
# Print the number of distinct polyclinic names; it should equal the number of
# polyclinics if every name is unique.
distinct_polyclinic_names = polyclinics_df["name"].nunique()
print(distinct_polyclinic_names)

30


## Booking Slots Table


In [158]:
# Booking Slots Table
# Sizing target: 5 slots/day x 5 working days x 30 polyclinics = 750 rows.
# This is only a row count: every row samples its polyclinic, vaccine, day and
# hour independently, so a polyclinic can end up with repeated or missing
# time slots.
slot_count = 750

# Inclusive date window for the generated slots (17-21 March 2025).
start_date = datetime(2025, 3, 17)
end_date = datetime(2025, 3, 21)


def _random_slot_datetime():
    """Return a random on-the-hour datetime inside the booking window.

    The day is drawn uniformly from start_date..end_date (inclusive) and the
    start hour uniformly from 9 to 18 (inclusive), i.e. 09:00 through 18:00.
    """
    day_offset = random.randint(0, (end_date - start_date).days)
    start_hour = random.randint(9, 18)
    slot_day = start_date + timedelta(days=day_offset)
    # Zero out everything below the hour so slots fall exactly on the hour.
    return slot_day.replace(hour=start_hour, minute=0, second=0, microsecond=0)


booking_slots_data = {
    "id": list(range(1, slot_count + 1)),
    "polyclinic_id": [
        random.choice(polyclinics_data["id"]) for _ in range(slot_count)
    ],
    "vaccine_id": [random.choice(vaccines_data["id"]) for _ in range(slot_count)],
    "slot_datetime": [_random_slot_datetime() for _ in range(slot_count)],
    "is_booked": [random.choice([True, False]) for _ in range(slot_count)],
}

# Only `id` is cast to string here; polyclinic_id and vaccine_id are still
# ints in this DataFrame.
booking_slots_df = pd.DataFrame(booking_slots_data).astype({"id": str})
display(booking_slots_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,slot_datetime,is_booked
0,1,29,1,2025-03-20 15:00:00,False
1,2,21,7,2025-03-19 11:00:00,True
2,3,14,2,2025-03-18 16:00:00,True
3,4,24,7,2025-03-18 18:00:00,True
4,5,21,7,2025-03-17 11:00:00,True
...,...,...,...,...,...
745,746,3,8,2025-03-19 18:00:00,False
746,747,14,7,2025-03-17 11:00:00,False
747,748,27,1,2025-03-17 16:00:00,True
748,749,18,1,2025-03-21 13:00:00,True


In [159]:
# Assign user_id only for booked slots
# Every (polyclinic_id, slot_datetime, user_id) triple already handed out, so
# the same user is never attached twice to the same polyclinic and time.
unique_bookings = set()
# Candidate user IDs 1..200; reshuffled in place on every booked row.
user_ids = [*range(1, 201)]


def assign_user_id(row):
    """Pick a user for one booking-slot row.

    Args:
        row: Mapping-like row providing "is_booked", "polyclinic_id" and
            "slot_datetime".

    Returns:
        The chosen user ID as a string, or the literal string "None" when the
        slot is not booked (or when every user already holds this exact
        polyclinic/time combination).
    """
    if not row["is_booked"]:
        return "None"

    random.shuffle(user_ids)  # randomise which user is tried first
    slot_key = (row["polyclinic_id"], row["slot_datetime"])
    chosen = next(
        (
            candidate
            for candidate in user_ids
            if (*slot_key, candidate) not in unique_bookings
        ),
        None,
    )
    if chosen is None:
        return "None"

    unique_bookings.add((*slot_key, chosen))
    return str(chosen)


# Attach a user to every booked slot, row by row. This runs before the casts
# below, so the uniqueness keys are built from the integer polyclinic IDs.
booking_slots_df["user_id"] = booking_slots_df.apply(assign_user_id, axis=1)

# Store the foreign-key columns as strings, like the `id` columns.
for id_column in ("polyclinic_id", "vaccine_id"):
    booking_slots_df[id_column] = booking_slots_df[id_column].astype(str)

display(booking_slots_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,slot_datetime,is_booked,user_id
0,1,29,1,2025-03-20 15:00:00,False,
1,2,21,7,2025-03-19 11:00:00,True,51
2,3,14,2,2025-03-18 16:00:00,True,72
3,4,24,7,2025-03-18 18:00:00,True,107
4,5,21,7,2025-03-17 11:00:00,True,82
...,...,...,...,...,...,...
745,746,3,8,2025-03-19 18:00:00,False,
746,747,14,7,2025-03-17 11:00:00,False,
747,748,27,1,2025-03-17 16:00:00,True,26
748,749,18,1,2025-03-21 13:00:00,True,103


In [160]:
# Show how many slots are booked versus free.
booked_flags = booking_slots_df["is_booked"]
booked_flags.value_counts()

is_booked
False    392
True     358
Name: count, dtype: int64

## Vaccine Records Table


In [161]:
# Vaccine Records Table
# Each record pairs a randomly chosen user, vaccine and polyclinic with a
# Faker-generated date from the current year. The three choices are drawn
# independently of one another.
record_count = 500
vaccine_records_data = {
    "id": list(range(1, record_count + 1)),
    "user_id": [random.choice(users_data["id"]) for _ in range(record_count)],
    "vaccine_id": [random.choice(vaccines_data["id"]) for _ in range(record_count)],
    "polyclinic_id": [
        random.choice(polyclinics_data["id"]) for _ in range(record_count)
    ],
    "vaccination_date": [fake.date_this_year() for _ in range(record_count)],
}

# All ID columns are stored as strings in the DataFrame.
id_columns = ["id", "user_id", "vaccine_id", "polyclinic_id"]
vaccine_records_df = pd.DataFrame(vaccine_records_data).astype(
    dict.fromkeys(id_columns, str)
)
display(vaccine_records_df)

Unnamed: 0,id,user_id,vaccine_id,polyclinic_id,vaccination_date
0,1,4,6,16,2025-02-16
1,2,32,6,25,2025-02-20
2,3,102,5,14,2025-02-16
3,4,173,2,12,2025-01-08
4,5,99,2,17,2025-01-29
...,...,...,...,...,...
495,496,1,7,19,2025-03-07
496,497,136,8,10,2025-02-21
497,498,183,7,24,2025-02-27
498,499,45,2,18,2025-02-05


In [162]:
# Confirm that every ID column of the first record holds a Python str.
first_record = vaccine_records_df.iloc[0]
for column_name in ("id", "user_id", "vaccine_id", "polyclinic_id"):
    print(type(first_record[column_name]))

<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>


## Vaccine Stock Inventory Table


In [163]:
# Vaccine Stock Inventory Table
# Each row records how many doses of one vaccine a polyclinic holds. A
# (polyclinic_id, vaccine_id) pair may appear at most once, so the table can
# never have more rows than polyclinics x vaccines (30 x 8 = 240 here).
total_records = 200

# Plain lists of the string IDs; sampling from lists avoids relying on how
# `random.choice` interacts with a pandas Series.
polyclinic_ids = polyclinics_df["id"].tolist()
vaccine_ids = vaccines_df["id"].tolist()

# Without this guard the sampling loop below could never terminate.
max_combinations = len(polyclinic_ids) * len(vaccine_ids)
if total_records > max_combinations:
    raise ValueError(
        f"Cannot create {total_records} unique stock records from only "
        f"{max_combinations} polyclinic/vaccine combinations"
    )

# Column-oriented storage for the generated rows
vaccine_stock_data = {
    "id": [],
    "polyclinic_id": [],
    "vaccine_id": [],
    "stock_quantity": [],
}

# Tracks the (polyclinic_id, vaccine_id) combinations already emitted
existing_combinations = set()

id_counter = 1

# Keep sampling polyclinics until exactly `total_records` rows exist
while len(vaccine_stock_data["polyclinic_id"]) < total_records:
    polyclinic = random.choice(polyclinic_ids)  # Randomly choose a polyclinic
    available_vaccines = random.sample(
        vaccine_ids, random.randint(1, len(vaccine_ids))
    )  # Randomly pick some vaccines for the polyclinic

    for vaccine in available_vaccines:
        # Stop mid-batch once the target is reached; otherwise one batch of
        # vaccines could push the table past `total_records`.
        if len(vaccine_stock_data["polyclinic_id"]) >= total_records:
            break

        # Ensure the (polyclinic_id, vaccine_id) combination is unique
        combination = (polyclinic, vaccine)
        if combination in existing_combinations:
            continue

        vaccine_stock_data["id"].append(id_counter)
        vaccine_stock_data["polyclinic_id"].append(polyclinic)
        vaccine_stock_data["vaccine_id"].append(vaccine)
        vaccine_stock_data["stock_quantity"].append(random.randint(50, 200))
        existing_combinations.add(combination)
        id_counter += 1

# Convert to DataFrame; `id` is stored as a string like the other tables
vaccine_stock_df = pd.DataFrame(vaccine_stock_data)
vaccine_stock_df["id"] = vaccine_stock_df["id"].astype(str)
# Display the resulting DataFrame
display(vaccine_stock_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,stock_quantity
0,1,26,5,52
1,2,26,4,167
2,3,26,1,90
3,4,26,6,158
4,5,26,3,119
...,...,...,...,...
196,197,30,8,129
197,198,16,7,90
198,199,13,1,142
199,200,13,4,70


In [164]:
# Count how many stock rows exist per (polyclinic_id, vaccine_id) pair.
grouped = (
    vaccine_stock_df.groupby(["polyclinic_id", "vaccine_id"])
    .size()
    .rename("count")
    .reset_index()
)

# Any pair counted more than once breaks the one-row-per-pair rule.
duplicates = grouped.loc[grouped["count"].gt(1)]

# Report the outcome, listing the offending pairs when there are any.
if duplicates.empty:
    print("All polyclinic_id and vaccine_id combinations are unique.")
else:
    print("Found duplicate entries:")
    display(duplicates)

All polyclinic_id and vaccine_id combinations are unique.


## Insert Vaccination Data into SQLite Database

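Remember to delete the existing SQLite file before running the cell below: the rows are inserted with `if_exists="append"`, so re-running against an existing file would duplicate every row.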

In [165]:
# Path of the SQLite database file that receives the generated tables
# (relative to the notebook's working directory).
sqlite_db_path = "../data/vaccination_db.sqlite"

# Open a connection to that file; it is passed to every insert call below.
conn = sqlite3.connect(sqlite_db_path)
# NOTE(review): the insert calls in this cell take `conn`, not this cursor.
cursor = conn.cursor()


# Function to insert DataFrame data into SQLite
def insert_csv_to_sqlite(
    csv_data: pd.DataFrame,
    table_name: str,
    conn: sqlite3.Connection,
    if_exists: str = "append",
) -> None:
    """Write a DataFrame into a SQLite table via ``DataFrame.to_sql``.

    Despite the name, ``csv_data`` is a DataFrame, not a CSV file. The
    DataFrame index is not written as a column.

    Args:
        csv_data: Rows to insert.
        table_name: Name of the destination table.
        conn: Open SQLite connection to write through.
        if_exists: Forwarded to ``DataFrame.to_sql``. The default "append"
            adds rows to an existing table, so repeated calls duplicate data;
            pass "replace" to drop and recreate the table, or "fail" to raise
            if it already exists.
    """
    csv_data.to_sql(table_name, conn, if_exists=if_exists, index=False)


# Insert data into SQLite tables
# Destination table name -> DataFrame, in insertion order.
tables_to_insert = {
    "Users": users_df,
    "Vaccines": vaccines_df,
    "Polyclinics": polyclinics_df,
    "BookingSlots": booking_slots_df,
    "VaccineRecords": vaccine_records_df,
    "VaccineStockInventory": vaccine_stock_df,
}

try:
    for table_name, table_df in tables_to_insert.items():
        insert_csv_to_sqlite(table_df, table_name, conn)
    # Commit changes only once every table has been written
    conn.commit()
finally:
    # Always release the connection, even if one of the inserts raised
    conn.close()

## Query SQLite Database

In [166]:
# Reopen the database file for querying; the connection used for the inserts
# was closed at the end of that cell.
conn = sqlite3.connect("../data/vaccination_db.sqlite")
cursor = conn.cursor()

In [167]:
# Example: Fetch all users
# The query runs inside try/finally so the connection opened in the previous
# cell is closed even if the SELECT or the printing fails.
try:
    cursor.execute("SELECT * FROM Users")
    rows = cursor.fetchall()
    for row in rows:
        print(row)
finally:
    conn.close()

('1', 'Alicia Phillips', 'richard15@example.org', '1968-10-07 00:00:00', 'Male')
('2', 'Clayton Duarte', 'vanessa30@example.org', '1960-08-15 00:00:00', 'Male')
('3', 'Cassandra Young', 'laurenhill@example.com', '1946-09-12 00:00:00', 'Male')
('4', 'Scott Hill', 'bgriffin@example.com', '1946-05-24 00:00:00', 'Male')
('5', 'Katie Davis', 'nnielsen@example.org', '1990-11-29 00:00:00', 'Female')
('6', 'Meredith Sellers', 'jmason@example.com', '1954-12-05 00:00:00', 'Male')
('7', 'Glenda King', 'jillmartin@example.net', '1961-09-02 00:00:00', 'Female')
('8', 'Michael Howell', 'oliviadiaz@example.org', '1987-01-15 00:00:00', 'Female')
('9', 'Roger Thompson', 'andrew50@example.org', '1968-01-07 00:00:00', 'Male')
('10', 'Erin Hernandez', 'rickwalker@example.org', '1968-06-04 00:00:00', 'Female')
('11', 'Alan Cohen', 'shawn56@example.net', '1987-02-18 00:00:00', 'Female')
('12', 'Patricia Garcia', 'johnsonsamuel@example.net', '1965-09-23 00:00:00', 'Male')
('13', 'Wendy Taylor', 'melissabenne

In [168]:
# List every table stored in the SQLite database file.
db_filename = "../data/vaccination_db.sqlite"

# Bind `conn` before the try block: if sqlite3.connect itself fails, the
# `finally` clause must not hit an undefined name or a stale connection left
# over from an earlier cell.
conn = None
try:
    conn = sqlite3.connect(db_filename)
    cursor = conn.cursor()

    # sqlite_master is SQLite's schema catalogue; keep only table entries.
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    table_names = [row[0] for row in cursor.fetchall()]

    print("Tables in the database:")
    for table_name in table_names:
        print(table_name)

except sqlite3.Error as e:
    # Report database errors instead of aborting the notebook run.
    print(f"An error occurred: {e}")

finally:
    if conn is not None:
        conn.close()

Tables in the database:
Users
Vaccines
Polyclinics
BookingSlots
VaccineRecords
VaccineStockInventory
