In [15]:
import re
import sqlite3

import pandas as pd
from IPython.display import display

## Connect to SQLite database


In [16]:
# Define SQLite database file path
sqlite_db_path = "../data/vaccination_db.sqlite"

# Open the database read-only through a SQLite URI.
# A plain sqlite3.connect(path) silently creates a new, empty database file
# when the path is wrong, so a typo would only surface later as missing
# tables. With mode=ro a missing file raises sqlite3.OperationalError right
# here, and this inspection notebook cannot modify the data by accident.
# NOTE: the path is embedded in a URI, so it must not contain '?' or '#'.
conn = sqlite3.connect(f"file:{sqlite_db_path}?mode=ro", uri=True)

## Query Database

To query the database, first define the SQL statement as a string named `query`.

Next, execute the query and load the results into a DataFrame: `pd.read_sql_query(query, conn)`

### Check info for all tables

In [17]:
# List every table registered in sqlite_master and report its row count.
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

print(f"There are {len(tables)} tables in this sqlite dataset")
for table in tables:
    # Each fetched row is a 1-tuple holding the table name.
    table_name = table[0]
    print(f"\nName of tables: {table_name}")

    # count rows for the table
    # Identifiers cannot be bound as SQL parameters, so the name has to be
    # spliced into the statement. Wrap it in double quotes (doubling any
    # embedded quote) so names containing spaces, keywords or quotes still
    # produce valid SQL.
    quoted_name = '"' + table_name.replace('"', '""') + '"'
    cursor.execute(f"SELECT COUNT(*) FROM {quoted_name}")
    row_count = cursor.fetchone()[0]
    print(f"row counts: {row_count}")

There are 7 tables in this sqlite dataset

Name of tables: Users
row counts: 5

Name of tables: Polyclinics
row counts: 26

Name of tables: GeneralPractitioners
row counts: 1532

Name of tables: Addresses
row counts: 1101

Name of tables: Vaccines
row counts: 7

Name of tables: VaccineRecords
row counts: 0

Name of tables: BookingSlots
row counts: 5460


#### Users

In [18]:
# Select every column and row of the Users table.
query = "SELECT * FROM Users"
# Run the statement on the open connection; the result set becomes a DataFrame.
user_df = pd.read_sql_query(query, conn)
# Render the frame as rich notebook output.
# NOTE(review): the rendered frame includes the nric, email and password
# columns — confirm the data is synthetic before sharing this output.
display(user_df)

Unnamed: 0,id,nric,first_name,last_name,email,date_of_birth,gender,password,postal_code
0,db87de91-9166-44a8-9acd-1994950bb727,S9463220K,Mark,Johnson,mark.johnson@example.net,1994-08-18 00:00:00,M,$2b$12$zWmi3kkXinr1zmZbBQPowO113FQr6Tzg57MukMe...,597245
1,22b64c6f-0b3b-4627-8fd4-7e08d29afbfb,S6047615J,Stephanie,Miller,stephanie.miller@example.org,1960-01-22 00:00:00,F,$2b$12$EJ8U3vOPqPQ.vOaE43qvhuMmcczimZn.yKq9Q/Q...,589603
2,70cc4d17-ca29-46f1-87cf-0fa7f65cb382,S5976393E,Kathy,Johnson,kathy.johnson@example.com,1959-03-25 00:00:00,F,$2b$12$yVeJjAaRU3ELppoeYw2DqOY0JlXTf421UEn8IT9...,576167
3,82e2531b-458e-48c5-b1fa-07b3bc1a7369,S6756268W,Joann,Ramirez,joann.ramirez@example.com,1967-05-19 00:00:00,F,$2b$12$T7CiVbjWI5mbXQTS5StTZO9C61NtO30wQ28lMYu...,575835
4,c6e59527-41f0-4482-9c83-6e044dc90394,T0173163B,Donald,Lewis,donald.lewis@example.com,2001-11-24 00:00:00,M,$2b$12$4dPEKgkVTBlSG0bbQyfr3ubgvNb5rd6JJ5qw4Ix...,520144


In [19]:
# ----------
# Check nric
# ----------
# Expected nric shape: one letter out of S/T/F/G/M, seven digits, then one
# uppercase letter.
nric_regex = r"[STFGM]\d{7}[A-Z]"
# Build one full-string match mask and use it for both the verdict and the
# listing of offending rows. (Series.str.match only anchors at the start of
# the string, so it would accept values with trailing characters and hide
# the very rows that made the check fail.) na=False counts missing values as
# invalid instead of raising.
nric_is_valid = user_df["nric"].str.fullmatch(nric_regex, na=False)
if nric_is_valid.all():
    print("All nrics match the expected format.")
else:
    print("Some nrics do not match the expected format.")
    print(user_df.loc[~nric_is_valid, ["nric"]])

# ----------
# Check name
# ----------
# Combine first name and last name into a full name for checking uniqueness
check_fullName = user_df["first_name"] + " " + user_df["last_name"]
# Report how many distinct full names exist
print(f"Number of unique full names: {check_fullName.nunique()}")
# The count alone does not show which names repeat, so list every row that
# shares its full name with another row (prints nothing when all are unique).
duplicated_names = check_fullName[check_fullName.duplicated(keep=False)]
if not duplicated_names.empty:
    print("Duplicated full names:")
    print(duplicated_names.sort_values())

# ---------
# Check dob
# ---------
# Accepted date-of-birth window (inclusive on both ends), parsed straight to
# Timestamps so the names never hold plain strings.
start_date = pd.to_datetime("1920-01-01")
end_date = pd.to_datetime("2025-03-01")

# Parse the stored values so they can be compared with the Timestamps above.
# Re-running this cell is harmless: converting an already-parsed column
# leaves it unchanged.
user_df["date_of_birth"] = pd.to_datetime(user_df["date_of_birth"])

is_in_range = (user_df["date_of_birth"] >= start_date) & (
    user_df["date_of_birth"] <= end_date
)

# check if every date falls inside the window; list the offenders otherwise
if is_in_range.all():
    print("All date are in range")
else:
    print("Some dates are not in range")
    print(user_df.loc[~is_in_range, ["date_of_birth"]])

All nrics match the expected format.
Number of unique full names: 5
All date are in range


#### Vaccines

In [20]:
# Select every column and row of the Vaccines table.
query = "SELECT * FROM Vaccines"
# Run the statement on the open connection; the result set becomes a DataFrame.
vaccines_df = pd.read_sql_query(query, conn)
# Render the frame as rich notebook output.
display(vaccines_df)

Unnamed: 0,id,name,price,doses_required,age_criteria,gender_criteria
0,cf1bb2e0-c87d-430f-88ca-e1f85b2d9120,Influenza (INF),9.0,1,18+ years old,
1,3d508ffd-595b-45ab-b6ad-82951932d001,Pneumococcal Conjugate (PCV13),16.0,1,65+ years old,
2,e592e664-7f17-4d9f-871c-59e38f3ecb53,Human Papillomavirus (HPV),23.0,3,18-26 years old,F
3,5a1c220f-6241-433c-918f-ab868b01dd9b,"Tetanus, Diphtheria, Pertussis (Tdap)",10.0,1,18+ years old,F
4,77c89fef-9ea2-4d18-b5f9-b12b48daa3fa,Hepatitis B (HepB),9.0,3,18+ years old,
5,048aea99-9787-4e5e-bd45-384ba721da58,"Measles, Mumps, Rubella (MMR)",9.0,2,18+ years old,
6,48f0eeb1-775c-41f3-b304-7b23d97537a8,Varicella (VAR),11.0,2,18+ years old,


#### Polyclinics

In [21]:
# Select every column and row of the Polyclinics table.
query = "SELECT * FROM Polyclinics"
# Run the statement on the open connection; the result set becomes a DataFrame.
polyclinic_df = pd.read_sql_query(query, conn)
# Render the frame as rich notebook output.
display(polyclinic_df)

Unnamed: 0,id,name,postal_code
0,2d918b91-dd1b-4987-982f-fb96583a735d,Ang Mo Kio Polyclinic,569666
1,86d1e83c-a705-4314-9d57-7ee15642c9a9,Geylang Polyclinic,389707
2,b6954896-0fe0-4c8c-a94a-cab5d2524a33,Hougang Polyclinic,538829
3,2cdb11fd-7969-4116-b9a5-d4a6dd12f892,Kallang Polyclinic,328263
4,749937a4-650b-4538-9e96-993157d68e71,Khatib Polyclinic,769567
5,e0d0c692-afb9-4d38-9992-a51001d7c2ae,Toa Payoh Polyclinic,319260
6,0561118e-58ff-4f4a-8b70-c88c9c282532,Sembawang Polyclinic,756973
7,fe9a330e-846e-48d2-881a-82f3b755e112,Woodlands Polyclinic,738579
8,8734430f-d151-4bdb-acc5-403323c106a2,Yishun Polyclinic,768898
9,859d326f-2330-4e0c-bb8a-45f370249666,Bukit Batok Polyclinic,659164


#### GeneralPractitioners

In [22]:
# Select every column and row of the GeneralPractitioners table.
query = "SELECT * FROM GeneralPractitioners"
# Run the statement on the open connection; the result set becomes a DataFrame.
gp_df = pd.read_sql_query(query, conn)
# Render the frame as rich notebook output.
display(gp_df)

Unnamed: 0,id,name,postal_code
0,1190b261-4fea-4875-9db3-16274474b249,Cavenagh Medical Clinic And Home Care,269695
1,0678cd7f-cc4e-4eaa-b68b-6dce6a37394a,Mei Ling Clinic,140158
2,9ae47061-96bf-45d5-b5ef-d7f3690832a6,Rcmc Rivervale Crescent Medical Centre,541182
3,64291bff-1bbf-449c-b0e1-5d32846b9945,360 Clinic,560407
4,373832a9-e255-4940-b213-314e9d30ad8b,SKY Medical,079027
...,...,...,...
1527,e4f88686-0728-46f9-bed7-2aa24ed26abf,Trinity Medical@Shenton,068908
1528,e50ff677-6e27-4996-8161-8f72c773bd7e,Saudara Clinic By A+J General Physicians,419741
1529,44b973ba-2240-4d99-8028-cf92361fd0c7,Zara Clinic,680026
1530,35fc82d6-7b81-48e5-be5c-8235f70079da,360 Clinic,640221


#### Addresses

In [23]:
# Select every column and row of the Addresses table.
query = "SELECT * FROM Addresses"
# Run the statement on the open connection; the result set becomes a DataFrame.
addresses_df = pd.read_sql_query(query, conn)
# Render the frame as rich notebook output.
display(addresses_df)

Unnamed: 0,id,postal_code,address,latitude,longitude
0,b9e0c89b-692e-4e25-8608-ecb5e2fd2b54,569666,21 ANG MO KIO CENTRAL 2 ANG MO KIO POLYCLINIC ...,1.3743245905856,103.845677779279
1,0785e85d-12e0-49e1-9f73-b190bdfa3a74,389707,21 GEYLANG EAST CENTRAL GEYLANG POLYCLINIC SIN...,1.31949365581957,103.887166041622
2,c0884b68-b4ef-46d0-99e0-96a0f7fa2c90,538829,89 HOUGANG AVENUE 4 HOUGANG POLYCLINIC SINGAPO...,1.3699068381066,103.88900146446
3,66681bee-890e-4349-9867-e09c3ed9dfbd,328263,701 SERANGOON ROAD KALLANG POLYCLINIC AND LONG...,1.31678496747374,103.858752270079
4,69354476-1319-42dc-b793-07626fd8e603,769567,690 YISHUN RING ROAD KHATIB POLYCLINIC SINGAPO...,1.41820402220914,103.834428037791
...,...,...,...,...,...
1096,ac364efb-85a5-4518-9dea-9ae8cccae897,597245,6A SUNSET VALE SINGAPORE 597245,1.32916492200213,103.768880413609
1097,bfda9322-964e-4eed-87fb-3a661161214a,589603,9 JALAN ANAK BUKIT THE RESERVE RESIDENCES SING...,1.33991958805043,103.776935231106
1098,82a0efe4-4ab4-4013-b3e9-530e4b02e639,576167,11 ORCHID DRIVE ADELPHI PARK ESTATE SINGAPORE ...,1.35511540506355,103.82901701112
1099,6ff1ebb6-bc67-4c5c-8801-0ff081b150a7,575835,21 SERAYA CRESCENT SEMBAWANG HILLS ESTATE SING...,1.37327032575504,103.827232710718


#### BookingSlots

In [24]:
# Select every column and row of the BookingSlots table.
query = "SELECT * FROM BookingSlots"
# Run the statement on the open connection; the result set becomes a DataFrame.
bookingslot_df = pd.read_sql_query(query, conn)
# Render the frame as rich notebook output.
display(bookingslot_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,datetime
0,bc7d1dfb-b9ac-432c-89cf-b95e92a5445b,94ccb4aa-00b8-4a5e-80cf-c22a227b312d,3d508ffd-595b-45ab-b6ad-82951932d001,2025-03-03 08:00:00
1,f2f2c146-e3c6-42b5-b0bd-b35fe6052ecb,859d326f-2330-4e0c-bb8a-45f370249666,48f0eeb1-775c-41f3-b304-7b23d97537a8,2025-03-03 08:00:00
2,80b3f301-0552-4ac9-b6fb-2d19c83d8a3e,358c539e-e6cc-4637-ad7e-0168b2468f4f,3d508ffd-595b-45ab-b6ad-82951932d001,2025-03-03 08:00:00
3,0453816e-fd41-49d7-8fb0-7fde21ec78e9,86d1e83c-a705-4314-9d57-7ee15642c9a9,cf1bb2e0-c87d-430f-88ca-e1f85b2d9120,2025-03-03 08:00:00
4,013995fa-a11b-48f5-9261-208e4e351515,749937a4-650b-4538-9e96-993157d68e71,77c89fef-9ea2-4d18-b5f9-b12b48daa3fa,2025-03-03 08:00:00
...,...,...,...,...
5455,6ffe7a3d-f5ec-426a-957c-f95311325209,b6954896-0fe0-4c8c-a94a-cab5d2524a33,3d508ffd-595b-45ab-b6ad-82951932d001,2025-03-31 17:30:00
5456,fe520d79-626c-4c48-9926-707b97705a0a,7320a7f2-f50a-42b0-a61c-50cdde191509,5a1c220f-6241-433c-918f-ab868b01dd9b,2025-03-31 17:30:00
5457,70833523-bdb9-4e5a-8895-b0106534d1f1,7bee5488-3941-4b4f-b0ea-cc28fb993388,48f0eeb1-775c-41f3-b304-7b23d97537a8,2025-03-31 17:30:00
5458,3ade9564-4467-4968-8977-7723a3b99726,8734430f-d151-4bdb-acc5-403323c106a2,cf1bb2e0-c87d-430f-88ca-e1f85b2d9120,2025-03-31 17:30:00


In [25]:
# Count how many booking-slot rows share each distinct `datetime` value.
# The bare expression is the cell's last statement, so the resulting Series
# is shown as the cell output.
bookingslot_df["datetime"].value_counts()

datetime
2025-03-31 17:30:00    13
2025-03-03 08:00:00    13
2025-03-03 08:30:00    13
2025-03-03 09:00:00    13
2025-03-31 09:30:00    13
                       ..
2025-03-03 12:00:00    13
2025-03-03 11:30:00    13
2025-03-03 11:00:00    13
2025-03-03 10:30:00    13
2025-03-03 10:00:00    13
Name: count, Length: 420, dtype: int64

#### VaccineRecords

In [26]:
# Select every column and row of the VaccineRecords table.
query = "SELECT * FROM VaccineRecords"
# Run the statement on the open connection; the result set becomes a DataFrame.
vaccinerecord_df = pd.read_sql_query(query, conn)
# Render the frame as rich notebook output.
display(vaccinerecord_df)

Unnamed: 0,id,user_id,booking_slot_id,status


#### VaccineStockInventory (excluded)

In [27]:
# # Query the VaccineStockInventory table (disabled: this table is excluded from the checks)
# query = "SELECT * FROM VaccineStockInventory"
# # Execute the query and load results into a DataFrame
# vaccinestock_df = pd.read_sql(query, conn)
# # Display the results
# display(vaccinestock_df)

## Close SQLite Database Connection


In [28]:
# Close the connection
# After this call `conn` can no longer be used; the query cells above need
# the connection cell to be run again before they will work.
conn.close()