In [1]:
import re
import sqlite3

import pandas as pd
from IPython.display import display

## Connect to SQLite database


In [2]:
# Relative path to the SQLite database file.
# NOTE: it is spliced into a SQLite URI below, so a path containing `?`, `#`
# or `%` would need percent-encoding first.
sqlite_db_path = "../data/vaccination_db.sqlite"

# Open the database read-only through a SQLite URI. A plain
# sqlite3.connect(path) silently creates an empty database when the file is
# missing, which would make the table listing report zero tables instead of
# failing. With mode=ro a missing file raises sqlite3.OperationalError right
# here, and this inspection notebook cannot modify the data by accident.
conn = sqlite3.connect(f"file:{sqlite_db_path}?mode=ro", uri=True)

## Query Database

To query the database, define the SQL statement as a string named `query`.

Next, execute the query and load the results into a DataFrame: `pd.read_sql(query, conn)`

### Check info for all tables

In [3]:
# Fetch the name of every entry of type 'table' recorded in sqlite_master.
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

print(f"There are {len(tables)} tables in this sqlite dataset")
for table in tables:
    # Each fetched row is a 1-tuple holding the table name.
    table_name = table[0]
    print(f"\nName of tables: {table_name}")

    # Identifiers cannot be bound as SQL parameters, so quote the name
    # explicitly (doubling any embedded double quote). This keeps the count
    # query valid for table names containing spaces or reserved words.
    quoted_name = '"' + table_name.replace('"', '""') + '"'
    cursor.execute(f"SELECT COUNT(*) FROM {quoted_name}")
    row_count = cursor.fetchone()[0]
    print(f"row counts: {row_count}")

There are 7 tables in this sqlite dataset

Name of tables: Users
row counts: 5

Name of tables: Polyclinics
row counts: 26

Name of tables: GeneralPractitioners
row counts: 1532

Name of tables: Addresses
row counts: 1101

Name of tables: Vaccines
row counts: 7

Name of tables: VaccineRecords
row counts: 0

Name of tables: BookingSlots
row counts: 5460


#### Users

In [4]:
# Load every row and column of the Users table into a DataFrame.
query = "SELECT * FROM Users"
user_df = pd.read_sql(sql=query, con=conn)

# Show the loaded frame as this cell's output.
display(user_df)

Unnamed: 0,id,nric,first_name,last_name,email,date_of_birth,gender,password,postal_code,created_at,updated_at
0,020d1b61-98d1-460a-a434-7e4df76e06ed,S9411065U,Mark,Johnson,mark.johnson@example.net,1994-08-18 00:00:00,M,$2b$12$BxNNjECOSOd0h.Ggr5Ebwe1DIxHflDRhArtOYpA...,597245,2025-03-24 15:24:21,2025-03-24 15:24:21
1,909918f5-4099-4f38-a427-f9c404371af5,S6080947P,Stephanie,Miller,stephanie.miller@example.org,1960-01-22 00:00:00,F,$2b$12$kNP6GfYVKSkYElFrQevC9umIQq31aA10Kdq4IfQ...,589603,2025-03-24 15:24:21,2025-03-24 15:24:21
2,5df694d8-6455-4653-9bb4-4426008138cc,S5987208Q,Kathy,Johnson,kathy.johnson@example.com,1959-03-25 00:00:00,F,$2b$12$FzP4fahS/zS0kgYICrcVaOSbA7Audfxuqm2paOS...,576167,2025-03-24 15:24:21,2025-03-24 15:24:21
3,9026d9cb-94f0-4c29-b1b8-74cd669e4593,S6763428M,Joann,Ramirez,joann.ramirez@example.com,1967-05-19 00:00:00,F,$2b$12$X.0LXXUdHM87S9naZvFXeeEDrpxzQEhOHAc0VJM...,575835,2025-03-24 15:24:21,2025-03-24 15:24:21
4,15994d77-9bc4-40d8-aa38-d1e9e548267f,T0196362E,Donald,Lewis,donald.lewis@example.com,2001-11-24 00:00:00,M,$2b$12$4iPz.u2OZhy5BbOuOuglTOzxPld.K.E8bb6eYFY...,520144,2025-03-24 15:24:21,2025-03-24 15:24:21


In [5]:
# ----------
# Check nric
# ----------
# Expected shape: one of the letters S/T/F/G/M, seven digits, one uppercase letter.
nric_regex = r"[STFGM]\d{7}[A-Z]"
# Build the validity mask once with a full-string match and reuse it for both
# the verdict and the listing of offenders. `Series.str.match` only anchors at
# the start of the string, so it would hide values with trailing characters;
# na=False counts missing values as invalid instead of raising.
nric_is_valid = user_df["nric"].str.fullmatch(nric_regex, na=False)
if nric_is_valid.all():
    print("All nrics match the expected format.")
else:
    print("Some nrics do not match the expected format.")
    print(user_df.loc[~nric_is_valid, ["nric"]])

# ----------
# Check name
# ----------
# Combine first name and last name into a full name for checking uniqueness
check_fullName = user_df["first_name"] + " " + user_df["last_name"]
# Report the number of distinct full names; a value below the row count
# means at least one full name is duplicated.
print(f"Number of unique full names: {check_fullName.nunique()}")

# ---------
# Check dob
# ---------
# Accepted date-of-birth window; both bounds are inclusive.
start_date = pd.to_datetime("1920-01-01")
end_date = pd.to_datetime("2025-03-01")

# Parse the column to datetimes so it can be compared with the bounds.
user_df["date_of_birth"] = pd.to_datetime(user_df["date_of_birth"])

# True where start_date <= date_of_birth <= end_date; missing dates yield False.
is_in_range = user_df["date_of_birth"].between(start_date, end_date)

# Report the verdict and, on failure, list the offending rows.
if is_in_range.all():
    print("All date are in range")
else:
    print("Some dates are not in range")
    print(user_df.loc[~is_in_range, ["date_of_birth"]])

All nrics match the expected format.
Number of unique full names: 5
All date are in range


#### Vaccines

In [6]:
# Load every row and column of the Vaccines table into a DataFrame.
query = "SELECT * FROM Vaccines"
vaccines_df = pd.read_sql(sql=query, con=conn)

# Show the loaded frame as this cell's output.
display(vaccines_df)

Unnamed: 0,id,name,price,doses_required,age_criteria,gender_criteria,created_at,updated_at
0,03f8a7a3-f038-4047-95bb-9b5a6348f8b9,Influenza (INF),9.0,1,18+ years old,,2025-03-24 15:26:21,2025-03-24 15:26:21
1,288f7c80-3312-4b2e-9c21-ba8b74550c04,Pneumococcal Conjugate (PCV13),16.0,1,65+ years old,,2025-03-24 15:26:21,2025-03-24 15:26:21
2,6a196471-2035-4d74-bea1-1316bfc9fc22,Human Papillomavirus (HPV),23.0,3,18-26 years old,F,2025-03-24 15:26:21,2025-03-24 15:26:21
3,12e4c2c6-076e-42d8-b266-ffe8aa6239d5,"Tetanus, Diphtheria, Pertussis (Tdap)",10.0,1,18+ years old,F,2025-03-24 15:26:21,2025-03-24 15:26:21
4,d0160ebc-096a-430a-b5b5-953719f888e0,Hepatitis B (HepB),9.0,3,18+ years old,,2025-03-24 15:26:21,2025-03-24 15:26:21
5,469b84a1-ea5e-4bf3-8961-011390956c4e,"Measles, Mumps, Rubella (MMR)",9.0,2,18+ years old,,2025-03-24 15:26:21,2025-03-24 15:26:21
6,621d6ac4-e4eb-44fb-9ac3-33eb4caf2561,Varicella (VAR),11.0,2,18+ years old,,2025-03-24 15:26:21,2025-03-24 15:26:21


#### Polyclinics

In [7]:
# Load every row and column of the Polyclinics table into a DataFrame.
query = "SELECT * FROM Polyclinics"
polyclinic_df = pd.read_sql(sql=query, con=conn)

# Show the loaded frame as this cell's output.
display(polyclinic_df)

Unnamed: 0,id,name,postal_code,created_at,updated_at
0,058eabbc-e1b5-4bfc-9491-ae35c3b65b11,Ang Mo Kio Polyclinic,569666,2025-03-24 15:24:19,2025-03-24 15:24:19
1,fe6e03da-d3a7-4ea4-ba5f-afad891cc1eb,Geylang Polyclinic,389707,2025-03-24 15:24:19,2025-03-24 15:24:19
2,21af55f4-e5ce-4935-8fc8-80592202589e,Hougang Polyclinic,538829,2025-03-24 15:24:19,2025-03-24 15:24:19
3,5cacdeeb-d4ff-44dd-8173-248155841374,Kallang Polyclinic,328263,2025-03-24 15:24:19,2025-03-24 15:24:19
4,3c2b287d-4fa8-4acb-a84b-d84c1b3fb9f7,Khatib Polyclinic,769567,2025-03-24 15:24:19,2025-03-24 15:24:19
5,9589bbd5-148c-4a8e-b0b9-da2da5b0154c,Toa Payoh Polyclinic,319260,2025-03-24 15:24:19,2025-03-24 15:24:19
6,b0c945f5-5370-496b-9cc0-3e9dfd287faa,Sembawang Polyclinic,756973,2025-03-24 15:24:19,2025-03-24 15:24:19
7,1c246056-b987-4885-8cc2-8b5024c921d0,Woodlands Polyclinic,738579,2025-03-24 15:24:19,2025-03-24 15:24:19
8,d2d5cdfb-b3a6-4645-ad56-2946104aeb12,Yishun Polyclinic,768898,2025-03-24 15:24:19,2025-03-24 15:24:19
9,f3b44a5c-229f-4ef4-ac16-654249fefc49,Bukit Batok Polyclinic,659164,2025-03-24 15:24:19,2025-03-24 15:24:19


#### GeneralPractitioners

In [8]:
# Load every row and column of the GeneralPractitioners table into a DataFrame.
query = "SELECT * FROM GeneralPractitioners"
gp_df = pd.read_sql(sql=query, con=conn)

# Show the loaded frame as this cell's output.
display(gp_df)

Unnamed: 0,id,name,postal_code,created_at,updated_at
0,ff62e077-a0c6-40bd-b055-daa78d482162,Cavenagh Medical Clinic And Home Care,269695,2025-03-24 15:24:19,2025-03-24 15:24:19
1,9dff00f9-51ef-4002-a3e9-eb4d61d42fa0,Mei Ling Clinic,140158,2025-03-24 15:24:19,2025-03-24 15:24:19
2,f84c9a11-68de-460b-8482-4d2f2ede5c69,Rcmc Rivervale Crescent Medical Centre,541182,2025-03-24 15:24:19,2025-03-24 15:24:19
3,f3b9bd75-41fa-4f83-aab9-5d5257235345,360 Clinic,560407,2025-03-24 15:24:19,2025-03-24 15:24:19
4,47f1787c-f4f5-4946-ad9d-0ec581df2d40,SKY Medical,079027,2025-03-24 15:24:19,2025-03-24 15:24:19
...,...,...,...,...,...
1527,d0cbee11-533d-43dd-8137-658312495569,Trinity Medical@Shenton,068908,2025-03-24 15:24:19,2025-03-24 15:24:19
1528,a30a6fbb-fcf4-4b1e-b26b-7d6d417ecc46,Saudara Clinic By A+J General Physicians,419741,2025-03-24 15:24:19,2025-03-24 15:24:19
1529,655969dc-ddd5-4287-abe1-7ce1bef978b3,Zara Clinic,680026,2025-03-24 15:24:19,2025-03-24 15:24:19
1530,41a488e0-093a-4ddc-b00b-d8c20335d537,360 Clinic,640221,2025-03-24 15:24:19,2025-03-24 15:24:19


#### Addresses

In [9]:
# Load every row and column of the Addresses table into a DataFrame.
query = "SELECT * FROM Addresses"
addresses_df = pd.read_sql(sql=query, con=conn)

# Show the loaded frame as this cell's output.
display(addresses_df)

Unnamed: 0,id,postal_code,address,latitude,longitude,created_at,updated_at
0,8e1d8be3-9409-4f3b-a08e-6267e9ace574,569666,21 ANG MO KIO CENTRAL 2 ANG MO KIO POLYCLINIC ...,1.3743245905856,103.845677779279,2025-03-24 15:26:21,2025-03-24 15:26:21
1,0a9aa8e6-f4eb-4366-92c3-17e26c554c0d,389707,21 GEYLANG EAST CENTRAL GEYLANG POLYCLINIC SIN...,1.31949365581957,103.887166041622,2025-03-24 15:26:21,2025-03-24 15:26:21
2,07743fda-1871-43d6-b775-eabf56afeeb0,538829,89 HOUGANG AVENUE 4 HOUGANG POLYCLINIC SINGAPO...,1.3699068381066,103.88900146446,2025-03-24 15:26:21,2025-03-24 15:26:21
3,127c0399-0039-4bec-8a6a-64d78c4d933e,328263,701 SERANGOON ROAD KALLANG POLYCLINIC AND LONG...,1.31678496747374,103.858752270079,2025-03-24 15:26:21,2025-03-24 15:26:21
4,d66913a1-18ed-4fe6-ab34-2df017375711,769567,690 YISHUN RING ROAD KHATIB POLYCLINIC SINGAPO...,1.41820402220914,103.834428037791,2025-03-24 15:26:21,2025-03-24 15:26:21
...,...,...,...,...,...,...,...
1096,f7a3f50e-cea7-470b-9cda-5052f99415b4,597245,6A SUNSET VALE SINGAPORE 597245,1.32916492200213,103.768880413609,2025-03-24 15:26:21,2025-03-24 15:26:21
1097,34f8f813-646b-4a37-a82c-7da6935dc6a3,589603,9 JALAN ANAK BUKIT THE RESERVE RESIDENCES SING...,1.33991958805043,103.776935231106,2025-03-24 15:26:21,2025-03-24 15:26:21
1098,cc9610db-7624-494c-a79b-ba059c8d3e0d,576167,11 ORCHID DRIVE ADELPHI PARK ESTATE SINGAPORE ...,1.35511540506355,103.82901701112,2025-03-24 15:26:21,2025-03-24 15:26:21
1099,7ed367b0-9140-4423-932d-57691f05f0e2,575835,21 SERAYA CRESCENT SEMBAWANG HILLS ESTATE SING...,1.37327032575504,103.827232710718,2025-03-24 15:26:21,2025-03-24 15:26:21


#### BookingSlots

In [10]:
# Load every row and column of the BookingSlots table into a DataFrame.
query = "SELECT * FROM BookingSlots"
bookingslot_df = pd.read_sql(sql=query, con=conn)

# Show the loaded frame as this cell's output.
display(bookingslot_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,datetime,created_at,updated_at
0,6a399313-734b-4e0f-bd6a-548845af8bab,89d9fdf8-9067-43a4-b70a-98d24f506c82,6a196471-2035-4d74-bea1-1316bfc9fc22,2025-03-03 08:00:00,2025-03-24 15:26:21,2025-03-24 15:26:21
1,4957314a-a58f-4cca-b2a2-d6efebd15de8,e00d4623-22b1-402a-80f0-0e619dcb6738,469b84a1-ea5e-4bf3-8961-011390956c4e,2025-03-03 08:00:00,2025-03-24 15:26:21,2025-03-24 15:26:21
2,c7c99668-630f-4537-9d45-098d2f749558,a99a48e9-42d3-418e-9a53-bc558c6d80ee,d0160ebc-096a-430a-b5b5-953719f888e0,2025-03-03 08:00:00,2025-03-24 15:26:21,2025-03-24 15:26:21
3,ce8ebc4c-2d12-49ee-8c65-e00fa21847cd,b0c945f5-5370-496b-9cc0-3e9dfd287faa,03f8a7a3-f038-4047-95bb-9b5a6348f8b9,2025-03-03 08:00:00,2025-03-24 15:26:21,2025-03-24 15:26:21
4,4ea98756-1586-4376-8c10-24e6d32ff301,5cacdeeb-d4ff-44dd-8173-248155841374,d0160ebc-096a-430a-b5b5-953719f888e0,2025-03-03 08:00:00,2025-03-24 15:26:21,2025-03-24 15:26:21
...,...,...,...,...,...,...
5455,aee74f7c-e62b-4a49-8b42-2228a6ca9401,058eabbc-e1b5-4bfc-9491-ae35c3b65b11,03f8a7a3-f038-4047-95bb-9b5a6348f8b9,2025-03-31 17:30:00,2025-03-24 15:26:21,2025-03-24 15:26:21
5456,71e3beeb-8c0e-427e-980c-48d98409412e,d2d5cdfb-b3a6-4645-ad56-2946104aeb12,6a196471-2035-4d74-bea1-1316bfc9fc22,2025-03-31 17:30:00,2025-03-24 15:26:21,2025-03-24 15:26:21
5457,5680f642-a002-4b11-b847-bfe64422a828,a99a48e9-42d3-418e-9a53-bc558c6d80ee,621d6ac4-e4eb-44fb-9ac3-33eb4caf2561,2025-03-31 17:30:00,2025-03-24 15:26:21,2025-03-24 15:26:21
5458,e50117c7-6cb0-4876-9792-886c8c2914fe,3c2b287d-4fa8-4acb-a84b-d84c1b3fb9f7,12e4c2c6-076e-42d8-b266-ffe8aa6239d5,2025-03-31 17:30:00,2025-03-24 15:26:21,2025-03-24 15:26:21


In [11]:
# Count how many booking slots share each `datetime` value.
slot_datetimes = bookingslot_df["datetime"]
slot_datetimes.value_counts()

datetime
2025-03-31 17:30:00    13
2025-03-03 08:00:00    13
2025-03-03 08:30:00    13
2025-03-03 09:00:00    13
2025-03-31 09:30:00    13
                       ..
2025-03-03 12:00:00    13
2025-03-03 11:30:00    13
2025-03-03 11:00:00    13
2025-03-03 10:30:00    13
2025-03-03 10:00:00    13
Name: count, Length: 420, dtype: int64

#### VaccineRecords

In [12]:
# Load every row and column of the VaccineRecords table into a DataFrame.
query = "SELECT * FROM VaccineRecords"
vaccinerecord_df = pd.read_sql(sql=query, con=conn)

# Show the loaded frame as this cell's output.
display(vaccinerecord_df)

Unnamed: 0,id,user_id,booking_slot_id,status,created_at,updated_at


#### VaccineStockInventory (excluded)

In [13]:
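# NOTE: kept disabled - VaccineStockInventory is not among the tables listed from sqlite_master earlier in this notebook.
# # Query the VaccineStockInventory table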
# query = "SELECT * FROM VaccineStockInventory"
# # Execute the query and load results into a DataFrame
# vaccinestock_df = pd.read_sql(query, conn)
# # Display the results
# display(vaccinestock_df)

## Close SQLite Database Connection


In [14]:
# Close the SQLite connection opened at the top of the notebook; cells that
# use `conn` cannot be re-run after this until the connection cell is executed again.
conn.close()