In [55]:
import re
import sqlite3

import pandas as pd
from IPython.display import display

## Connect to SQLite database


In [56]:
# Define SQLite database file path
sqlite_db_path = "../data/vaccination_db.sqlite"

# Reconnect to SQLite database
conn = sqlite3.connect(sqlite_db_path)

## Query Database

To query the database, define the `query`.

Next, execute the query and load the results into a DataFrame: `pd.read_sql_query(query, conn)`


### Check info for all tables


In [57]:
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

print(f"There are {len(tables)} tables in this sqlite dataset")
for table in tables:
    table_name = table[0]
    print(f"\nName of tables: {table_name}")

    # count rows for the table
    cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
    row_count = cursor.fetchone()[0]
    print(f"row counts: {row_count}")
    # df = pd.read_sql(f"SELECT * FROM {table_name}", conn)
    # print(df.describe())

There are 7 tables in this sqlite dataset

Name of tables: Users
row counts: 5

Name of tables: Clinics
row counts: 1558

Name of tables: Addresses
row counts: 1101

Name of tables: Vaccines
row counts: 13

Name of tables: VaccineCriteria
row counts: 41

Name of tables: BookingSlots
row counts: 11180

Name of tables: VaccineRecords
row counts: 0


## Users


In [58]:
# Query the Vaccines table
query = "SELECT * FROM Users"
# Execute the query and load results into a DataFrame
user_df = pd.read_sql(query, conn)
# Display the results
display(user_df)

Unnamed: 0,id,address_id,enrolled_clinic_id,nric,first_name,last_name,email,date_of_birth,gender,password,created_at,updated_at
0,812cf9f3-05be-429b-84d8-60b0fccac402,eb3c6dee-c95e-4595-92f5-9b574702369d,,S9489143A,Mark,Johnson,mark.johnson@example.net,1994-08-27,M,$2b$12$kly/wBEGaHIDWw.1RWGNtOoz7I1UZbOqw7ShG8s...,2025-04-02 03:56:35,2025-04-02 03:56:35
1,b14a41d8-5819-4685-93fa-d46434f32e0a,6708a9a1-88fe-47f2-80a3-518a01d75fe9,042e6bc3-6cd0-4808-909c-16fbde02d4ee,S6091645N,Stephanie,Miller,stephanie.miller@example.org,1960-01-31,F,$2b$12$Zpo8JyOjvPF2sHBZzakUO.WYt8wIsT9pp.2qXNL...,2025-04-02 03:56:35,2025-04-02 03:56:35
2,d9d303b7-240c-4320-ae6c-e3d7ef914ce3,059d2e2a-8b8d-466c-a644-19ab8039288b,a0e4b2dc-4bff-414f-ae07-00584aef455a,S5962037V,Jonathan,Johnson,jonathan.johnson@example.com,1959-04-03,M,$2b$12$LGg25.ziP0jnuamBfeRx6.UvzRHcqLnyZjajdcB...,2025-04-02 03:56:35,2025-04-02 03:56:35
3,fcef2a1c-a9bc-4c27-a7bd-3aedc4ce4c53,9c1728ef-581c-4956-a8ab-ee77eb775be8,4acb36b8-ffba-4db9-9893-70a3a81c8349,S6789723Q,Joann,Ramirez,joann.ramirez@example.com,1967-05-28,F,$2b$12$Md3jkP7vNprR2H/FZQOOeuG6X/9j6qdt.zAKEkS...,2025-04-02 03:56:35,2025-04-02 03:56:35
4,4d1ae0f6-dcbe-4b30-ba7d-9df9e1e9a95a,fe68a41d-6ee2-449b-9b5a-68103a7f676f,dd30df9a-de3b-401c-890c-66b33d556bbd,T0144555I,Diana,Lewis,diana.lewis@example.com,2001-12-03,F,$2b$12$GY7xUYyMlGQlVQ1EzWsNqujEMvp2UGNXllBRFrN...,2025-04-02 03:56:35,2025-04-02 03:56:35


In [59]:
# ----------
# Check nric
# ----------
# Regex pattern for nric
nric_regex = r"[STFGM]\d{7}[A-Z]"
# Verify all nrics match the expected pattern
if all(re.fullmatch(nric_regex, nric) for nric in user_df["nric"]):
    print("All nrics match the expected format.")
else:
    print("Some nrics do not match the expected format.")
    print(user_df.loc[~user_df["nric"].str.match(nric_regex), ["nric"]])

# ----------
# Check name
# ----------
# Combine first name and last name into a full name for checking uniqueness
check_fullName = user_df["first_name"] + " " + user_df["last_name"]
# Check that the full names are all unique
print(f"Number of unique full names: {check_fullName.nunique()}")

# ---------
# Check dob
# ---------
start_date = "1920-01-01"
end_date = "2025-03-01"

user_df["date_of_birth"] = pd.to_datetime(user_df["date_of_birth"])
start_date = pd.to_datetime(start_date)
end_date = pd.to_datetime(end_date)

is_in_range = (user_df["date_of_birth"] >= start_date) & (
    user_df["date_of_birth"] <= end_date
)

# check if the date is under range
if is_in_range.all():
    print("All date are in range")
else:
    print("Some dates are not in range")

All nrics match the expected format.
Number of unique full names: 5
All date are in range


## Clinics


In [60]:
# Query the Vaccines table
query = "SELECT * FROM Clinics"
# Execute the query and load results into a DataFrame
clinics_df = pd.read_sql(query, conn)
# Display the results
display(clinics_df)

Unnamed: 0,id,address_id,name,type,created_at,updated_at
0,ca848978-29e9-4345-8233-6d968543cbec,c83ac8b7-8506-4408-95ae-8cc0e5079c2a,Ang Mo Kio Polyclinic,polyclinic,2025-04-02 03:56:35,2025-04-02 03:56:35
1,5244d263-c742-415b-aa42-9c79e745d272,759cbf94-0cd4-4e14-844c-20bb6f3929c5,Geylang Polyclinic,polyclinic,2025-04-02 03:56:35,2025-04-02 03:56:35
2,3010c7a1-52b2-4df1-9922-8c67236ca4e3,1c364832-89e7-453b-b846-88841cc26727,Hougang Polyclinic,polyclinic,2025-04-02 03:56:35,2025-04-02 03:56:35
3,00ce402a-08ce-4be5-85d8-ad176779c60c,a97e81bd-f4c2-4f34-a8cd-a34f40fb8c89,Kallang Polyclinic,polyclinic,2025-04-02 03:56:35,2025-04-02 03:56:35
4,d66f950a-694e-435c-b0af-ea43f80391a0,962c3c1c-8809-4182-a918-4c0e14682f32,Khatib Polyclinic,polyclinic,2025-04-02 03:56:35,2025-04-02 03:56:35
...,...,...,...,...,...,...
1553,918370f4-fbe8-44c0-9017-7492fc922661,caa88de6-c9e2-4dfd-bb37-8bb21b56bfce,Trinity Medical@Shenton,gp,2025-04-02 03:56:35,2025-04-02 03:56:35
1554,98ab4ef0-9344-4f8a-acf1-663e193f03d8,4256935f-fb2b-4d1f-8ea4-78bfc1a4df53,Saudara Clinic By A+J General Physicians,gp,2025-04-02 03:56:35,2025-04-02 03:56:35
1555,20055a21-31ec-46d0-8a3a-c562089bf9cd,4a1d1d1b-765b-428b-8733-eba7bf8bbe76,Zara Clinic,gp,2025-04-02 03:56:35,2025-04-02 03:56:35
1556,f4f9e8f6-1416-4b21-993b-86bee5c62684,79e2aa5a-6dac-4b23-93b4-6693965b87e9,360 Clinic,gp,2025-04-02 03:56:35,2025-04-02 03:56:35


## Addresses


In [61]:
# Query the Vaccines table
query = "SELECT * FROM Addresses"
# Execute the query and load results into a DataFrame
addresses_df = pd.read_sql(query, conn)
# Display the results
display(addresses_df)

Unnamed: 0,id,postal_code,address,latitude,longitude,created_at,updated_at
0,c83ac8b7-8506-4408-95ae-8cc0e5079c2a,569666,21 ANG MO KIO CENTRAL 2 ANG MO KIO POLYCLINIC ...,1.374325,103.845678,2025-04-02 03:56:35,2025-04-02 03:56:35
1,759cbf94-0cd4-4e14-844c-20bb6f3929c5,389707,21 GEYLANG EAST CENTRAL GEYLANG POLYCLINIC SIN...,1.319494,103.887166,2025-04-02 03:56:35,2025-04-02 03:56:35
2,1c364832-89e7-453b-b846-88841cc26727,538829,89 HOUGANG AVENUE 4 HOUGANG POLYCLINIC SINGAPO...,1.369907,103.889001,2025-04-02 03:56:35,2025-04-02 03:56:35
3,a97e81bd-f4c2-4f34-a8cd-a34f40fb8c89,328263,701 SERANGOON ROAD KALLANG POLYCLINIC AND LONG...,1.316785,103.858752,2025-04-02 03:56:35,2025-04-02 03:56:35
4,962c3c1c-8809-4182-a918-4c0e14682f32,769567,690 YISHUN RING ROAD KHATIB POLYCLINIC SINGAPO...,1.418204,103.834428,2025-04-02 03:56:35,2025-04-02 03:56:35
...,...,...,...,...,...,...,...
1096,eb3c6dee-c95e-4595-92f5-9b574702369d,249393,6 HOOT KIAM ROAD SINGAPORE 249393,1.296160,103.830433,2025-04-02 03:56:35,2025-04-02 03:56:35
1097,6708a9a1-88fe-47f2-80a3-518a01d75fe9,258201,22 ROBIN ROAD SINGAPORE 258201,1.317570,103.828201,2025-04-02 03:56:35,2025-04-02 03:56:35
1098,059d2e2a-8b8d-466c-a644-19ab8039288b,208511,101 KITCHENER ROAD JALAN BESAR PLAZA SINGAPORE...,1.308658,103.858067,2025-04-02 03:56:35,2025-04-02 03:56:35
1099,9c1728ef-581c-4956-a8ab-ee77eb775be8,109923,3 BUKIT CHERMIN ROAD LEARNING SEEDS CHILD DEVE...,1.268826,103.811916,2025-04-02 03:56:35,2025-04-02 03:56:35


## Vaccines


In [62]:
# Query the Vaccines table
query = "SELECT * FROM Vaccines"
# Execute the query and load results into a DataFrame
vaccines_df = pd.read_sql(query, conn)
# Display the results
display(vaccines_df)

Unnamed: 0,id,name,created_at,updated_at
0,b79e4769-a79f-4142-96fc-da07d538cf9e,Influenza (INF),2025-04-02 03:56:35,2025-04-02 03:56:35
1,681b0970-aac4-4e10-b71a-95904751de8a,Pneumococcal conjugate (PCV13),2025-04-02 03:56:35,2025-04-02 03:56:35
2,cb5b468c-5403-465e-8980-3105fc0a6570,Pneumococcal polysaccharide (PPSV23),2025-04-02 03:56:35,2025-04-02 03:56:35
3,faebabbd-8d6d-4f2f-b54c-5dfbe411ae11,"Tetanus, reduced diphtheria and acellular pert...",2025-04-02 03:56:35,2025-04-02 03:56:35
4,b6d6ead4-ab98-4d83-9057-c0f1a7347d6f,Human papillomavirus (HPV2 or HPV4),2025-04-02 03:56:35,2025-04-02 03:56:35
5,82104f64-fa90-41b2-a48b-4418cfebcc4b,Hepatitis B (HepB),2025-04-02 03:56:35,2025-04-02 03:56:35
6,e9b88fcc-5474-44c5-b040-a5b3c21571ce,"Measles, mumps and rubella (MMR)",2025-04-02 03:56:35,2025-04-02 03:56:35
7,6561d897-3ca3-44da-bb66-551d5ec104b2,Varicella (VAR),2025-04-02 03:56:35,2025-04-02 03:56:35
8,b5e16dea-3fec-4449-a494-61ca2c85a0db,Bacillus Calmette-Guérin (BCG),2025-04-02 03:56:35,2025-04-02 03:56:35
9,94d10a24-04eb-4c18-bb42-ad6d7132efe2,"Diphtheria, tetanus and acellular pertussis (D...",2025-04-02 03:56:35,2025-04-02 03:56:35


## VaccineCriteria


In [63]:
# Query the VaccineCriteria table
query = "SELECT * FROM VaccineCriteria"
# Execute the query and load results into a DataFrame
vaccineCriteria_df = pd.read_sql(query, conn)
# Display the results
display(vaccineCriteria_df)

Unnamed: 0,id,vaccine_id,age_criteria,gender_criteria,health_condition_criteria,doses_required,frequency,created_at,updated_at
0,126d0049-383f-4978-9c4c-a89c23728687,b79e4769-a79f-4142-96fc-da07d538cf9e,18-64 years,,Specific medical conditions or indications,1,Annually or per season,2025-04-02 03:56:35,2025-04-02 03:56:35
1,617c7996-546a-4b6c-b5bb-3e2cdd3bf0a1,b79e4769-a79f-4142-96fc-da07d538cf9e,65+ years,,,1,Annually or per season,2025-04-02 03:56:35,2025-04-02 03:56:35
2,8bdca1b3-dceb-4fd4-b8dd-d5398fc42133,681b0970-aac4-4e10-b71a-95904751de8a,18-64 years,,Specific medical conditions or indications,1,Once,2025-04-02 03:56:35,2025-04-02 03:56:35
3,34ae6224-b415-4989-956b-050865b08be3,681b0970-aac4-4e10-b71a-95904751de8a,65+ years,,,1,Once,2025-04-02 03:56:35,2025-04-02 03:56:35
4,a8bf08bd-2d54-4da4-a92b-6536139ac3f3,cb5b468c-5403-465e-8980-3105fc0a6570,18-64 years,,Depending on indication,1,Once,2025-04-02 03:56:35,2025-04-02 03:56:35
5,15d78fae-31fd-4680-8fe0-f9e7bb3d3b20,cb5b468c-5403-465e-8980-3105fc0a6570,65+ years,,,1,Once,2025-04-02 03:56:35,2025-04-02 03:56:35
6,1e4f3f91-8dbb-4c63-9174-7f05167ece9e,faebabbd-8d6d-4f2f-b54c-5dfbe411ae11,18+ years,,Pregnancy,1,Per pregnancy,2025-04-02 03:56:35,2025-04-02 03:56:35
7,ba6d8ab5-4532-462e-95f4-18dde61a1765,b6d6ead4-ab98-4d83-9057-c0f1a7347d6f,18-26 years,F,Unvaccinated adults or uncertain history,3,Once,2025-04-02 03:56:35,2025-04-02 03:56:35
8,cd79fd4e-ffa4-48b4-b84d-cdc5814da685,82104f64-fa90-41b2-a48b-4418cfebcc4b,18+ years,,Unvaccinated adults or uncertain history,3,Once,2025-04-02 03:56:35,2025-04-02 03:56:35
9,2c02d8df-9b3a-4e8f-965d-95ad8f4dbab3,e9b88fcc-5474-44c5-b040-a5b3c21571ce,18+ years,,Unvaccinated adults or uncertain history,2,Once,2025-04-02 03:56:35,2025-04-02 03:56:35


## BookingSlots


In [64]:
# Query the Vaccines table
query = "SELECT * FROM BookingSlots"
# Execute the query and load results into a DataFrame
bookingslot_df = pd.read_sql(query, conn)
# Display the results
display(bookingslot_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,datetime,created_at,updated_at
0,ba19c4e2-88a1-4feb-881b-38f0aa0da73c,dbcd1f73-cb22-484f-ab60-a8b931bd6033,cb5b468c-5403-465e-8980-3105fc0a6570,2025-03-03 08:00:00,2025-04-02 03:56:35,2025-04-02 03:56:35
1,161c078b-fb75-40b8-a850-311657fffc0a,5ac611d4-abfe-4008-a254-508ea1038161,82104f64-fa90-41b2-a48b-4418cfebcc4b,2025-03-03 08:00:00,2025-04-02 03:56:35,2025-04-02 03:56:35
2,b27d7807-69a5-4408-93ea-c08b54895001,adc2e872-c0bf-499a-bf0f-3f6bc057e5fb,681b0970-aac4-4e10-b71a-95904751de8a,2025-03-03 08:00:00,2025-04-02 03:56:35,2025-04-02 03:56:35
3,63e49af2-503c-4ab0-9de9-16df11f98671,7a6bc0a2-c61c-4129-8740-7dd7e5cc5bef,faebabbd-8d6d-4f2f-b54c-5dfbe411ae11,2025-03-03 08:00:00,2025-04-02 03:56:35,2025-04-02 03:56:35
4,db921906-c271-4f46-b0b0-476f171ab14d,db8e14db-f153-4f84-ab0d-439d3dfb1a0a,b6d6ead4-ab98-4d83-9057-c0f1a7347d6f,2025-03-03 08:00:00,2025-04-02 03:56:35,2025-04-02 03:56:35
...,...,...,...,...,...,...
11175,1d6f4133-f222-4442-a00f-7091198513cc,a0e4b2dc-4bff-414f-ae07-00584aef455a,94d10a24-04eb-4c18-bb42-ad6d7132efe2,2025-04-30 17:30:00,2025-04-02 03:56:35,2025-04-02 03:56:35
11176,baf7efb5-ae74-4501-8d98-692fe45f1724,3010c7a1-52b2-4df1-9922-8c67236ca4e3,b5e16dea-3fec-4449-a494-61ca2c85a0db,2025-04-30 17:30:00,2025-04-02 03:56:35,2025-04-02 03:56:35
11177,ec2c83f0-ff14-4231-ac4d-f45c4e401909,1b006fbf-6cea-4b4d-aebc-1c0868fa5c0b,681b0970-aac4-4e10-b71a-95904751de8a,2025-04-30 17:30:00,2025-04-02 03:56:35,2025-04-02 03:56:35
11178,401216ee-4f39-4ea8-9d49-1b41fb57f6c6,27d4f24d-232f-4cc3-9d32-93d40cf282d8,cb5b468c-5403-465e-8980-3105fc0a6570,2025-04-30 17:30:00,2025-04-02 03:56:35,2025-04-02 03:56:35


In [65]:
bookingslot_df["datetime"].value_counts()

datetime
2025-04-30 17:30:00    13
2025-03-03 08:00:00    13
2025-03-03 08:30:00    13
2025-03-03 09:00:00    13
2025-03-03 09:30:00    13
                       ..
2025-03-03 16:00:00    13
2025-03-03 15:30:00    13
2025-03-03 15:00:00    13
2025-03-03 14:30:00    13
2025-03-03 14:00:00    13
Name: count, Length: 860, dtype: int64

## Vaccine Records


In [66]:
# Query the Vaccines table
query = "SELECT * FROM VaccineRecords"
# Execute the query and load results into a DataFrame
vaccinerecord_df = pd.read_sql(query, conn)
# Display the results
display(vaccinerecord_df)

Unnamed: 0,id,user_id,booking_slot_id,status,created_at,updated_at


## Close SQLite Database Connection


In [67]:
# Close the connection
conn.close()