In [1]:
import re
import sqlite3

import pandas as pd
from IPython.display import display

## Connect to SQLite database


In [2]:
# Define SQLite database file path
sqlite_db_path = "../data/vaccination_db.sqlite"

# Reconnect to SQLite database
conn = sqlite3.connect(sqlite_db_path)

## Query Database

To query the database, define the `query`.

Next, execute the query and load the results into a DataFrame: `pd.read_sql_query(query, conn)`


### Check info for all tables


In [3]:
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

print(f"There are {len(tables)} tables in this sqlite dataset")
for table in tables:
    table_name = table[0]
    print(f"\nName of tables: {table_name}")

    # count rows for the table
    cursor.execute(f"SELECT COUNT(*) FROM {table_name}")
    row_count = cursor.fetchone()[0]
    print(f"row counts: {row_count}")
    # df = pd.read_sql(f"SELECT * FROM {table_name}", conn)
    # print(df.describe())

There are 6 tables in this sqlite dataset

Name of tables: Users
row counts: 5

Name of tables: Clinics
row counts: 1558

Name of tables: Addresses
row counts: 1101

Name of tables: Vaccines
row counts: 7

Name of tables: BookingSlots
row counts: 11180

Name of tables: VaccineRecords
row counts: 0


## Users


In [4]:
# Query the Vaccines table
query = "SELECT * FROM Users"
# Execute the query and load results into a DataFrame
user_df = pd.read_sql(query, conn)
# Display the results
display(user_df)

Unnamed: 0,id,address_id,enrolled_clinic_id,nric,first_name,last_name,email,date_of_birth,gender,password,created_at,updated_at
0,f2967c0a-5121-4c46-a4dc-2b7979cfb421,918a15ec-2333-4ed3-a4b7-39a66cc92ffa,,S9071950Q,Mark,Johnson,mark.johnson@example.net,1990-11-21,M,$2b$12$gf1LfqFawqeRB1xqfYzITulmwjcAhOUdsP09FqZ...,2025-03-25 07:55:49,2025-03-25 07:55:49
1,e5723722-2395-447f-9512-ea0f17fac47f,797a0d10-0e78-4426-856c-0823a957a090,983a9652-d9a5-4560-a6b8-d7befb3fef45,S8257099H,Kimberly,Garza,kimberly.garza@example.net,1982-02-26,F,$2b$12$ITAS7rMGe475rz.1x53sPe8sxMEYl9xbDh53Q3o...,2025-03-25 07:55:49,2025-03-25 07:55:49
2,fd20c08f-8b5b-4dc0-b110-5e4decd0aea9,44851715-59cb-4ee3-81c3-11ced10d5fdc,f53f8540-5d7b-4dde-b806-b638ae0615f4,S9550684U,Justin,Baldwin,justin.baldwin@example.com,1995-03-22,M,$2b$12$atm0zwhIWEUQgpOq/oIUc.1TGMOogrMAZnvhp/c...,2025-03-25 07:55:49,2025-03-25 07:55:49
3,f5b0dfd9-4a27-49c2-aa6a-c05163bfbeaa,7b9479b8-4384-48d1-b6d6-8e2d1ee2545b,9c84ac40-ac28-4cd7-8fb5-516473c65719,T0478320K,Abigail,Shaffer,abigail.shaffer@example.org,2004-07-13,F,$2b$12$/az9tuFgGPDzR7BkkiPZwO8n1m00CiMsLdnhYEX...,2025-03-25 07:55:49,2025-03-25 07:55:49
4,ed5d95e8-0eaf-4101-92da-8149d99b53f5,172f4d51-5c70-4f32-8633-87aaf0eb4847,66648c66-52cd-48f4-8198-cd7d45919430,S9511354W,Gabrielle,Davis,gabrielle.davis@example.com,1995-01-29,F,$2b$12$clgwLm467Hn2N40vlzBegOMQGCITn8jzU7CU0ge...,2025-03-25 07:55:49,2025-03-25 07:55:49


In [5]:
# ----------
# Check nric
# ----------
# Regex pattern for nric
nric_regex = r"[STFGM]\d{7}[A-Z]"
# Verify all nrics match the expected pattern
if all(re.fullmatch(nric_regex, nric) for nric in user_df["nric"]):
    print("All nrics match the expected format.")
else:
    print("Some nrics do not match the expected format.")
    print(user_df.loc[~user_df["nric"].str.match(nric_regex), ["nric"]])

# ----------
# Check name
# ----------
# Combine first name and last name into a full name for checking uniqueness
check_fullName = user_df["first_name"] + " " + user_df["last_name"]
# Check that the full names are all unique
print(f"Number of unique full names: {check_fullName.nunique()}")

# ---------
# Check dob
# ---------
start_date = "1920-01-01"
end_date = "2025-03-01"

user_df["date_of_birth"] = pd.to_datetime(user_df["date_of_birth"])
start_date = pd.to_datetime(start_date)
end_date = pd.to_datetime(end_date)

is_in_range = (user_df["date_of_birth"] >= start_date) & (
    user_df["date_of_birth"] <= end_date
)

# check if the date is under range
if is_in_range.all():
    print("All date are in range")
else:
    print("Some dates are not in range")

All nrics match the expected format.
Number of unique full names: 5
All date are in range


## Clinics


In [6]:
# Query the Vaccines table
query = "SELECT * FROM Clinics"
# Execute the query and load results into a DataFrame
clinics_df = pd.read_sql(query, conn)
# Display the results
display(clinics_df)

Unnamed: 0,id,address_id,name,type,created_at,updated_at
0,1d2e196d-af66-48f1-8f2c-0e23ad0ecde3,fa03e01a-219b-4ed1-91e6-14238e533df0,Ang Mo Kio Polyclinic,polyclinic,2025-03-25 07:55:49,2025-03-25 07:55:49
1,859fe92f-9c4b-4f3d-beda-b6c96e67887f,73532a10-1eff-4b9c-a36b-d5f7523bf760,Geylang Polyclinic,polyclinic,2025-03-25 07:55:49,2025-03-25 07:55:49
2,76da5477-df98-49d2-99d5-ba6d3900c3b7,77ac2094-04eb-483e-927d-076602cde3b7,Hougang Polyclinic,polyclinic,2025-03-25 07:55:49,2025-03-25 07:55:49
3,1489b9d7-5c8b-4df8-a614-70e76eb35722,bffe5d37-8def-4b9b-80e3-10e30a6734f8,Kallang Polyclinic,polyclinic,2025-03-25 07:55:49,2025-03-25 07:55:49
4,84b1aa83-156c-4eca-a9bf-ddc72424de1a,02931488-c56f-4c2f-8695-af6d7c28e1ba,Khatib Polyclinic,polyclinic,2025-03-25 07:55:49,2025-03-25 07:55:49
...,...,...,...,...,...,...
1553,1e618339-2b5e-4386-9696-d4b27874f4be,b45669cd-00d4-42bf-b94f-15d759fb0978,Trinity Medical@Shenton,gp,2025-03-25 07:55:49,2025-03-25 07:55:49
1554,d3931e52-a261-4d65-a91a-9ecfcad14ee0,f435aa5a-56b0-46d8-a073-d7d6986fd7ae,Saudara Clinic By A+J General Physicians,gp,2025-03-25 07:55:49,2025-03-25 07:55:49
1555,7d44d74b-ef2b-40d0-a18d-8f5487e28371,40d044c7-e40b-4d39-ad1f-321fdc0895dc,Zara Clinic,gp,2025-03-25 07:55:49,2025-03-25 07:55:49
1556,1e346932-a436-432a-b2c4-1d77b25df77a,21822a2b-6b6d-43ad-a2a9-93cb1411a01c,360 Clinic,gp,2025-03-25 07:55:49,2025-03-25 07:55:49


## Addresses


In [7]:
# Query the Vaccines table
query = "SELECT * FROM Addresses"
# Execute the query and load results into a DataFrame
addresses_df = pd.read_sql(query, conn)
# Display the results
display(addresses_df)

Unnamed: 0,id,postal_code,address,latitude,longitude,created_at,updated_at
0,fa03e01a-219b-4ed1-91e6-14238e533df0,569666,21 ANG MO KIO CENTRAL 2 ANG MO KIO POLYCLINIC ...,1.374325,103.845678,2025-03-25 07:55:49,2025-03-25 07:55:49
1,73532a10-1eff-4b9c-a36b-d5f7523bf760,389707,21 GEYLANG EAST CENTRAL GEYLANG POLYCLINIC SIN...,1.319494,103.887166,2025-03-25 07:55:49,2025-03-25 07:55:49
2,77ac2094-04eb-483e-927d-076602cde3b7,538829,89 HOUGANG AVENUE 4 HOUGANG POLYCLINIC SINGAPO...,1.369907,103.889001,2025-03-25 07:55:49,2025-03-25 07:55:49
3,bffe5d37-8def-4b9b-80e3-10e30a6734f8,328263,701 SERANGOON ROAD KALLANG POLYCLINIC AND LONG...,1.316785,103.858752,2025-03-25 07:55:49,2025-03-25 07:55:49
4,02931488-c56f-4c2f-8695-af6d7c28e1ba,769567,690 YISHUN RING ROAD KHATIB POLYCLINIC SINGAPO...,1.418204,103.834428,2025-03-25 07:55:49,2025-03-25 07:55:49
...,...,...,...,...,...,...,...
1096,918a15ec-2333-4ed3-a4b7-39a66cc92ffa,359130,30 JALAN LATEH SINGAPORE 359130,1.345296,103.869197,2025-03-25 07:55:49,2025-03-25 07:55:49
1097,797a0d10-0e78-4426-856c-0823a957a090,279121,82 GROVE DRIVE HENRY PARK SINGAPORE 279121,1.311536,103.784768,2025-03-25 07:55:49,2025-03-25 07:55:49
1098,44851715-59cb-4ee3-81c3-11ced10d5fdc,545411,44 PARK VILLAS GREEN PARK VILLAS SINGAPORE 545411,1.368339,103.879974,2025-03-25 07:55:49,2025-03-25 07:55:49
1099,7b9479b8-4384-48d1-b6d6-8e2d1ee2545b,598309,25 KING ALBERT PARK KING ALBERT PARK SINGAPORE...,1.333392,103.779092,2025-03-25 07:55:49,2025-03-25 07:55:49


## Vaccines


In [8]:
# Query the Vaccines table
query = "SELECT * FROM Vaccines"
# Execute the query and load results into a DataFrame
vaccines_df = pd.read_sql(query, conn)
# Display the results
display(vaccines_df)

Unnamed: 0,id,name,price,doses_required,age_criteria,gender_criteria,created_at,updated_at
0,8c99160b-aed4-46fe-b6c8-f25aacfc6e0d,Influenza (INF),9,1,18+ years old,,2025-03-25 07:55:49,2025-03-25 07:55:49
1,3149b2d6-ccd9-4107-b656-4209ed8eca1c,Pneumococcal Conjugate (PCV13),16,1,65+ years old,,2025-03-25 07:55:49,2025-03-25 07:55:49
2,100f8392-e99a-4036-a45b-8914f0e522d8,Human Papillomavirus (HPV),23,3,18-26 years old,F,2025-03-25 07:55:49,2025-03-25 07:55:49
3,0ab9b4ed-e58d-4019-84e0-7e0a6c5df7ad,"Tetanus, Diphtheria, Pertussis (Tdap)",10,1,18+ years old,F,2025-03-25 07:55:49,2025-03-25 07:55:49
4,3c3cdfbc-e67e-4ba1-b831-a1ffaab56302,Hepatitis B (HepB),9,3,18+ years old,,2025-03-25 07:55:49,2025-03-25 07:55:49
5,3460d1d7-fe6e-4f55-a00e-a35e817770bb,"Measles, Mumps, Rubella (MMR)",9,2,18+ years old,,2025-03-25 07:55:49,2025-03-25 07:55:49
6,03aba890-a9bc-4a31-98a3-7cb2ea6ca5e3,Varicella (VAR),11,2,18+ years old,,2025-03-25 07:55:49,2025-03-25 07:55:49


## BookingSlots


In [9]:
# Query the Vaccines table
query = "SELECT * FROM BookingSlots"
# Execute the query and load results into a DataFrame
bookingslot_df = pd.read_sql(query, conn)
# Display the results
display(bookingslot_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,datetime,created_at,updated_at
0,c3bcbe9a-0e6d-4d5e-9f11-21677e3661b4,d04e2d6c-9ee8-4871-b79e-35d8e587cda9,100f8392-e99a-4036-a45b-8914f0e522d8,2025-03-03 08:00:00,2025-03-25 07:55:49,2025-03-25 07:55:49
1,5a949a2d-74bc-4a8d-966e-06299aea086e,4de9e253-08ac-445f-95a6-3d086068c49c,0ab9b4ed-e58d-4019-84e0-7e0a6c5df7ad,2025-03-03 08:00:00,2025-03-25 07:55:49,2025-03-25 07:55:49
2,443d8791-9d16-4666-873c-cfeffd378602,015991c6-8565-4f5b-a162-abb083bf6d58,3149b2d6-ccd9-4107-b656-4209ed8eca1c,2025-03-03 08:00:00,2025-03-25 07:55:49,2025-03-25 07:55:49
3,3f7f75c0-b28c-4bb7-8c9a-991e5d150bc3,2017474b-c611-42a9-be8c-3fa14457f8b6,8c99160b-aed4-46fe-b6c8-f25aacfc6e0d,2025-03-03 08:00:00,2025-03-25 07:55:49,2025-03-25 07:55:49
4,1c62eeac-2c5d-4600-8a2e-eed78d88356c,859fe92f-9c4b-4f3d-beda-b6c96e67887f,8c99160b-aed4-46fe-b6c8-f25aacfc6e0d,2025-03-03 08:00:00,2025-03-25 07:55:49,2025-03-25 07:55:49
...,...,...,...,...,...,...
11175,3b004cb5-0c8a-4483-9420-7bda8cdf49b9,f53f8540-5d7b-4dde-b806-b638ae0615f4,8c99160b-aed4-46fe-b6c8-f25aacfc6e0d,2025-04-30 17:30:00,2025-03-25 07:55:49,2025-03-25 07:55:49
11176,793bd4bb-cc1b-4fd0-bb30-721ebf0faa0d,84b1aa83-156c-4eca-a9bf-ddc72424de1a,3c3cdfbc-e67e-4ba1-b831-a1ffaab56302,2025-04-30 17:30:00,2025-03-25 07:55:49,2025-03-25 07:55:49
11177,c9d71d4d-3355-44f9-a0ae-27f5f77588ec,76da5477-df98-49d2-99d5-ba6d3900c3b7,3149b2d6-ccd9-4107-b656-4209ed8eca1c,2025-04-30 17:30:00,2025-03-25 07:55:49,2025-03-25 07:55:49
11178,9b801873-2ced-49a2-a203-d8191af48eae,14398891-c7a3-4425-9d6d-63a5c8cc4ac8,3149b2d6-ccd9-4107-b656-4209ed8eca1c,2025-04-30 17:30:00,2025-03-25 07:55:49,2025-03-25 07:55:49


In [10]:
bookingslot_df["datetime"].value_counts()

datetime
2025-03-03 08:00:00    13
2025-04-10 10:30:00    13
2025-04-10 11:30:00    13
2025-04-10 12:00:00    13
2025-04-10 12:30:00    13
                       ..
2025-03-21 12:30:00    13
2025-03-21 13:00:00    13
2025-03-21 13:30:00    13
2025-03-21 14:00:00    13
2025-04-30 17:30:00    13
Name: count, Length: 860, dtype: int64

## Vaccine Records


In [11]:
# Query the Vaccines table
query = "SELECT * FROM VaccineRecords"
# Execute the query and load results into a DataFrame
vaccinerecord_df = pd.read_sql(query, conn)
# Display the results
display(vaccinerecord_df)

Unnamed: 0,id,user_id,booking_slot_id,status,created_at,updated_at


## Close SQLite Database Connection


In [12]:
# Close the connection
conn.close()