In [14]:
import re
import sqlite3

import pandas as pd
from IPython.display import display

## Connect to SQLite database


In [15]:
# Location of the SQLite database file (a relative path, so it is resolved
# against the kernel's current working directory)
sqlite_db_path = "../data/vaccination_db.sqlite"

# Open the connection that every query cell below shares
conn = sqlite3.connect(database=sqlite_db_path)

## Query Database

To query the database, define the `query`. 

Next, execute the query and load the results into a DataFrame: `pd.read_sql_query(query, conn)`

### Check info for all tables

In [16]:
# List every table registered in the database's schema catalogue.
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

print(f"There are {len(tables)} tables in this sqlite dataset")
for table in tables:
    # Each fetched row is a 1-tuple holding the table name.
    table_name = table[0]
    print(f"\nName of tables: {table_name}")

    # Count the rows of the table. Table names cannot be bound as SQL
    # parameters, so the name is embedded as a double-quoted identifier (with
    # any embedded double quote doubled). Unquoted, a name containing spaces or
    # matching an SQL keyword would break the statement.
    quoted_name = '"' + table_name.replace('"', '""') + '"'
    cursor.execute(f"SELECT COUNT(*) FROM {quoted_name}")
    row_count = cursor.fetchone()[0]
    print(f"row counts: {row_count}")

There are 8 tables in this sqlite dataset

Name of tables: Users
row counts: 200

Name of tables: Polyclinics
row counts: 26

Name of tables: GeneralPractitioners
row counts: 1532

Name of tables: Addresses
row counts: 1296

Name of tables: Vaccines
row counts: 7

Name of tables: VaccineRecords
row counts: 500

Name of tables: VaccineStockInventory
row counts: 81

Name of tables: BookingSlots
row counts: 2600


#### Users

In [17]:
# Load every row of the Users table into a DataFrame and render it
query = "SELECT * FROM Users"
user_df = pd.read_sql_query(query, conn)
display(user_df)

Unnamed: 0,id,nric,first_name,last_name,email,date_of_birth,gender,postal_code
0,1,S6215649N,Benjamin,Maldonado,benjamin.maldonado@example.com,1982-09-07 00:00:00,M,597245
1,2,S6333399F,Marcia,Miller,marcia.miller@example.com,1999-06-01 00:00:00,F,589603
2,3,S7099781L,Robin,Stephens,robin.stephens@example.net,1954-06-16 00:00:00,F,576167
3,4,S6267533V,Erica,Morris,erica.morris@example.net,1967-01-23 00:00:00,F,575835
4,5,S5958219J,David,Allen,david.allen@example.net,1963-10-16 00:00:00,M,520144
...,...,...,...,...,...,...,...,...
195,196,S8828915Y,Stephanie,Morris,stephanie.morris@example.com,1973-06-15 00:00:00,F,728787
196,197,S9895163P,Joseph,James,joseph.james@example.org,1965-02-03 00:00:00,M,218601
197,198,S6942818Q,John,Carlson,john.carlson@example.net,1987-05-02 00:00:00,M,519936
198,199,S5457762V,Steve,Torres,steve.torres@example.com,1961-06-20 00:00:00,M,118926


In [18]:
# ----------
# Check nric
# ----------
# Expected shape: one prefix letter out of S/T/F/G/M, seven digits, and one
# trailing uppercase letter.
nric_regex = r"[STFGM]\d{7}[A-Z]"
# Build ONE full-string-match mask and use it for both the verdict and the
# report of offending rows. `Series.str.match` only anchors at the start of
# the string, so a value with trailing characters would fail the check yet be
# missing from the printed rows. Missing values are treated as invalid
# (na=False) instead of raising.
nric_is_valid = user_df["nric"].str.fullmatch(nric_regex, na=False)
if nric_is_valid.all():
    print("All nrics match the expected format.")
else:
    print("Some nrics do not match the expected format.")
    print(user_df.loc[~nric_is_valid, ["nric"]])

# ----------
# Check name
# ----------
# Combine first name and last name into a full name for checking uniqueness
check_fullName = user_df["first_name"] + " " + user_df["last_name"]
# Report the number of distinct full names; names are all unique only if this
# equals the number of rows in user_df.
print(f"Number of unique full names: {check_fullName.nunique()}")

# ---------
# Check dob
# ---------
# Accepted window for dates of birth; both bounds are inclusive.
start_date = pd.to_datetime("1920-01-01")
end_date = pd.to_datetime("2025-03-01")

# Parse the column so the comparison is done on timestamps. This overwrites
# the column on user_df; re-running the cell leaves it unchanged.
user_df["date_of_birth"] = pd.to_datetime(user_df["date_of_birth"])

is_in_range = (user_df["date_of_birth"] >= start_date) & (
    user_df["date_of_birth"] <= end_date
)

# Check whether every date of birth falls inside the window
if is_in_range.all():
    print("All date are in range")
else:
    print("Some dates are not in range")

All nrics match the expected format.
Number of unique full names: 200
All date are in range


#### Vaccines

In [19]:
# Load every row of the Vaccines table into a DataFrame and render it
query = "SELECT * FROM Vaccines"
vaccines_df = pd.read_sql_query(query, conn)
display(vaccines_df)

Unnamed: 0,id,name,price,doses_required,age_criteria,gender_criteria
0,1,Influenza (INF),9.0,1,18+ years old,
1,2,Pneumococcal Conjugate (PCV13),16.0,1,65+ years old,
2,3,Human Papillomavirus (HPV),23.0,3,18-26 years old,F
3,4,"Tetanus, Diphtheria, Pertussis (Tdap)",10.0,1,18+ years old,F
4,5,Hepatitis B (HepB),9.0,3,18+ years old,
5,6,"Measles, Mumps, Rubella (MMR)",9.0,2,18+ years old,
6,7,Varicella (VAR),11.0,2,18+ years old,


#### Polyclinics

In [20]:
# Load every row of the Polyclinics table into a DataFrame and render it
query = "SELECT * FROM Polyclinics"
polyclinic_df = pd.read_sql_query(query, conn)
display(polyclinic_df)

Unnamed: 0,id,name,postal_code
0,1,Ang Mo Kio Polyclinic,569666
1,2,Geylang Polyclinic,389707
2,3,Hougang Polyclinic,538829
3,4,Kallang Polyclinic,328263
4,5,Khatib Polyclinic,769567
5,6,Toa Payoh Polyclinic,319260
6,7,Sembawang Polyclinic,756973
7,8,Woodlands Polyclinic,738579
8,9,Yishun Polyclinic,768898
9,10,Bukit Batok Polyclinic,659164


#### GeneralPractitioners

In [21]:
# Load every row of the GeneralPractitioners table into a DataFrame and render it
query = "SELECT * FROM GeneralPractitioners"
gp_df = pd.read_sql_query(query, conn)
display(gp_df)

Unnamed: 0,id,name,postal_code
0,1,Cavenagh Medical Clinic And Home Care,269695
1,2,Mei Ling Clinic,140158
2,3,Rcmc Rivervale Crescent Medical Centre,541182
3,4,360 Clinic,560407
4,5,SKY Medical,079027
...,...,...,...
1527,1528,Trinity Medical@Shenton,068908
1528,1529,Saudara Clinic By A+J General Physicians,419741
1529,1530,Zara Clinic,680026
1530,1531,360 Clinic,640221


#### Addresses

In [22]:
# Load every row of the Addresses table into a DataFrame and render it
query = "SELECT * FROM Addresses"
addresses_df = pd.read_sql_query(query, conn)
display(addresses_df)

Unnamed: 0,id,postal_code,address,latitude,longitude
0,1,569666,21 ANG MO KIO CENTRAL 2 ANG MO KIO POLYCLINIC ...,1.3743245905856,103.845677779279
1,2,389707,21 GEYLANG EAST CENTRAL GEYLANG POLYCLINIC SIN...,1.31949365581957,103.887166041622
2,3,538829,89 HOUGANG AVENUE 4 HOUGANG POLYCLINIC SINGAPO...,1.3699068381066,103.88900146446
3,4,328263,701 SERANGOON ROAD KALLANG POLYCLINIC AND LONG...,1.31678496747374,103.858752270079
4,5,769567,690 YISHUN RING ROAD KHATIB POLYCLINIC SINGAPO...,1.41820402220914,103.834428037791
...,...,...,...,...,...
1291,1292,728787,32 SUNGEI KADUT WAY HUA KOK INDUSTRIAL BUILDIN...,1.40664878570127,103.753025253834
1292,1293,218601,158 RACE COURSE ROAD SINGAPORE 218601,1.31037742043094,103.852688922685
1293,1294,519936,511A ELIAS ROAD SINGAPORE 519936,1.3807527338979,103.947562413466
1294,1295,118926,31 PASIR PANJANG DRIVE EALINE PARK SINGAPORE 1...,1.29229219697385,103.771242630575


#### BookingSlots

In [23]:
# Load every row of the BookingSlots table into a DataFrame and render it
query = "SELECT * FROM BookingSlots"
bookingslot_df = pd.read_sql_query(query, conn)
display(bookingslot_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,datetime
0,1,13,3,2025-03-05 15:00:00
1,2,20,6,2025-03-28 09:00:00
2,3,15,1,2025-03-21 17:00:00
3,4,19,2,2025-03-12 14:00:00
4,5,25,3,2025-03-04 17:00:00
...,...,...,...,...
2595,2596,18,4,2025-03-18 17:00:00
2596,2597,20,4,2025-03-27 10:00:00
2597,2598,17,2,2025-03-14 09:00:00
2598,2599,6,5,2025-03-09 09:00:00


In [24]:
# Count how many booking slots share each distinct `datetime` value
# (value_counts lists the most frequent values first).
bookingslot_df["datetime"].value_counts()

datetime
2025-03-11 17:00:00    17
2025-03-21 13:00:00    16
2025-03-18 13:00:00    16
2025-03-05 17:00:00    16
2025-03-08 18:00:00    15
                       ..
2025-03-11 11:00:00     3
2025-03-26 11:00:00     2
2025-03-15 13:00:00     2
2025-03-26 10:00:00     1
2025-03-07 10:00:00     1
Name: count, Length: 309, dtype: int64

#### VaccineRecords

In [25]:
# Load every row of the VaccineRecords table into a DataFrame and render it
query = "SELECT * FROM VaccineRecords"
vaccinerecord_df = pd.read_sql_query(query, conn)
display(vaccinerecord_df)

Unnamed: 0,id,user_nric,booking_slot_id,status
0,1,S4770378I,1037,booked
1,2,S9778449Y,2311,booked
2,3,S6123640Z,2010,booked
3,4,S9048194A,1776,completed
4,5,S7868331B,1822,booked
...,...,...,...,...
495,496,S6681820L,1591,completed
496,497,S7632098P,773,completed
497,498,T0768527E,2596,free
498,499,S4886693Z,31,free


#### VaccineStockInventory

In [26]:
# Load every row of the VaccineStockInventory table into a DataFrame and render it
query = "SELECT * FROM VaccineStockInventory"
vaccinestock_df = pd.read_sql_query(query, conn)
display(vaccinestock_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,stock_quantity
0,1,5,6,76
1,2,5,7,153
2,3,5,3,197
3,4,8,7,122
4,5,8,3,59
...,...,...,...,...
76,77,2,6,192
77,78,2,5,147
78,79,11,3,52
79,80,11,6,180


## Close SQLite Database Connection


In [27]:
# Close the SQLite connection; queries issued through `conn` after this point
# will fail, so this cell must run last.
conn.close()