In [25]:
import re
import sqlite3

import pandas as pd
from IPython.display import display

## Connect to SQLite database


In [26]:
# Define SQLite database file path
sqlite_db_path = "../data/vaccination_db.sqlite"

# Open the database through a URI with mode=rw: an existing file is opened as
# usual, but a wrong path raises sqlite3.OperationalError instead of silently
# creating a new, empty database file (the default behaviour of
# sqlite3.connect when the file is missing).
conn = sqlite3.connect(f"file:{sqlite_db_path}?mode=rw", uri=True)

## Query Database

To query the database, define the `query`. 

Next, execute the query and load the results into a DataFrame: `pd.read_sql(query, conn)`

### Check info for all tables

In [27]:
# List every table registered in sqlite_master and report its row count.
cursor = conn.cursor()
cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

print(f"There are {len(tables)} tables in this sqlite dataset")
for table in tables:
    # Each fetched row is a 1-tuple holding the table name.
    table_name = table[0]
    print(f"\nName of tables: {table_name}")

    # Identifiers cannot be bound as SQL parameters, so quote the name as an
    # identifier (doubling any embedded double quotes). This keeps the count
    # query valid for names containing spaces or reserved words.
    quoted_name = '"' + table_name.replace('"', '""') + '"'
    cursor.execute(f"SELECT COUNT(*) FROM {quoted_name}")
    row_count = cursor.fetchone()[0]
    print(f"row counts: {row_count}")

There are 8 tables in this sqlite dataset

Name of tables: Users
row counts: 200

Name of tables: Polyclinics
row counts: 13

Name of tables: GeneralPractitioners
row counts: 1532

Name of tables: Addresses
row counts: 1286

Name of tables: Vaccines
row counts: 7

Name of tables: VaccineRecords
row counts: 500

Name of tables: VaccineStockInventory
row counts: 80

Name of tables: BookingSlots
row counts: 325


#### Users

In [28]:
# Load every row of the Users table into a DataFrame and render it.
query = "SELECT * FROM Users"
user_df = pd.read_sql(sql=query, con=conn)
display(user_df)

Unnamed: 0,nric,first_name,last_name,email,date_of_birth,gender,postal_code
0,S7829099H,Edward,Bradley,edward.bradley@gmail.com,1952-01-16 00:00:00,Male,458669
1,T0468396S,Susan,Smith,susan.smith@hotmail.com,1958-11-30 00:00:00,Female,276924
2,S5431842X,Devon,Perry,devon.perry@gmail.com,1992-06-23 00:00:00,Male,537530
3,S9580040C,Kathryn,Rogers,kathryn.rogers@gmail.com,2000-10-29 00:00:00,Female,238012
4,S9619540M,Joseph,Lloyd,joseph.lloyd@hotmail.com,1998-08-09 00:00:00,Male,425843
...,...,...,...,...,...,...,...
195,S6089571E,Veronica,Knox,veronica.knox@hotmail.com,1951-08-25 00:00:00,Female,227977
196,S8733586X,David,Pham,david.pham@gmail.com,1982-03-14 00:00:00,Male,417017
197,S6863714A,Tracy,Ward,tracy.ward@gmail.com,1958-12-10 00:00:00,Female,419460
198,S4477934F,Steven,Smith,steven.smith@gmail.com,1977-06-03 00:00:00,Male,219893


In [29]:
# ----------
# Check nric
# ----------
# Expected shape: one letter from S/T/F/G/M, seven digits, one uppercase letter.
nric_regex = r"[STFGM]\d{7}[A-Z]"
# Build one full-string match mask and use it for both the verdict and the
# offender listing, so the two can never disagree. na=False makes a missing
# nric count as a mismatch instead of raising.
nric_is_valid = user_df["nric"].str.fullmatch(nric_regex, na=False)
if nric_is_valid.all():
    print("All nrics match the expected format.")
else:
    print("Some nrics do not match the expected format.")
    print(user_df.loc[~nric_is_valid, ["nric"]])

# ----------
# Check name
# ----------
# Combine first name and last name into a full name for checking uniqueness
check_fullName = user_df["first_name"] + " " + user_df["last_name"]
# Report how many distinct full names exist (compare against the row count)
print(f"Number of unique full names: {check_fullName.nunique()}")

# ---------
# Check dob
# ---------
# Inclusive bounds of the accepted date-of-birth window.
start_date = pd.to_datetime("1920-01-01")
end_date = pd.to_datetime("2025-03-01")

# Parse the stored text timestamps so they can be compared with the bounds.
user_df["date_of_birth"] = pd.to_datetime(user_df["date_of_birth"])

# Missing dates (NaT) compare as False and are therefore reported as out of range.
is_in_range = (user_df["date_of_birth"] >= start_date) & (
    user_df["date_of_birth"] <= end_date
)

# check if every date falls inside the window
if is_in_range.all():
    print("All date are in range")
else:
    print("Some dates are not in range")
    # Show the offending rows, mirroring the nric check above.
    print(user_df.loc[~is_in_range, ["nric", "date_of_birth"]])

All nrics match the expected format.
Number of unique full names: 200
All date are in range


#### Vaccines

In [30]:
# Load every row of the Vaccines table into a DataFrame and render it.
query = "SELECT * FROM Vaccines"
vaccines_df = pd.read_sql(sql=query, con=conn)
display(vaccines_df)

Unnamed: 0,id,name,price,doses_required,age_criteria,gender_criteria
0,1,Influenza (INF),9.0,1,18+ years old,
1,2,Pneumococcal Conjugate (PCV13),16.0,1,65+ years old,
2,3,Human Papillomavirus (HPV),23.0,3,18-26 years old,Female
3,4,"Tetanus, Diphtheria, Pertussis (Tdap)",10.0,1,18+ years old,Female
4,5,Hepatitis B (HepB),9.0,3,18+ years old,
5,6,"Measles, Mumps, Rubella (MMR)",9.0,2,18+ years old,
6,7,Varicella (VAR),11.0,2,18+ years old,


#### Polyclinics

In [31]:
# Load every row of the Polyclinics table into a DataFrame and render it.
query = "SELECT * FROM Polyclinics"
polyclinic_df = pd.read_sql(sql=query, con=conn)
display(polyclinic_df)

Unnamed: 0,id,name,postal_code
0,1,Sembawang Polyclinic,756973
1,2,Hougang Polyclinic,538829
2,3,Khatib Polyclinic,769567
3,4,Bukit Batok Polyclinic,659164
4,5,Pioneer Polyclinic,648201
5,6,Clementi Polyclinic,120451
6,7,Bukit Panjang Polyclinic,677726
7,8,Geylang Polyclinic,389707
8,9,Ang Mo Kio Polyclinic,569666
9,10,Woodlands Polyclinic,738579


#### GeneralPractitioners

In [32]:
# Load every row of the GeneralPractitioners table into a DataFrame and render it.
query = "SELECT * FROM GeneralPractitioners"
gp_df = pd.read_sql(sql=query, con=conn)
display(gp_df)

Unnamed: 0,id,name,postal_code
0,1,Cavenagh Medical Clinic And Home Care,269695
1,2,Mei Ling Clinic,140158
2,3,Rcmc Rivervale Crescent Medical Centre,541182
3,4,360 Clinic,560407
4,5,SKY Medical,079027
...,...,...,...
1527,1528,Trinity Medical@Shenton,068908
1528,1529,Saudara Clinic By A+J General Physicians,419741
1529,1530,Zara Clinic,680026
1530,1531,360 Clinic,640221


#### Addresses

In [33]:
# Load every row of the Addresses table into a DataFrame and render it.
query = "SELECT * FROM Addresses"
addresses_df = pd.read_sql(sql=query, con=conn)
display(addresses_df)

Unnamed: 0,postal_code,address,latitude,longitude
0,756973,21 CANBERRA LINK BUKIT CANBERRA SINGAPORE 756973,1.44826336410158,103.82276363189
1,538829,89 HOUGANG AVENUE 4 HOUGANG POLYCLINIC SINGAPO...,1.3699068381066,103.88900146446
2,769567,690 YISHUN RING ROAD KHATIB POLYCLINIC SINGAPO...,1.41820402220914,103.834428037791
3,659164,50 BUKIT BATOK WEST AVENUE 3 BUKIT BATOK POLYC...,1.35201517590477,103.747822265461
4,648201,26 JURONG WEST STREET 61 NATIONAL UNIVERSITY P...,1.33856179501564,103.699014641414
...,...,...,...,...
1281,227977,10 WINSTEDT ROAD SINGAPORE 227977,1.31015138561086,103.841932060489
1282,417017,86 LORONG MELAYU SINGAPORE 417017,1.32407676311342,103.909899417279
1283,419460,23A JALAN GRISEK SINGAPORE 419460,1.32438798177171,103.918482044009
1284,219893,31 BIRCH ROAD SINGAPORE 219893,1.31113913811163,103.853628919553


#### BookingSlots

In [34]:
# Load every row of the BookingSlots table into a DataFrame and render it.
query = "SELECT * FROM BookingSlots"
bookingslot_df = pd.read_sql(sql=query, con=conn)
display(bookingslot_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,datetime,is_booked,user_nric
0,1,10,2,2025-03-19 14:00:00,0,
1,2,3,1,2025-03-17 09:00:00,1,S5179355A
2,3,3,6,2025-03-21 11:00:00,0,
3,4,5,4,2025-03-19 14:00:00,1,S6028914A
4,5,5,5,2025-03-17 10:00:00,0,
...,...,...,...,...,...,...
320,321,4,4,2025-03-19 09:00:00,1,S4437704I
321,322,1,6,2025-03-21 18:00:00,1,S4646744D
322,323,3,6,2025-03-17 13:00:00,1,S7829099H
323,324,11,7,2025-03-20 12:00:00,0,


In [35]:
# Count how many booking slots share each datetime value (most frequent first).
slot_datetimes = bookingslot_df["datetime"]
slot_datetimes.value_counts()

datetime
2025-03-19 10:00:00    13
2025-03-18 16:00:00    12
2025-03-17 14:00:00    11
2025-03-17 17:00:00    11
2025-03-19 17:00:00    11
2025-03-19 11:00:00    10
2025-03-18 17:00:00     9
2025-03-20 14:00:00     9
2025-03-20 17:00:00     9
2025-03-21 18:00:00     8
2025-03-20 10:00:00     8
2025-03-17 13:00:00     8
2025-03-17 12:00:00     8
2025-03-20 12:00:00     8
2025-03-20 11:00:00     7
2025-03-19 14:00:00     7
2025-03-17 09:00:00     7
2025-03-17 15:00:00     7
2025-03-21 16:00:00     7
2025-03-18 11:00:00     7
2025-03-20 15:00:00     7
2025-03-19 09:00:00     7
2025-03-20 09:00:00     7
2025-03-21 10:00:00     7
2025-03-18 14:00:00     7
2025-03-18 10:00:00     6
2025-03-19 16:00:00     6
2025-03-20 16:00:00     6
2025-03-17 18:00:00     6
2025-03-19 18:00:00     6
2025-03-21 11:00:00     6
2025-03-17 10:00:00     5
2025-03-21 13:00:00     5
2025-03-19 12:00:00     5
2025-03-20 13:00:00     5
2025-03-21 17:00:00     5
2025-03-18 15:00:00     5
2025-03-17 11:00:00     5
202

#### VaccineRecords

In [36]:
# Load every row of the VaccineRecords table into a DataFrame and render it.
query = "SELECT * FROM VaccineRecords"
vaccinerecord_df = pd.read_sql(sql=query, con=conn)
display(vaccinerecord_df)

Unnamed: 0,id,user_nric,vaccine_id,polyclinic_id,vaccination_date
0,1,S4578160W,4,9,2025-01-21
1,2,S6069624D,4,13,2025-03-13
2,3,S6431084J,5,7,2025-03-08
3,4,S6318250E,3,2,2025-01-12
4,5,S7473943Q,2,4,2025-01-30
...,...,...,...,...,...
495,496,S8833406B,5,12,2025-02-07
496,497,S5179355A,3,6,2025-02-27
497,498,S5379985U,1,7,2025-01-25
498,499,S8686374E,6,7,2025-01-11


In [37]:
# Show the vaccination dates in ascending order to eyeball the covered range.
vaccination_dates = vaccinerecord_df["vaccination_date"]
vaccination_dates.sort_values()

156    2025-01-01
432    2025-01-01
464    2025-01-01
333    2025-01-01
486    2025-01-01
          ...    
63     2025-03-20
338    2025-03-20
294    2025-03-20
395    2025-03-20
337    2025-03-20
Name: vaccination_date, Length: 500, dtype: object

#### VaccineStockInventory

In [38]:
# Load every row of the VaccineStockInventory table into a DataFrame and render it.
query = "SELECT * FROM VaccineStockInventory"
vaccinestock_df = pd.read_sql(sql=query, con=conn)
display(vaccinestock_df)

Unnamed: 0,id,polyclinic_id,vaccine_id,stock_quantity
0,1,10,5,119
1,2,10,4,107
2,3,10,6,69
3,4,10,2,51
4,5,10,3,151
...,...,...,...,...
75,76,2,5,105
76,77,3,6,167
77,78,3,3,166
78,79,3,2,152


## Close SQLite Database Connection


In [39]:
# Close the SQLite connection opened at the top of the notebook; `conn` can no
# longer be used for queries after this cell runs.
conn.close()