### Data Quality & Validation

- Import Libraries

In [1]:
import pandas as pd
from pathlib import Path

- Define Data Paths

In [2]:
# All locations are resolved relative to the project root, which is taken to be
# the parent of the current working directory.
PROJECT_ROOT = Path("..")

# Input folder holding the raw exports, and output folder for the merged file.
RAW_DATA = PROJECT_ROOT.joinpath("data", "raw")
PROCESSED_DATA = PROJECT_ROOT.joinpath("data", "processed")

- Register Source Files

In [3]:
# Maps each source period key to the CSV export for that period. Every export
# follows the same naming pattern, so the file name is derived from the key.
files = {
    period: "appointments_" + period + ".csv"
    for period in ("2023_2", "2024_1", "2024_2", "2025_1", "2025_2")
}

- Load, Standardize, Add Metadata

In [4]:
# Header under which the raw exports ship the column that this notebook calls
# "appointment_status".
# NOTE(review): the header text reads like a list of appointment types, while
# the column holds status values (e.g. "Cancelled") - confirm against the raw
# files.
RAW_STATUS_HEADER = (
    "GP Consultation Nurse Visit Chronic Care Review Vaccination Follow-up"
)

# One standardized frame per source period, in the order listed in `files`.
dfs = []

for period, filename in files.items():
    df = pd.read_csv(RAW_DATA / filename)

    # Tag every row with the export it came from.
    df["source_period"] = period

    # Rename the long raw header. DataFrame.rename silently ignores keys that
    # are not present, so check explicitly: otherwise a file with a different
    # header would pass through unnoticed and only surface as NaNs after the
    # merge.
    if RAW_STATUS_HEADER in df.columns:
        df = df.rename(columns={RAW_STATUS_HEADER: "appointment_status"})
    elif "appointment_status" not in df.columns:
        raise KeyError(
            f"{filename}: neither the raw status header nor "
            "'appointment_status' is present"
        )

    # The global id below is built from appointment_id; fail with the file
    # name in the message instead of a bare KeyError.
    if "appointment_id" not in df.columns:
        raise KeyError(f"{filename}: required column 'appointment_id' is missing")

    # Global id of the form "<period>_<appointment_id>", intended to stay
    # unique once all periods are concatenated.
    df["appointment_uid"] = period + "_" + df["appointment_id"].astype(str)

    dfs.append(df)

- Merge All Data

In [5]:
# Stack the per-period frames row-wise; ignore_index=True replaces the
# per-file row labels with one continuous 0..n-1 index.
appointments_all = pd.concat(objs=dfs, axis=0, ignore_index=True)

- Validation

In [6]:
# appointment_uid is the "<period>_<appointment_id>" key built while loading.
# Enforce its uniqueness with an assertion so that a Restart & Run All stops
# here instead of going on to save a file with duplicate keys; the value is
# still shown as the cell output.
uid_is_unique = appointments_all["appointment_uid"].is_unique
assert uid_is_unique, "appointment_uid must be unique across all source periods"
uid_is_unique

True

In [7]:
# Preview the first five rows of the merged frame.
appointments_all.head(n=5)

Unnamed: 0,appointment_id,appointment_date,appointment_age_group,appointment_type,appointment_status,wait_time_days,consultation_duration_minutes,staff_role,clinic_location,outcome_category,source_period,appointment_uid
0,1,7/18/2023,50-64,GP Consultation,Cancelled,41.0,59.95,Practice Nurse,Community Centre,Further Tests Required,2023_2,2023_2_1
1,2,12/24/2023,18-34,Chronic Care Review,No-Show,9.0,36.13,Locum GP,Main Practice,Further Tests Required,2023_2,2023_2_2
2,3,12/9/2023,18-34,GP Consultation,Rescheduled,,58.21,Locum GP,Main Practice,Medication Prescribed,2023_2,2023_2_3
3,4,12/24/2023,0-17,GP Consultation,Rescheduled,43.0,56.68,Practice Nurse,Main Practice,Further Tests Required,2023_2,2023_2_4
4,5,9/25/2023,65+,Follow-up,Completed,29.0,19.45,Healthcare Assistant,Branch Clinic,Further Tests Required,2023_2,2023_2_5


In [8]:
# (rows, columns) of the merged frame.
appointments_all.shape

(5000, 12)

In [9]:
# Number of missing values in each column of the merged frame.
appointments_all.isna().sum()

appointment_id                     0
appointment_date                   0
appointment_age_group              0
appointment_type                   0
appointment_status                 0
wait_time_days                   397
consultation_duration_minutes    301
staff_role                         0
clinic_location                    0
outcome_category                 705
source_period                      0
appointment_uid                    0
dtype: int64

- Save Output

In [10]:
# Create the output folder first: to_csv does not create missing parent
# directories, so the save would fail on a checkout where data/processed
# does not exist yet.
PROCESSED_DATA.mkdir(parents=True, exist_ok=True)

# index=False keeps the positional row index (reset during the merge) out of
# the written file.
appointments_all.to_csv(
    PROCESSED_DATA / "appointments_all.csv",
    index=False
)