# Bradford 0-19 Children and Young Peoples' Outcomes Framework: Data Overview and Linkage

# Data linkage

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sqlalchemy import create_engine
from IPython.display import display
from rich import print

from utils import (
    show_person_coverage, 
    display_asq_dist
)

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Name of the SQL table that receives the linked dataset when it is saved.
tbl_name = 'person_linked_2016plus'

# Load data

In [None]:
print("First, we need to load each table from the database: ")

# Note: The connection string is specific to the Connected Bradford VDE.
# Replace the placeholder below with the internal server URL.
conn_str = "DATABASE_URL_PLACEHOLDER"

# Create SQLAlchemy engine
engine = create_engine(conn_str)

# Short alias -> source table name in the [FDM] schema. The aliases are the
# keys used to look tables up in `dfs` throughout the rest of the notebook.
table_alias = {
    'base': 'cb_PersonDigest1998GPRecords',
    'person': 'person',
    'lsoa': 'personLSOA',
    'hv': 'tbl_GPData_HomeVisits',
    'srcode_asq': 'tbl_SRCODE_ASQ_ELIM',
    'eyfsp': 'cb_EYFSP_2003_to_2019',
    'addr': 'cb_Person_Address_History'
}

# Loaded tables keyed by alias. A table that cannot be read is reported and
# skipped, so its alias will simply be absent from this dict.
dfs = {}

for alias, table in table_alias.items():
    # Keep the try body down to the database read so that only genuine read
    # failures are reported as "Error reading".
    try:
        df = pd.read_sql(f"SELECT * FROM [FDM].[{table}]", engine)
    except Exception as e:
        print(f" - Error reading {table}: {e}")
        continue

    dfs[alias] = df

    # Report the person count only when the column exists, so a table without
    # person_id is still shown as loaded rather than as a read error.
    n_persons = df['person_id'].nunique() if 'person_id' in df.columns else 'n/a'
    print(f" - Loaded {table} → dfs['{alias}'] (rows={df.shape[0]}, persons={n_persons})")

In [None]:
# Baseline cohort size: number of distinct person_id values in the base
# population table. Later steps subtract from it to report how many children
# have been excluded so far.
base_person_ids = dfs['base']['person_id']
num_children_base = base_person_ids.nunique()

# Link datasets

## Linking base population table with person demographics 

In [None]:
# Running step counter used in the progress banners of the linkage cells.
step = 1

intro_msg = (
    "\nNow we'll merge the main tables on 'person_id', combining demographic, LSOA, "
    "health visiting (HV), ASQ, and EYFSP data into one dataset ready for analysis.\n"
)
print(intro_msg)

print(f"[bold cyan]Step {step}: Linking base population table with person demographics (birth date, gender, ethnicity)...[/bold cyan]")

# Demographic columns taken from the person table; the two *_source_value
# columns are renamed to the *_raw names used downstream.
person_cols = [
    'person_id',
    'birth_datetime',
    'ethnicity_source_value',
    'gender_source_value',
    'death_datetime',
]
person_demographics = dfs['person'][person_cols].rename(
    columns={
        "gender_source_value": "gender_raw",
        "ethnicity_source_value": "ethnicity_raw",
    }
)
# De-duplicate so identical person rows do not multiply base rows in the join.
person_demographics = person_demographics.drop_duplicates()

# Left join: every row of the base population is kept.
df = dfs['base'].merge(person_demographics, on='person_id', how='left')

n_rows = len(df)
n_persons = df['person_id'].nunique()
print(f"Done. Current rows: {n_rows:,} | Unique person_id: {n_persons:,} \n")

## Merge LSOA and Ward

In [None]:
step = step + 1

step_banner = f"[bold cyan]Step {step}: Linking LSOA and Ward information (geographic / area-level data)...[/bold cyan]"
print(step_banner)

# Left join on person_id: all columns of the LSOA table are attached and every
# row already in the linked dataset is kept.
df = df.merge(dfs['lsoa'], on='person_id', how='left')

n_rows = len(df)
n_persons = df['person_id'].nunique()
print(f"Done. Current rows: {n_rows:,} | Unique person_id: {n_persons:,} \n")

## Merge ASQ data

In [None]:
step += 1

print(f"[bold cyan]Step {step}: Linking ASQ-3 24|27|30 months records (developmental assessment data)...[/bold cyan]")

# Filter ASQ-3 only
# First check the distribution of ASQ/ELIM records (ASQ-3, ASQ-SE, Others) in
# the source table; the return value is not needed here.
_ = display_asq_dist(
    dfs['srcode_asq'],
    text_col='CTV3Text',
    code_col='CTV3Code',
    title="Distribution of ASQ records (ASQ-3, ASQ-SE, Others) in Source Table"
)

asq_text = dfs['srcode_asq']['CTV3Text']

# Keep rows whose description starts with an ASQ-3 label (case-insensitive)...
is_asq3 = asq_text.str.lower().str.startswith(("asq-3", "asq third"), na=False)
# ...and mentions 24, 27 or 30 as a whole word. The group is non-capturing so
# pandas does not raise its "pattern has match groups" warning; the rows
# matched are the same as with a capturing group.
mentions_target_age = asq_text.str.contains(r'\b(?:24|27|30)\b', na=False)

asq3 = dfs['srcode_asq'].loc[is_asq3 & mentions_target_age]

print(f"   → Found {asq3['person_id'].nunique():,} unique children with ASQ-3 24|27|30 months records in source table. \n")

# Left join keeps children without ASQ-3 records (their ASQ_* columns are
# missing); a child with several distinct ASQ-3 rows gets one output row each.
df = pd.merge(
    df,
    asq3[['person_id', 'CTV3Code', 'CTV3Text', 'DateEvent', 'NumericValue']]
    .rename(
        columns={
            "DateEvent": "ASQ_DateEvent",
            "CTV3Code": "ASQ_CTV3Code",
            "CTV3Text": "ASQ_CTV3Text",
            "NumericValue": "ASQ_value",
        }
    )
    .drop_duplicates(),
    how='left',
    on='person_id'
)

# NOTE(review): unlike the call above, the helper's return value replaces `df`
# here. The code below then reads ASQ_DateEvent and person_id from it, so the
# helper presumably returns the input frame (possibly with extra columns) -
# confirm against utils.display_asq_dist.
df = display_asq_dist(
    df,
    text_col='ASQ_CTV3Text',
    code_col='ASQ_CTV3Code',
    title="Distribution of ASQ records (ASQ-3, ASQ-SE, Others) in Linked Dataset"
)

asq_count = df.loc[df['ASQ_DateEvent'].notna(), 'person_id'].nunique()
print(f"Done. Current rows: {len(df):,} | Unique person_id: {df['person_id'].nunique():,} \n")
print(f"   → {asq_count:,} children have ASQ records linked. \n")

## Merge EYFSP

In [None]:
step += 1

print(f"[bold cyan]Step {step}: Linking EYFSP (Early Years Foundation Stage Profile) outcomes...[/bold cyan]")

# Drop these columns before joining; errors='ignore' tolerates any that are
# absent from the table. De-duplicate afterwards so rows that differed only in
# the dropped columns collapse into one.
eyfsp_outcomes = dfs['eyfsp'].drop(
    columns=['cb_date_added', 'cb_datasetref', 'Focus', 'EDRN'],
    errors='ignore'
).drop_duplicates()

# Left join: children without an EYFSP record are kept with missing outcomes.
df = pd.merge(df, eyfsp_outcomes, how='left', on='person_id')

# A child counts as having EYFSP outcomes when FSP_GLD is populated.
eyfsp_count = df.loc[df['FSP_GLD'].notna(), 'person_id'].nunique()
print(f"Done. Current rows: {len(df):,} | Unique person_id: {df['person_id'].nunique():,} \n")
print(f"   → {eyfsp_count:,} children have EYFSP outcomes available. \n")

## Only keep children born on or after 2013-09-01

In [None]:
step += 1

print(f"[bold cyan]Step {step}: Filtering the cohort - only keeping children born on or after 2013-09-01...[/bold cyan]")

n_children_before = df['person_id'].nunique()

# Convert to datetime if not already; unparseable values become NaT.
df['birth_datetime'] = pd.to_datetime(df['birth_datetime'], errors='coerce')

# Apply the lower bound filter. Rows with a missing (NaT) birth date fail the
# comparison, so they are dropped here too and are included in the
# "born before" exclusion count printed below.
birth_lower_bound = pd.Timestamp("2013-09-01")
df = df.loc[df['birth_datetime'] >= birth_lower_bound].copy()

# Count how many children remain
num_children_filter_birth = df['person_id'].nunique()
print(f"Done. Current rows: {len(df):,} | Unique person_id: {num_children_filter_birth:,} \n")
print(f"   → {num_children_filter_birth:,} unique children born on or after {birth_lower_bound.date()} retained. \n")
print(f"   → {n_children_before - num_children_filter_birth:,} children who were born before {birth_lower_bound.date()} are excluded. \n")
print(f"   → {num_children_base - num_children_filter_birth:,} children are excluded after above steps. \n")

## Remove children who are younger than 30 months or who died before 30 months (i.e., < 914 days old)

In [None]:
step += 1

# Reference date for "current age" is pinned to a fixed value; the live
# alternative is kept below for convenience.
# today = pd.Timestamp.today().normalize()
today = pd.Timestamp("2025-11-30")

print(
    f"[bold cyan]Step {step}: Removing children younger than 30 months (i.e., < 914 days old) "
    f"as of {today.date()}, or those who died before reaching 30 months...[/bold cyan]"
)

# We use the Public Health England definition of uptake of the two-year review:
# All children who reached 30 months (914 days) within the study period and had a
# two-year review coded as `XaQA6` (Health visitor child 24-30 month contact)
# completed between 691 and 914 days of age (i.e. between 23 and 30 months).

# Age in days at the reference date, plus the dates on which each child reaches
# 691 and 914 days of age.
df['age_days'] = (today - df['birth_datetime']).dt.days
df["age_2_0"] = df["birth_datetime"] + pd.DateOffset(days=691)
df["age_2_5"] = df["birth_datetime"] + pd.DateOffset(days=914)

# Children who have not yet reached 914 days (2.5 years) at the reference date.
# (The "under2" / "before_2" names are historical; the threshold is 914 days.)
under2_ids = df.loc[df['age_days'] < 914, 'person_id'].unique()
n_under2 = len(under2_ids)

# Coerce the death date to datetime before doing date arithmetic, mirroring the
# treatment of birth_datetime. The stored column is left untouched.
death_dt = pd.to_datetime(df["death_datetime"], errors="coerce")
has_death = death_dt.notna()

# Children with any recorded death date
n_died = df.loc[has_death, "person_id"].nunique()

# Children who died before reaching 914 days of age
age_at_death_days = (death_dt - df["birth_datetime"]).dt.days
died_before_2_ids = df.loc[has_death & (age_at_death_days < 914), "person_id"].unique()

n_died_before_2 = len(died_before_2_ids)

print(f"   → Children who died (any age): {n_died:,}")
print(f"   → Children who died before 2.5 years (< 914 days): {n_died_before_2:,}")
print(f"   → Children currently younger than 2.5 years: {n_under2:,}")

# A child can be in both groups, so take the union before counting removals.
remove_ids = set(under2_ids) | set(died_before_2_ids)
n_removed = len(remove_ids)

# Filter out those younger than 2.5 and those who died before 2.5
df = df.loc[~df['person_id'].isin(remove_ids)].copy()

remaining = df['person_id'].nunique()
# Computed outside the f-string: reusing the enclosing quote character inside a
# replacement field is a SyntaxError on Python versions before 3.12.
n_died_later = df.loc[df["death_datetime"].notna(), "person_id"].nunique()

print(f"Done. Current rows: {len(df):,} | Unique person_id: {remaining:,} \n")
print(
    f"   → Retained children who were alive at 30 months: {remaining:,}. "
    f"There are still {n_died_later:,} children who died afterwards."
)
print(f"   → Removed {n_removed:,} children who were either younger than 2.5 years or died before reaching 2.5 years. \n")
print(f"   → {num_children_base - remaining:,} children are excluded after above steps. \n")

In [None]:
# Variables passed to the coverage helper for the per-child coverage table.
# NOTE(review): "HV_DateEvent" is only created by the HV merge in a later cell,
# so unless the base table already carries it the column is absent here -
# confirm how show_person_coverage treats missing columns.
coverage_cols = [
    "HV_DateEvent",
    "ASQ_value",
    "birth_datetime",
    "ethnicity_raw",
    "gender_raw",
    'FSP_GLD',
    'FSP_LSOA11',
    'LSOA',
    'ward',
    'death_datetime',
]

df_stud = show_person_coverage(
    df,
    cols=coverage_cols,
    max_height=300,
    return_df=True,
    show_html=False,
)

# Last expression of the cell: the captioned Styler is the cell's output.
coverage_caption = "Coverage metrics (children with values vs. missing) across key variables - after removing children born before 2013-09-01 and those younger than 30 months"
df_stud.style.set_caption(coverage_caption)

In [None]:
# Share of children in the current dataset with at least one linked ASQ-3 row.
asq_text = df["ASQ_CTV3Text"]

# Case-insensitive match on "asq-3" / "asq 3" / "asq3" or "asq third".
asq3_pattern = r"\basq[-\s]?3\b|\basq\s*third\b"
text_matches = asq_text.astype("string").str.contains(asq3_pattern, case=False, regex=True)

# Missing descriptions are excluded explicitly via notna().
mask_asq3 = asq_text.notna() & text_matches

persons_with_asq3 = df.loc[mask_asq3, "person_id"].nunique()
total_persons = df["person_id"].nunique()
pct_with_asq3 = persons_with_asq3 / total_persons

print(f"Persons with any ASQ-3: {persons_with_asq3:,}, accounting for {pct_with_asq3:.1%} of the total persons.")

## Merge HV data

In [None]:
step += 1

print(f"[bold cyan]Step {step}: Adding 2y Home Visit (HV) data (DateEvent, CTV3Code, CTV3Text) and may only keep records for children born in bradford...[/bold cyan]")

hv = dfs['hv']

# Count how many children have any HV record, a 2y HV, a new birth HV, an
# initial HV, or no HV at all. These counts are over the whole HV source
# table, not only the children currently in `df`.
n_has_any_hv = hv.loc[hv['DateEvent'].notna(), 'person_id'].nunique()

ids_with_2y_hv = set(
    hv.loc[hv['CTV3Code'] == 'XaQA6', 'person_id']
)
ids_with_new_birth = set(
    hv.loc[hv['CTV3Code'] == 'XaX4m', 'person_id']
)

n_has_2y_hv = len(ids_with_2y_hv)
n_has_new_birth_hv = len(ids_with_new_birth)
n_has_initial_hv = hv.loc[hv['CTV3Code'] == 'XaJG5', 'person_id'].nunique()
# NOTE(review): "No HV Record" is taken as rows with a missing DateEvent -
# confirm that the HV table really encodes "no visit" this way.
n_has_no_hv = hv.loc[hv['DateEvent'].isna(), 'person_id'].nunique()
n_has_new_birth_2y = len(ids_with_2y_hv & ids_with_new_birth)

hv_summary = pd.DataFrame({
    "HV Record Type": ["Any HV Record", "2y HV (XaQA6)", "New Birth HV (XaX4m)", "Initial HV (XaJG5)", "2y HV and New Birth HV", "No HV Record"],
    "Unique Children Count": [n_has_any_hv, n_has_2y_hv, n_has_new_birth_hv, n_has_initial_hv, n_has_new_birth_2y, n_has_no_hv]
})

display(hv_summary.style.hide(axis='index').set_caption("Home Visit (HV) Records Summary"))

num_children_before = df['person_id'].nunique()

# Only the 2y review rows (XaQA6) are attached. The left join keeps every
# child, so no one is excluded by this step as written.
df = pd.merge(
    # Optional restriction, currently disabled:
    # df.loc[df['person_id'].isin(ids_with_new_birth)], # keep only children with new birth HV records
    df,
    hv
      .query("CTV3Code == 'XaQA6'")[['person_id', 'CTV3Code', 'CTV3Text', 'DateEvent']]
      .drop_duplicates()
      .rename(
          columns={
              "DateEvent": "HV_DateEvent",
              "CTV3Code": "HV_CTV3Code",
              "CTV3Text": "HV_CTV3Text",
          }
      ),
    how='left',
    on='person_id'
)

hv_count = df.loc[df['HV_DateEvent'].notna(), 'person_id'].nunique()
num_children_hv_added = df['person_id'].nunique()

print(f"Done. Current rows: {len(df):,} | Unique person_id: {num_children_hv_added:,} \n")
print(f"   → {hv_count:,} children have 2y HV data. \n")

# Rows belonging to children with more than one distinct 2y HV date.
# HV_CTV3Code cannot be used for this: only XaQA6 rows are merged above, so a
# child never has more than one distinct code and that check would always
# yield zero. Distinct visits are told apart by HV_DateEvent instead.
group_keys = ['person_id']
hv_dates_per_child = df.groupby(group_keys)['HV_DateEvent'].transform('nunique')
multi_valued = df.loc[hv_dates_per_child > 1].reset_index(drop=False)


print(f"   → There are {multi_valued.shape[0]:,} records with multiple HV records. \n")
# With the new-birth restriction disabled above, this difference is zero.
print(f"   → {num_children_before - num_children_hv_added:,} children without new birth HV are excluded. \n")
print(f"   → {num_children_base - num_children_hv_added:,} children are excluded after above steps. \n")

In [None]:
# Variables passed to the coverage helper for the per-child coverage table.
coverage_cols = [
    "HV_DateEvent",
    "ASQ_value",
    "birth_datetime",
    "ethnicity_raw",
    "gender_raw",
    'FSP_GLD',
    'FSP_LSOA11',
    'LSOA',
    'ward',
    'death_datetime',
]

df_stud = show_person_coverage(
    df,
    cols=coverage_cols,
    max_height=300,
    return_df=True,
    show_html=False,
)

# NOTE(review): the caption mentions removing children without a new birth HV,
# but the HV merge cell has that restriction commented out - confirm whether
# the caption still describes the data.
# Last expression of the cell: the captioned Styler is the cell's output.
coverage_caption = "Coverage metrics (children with values vs. missing) across key variables - after removing those without new birth HV"
df_stud.style.set_caption(coverage_caption)

In [None]:
# Recompute the ASQ-3 share on the dataset as it stands at this point.
asq_descriptions = df["ASQ_CTV3Text"]

# Case-insensitive match on "asq-3" / "asq 3" / "asq3" or "asq third".
is_asq3_text = asq_descriptions.astype("string").str.contains(
    r"\basq[-\s]?3\b|\basq\s*third\b",
    case=False,
    regex=True,
)

# Missing descriptions are excluded explicitly via notna().
mask_asq3 = asq_descriptions.notna() & is_asq3_text

total_persons = df["person_id"].nunique()
persons_with_asq3 = df.loc[mask_asq3, "person_id"].nunique()
pct_with_asq3 = persons_with_asq3 / total_persons

print(f"Persons with any ASQ-3: {persons_with_asq3:,}, accounting for {pct_with_asq3:.1%} of the total persons.")

## Add historical address

In [None]:
step += 1

print(f"[bold cyan]Step {step}: Adding historical address (latest within 2.5 years)...[/bold cyan]")

# Address history prepared for the as-of join. PD_DateEvent is coerced to
# datetime so its dtype is compatible with the datetime left key (age_2_5);
# values that cannot be parsed become NaT and are dropped along with the
# originally missing dates. merge_asof needs the key sorted on both sides.
addr_history = (
    dfs['addr'][['person_id', 'PartialPostCode', 'DateEvent']]
    .rename(
        columns={
            "DateEvent": "PD_DateEvent",
            "PartialPostCode": "PartialPD"
        }
    )
    .assign(PD_DateEvent=lambda d: pd.to_datetime(d["PD_DateEvent"], errors="coerce"))
    .dropna(subset=["PD_DateEvent"])
    .sort_values(["PD_DateEvent"])
)

# For each row, take the same person's most recent address record dated on or
# before the day the child reaches 914 days of age (direction="backward").
# Rows with no such record keep missing PartialPD / PD_DateEvent.
df = pd.merge_asof(
    df.sort_values(["age_2_5"]),
    addr_history,
    by="person_id",
    left_on="age_2_5",
    right_on="PD_DateEvent",
    direction="backward"
)

print(f"Done. Current rows: {len(df):,} | Unique person_id: {df['person_id'].nunique():,} \n")

# Flag rows whose partial postcode starts with "BD"; missing postcodes count
# as not Bradford.
df['is_bradford'] = df['PartialPD'].str.startswith("BD", na=False)
bradford_unique = df.loc[df['is_bradford'], 'person_id'].nunique()
n_bradford_xaqa6 = df.loc[
    (df["is_bradford"]) &
    (df["HV_CTV3Code"] == "XaQA6")
, 'person_id'].nunique()

print(f"Number of unique people in Bradford: {bradford_unique:,}")
print(f"Bradford people with 2y HV: {n_bradford_xaqa6:,}")

## Save/upload linked dataset

In [None]:
# Write the linked dataset to the database, replacing any existing table of the
# same name in the dbo schema. A failure is reported (first line of the error
# message only) rather than raised, so the engine is disposed either way.
to_sql_options = {
    "name": f'{tbl_name}',
    "con": engine,
    "schema": 'dbo',
    "if_exists": 'replace',
    "index": False,
}

try:
    df.to_sql(**to_sql_options)
except Exception as e:
    print("Error occurred while writing to SQL Server.")
    first_line = str(e).partition("\n")[0]
    print(first_line)

# Release the engine's connection pool now that database access is finished.
engine.dispose()

In [None]:
# Variables passed to the coverage helper for the final per-child coverage
# table, now including the partial postcode added by the address step.
coverage_cols = [
    "HV_DateEvent",
    "ASQ_value",
    "birth_datetime",
    "ethnicity_raw",
    "gender_raw",
    'FSP_GLD',
    'FSP_LSOA11',
    'LSOA',
    'ward',
    'PartialPD',
    'death_datetime',
]

df_stud = show_person_coverage(
    df,
    cols=coverage_cols,
    max_height=300,
    return_df=True,
    show_html=False,
)

# Last expression of the cell: the captioned Styler is the cell's output.
coverage_caption = "Coverage metrics (children with values vs. missing) across key variables - after linkage"
df_stud.style.set_caption(coverage_caption)