# Bradford 0-19 Children and Young People's Outcomes Framework: Indicator Generation for Regression Analysis

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from datetime import datetime
from sqlalchemy import create_engine
import re
from rich import print
from tabulate import tabulate
from IPython.core.display import HTML
from IPython.display import IFrame
import warnings

import plotly.io as pio
pio.renderers.default = "notebook_connected"

from utils import (
    make_crosstab,
    report_value_counts
)

import warnings
warnings.filterwarnings('ignore')

# Load data

In [None]:
#| echo: false

# The connection string is specific to the Connected Bradford VDE;
# swap the placeholder below for the internal server URL before running.
conn_str = "DATABASE_URL_PLACEHOLDER"
tbl_name = "person_linked_2016plus"

# SQLAlchemy engine used for the read below (and for the write-back at the end)
engine = create_engine(conn_str)

# Read the whole cleaned, linked person table into memory
source_query = f"SELECT * FROM [dbo].[{tbl_name}_cleaned]"
df = pd.read_sql(source_query, engine)

print(df.shape)

## Set up global parameters

In [None]:
# Integer codes used for the binary (yes/no) indicators derived below
NO, YES = 0, 1

# Untouched snapshot of the linked data, taken before any derived columns are added
df_raw = df.copy()

# Research Questions

1. Two-Year Mandated Health Visitor Review

| Ref | Question | Outcome(s) | Exposure(s) | Notes |
|-----|----------|------------|-------------|-------|
| 1.1 | Number and proportion of children aged ≥30 months who have / have not received a mandated review between ages 2–2.5 years | Receipt of 2-year review (Yes/No) | — | Descriptive |
| 1.2 | Are socio-demographic variables associated with receipt of a review? | Receipt of 2-year review (Yes/No) | Sex; Ethnicity; IMD | Association |


2. ASQ-3 Completion at Two-Year Review

| Ref | Question | Outcome(s) | Exposure(s) | Notes |
|-----|----------|------------|-------------|-------|
| 2.1 | Among children with a 2-year review, number and proportion with ASQ-3 completed | ASQ-3 completion (Yes/No) | — | Descriptive |
| 2.2 | Are socio-demographic variables associated with ASQ-3 completion? | ASQ-3 completion (Yes/No) | Sex; Ethnicity; IMD | Association |


3. Good Level of Development (GLD) on ASQ-3

| Ref | Question | Outcome(s) | Exposure(s) | Notes |
|-----|----------|------------|-------------|-------|
| 3.1 | Among children with ASQ-3, number and proportion in each category:<br> a) GLD achieved<br> b) Monitoring<br> c) Not GLD | Overall ASQ-3 + Domain-specific category | — | Descriptive |
| 3.2 | Are socio-demographic variables associated with GLD (overall or by domain)? | GLD status (Yes/No); Domain status | Sex; Ethnicity; IMD | Association |


4. Good Level of Development (GLD) on EYFSP

| Ref | Question | Outcome(s) | Exposure(s) | Notes |
|-----|----------|------------|-------------|-------|
| 4.1 | Among children aged 5, number and proportion who reach / do not reach GLD | EYFSP GLD (Yes/No) | — | Descriptive |
| 4.2 | Are socio-demographic variables associated with EYFSP GLD (overall or by domain)? | EYFSP GLD; Domain scores | Age (months); Sex; Ethnicity; IMD | Association |
| 4.3 | Does non-receipt of 2-year review predict poor EYFSP GLD? Does this vary by socio-demographic factors? | EYFSP GLD (Yes/No) | 2-year review (Yes/No); Sex; Ethnicity; IMD | Prediction |

5. Predictive Validity: ASQ-3 (Age 2) → EYFSP (Age 5)

| Ref | Question | Outcome(s) | Exposure(s) | Notes |
|-----|----------|------------|-------------|-------|
| 5.1 | Is overall ASQ-3 score associated with overall EYFSP score? | EYFSP overall score | ASQ-3 overall score | Association |
| 5.2 | Do ASQ-3 domain scores predict EYFSP domain outcomes? | EYFSP domain score | ASQ-3 domain score | Mapping below |
| 5.3 | Does the ASQ-3–EYFSP association vary by socio-demographic variables? | EYFSP GLD / score | ASQ-3 score; Sex; Ethnicity; IMD | Interaction |

**Domain Mapping**

| ASQ-3 Domain | EYFSP Domain |
|-------------|--------------|
| Fine Motor / Gross Motor | Physical Development |
| Communication / Language | Communication |
| Personal & Social | Personal, Social & Emotional |
| Problem Solving | Problem Solving |

---

# RQ1

## Classify 2y home visit

To estimate how many children received the mandated **2-year review (SRCode: `XaQA6`)** versus those who did not, we applied the following procedure:

1. **Define age cutoffs**  
   - We use the [Public Health England definition](https://assets.publishing.service.gov.uk/media/5bf53350e5274a2b0b42681b/Health_visitor_service_delivery_metrics_and_outcomes_definitions.pdf) of uptake of the two-year review: All children who reached 30 months (914 days) within the study period and had a two-year review coded as `XaQA6` (`Health visitor child 24-30 month contact`) completed between 691 and 914 days of age (i.e. between 23 and 30 months).

2. **Classify 2y home visit review events**  
   The two-year review (SRCode: `XaQA6`) is expected to take place when children are aged 23 to 30 months. In practice, however, some reviews are recorded outside this recommended window.

   We classify each review as follows:
   
   - **Valid**: review date falls between 23-30 months (inclusive).  
   - **Too early**: review recorded before 23 months.  
   
   A 'Too late' category is not required, as we only consider the most recent review completed before 30 months and exclude any reviews conducted after this age.

3. **Group by child**  
   - For each child (`person_id`), we checked whether they had **one valid 2y home visit entry**.  
   - If yes → labelled as **Yes**.  
   - If no → labelled as **No**.

In [None]:
# Note: each child should contribute a single 2y home-visit record
# (the latest one before 30 months).

# Bounds of the valid review window, in days of age
age_23m, age_30m = 691, 914

# Child's age in days at the HV event and at the ASQ event (if available)
for _event in ("HV", "ASQ"):
    df[f"age_days_at_{_event}"] = (df[f"{_event}_DateEvent"] - df["birth_datetime"]).dt.days

hv_age = df["age_days_at_HV"]

# Row-level masks: a dated XaQA6 review, and where its age falls relative to the window
has_hv = df["HV_DateEvent"].notna() & df["HV_CTV3Code"].eq("XaQA6")
age_ok = hv_age.between(age_23m, age_30m)
age_early = hv_age.lt(age_23m)
age_late = hv_age.gt(age_30m)

# HV_Status: label -> condition; rows matching no condition get the default label
hv_status_rules = {
    "Has valid 23-30m visit": has_hv & age_ok,
    "Too early (<23m)": has_hv & age_early,
    "Too late (>30m)": has_hv & age_late,
}
df["HV_Status"] = np.select(
    list(hv_status_rules.values()),
    list(hv_status_rules.keys()),
    default="Eligible but no review",
)

# Valid_2y_HV (YES / NO): YES only for a dated XaQA6 review inside the window
df["Valid_2y_HV"] = np.where(hv_status_rules["Has valid 23-30m visit"], YES, NO)

# Verification: the two derived columns should agree with each other
df_person = df.loc[:, ["person_id", "HV_Status", "Valid_2y_HV"]].drop_duplicates()

_ = make_crosstab(
    df_person,
    row_var="HV_Status",
    col_var="Valid_2y_HV",
    caption_prefix="HV Status x Valid 2y HV",
)

# RQ2

## Classify ASQ-3

In [None]:
# df['age_months_at_ASQ'] = (df["ASQ_DateEvent"] - df["birth_datetime"]).dt.days / 30.4375

# Mask: ASQ exists and text contains 24/27/30 month.
# The alternation uses a non-capturing group: str.contains only tests for a match,
# and pandas emits a "has match groups" UserWarning for capturing groups.
asq_mask = (
    df["ASQ_DateEvent"].notna() &
    df["ASQ_CTV3Text"].str.contains(
        r"(?:24\s*month|27\s*month|30\s*month)",
        flags=re.IGNORECASE, regex=True, na=False
    )
)

# Age-based conditions (same 23-30 month window as the HV review; these names
# deliberately overwrite the HV masks defined in the previous cell)
age_ok = df["age_days_at_ASQ"].between(age_23m, age_30m)
age_early = df["age_days_at_ASQ"] < age_23m
age_late = df["age_days_at_ASQ"] > age_30m

# Valid 2y ASQ (YES/NO): a dated 24/27/30-month ASQ record inside the window
df["Valid_2y_ASQ"] = np.where(asq_mask & age_ok, YES, NO)

# ASQ Status: rows without a matching ASQ record (or without a computable age) stay None
df["ASQ_Status"] = np.select(
    [
        asq_mask & age_ok,
        asq_mask & age_early,
        asq_mask & age_late
    ],
    [
        "Has valid 23-30m visit",
        "Too early (<23m)",
        "Too late (>30m)"
    ],
    default=None
)

# Treat placeholder strings as missing so they are not counted as a version
df["ASQ_Version"] = df["ASQ_Version"].replace(["Missing", "missing", "None", "none", ""], pd.NA)

# Number of distinct non-missing ASQ versions per child, regardless of domain.
# The built-in "nunique" ignores missing values, matching dropna().nunique(),
# without calling a Python lambda once per child.
df["ASQ_Version_Count"] = df.groupby("person_id")["ASQ_Version"].transform("nunique")
df["Multi_ASQ_Version"] = df["ASQ_Version_Count"] > 1

In [None]:
# Verification
def _join_unique(values):
    """Comma-join the sorted distinct non-null values of a Series, as strings."""
    return ", ".join(sorted(values.dropna().astype(str).unique()))


asq_check_cols = ["person_id", "ASQ_Version_Count", "Valid_2y_ASQ", "ASQ_Status", "ASQ_Version"]
df_person = df.loc[:, asq_check_cols].drop_duplicates()

_ = make_crosstab(
    df_person,
    row_var="ASQ_Status",
    col_var=["ASQ_Version", "Valid_2y_ASQ"],
    caption_prefix="ASQ_Status x ASQ_Version x Valid_2y_ASQ",
)

# Collapse each child's Valid_2y_ASQ flags into one string (e.g. "0, 1")
df_person["Valid_2y_ASQ_All"] = (
    df_person.groupby("person_id")["Valid_2y_ASQ"].transform(_join_unique)
)
df_person = df_person.loc[:, ["person_id", "ASQ_Version_Count", "Valid_2y_ASQ_All"]].drop_duplicates()

_ = make_crosstab(
    df_person,
    row_var="ASQ_Version_Count",
    col_var="Valid_2y_ASQ_All",
    caption_prefix="ASQ_Version_Count x Valid 2y ASQ",
)

# RQ3

## Derive domain-level ASQ classification

Each domain score is categorised into one of three levels based on age-specific cut-offs:

| Category          | Interpretation                                                  |
|-------------------|-----------------------------------------------------------------|
| **Below Cut-Off** | Requires further assessment or professional intervention        |
| **Monitor**       | Development should be monitored                                 |
| **Above Cut-Off** | Development is on schedule                                      |

In [None]:
# Assign the ASQ domain from keywords found in ASQ_CTV3Text (case-insensitive)
text = df["ASQ_CTV3Text"].str.lower()

# keyword searched for in the lower-cased text -> domain label.
# Order matters: np.select keeps the first matching keyword.
domain_keywords = {
    "communication": "Communication",
    "gross motor": "Gross Motor",
    "fine motor": "Fine Motor",
    "problem solving": "Problem Solving",
    "personal-social": "Personal-Social",
}

df["ASQ_Domain"] = np.select(
    [text.str.contains(keyword, na=False) for keyword in domain_keywords],
    list(domain_keywords.values()),
    default="Unknown",
)

# Rows matching no keyword are stored as missing rather than the literal "Unknown"
df["ASQ_Domain"] = df["ASQ_Domain"].replace("Unknown", np.nan)


def _joined_versions(versions):
    """Comma-join the sorted distinct non-null ASQ versions of one group."""
    return ", ".join(sorted(versions.dropna().astype(str).unique()))


# All ASQ versions recorded for each child within each domain
df["ASQ_Version_All"] = (
    df.groupby(["person_id", "ASQ_Domain"])["ASQ_Version"].transform(_joined_versions)
)

report_value_counts(
    df,
    ["ASQ_CTV3Text", "ASQ_Domain"],
    mode='style',
    caption="ASQ_CTV3Text and corresponding mapped ASQ_Domain",
    max_height=300,
)

In [None]:
# Verification: which ASQ version combinations occur within each domain
# (one row per child x domain x version combination)

df_person = df[["person_id", "ASQ_Domain", "ASQ_Version_All"]].drop_duplicates()

# The caption names the row variable actually tabulated (ASQ_Version_All),
# in line with the "<row> x <col>" captions used by the other crosstabs.
_ = make_crosstab(
    df_person,
    row_var="ASQ_Version_All",
    col_var="ASQ_Domain",
    caption_prefix="ASQ_Version_All x ASQ_Domain",
    pct=True,
    show_total=False
)

In [None]:
# Cut-off thresholds: ASQ version -> domain -> {"below", "monitor"}.
# A score <= "below" is classed 'Below Cut-Off'; a score <= "monitor" is 'Monitor'.
asq_cutoffs_dict = {
    "ASQ-3 24m": {
        "Communication":      {"below": 25, "monitor": 38},
        "Gross Motor":        {"below": 38, "monitor": 45},
        "Fine Motor":         {"below": 35, "monitor": 43},
        "Problem Solving":    {"below": 29, "monitor": 39},
        "Personal-Social":    {"below": 31, "monitor": 40},
    },
    "ASQ-3 27m": {
        "Communication":      {"below": 24, "monitor": 36},
        "Gross Motor":        {"below": 28, "monitor": 38},
        "Fine Motor":         {"below": 18, "monitor": 30},
        "Problem Solving":    {"below": 27, "monitor": 39},
        "Personal-Social":    {"below": 25, "monitor": 35},
    },
    "ASQ-3 30m": {
        "Communication":      {"below": 33, "monitor": 44},
        "Gross Motor":        {"below": 36, "monitor": 44},
        "Fine Motor":         {"below": 19, "monitor": 34},
        "Problem Solving":    {"below": 27, "monitor": 39},
        "Personal-Social":    {"below": 32, "monitor": 40},
    }
}

# Flatten the nested dict into one row per (version, domain)
asq_cutoffs = (
    pd.DataFrame([
        {"ASQ_Version": ver, "ASQ_Domain": dom, **vals}
        for ver, domains in asq_cutoffs_dict.items()
        for dom, vals in domains.items()
    ]).rename(
        columns={
            "below": "ASQ_Cutoff_Below",
            "monitor": "ASQ_Cutoff_Monitor"
        }
    )
)

display(asq_cutoffs.style.hide(axis='index').set_caption("ASQ Cutoffs"))

# Attach the applicable thresholds to every row; rows whose version/domain
# pair is not in the table get NaN thresholds.
df = df.merge(
    asq_cutoffs,
    on=['ASQ_Version', 'ASQ_Domain'],
    how='left'
)

# df['ASQ_value'] = df['ASQ_value'].astype(float)
df['ASQ_value'] = pd.to_numeric(df['ASQ_value'], errors='coerce')

# Sanity checks on the thresholds and on the raw scores
if (df['ASQ_Cutoff_Monitor'] < df['ASQ_Cutoff_Below']).any():
    raise ValueError("Invalid cutoff: monitor < below.")

if (df[['ASQ_Cutoff_Monitor','ASQ_Cutoff_Below']] > 61).any().any():
    raise ValueError("Thresholds cannot exceed 61.")

invalid_value_mask = ~(df['ASQ_value'].between(0, 61)) & df['ASQ_value'].notna()
if invalid_value_mask.any():
    bad_vals = df.loc[invalid_value_mask, 'ASQ_value'].unique()
    raise ValueError(f"Invalid ASQ_value outside 0-61: {bad_vals}")

value = df['ASQ_value']
below = df['ASQ_Cutoff_Below']
monitor = df['ASQ_Cutoff_Monitor']

# A score can only be classified when both thresholds are known. Without this
# guard, comparisons against NaN thresholds are False and a scored row with
# no matching cut-off would fall through to `value <= 60` and be labelled
# 'Above Cut-Off'.
no_cutoff = below.isna() | monitor.isna()

# Conditions are evaluated in order; the first match wins.
df['ASQ_Category'] = np.select(
    [
        value.isna() | no_cutoff,
        value <= below,
        value <= monitor,
        value <= 60,
        value == 61
    ],
    [
        None,
        'Below Cut-Off',
        'Monitor',
        'Above Cut-Off',
        'Invalid Score'
    ],
    default='Invalid Score'
)

assert (df['ASQ_Category'] == 'Invalid Score').sum() == 0, "Some ASQ records have 'Invalid Score' category assigned!"

In [None]:
# Verification: distribution of ASQ categories within each domain

category_cols = ["person_id", "ASQ_Category", "ASQ_Domain"]
df_person = df.loc[:, category_cols].drop_duplicates()

_ = make_crosstab(
    df_person,
    row_var="ASQ_Category",
    col_var="ASQ_Domain",
    caption_prefix="ASQ_Category x ASQ_Domain",
    show_total=False,
)

## Derive binary overall ASQ indicators and continuous ASQ composite score

Two binary indicators are derived to summarise each child’s overall developmental status, following two conventions:

**(1) PHE Convention (Public Health England definition)**  
- **At Risk (0):** If *any* of the five domains are classified as **Below Cut-Off**.  
- **Not at Risk (1):** If *all* domains are either **Monitor** or **Above Cut-Off**.

**(2) FGLD Convention (Full Good Level of Development)**  
- **Good Level of Development (1):** If *all* five domains are **Above Cut-Off** (“No Risk”).  
- **Not GLD (0):** If *any* domain is classified as **Monitor** or **Below Cut-Off**.

<br>
A continuous composite score is also calculated:

$\text{ASQ Composite Score} = \sum_{d=1}^{5} \text{DomainBinary}_d$

Each domain contributes:  
- 1 = Above Cut-Off (No Risk)  
- 0 = Monitor or Below Cut-Off  

This produces a total score ranging from **0 to 5**, where higher values indicate **fewer developmental concerns**.

In [None]:
# Domain-level binary indicators (1 = not at risk, 0 = at risk), stored as
# nullable integers so rows without a category stay <NA>.
#   FGLD: only 'Above Cut-Off' counts as not at risk.
#   PHE:  'Above Cut-Off' and 'Monitor' both count as not at risk.
_domain_binary_maps = {
    'ASQ_DomainBinary_FGLD': {'Above Cut-Off': 1, 'Monitor': 0, 'Below Cut-Off': 0},
    'ASQ_DomainBinary_PHE': {'Above Cut-Off': 1, 'Monitor': 1, 'Below Cut-Off': 0},
}
for _binary_col, _category_map in _domain_binary_maps.items():
    df[_binary_col] = df['ASQ_Category'].map(_category_map).astype("Int64")

# Valid 2y ASQ rows, de-duplicated on (child, CTV3 code, score); used by both
# the duplicate-domain check and the per-child summary below.
asq_group_keys = ['person_id', 'ASQ_Version']
valid_asq = (
    df[df['Valid_2y_ASQ'] == YES]
    .drop_duplicates(subset=['person_id', 'ASQ_CTV3Code', 'ASQ_value'])
)

# A (child, version) group must not contain the same domain twice
dup_groups = (
    valid_asq
    .groupby(asq_group_keys)["ASQ_Domain"]
    .apply(lambda domains: domains.size != domains.nunique())
)

violations = dup_groups[dup_groups].index

assert not dup_groups.any(), f"Duplicated ASQ_Domain found in groups: {list(violations)}"


def _all_five_ok(n_ok, n_domains):
    """Return 1 where all five domains are OK, 0 where fewer are, NaN unless exactly five domains exist."""
    return np.where(n_domains == 5, np.where(n_ok < 5, 0, 1), np.nan)


asq_summary = (
    valid_asq
    .groupby(asq_group_keys)
    .agg(
        n_domains=('ASQ_Domain', 'nunique'),
        n_fgld_ok=('ASQ_DomainBinary_FGLD', 'sum'),  # domains passing under FGLD
        n_phe_ok=('ASQ_DomainBinary_PHE', 'sum'),    # domains passing under PHE
        n_asq_entries=('ASQ_Domain', 'size'),
    )
    .assign(
        # Composite = number of 'Above Cut-Off' domains, only when all five domains exist
        ASQ_Composite=lambda g: np.where(g['n_domains'] == 5, g['n_fgld_ok'], np.nan),
        # PHE: 1 = Not at risk, 0 = At risk
        ASQ_PHE_Risk=lambda g: _all_five_ok(g['n_phe_ok'], g['n_domains']),
        # FGLD: 1 = GLD, 0 = Not GLD
        ASQ_FGLD=lambda g: _all_five_ok(g['n_fgld_ok'], g['n_domains']),
    )
    .reset_index()
    .rename(
        columns={
            'n_domains': 'ASQ_n_domains',
            'n_fgld_ok': 'ASQ_n_fgld_ok',
            'n_phe_ok': 'ASQ_n_phe_ok',
        }
    )
)

assert (asq_summary['n_asq_entries'] != asq_summary['ASQ_n_domains']).sum() == 0, 'Found duplicated ASQ domains'
assert asq_summary['ASQ_n_fgld_ok'].between(0, 5).all(), "Found ASQ_Composite values outside [0, 5] range"

# Bring the per-(child, version) summary back onto every row of df
df = df.merge(asq_summary, on=asq_group_keys, how='left')

# Nullable integers keep the missing summaries as <NA> instead of float NaN
int_cols = ['ASQ_Composite', 'ASQ_PHE_Risk', 'ASQ_FGLD', 'ASQ_n_fgld_ok', 'ASQ_n_phe_ok', 'ASQ_n_domains']
df[int_cols] = df[int_cols].astype("Int64")

In [None]:
# Verification: number of passing domains against number of domains recorded

fgld_domain_cols = ["person_id", "ASQ_n_fgld_ok", "ASQ_n_domains"]
df_person = df.loc[:, fgld_domain_cols].drop_duplicates()

_ = make_crosstab(
    df_person,
    row_var="ASQ_n_fgld_ok",
    col_var="ASQ_n_domains",
    caption_prefix="ASQ_n_fgld_ok x ASQ_n_domains",
)

In [None]:
# Verification: number of passing domains against the FGLD flag

fgld_flag_cols = ["person_id", "ASQ_n_fgld_ok", "ASQ_FGLD"]
df_person = df.loc[:, fgld_flag_cols].drop_duplicates()

_ = make_crosstab(
    df_person,
    row_var="ASQ_n_fgld_ok",
    col_var="ASQ_FGLD",
    caption_prefix="ASQ_n_fgld_ok x ASQ_FGLD",
)

# RQ 4

## Calculate binary GLD indicator

**Variable selection and data harmonisation**

For the EYFSP dataset, multiple versions of variable naming conventions were present across extracts.  

In particular, the *Personal, Social and Emotional Development (PSE)* domain included both:

- assessment sub-items (e.g., `FSP_PSE_AS1`, `FSP_PSE_AS2`), and  
- an aggregated total score (`FSP_PSE_TOTAL`),  

in addition to the three formal Early Learning Goals (ELGs):  
`FSP_PSE_G06` (*Self-confidence and Self-awareness*),  
`FSP_PSE_G07` (*Managing Feelings and Behaviour*), and  
`FSP_PSE_G08` (*Making Relationships*).

To ensure consistency with the national EYFSP framework, **only the official ELG variables (those named `FSP_<domain>_G<nn>`, e.g. `FSP_PSE_G06`) were retained** for analysis across all domains.  

Variables corresponding to sub-assessments (`_AS`) or domain totals (`_TOTAL`) were excluded from the computation of both the total EYFSP score and the Good Level of Development (GLD) indicator.

<br>

The EYFSP source table includes a variable `FSP_GLD`, but this field assigns a value of 0 to some children who have *no* valid EYFSP assessment recorded (e.g., all ELGs marked as “A”, or all ELGs missing). According to DfE guidance, a child should be assessed against each of the 17 Early Learning Goals (ELGs), and the code “A” is used when a judgement cannot be made (e.g., exemption).

Because a code of “A” indicates that **no judgement was made**, rather than that the child did not achieve the expected level, we do not treat these cases as “Not GLD”. Instead, when all ELG scores for a child are missing or recorded as “A”, we classify the GLD outcome as *missing* (i.e., insufficient information to determine GLD), and we derive a new GLD variable (`FSP_GLD_derived`) based solely on valid ELG assessments.

In [None]:
# EYFSP areas of learning: short code used in column names -> display name
elg_domains = {
    'PSE': 'Personal, Social and Emotional',
    'COM': 'Communication and Language',
    'PHY': 'Physical Development',
    'LIT': 'Literacy',
    'MAT': 'Mathematics (Problem Solving, Reasoning and Numeracy)',
    'UTW': 'Understanding the World',
    'EXP': 'Expressive Arts and Design',
}

# ELG columns per domain: every df column whose name starts with FSP_<code>_G
domain_cols = {}
for _code in elg_domains:
    _prefix = f'FSP_{_code}_G'
    domain_cols[_code] = [name for name in df.columns if name.startswith(_prefix)]

# The five domains whose ELGs decide GLD, and their ELG columns in that order
core_for_gld = ['COM', 'PHY', 'PSE', 'LIT', 'MAT']
core_cols = []
for _code in core_for_gld:
    core_cols.extend(domain_cols[_code])

def highlight_core(row):
    """Row-wise Styler callback that shades the core GLD domains.

    Returns one CSS string per cell of ``row``: a background colour when
    ``row['Domain Code']`` is listed in ``core_for_gld``, otherwise an empty
    string for every cell.
    """
    css = 'background-color: #ffe599' if row['Domain Code'] in core_for_gld else ''
    return [css] * len(row)

# One summary row per EYFSP domain: code, name and the ELG columns found for it
domain_rows = [
    {
        "Domain Code": code,
        "Domain Name": elg_domains[code],
        "Number of ELG Columns": len(cols),
        "ELG Columns": ", ".join(cols),
    }
    for code, cols in domain_cols.items()
]
domain_table = pd.DataFrame(domain_rows).sort_values("Domain Code").reset_index(drop=True)

styled_domain_table = (
    domain_table
    .style
    .hide(axis="index")
    .apply(highlight_core, axis=1)
    .set_caption("EYFSP Domains and Corresponding ELG Columns (core domains have been highlighted)")
)
display(styled_domain_table)

# The notebook expects exactly 17 ELG columns across all domains
total_elgs = domain_table["Number of ELG Columns"].sum()
print(f"Total ELG columns identified: {total_elgs}")

assert total_elgs == 17, f"Expected 17 ELG columns but found {total_elgs} ELG columns."

# Calculate GLD
# GLD achieved if all ELGs in five core domains (COM, PHY, PSE, LIT, MAT) ≥ 2.

# Non-numeric entries (e.g. 'A') are coerced to NaN before scoring
core_scores = df[core_cols].apply(pd.to_numeric, errors='coerce')

df['FSP_GLD_derived'] = np.where(
    core_scores.notna().all(axis=1),           # every core ELG must have a valid score
    core_scores.ge(2).all(axis=1).astype(int), # 1 if all core ELGs are expected/exceeding (>= 2), else 0
    np.nan                                     # any core ELG missing or non-numeric → NaN
)

# Compare the derived indicator with the source FSP_GLD field and show the rows
# that disagree, ignoring rows where every core ELG is missing or 'A'.
diff = df['FSP_GLD_derived'].compare(df['FSP_GLD'])
df_compare = df.loc[diff.index, core_cols + ['FSP_GLD_derived', 'FSP_GLD']]
if not df_compare.empty:
    df_compare = df_compare.replace('A', np.nan).dropna(subset=core_cols, how='all', axis=0).drop_duplicates()
    if not df_compare.empty:
        display(df_compare)
        # A mismatch is reported as a warning rather than raised as an error.
        # The notebook installs a global warnings.filterwarnings('ignore'), which
        # would swallow this message, so the filter is lifted locally for it.
        with warnings.catch_warnings():
            warnings.simplefilter("always")
            warnings.warn(
                f"Mismatch between derived and source GLD after excluding missing ELGs.\n"
                f"{df_compare.shape[0]} mismatched rows above."
            )

In [None]:
# Distinct ELG profiles per child, flagged when any core ELG cell is non-null.
# Note: non-null includes non-numeric codes such as 'A'.
df_eyfsp = df[['person_id'] + core_cols].drop_duplicates().copy()
df_eyfsp['FSP_Present'] = df_eyfsp[core_cols].notna().any(axis=1)

# Collapse to exactly one flag per child before merging. Merging df_eyfsp
# directly would multiply the rows of df for any child with more than one
# distinct ELG profile.
fsp_present = (
    df_eyfsp
    .groupby('person_id', as_index=False, dropna=False)['FSP_Present']
    .any()
)

df = pd.merge(
    df,
    fsp_present,
    on='person_id',
    how='left',
    validate='many_to_one'  # fail loudly if the lookup is ever not unique per child
)

In [None]:
# Verification: derived GLD against the source GLD field

gld_check_cols = ["person_id", "FSP_GLD_derived", "FSP_GLD", "FSP_Present"]
df_person = df.loc[:, gld_check_cols].drop_duplicates()

_ = make_crosstab(
    df_person,
    row_var="FSP_GLD_derived",
    col_var="FSP_GLD",
    caption_prefix="FSP_GLD_derived x FSP_GLD",
)

In [None]:
# Store both GLD indicators as nullable integers, then repeat the crosstab
int_cols = ['FSP_GLD_derived', 'FSP_GLD']
df[int_cols] = df[int_cols].astype("Int64")

gld_check_cols = ["person_id", "FSP_GLD_derived", "FSP_GLD", "FSP_Present"]
df_person = df.loc[:, gld_check_cols].drop_duplicates()

_ = make_crosstab(
    df_person,
    row_var="FSP_GLD_derived",
    col_var="FSP_GLD",
    caption_prefix="FSP_GLD_derived x FSP_GLD",
)

In [None]:
# Verification: EYFSP presence against both GLD indicators
# (uses the person-level frame built in the previous cell)
fsp_present_crosstab = {
    "row_var": "FSP_Present",
    "col_var": ["FSP_GLD_derived", "FSP_GLD"],
    "caption_prefix": "FSP_Present x FSP_GLD_derived x FSP_GLD",
}
_ = make_crosstab(df_person, **fsp_present_crosstab)

## Calculate domain-level binary outcome

We derived a binary attainment indicator for each of the five core EYFSP domains - **Communication and Language (COM), Physical Development (PHY), Personal, Social and Emotional Development (PSE), Literacy (LIT), and Mathematics (MAT).** Each domain consists of two or three Early Learning Goals (ELGs), which are scored as *Emerging (1)*, *Expected (2)*, or *Exceeding (3)*, with *A* indicating that the ELG was not assessed.

Domain-level attainment was defined as follows:

- **1 (Achieved expected level)**  
  Assigned when *all* ELGs within that domain had a valid score **≥2** (Expected or Exceeding).  

- **0 (Did not achieve expected level)**  
  Assigned when *any* ELG within the domain had a score **<2** (Emerging).  

- **Missing (NA)**  
  Assigned when any ELG in the domain was missing or recorded as **A**, because a valid score is required for every ELG in the domain before a judgement is derived.

This operationalisation aligns with the principle used for determining overall GLD, where meeting the expected level across all relevant ELGs is required.

In [None]:
def _all_elgs_expected(elg_scores):
    """Return 1/0/NaN per row for a block of ELG columns.

    1 when every ELG is numeric and >= 2, 0 when every ELG is numeric but at
    least one is < 2, NaN when any ELG is missing or non-numeric.
    """
    numeric = elg_scores.apply(pd.to_numeric, errors='coerce')
    has_all_scores = numeric.notna().all(axis=1)
    all_expected = numeric.ge(2).all(axis=1).astype(int)
    return np.where(has_all_scores, all_expected, np.nan)


# One binary attainment column per core domain
fsp_cols = []
for d in core_for_gld:
    binary_col = f"FSP_{d}_Binary"
    df[binary_col] = _all_elgs_expected(df[domain_cols[d]])
    fsp_cols.append(binary_col)

# Nullable integers keep the undetermined domains as <NA>
df[fsp_cols] = df[fsp_cols].astype("Int64")

In [None]:
# Person-level frame with every column coerced to numeric (non-numeric → NaN)
person_level = df[['person_id'] + fsp_cols + core_cols].drop_duplicates()
df_person = person_level.apply(pd.to_numeric, errors='coerce')

# For each core domain: distribution of the summed ELG scores
# (min_count=1 leaves the sum as NaN when every ELG in the domain is NaN)
domain_stats = {
    d: df_person[domain_cols[d]].sum(axis=1, min_count=1).value_counts(dropna=False)
    for d in core_for_gld
}

domain_stats_df = pd.DataFrame(domain_stats).astype("Int64")
domain_stats_df.index = domain_stats_df.index.astype('Int64')

# domain_stats_df["Total"] = domain_stats_df.sum(axis=1).astype("Int64")
# total_row = domain_stats_df.sum(axis=0).astype("Int64")
# domain_stats_df.loc["Total"] = total_row

display(
    domain_stats_df.style.set_caption('EYFSP domain total score distribution')
)

In [None]:
# Value counts (including missing) of each domain-level binary indicator
binary_counts = {
    binary_col: df_person[binary_col].value_counts(dropna=False)
    for binary_col in fsp_cols
}
domain_binary_stats = pd.DataFrame(binary_counts).astype("Int64")

display(
    domain_binary_stats.style.set_caption('EYFSP Binary Score x EYFSP Core Domain')
)

## Calculate continuous total EYFSP score

Any pupils with at least one `A` (Absent) in their results will not be included.

In [None]:
# Every ELG column across all seven domains, in domain order
all_elg_cols = [col for cols in domain_cols.values() for col in cols]

# Each child must have exactly one distinct set of ELG values
assert df[['person_id'] + all_elg_cols].drop_duplicates().shape[0] == df.person_id.nunique()

# Numeric view of the ELGs (non-numeric codes such as 'A' → NaN), used for both columns
elg_numeric = df[all_elg_cols].apply(pd.to_numeric, errors='coerce')

# skipna=False: a single missing ELG makes the total missing
df['FSP_TotalScore'] = elg_numeric.sum(axis=1, skipna=False)
df['n_ELGs_completed'] = elg_numeric.notna().sum(axis=1)

n_done = df['n_ELGs_completed']
n_complete = df.loc[n_done == 17, 'person_id'].nunique()
n_partial = df.loc[(n_done > 0) & (n_done < 17), 'person_id'].nunique()
n_none = df.loc[n_done == 0, 'person_id'].nunique()

print(f"{n_complete} children had a complete EYFSP record (17/17 ELGs).")
print(f"{n_partial} children had partial EYFSP records (>0 and <17 ELGs).")
print(f"{n_none} children had no EYFSP assessments (0 ELGs).")

# df.loc[df['n_ELGs_completed'] < 17, 'FSP_TotalScore'] = np.nan

assert df['FSP_TotalScore'].dropna().between(17, 51).all(), "Found out-of-range EYFSP total scores"

# Person-level summary of the derived GLD indicator (first row kept per child)
gld_per_child = df.drop_duplicates(subset='person_id')['FSP_GLD_derived']
summary = (
    gld_per_child
    .value_counts(dropna=False)
    .rename_axis('FSP_GLD_Status')
    .reset_index(name='Number_of_Children')
)

summary['FSP_GLD_Status'] = summary['FSP_GLD_Status'].astype('Int64')

child_total = summary['Number_of_Children'].sum()
summary['Proportion (%)'] = (summary['Number_of_Children'] / child_total * 100).round(2)

display(summary.style.hide(axis='index').set_caption('Summary of EYFSP Good Level of Development (GLD)'))

## Deriving age in months when children took EYFSP

Because exact EYFSP assessment dates were not available in the source dataset, we needed to derive a proxy date. The Early Years Foundation Stage Profile (EYFSP) must be completed during the **final term of the academic year in which a child reaches age 5**, and submitted **no later than 30 June**. Based on this statutory requirement, we assigned an approximate assessment date of **15 June** within the relevant academic year.

We then calculated each child’s **completed months of age at assessment** using this proxy date and their recorded date of birth. 

Age in completed months was derived as: $\displaystyle \text{Age (months)} = \left\lfloor \frac{(\text{ProxyDate} - \text{DOB})_{\text{days}}}{30.4375} \right\rfloor$.

<br>

> “The early years foundation stage profile is collected annually and **must be completed for all children in the final term of the reception year in which the child reaches age five (no later than 30 June in that term).**”  
> — [*EYFSP Technical Specification 2025*, Department for Education](https://assets.publishing.service.gov.uk/media/677e429399c93b7286a3978c/EYFSP_2025_specification_v1.0.pdf)

> “The EYFS profile must be completed for each child in the **final term of the academic year in which they reach age 5**.”  
> — [*Early Years Foundation Stage Profile Handbook*, Department for Education](https://www.gov.uk/government/publications/early-years-foundation-stage-profile-handbook/early-years-foundation-stage-profile-handbook)

In [None]:
def eyfsp_date_from_acadyr(ay):
    """Return the proxy EYFSP assessment date (15 June) for an academic year.

    Parameters
    ----------
    ay : str or missing
        Academic year written as ``"<start>/<end>"``, e.g. ``"2018/2019"``.
        An abbreviated end year (``"2018/19"``) is also accepted.

    Returns
    -------
    pandas.Timestamp or pandas.NA
        15 June of the calendar year in which the academic year ends, or
        ``pd.NA`` when ``ay`` is missing (``None``, NaN, ``pd.NA``) or blank.

    Raises
    ------
    IndexError
        If ``ay`` contains no ``"/"`` separator.
    ValueError
        If the year part is not an integer.
    """
    # pd.isna covers None, float NaN and pd.NA. A plain truthiness test lets
    # NaN through (it is truthy) and raises TypeError on pd.NA.
    if pd.isna(ay):
        return pd.NA

    text = str(ay).strip()
    if not text:
        return pd.NA

    parts = text.split("/")
    end_part = parts[1].strip()

    if len(end_part) < 4:
        # Abbreviated end year such as "19": the academic year ends in the
        # calendar year after it starts.
        eyfsp_year = int(parts[0]) + 1
    else:
        # EYFSP happens in the summer term of the second calendar year
        eyfsp_year = int(end_part)

    return pd.to_datetime(f"{eyfsp_year}-06-15")

# Proxy assessment date per row, derived from the academic year
proxy_dates = df['FSP_ACADYR'].apply(eyfsp_date_from_acadyr)
df['EYFSP_ProxyDate'] = proxy_dates

# Make sure both date columns are datetimes (unparseable values become NaT)
df['birth_datetime'] = pd.to_datetime(df['birth_datetime'], errors='coerce')
df['EYFSP_ProxyDate'] = pd.to_datetime(df['EYFSP_ProxyDate'], errors='coerce')

# Completed months of age at the proxy date, using the mean month length in days
DAYS_PER_MONTH = 30.4375
age_in_days = (df['EYFSP_ProxyDate'] - df['birth_datetime']).dt.days
df['age_fsp_months'] = (age_in_days / DAYS_PER_MONTH).floordiv(1).astype('Int64')

In [None]:
# Verification: age in months at the proxy EYFSP date against derived GLD

age_gld_cols = ["person_id", "FSP_GLD_derived", "age_fsp_months"]
df_person = df.loc[:, age_gld_cols].drop_duplicates()

_ = make_crosstab(
    df_person,
    row_var="age_fsp_months",
    col_var="FSP_GLD_derived",
    caption_prefix="age_fsp_months x FSP_GLD_derived",
)

In [None]:
# multi_fsp = df[['person_id', 'age_fsp_months', 'EYFSP_ProxyDate', 'FSP_ACADYR']].drop_duplicates()
# multi_fsp[multi_fsp.duplicated(subset="person_id", keep=False)]

In [None]:
# | echo: false

# `use_mock_set` is not assigned anywhere in this notebook, so referencing it
# directly raises NameError unless it was defined beforehand. Default to False
# (the real, database-backed data) when it has not been set.
use_mock_set = globals().get("use_mock_set", False)

if use_mock_set:
    # NOTE(review): a single hard-coded row label is dropped for the mock set only;
    # the reason is not visible in this notebook — confirm it is still required.
    df.drop(index=1139261, inplace=True)
    df.to_parquet(f'../Data/{tbl_name}_derived.parquet')
else:
    df.to_parquet(f'../{tbl_name}_derived.parquet')

    print(df.shape)

In [None]:
# | echo: false

# Best-effort write-back of the derived table to SQL Server: an existing table
# of the same name is replaced, and a failure is reported but not re-raised.
try:
    df.to_sql(
        f'{tbl_name}_derived',
        engine,
        schema='dbo',
        if_exists='replace',
        index=False,
    )
except Exception as e:
    print("Error occurred while writing to SQL Server.")
    # Only the first line of the error text is shown
    print(str(e).partition("\n")[0])

# Release the engine's connections whether or not the write succeeded
engine.dispose()