# Bradford 0-19 Children and Young People's Outcomes Framework: Descriptives

In [None]:
import os
import contextlib

# Import rpy2 while Python's sys.stderr is redirected to the null device, so
# anything written to it during the import is discarded.
with open(os.devnull, "w") as f, contextlib.redirect_stderr(f):
    import rpy2.robjects as ro

# Load the rpy2 IPython extension (the %%R cells below rely on it).
%load_ext rpy2.ipython

In [None]:
import pandas as pd
from pathlib import Path
from rich import print
from IPython.display import display
from sqlalchemy import create_engine
from importlib import reload
import plotly.io as pio
pio.renderers.default = "notebook_connected"

from utils import (
    plot_bar, 
    show_tab_model,
    make_crosstab
)

import warnings
warnings.filterwarnings('ignore')

# Load data

In [None]:
#| echo: false

# Base table name; the query below reads "[dbo].[<tbl_name>_derived]".
tbl_name = 'person_linked_2016plus'

# Note: The connection string is specific to the Connected Bradford VDE.
# Supply the internal server URL through the CB_DATABASE_URL environment
# variable (keeps it out of the notebook), or replace the placeholder below.
conn_str = os.environ.get("CB_DATABASE_URL", "DATABASE_URL_PLACEHOLDER")

# Fail early with an actionable message instead of letting SQLAlchemy reject
# the placeholder as an unparsable URL.
if conn_str == "DATABASE_URL_PLACEHOLDER":
    raise RuntimeError(
        "No database URL configured: set the CB_DATABASE_URL environment "
        "variable or replace the placeholder connection string."
    )

# Create SQLAlchemy engine
engine = create_engine(conn_str)

# tbl_name is a notebook constant (not user input), so interpolating it into
# the statement is safe here.
df = pd.read_sql(f"SELECT * FROM [dbo].[{tbl_name}_derived]", engine)

print(df.shape)

In [None]:
# Earliest and latest birth_datetime values in the loaded extract.
# (The names say "year", but the values are whatever birth_datetime holds.)
dob_bounds = df["birth_datetime"].agg(["min", "max"])
year_min, year_max = dob_bounds["min"], dob_bounds["max"]

print(f"DOB covers for the dataset is from {year_min} to {year_max}.")

# Set up

In [None]:
# Replace the raw FSP_GLD column with the derived one.
# Guarded on the presence of FSP_GLD_derived: once the rename below has run,
# "FSP_GLD" *is* the derived column, and dropping it again on a re-run of this
# cell would silently destroy it.
if "FSP_GLD_derived" in df.columns:
    df = df.drop(columns=["FSP_GLD"], errors="ignore")

# Shorter analysis names. rename() ignores keys that are absent, so this is a
# no-op when the cell is executed a second time.
df = df.rename(columns={
    'Valid_2y_HV': "Has_HV",
    "Valid_2y_ASQ": "Has_ASQ",
    "ASQ_DomainBinary_FGLD": "ASQ_FGLD_dom",
    "FSP_GLD_derived": "FSP_GLD",
    "ASQ_PHE_Risk": "ASQ_GLD"
})

# Recode the boolean flag to 1/0 (values other than True/False become NaN).
# Skipped when the column is already numeric, i.e. the recode has been applied,
# so a re-run does not map the 1/0 values again.
if df['FSP_Present'].dtype.kind not in "iuf":
    df['FSP_Present'] = df['FSP_Present'].map({True: 1, False: 0})

In [None]:
%%R

# Silence all R warnings for the rest of the session.
options(warn = -1)

# Packages used by the R cells of this notebook.
pkgs <- c(
  "dplyr", "sjPlot", "tidyr", "car", "forcats", "broom",
  "interactions", "ggeffects", "ggplot2", "gtsummary", "lubridate",
  "gt", "webshot2", "Cairo", "performance", "labelled", "magrittr"
)

# Attach each package quietly (no start-up banners, no masking messages).
for (pkg in pkgs) {
  suppressPackageStartupMessages(
    library(pkg, character.only = TRUE, warn.conflicts = FALSE)
  )
}

# Plot size / resolution, output mimetype and default graphics device.
options(
  repr.plot.width  = 10,
  repr.plot.height = 8,
  repr.plot.res    = 300
)
options(jupyter.plot_mimetypes = "image/svg+xml")
options(device = function(...) CairoPNG(...))

# All tables written by the R cells go under ./Results (created on demand).
results_dir <- file.path(getwd(), "Results")
if (!dir.exists(results_dir)) {
  dir.create(results_dir, recursive = TRUE)
}

# Display labels keyed by the (post-rename) column names used in prepare_df().
labels_map <- as.list(c(
  gender       = "Gender",
  ethnicity    = "Ethnicity",
  IMD          = "IMD (quintile)",
  Has_HV       = "Has 2y HV",
  age_months   = "Age at FSP (months)",
  ASQ_GLD      = "ASQ GLD Status",
  ASQ_FGLD_dom = "ASQ Domain FGLD Status"
))

# prepare_df: type, relevel, rename and label a data frame before modelling.
#
# Arguments
#   df               data frame to prepare.
#   factor_cols      columns to convert to factors (those absent from df are skipped).
#   numeric_cols     optional columns to coerce to numeric; these are removed
#                    from factor_cols.
#   reference_levels named list: factor column -> level to use as reference.
#
# Steps, in order: gender "Unknown/Other" -> NA; numeric coercion; factor
# conversion; releveling; printing the reference level of every factor column;
# renaming ethnicity_group / IMD19_quintile / age_fsp_months to
# ethnicity / IMD / age_months; attaching variable labels from the global
# `labels_map`.
#
# Returns the prepared data frame.
prepare_df <- function(df,
                       factor_cols = c("gender", "ethnicity_group", "IMD19_quintile", "Has_HV", "Has_ASQ", "ASQ_FGLD_dom", "ASQ_GLD"), # just default setting
                       numeric_cols = NULL,
                       reference_levels = list(
                         ethnicity_group = "White British",
                         gender = "Female",
                         IMD19_quintile = "5",
                         Has_HV = "1",
                         Has_ASQ = "1",
                         ASQ_FGLD_dom = "1",
                         ASQ_GLD = "1"
                       )) {

  # Keep only the requested column names that exist in df.
  present_in_df <- function(cols) cols[cols %in% names(df)]

  # Treat the catch-all gender category as missing.
  if ("gender" %in% names(df)) {
    df$gender <- replace(df$gender, df$gender == "Unknown/Other", NA)
  }

  # Numeric coercion goes through character first; values that cannot be
  # parsed become NA without a warning.
  if (!is.null(numeric_cols)) {
    num_present <- present_in_df(numeric_cols)
    for (nm in num_present) {
      df[[nm]] <- suppressWarnings(as.numeric(as.character(df[[nm]])))
    }
    factor_cols <- setdiff(factor_cols, num_present)
  }

  fac_present <- present_in_df(factor_cols)
  for (nm in fac_present) {
    df[[nm]] <- factor(df[[nm]])
  }

  # Move the requested reference level to the front of each factor.
  for (nm in intersect(names(reference_levels), fac_present)) {
    df[[nm]] <- relevel(df[[nm]], ref = reference_levels[[nm]])
  }

  cat("Reference levels used in this logit model:\n")
  for (nm in fac_present) {
    cat(sprintf("  %s reference: %s\n", nm, levels(df[[nm]])[1]))
  }
  cat("\n")

  # new name = old name; only columns that are present get renamed.
  rename_map <- c(
    ethnicity  = "ethnicity_group",
    IMD        = "IMD19_quintile",
    age_months = "age_fsp_months"
  )
  to_rename <- rename_map[rename_map %in% names(df)]
  if (length(to_rename) > 0) {
    df <- rename(df, all_of(to_rename))
  }

  # Attach labels for the columns that exist after renaming.
  existing_labels <- labels_map[intersect(names(labels_map), names(df))]
  df <- set_variable_labels(df, .labels = existing_labels)

  df
}

## Create person table

In [None]:
# Person-level (demographic) columns.
p_cols = [
    'ethnicity_group', 'gender', 'IMD19_deciles', 'IMD19_quintile',
    'age_2_5', 'gender_raw', 'birth_datetime', 'FSP_ACADYR',
]

# Derived outcome / completion columns.
derived_cols = [
    'Has_HV', 'Has_ASQ', 'FSP_GLD', 'FSP_TotalScore', 'ASQ_FGLD', 'ASQ_GLD',
    'ASQ_Composite', 'age_fsp_months', 'FSP_Present', "ASQ_Status",
    'ASQ_Version', 'ASQ_n_domains',
]

# One row per child: among a child's distinct rows, prefer the one with
# Has_ASQ set and, within that, the highest ASQ_Version.
df_person = (
    df[['person_id'] + p_cols + derived_cols]
    .drop_duplicates()
    .sort_values(
        by=['Has_ASQ', 'ASQ_Version'],
        ascending=[False, False]   # priority: Yes + latest version
    )
    .drop_duplicates(subset='person_id', keep='first')
)

assert df_person['person_id'].is_unique, "person_id is not unique in df_person"

# Domain-level rows (ASQ domain + FSP binary indicators) for the ASQ record
# that was kept for each child.
fsp_binary_cols = [c for c in df.columns if c.startswith('FSP_') and '_Binary' in c]
domain_key_cols = ['person_id', 'ASQ_Version', 'ASQ_Domain', 'ASQ_FGLD_dom', 'Has_ASQ']
domain_rows = df[domain_key_cols + fsp_binary_cols].drop_duplicates()

df_person_domains = df_person.merge(
    domain_rows,
    on=['person_id', 'ASQ_Version', 'Has_ASQ'],
    how='left'
)

def check_conflict(df, col):
    """Report persons that carry more than one distinct non-null value of `col`.

    Parameters
    ----------
    df : DataFrame with a ``person_id`` column and the column ``col``.
    col : name of the column to check.

    Returns
    -------
    DataFrame of ``person_id`` and ``col`` for the conflicting persons, sorted
    by ``person_id``; ``None`` when no person has conflicting values (in which
    case nothing is printed).
    """
    distinct_per_person = df.groupby("person_id")[col].nunique(dropna=True)
    offenders = distinct_per_person.index[distinct_per_person > 1]

    if offenders.empty:
        return None

    print(f"⚠️ {col} has {len(offenders)} conflicting persons")
    conflict_rows = df.loc[df["person_id"].isin(offenders), ["person_id", col]]
    return conflict_rows.sort_values("person_id")

# Run the conflict check on every person-level and derived column.
# The returned frames are discarded; only the printed warnings are visible.
# NOTE(review): df_person holds exactly one row per person_id at this point
# (asserted above), so a per-person nunique can never exceed 1 and this check
# cannot fire. Confirm whether it was meant to run before de-duplication.
for col in p_cols + derived_cols:
    check_conflict(df_person, col)

<!-- <div style="
    position: fixed;
    top: 0;
    left: 0;
    width: 100%;
    background-color: red;
    color: white;
    text-align: center;
    font-size: 18px;
    font-weight: bold;
    padding: 10px;
    z-index: 1000;
">
⚠️ ALL CHARTS BELOW ARE INTERACTIVE — Use zoom, pan, and hover to explore ⚠️
</div> -->

# Overview

::: {.callout-note collapse="true"}
## Distribution of ASQ and HV completion each year
<!-- ## <span style="font-size: 1.5em;">Distribution of ASQ and HV completion each year</span> -->

In [None]:
# Calendar year of each health-visit (HV) and ASQ event.
df["HV_Year"] = df["HV_DateEvent"].dt.year
df["ASQ_Year"] = df["ASQ_DateEvent"].dt.year

# Distinct children per year with a coded HV record. GroupBy.nunique counts
# each person_id once per year, so no drop_duplicates()/apply(lambda) is needed.
hv_count = (
    df[["person_id", "HV_CTV3Code", "HV_Year"]]
      .dropna(subset=['HV_CTV3Code', 'HV_Year'])
      .groupby("HV_Year")["person_id"]
      .nunique()
      .reset_index(name="HV_Count")
      .rename(columns={"HV_Year": "Year"})
)

# Same for coded ASQ records.
asq_count = (
    df[["person_id", "ASQ_CTV3Code", "ASQ_Year"]]
      .dropna(subset=['ASQ_CTV3Code', 'ASQ_Year'])
      .groupby("ASQ_Year")["person_id"]
      .nunique()
      .reset_index(name="ASQ_Count")
      .rename(columns={"ASQ_Year": "Year"})
)

# Outer join keeps years that appear in only one of the two series.
count_df = pd.merge(hv_count, asq_count, on="Year", how="outer")
count_df["Year"] = count_df["Year"].astype(int)

# A year missing from one series yields NaN, which turns the whole column into
# floats; fill with 0 and cast back to int so bar labels read "123", not "123.0".
for count_col in ["HV_Count", "ASQ_Count"]:
    count_df[count_col] = (
        pd.to_numeric(count_df[count_col], errors="coerce").fillna(0).astype(int)
    )

# Long format: one row per (Year, Type) for the grouped bar chart.
df_long = count_df.melt(
    id_vars="Year",
    value_vars=["HV_Count", "ASQ_Count"],
    var_name="Type",
    value_name="Count"
)

plot_bar(
    df=df_long,
    x_col="Year",
    y_col="Count",
    color_col="Type",
    title="Number of Completed Home Visits vs ASQ Assessments by Year",
    x_label="Year",
    y_label="Number of assessments",
    text_col="Count",
    color_discrete_sequence=["#A9BDA8", "#4F6658"],
    barmode="group"
)

:::

::: {.callout-note collapse="true"}
## Distribution of EYFSP completion each year
<!-- ## <span style="font-size: 1.5em;">Distribution of EYFSP completion each year</span> -->

In [None]:
# FSP year = the part of FSP_ACADYR after the "/" (nullable integer);
# birth year = calendar year of birth_datetime.
df["FSP_Year"] = df["FSP_ACADYR"].str.split("/").str[1].astype("Int64")
df["birth_Year"] = df["birth_datetime"].dt.year

# Distinct children per FSP year. GroupBy.nunique counts each person_id once
# per year, so no drop_duplicates()/apply(lambda) is needed.
fsp_count = (
    df[["person_id", "FSP_Year"]]
      .dropna(subset=['FSP_Year'])
      .groupby("FSP_Year")["person_id"]
      .nunique()
      .reset_index(name="FSP_Count")
      .rename(columns={"FSP_Year": "Year"})
)

# Distinct children per birth year.
birth_count = (
    df[["person_id", "birth_Year"]]
      .dropna(subset=['birth_Year'])
      .groupby("birth_Year")["person_id"]
      .nunique()
      .reset_index(name="Birth_Count")
      .rename(columns={"birth_Year": "Year"})
)

# Outer join keeps years that appear in only one of the two series.
count_df = pd.merge(birth_count, fsp_count, on="Year", how="outer")
count_df["Year"] = count_df["Year"].astype(int)

# A year missing from one series yields NaN, which turns the whole column into
# floats; fill with 0 and cast back to int so bar labels read "123", not "123.0".
for count_col in ["Birth_Count", "FSP_Count"]:
    count_df[count_col] = (
        pd.to_numeric(count_df[count_col], errors="coerce").fillna(0).astype(int)
    )

# Long format: one row per (Year, Type) for the grouped bar chart.
df_long = count_df.melt(
    id_vars="Year",
    value_vars=["Birth_Count", "FSP_Count"],
    var_name="Type",
    value_name="Count"
)

plot_bar(
    df=df_long,
    x_col="Year",
    y_col="Count",
    color_col="Type",
    title="Number of New Births vs FSP Assessments by Year",
    x_label="Year",
    y_label="Number of person",
    text_col="Count",
    color_discrete_sequence=["#A9BDA8", "#4F6658"],
    barmode="group"
)

:::

## Summary

::: {.callout-note collapse="true"}

### Details

In [None]:
# Person-level cross-tab: 2-year health-visit flag (rows) by ASQ flag (columns).
# The return value is discarded; presumably make_crosstab renders the table
# itself (confirm in utils).
_ = make_crosstab(
    df_person,
    row_var="Has_HV",
    col_var='Has_ASQ',
    caption_prefix="HV x ASQ completion"
)

In [None]:
# Person-level cross-tab: ASQ flag (rows) by EYFSP-present flag (columns, 1/0).
_ = make_crosstab(
    df_person,
    row_var="Has_ASQ",
    col_var='FSP_Present',
    caption_prefix="ASQ x EYFSP completion"
)

In [None]:
# Person-level cross-tab: ASQ_GLD (renamed from ASQ_PHE_Risk) by FSP_GLD
# (the derived EYFSP GLD indicator).
_ = make_crosstab(
    df_person,
    row_var="ASQ_GLD",
    col_var='FSP_GLD',
    caption_prefix="ASQ x FSP GLD"
)

In [None]:
# Ethnicity (rows) against two column variables, Has_HV and Has_ASQ.
# How make_crosstab combines a list of column variables is defined in utils.
_ = make_crosstab(
    df_person,
    row_var="ethnicity_group",
    col_var=['Has_HV', 'Has_ASQ'],
    caption_prefix="HV & ASQ completion by Ethnicity"
)

In [None]:
# Gender (rows) against two column variables, Has_HV and Has_ASQ.
_ = make_crosstab(
    df_person,
    row_var="gender",
    col_var=['Has_HV', 'Has_ASQ'],
    caption_prefix="HV & ASQ completion by Sex"
)

In [None]:
# Ethnicity (rows) against two column variables, Has_ASQ and FSP_Present.
_ = make_crosstab(
    df_person,
    row_var="ethnicity_group",
    col_var=['Has_ASQ', 'FSP_Present'],
    caption_prefix="ASQ & EYFSP completion by Ethnicity"
)

In [None]:
# Gender (rows) against two column variables, Has_ASQ and FSP_Present.
_ = make_crosstab(
    df_person,
    row_var="gender",
    col_var=['Has_ASQ', 'FSP_Present'],
    caption_prefix="ASQ & EYFSP completion by Sex"
)

:::

In [None]:
%%R -i df_person -o output_tbl_html

# Output locations for the demographics table. The HTML path is returned to
# Python through the cell's -o argument.
output_path     <- file.path(results_dir, "tbl_demo_asq_eyfsp")
output_tbl_html <- paste0(output_path, ".html")
output_tbl_png  <- paste0(output_path, ".png")
output_tbl_rds  <- paste0(output_path, ".rds")

# Sociodemographic variables shown in the table (all treated as categorical).
vars_tbl1 <- c("gender", "ethnicity_group", "IMD19_quintile")

df0 <- df_person %>%
  mutate(across(all_of(vars_tbl1), factor))

# Rows with an ASQ flag, all five domains and a non-missing composite score.
keep_complete_asq <- function(data) {
  data %>%
    filter(
      Has_ASQ == 1,
      ASQ_n_domains == 5,
      !is.na(ASQ_Composite)
    )
}

# Three analysis cohorts, all restricted to rows with a person_id:
#   df_asq          - complete ASQ
#   df_eyfsp        - non-missing FSP_GLD
#   df_eyfsp_linked - both of the above
df_with_id      <- df0 %>% filter(!is.na(person_id))
df_asq          <- keep_complete_asq(df_with_id)
df_eyfsp        <- df_with_id %>% filter(!is.na(FSP_GLD))
df_eyfsp_linked <- keep_complete_asq(df_eyfsp)

tbl1_labels <- list(
  gender ~ "Sex",
  ethnicity_group ~ "Ethnicity",
  IMD19_quintile ~ "IMD 2019 Quintile"
)

tbl1_stat <- list(
  all_categorical() ~ "{n} ({p}%)",
  all_continuous()  ~ "{mean} ({sd})"
)

# Summary table of `vars` for one cohort: n (%), with 0 decimals for n and 1
# for %, and an "Unknown" row when a variable has missing values.
make_tbl <- function(data, vars = vars_tbl1) {
  tbl_summary(
    data = select(data, all_of(vars)),
    statistic = tbl1_stat,
    digits = list(all_categorical() ~ c(0, 1)),
    missing = "ifany",
    missing_text = "Unknown",
    missing_stat = "{N_miss} ({p_miss}%)",
    label = tbl1_labels
  )
}

tbl_asq          <- make_tbl(df_asq)
tbl_eyfsp        <- make_tbl(df_eyfsp)
tbl_eyfsp_linked <- make_tbl(df_eyfsp_linked)

# Side-by-side merge; the em-space padding (\u2003) is part of the spanner text.
cohort_tables <- list(tbl_asq, tbl_eyfsp, tbl_eyfsp_linked)
cohort_spanners <- c(
  "\u2003\u2003\u2003**ASQ-3**\u2003\u2003\u2003",
  "\u2003\u2003\u2003**EYFSP**\u2003\u2003\u2003",
  "\u2003**EYFSP + ASQ-3**\u2003"
)

table1 <- bold_labels(
  tbl_merge(tbls = cohort_tables, tab_spanner = cohort_spanners)
)

gt_tbl <- as_gt(table1)

# Persist as HTML (displayed by the next Python cell) and as an RDS object.
gt::gtsave(gt_tbl, output_tbl_html)
saveRDS(gt_tbl, output_tbl_rds)

In [None]:
# Display the HTML table written by the preceding R cell; output_tbl_html is
# the file path exported from R via `-o`.
show_tab_model(output_tbl_html)

# Who achieves a Good Level of Development (GLD) on the ASQ-3?

<details>
<summary style="background-color:#E6F4EA; padding:8px; border:1px solid #003366; border-radius:5px; cursor:pointer; font-weight:bold; color:#003366;">
ASQ-3 Scoring and Derivation of Composite Indicators (click to expand)
</summary>

The **Ages and Stages Questionnaire (ASQ-3)** includes three response options for each question:  

| Response   | Score |
|-------------|--------|
| Yes         | 10     |
| Sometimes   | 5      |
| Not yet     | 0      |

Each of the five developmental domains (**Communication**, **Gross Motor**, **Fine Motor**, **Problem Solving**, and **Personal-Social**) has a total possible score ranging from **0 to 60**.  

However, cut-offs differ by both age version (24, 27, or 30 months) and domain, meaning that raw scores are **not directly comparable** across all versions.

Therefore, it is not possible to derive a single continuous ASQ score with consistent meaning across ages.

<br>


**Domain-level classification**

Each domain score is categorised into one of three levels based on age-specific cut-offs:

| Category          | Interpretation                                                  |
|-------------------|-----------------------------------------------------------------|
| **Below Cut-Off** | Requires further assessment or professional intervention        |
| **Monitor**       | Development should be monitored                                 |
| **Above Cut-Off** | Development is on schedule                                      |


<br>


**Binary overall ASQ indicators**

Two binary indicators are derived to summarise each child’s overall developmental status, following two conventions:

**(1) PHE Convention (Public Health England definition)**  
- **At Risk (0):** If *any* of the five domains are classified as **Below Cut-Off**.  
- **Not at Risk (1):** If *all* domains are either **Monitor** or **Above Cut-Off**.

**(2) FGLD Convention (Full Good Level of Development)**  
- **Good Level of Development (1):** If *all* five domains are **Above Cut-Off** (“No Risk”).  
- **Not GLD (0):** If *any* domain is classified as **Monitor** or **Below Cut-Off**.


<br>

**Continuous ASQ composite score**

A continuous composite score is also calculated:

$\text{ASQ Composite Score} = \sum_{d=1}^{5} \text{DomainBinary}_d$

Each domain contributes:  
- 1 = Above Cut-Off (No Risk)  
- 0 = Monitor or Below Cut-Off  

This produces a total score ranging from **0 to 5**, where higher values indicate **fewer developmental concerns**.

<br>

**Notes**  
- Domain-level categories are derived using **age-appropriate validated cut-offs** for each ASQ version (24, 27, or 30 months).  

Below are the cut-offs used:

| ASQ_version | ASQ_Domain       | Below Cut-Off | Monitor Cut-Off | No Risk Min |
|--------------|------------------|--------------------|----------------|-------------|
| 30 | Communication    | 33 | 44 | 45 |
| 27 | Communication    | 24 | 36 | 37 |
| 24 | Communication    | 25 | 38 | 39 |
| 30 | Gross Motor      | 36 | 44 | 45 |
| 27 | Gross Motor      | 28 | 38 | 39 |
| 24 | Gross Motor      | 38 | 45 | 46 |
| 30 | Fine Motor       | 19 | 34 | 35 |
| 27 | Fine Motor       | 18 | 30 | 31 |
| 24 | Fine Motor       | 35 | 43 | 44 |
| 30 | Problem Solving  | 27 | 39 | 40 |
| 27 | Problem Solving  | 27 | 39 | 40 |
| 24 | Problem Solving  | 29 | 39 | 40 |
| 30 | Personal-Social  | 32 | 40 | 41 |
| 27 | Personal-Social  | 25 | 35 | 36 |
| 24 | Personal-Social  | 31 | 40 | 41 |

</details>

## Preparatory Work

### Create dataset of children with complete ASQ per domain (regardless of 2y HV status)

In [None]:
# Children flagged as having an ASQ record (one row per child).
df_person_asq = df_person.loc[df_person["Has_ASQ"].eq(1)]

# Number of ASQ domains recorded versus the composite score.
_ = make_crosstab(
    df_person_asq,
    row_var="ASQ_n_domains",
    col_var="ASQ_Composite",
    caption_prefix="ASQ_n_domains x ASQ_Composite",
)

In [None]:
# Distribution of ASQ_n_domains among children with an ASQ record.
# Has_ASQ is 1 for every row of df_person_asq, so there is a single column level.
_ = make_crosstab(
    df_person_asq,
    row_var="ASQ_n_domains",
    col_var='Has_ASQ',
    caption_prefix="ASQ_n_domains x Has_ASQ",
)

In [None]:
# Keep only children whose retained ASQ record covers all five domains.
df_complete_asq = df_person_asq.loc[df_person_asq["ASQ_n_domains"].eq(5)]

# Distinct children per ASQ version (ascending by version).
version_counts = (
    df_complete_asq
    .groupby("ASQ_Version")["person_id"]
    .nunique()
    .sort_index()
)

# Rich-markup breakdown by version; built but not included in the printed
# message below.
version_summary = "\n".join(
    [f"  - [bold]{ver}[/bold]: {n:,}" for ver, n in version_counts.items()]
)

n_eligible = df_complete_asq["person_id"].nunique()
print(
    "[bold cyan]Eligible sample:[/bold cyan] "
    f"{n_eligible:,} unique children with a valid ASQ completed at or before 2.5 years of age, "
    "using the most recent ASQ record with complete data across all five developmental domains.\n"
)

_ = make_crosstab(
    df_complete_asq,
    row_var="ASQ_Version",
    col_var="ASQ_Composite",
    caption_prefix="ASQ_Version x ASQ_Composite",
)

## What number and proportion of children with an ASQ-3 assessment fall into each of three categories, for the overall ASQ-3 and for each ASQ-3 domain: a) reached a good level of development (GLD); b) ‘monitoring’; c) not reached a GLD?

In [None]:
# Distribution of ASQ_GLD (PHE convention) in the complete-ASQ cohort.
# Has_ASQ is constant (1) in this cohort, so it yields a single row.
_ = make_crosstab(
    df_complete_asq,
    row_var="Has_ASQ",
    col_var='ASQ_GLD',
    caption_prefix="Overall ASQ-3 developmental status (PHE classification)",
)

In [None]:
# Distribution of ASQ_FGLD in the complete-ASQ cohort (single row, as Has_ASQ
# is constant in this cohort).
_ = make_crosstab(
    df_complete_asq,
    row_var="Has_ASQ",
    col_var='ASQ_FGLD',
    caption_prefix="Overall ASQ-3 FGLD status",
)

In [None]:
id_complete_asq = df_complete_asq["person_id"].unique()

# Pull the domain-level rows of the ASQ record retained for each child
# (matched on person and ASQ version).
asq_keys = df_complete_asq[["person_id", "ASQ_Version"]]
df_match = df.merge(asq_keys, on=["person_id", "ASQ_Version"], how="inner")

# One row per (child, domain, category) for five-domain records.
df_match_valid = df_match.loc[
    df_match["ASQ_n_domains"] == 5,
    ["person_id", "ASQ_Domain", "ASQ_Category"],
].drop_duplicates()

# Counts: domains in rows, categories in columns, plus a row total.
domain_summary = (
    df_match_valid[["ASQ_Domain", "ASQ_Category"]]
    .value_counts()
    .unstack(fill_value=0)
)
domain_summary["Total"] = domain_summary.sum(axis=1)

# Row percentages, one decimal (the Total column is therefore always 100.0).
domain_pct = domain_summary.div(domain_summary["Total"], axis=0).mul(100).round(1)


def format_n_pct(n, pct):
    """Format a count and its percentage as 'n (pct%)'."""
    return f"{n} ({pct}%)"


# Same layout as domain_summary, each cell rendered as "n (pct%)".
domain_formatted = domain_summary.copy()
for col in domain_summary.columns:
    domain_formatted[col] = [
        format_n_pct(n, pct)
        for n, pct in zip(domain_summary[col], domain_pct[col])
    ]

styled_summary = (
    domain_formatted.reset_index()
    .style
    .hide(axis="index")
    .set_caption("ASQ-3 developmental categories by domain")
)
display(styled_summary)

In [None]:
%%R -i df_match_valid -o output_tbl_html

# Per-domain ASQ-3 risk table with an additional "Overall" column.
# Input: df_match_valid (person_id, ASQ_Domain, ASQ_Category), passed in via -i.
# Output: HTML + RDS files under results_dir; output_tbl_html is returned via -o.
output_path <- file.path(results_dir, "tbl_asq_domain_summary")
output_tbl_html <- paste0(output_path, ".html")
output_tbl_rds  <- paste0(output_path, ".rds")

# Derive the two display variables from the three-level category:
#   Risk_Status - "Not Risk" for Above Cut-Off, otherwise "Risk"
#   Risk_Detail - "Monitor" / "Below cut-off" for at-risk rows, NA otherwise
df_for_tbl <- df_match_valid %>%
  mutate(
    ASQ_Category = factor(ASQ_Category, levels = c("Above Cut-Off", "Monitor", "Below Cut-Off")),
    Risk_Status = ifelse(ASQ_Category == "Above Cut-Off", "Not Risk", "Risk"),
    Risk_Status = factor(Risk_Status, levels = c("Not Risk", "Risk")),
    Risk_Detail = case_when(
      ASQ_Category == "Monitor" ~ "Monitor",
      ASQ_Category == "Below Cut-Off" ~ "Below cut-off",
      TRUE ~ NA_character_
    ),
    Risk_Detail = factor(Risk_Detail, levels = c("Monitor", "Below cut-off"))
  )

# Number of distinct children contributing to the table.
N_children <- n_distinct(df_for_tbl$person_id)

# Child-level "Overall" pseudo-domain: the worst category across a child's
# domains (Below Cut-Off takes precedence over Monitor, then Above Cut-Off).
df_overall <- df_for_tbl %>%
  group_by(person_id) %>%
  summarise(
    any_below   = any(ASQ_Category == "Below Cut-Off"),
    any_monitor = any(ASQ_Category == "Monitor"),
    .groups = "drop"
  ) %>%
  mutate(
    ASQ_Domain = "Overall",
    ASQ_Category = case_when(
      any_below ~ "Below Cut-Off",
      any_monitor ~ "Monitor",
      TRUE ~ "Above Cut-Off"
    ),
    Risk_Status = ifelse(ASQ_Category == "Above Cut-Off", "Not Risk", "Risk"),
    Risk_Detail = case_when(
      ASQ_Category == "Monitor" ~ "Monitor",
      ASQ_Category == "Below Cut-Off" ~ "Below cut-off",
      TRUE ~ NA_character_
    )
  ) %>%
  select(person_id, ASQ_Domain, ASQ_Category, Risk_Status, Risk_Detail)

# Number of distinct children whose overall status is "Risk".
N_risk_children <- df_overall %>%
  filter(Risk_Status == "Risk") %>%
  summarise(n = n_distinct(person_id)) %>%
  pull(n)

# Stack the domain-level rows with the child-level "Overall" rows.
df_tbl <- bind_rows(
  df_for_tbl %>% select(person_id, ASQ_Domain, ASQ_Category, Risk_Status, Risk_Detail),
  df_overall
)

# Fix the column order of the table; "Overall" is placed last.
df_tbl <- df_tbl %>%
  mutate(
    ASQ_Domain = factor(
      ASQ_Domain,
      levels = c(
        "Communication",
        "Fine Motor",
        "Gross Motor",
        "Personal-Social",
        "Problem Solving",
        "Overall"
      )
    )
  )

# One column per domain; rows are Risk_Status and, beneath it, Risk_Detail.
# missing = "no" suppresses the missing-value row (Risk_Detail is NA for rows
# that are not at risk).
tab_risk_hier <- df_tbl %>%
  tbl_summary(
    by = ASQ_Domain,
    include = c(Risk_Status, Risk_Detail),
    statistic = all_categorical() ~ "{n} ({p}%)",
    digits = list(
          all_categorical() ~ c(0, 1)
      ),
    percent = "column",
    missing = "no"
  ) %>%
  # Add an N column (shown only on the two variable label rows), relabel the
  # variables and indent the Risk_Detail levels.
  modify_table_body(
    ~ .x %>%
      mutate(
        N = case_when(
          row_type == "label" & variable == "Risk_Status" ~ format(N_children, big.mark = ","),
          row_type == "label" & variable == "Risk_Detail" ~ format(N_risk_children, big.mark = ","),
          TRUE ~ ""
        ),
        label = case_when(
          variable == "Risk_Status" & row_type == "label" ~ "Risk status",
          variable == "Risk_Detail" & row_type == "label" ~ "Risk breakdown",
          variable == "Risk_Detail" & row_type == "level" ~ paste0("    ", label),
          TRUE ~ label
        )
      )
  ) %>%
  modify_header(
    # label ~ "**Measure**",
    label ~ "",
    N ~ "**N (children)**",
    all_stat_cols() ~ "**{level}**"
  ) %>%
  bold_labels() %>%
  # Remove every footnote from the table.
  modify_footnote(everything() ~ NA)

gt_tbl <- tab_risk_hier %>%
  as_gt()

gt_cols <- colnames(gt_tbl[["_data"]])

# stat_1 .. stat_5 = ASQ domains, stat_6 = Overall
domain_cols <- gt_cols[grepl("^stat_", gt_cols)][1:5]

# Spanner over the five domain columns only, then centre all cells.
gt_tbl <- gt_tbl %>%
  gt::tab_spanner(
    label = gt::md("**ASQ-3 domain**"),
    columns = domain_cols
  ) %>%
  gt::tab_style(
    style = gt::cell_text(align = "center", v_align = "middle"),
    locations = list(
      gt::cells_column_labels(),
      gt::cells_column_spanners(),
      gt::cells_body(),
      gt::cells_stub()
    )
  ) %>%
  gt::tab_style(
    style = gt::cell_text(align = "center", v_align = "middle", weight = "bold"),
    locations = gt::cells_stubhead()
  )

# Persist as HTML (displayed by the next Python cell) and as an RDS object.
gt::gtsave(gt_tbl, output_tbl_html)
saveRDS(gt_tbl, output_tbl_rds)

In [None]:
# Display the HTML table written by the preceding R cell; output_tbl_html is
# the file path exported from R via `-o`.
show_tab_model(output_tbl_html)

In [None]:
%%R -i df_match_valid -o output_tbl_html

# Per-domain ASQ-3 risk table without an "Overall" column.
# Input: df_match_valid (person_id, ASQ_Domain, ASQ_Category), passed in via -i.
# Output: HTML + RDS files under results_dir; output_tbl_html is returned via -o.
output_path <- file.path(results_dir, "tbl_asq_domain_sum_no_overall")
output_tbl_html <- paste0(output_path, ".html")
output_tbl_rds  <- paste0(output_path, ".rds")

# Derive the two display variables from the three-level category:
#   Risk_Status - "Not Risk" for Above Cut-Off, otherwise "Risk"
#   Risk_Detail - "Monitor" / "Below cut-off" for at-risk rows, NA otherwise
df_for_tbl <- df_match_valid %>%
  mutate(
    ASQ_Category = factor(ASQ_Category, levels = c("Above Cut-Off", "Monitor", "Below Cut-Off")),
    Risk_Status = ifelse(ASQ_Category == "Above Cut-Off", "Not Risk", "Risk"),
    Risk_Status = factor(Risk_Status, levels = c("Not Risk", "Risk")),
    Risk_Detail = case_when(
      ASQ_Category == "Monitor" ~ "Monitor",
      ASQ_Category == "Below Cut-Off" ~ "Below cut-off",
      TRUE ~ NA_character_
    ),
    Risk_Detail = factor(Risk_Detail, levels = c("Monitor", "Below cut-off"))
  )

# Distinct children in the table, and distinct children with at least one
# domain row classed as "Risk".
N_children <- n_distinct(df_for_tbl$person_id)
N_risk_children <- df_for_tbl %>%
  filter(Risk_Status == "Risk") %>%
  summarise(n = n_distinct(person_id)) %>%
  pull(n)

# Fix the column order of the table.
df_tbl <- df_for_tbl %>%
  mutate(
    ASQ_Domain = factor(
      ASQ_Domain,
      levels = c(
        "Communication",
        "Fine Motor",
        "Gross Motor",
        "Personal-Social",
        "Problem Solving"
      )
    )
  )

# One column per domain; rows are Risk_Status and, beneath it, Risk_Detail.
# missing = "no" suppresses the missing-value row (Risk_Detail is NA for rows
# that are not at risk).
tab_risk_hier <- df_tbl %>%
  tbl_summary(
    by = ASQ_Domain,
    include = c(Risk_Status, Risk_Detail),
    statistic = all_categorical() ~ "{n} ({p}%)",
    digits = list(all_categorical() ~ c(0, 1)),
    percent = "column",
    missing = "no"
  ) %>%
  # Add an N column (shown only on the two variable label rows), relabel the
  # variables and indent the Risk_Detail levels.
  modify_table_body(
    ~ .x %>%
      mutate(
        N = case_when(
          row_type == "label" & variable == "Risk_Status" ~ format(N_children, big.mark = ","),
          row_type == "label" & variable == "Risk_Detail" ~ format(N_risk_children, big.mark = ","),
          TRUE ~ ""
        ),
        label = case_when(
          variable == "Risk_Status" & row_type == "label" ~ "Risk status",
          variable == "Risk_Detail" & row_type == "label" ~ "Risk breakdown",
          variable == "Risk_Detail" & row_type == "level" ~ paste0("    ", label),
          TRUE ~ label
        )
      )
  ) %>%
  modify_header(
    label ~ "  ",
    N ~ "**N (children)**",
    all_stat_cols() ~ "**{level}**"
  ) %>%
  bold_labels() %>%
  # Remove every footnote from the table.
  modify_footnote(everything() ~ NA)

gt_tbl <- tab_risk_hier %>%
  as_gt()

# Only the five domain columns exist here, so [1:5] selects every stat_ column.
gt_cols <- colnames(gt_tbl[["_data"]])
domain_cols <- gt_cols[grepl("^stat_", gt_cols)][1:5]

# Spanner over the domain columns, then centre all cells.
gt_tbl <- gt_tbl %>%
  gt::tab_spanner(
    label = gt::md("**ASQ-3 domain**"),
    columns = domain_cols
  ) %>%
  gt::tab_style(
    style = gt::cell_text(align = "center", v_align = "middle"),
    locations = list(
      gt::cells_column_labels(),
      gt::cells_column_spanners(),
      gt::cells_body(),
      gt::cells_stub()
    )
  ) %>%
  gt::tab_style(
    style = gt::cell_text(align = "center", v_align = "middle", weight = "bold"),
    locations = gt::cells_stubhead()
  )

# Fixed width for the N column.
gt_tbl <- gt_tbl %>%
  gt::cols_width(
    "N" ~ gt::px(120)           
  )

# Persist as HTML (displayed by the next Python cell) and as an RDS object.
gt::gtsave(gt_tbl, output_tbl_html)
saveRDS(gt_tbl, output_tbl_rds)

In [None]:
# Display the HTML table written by the preceding R cell; output_tbl_html is
# the file path exported from R via `-o`.
show_tab_model(output_tbl_html)

# Who achieves a GLD on the EYFSP?

For the **Early Years Foundation Stage Profile (EYFSP)**, outcomes will be analysed in two complementary formats:

- **Binary Good Level of Development (GLD) indicator:**  
  A binary variable will be derived following the national definition of GLD. Children will be classified as having a *Good Level of Development* if they achieve at least the expected level in all of the core areas: `communication and language`, `physical development`, `personal, social and emotional development`, `literacy`, and `mathematics`.

- **Continuous total EYFSP score:**  
  Where available, a continuous total score will also be calculated by summing the scores across all *Early Learning Goals (ELGs)*. This measure provides greater sensitivity for modelling and secondary analyses.

<br>

**Structure of the EYFSP assessment**

The EYFSP comprises **17 Early Learning Goals (ELGs)**, organised under seven areas of learning and development:

| Area of learning and development | Number of ELGs|
|----------------------------------|----------------|
| Communication and Language | 3  |
| Physical Development | 2 |
| Personal, Social and Emotional Development | 3 | 
| Literacy | 2 | 
| Mathematics | 2 | 
| Understanding the World | 3 | 
| Expressive Arts and Design | 2 | 

Each ELG is typically scored on a three-point scale:  **1 = Emerging**, **2 = Expected**, **3 = Exceeding**. A rating of **A** is recorded for children who have been granted an exemption from the assessment.

The total EYFSP score therefore ranges from **17 (all Emerging)** to **51 (all Exceeding)**.

## Preparatory Work

### Create dataset with valid EYFSP

In [None]:
# Person-level cross-tab: EYFSP-present flag (1/0) by the derived FSP_GLD
# indicator. Shows how FSP_GLD values, including missing ones, line up with
# the presence flag.
_ = make_crosstab(
    df_person,
    row_var="FSP_Present",
    col_var='FSP_GLD',
    caption_prefix="FSP_Present x FSP_GLD",
)

In [None]:
# Person-level and row-level subsets with a non-missing FSP_GLD.
df_person_eyfsp = df_person.loc[df_person["FSP_GLD"].notna()].drop_duplicates()
df_eyfsp = df.loc[df["FSP_GLD"].notna()]

n_eyfsp = df_person_eyfsp["person_id"].nunique()
print(f"[bold cyan]RQ4 eligible sample: {n_eyfsp:,} unique children who had a non-empty FSP_GLD entry (regardless of their HV and ASQ states).[/bold cyan]")

dob_first = df_person_eyfsp["birth_datetime"].min().date()
dob_last = df_person_eyfsp["birth_datetime"].max().date()
acadyr_first = df_person_eyfsp["FSP_ACADYR"].min()
acadyr_last = df_person_eyfsp["FSP_ACADYR"].max()
print(f"DOB range for RQ4 eligible sample: {dob_first} to {dob_last}. Academic years covered: {acadyr_first} to {acadyr_last}.")

today = pd.Timestamp.today().normalize()

# Date on which each child reaches 1826 days of age (five years, counted in days).
df_person_eyfsp = df_person_eyfsp.assign(
    age_5_0=df_person_eyfsp["birth_datetime"] + pd.DateOffset(days=1826)
)

## For all children aged 5, what numbers and proportion reach and do not reach a GLD?

In [None]:
# Within the EYFSP cohort (non-missing FSP_GLD): FSP_GLD (rows) by ASQ_GLD (columns).
_ = make_crosstab(
    df_person_eyfsp,
    row_var="FSP_GLD",
    col_var='ASQ_GLD',
    caption_prefix="FSP_GLD x ASQ_GLD",
)

# Does the developmental outcome on the ASQ-3 at age 2 years predict the developmental outcome on the EYFSP at age 5 years, using both dichotomous GLD and continuous score outcomes?

In [None]:
# Domain-level rows for children with both an FSP_GLD and an ASQ_GLD value.
has_both_outcomes = (
    df_person_domains["FSP_GLD"].notna() & df_person_domains["ASQ_GLD"].notna()
)
df_fsp_asq = df_person_domains.loc[has_both_outcomes].drop_duplicates()

# Collapse to the person-level columns used for the linked analysis.
linked_person_cols = [
    'person_id', 'gender', 'ethnicity_group',
    'ASQ_Composite', 'ASQ_FGLD', 'ASQ_GLD',
    'FSP_TotalScore', 'FSP_GLD', 'age_fsp_months',
    'IMD19_deciles', 'IMD19_quintile',
]
df_person_fsp_asq = df_fsp_asq[linked_person_cols].drop_duplicates()

n_linked = df_person_fsp_asq["person_id"].nunique()
print(f"[bold cyan]RQ5 eligible sample: {n_linked:,} unique children who had a non-empty FSP_GLD and complete ASQ entries.[/bold cyan]")

In [None]:
# Within the linked cohort (both FSP_GLD and ASQ_GLD non-missing):
# FSP_GLD (rows) by ASQ_GLD (columns).
_ = make_crosstab(
    df_person_fsp_asq,
    row_var="FSP_GLD",
    col_var='ASQ_GLD',
    caption_prefix="FSP_GLD x ASQ_GLD"
)