Setup & configuration

In [10]:
# === Section 1: Project paths & constants (run this first) ===
from pathlib import Path
import os

# --- Locate the repository root (notebook kernel, VS Code, or plain script) ---
if "__file__" not in globals():
    # Notebook kernels define no __file__: honour an explicit override such as
    #   os.environ["PROJECT_ROOT"] = "C:/path/to/repo"
    # and otherwise assume the repo root is the parent of the working directory.
    PROJECT_ROOT = Path(os.environ.get("PROJECT_ROOT", "..")).resolve()
else:
    # Running as a script: the root is two levels above this file.
    PROJECT_ROOT = Path(__file__).resolve().parent.parent

print(f"PROJECT_ROOT = {PROJECT_ROOT}")

# --- Standard data directories (created if they do not exist yet) ---
DATA_RAW, DATA_INTERIM, DATA_PROCESSED = (
    PROJECT_ROOT / "data" / stage for stage in ("raw", "interim", "processed")
)

for data_dir in (DATA_RAW, DATA_INTERIM, DATA_PROCESSED):
    data_dir.mkdir(parents=True, exist_ok=True)

print(f"Using DATA_INTERIM = {DATA_INTERIM}")
print(f"Using DATA_PROCESSED = {DATA_PROCESSED}")

# --- Where each wave's raw SPSS file is expected (edit if your layout differs) ---
_ukhls_dir = DATA_RAW / "UKHLS"
files = {
    "k": _ukhls_dir / "k_indresp.sav",
    "l": _ukhls_dir / "l_indresp.sav",
    "n": _ukhls_dir / "n_indresp.sav",
}

# Report every expected file and remember the ones that are not on disk
missing = []
for wave_key, sav_path in files.items():
    found = sav_path.exists()
    print(f"{wave_key}: exists={found} -> {sav_path}")
    if found:
        continue
    missing.append((wave_key, sav_path))

# Friendly hint instead of a hard failure when something is absent
if missing:
    print("\n⚠️ Some raw files are missing:")
    for wave_key, sav_path in missing:
        print(f"  - {wave_key}: {sav_path}")
    print("   → Update `files[...]` paths above or put the files in the expected locations.")


PROJECT_ROOT = C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc
Using DATA_INTERIM = C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\interim
Using DATA_PROCESSED = C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed
k: exists=True -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\raw\UKHLS\k_indresp.sav
l: exists=True -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\raw\UKHLS\l_indresp.sav
n: exists=True -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\raw\UKHLS\n_indresp.sav


In [11]:
# === Section 2: Define per-wave "columns of interest" (ensure 21 with pidp first) ===
import pandas as pd

# Curated requests (note: fixed the stray space before k_hhsize).
# Each list names 21 columns: the cross-wave key "pidp" followed by 20
# wave-prefixed variables, in the same order for every wave so the lists
# line up position by position.
k_cols_interest = [
    "pidp",
    "k_hrpid","k_ppid","k_ind5mus_xw","k_indinui_lw",
    "k_intdaty_dv","k_age_dv","k_sex_dv",
    "k_gor_dv","k_urban_dv","k_hhtype_dv","k_hhsize",
    "k_ethn_dv","k_mhealthtyp1",
    "k_jbft_dv","k_jbnssec_dv","k_jbhrs",
    "k_fimnnet_dv","k_sf12pcs_dv","k_scghq1_dv","k_nchild_dv",
]

# Same layout as wave K, with the "l_" prefix.
l_cols_interest = [
    "pidp",
    "l_hrpid","l_ppid","l_ind5mus_xw","l_indinui_lw",
    "l_intdaty_dv","l_age_dv","l_sex_dv",
    "l_gor_dv","l_urban_dv","l_hhtype_dv","l_hhsize",
    "l_ethn_dv","l_mhealthtyp1",
    "l_jbft_dv","l_jbnssec_dv","l_jbhrs",
    "l_fimnnet_dv","l_sf12pcs_dv","l_scghq1_dv","l_nchild_dv",
]

# N uses anxiety = mhgad: the slot that holds `<wave>_mhealthtyp1` in K and L
# holds `n_mhgad` here.
# NOTE(review): whether the two items are comparable measures should be
# confirmed against the UKHLS documentation.
n_cols_interest = [
    "pidp",
    "n_hrpid","n_ppid","n_ind5mus_xw","n_indinui_lw",
    "n_intdaty_dv","n_age_dv","n_sex_dv",
    "n_gor_dv","n_urban_dv","n_hhtype_dv","n_hhsize",
    "n_ethn_dv","n_mhgad",
    "n_jbft_dv","n_jbnssec_dv","n_jbhrs",
    "n_fimnnet_dv","n_sf12pcs_dv","n_scghq1_dv","n_nchild_dv",
]

def _read_all_cols(prefix: str) -> list[str]:
    """
    Return the saved SPSS header names for one wave.

    Reads ``ukhls_<prefix>_all_columns.csv`` from ``DATA_INTERIM`` (the file
    saved earlier by your 'show_columns' cell) and returns the values of its
    ``column`` field as whitespace-stripped strings, in file order.
    """
    header_csv = DATA_INTERIM / f"ukhls_{prefix}_all_columns.csv"
    header_frame = pd.read_csv(header_csv)
    names = header_frame["column"].astype(str)
    return names.str.strip().tolist()

def ensure_21_with_pidp(prefix: str, cols: list[str], max_cols: int = 21) -> list[str]:
    """
    Validate a wave's requested columns and persist the final list.

    Steps:
    - Strip whitespace, ignore non-string entries, drop duplicates (keep order)
    - Ensure 'pidp' is present and first (if available in the header)
    - Validate against the saved header list for the wave
    - Cap to `max_cols` columns (21 by default, your target)
    - Persist the list to data/interim for later cells

    Args:
        prefix: Wave letter used in file names, e.g. "k".
        cols: Requested column names.
        max_cols: Maximum number of columns to keep (default 21).

    Returns:
        The validated column names, at most `max_cols` long.

    Raises:
        ValueError: If `max_cols` is negative.

    Notes:
        A warning is printed when requested columns are absent from the header,
        when columns are cut off by the cap, and when fewer than `max_cols`
        columns survive, so the "21 columns" target never fails silently.
    """
    if max_cols < 0:
        raise ValueError(f"max_cols must be non-negative, got {max_cols}")

    # Normalise: strip, skip empties, de-duplicate while preserving first-seen order
    stripped = (c.strip() for c in cols if isinstance(c, str))
    requested = list(dict.fromkeys(c for c in stripped if c))

    header = set(_read_all_cols(prefix))

    # Ensure pidp first if available (inserted even when it was not requested)
    if "pidp" in header:
        requested = ["pidp"] + [c for c in requested if c != "pidp"]

    # Report any requested columns not in the header (helps catch typos)
    missing_in_header = [c for c in requested if c not in header]
    if missing_in_header:
        print(f"⚠️ {prefix.upper()} missing in header: {missing_in_header}")

    # Keep only those that actually exist in the header
    requested = [c for c in requested if c in header]

    # Cap to the target, but say so: dropping a column silently is easy to miss
    if len(requested) > max_cols:
        dropped = requested[max_cols:]
        print(f"⚠️ {prefix.upper()} dropping columns beyond the cap of {max_cols}: {dropped}")
        requested = requested[:max_cols]
    elif len(requested) < max_cols:
        print(f"⚠️ {prefix.upper()} only {len(requested)} of {max_cols} target columns available")

    # Persist & echo
    out = DATA_INTERIM / f"ukhls_{prefix}_columns_of_interest.csv"
    pd.Series(requested, name="column").to_frame().to_csv(out, index=False)
    print(f"{prefix.upper()} -> {len(requested)} columns")
    print(requested)
    print("-" * 120)
    return requested

# Validate each wave's request and rebind the name to the list returned by
# ensure_21_with_pidp (which also saves it as a CSV under DATA_INTERIM).
# The raw lists are re-declared at the top of this cell, so re-running the
# cell starts from the curated requests again.
k_cols_interest = ensure_21_with_pidp("k", k_cols_interest)
l_cols_interest = ensure_21_with_pidp("l", l_cols_interest)
n_cols_interest = ensure_21_with_pidp("n", n_cols_interest)

K -> 21 columns
['pidp', 'k_hrpid', 'k_ppid', 'k_ind5mus_xw', 'k_indinui_lw', 'k_intdaty_dv', 'k_age_dv', 'k_sex_dv', 'k_gor_dv', 'k_urban_dv', 'k_hhtype_dv', 'k_hhsize', 'k_ethn_dv', 'k_mhealthtyp1', 'k_jbft_dv', 'k_jbnssec_dv', 'k_jbhrs', 'k_fimnnet_dv', 'k_sf12pcs_dv', 'k_scghq1_dv', 'k_nchild_dv']
------------------------------------------------------------------------------------------------------------------------
L -> 21 columns
['pidp', 'l_hrpid', 'l_ppid', 'l_ind5mus_xw', 'l_indinui_lw', 'l_intdaty_dv', 'l_age_dv', 'l_sex_dv', 'l_gor_dv', 'l_urban_dv', 'l_hhtype_dv', 'l_hhsize', 'l_ethn_dv', 'l_mhealthtyp1', 'l_jbft_dv', 'l_jbnssec_dv', 'l_jbhrs', 'l_fimnnet_dv', 'l_sf12pcs_dv', 'l_scghq1_dv', 'l_nchild_dv']
------------------------------------------------------------------------------------------------------------------------
N -> 21 columns
['pidp', 'n_hrpid', 'n_ppid', 'n_ind5mus_xw', 'n_indinui_lw', 'n_intdaty_dv', 'n_age_dv', 'n_sex_dv', 'n_gor_dv', 'n_urban_dv', 'n_hhtyp

In [13]:
# === Save per-wave columns-of-interest to Parquet, save preview, then reload & show 10 rows ===
from pathlib import Path
from IPython.display import display
import pandas as pd
import pyreadstat

# Controls whether existing Parquet/preview files are regenerated.
OVERWRITE = True  # set False to skip writing if files already exist

# NOTE(review): the name OUT_DIR is assigned again (to data/processed) by the
# standardize cell further down; code that looks it up lazily sees whichever
# assignment ran last.
OUT_DIR = DATA_INTERIM  # saving Parquet & previews here
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Per-wave lookups: validated column list and raw SPSS path, keyed by wave letter.
keep_maps = {"k": k_cols_interest, "l": l_cols_interest, "n": n_cols_interest}
raw_paths = {"k": files["k"], "l": files["l"], "n": files["n"]}

def _unique(seq):
    seen = set()
    out = []
    for x in seq:
        if isinstance(x, str):
            x = x.strip()
        if x and x not in seen:
            out.append(x)
            seen.add(x)
    return out

def _arrow_safe_parquet(df: pd.DataFrame, out_path: Path):
    """
    Normalize dtypes that can bother Parquet engines and save.
    Tries pyarrow first, then fastparquet. Raises with a helpful hint otherwise.
    """
    df2 = df.copy()
    for c in df2.columns:
        # Avoid deprecation warning: use isinstance(..., pd.CategoricalDtype)
        if isinstance(df2[c].dtype, pd.CategoricalDtype):
            df2[c] = df2[c].astype("string")
        elif pd.api.types.is_object_dtype(df2[c]):
            df2[c] = df2[c].astype("string")

    last_err = None
    for engine in ("pyarrow", "fastparquet"):
        try:
            df2.to_parquet(out_path, index=False, engine=engine)
            return out_path
        except Exception as e:
            last_err = e
    raise RuntimeError(
        f"Failed to write Parquet at {out_path}.\n"
        f"Last error: {type(last_err).__name__}: {last_err}\n"
        "Hint: install a Parquet engine in your venv, e.g. `pip install pyarrow`."
    )

def save_wave_subset(prefix: str, out_dir=None) -> Path:
    """
    Read only the requested columns for a wave, save Parquet + preview CSV.

    Args:
        prefix: Wave letter, a key of `keep_maps` and `raw_paths` (e.g. "k").
        out_dir: Optional destination folder. Defaults to `DATA_INTERIM`.
            The folder is resolved here rather than from the global `OUT_DIR`,
            because that name is reassigned to data/processed by a later cell
            and a re-run of this function would then write to the wrong place.

    Returns:
        Path of the Parquet file for the wave.

    Raises:
        FileNotFoundError: If the Parquet file is missing after the save step.

    Notes:
        When the Parquet file already exists and `OVERWRITE` is False, the
        function returns immediately without reading the SPSS file.
    """
    target_dir = Path(DATA_INTERIM if out_dir is None else out_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    pq_path = target_dir / f"ukhls_{prefix}_columns_of_interest.parquet"
    prev_path = target_dir / f"ukhls_{prefix}_preview_head10.csv"

    # Decide whether to skip before touching the SPSS file: reading it is the
    # expensive part and pointless when nothing will be written.
    if pq_path.exists() and not OVERWRITE:
        print(f"{prefix.upper()} exists -> {pq_path} (skip write; set OVERWRITE=True to regenerate)")
        return pq_path

    usecols = _unique(keep_maps[prefix])

    # Read only requested columns from SPSS (numeric codes, not value labels)
    df, _ = pyreadstat.read_sav(
        raw_paths[prefix],
        usecols=usecols,
        apply_value_formats=False
    )

    # Deduplicate any accidental duplicate columns (first occurrence wins)
    df = df.loc[:, ~df.columns.duplicated()]

    print(f"{prefix.upper()} -> shape {df.shape} (requested={len(usecols)})")

    _arrow_safe_parquet(df, pq_path)
    print(f"Saved Parquet -> {pq_path}")

    df.head(10).to_csv(prev_path, index=False)
    print(f"Saved preview (10 rows) -> {prev_path}")

    if not pq_path.exists():
        raise FileNotFoundError(f"Expected Parquet not found after save: {pq_path}")
    return pq_path

print("=== Reading & saving per-wave 'columns of interest' to Parquet (with previews) ===")
# Waves are processed in K, L, N order; each call returns that wave's Parquet path.
k_pq, l_pq, n_pq = (save_wave_subset(wave) for wave in ("k", "l", "n"))

# ---- Reload from saved Parquet and display 10 rows for each ----
def _read_parquet_smart(path: Path) -> pd.DataFrame:
    last_err = None
    for engine in ("pyarrow", "fastparquet"):
        try:
            return pd.read_parquet(path, engine=engine)
        except Exception as e:
            last_err = e
    raise RuntimeError(
        f"Failed to read Parquet at {path}.\n"
        f"Last error: {type(last_err).__name__}: {last_err}\n"
        "Hint: install a Parquet engine in your venv, e.g. `pip install pyarrow`."
    )

def show_head_from_parquet(path: Path, title: str, n: int = 10):
    """
    Reload a saved Parquet file and display its first `n` rows.

    Raises TypeError when `path` is None and FileNotFoundError when the file
    is not on disk; otherwise prints a one-line summary (title, row and column
    counts) and renders the head of the frame.
    """
    if path is None:
        raise TypeError(f"{title}: Parquet path is None")

    parquet_file = Path(path)
    if not parquet_file.exists():
        raise FileNotFoundError(f"{title}: Missing file -> {parquet_file}")

    frame = _read_parquet_smart(parquet_file)
    n_rows, n_cols = frame.shape
    print(f"\n{title} (loaded from Parquet) — head({n}) [{n_rows} rows, {n_cols} cols]")
    display(frame.head(n))

# Round-trip check: reload each wave's Parquet file and show its first rows.
for _pq_path, _wave_title in ((k_pq, "K"), (l_pq, "L"), (n_pq, "N")):
    show_head_from_parquet(_pq_path, _wave_title)

=== Reading & saving per-wave 'columns of interest' to Parquet (with previews) ===
K -> shape (32008, 21) (requested=21)
Saved Parquet -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\interim\ukhls_k_columns_of_interest.parquet
Saved preview (10 rows) -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\interim\ukhls_k_preview_head10.csv
K -> shape (32008, 21) (requested=21)
Saved Parquet -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\interim\ukhls_k_columns_of_interest.parquet
Saved preview (10 rows) -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\interim\ukhls_k_preview_head10.csv
L -> shape (29271, 21) (requested=21)
Saved Parquet -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\interim\ukhls_l_columns_of_interest.parquet
Saved preview (10 rows) -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\interim\ukhls_l_preview_head10.csv
L -> shape (29271, 21) (requested=21)
Saved Parquet -> C:\User

Unnamed: 0,pidp,k_mhealthtyp1,k_hhsize,k_jbhrs,k_sex_dv,k_age_dv,k_intdaty_dv,k_ethn_dv,k_fimnnet_dv,k_gor_dv,...,k_hhtype_dv,k_nchild_dv,k_hrpid,k_ppid,k_jbft_dv,k_jbnssec_dv,k_scghq1_dv,k_sf12pcs_dv,k_indinui_lw,k_ind5mus_xw
0,68006127.0,-8.0,2.0,-8.0,2.0,49.0,2019.0,1.0,0.0,1.0,...,6.0,0.0,68006127.0,68020564.0,-8.0,-8.0,16.0,26.41,1.703009,0.0
1,68020564.0,-8.0,2.0,-8.0,1.0,48.0,2019.0,1.0,1565.670044,1.0,...,6.0,0.0,68006127.0,68006127.0,-8.0,-8.0,12.0,34.68,0.0,0.0
2,68008847.0,-8.0,1.0,39.0,2.0,61.0,2019.0,1.0,2134.0,1.0,...,2.0,0.0,68008847.0,-8.0,1.0,14.0,9.0,44.2,0.794699,0.0
3,68009527.0,-8.0,4.0,39.0,1.0,41.0,2019.0,1.0,2043.0,1.0,...,11.0,2.0,68009527.0,68061288.0,1.0,15.0,16.0,60.48,0.962017,0.0
4,68061288.0,-8.0,4.0,-8.0,2.0,33.0,2019.0,1.0,149.25,1.0,...,11.0,2.0,68009527.0,68009527.0,-8.0,-8.0,11.0,54.23,0.0,0.0
5,68010887.0,-8.0,2.0,32.0,2.0,55.0,2019.0,1.0,1250.0,1.0,...,6.0,0.0,68068082.0,68068082.0,1.0,25.0,9.0,57.28,1.055802,0.0
6,68068082.0,-8.0,2.0,-8.0,1.0,58.0,2019.0,1.0,690.859985,1.0,...,6.0,0.0,68068082.0,68010887.0,1.0,19.0,9.0,56.15,0.0,0.0
7,68014287.0,-8.0,3.0,-8.0,2.0,49.0,2019.0,1.0,715.869995,1.0,...,18.0,1.0,68014287.0,-8.0,-8.0,-8.0,23.0,44.37,0.0,0.0
8,68020407.0,-8.0,2.0,-8.0,2.0,82.0,2019.0,1.0,1621.670044,1.0,...,17.0,0.0,68020407.0,-8.0,-8.0,-8.0,13.0,18.07,0.833442,0.0
9,68028575.0,-8.0,4.0,-8.0,2.0,28.0,2019.0,1.0,0.0,1.0,...,11.0,2.0,68157166.0,68157166.0,-8.0,-8.0,6.0,56.71,1.029255,0.0



L (loaded from Parquet) — head(10) [29271 rows, 21 cols]


Unnamed: 0,pidp,l_mhealthtyp1,l_hhsize,l_jbhrs,l_sex_dv,l_age_dv,l_intdaty_dv,l_ethn_dv,l_fimnnet_dv,l_gor_dv,...,l_hhtype_dv,l_nchild_dv,l_hrpid,l_ppid,l_jbft_dv,l_jbnssec_dv,l_scghq1_dv,l_sf12pcs_dv,l_indinui_lw,l_ind5mus_xw
0,68008847.0,-8.0,1.0,39.0,2.0,62.0,2020.0,1.0,2288.0,1.0,...,2.0,0.0,68008847.0,-8.0,1.0,14.0,12.0,37.58,0.739967,0.0
1,68009527.0,-8.0,4.0,36.5,1.0,43.0,2020.0,1.0,2060.0,1.0,...,11.0,2.0,68034180.0,68061288.0,1.0,15.0,11.0,56.37,0.950116,0.0
2,68061288.0,-8.0,4.0,10.0,2.0,34.0,2020.0,1.0,474.0,1.0,...,11.0,2.0,68034180.0,68009527.0,2.0,24.0,15.0,61.73,0.0,0.0
3,68010887.0,-8.0,2.0,32.0,2.0,56.0,2020.0,1.0,1200.0,1.0,...,6.0,0.0,68010887.0,68068082.0,1.0,25.0,11.0,51.64,0.998976,0.0
4,68068082.0,-8.0,2.0,-8.0,1.0,59.0,2020.0,1.0,3275.909912,1.0,...,6.0,0.0,68010887.0,68010887.0,1.0,17.0,10.0,56.15,0.0,0.0
5,68028575.0,-8.0,4.0,-8.0,2.0,28.0,2020.0,1.0,0.0,1.0,...,11.0,2.0,68095380.0,68157166.0,-8.0,-8.0,8.0,57.76,1.03181,0.0
6,68157166.0,-8.0,4.0,38.0,1.0,35.0,2020.0,1.0,2974.0,1.0,...,11.0,2.0,68095380.0,68028575.0,1.0,8.0,7.0,57.47,0.0,0.0
7,68029927.0,-8.0,5.0,-8.0,2.0,48.0,2020.0,1.0,89.699997,1.0,...,20.0,0.0,68029939.0,68029931.0,-8.0,-8.0,9.0,53.79,0.0,0.0
8,68029939.0,-8.0,5.0,-8.0,1.0,16.0,2020.0,1.0,0.0,1.0,...,20.0,0.0,68029939.0,-8.0,-8.0,-8.0,8.0,57.76,0.0,0.0
9,68149808.0,-8.0,5.0,30.0,2.0,23.0,2020.0,1.0,865.0,1.0,...,20.0,0.0,68029939.0,-8.0,1.0,7.0,8.0,56.15,0.0,0.0



N (loaded from Parquet) — head(10) [35471 rows, 21 cols]


Unnamed: 0,pidp,n_mhgad,n_hhsize,n_jbhrs,n_sex_dv,n_age_dv,n_intdaty_dv,n_ethn_dv,n_fimnnet_dv,n_gor_dv,...,n_hhtype_dv,n_nchild_dv,n_hrpid,n_ppid,n_jbft_dv,n_scghq1_dv,n_sf12pcs_dv,n_indinui_lw,n_ind5mus_xw,n_jbnssec_dv
0,22445.0,2.0,4.0,28.0,2.0,37.0,2022.0,1.0,1857.079956,8.0,...,11.0,2.0,276841780.0,277059298.0,1.0,24.0,62.83,0.0,0.0,2.0
1,29925.0,2.0,3.0,29.0,2.0,45.0,2022.0,1.0,2378.75,7.0,...,5.0,2.0,622866606.0,-8.0,2.0,23.0,65.47,0.0,0.0,14.0
2,76165.0,2.0,4.0,35.0,2.0,39.0,2022.0,1.0,3206.0,5.0,...,11.0,2.0,141045780.0,142378492.0,1.0,12.0,57.2,0.0,0.0,11.0
3,280165.0,2.0,4.0,-8.0,2.0,43.0,2022.0,1.0,94.470001,8.0,...,20.0,1.0,783876922.0,756200970.0,-8.0,16.0,58.55,0.0,0.0,-8.0
4,469205.0,2.0,3.0,16.0,2.0,32.0,2022.0,1.0,2056.080078,4.0,...,5.0,2.0,414412580.0,-8.0,2.0,16.0,49.93,0.0,0.0,25.0
5,599765.0,2.0,3.0,37.0,2.0,35.0,2022.0,1.0,2839.649902,5.0,...,10.0,1.0,209943344.0,210167702.0,1.0,6.0,56.15,0.0,0.0,2.0
6,732365.0,1.0,3.0,-8.0,1.0,37.0,2022.0,1.0,838.5,2.0,...,19.0,0.0,732365.0,-8.0,-8.0,35.0,45.49,0.0,0.0,-8.0
7,1587125.0,2.0,1.0,37.0,2.0,56.0,2022.0,1.0,2290.0,1.0,...,3.0,0.0,1587125.0,-8.0,1.0,15.0,37.37,0.0,0.0,7.0
8,2888645.0,2.0,1.0,38.0,2.0,33.0,2022.0,1.0,2300.0,11.0,...,3.0,0.0,2888645.0,-8.0,1.0,6.0,56.15,0.0,0.0,2.0
9,3424485.0,2.0,5.0,-8.0,2.0,86.0,2022.0,1.0,1092.75,6.0,...,20.0,0.0,103550562.0,-8.0,-8.0,10.0,33.23,0.0,0.0,-8.0


In [21]:
# === Cell 5 • Section 1: Standardize per wave (save standardized) ===
import pandas as pd
from pathlib import Path
from IPython.display import display

# ---- Missing-code map ----
# Negative codes treated as "special" (not observed values), with the label
# attached to each. MISS_CODES is derived from the keys, so the two always agree.
# NOTE(review): confirm this list covers every special code used in the
# UKHLS release being analysed.
MISS_LABELS = {
    -9: "missing",
    -8: "inapplicable",
    -7: "proxy/partial",
    -2: "refusal",
    -1: "don't know",
}
MISS_CODES = set(MISS_LABELS.keys())

# ---- Config ----
PROJECT_ROOT = Path(PROJECT_ROOT)  # reuse your existing var
IN_DIR  = PROJECT_ROOT / "data" / "interim"     # where 'ukhls_{k|l|n}_columns_of_interest.(parquet|csv)' live
# NOTE(review): OUT_DIR was bound to DATA_INTERIM by the Parquet-subset cell
# above; from here on the name points at data/processed instead.
OUT_DIR = PROJECT_ROOT / "data" / "processed"   # where standardized/model-ready/analysis-ready will be written
OUT_DIR.mkdir(parents=True, exist_ok=True)

SAVE_FMT = "parquet"  # "parquet" (preferred) or "csv"
# Wave letter -> number stored in the `wave_num` column (k=11, l=12, n=14,
# i.e. each letter's position in the alphabet).
WAVE_NUM = {"k": 11, "l": 12, "n": 14}

# 21 standardized variables: the per-wave request lists with the wave prefix
# removed, and with the wave-specific anxiety item replaced by `anxiety_raw`.
STD_COLS = [
    "pidp",
    "hrpid","ppid","ind5mus_xw","indinui_lw",
    "intdaty_dv","age_dv","sex_dv",
    "gor_dv","urban_dv","hhtype_dv","hhsize",
    "ethn_dv","anxiety_raw",
    "jbft_dv","jbnssec_dv","jbhrs",
    "fimnnet_dv","sf12pcs_dv","scghq1_dv","nchild_dv",
]

# ---------- I/O helpers ----------
def _find_input_file(prefix: str) -> Path:
    """
    Locate the saved columns-of-interest file for a wave inside IN_DIR.

    The Parquet file is preferred; the CSV is used only when no Parquet file
    exists. Raises FileNotFoundError when neither is present.
    """
    stem = IN_DIR / f"ukhls_{prefix}_columns_of_interest"
    candidates = [stem.with_suffix(ext) for ext in (".parquet", ".csv")]
    for candidate in candidates:
        if candidate.exists():
            return candidate
    parquet_path, csv_path = candidates
    raise FileNotFoundError(f"No input for {prefix}: {parquet_path.name} / {csv_path.name}")

def _load_df(path: Path) -> pd.DataFrame:
    if path.suffix.lower() == ".parquet":
        return pd.read_parquet(path)
    elif path.suffix.lower() == ".csv":
        return pd.read_csv(path)
    else:
        raise ValueError(f"Unsupported file type: {path}")

def _safe_to_parquet_or_csv(df: pd.DataFrame, path_base: Path, fmt: str = "parquet") -> Path:
    """
    Try Parquet, fall back to CSV. Normalizes object/categorical to string for Arrow friendliness.
    Returns final written path.
    """
    df2 = df.copy()
    for c in df2.columns:
        if isinstance(df2[c].dtype, pd.CategoricalDtype):
            df2[c] = df2[c].astype("string")
        elif pd.api.types.is_object_dtype(df2[c]):
            df2[c] = df2[c].astype("string")

    if fmt.lower() == "parquet":
        try:
            out = path_base.with_suffix(".parquet")
            df2.to_parquet(out, index=False)
            print(f"Saved Parquet -> {out}")
            return out
        except Exception as e:
            print(f"Parquet failed ({type(e).__name__}: {e}). Falling back to CSV.")
            out = path_base.with_suffix(".csv")
            df.to_csv(out, index=False)
            print(f"Saved CSV -> {out}")
            return out
    elif fmt.lower() == "csv":
        out = path_base.with_suffix(".csv")
        df.to_csv(out, index=False)
        print(f"Saved CSV -> {out}")
        return out
    else:
        raise ValueError("SAVE_FMT must be 'parquet' or 'csv'")

# ---------- Standardize ----------
def standardize_wave(df_in: pd.DataFrame, prefix: str) -> pd.DataFrame:
    """
    Bring one wave's frame onto the shared, prefix-free column layout.

    Strips the wave prefix, builds anxiety_raw (from mhgad, else mhealthtyp1),
    adds wave + wave_num, ensures all STD_COLS exist, and returns only
    STD_COLS + ['wave', 'wave_num'] in that order. Duplicate columns created
    by collisions after renaming are removed (the last copy is kept).

    Args:
        df_in: Wave frame with `<prefix>_`-prefixed columns; not modified.
        prefix: Wave letter, a key of WAVE_NUM (e.g. "k").

    Returns:
        A new frame with the standardized columns.

    Raises:
        KeyError: If `prefix` is not a key of WAVE_NUM (raised before any work).

    Notes:
        Columns that had to be created as all-NA because the input lacked them
        are reported, so a typo or a missing variable does not pass unnoticed.
    """
    wave_num = WAVE_NUM[prefix]  # fail fast on an unknown wave

    df = df_in.copy()
    before = df.shape

    # 1) strip prefix
    pref = f"{prefix}_"
    ren = {c: c[len(pref):] for c in df.columns if c.startswith(pref)}
    df = df.rename(columns=ren)

    # 2) DROP duplicate columns created by collisions (keep the last)
    if df.columns.duplicated().any():
        dup_names = df.columns[df.columns.duplicated()].tolist()
        print(f"{prefix.upper()} -> found duplicate cols after renaming; dropping older copies: {sorted(set(dup_names))}")
        df = df.loc[:, ~df.columns.duplicated(keep="last")]

    # 3) harmonize anxiety_raw only if missing (mhgad takes precedence)
    if "anxiety_raw" not in df.columns:
        if "mhgad" in df.columns:
            df["anxiety_raw"] = df["mhgad"]
        elif "mhealthtyp1" in df.columns:
            df["anxiety_raw"] = df["mhealthtyp1"]
        else:
            print(f"⚠️ {prefix.upper()} -> no anxiety source column (mhgad / mhealthtyp1) found")

    # 4) add wave indicators
    df["wave"] = prefix.upper()
    df["wave_num"] = wave_num

    # 5) ensure all STD_COLS exist; say which ones had to be invented
    absent = [c for c in STD_COLS if c not in df.columns]
    if absent:
        print(f"⚠️ {prefix.upper()} -> standardized columns missing from input, filled with NA: {absent}")
    for c in absent:
        df[c] = pd.NA

    # 6) final select (unique, canonical ordering)
    final_cols = STD_COLS + ["wave", "wave_num"]
    final_cols = pd.Index(final_cols).drop_duplicates().tolist()
    df = df.loc[:, final_cols]

    after = df.shape
    print(f"{prefix.upper()} -> BEFORE: {before} | AFTER (standardized): {after}")
    return df

# ---------- Run (Section 1) ----------
# Each step handles the waves in K, L, N order.
print("=== Load saved columns-of-interest ===")
k_in, l_in, n_in = (_load_df(_find_input_file(wave)) for wave in ("k", "l", "n"))

print("\n=== Standardize per wave ===")
k_std, l_std, n_std = (
    standardize_wave(wave_frame, wave)
    for wave_frame, wave in ((k_in, "k"), (l_in, "l"), (n_in, "n"))
)

# Quick previews
for preview_title, wave_frame in (
    ("\nK std (10 rows):", k_std),
    ("\nL std (10 rows):", l_std),
    ("\nN std (10 rows):", n_std),
):
    print(preview_title)
    display(wave_frame.head(10))

# Save standardized (raw) per-wave to processed
print("\n=== Save standardized frames ===")
k_std_path, l_std_path, n_std_path = (
    _safe_to_parquet_or_csv(wave_frame, OUT_DIR / file_stem, SAVE_FMT)
    for wave_frame, file_stem in (
        (k_std, "ukhls_k_standardized"),
        (l_std, "ukhls_l_standardized"),
        (n_std, "ukhls_n_standardized"),
    )
)
print("Standardized saved ->", k_std_path, l_std_path, n_std_path)


=== Load saved columns-of-interest ===

=== Standardize per wave ===
K -> BEFORE: (32008, 21) | AFTER (standardized): (32008, 23)
L -> BEFORE: (29271, 21) | AFTER (standardized): (29271, 23)
N -> BEFORE: (35471, 21) | AFTER (standardized): (35471, 23)

K std (10 rows):


Unnamed: 0,pidp,hrpid,ppid,ind5mus_xw,indinui_lw,intdaty_dv,age_dv,sex_dv,gor_dv,urban_dv,...,anxiety_raw,jbft_dv,jbnssec_dv,jbhrs,fimnnet_dv,sf12pcs_dv,scghq1_dv,nchild_dv,wave,wave_num
0,68006127.0,68006127.0,68020564.0,0.0,1.703009,2019.0,49.0,2.0,1.0,1.0,...,-8.0,-8.0,-8.0,-8.0,0.0,26.41,16.0,0.0,K,11
1,68020564.0,68006127.0,68006127.0,0.0,0.0,2019.0,48.0,1.0,1.0,1.0,...,-8.0,-8.0,-8.0,-8.0,1565.670044,34.68,12.0,0.0,K,11
2,68008847.0,68008847.0,-8.0,0.0,0.794699,2019.0,61.0,2.0,1.0,1.0,...,-8.0,1.0,14.0,39.0,2134.0,44.2,9.0,0.0,K,11
3,68009527.0,68009527.0,68061288.0,0.0,0.962017,2019.0,41.0,1.0,1.0,1.0,...,-8.0,1.0,15.0,39.0,2043.0,60.48,16.0,2.0,K,11
4,68061288.0,68009527.0,68009527.0,0.0,0.0,2019.0,33.0,2.0,1.0,1.0,...,-8.0,-8.0,-8.0,-8.0,149.25,54.23,11.0,2.0,K,11
5,68010887.0,68068082.0,68068082.0,0.0,1.055802,2019.0,55.0,2.0,1.0,1.0,...,-8.0,1.0,25.0,32.0,1250.0,57.28,9.0,0.0,K,11
6,68068082.0,68068082.0,68010887.0,0.0,0.0,2019.0,58.0,1.0,1.0,1.0,...,-8.0,1.0,19.0,-8.0,690.859985,56.15,9.0,0.0,K,11
7,68014287.0,68014287.0,-8.0,0.0,0.0,2019.0,49.0,2.0,1.0,1.0,...,-8.0,-8.0,-8.0,-8.0,715.869995,44.37,23.0,1.0,K,11
8,68020407.0,68020407.0,-8.0,0.0,0.833442,2019.0,82.0,2.0,1.0,1.0,...,-8.0,-8.0,-8.0,-8.0,1621.670044,18.07,13.0,0.0,K,11
9,68028575.0,68157166.0,68157166.0,0.0,1.029255,2019.0,28.0,2.0,1.0,1.0,...,-8.0,-8.0,-8.0,-8.0,0.0,56.71,6.0,2.0,K,11



L std (10 rows):


Unnamed: 0,pidp,hrpid,ppid,ind5mus_xw,indinui_lw,intdaty_dv,age_dv,sex_dv,gor_dv,urban_dv,...,anxiety_raw,jbft_dv,jbnssec_dv,jbhrs,fimnnet_dv,sf12pcs_dv,scghq1_dv,nchild_dv,wave,wave_num
0,68008847.0,68008847.0,-8.0,0.0,0.739967,2020.0,62.0,2.0,1.0,1.0,...,-8.0,1.0,14.0,39.0,2288.0,37.58,12.0,0.0,L,12
1,68009527.0,68034180.0,68061288.0,0.0,0.950116,2020.0,43.0,1.0,1.0,1.0,...,-8.0,1.0,15.0,36.5,2060.0,56.37,11.0,2.0,L,12
2,68061288.0,68034180.0,68009527.0,0.0,0.0,2020.0,34.0,2.0,1.0,1.0,...,-8.0,2.0,24.0,10.0,474.0,61.73,15.0,2.0,L,12
3,68010887.0,68010887.0,68068082.0,0.0,0.998976,2020.0,56.0,2.0,1.0,1.0,...,-8.0,1.0,25.0,32.0,1200.0,51.64,11.0,0.0,L,12
4,68068082.0,68010887.0,68010887.0,0.0,0.0,2020.0,59.0,1.0,1.0,1.0,...,-8.0,1.0,17.0,-8.0,3275.909912,56.15,10.0,0.0,L,12
5,68028575.0,68095380.0,68157166.0,0.0,1.03181,2020.0,28.0,2.0,1.0,1.0,...,-8.0,-8.0,-8.0,-8.0,0.0,57.76,8.0,2.0,L,12
6,68157166.0,68095380.0,68028575.0,0.0,0.0,2020.0,35.0,1.0,1.0,1.0,...,-8.0,1.0,8.0,38.0,2974.0,57.47,7.0,2.0,L,12
7,68029927.0,68029939.0,68029931.0,0.0,0.0,2020.0,48.0,2.0,1.0,1.0,...,-8.0,-8.0,-8.0,-8.0,89.699997,53.79,9.0,0.0,L,12
8,68029939.0,68029939.0,-8.0,0.0,0.0,2020.0,16.0,1.0,1.0,1.0,...,-8.0,-8.0,-8.0,-8.0,0.0,57.76,8.0,0.0,L,12
9,68149808.0,68029939.0,-8.0,0.0,0.0,2020.0,23.0,2.0,1.0,1.0,...,-8.0,1.0,7.0,30.0,865.0,56.15,8.0,0.0,L,12



N std (10 rows):


Unnamed: 0,pidp,hrpid,ppid,ind5mus_xw,indinui_lw,intdaty_dv,age_dv,sex_dv,gor_dv,urban_dv,...,anxiety_raw,jbft_dv,jbnssec_dv,jbhrs,fimnnet_dv,sf12pcs_dv,scghq1_dv,nchild_dv,wave,wave_num
0,22445.0,276841780.0,277059298.0,0.0,0.0,2022.0,37.0,2.0,8.0,1.0,...,2.0,1.0,2.0,28.0,1857.079956,62.83,24.0,2.0,N,14
1,29925.0,622866606.0,-8.0,0.0,0.0,2022.0,45.0,2.0,7.0,1.0,...,2.0,2.0,14.0,29.0,2378.75,65.47,23.0,2.0,N,14
2,76165.0,141045780.0,142378492.0,0.0,0.0,2022.0,39.0,2.0,5.0,1.0,...,2.0,1.0,11.0,35.0,3206.0,57.2,12.0,2.0,N,14
3,280165.0,783876922.0,756200970.0,0.0,0.0,2022.0,43.0,2.0,8.0,2.0,...,2.0,-8.0,-8.0,-8.0,94.470001,58.55,16.0,1.0,N,14
4,469205.0,414412580.0,-8.0,0.0,0.0,2022.0,32.0,2.0,4.0,1.0,...,2.0,2.0,25.0,16.0,2056.080078,49.93,16.0,2.0,N,14
5,599765.0,209943344.0,210167702.0,0.0,0.0,2022.0,35.0,2.0,5.0,1.0,...,2.0,1.0,2.0,37.0,2839.649902,56.15,6.0,1.0,N,14
6,732365.0,732365.0,-8.0,0.0,0.0,2022.0,37.0,1.0,2.0,1.0,...,1.0,-8.0,-8.0,-8.0,838.5,45.49,35.0,0.0,N,14
7,1587125.0,1587125.0,-8.0,0.0,0.0,2022.0,56.0,2.0,1.0,1.0,...,2.0,1.0,7.0,37.0,2290.0,37.37,15.0,0.0,N,14
8,2888645.0,2888645.0,-8.0,0.0,0.0,2022.0,33.0,2.0,11.0,1.0,...,2.0,1.0,2.0,38.0,2300.0,56.15,6.0,0.0,N,14
9,3424485.0,103550562.0,-8.0,0.0,0.0,2022.0,86.0,2.0,6.0,2.0,...,2.0,-8.0,-8.0,-8.0,1092.75,33.23,10.0,0.0,N,14



=== Save standardized frames ===
Saved Parquet -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\ukhls_k_standardized.parquet
Saved Parquet -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\ukhls_l_standardized.parquet
Saved Parquet -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\ukhls_k_standardized.parquet
Saved Parquet -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\ukhls_l_standardized.parquet
Saved Parquet -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\ukhls_n_standardized.parquet
Standardized saved -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\ukhls_k_standardized.parquet C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\ukhls_l_standardized.parquet C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\ukhls_n_standardized.parquet
Saved Parquet -> C:\Users\User\Documents\Github\Gam

In [23]:
# === Cell 5 • Section 2: Dynamic clean, promote cleaned to base, and save model‑ready ===
import pandas as pd
from pathlib import Path
from IPython.display import display

# Reuse PROJECT_ROOT / OUT_DIR / SAVE_FMT / MISS_LABELS / MISS_CODES / _safe_to_parquet_or_csv from Section 1
# (this cell therefore needs the Section 1 cell to have run first).
# PROCESSED_DIR freezes the current value of OUT_DIR under its own name.
PROCESSED_DIR = Path(OUT_DIR)                 # points to data/processed from Section 1
MODEL_DIR = PROCESSED_DIR / "model-ready"     # requested output folder
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Load standardized (saved in Section 1)
def _load_std(prefix: str) -> pd.DataFrame:
    """
    Load the standardized frame for a wave from PROCESSED_DIR.

    The Parquet file is preferred; the CSV is read only when no Parquet file
    exists. Raises FileNotFoundError when neither is present.
    """
    stem = PROCESSED_DIR / f"ukhls_{prefix}_standardized"

    parquet_path = stem.with_suffix(".parquet")
    if parquet_path.exists():
        return pd.read_parquet(parquet_path)

    csv_path = stem.with_suffix(".csv")
    if csv_path.exists():
        return pd.read_csv(csv_path)

    raise FileNotFoundError(f"Standardized not found for {prefix}: {parquet_path.name} or .csv")

# Reload from disk (K, L, N order) so this cell does not depend on in-memory frames.
k_std, l_std, n_std = (_load_std(wave) for wave in ("k", "l", "n"))

# ---------- Dynamic cleaner (special negatives -> NA, plus separate label) ----------
def add_clean_and_labels(df: pd.DataFrame, miss_labels=None) -> tuple[pd.DataFrame, list[str]]:
    """
    Add `<col>_clean` and `<col>_label` for columns containing special codes.

    Each column is coerced to numeric (non-numeric values become NaN). For
    every column in which at least one special code appears:
      - <col>_clean holds the numeric values with the special codes set to NA
      - <col>_label holds:
          * the code's label where a special code is present
          * 'observed' where the value is non-missing and not a special code
          * NA where the value is truly missing

    Args:
        df: Input frame; not modified.
        miss_labels: Optional mapping {code: label}. When omitted, the
            module-level MISS_CODES / MISS_LABELS are used.

    Returns:
        (df_with_clean_and_label, cleaned_cols), where cleaned_cols lists the
        source columns that received the two companion columns.

    Notes:
        Labels are looked up by exact value. A non-integer such as -1.5 is an
        observed value (it stays in <col>_clean), so it is labelled 'observed'
        rather than being truncated to -1 and mislabelled; infinite values no
        longer raise.
    """
    if miss_labels is None:
        labels_map, codes = MISS_LABELS, MISS_CODES
    else:
        labels_map = dict(miss_labels)
        codes = set(labels_map)

    def _lab(x):
        # Exact lookup: -8.0 matches the key -8, while -1.5 matches nothing
        if pd.isna(x):
            return pd.NA
        return labels_map.get(x, "observed")

    out = df.copy()
    cleaned = []

    for c in out.columns:
        vals = pd.to_numeric(out[c], errors="coerce")  # non-numeric -> NaN
        mask_special = vals.isin(codes)
        if not mask_special.any():
            continue

        # _clean: specials -> NA; keep nullable numeric dtype where possible
        # NOTE(review): after masking, float-coded data never satisfies the
        # integer-dtype check, so the Int64 branch looks unreachable for SPSS
        # input; left unchanged to keep the output dtypes stable.
        cleaned_vals = vals.mask(mask_special, other=pd.NA)
        try:
            non_null = pd.Series(cleaned_vals.dropna())
            if pd.api.types.is_integer_dtype(non_null.dtype):
                out[f"{c}_clean"] = cleaned_vals.astype("Int64")
            else:
                out[f"{c}_clean"] = cleaned_vals.astype("Float64")
        except Exception:
            out[f"{c}_clean"] = cleaned_vals

        # _label: specials -> label; observed -> 'observed'; true NaN -> NA
        out[f"{c}_label"] = vals.map(_lab).astype("string")

        cleaned.append(c)

    return out, cleaned

print("\n=== Dynamic clean (add *_clean and *_label only where specials appear) ===")
# Each call returns (frame with companion columns, list of affected source columns).
(k_mr, k_cleaned), (l_mr, l_cleaned), (n_mr, n_cleaned) = (
    add_clean_and_labels(wave_frame) for wave_frame in (k_std, l_std, n_std)
)

print(f"Shapes AFTER dynamic clean -> K: {k_mr.shape}  L: {l_mr.shape}  N: {n_mr.shape}")
for report_title, cleaned_names in (
    ("K cleaned columns:", k_cleaned),
    ("L cleaned columns:", l_cleaned),
    ("N cleaned columns:", n_cleaned),
):
    print(report_title, cleaned_names)

# Optional preview of affected columns
def _preview_clean(df, cleaned_cols, title, n=8):
    """
    Display the first `n` rows of the identifier columns plus, for every
    cleaned column, the raw / `_clean` / `_label` triplet.

    Names that are not present in `df` are skipped.
    """
    wanted = ["pidp", "wave", "wave_num", "intdaty_dv"]
    for name in cleaned_cols:
        wanted.extend((name, f"{name}_clean", f"{name}_label"))
    present = [name for name in wanted if name in df.columns]
    print(f"\n{title} — first {n} rows (only affected columns)")
    display(df[present].head(n))

# Show the affected columns for each wave in K, L, N order.
for _wave_frame, _cleaned_names, _preview_title in (
    (k_mr, k_cleaned, "K (model-ready)"),
    (l_mr, l_cleaned, "L (model-ready)"),
    (n_mr, n_cleaned, "N (model-ready)"),
):
    _preview_clean(_wave_frame, _cleaned_names, _preview_title)

# === Promote *_clean -> base name, keep *_label (ppid_clean -> ppid, etc.) ===
def promote_clean_to_base(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace any base column with its <col>_clean values (creating base if absent),
    drop <col>_clean, and keep <col>_label alongside.

    The input frame is not modified; a copy is returned. An existing base column
    keeps its position, a newly created one is appended at the end. When at least
    one column was promoted, the sorted list of promoted base names is printed.
    """
    suffix = "_clean"
    out = df.copy()
    changed = []
    for c in list(out.columns):
        # Column labels are not guaranteed to be strings (e.g. integer labels);
        # such labels cannot carry the suffix, so skip them instead of failing
        # on a missing .endswith attribute.
        if not (isinstance(c, str) and c.endswith(suffix)):
            continue
        base = c[: -len(suffix)]
        # pop() removes the *_clean column in place and hands back its values,
        # avoiding a full-frame rebuild via drop() for every promoted column.
        out[base] = out.pop(c)
        changed.append(base)
    if changed:
        print(f"Promoted cleaned columns into base: {sorted(changed)}")
    return out

# Promote the cleaned values into the base columns for each wave (K, L, N order).
k_model, l_model, n_model = map(promote_clean_to_base, (k_mr, l_mr, n_mr))

# Save model‑ready per-wave and combined to data/processed/model‑ready
print("\n=== Save model‑ready frames ===")
k_model_path = _safe_to_parquet_or_csv(k_model, MODEL_DIR.joinpath("ukhls_k_model_ready"), SAVE_FMT)
l_model_path = _safe_to_parquet_or_csv(l_model, MODEL_DIR.joinpath("ukhls_l_model_ready"), SAVE_FMT)
n_model_path = _safe_to_parquet_or_csv(n_model, MODEL_DIR.joinpath("ukhls_n_model_ready"), SAVE_FMT)

# Stack the three waves into one frame with a fresh 0..n-1 index, then save it too.
all_model = pd.concat([k_model, l_model, n_model], ignore_index=True)
all_model_path = _safe_to_parquet_or_csv(all_model, MODEL_DIR.joinpath("ukhls_kln_model_ready"), SAVE_FMT)

print("Model‑ready saved ->")
print(k_model_path, "\n", l_model_path, "\n", n_model_path, "\n", all_model_path)

# Quick previews (post‑promotion)
print("\nK (model‑ready) — 8 rows:")
display(k_model.head(8))
print("\nL (model‑ready) — 8 rows:")
display(l_model.head(8))
print("\nN (model‑ready) — 8 rows:")
display(n_model.head(8))


=== Dynamic clean (add *_clean and *_label only where specials appear) ===
Shapes AFTER dynamic clean -> K: (32008, 51)  L: (29271, 47)  N: (35471, 47)
K cleaned columns: ['ppid', 'intdaty_dv', 'age_dv', 'sex_dv', 'gor_dv', 'urban_dv', 'ethn_dv', 'anxiety_raw', 'jbft_dv', 'jbnssec_dv', 'jbhrs', 'fimnnet_dv', 'sf12pcs_dv', 'scghq1_dv']
L cleaned columns: ['ppid', 'sex_dv', 'gor_dv', 'urban_dv', 'ethn_dv', 'anxiety_raw', 'jbft_dv', 'jbnssec_dv', 'jbhrs', 'fimnnet_dv', 'sf12pcs_dv', 'scghq1_dv']
N cleaned columns: ['ppid', 'age_dv', 'sex_dv', 'gor_dv', 'urban_dv', 'ethn_dv', 'anxiety_raw', 'jbft_dv', 'jbnssec_dv', 'jbhrs', 'sf12pcs_dv', 'scghq1_dv']

K (model-ready) — first 8 rows (only affected columns)
Shapes AFTER dynamic clean -> K: (32008, 51)  L: (29271, 47)  N: (35471, 47)
K cleaned columns: ['ppid', 'intdaty_dv', 'age_dv', 'sex_dv', 'gor_dv', 'urban_dv', 'ethn_dv', 'anxiety_raw', 'jbft_dv', 'jbnssec_dv', 'jbhrs', 'fimnnet_dv', 'sf12pcs_dv', 'scghq1_dv']
L cleaned columns: ['ppid'

Unnamed: 0,pidp,wave,wave_num,intdaty_dv,ppid,ppid_clean,ppid_label,intdaty_dv.1,intdaty_dv_clean,intdaty_dv_label,...,jbhrs_label,fimnnet_dv,fimnnet_dv_clean,fimnnet_dv_label,sf12pcs_dv,sf12pcs_dv_clean,sf12pcs_dv_label,scghq1_dv,scghq1_dv_clean,scghq1_dv_label
0,68006127.0,K,11,2019.0,68020564.0,68020564.0,observed,2019.0,2019.0,observed,...,inapplicable,0.0,0.0,observed,26.41,26.41,observed,16.0,16.0,observed
1,68020564.0,K,11,2019.0,68006127.0,68006127.0,observed,2019.0,2019.0,observed,...,inapplicable,1565.670044,1565.670044,observed,34.68,34.68,observed,12.0,12.0,observed
2,68008847.0,K,11,2019.0,-8.0,,inapplicable,2019.0,2019.0,observed,...,observed,2134.0,2134.0,observed,44.2,44.2,observed,9.0,9.0,observed
3,68009527.0,K,11,2019.0,68061288.0,68061288.0,observed,2019.0,2019.0,observed,...,observed,2043.0,2043.0,observed,60.48,60.48,observed,16.0,16.0,observed
4,68061288.0,K,11,2019.0,68009527.0,68009527.0,observed,2019.0,2019.0,observed,...,inapplicable,149.25,149.25,observed,54.23,54.23,observed,11.0,11.0,observed
5,68010887.0,K,11,2019.0,68068082.0,68068082.0,observed,2019.0,2019.0,observed,...,observed,1250.0,1250.0,observed,57.28,57.28,observed,9.0,9.0,observed
6,68068082.0,K,11,2019.0,68010887.0,68010887.0,observed,2019.0,2019.0,observed,...,inapplicable,690.859985,690.859985,observed,56.15,56.15,observed,9.0,9.0,observed
7,68014287.0,K,11,2019.0,-8.0,,inapplicable,2019.0,2019.0,observed,...,inapplicable,715.869995,715.869995,observed,44.37,44.37,observed,23.0,23.0,observed



L (model-ready) — first 8 rows (only affected columns)


Unnamed: 0,pidp,wave,wave_num,intdaty_dv,ppid,ppid_clean,ppid_label,sex_dv,sex_dv_clean,sex_dv_label,...,jbhrs_label,fimnnet_dv,fimnnet_dv_clean,fimnnet_dv_label,sf12pcs_dv,sf12pcs_dv_clean,sf12pcs_dv_label,scghq1_dv,scghq1_dv_clean,scghq1_dv_label
0,68008847.0,L,12,2020.0,-8.0,,inapplicable,2.0,2.0,observed,...,observed,2288.0,2288.0,observed,37.58,37.58,observed,12.0,12.0,observed
1,68009527.0,L,12,2020.0,68061288.0,68061288.0,observed,1.0,1.0,observed,...,observed,2060.0,2060.0,observed,56.37,56.37,observed,11.0,11.0,observed
2,68061288.0,L,12,2020.0,68009527.0,68009527.0,observed,2.0,2.0,observed,...,observed,474.0,474.0,observed,61.73,61.73,observed,15.0,15.0,observed
3,68010887.0,L,12,2020.0,68068082.0,68068082.0,observed,2.0,2.0,observed,...,observed,1200.0,1200.0,observed,51.64,51.64,observed,11.0,11.0,observed
4,68068082.0,L,12,2020.0,68010887.0,68010887.0,observed,1.0,1.0,observed,...,inapplicable,3275.909912,3275.909912,observed,56.15,56.15,observed,10.0,10.0,observed
5,68028575.0,L,12,2020.0,68157166.0,68157166.0,observed,2.0,2.0,observed,...,inapplicable,0.0,0.0,observed,57.76,57.76,observed,8.0,8.0,observed
6,68157166.0,L,12,2020.0,68028575.0,68028575.0,observed,1.0,1.0,observed,...,observed,2974.0,2974.0,observed,57.47,57.47,observed,7.0,7.0,observed
7,68029927.0,L,12,2020.0,68029931.0,68029931.0,observed,2.0,2.0,observed,...,inapplicable,89.699997,89.699997,observed,53.79,53.79,observed,9.0,9.0,observed



N (model-ready) — first 8 rows (only affected columns)


Unnamed: 0,pidp,wave,wave_num,intdaty_dv,ppid,ppid_clean,ppid_label,age_dv,age_dv_clean,age_dv_label,...,jbnssec_dv_label,jbhrs,jbhrs_clean,jbhrs_label,sf12pcs_dv,sf12pcs_dv_clean,sf12pcs_dv_label,scghq1_dv,scghq1_dv_clean,scghq1_dv_label
0,22445.0,N,14,2022.0,277059298.0,277059298.0,observed,37.0,37.0,observed,...,observed,28.0,28.0,observed,62.83,62.83,observed,24.0,24.0,observed
1,29925.0,N,14,2022.0,-8.0,,inapplicable,45.0,45.0,observed,...,observed,29.0,29.0,observed,65.47,65.47,observed,23.0,23.0,observed
2,76165.0,N,14,2022.0,142378492.0,142378492.0,observed,39.0,39.0,observed,...,observed,35.0,35.0,observed,57.2,57.2,observed,12.0,12.0,observed
3,280165.0,N,14,2022.0,756200970.0,756200970.0,observed,43.0,43.0,observed,...,inapplicable,-8.0,,inapplicable,58.55,58.55,observed,16.0,16.0,observed
4,469205.0,N,14,2022.0,-8.0,,inapplicable,32.0,32.0,observed,...,observed,16.0,16.0,observed,49.93,49.93,observed,16.0,16.0,observed
5,599765.0,N,14,2022.0,210167702.0,210167702.0,observed,35.0,35.0,observed,...,observed,37.0,37.0,observed,56.15,56.15,observed,6.0,6.0,observed
6,732365.0,N,14,2022.0,-8.0,,inapplicable,37.0,37.0,observed,...,inapplicable,-8.0,,inapplicable,45.49,45.49,observed,35.0,35.0,observed
7,1587125.0,N,14,2022.0,-8.0,,inapplicable,56.0,56.0,observed,...,observed,37.0,37.0,observed,37.37,37.37,observed,15.0,15.0,observed


Promoted cleaned columns into base: ['age_dv', 'anxiety_raw', 'ethn_dv', 'fimnnet_dv', 'gor_dv', 'intdaty_dv', 'jbft_dv', 'jbhrs', 'jbnssec_dv', 'ppid', 'scghq1_dv', 'sex_dv', 'sf12pcs_dv', 'urban_dv']
Promoted cleaned columns into base: ['anxiety_raw', 'ethn_dv', 'fimnnet_dv', 'gor_dv', 'jbft_dv', 'jbhrs', 'jbnssec_dv', 'ppid', 'scghq1_dv', 'sex_dv', 'sf12pcs_dv', 'urban_dv']
Promoted cleaned columns into base: ['age_dv', 'anxiety_raw', 'ethn_dv', 'gor_dv', 'jbft_dv', 'jbhrs', 'jbnssec_dv', 'ppid', 'scghq1_dv', 'sex_dv', 'sf12pcs_dv', 'urban_dv']

=== Save model‑ready frames ===
Saved Parquet -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\model-ready\ukhls_k_model_ready.parquet
Saved Parquet -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\model-ready\ukhls_k_model_ready.parquet
Saved Parquet -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\model-ready\ukhls_l_model_ready.parquet
Saved Parquet -> C:\Users\

Unnamed: 0,pidp,hrpid,ppid,ind5mus_xw,indinui_lw,intdaty_dv,age_dv,sex_dv,gor_dv,urban_dv,...,gor_dv_label,urban_dv_label,ethn_dv_label,anxiety_raw_label,jbft_dv_label,jbnssec_dv_label,jbhrs_label,fimnnet_dv_label,sf12pcs_dv_label,scghq1_dv_label
0,68006127.0,68006127.0,68020564.0,0.0,1.703009,2019.0,49.0,2.0,1.0,1.0,...,observed,observed,observed,inapplicable,inapplicable,inapplicable,inapplicable,observed,observed,observed
1,68020564.0,68006127.0,68006127.0,0.0,0.0,2019.0,48.0,1.0,1.0,1.0,...,observed,observed,observed,inapplicable,inapplicable,inapplicable,inapplicable,observed,observed,observed
2,68008847.0,68008847.0,,0.0,0.794699,2019.0,61.0,2.0,1.0,1.0,...,observed,observed,observed,inapplicable,observed,observed,observed,observed,observed,observed
3,68009527.0,68009527.0,68061288.0,0.0,0.962017,2019.0,41.0,1.0,1.0,1.0,...,observed,observed,observed,inapplicable,observed,observed,observed,observed,observed,observed
4,68061288.0,68009527.0,68009527.0,0.0,0.0,2019.0,33.0,2.0,1.0,1.0,...,observed,observed,observed,inapplicable,inapplicable,inapplicable,inapplicable,observed,observed,observed
5,68010887.0,68068082.0,68068082.0,0.0,1.055802,2019.0,55.0,2.0,1.0,1.0,...,observed,observed,observed,inapplicable,observed,observed,observed,observed,observed,observed
6,68068082.0,68068082.0,68010887.0,0.0,0.0,2019.0,58.0,1.0,1.0,1.0,...,observed,observed,observed,inapplicable,observed,observed,inapplicable,observed,observed,observed
7,68014287.0,68014287.0,,0.0,0.0,2019.0,49.0,2.0,1.0,1.0,...,observed,observed,observed,inapplicable,inapplicable,inapplicable,inapplicable,observed,observed,observed



L (model‑ready) — 8 rows:


Unnamed: 0,pidp,hrpid,ppid,ind5mus_xw,indinui_lw,intdaty_dv,age_dv,sex_dv,gor_dv,urban_dv,...,gor_dv_label,urban_dv_label,ethn_dv_label,anxiety_raw_label,jbft_dv_label,jbnssec_dv_label,jbhrs_label,fimnnet_dv_label,sf12pcs_dv_label,scghq1_dv_label
0,68008847.0,68008847.0,,0.0,0.739967,2020.0,62.0,2.0,1.0,1.0,...,observed,observed,observed,inapplicable,observed,observed,observed,observed,observed,observed
1,68009527.0,68034180.0,68061288.0,0.0,0.950116,2020.0,43.0,1.0,1.0,1.0,...,observed,observed,observed,inapplicable,observed,observed,observed,observed,observed,observed
2,68061288.0,68034180.0,68009527.0,0.0,0.0,2020.0,34.0,2.0,1.0,1.0,...,observed,observed,observed,inapplicable,observed,observed,observed,observed,observed,observed
3,68010887.0,68010887.0,68068082.0,0.0,0.998976,2020.0,56.0,2.0,1.0,1.0,...,observed,observed,observed,inapplicable,observed,observed,observed,observed,observed,observed
4,68068082.0,68010887.0,68010887.0,0.0,0.0,2020.0,59.0,1.0,1.0,1.0,...,observed,observed,observed,inapplicable,observed,observed,inapplicable,observed,observed,observed
5,68028575.0,68095380.0,68157166.0,0.0,1.03181,2020.0,28.0,2.0,1.0,1.0,...,observed,observed,observed,inapplicable,inapplicable,inapplicable,inapplicable,observed,observed,observed
6,68157166.0,68095380.0,68028575.0,0.0,0.0,2020.0,35.0,1.0,1.0,1.0,...,observed,observed,observed,inapplicable,observed,observed,observed,observed,observed,observed
7,68029927.0,68029939.0,68029931.0,0.0,0.0,2020.0,48.0,2.0,1.0,1.0,...,observed,observed,observed,inapplicable,inapplicable,inapplicable,inapplicable,observed,observed,observed



N (model‑ready) — 8 rows:


Unnamed: 0,pidp,hrpid,ppid,ind5mus_xw,indinui_lw,intdaty_dv,age_dv,sex_dv,gor_dv,urban_dv,...,sex_dv_label,gor_dv_label,urban_dv_label,ethn_dv_label,anxiety_raw_label,jbft_dv_label,jbnssec_dv_label,jbhrs_label,sf12pcs_dv_label,scghq1_dv_label
0,22445.0,276841780.0,277059298.0,0.0,0.0,2022.0,37.0,2.0,8.0,1.0,...,observed,observed,observed,observed,observed,observed,observed,observed,observed,observed
1,29925.0,622866606.0,,0.0,0.0,2022.0,45.0,2.0,7.0,1.0,...,observed,observed,observed,observed,observed,observed,observed,observed,observed,observed
2,76165.0,141045780.0,142378492.0,0.0,0.0,2022.0,39.0,2.0,5.0,1.0,...,observed,observed,observed,observed,observed,observed,observed,observed,observed,observed
3,280165.0,783876922.0,756200970.0,0.0,0.0,2022.0,43.0,2.0,8.0,2.0,...,observed,observed,observed,observed,observed,inapplicable,inapplicable,inapplicable,observed,observed
4,469205.0,414412580.0,,0.0,0.0,2022.0,32.0,2.0,4.0,1.0,...,observed,observed,observed,observed,observed,observed,observed,observed,observed,observed
5,599765.0,209943344.0,210167702.0,0.0,0.0,2022.0,35.0,2.0,5.0,1.0,...,observed,observed,observed,observed,observed,observed,observed,observed,observed,observed
6,732365.0,732365.0,,0.0,0.0,2022.0,37.0,1.0,2.0,1.0,...,observed,observed,observed,observed,observed,inapplicable,inapplicable,inapplicable,observed,observed
7,1587125.0,1587125.0,,0.0,0.0,2022.0,56.0,2.0,1.0,1.0,...,observed,observed,observed,observed,observed,observed,observed,observed,observed,observed


In [None]:
# === Cell 5 • Section 2: Dynamic clean, promote cleaned to base, reorder, preview all, save to model‑ready ===
import pandas as pd
from pathlib import Path
from IPython.display import display

# Reuse PROJECT_ROOT / OUT_DIR / SAVE_FMT / MISS_LABELS / MISS_CODES / STD_COLS / _safe_to_parquet_or_csv from Section 1
# PROCESSED_DIR wraps OUT_DIR from Section 1 (presumably data/processed — verify there);
# model-ready outputs go into a "model-ready" subfolder, created on demand.
PROCESSED_DIR = Path(OUT_DIR)
MODEL_DIR = PROCESSED_DIR.joinpath("model-ready")
MODEL_DIR.mkdir(exist_ok=True, parents=True)

# Load standardized (saved in Section 1)
def _load_std(prefix: str) -> pd.DataFrame:
    base = PROCESSED_DIR / f"ukhls_{prefix}_standardized"
    if base.with_suffix(".parquet").exists():
        return pd.read_parquet(base.with_suffix(".parquet"))
    elif base.with_suffix(".csv").exists():
        return pd.read_csv(base.with_suffix(".csv"))
    else:
        raise FileNotFoundError(f"Standardized not found for {prefix}: {base.with_suffix('.parquet').name} or .csv")

# Load the standardized frame of each wave (K, L, N order).
k_std, l_std, n_std = map(_load_std, ("k", "l", "n"))

# ---------- Dynamic cleaner (special negatives -> NA, plus separate label) ----------
def add_clean_and_labels(df: pd.DataFrame) -> tuple[pd.DataFrame, list[str]]:
    """
    Add <col>_clean and <col>_label companions for every column holding a special code.

    A column qualifies when, after numeric coercion (non-numeric -> NaN), at least
    one of its values is in MISS_CODES. For each qualifying column:
      - <col>_clean: the coerced values with special codes set to NA, stored as
        nullable Int64 when the coerced column is integer-typed, otherwise Float64
        (falls back to the plain masked values if that cast is rejected)
      - <col>_label (string dtype):
          * MISS_LABELS[code] where the value is a special code
            ('observed' if the code has no entry in MISS_LABELS)
          * 'observed' for every other non-missing value
          * NA where the value is missing or could not be coerced to a number

    The input frame is not modified.
    Returns (df_with_clean_and_label, cleaned_cols), where cleaned_cols lists the
    qualifying base column names in column order.
    """
    out = df.copy()
    cleaned = []

    # Iterate over a snapshot of the original columns; companions are appended to `out`.
    for c in list(df.columns):
        vals = pd.to_numeric(out[c], errors="coerce")  # non-numeric -> NaN
        mask_special = vals.isin(MISS_CODES)
        if not mask_special.any():
            continue

        # _clean: specials -> NA. The target dtype is decided from the coerced
        # column *before* masking, because masking an integer column with NaN
        # already turns it into floats and would hide its integer nature.
        target_dtype = "Int64" if pd.api.types.is_integer_dtype(vals.dtype) else "Float64"
        cleaned_vals = vals.mask(mask_special)
        try:
            out[f"{c}_clean"] = cleaned_vals.astype(target_dtype)
        except (TypeError, ValueError):
            # Cast rejected: keep the masked values in their current dtype.
            out[f"{c}_clean"] = cleaned_vals

        # _label: driven by the same mask as _clean so both columns always agree.
        # Only exact special codes are looked up; truncating arbitrary values with
        # int() would label e.g. -8.5 as code -8 and would fail on infinities.
        code_to_label = {
            code: MISS_LABELS.get(int(code), "observed")
            for code in vals[mask_special].unique()
        }
        labels = (
            vals.map(code_to_label)              # special codes -> label, rest -> NaN
            .where(mask_special, "observed")     # every non-special value -> 'observed'
            .where(vals.notna())                 # truly missing values stay missing
        )
        out[f"{c}_label"] = labels.astype("string")

        cleaned.append(c)

    return out, cleaned

# Run the dynamic cleaner over the three standardized wave frames, in K, L, N order.
print("\n=== Dynamic clean (add *_clean and *_label where specials appear) ===")
(k_mr, k_cleaned), (l_mr, l_cleaned), (n_mr, n_cleaned) = map(
    add_clean_and_labels, (k_std, l_std, n_std)
)

# Report the resulting shapes and which columns gained *_clean / *_label companions.
print(f"Shapes AFTER dynamic clean -> K: {k_mr.shape}  L: {l_mr.shape}  N: {n_mr.shape}")
print("K cleaned columns:", k_cleaned)
print("L cleaned columns:", l_cleaned)
print("N cleaned columns:", n_cleaned)

# === Promote *_clean -> base name, keep *_label (ppid_clean -> ppid, etc.) ===
def promote_clean_to_base(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace any base column with its <col>_clean values (creating base if absent),
    drop <col>_clean, and keep <col>_label alongside.

    The input frame is not modified; a copy is returned. An existing base column
    keeps its position, a newly created one is appended at the end. When at least
    one column was promoted, the sorted list of promoted base names is printed.
    """
    suffix = "_clean"
    out = df.copy()
    changed = []
    for c in list(out.columns):
        # Column labels are not guaranteed to be strings (e.g. integer labels);
        # such labels cannot carry the suffix, so skip them instead of failing
        # on a missing .endswith attribute.
        if not (isinstance(c, str) and c.endswith(suffix)):
            continue
        base = c[: -len(suffix)]
        # pop() removes the *_clean column in place and hands back its values,
        # avoiding a full-frame rebuild via drop() for every promoted column.
        out[base] = out.pop(c)
        changed.append(base)
    if changed:
        print(f"Promoted cleaned columns into base: {sorted(changed)}")
    return out

# Promote the cleaned values into the base columns for each wave (K, L, N order).
k_model, l_model, n_model = map(promote_clean_to_base, (k_mr, l_mr, n_mr))

# === Reorder columns: ids -> each base in STD_COLS then its label -> leftovers ===
ID_COLS = ["pidp", "wave", "wave_num"]

def reorder_model_ready(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return `df` with its columns rearranged; no column is added or removed.

    Layout: the ID_COLS that are present (in ID_COLS order), then each STD_COLS
    entry followed directly by its <col>_label when present, then every
    remaining column in its existing relative order.
    """
    present = df.columns

    # 1) identifiers, in the exact ID_COLS order
    layout = [name for name in ID_COLS if name in present]

    # 2) each standardized base variable, immediately followed by its label
    for name in STD_COLS:
        for candidate in (name, f"{name}_label"):
            if candidate in present and candidate not in layout:
                layout.append(candidate)

    # 3) whatever is left (labels of non-STD_COLS columns, extras), order untouched
    leftovers = [col for col in present if col not in layout]
    return df.loc[:, layout + leftovers]

# Apply the canonical column order to each wave frame (K, L, N order).
k_model, l_model, n_model = map(reorder_model_ready, (k_model, l_model, n_model))

# === Save model‑ready per-wave and combined to data/processed/model‑ready ===
print("\n=== Save model‑ready frames ===")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Always write. The frames above are recomputed every time this cell runs, and the
# next cell reloads them from disk; skipping the write on a re-run would leave stale
# files behind whenever the cleaning logic or inputs changed within the session.
# The reported paths are the ones returned by the writer, so a fallback to another
# format is reflected instead of being guessed from SAVE_FMT.
k_model_path = _safe_to_parquet_or_csv(k_model, MODEL_DIR / "ukhls_k_model_ready", SAVE_FMT)
l_model_path = _safe_to_parquet_or_csv(l_model, MODEL_DIR / "ukhls_l_model_ready", SAVE_FMT)
n_model_path = _safe_to_parquet_or_csv(n_model, MODEL_DIR / "ukhls_n_model_ready", SAVE_FMT)

# Combined K+L+N frame: stack the waves, then re-apply the canonical column order
# (columns missing from a wave are filled with NA by concat).
all_model = reorder_model_ready(pd.concat([k_model, l_model, n_model], ignore_index=True))
all_model_path = _safe_to_parquet_or_csv(all_model, MODEL_DIR / "ukhls_kln_model_ready", SAVE_FMT)

# Flag retained only for backward compatibility with code that may still read it.
_MODEL_READY_SAVED = True

print("Model‑ready saved ->")
print(k_model_path, "\n", l_model_path, "\n", n_model_path, "\n", all_model_path)

# === Preview all columns (full frame, ordered) ===
def preview_all(df: pd.DataFrame, title: str, n: int = 8):
    """
    Print the frame's shape, display its first `n` rows with every column
    visible, then list the column names.

    The first 20 names are printed as HEAD; when the frame has more than 20
    columns the last 20 are printed as TAIL as well (the two can overlap).
    """
    print(f"\n{title} — shape {df.shape[0]} x {df.shape[1]} (columns)")

    # Lift the column cap and widen the display for this one call only.
    display_opts = ("display.max_columns", None, "display.width", 220)
    with pd.option_context(*display_opts):
        display(df.head(n))

    cols = [*df.columns]
    print(f"{title} columns ({len(cols)}):")
    print("HEAD:", cols[:20])
    if len(cols) <= 20:
        return
    print("TAIL:", cols[-20:])

# Full-width previews: each wave first, then the combined frame.
preview_all(df=k_model, title="K (model‑ready)")
preview_all(df=l_model, title="L (model‑ready)")
preview_all(df=n_model, title="N (model‑ready)")
preview_all(df=all_model, title="Combined K+L+N (model‑ready)")


=== Dynamic clean (add *_clean and *_label where specials appear) ===
Shapes AFTER dynamic clean -> K: (32008, 51)  L: (29271, 47)  N: (35471, 47)
K cleaned columns: ['ppid', 'intdaty_dv', 'age_dv', 'sex_dv', 'gor_dv', 'urban_dv', 'ethn_dv', 'anxiety_raw', 'jbft_dv', 'jbnssec_dv', 'jbhrs', 'fimnnet_dv', 'sf12pcs_dv', 'scghq1_dv']
L cleaned columns: ['ppid', 'sex_dv', 'gor_dv', 'urban_dv', 'ethn_dv', 'anxiety_raw', 'jbft_dv', 'jbnssec_dv', 'jbhrs', 'fimnnet_dv', 'sf12pcs_dv', 'scghq1_dv']
N cleaned columns: ['ppid', 'age_dv', 'sex_dv', 'gor_dv', 'urban_dv', 'ethn_dv', 'anxiety_raw', 'jbft_dv', 'jbnssec_dv', 'jbhrs', 'sf12pcs_dv', 'scghq1_dv']
Promoted cleaned columns into base: ['age_dv', 'anxiety_raw', 'ethn_dv', 'fimnnet_dv', 'gor_dv', 'intdaty_dv', 'jbft_dv', 'jbhrs', 'jbnssec_dv', 'ppid', 'scghq1_dv', 'sex_dv', 'sf12pcs_dv', 'urban_dv']
Promoted cleaned columns into base: ['anxiety_raw', 'ethn_dv', 'fimnnet_dv', 'gor_dv', 'jbft_dv', 'jbhrs', 'jbnssec_dv', 'ppid', 'scghq1_dv', 'sex_

Unnamed: 0,pidp,wave,wave_num,hrpid,ppid,ppid_label,ind5mus_xw,indinui_lw,intdaty_dv,intdaty_dv_label,age_dv,age_dv_label,sex_dv,sex_dv_label,gor_dv,gor_dv_label,urban_dv,urban_dv_label,hhtype_dv,hhsize,ethn_dv,ethn_dv_label,anxiety_raw,anxiety_raw_label,jbft_dv,jbft_dv_label,jbnssec_dv,jbnssec_dv_label,jbhrs,jbhrs_label,fimnnet_dv,fimnnet_dv_label,sf12pcs_dv,sf12pcs_dv_label,scghq1_dv,scghq1_dv_label,nchild_dv
0,68006127.0,K,11,68006127.0,68020564.0,observed,0.0,1.703009,2019.0,observed,49.0,observed,2.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,0.0,observed,26.41,observed,16.0,observed,0.0
1,68020564.0,K,11,68006127.0,68006127.0,observed,0.0,0.0,2019.0,observed,48.0,observed,1.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,1565.670044,observed,34.68,observed,12.0,observed,0.0
2,68008847.0,K,11,68008847.0,,inapplicable,0.0,0.794699,2019.0,observed,61.0,observed,2.0,observed,1.0,observed,1.0,observed,2.0,1.0,1.0,observed,,inapplicable,1.0,observed,14.0,observed,39.0,observed,2134.0,observed,44.2,observed,9.0,observed,0.0
3,68009527.0,K,11,68009527.0,68061288.0,observed,0.0,0.962017,2019.0,observed,41.0,observed,1.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,1.0,observed,15.0,observed,39.0,observed,2043.0,observed,60.48,observed,16.0,observed,2.0
4,68061288.0,K,11,68009527.0,68009527.0,observed,0.0,0.0,2019.0,observed,33.0,observed,2.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,149.25,observed,54.23,observed,11.0,observed,2.0
5,68010887.0,K,11,68068082.0,68068082.0,observed,0.0,1.055802,2019.0,observed,55.0,observed,2.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,1.0,observed,25.0,observed,32.0,observed,1250.0,observed,57.28,observed,9.0,observed,0.0
6,68068082.0,K,11,68068082.0,68010887.0,observed,0.0,0.0,2019.0,observed,58.0,observed,1.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,1.0,observed,19.0,observed,,inapplicable,690.859985,observed,56.15,observed,9.0,observed,0.0
7,68014287.0,K,11,68014287.0,,inapplicable,0.0,0.0,2019.0,observed,49.0,observed,2.0,observed,1.0,observed,1.0,observed,18.0,3.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,715.869995,observed,44.37,observed,23.0,observed,1.0


K (model‑ready) columns (37):
HEAD: ['pidp', 'wave', 'wave_num', 'hrpid', 'ppid', 'ppid_label', 'ind5mus_xw', 'indinui_lw', 'intdaty_dv', 'intdaty_dv_label', 'age_dv', 'age_dv_label', 'sex_dv', 'sex_dv_label', 'gor_dv', 'gor_dv_label', 'urban_dv', 'urban_dv_label', 'hhtype_dv', 'hhsize']
TAIL: ['urban_dv_label', 'hhtype_dv', 'hhsize', 'ethn_dv', 'ethn_dv_label', 'anxiety_raw', 'anxiety_raw_label', 'jbft_dv', 'jbft_dv_label', 'jbnssec_dv', 'jbnssec_dv_label', 'jbhrs', 'jbhrs_label', 'fimnnet_dv', 'fimnnet_dv_label', 'sf12pcs_dv', 'sf12pcs_dv_label', 'scghq1_dv', 'scghq1_dv_label', 'nchild_dv']

L (model‑ready) — shape 29271 x 35 (columns)


Unnamed: 0,pidp,wave,wave_num,hrpid,ppid,ppid_label,ind5mus_xw,indinui_lw,intdaty_dv,age_dv,sex_dv,sex_dv_label,gor_dv,gor_dv_label,urban_dv,urban_dv_label,hhtype_dv,hhsize,ethn_dv,ethn_dv_label,anxiety_raw,anxiety_raw_label,jbft_dv,jbft_dv_label,jbnssec_dv,jbnssec_dv_label,jbhrs,jbhrs_label,fimnnet_dv,fimnnet_dv_label,sf12pcs_dv,sf12pcs_dv_label,scghq1_dv,scghq1_dv_label,nchild_dv
0,68008847.0,L,12,68008847.0,,inapplicable,0.0,0.739967,2020.0,62.0,2.0,observed,1.0,observed,1.0,observed,2.0,1.0,1.0,observed,,inapplicable,1.0,observed,14.0,observed,39.0,observed,2288.0,observed,37.58,observed,12.0,observed,0.0
1,68009527.0,L,12,68034180.0,68061288.0,observed,0.0,0.950116,2020.0,43.0,1.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,1.0,observed,15.0,observed,36.5,observed,2060.0,observed,56.37,observed,11.0,observed,2.0
2,68061288.0,L,12,68034180.0,68009527.0,observed,0.0,0.0,2020.0,34.0,2.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,2.0,observed,24.0,observed,10.0,observed,474.0,observed,61.73,observed,15.0,observed,2.0
3,68010887.0,L,12,68010887.0,68068082.0,observed,0.0,0.998976,2020.0,56.0,2.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,1.0,observed,25.0,observed,32.0,observed,1200.0,observed,51.64,observed,11.0,observed,0.0
4,68068082.0,L,12,68010887.0,68010887.0,observed,0.0,0.0,2020.0,59.0,1.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,1.0,observed,17.0,observed,,inapplicable,3275.909912,observed,56.15,observed,10.0,observed,0.0
5,68028575.0,L,12,68095380.0,68157166.0,observed,0.0,1.03181,2020.0,28.0,2.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,0.0,observed,57.76,observed,8.0,observed,2.0
6,68157166.0,L,12,68095380.0,68028575.0,observed,0.0,0.0,2020.0,35.0,1.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,1.0,observed,8.0,observed,38.0,observed,2974.0,observed,57.47,observed,7.0,observed,2.0
7,68029927.0,L,12,68029939.0,68029931.0,observed,0.0,0.0,2020.0,48.0,2.0,observed,1.0,observed,1.0,observed,20.0,5.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,89.699997,observed,53.79,observed,9.0,observed,0.0


L (model‑ready) columns (35):
HEAD: ['pidp', 'wave', 'wave_num', 'hrpid', 'ppid', 'ppid_label', 'ind5mus_xw', 'indinui_lw', 'intdaty_dv', 'age_dv', 'sex_dv', 'sex_dv_label', 'gor_dv', 'gor_dv_label', 'urban_dv', 'urban_dv_label', 'hhtype_dv', 'hhsize', 'ethn_dv', 'ethn_dv_label']
TAIL: ['urban_dv_label', 'hhtype_dv', 'hhsize', 'ethn_dv', 'ethn_dv_label', 'anxiety_raw', 'anxiety_raw_label', 'jbft_dv', 'jbft_dv_label', 'jbnssec_dv', 'jbnssec_dv_label', 'jbhrs', 'jbhrs_label', 'fimnnet_dv', 'fimnnet_dv_label', 'sf12pcs_dv', 'sf12pcs_dv_label', 'scghq1_dv', 'scghq1_dv_label', 'nchild_dv']

N (model‑ready) — shape 35471 x 35 (columns)


Unnamed: 0,pidp,wave,wave_num,hrpid,ppid,ppid_label,ind5mus_xw,indinui_lw,intdaty_dv,age_dv,age_dv_label,sex_dv,sex_dv_label,gor_dv,gor_dv_label,urban_dv,urban_dv_label,hhtype_dv,hhsize,ethn_dv,ethn_dv_label,anxiety_raw,anxiety_raw_label,jbft_dv,jbft_dv_label,jbnssec_dv,jbnssec_dv_label,jbhrs,jbhrs_label,fimnnet_dv,sf12pcs_dv,sf12pcs_dv_label,scghq1_dv,scghq1_dv_label,nchild_dv
0,22445.0,N,14,276841780.0,277059298.0,observed,0.0,0.0,2022.0,37.0,observed,2.0,observed,8.0,observed,1.0,observed,11.0,4.0,1.0,observed,2.0,observed,1.0,observed,2.0,observed,28.0,observed,1857.079956,62.83,observed,24.0,observed,2.0
1,29925.0,N,14,622866606.0,,inapplicable,0.0,0.0,2022.0,45.0,observed,2.0,observed,7.0,observed,1.0,observed,5.0,3.0,1.0,observed,2.0,observed,2.0,observed,14.0,observed,29.0,observed,2378.75,65.47,observed,23.0,observed,2.0
2,76165.0,N,14,141045780.0,142378492.0,observed,0.0,0.0,2022.0,39.0,observed,2.0,observed,5.0,observed,1.0,observed,11.0,4.0,1.0,observed,2.0,observed,1.0,observed,11.0,observed,35.0,observed,3206.0,57.2,observed,12.0,observed,2.0
3,280165.0,N,14,783876922.0,756200970.0,observed,0.0,0.0,2022.0,43.0,observed,2.0,observed,8.0,observed,2.0,observed,20.0,4.0,1.0,observed,2.0,observed,,inapplicable,,inapplicable,,inapplicable,94.470001,58.55,observed,16.0,observed,1.0
4,469205.0,N,14,414412580.0,,inapplicable,0.0,0.0,2022.0,32.0,observed,2.0,observed,4.0,observed,1.0,observed,5.0,3.0,1.0,observed,2.0,observed,2.0,observed,25.0,observed,16.0,observed,2056.080078,49.93,observed,16.0,observed,2.0
5,599765.0,N,14,209943344.0,210167702.0,observed,0.0,0.0,2022.0,35.0,observed,2.0,observed,5.0,observed,1.0,observed,10.0,3.0,1.0,observed,2.0,observed,1.0,observed,2.0,observed,37.0,observed,2839.649902,56.15,observed,6.0,observed,1.0
6,732365.0,N,14,732365.0,,inapplicable,0.0,0.0,2022.0,37.0,observed,1.0,observed,2.0,observed,1.0,observed,19.0,3.0,1.0,observed,1.0,observed,,inapplicable,,inapplicable,,inapplicable,838.5,45.49,observed,35.0,observed,0.0
7,1587125.0,N,14,1587125.0,,inapplicable,0.0,0.0,2022.0,56.0,observed,2.0,observed,1.0,observed,1.0,observed,3.0,1.0,1.0,observed,2.0,observed,1.0,observed,7.0,observed,37.0,observed,2290.0,37.37,observed,15.0,observed,0.0


N (model‑ready) columns (35):
HEAD: ['pidp', 'wave', 'wave_num', 'hrpid', 'ppid', 'ppid_label', 'ind5mus_xw', 'indinui_lw', 'intdaty_dv', 'age_dv', 'age_dv_label', 'sex_dv', 'sex_dv_label', 'gor_dv', 'gor_dv_label', 'urban_dv', 'urban_dv_label', 'hhtype_dv', 'hhsize', 'ethn_dv']
TAIL: ['urban_dv', 'urban_dv_label', 'hhtype_dv', 'hhsize', 'ethn_dv', 'ethn_dv_label', 'anxiety_raw', 'anxiety_raw_label', 'jbft_dv', 'jbft_dv_label', 'jbnssec_dv', 'jbnssec_dv_label', 'jbhrs', 'jbhrs_label', 'fimnnet_dv', 'sf12pcs_dv', 'sf12pcs_dv_label', 'scghq1_dv', 'scghq1_dv_label', 'nchild_dv']

Combined K+L+N (model‑ready) — shape 96750 x 37 (columns)


Unnamed: 0,pidp,wave,wave_num,hrpid,ppid,ppid_label,ind5mus_xw,indinui_lw,intdaty_dv,intdaty_dv_label,age_dv,age_dv_label,sex_dv,sex_dv_label,gor_dv,gor_dv_label,urban_dv,urban_dv_label,hhtype_dv,hhsize,ethn_dv,ethn_dv_label,anxiety_raw,anxiety_raw_label,jbft_dv,jbft_dv_label,jbnssec_dv,jbnssec_dv_label,jbhrs,jbhrs_label,fimnnet_dv,fimnnet_dv_label,sf12pcs_dv,sf12pcs_dv_label,scghq1_dv,scghq1_dv_label,nchild_dv
0,68006127.0,K,11,68006127.0,68020564.0,observed,0.0,1.703009,2019.0,observed,49.0,observed,2.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,0.0,observed,26.41,observed,16.0,observed,0.0
1,68020564.0,K,11,68006127.0,68006127.0,observed,0.0,0.0,2019.0,observed,48.0,observed,1.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,1565.670044,observed,34.68,observed,12.0,observed,0.0
2,68008847.0,K,11,68008847.0,,inapplicable,0.0,0.794699,2019.0,observed,61.0,observed,2.0,observed,1.0,observed,1.0,observed,2.0,1.0,1.0,observed,,inapplicable,1.0,observed,14.0,observed,39.0,observed,2134.0,observed,44.2,observed,9.0,observed,0.0
3,68009527.0,K,11,68009527.0,68061288.0,observed,0.0,0.962017,2019.0,observed,41.0,observed,1.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,1.0,observed,15.0,observed,39.0,observed,2043.0,observed,60.48,observed,16.0,observed,2.0
4,68061288.0,K,11,68009527.0,68009527.0,observed,0.0,0.0,2019.0,observed,33.0,observed,2.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,149.25,observed,54.23,observed,11.0,observed,2.0
5,68010887.0,K,11,68068082.0,68068082.0,observed,0.0,1.055802,2019.0,observed,55.0,observed,2.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,1.0,observed,25.0,observed,32.0,observed,1250.0,observed,57.28,observed,9.0,observed,0.0
6,68068082.0,K,11,68068082.0,68010887.0,observed,0.0,0.0,2019.0,observed,58.0,observed,1.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,1.0,observed,19.0,observed,,inapplicable,690.859985,observed,56.15,observed,9.0,observed,0.0
7,68014287.0,K,11,68014287.0,,inapplicable,0.0,0.0,2019.0,observed,49.0,observed,2.0,observed,1.0,observed,1.0,observed,18.0,3.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,715.869995,observed,44.37,observed,23.0,observed,1.0


Combined K+L+N (model‑ready) columns (37):
HEAD: ['pidp', 'wave', 'wave_num', 'hrpid', 'ppid', 'ppid_label', 'ind5mus_xw', 'indinui_lw', 'intdaty_dv', 'intdaty_dv_label', 'age_dv', 'age_dv_label', 'sex_dv', 'sex_dv_label', 'gor_dv', 'gor_dv_label', 'urban_dv', 'urban_dv_label', 'hhtype_dv', 'hhsize']
TAIL: ['urban_dv_label', 'hhtype_dv', 'hhsize', 'ethn_dv', 'ethn_dv_label', 'anxiety_raw', 'anxiety_raw_label', 'jbft_dv', 'jbft_dv_label', 'jbnssec_dv', 'jbnssec_dv_label', 'jbhrs', 'jbhrs_label', 'fimnnet_dv', 'fimnnet_dv_label', 'sf12pcs_dv', 'sf12pcs_dv_label', 'scghq1_dv', 'scghq1_dv_label', 'nchild_dv']


In [29]:
# === Cell 6: Keep only participants present in all waves (common pidp), save filtered ===
import pandas as pd
from pathlib import Path
from IPython.display import display

SAVE_FMT = "parquet"  # format handed to _save(): "parquet" (CSV is written instead if the Parquet write fails); any other value, e.g. "csv", writes CSV

# Resolve project root and key folders (anchor to repo root's /data)
def _autodetect_project_root() -> Path:
    p = Path.cwd()
    for _ in range(8):
        if (p / "data").exists():
            return p
        p = p.parent
    return Path.cwd()

# Reuse PROJECT_ROOT when an earlier cell already defined it (coerced to Path);
# a NameError means it was never defined, so fall back to auto-detection from the cwd.
try:
    PROJECT_ROOT = Path(PROJECT_ROOT)
except NameError:
    PROJECT_ROOT = _autodetect_project_root()

# Folder layout used by this cell: <root>/data/processed/model-ready
DATA_DIR = PROJECT_ROOT / "data"
PROCESSED_DIR = DATA_DIR / "processed"
MODEL_DIR = PROCESSED_DIR / "model-ready"
# Create the model-ready folder (and any missing parents); no error if it already exists.
MODEL_DIR.mkdir(parents=True, exist_ok=True)

def _load_model_ready(prefix: str) -> pd.DataFrame:
    """
    Read the model-ready frame of one wave and print which file was used.

    Candidate files are ``ukhls_{prefix}_model_ready`` with a ``.parquet`` or ``.csv``
    extension, looked up first in MODEL_DIR and then in PROCESSED_DIR; within each
    folder the Parquet file is preferred over the CSV file.

    Parameters
    ----------
    prefix : str
        Wave prefix used in the file name (e.g. "k").

    Returns
    -------
    pd.DataFrame
        Contents of the first candidate file that exists.

    Raises
    ------
    FileNotFoundError
        If none of the candidate files exists; the message lists every path tried.
    """
    stem = f"ukhls_{prefix}_model_ready"
    candidates = [
        folder / f"{stem}{extension}"
        for folder in (MODEL_DIR, PROCESSED_DIR)
        for extension in (".parquet", ".csv")
    ]

    found = next((path for path in candidates if path.exists()), None)
    if found is None:
        tried = "\n  - " + "\n  - ".join(str(p) for p in candidates)
        raise FileNotFoundError(f"Model-ready not found for {prefix}. Tried:{tried}")

    print(f"Loading {prefix.upper()} from: {found}")
    # The reader is chosen by file extension (case-insensitive).
    if found.suffix.lower() == ".parquet":
        return pd.read_parquet(found)
    return pd.read_csv(found)

def _save(df: pd.DataFrame, base: Path, fmt: str = "parquet") -> Path:
    """
    Save to Parquet or CSV (fallback if Parquet not available). Returns final path.
    """
    if fmt.lower() == "parquet":
        try:
            out = base.with_suffix(".parquet")
            df.to_parquet(out, index=False)
            print(f"Saved Parquet -> {out}")
            return out
        except Exception as e:
            print(f"Parquet failed ({type(e).__name__}: {e}). Falling back to CSV.")
            out = base.with_suffix(".csv")
            df.to_csv(out, index=False)
            print(f"Saved CSV -> {out}")
            return out
    else:
        out = base.with_suffix(".csv")
        df.to_csv(out, index=False)
        print(f"Saved CSV -> {out}")
        return out

def _normalize_pidp(s: pd.Series) -> pd.Series:
    """
    Normalize pidp for reliable set operations: use nullable Int64 when possible, else string.
    """
    vals = pd.to_numeric(s, errors="coerce")
    if vals.notna().all():
        return vals.astype("Int64")
    return s.astype("string")

# Load the model-ready frame of each wave, in K, L, N order (each load prints its source file)
k_mr, l_mr, n_mr = (_load_model_ready(wave_key) for wave_key in ("k", "l", "n"))

# One normalized pidp Series per wave, row-aligned with that wave's frame
k_pidp, l_pidp, n_pidp = (
    _normalize_pidp(wave_frame["pidp"]) for wave_frame in (k_mr, l_mr, n_mr)
)

# Participants present in every wave = intersection of the three non-missing id sets
common_pidp = set.intersection(
    *(set(ids.dropna().unique()) for ids in (k_pidp, l_pidp, n_pidp))
)
print(f"Common pidp count across K/L/N: {len(common_pidp)}")

# Keep only the rows whose normalized pidp is in the common set (copies, not views)
k_common = k_mr.loc[k_pidp.isin(common_pidp)].copy()
l_common = l_mr.loc[l_pidp.isin(common_pidp)].copy()
n_common = n_mr.loc[n_pidp.isin(common_pidp)].copy()

print("Shapes after filtering to common pidp ->",
      "K:", k_common.shape, "L:", l_common.shape, "N:", n_common.shape)

# Stack the three filtered waves into one long frame with a fresh 0..n-1 index
kln_common = pd.concat([k_common, l_common, n_common], ignore_index=True)

# Sanity check (reported, not enforced): number of distinct waves per pidp should be 3
by_pidp_waves = kln_common.groupby("pidp")["wave"].nunique()
exactly_three = by_pidp_waves.eq(3).sum()
print(f"Participants with exactly 3 waves in combined: {exactly_three} (of {by_pidp_waves.size})")

# Preview all columns (full)
def _preview(df: pd.DataFrame, title: str, n: int = 8):
    print(f"\n{title} — shape {df.shape[0]} x {df.shape[1]}")
    with pd.option_context("display.max_columns", None, "display.width", 220):
        display(df.head(n))
    cols = list(df.columns)
    print(f"{title} columns ({len(cols)}):")
    print("HEAD:", cols[:min(20, len(cols))])
    if len(cols) > 20:
        print("TAIL:", cols[-20:])

# Show each filtered wave, then the stacked frame
_preview(df=k_common, title="K (model-ready, common pidp)")
_preview(df=l_common, title="L (model-ready, common pidp)")
_preview(df=n_common, title="N (model-ready, common pidp)")
_preview(df=kln_common, title="Combined K+L+N (model-ready, common pidp)")

# Write the filtered frames into the model-ready folder (K, L, N, then combined);
# each path is whatever _save actually wrote (Parquet, or CSV on fallback).
k_path, l_path, n_path, all_path = (
    _save(filtered, MODEL_DIR / f"ukhls_{tag}_model_ready_commonpidp", SAVE_FMT)
    for tag, filtered in (
        ("k", k_common),
        ("l", l_common),
        ("n", n_common),
        ("kln", kln_common),
    )
)

print("\nSaved common-pidp model-ready ->")
print(k_path, l_path, n_path, all_path, sep=" \n ")


Loading K from: C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\model-ready\ukhls_k_model_ready.parquet
Loading L from: C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\model-ready\ukhls_l_model_ready.parquet
Loading N from: C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\model-ready\ukhls_n_model_ready.parquet
Common pidp count across K/L/N: 22088
Shapes after filtering to common pidp -> K: (22088, 37) L: (22088, 35) N: (22088, 35)
Participants with exactly 3 waves in combined: 22088 (of 22088)

K (model-ready, common pidp) — shape 22088 x 37
Shapes after filtering to common pidp -> K: (22088, 37) L: (22088, 35) N: (22088, 35)
Participants with exactly 3 waves in combined: 22088 (of 22088)

K (model-ready, common pidp) — shape 22088 x 37


Unnamed: 0,pidp,wave,wave_num,hrpid,ppid,ppid_label,ind5mus_xw,indinui_lw,intdaty_dv,intdaty_dv_label,age_dv,age_dv_label,sex_dv,sex_dv_label,gor_dv,gor_dv_label,urban_dv,urban_dv_label,hhtype_dv,hhsize,ethn_dv,ethn_dv_label,anxiety_raw,anxiety_raw_label,jbft_dv,jbft_dv_label,jbnssec_dv,jbnssec_dv_label,jbhrs,jbhrs_label,fimnnet_dv,fimnnet_dv_label,sf12pcs_dv,sf12pcs_dv_label,scghq1_dv,scghq1_dv_label,nchild_dv
2,68008847.0,K,11,68008847.0,,inapplicable,0.0,0.794699,2019.0,observed,61.0,observed,2.0,observed,1.0,observed,1.0,observed,2.0,1.0,1.0,observed,,inapplicable,1.0,observed,14.0,observed,39.0,observed,2134.0,observed,44.2,observed,9.0,observed,0.0
3,68009527.0,K,11,68009527.0,68061288.0,observed,0.0,0.962017,2019.0,observed,41.0,observed,1.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,1.0,observed,15.0,observed,39.0,observed,2043.0,observed,60.48,observed,16.0,observed,2.0
4,68061288.0,K,11,68009527.0,68009527.0,observed,0.0,0.0,2019.0,observed,33.0,observed,2.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,149.25,observed,54.23,observed,11.0,observed,2.0
5,68010887.0,K,11,68068082.0,68068082.0,observed,0.0,1.055802,2019.0,observed,55.0,observed,2.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,1.0,observed,25.0,observed,32.0,observed,1250.0,observed,57.28,observed,9.0,observed,0.0
6,68068082.0,K,11,68068082.0,68010887.0,observed,0.0,0.0,2019.0,observed,58.0,observed,1.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,1.0,observed,19.0,observed,,inapplicable,690.859985,observed,56.15,observed,9.0,observed,0.0
9,68028575.0,K,11,68157166.0,68157166.0,observed,0.0,1.029255,2019.0,observed,28.0,observed,2.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,0.0,observed,56.71,observed,6.0,observed,2.0
10,68157166.0,K,11,68157166.0,68028575.0,observed,0.0,0.0,2019.0,observed,34.0,observed,1.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,1.0,observed,8.0,observed,37.0,observed,3100.0,observed,56.64,observed,12.0,observed,2.0
11,68029927.0,K,11,68029931.0,68029931.0,observed,0.0,0.0,2019.0,observed,47.0,observed,2.0,observed,1.0,observed,1.0,observed,20.0,5.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,89.699997,observed,56.95,observed,9.0,observed,1.0


K (model-ready, common pidp) columns (37):
HEAD: ['pidp', 'wave', 'wave_num', 'hrpid', 'ppid', 'ppid_label', 'ind5mus_xw', 'indinui_lw', 'intdaty_dv', 'intdaty_dv_label', 'age_dv', 'age_dv_label', 'sex_dv', 'sex_dv_label', 'gor_dv', 'gor_dv_label', 'urban_dv', 'urban_dv_label', 'hhtype_dv', 'hhsize']
TAIL: ['urban_dv_label', 'hhtype_dv', 'hhsize', 'ethn_dv', 'ethn_dv_label', 'anxiety_raw', 'anxiety_raw_label', 'jbft_dv', 'jbft_dv_label', 'jbnssec_dv', 'jbnssec_dv_label', 'jbhrs', 'jbhrs_label', 'fimnnet_dv', 'fimnnet_dv_label', 'sf12pcs_dv', 'sf12pcs_dv_label', 'scghq1_dv', 'scghq1_dv_label', 'nchild_dv']

L (model-ready, common pidp) — shape 22088 x 35


Unnamed: 0,pidp,wave,wave_num,hrpid,ppid,ppid_label,ind5mus_xw,indinui_lw,intdaty_dv,age_dv,sex_dv,sex_dv_label,gor_dv,gor_dv_label,urban_dv,urban_dv_label,hhtype_dv,hhsize,ethn_dv,ethn_dv_label,anxiety_raw,anxiety_raw_label,jbft_dv,jbft_dv_label,jbnssec_dv,jbnssec_dv_label,jbhrs,jbhrs_label,fimnnet_dv,fimnnet_dv_label,sf12pcs_dv,sf12pcs_dv_label,scghq1_dv,scghq1_dv_label,nchild_dv
0,68008847.0,L,12,68008847.0,,inapplicable,0.0,0.739967,2020.0,62.0,2.0,observed,1.0,observed,1.0,observed,2.0,1.0,1.0,observed,,inapplicable,1.0,observed,14.0,observed,39.0,observed,2288.0,observed,37.58,observed,12.0,observed,0.0
1,68009527.0,L,12,68034180.0,68061288.0,observed,0.0,0.950116,2020.0,43.0,1.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,1.0,observed,15.0,observed,36.5,observed,2060.0,observed,56.37,observed,11.0,observed,2.0
2,68061288.0,L,12,68034180.0,68009527.0,observed,0.0,0.0,2020.0,34.0,2.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,2.0,observed,24.0,observed,10.0,observed,474.0,observed,61.73,observed,15.0,observed,2.0
3,68010887.0,L,12,68010887.0,68068082.0,observed,0.0,0.998976,2020.0,56.0,2.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,1.0,observed,25.0,observed,32.0,observed,1200.0,observed,51.64,observed,11.0,observed,0.0
4,68068082.0,L,12,68010887.0,68010887.0,observed,0.0,0.0,2020.0,59.0,1.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,1.0,observed,17.0,observed,,inapplicable,3275.909912,observed,56.15,observed,10.0,observed,0.0
5,68028575.0,L,12,68095380.0,68157166.0,observed,0.0,1.03181,2020.0,28.0,2.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,0.0,observed,57.76,observed,8.0,observed,2.0
6,68157166.0,L,12,68095380.0,68028575.0,observed,0.0,0.0,2020.0,35.0,1.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,1.0,observed,8.0,observed,38.0,observed,2974.0,observed,57.47,observed,7.0,observed,2.0
7,68029927.0,L,12,68029939.0,68029931.0,observed,0.0,0.0,2020.0,48.0,2.0,observed,1.0,observed,1.0,observed,20.0,5.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,89.699997,observed,53.79,observed,9.0,observed,0.0


L (model-ready, common pidp) columns (35):
HEAD: ['pidp', 'wave', 'wave_num', 'hrpid', 'ppid', 'ppid_label', 'ind5mus_xw', 'indinui_lw', 'intdaty_dv', 'age_dv', 'sex_dv', 'sex_dv_label', 'gor_dv', 'gor_dv_label', 'urban_dv', 'urban_dv_label', 'hhtype_dv', 'hhsize', 'ethn_dv', 'ethn_dv_label']
TAIL: ['urban_dv_label', 'hhtype_dv', 'hhsize', 'ethn_dv', 'ethn_dv_label', 'anxiety_raw', 'anxiety_raw_label', 'jbft_dv', 'jbft_dv_label', 'jbnssec_dv', 'jbnssec_dv_label', 'jbhrs', 'jbhrs_label', 'fimnnet_dv', 'fimnnet_dv_label', 'sf12pcs_dv', 'sf12pcs_dv_label', 'scghq1_dv', 'scghq1_dv_label', 'nchild_dv']

N (model-ready, common pidp) — shape 22088 x 35


Unnamed: 0,pidp,wave,wave_num,hrpid,ppid,ppid_label,ind5mus_xw,indinui_lw,intdaty_dv,age_dv,age_dv_label,sex_dv,sex_dv_label,gor_dv,gor_dv_label,urban_dv,urban_dv_label,hhtype_dv,hhsize,ethn_dv,ethn_dv_label,anxiety_raw,anxiety_raw_label,jbft_dv,jbft_dv_label,jbnssec_dv,jbnssec_dv_label,jbhrs,jbhrs_label,fimnnet_dv,sf12pcs_dv,sf12pcs_dv_label,scghq1_dv,scghq1_dv_label,nchild_dv
0,22445.0,N,14,276841780.0,277059298.0,observed,0.0,0.0,2022.0,37.0,observed,2.0,observed,8.0,observed,1.0,observed,11.0,4.0,1.0,observed,2.0,observed,1.0,observed,2.0,observed,28.0,observed,1857.079956,62.83,observed,24.0,observed,2.0
1,29925.0,N,14,622866606.0,,inapplicable,0.0,0.0,2022.0,45.0,observed,2.0,observed,7.0,observed,1.0,observed,5.0,3.0,1.0,observed,2.0,observed,2.0,observed,14.0,observed,29.0,observed,2378.75,65.47,observed,23.0,observed,2.0
2,76165.0,N,14,141045780.0,142378492.0,observed,0.0,0.0,2022.0,39.0,observed,2.0,observed,5.0,observed,1.0,observed,11.0,4.0,1.0,observed,2.0,observed,1.0,observed,11.0,observed,35.0,observed,3206.0,57.2,observed,12.0,observed,2.0
3,280165.0,N,14,783876922.0,756200970.0,observed,0.0,0.0,2022.0,43.0,observed,2.0,observed,8.0,observed,2.0,observed,20.0,4.0,1.0,observed,2.0,observed,,inapplicable,,inapplicable,,inapplicable,94.470001,58.55,observed,16.0,observed,1.0
4,469205.0,N,14,414412580.0,,inapplicable,0.0,0.0,2022.0,32.0,observed,2.0,observed,4.0,observed,1.0,observed,5.0,3.0,1.0,observed,2.0,observed,2.0,observed,25.0,observed,16.0,observed,2056.080078,49.93,observed,16.0,observed,2.0
5,599765.0,N,14,209943344.0,210167702.0,observed,0.0,0.0,2022.0,35.0,observed,2.0,observed,5.0,observed,1.0,observed,10.0,3.0,1.0,observed,2.0,observed,1.0,observed,2.0,observed,37.0,observed,2839.649902,56.15,observed,6.0,observed,1.0
6,732365.0,N,14,732365.0,,inapplicable,0.0,0.0,2022.0,37.0,observed,1.0,observed,2.0,observed,1.0,observed,19.0,3.0,1.0,observed,1.0,observed,,inapplicable,,inapplicable,,inapplicable,838.5,45.49,observed,35.0,observed,0.0
7,1587125.0,N,14,1587125.0,,inapplicable,0.0,0.0,2022.0,56.0,observed,2.0,observed,1.0,observed,1.0,observed,3.0,1.0,1.0,observed,2.0,observed,1.0,observed,7.0,observed,37.0,observed,2290.0,37.37,observed,15.0,observed,0.0


N (model-ready, common pidp) columns (35):
HEAD: ['pidp', 'wave', 'wave_num', 'hrpid', 'ppid', 'ppid_label', 'ind5mus_xw', 'indinui_lw', 'intdaty_dv', 'age_dv', 'age_dv_label', 'sex_dv', 'sex_dv_label', 'gor_dv', 'gor_dv_label', 'urban_dv', 'urban_dv_label', 'hhtype_dv', 'hhsize', 'ethn_dv']
TAIL: ['urban_dv', 'urban_dv_label', 'hhtype_dv', 'hhsize', 'ethn_dv', 'ethn_dv_label', 'anxiety_raw', 'anxiety_raw_label', 'jbft_dv', 'jbft_dv_label', 'jbnssec_dv', 'jbnssec_dv_label', 'jbhrs', 'jbhrs_label', 'fimnnet_dv', 'sf12pcs_dv', 'sf12pcs_dv_label', 'scghq1_dv', 'scghq1_dv_label', 'nchild_dv']

Combined K+L+N (model-ready, common pidp) — shape 66264 x 37


Unnamed: 0,pidp,wave,wave_num,hrpid,ppid,ppid_label,ind5mus_xw,indinui_lw,intdaty_dv,intdaty_dv_label,age_dv,age_dv_label,sex_dv,sex_dv_label,gor_dv,gor_dv_label,urban_dv,urban_dv_label,hhtype_dv,hhsize,ethn_dv,ethn_dv_label,anxiety_raw,anxiety_raw_label,jbft_dv,jbft_dv_label,jbnssec_dv,jbnssec_dv_label,jbhrs,jbhrs_label,fimnnet_dv,fimnnet_dv_label,sf12pcs_dv,sf12pcs_dv_label,scghq1_dv,scghq1_dv_label,nchild_dv
0,68008847.0,K,11,68008847.0,,inapplicable,0.0,0.794699,2019.0,observed,61.0,observed,2.0,observed,1.0,observed,1.0,observed,2.0,1.0,1.0,observed,,inapplicable,1.0,observed,14.0,observed,39.0,observed,2134.0,observed,44.2,observed,9.0,observed,0.0
1,68009527.0,K,11,68009527.0,68061288.0,observed,0.0,0.962017,2019.0,observed,41.0,observed,1.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,1.0,observed,15.0,observed,39.0,observed,2043.0,observed,60.48,observed,16.0,observed,2.0
2,68061288.0,K,11,68009527.0,68009527.0,observed,0.0,0.0,2019.0,observed,33.0,observed,2.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,149.25,observed,54.23,observed,11.0,observed,2.0
3,68010887.0,K,11,68068082.0,68068082.0,observed,0.0,1.055802,2019.0,observed,55.0,observed,2.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,1.0,observed,25.0,observed,32.0,observed,1250.0,observed,57.28,observed,9.0,observed,0.0
4,68068082.0,K,11,68068082.0,68010887.0,observed,0.0,0.0,2019.0,observed,58.0,observed,1.0,observed,1.0,observed,1.0,observed,6.0,2.0,1.0,observed,,inapplicable,1.0,observed,19.0,observed,,inapplicable,690.859985,observed,56.15,observed,9.0,observed,0.0
5,68028575.0,K,11,68157166.0,68157166.0,observed,0.0,1.029255,2019.0,observed,28.0,observed,2.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,0.0,observed,56.71,observed,6.0,observed,2.0
6,68157166.0,K,11,68157166.0,68028575.0,observed,0.0,0.0,2019.0,observed,34.0,observed,1.0,observed,1.0,observed,1.0,observed,11.0,4.0,1.0,observed,,inapplicable,1.0,observed,8.0,observed,37.0,observed,3100.0,observed,56.64,observed,12.0,observed,2.0
7,68029927.0,K,11,68029931.0,68029931.0,observed,0.0,0.0,2019.0,observed,47.0,observed,2.0,observed,1.0,observed,1.0,observed,20.0,5.0,1.0,observed,,inapplicable,,inapplicable,,inapplicable,,inapplicable,89.699997,observed,56.95,observed,9.0,observed,1.0


Combined K+L+N (model-ready, common pidp) columns (37):
HEAD: ['pidp', 'wave', 'wave_num', 'hrpid', 'ppid', 'ppid_label', 'ind5mus_xw', 'indinui_lw', 'intdaty_dv', 'intdaty_dv_label', 'age_dv', 'age_dv_label', 'sex_dv', 'sex_dv_label', 'gor_dv', 'gor_dv_label', 'urban_dv', 'urban_dv_label', 'hhtype_dv', 'hhsize']
TAIL: ['urban_dv_label', 'hhtype_dv', 'hhsize', 'ethn_dv', 'ethn_dv_label', 'anxiety_raw', 'anxiety_raw_label', 'jbft_dv', 'jbft_dv_label', 'jbnssec_dv', 'jbnssec_dv_label', 'jbhrs', 'jbhrs_label', 'fimnnet_dv', 'fimnnet_dv_label', 'sf12pcs_dv', 'sf12pcs_dv_label', 'scghq1_dv', 'scghq1_dv_label', 'nchild_dv']
Saved Parquet -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\model-ready\ukhls_k_model_ready_commonpidp.parquet
Saved Parquet -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\model-ready\ukhls_l_model_ready_commonpidp.parquet
Saved Parquet -> C:\Users\User\Documents\Github\Gambling_MentalHealth_MSc\data\processed\model