# Load Packages and Merged Dataset

In [1]:
import pandas as pd
import re
import os
import json
from io import StringIO
from pathlib import Path

# Load the merged dataset produced by the previous step (B_MERGE_DATA) and
# turn blank-like placeholders ('', ' ', '.') into true missing values.
merged = (
    pd.read_parquet(Path("PROCESSED/DATA/merged.parquet"))
    .replace(['', ' ', '.'], pd.NA)
)

### Step 1: Transforming Numerical Missing to True Missing

Collect, for each variable, the numeric codes that the codebooks associate with a missing value (e.g. "Refused", "Don't know").

In [2]:
# Value descriptions that mark a response as "not a real answer".
# The apostrophe in "don't" is optional and may be either the straight (')
# or the typographic (U+2019) form; only non-capturing groups are used so
# pandas' `str.contains` does not warn about match groups.
INVALID_PATTERN = re.compile(
    r"(?:refused|don['\u2019]?t\s*know|missing|blank but applicable|"
    r"(?:can|could)\s*not\s*assess|unknown|not\s*ascertained)",
    re.IGNORECASE
)

def get_invalid_codes(codebook):
    """Map each variable in a codebook CSV to its "invalid response" codes.

    The codebook is expected to have a ``variable_name`` column and a
    ``table`` column holding an HTML table per variable. For every table that
    has both "Code or Value" and "Value Description" columns, the codes whose
    description matches ``INVALID_PATTERN`` are collected.

    Parameters
    ----------
    codebook : str or path-like
        Path to the codebook CSV.

    Returns
    -------
    dict[str, list[str]]
        ``{variable_name: [code, ...]}`` with codes as stripped strings.
        Empty when the codebook lacks the required columns or no variable
        has an invalid code.
    """
    result = {}
    codebook_df = pd.read_csv(codebook)
    if 'variable_name' not in codebook_df.columns or 'table' not in codebook_df.columns:
        # Always return a dict so callers get one consistent type.
        return result

    # Walk the rows once; this also tolerates a variable listed more than once.
    for var_name, table in zip(codebook_df['variable_name'], codebook_df['table']):
        if not isinstance(table, str):
            # Missing table cells are read as NaN (a float), not as HTML text.
            continue

        df = pd.read_html(StringIO(table))[0]
        if "Code or Value" not in df.columns or "Value Description" not in df.columns:
            continue

        # Rows whose description marks the answer as invalid.
        invalid = df["Value Description"].astype(str).str.contains(INVALID_PATTERN, na=False)
        if not invalid.any():
            continue

        codes = df.loc[invalid, "Code or Value"].astype(str).str.strip().tolist()
        if var_name in result:
            # Repeated variable: merge in only the codes not seen yet.
            known = result[var_name]
            known.extend(code for code in codes if code not in known)
        else:
            result[var_name] = codes
    return result

# Populate "invalid_map" (variable name to invalid codes) for questionnaire_data
questionnaire_codebooks_path = Path("RAW/CODEBOOKS/questionnaire_data")
# Sort the files: when two codebooks define the same variable the later one
# overwrites the earlier, so a fixed order keeps the result reproducible.
codebook_files = sorted(questionnaire_codebooks_path.rglob("*.csv"))

invalid_map = {}
for codebook_file in codebook_files:
    invalid_codes = get_invalid_codes(codebook_file)
    if invalid_codes:
        invalid_map.update(invalid_codes)

Saving the mapper to .csv

In [3]:
# Save the invalid map to a CSV file, one (variable, invalid_code) pair per row.
# The columns are named explicitly so that an empty map still produces a CSV
# with a header row instead of an unreadable empty file.
df_invalid_map = pd.DataFrame(
    [(var, code) for var, codes in invalid_map.items() for code in codes],
    columns=['variable', 'invalid_code'],
)

directory = "./PROCESSED/DATA"
os.makedirs(directory, exist_ok=True)  # no separate exists() check needed
save_path = directory + "/invalid_map_questionnaire_2017_2020.csv"
df_invalid_map.to_csv(save_path, index = False)
print(f"Saved invalid_map_questionnaire_2017_2020.csv to {save_path}")

Saved invalid_map_questionnaire_2017_2020.csv to ./PROCESSED/DATA/invalid_map_questionnaire_2017_2020.csv


Apply transformation

In [4]:
# Reload the saved map as strings so codes such as "7", "77" or "." keep
# their exact text, then regroup it into {variable: [codes]}.
invalid_map = pd.read_csv("./PROCESSED/DATA/invalid_map_questionnaire_2017_2020.csv", dtype=str)
invalid_dict = (
    invalid_map
    .dropna(subset=["variable", "invalid_code"])  # NaN codes cannot be matched
    .groupby("variable")["invalid_code"]
    .apply(list)
    .to_dict()
)

# Merged columns are named "<data_file>__<variable>"; compare on the variable
# part only, so a variable never matches a different one that merely ends
# with the same characters.
merged_base_names = merged.columns.to_series().str.split("__").str[-1]

# Replace invalid responses (don't know, refused, etc.) with missing values.
for var, codes in invalid_dict.items():
    cols = merged.columns[(merged_base_names == var).to_numpy()].tolist()
    if not cols:
        continue
    # Match both the text form ("7", "77", ".") and the numeric form (7, 77).
    codes_mixed = set(codes) | {int(c) for c in codes if c.isdigit()}
    merged[cols] = merged[cols].mask(merged[cols].isin(codes_mixed), pd.NA)


# Collapse the three repeated blood-pressure / pulse readings into a single
# row-wise median column each, then discard the individual readings.
bp_readings = {
    "BP_sys_median": ["P_BPXO__BPXOSY1", "P_BPXO__BPXOSY2", "P_BPXO__BPXOSY3"],
    "BP_dia_median": ["P_BPXO__BPXODI1", "P_BPXO__BPXODI2", "P_BPXO__BPXODI3"],
    "Pulse_median": ["P_BPXO__BPXOPLS1", "P_BPXO__BPXOPLS2", "P_BPXO__BPXOPLS3"],
}

for median_col, reading_cols in bp_readings.items():
    merged[median_col] = merged[reading_cols].median(axis=1)

merged = merged.drop(
    columns=[col for reading_cols in bp_readings.values() for col in reading_cols],
    errors="ignore",
)
# Remove the helper names so they do not linger in the notebook namespace.
del bp_readings, median_col, reading_cols


# set DBD895 and DBD900 column value of 5555 to 22, simplification since not many are above 21
# NOTE(review): 5555 is presumably the codebook's "more than 21" sentinel - confirm.
merged[['P_DBQ__DBD895', 'P_DBQ__DBD900']] = merged[['P_DBQ__DBD895', 'P_DBQ__DBD900']].replace(5555, 22)

# set DBD905 and DBD910 column value of 6666 to 91, simplification since not many are above 90
# NOTE(review): 6666 is presumably the codebook's "more than 90" sentinel - confirm.
merged[['P_DBQ__DBD905', 'P_DBQ__DBD910']] = merged[['P_DBQ__DBD905', 'P_DBQ__DBD910']].replace(6666, 91)


### Step 2: Remove the columns with more than 30% missingness  

This is a rudimentary first-pass drop; a more granular, variable-by-variable pass follows in Step 3.

In [5]:
# Step 2a - Remove columns with more than 30% missingness
MISSING_FRACTION_LIMIT = 0.3
target_col = 'LBXGH'

num_rows = merged.shape[0]
threshold = MISSING_FRACTION_LIMIT * num_rows

missing_counts = merged.isnull().sum()
# Never drop the target itself: Step 2b needs it to filter rows.
too_sparse = (missing_counts > threshold) & (missing_counts.index != target_col)
cols_to_drop = merged.columns[too_sparse.to_numpy()]
merged_dropped = merged.drop(columns=cols_to_drop)

# Step 2b - Remove rows with no target variable
before_rows = merged_dropped.shape[0]
merged_dropped = merged_dropped.dropna(subset=[target_col])
after_rows = merged_dropped.shape[0]
rows_dropped = before_rows - after_rows

# Logging the cleaning steps
log_path = Path("LOG/log_cleaning.txt")
# Create the log folder if needed so a fresh checkout can run this cell.
log_path.parent.mkdir(parents=True, exist_ok=True)
with open(log_path, "w", encoding="utf-8") as log:
    log.write("==== Data Cleaning Log ====\n\n")

    # Log columns dropped
    log.write(f"Columns dropped (>30% missingness): {len(cols_to_drop)}\n")
    if len(cols_to_drop) > 0:
        log.write("\n".join(cols_to_drop))
        log.write("\n\n")
    else:
        log.write("None\n\n")

    # Log rows dropped
    log.write(f"Rows dropped with missing target variable ({target_col}): {rows_dropped}\n")

print(f"Dropped {len(cols_to_drop)} columns with >30% missingness.")
print(f"Dropped {rows_dropped} rows with missing target variable ({target_col}).")
print(f"Cleaning log saved to {log_path.resolve()}")

Dropped 1209 columns with >30% missingness.
Dropped 672 rows with missing target variable (LBXGH).
Cleaning log saved to C:\Users\victo\Documents\CSE 881\PROJECT\CSE881_PROJ\LOG\log_cleaning.txt


### Step 3: Selective Categorical Encoding and Dropping

Manual screening was done for more granular dropping and encoding measures
- `SHEETS/dict_data_type.xlsx`
- `TABLES/init_selection_tab.csv`

In [6]:
# Load the manually curated selection sheet and keep a CSV copy of it.
init_selection_tab = pd.read_excel('SHEETS/dict_data_type.xlsx', sheet_name='init_selection')
Path('TABLES').mkdir(parents=True, exist_ok=True)  # the folder may not exist yet
init_selection_tab.to_csv('TABLES/init_selection_tab.csv', index=False)
drop_vars = init_selection_tab.loc[init_selection_tab["IS_KEEP"] == False, "variable_name"]
cat_vars = init_selection_tab.loc[init_selection_tab["IS_CATEGORICAL"] == True, "variable_name"]

# Keep only the text before the first underscore of each sheet entry.
# NOTE(review): this assumes the variable code itself contains no underscore - confirm.
drop_vars = drop_vars.str.split('_').str[0] # Extract variable label
cat_vars = cat_vars.str.split('_').str[0] # Extract variable label
merged_base_names = merged_dropped.columns.to_series().str.split('__').str[-1] # Extract variable label

# Track variables before drop
before_drop = merged_dropped.shape[1]
print(f"Variables before drop: {before_drop}")

# Perform special selected drop
drop_vars_mask = merged_base_names.isin(drop_vars)
merged_dropped = merged_dropped.drop(
    columns = merged_dropped.columns[drop_vars_mask]
)

# Track variables after drop
after_drop = merged_dropped.shape[1]
print(f"Variables after drop: {after_drop}")
print(f"Total variables dropped: {before_drop - after_drop}")

# Perform special categorical encoding.
# Recompute the base names because the column set has just changed.
merged_base_names = merged_dropped.columns.to_series().str.split('__').str[-1]
cat_var_set = set(cat_vars)  # built once instead of once per column

# cast one column at a time to avoid the “mixed block” issue
for col, base in zip(merged_dropped.columns, merged_base_names):
    if base in cat_var_set:
        merged_dropped[col] = merged_dropped[col].astype('category')

dtypes = merged_dropped.dtypes

Variables before drop: 421
Variables after drop: 258
Total variables dropped: 163


## Rename for Clarity

In [7]:
# Root folder that holds every codebook
codebook_root = Path("RAW/CODEBOOKS")


def clean_text(s):
    """Return ``s`` as an identifier-safe string.

    Missing values become ``""``. Otherwise the value is converted to text,
    every run of characters other than letters, digits and underscores is
    replaced by one underscore, repeated underscores are collapsed, and
    leading/trailing underscores are removed.
    """
    if pd.isna(s):
        return ""
    underscored = re.sub(r"[^A-Za-z0-9_]+", "_", str(s))
    collapsed = re.sub(r"_+", "_", underscored)
    return collapsed.strip("_")

# Collect all codebook files (sorted so the dictionary row order is reproducible)
codebook_list = []
for f in sorted(codebook_root.rglob("*_codebook.csv")):
    df = pd.read_csv(f)
    df.columns = [c.lower().replace(" ", "_") for c in df.columns]

    # Pick one source column for the variable code and one for its label.
    # Choosing a single alias avoids duplicate columns when a codebook
    # carries more than one of the accepted spellings.
    var_col = next((c for c in ("variable_name", "variable", "var") if c in df.columns), None)
    label_col = next((c for c in ("sas_label", "label") if c in df.columns), None)
    if var_col is None or label_col is None:
        continue

    data_file = f.stem.replace("_codebook", "")
    entry = pd.DataFrame({
        "variable_name": df[var_col].apply(clean_text),
        "sas_label": df[label_col].apply(clean_text),
    })
    entry["variable_label"] = entry["variable_name"] + "_" + entry["sas_label"]
    entry.insert(0, "data_file", data_file)
    codebook_list.append(entry[["data_file", "variable_name", "sas_label", "variable_label"]])

dict_path = "TABLES/dictionary.csv"
output_path = Path(dict_path)

# Combine and save
if codebook_list:
    out = pd.concat(codebook_list, ignore_index=True)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    out.to_csv(output_path, index=False)
    print(f"Saved dictionary to: {output_path.resolve()}")
    # Use the in-memory table: re-reading the CSV could turn names into
    # NaN or numbers.
    df_dict = out.copy()
else:
    print("No valid codebooks found.")
    # Fall back to a previously saved dictionary if there is one; otherwise
    # there is nothing to rename with.
    df_dict = pd.read_csv(dict_path) if output_path.exists() else None

# Mapping: {old_name: new_name}
# The old name is data_file + "__" + variable_name (same as in merged data);
# the new name keeps the file prefix and appends the cleaned label.
if df_dict is not None:
    df_dict["old_name"] = df_dict["data_file"] + "__" + df_dict["variable_name"]
    df_dict["new_name"] = df_dict["data_file"] + "__" + df_dict["variable_label"]
    rename_map = dict(zip(df_dict["old_name"], df_dict["new_name"]))
else:
    rename_map = {}

# Apply renaming (a no-op when the map is empty)
merged_dropped = merged_dropped.rename(columns=rename_map)

Saved dictionary to: C:\Users\victo\Documents\CSE 881\PROJECT\CSE881_PROJ\TABLES\dictionary.csv


### Save to .parquet

In [8]:
directory = "./PROCESSED/DATA"
os.makedirs(directory, exist_ok=True)  # no separate exists() check needed

# Build every output path from `directory` so the three files stay together.
save_path = directory + "/merged_and_dropped.parquet"
cat_cols_path = directory + "/merged_and_dropped.cat_cols.json"

# Record which columns are categorical in a JSON file next to the data.
# NOTE(review): presumably so readers of the CSV copy can restore the dtype - confirm.
cat_cols = merged_dropped.select_dtypes(include=['category']).columns.tolist()
with open(cat_cols_path, "w", encoding="utf-8") as f:
    json.dump(cat_cols, f)

merged_dropped.to_parquet(save_path, index = False)
merged_dropped.to_csv(directory + "/merged_and_dropped.csv", index = False)
print(f"Saved merged_and_dropped.parquet to {save_path}")

Saved merged_and_dropped.parquet to ./PROCESSED/DATA/merged_and_dropped.parquet
