# Load Packages and Merged Dataset

In [131]:
import pandas as pd
import re
import os
from io import StringIO
from pathlib import Path

merged = pd.read_parquet(Path("PROCESSED/DATA/merged.parquet")) # From previous step (B_MERGE_DATA)

### Step 1: Transforming Numerical Missing to True Missing

Getting all associated numerical codes associated with missing value

In [126]:
INVALID_PATTERN = re.compile(
    r"(?:refused|don['']?t\s*know|missing|blank but applicable|"
    r"(?:can|could)\s*not\s*assess|unknown|not\s*ascertained)",
    re.IGNORECASE
)

def get_invalid_codes(codebook):
    result = {}
    codebook_df = pd.read_csv(codebook)
    if 'variable_name' not in codebook_df.columns:
        return
    for var_name in codebook_df['variable_name']:
        table = codebook_df[codebook_df['variable_name'] == var_name]['table'].item()
        if type(table) is float:
            continue

        df = pd.read_html(StringIO(table))[0]
        

        if "Code or Value" in df.columns and "Value Description" in df.columns:
            # rows with invalid desc
            # regex warning handled by using non-capturing groups above
            invalid = df["Value Description"].astype(str).str.contains(INVALID_PATTERN, na=False)
            if invalid.any():
                codes = df.loc[invalid, "Code or Value"].astype(str).str.strip().tolist()
                result[var_name] = codes
    return result

# Populate "invalid_map" (variable name to invalid codes) for questionnaire_data
questionnaire_codebooks_path = Path("RAW/CODEBOOKS/questionnaire_data")
codebook_files = list(questionnaire_codebooks_path.rglob("*.csv"))

invalid_map = {}
for codebook_file in codebook_files:
    invalid_codes = get_invalid_codes(codebook_file)
    if invalid_codes:
        invalid_map.update(invalid_codes)

Saving the mapper to .csv

In [127]:
# Save the invalid map to a CSV file
df_invalid_map = pd.DataFrame([
    {'variable': var, 'invalid_code': code}
    for var, codes in invalid_map.items()
    for code in codes
])

directory = "./PROCESSED/DATA"
if not os.path.exists(directory):
    os.makedirs(directory)
save_path = directory + "/invalid_map_questionnaire_2017_2020.csv"
df_invalid_map.to_csv(save_path, index = False)
print(f"Saved invalid_map_questionnaire_2017_2020.csv to {save_path}")

Saved invalid_map_questionnaire_2017_2020.csv to ./PROCESSED/DATA/invalid_map_questionnaire_2017_2020.csv


Apply transformation

In [128]:
# read table to set 7,77,9,99 etc to NaN
# invalid_map = pd.read_csv("./PROCESSED/DATA/invalid_map_questionnaire_2017_2020.csv")
invalid_map = pd.read_csv("./PROCESSED/DATA/invalid_map_questionnaire_2017_2020.csv", dtype=str)
invalid_dict = invalid_map.groupby("variable")["invalid_code"].apply(list).to_dict()
 
# loop and replace invalid (don't know, refused, etc.) with NaN
# for var, codes in invalid_dict.items():
#     match = [c for c in df.columns if c.startswith(var)]
#     if not match: continue
#     col = match[0]
#     df[col] = df[col].replace(codes, pd.NA)
 
for var, codes in invalid_dict.items():
    cols = [c for c in merged.columns if c == var or c.endswith(var)]
    # print(cols)
    if not cols: 
        continue
    # match "7"/"77"/"." and also 7/77 (numbers)
    codes_mixed = set(codes) | {int(c) for c in codes if c.isdigit()}
    merged[cols] = merged[cols].mask(merged[cols].isin(codes_mixed), pd.NA)

### Step 2: Selective Categorical Encoding and Dropping

Manual screening was done for more granular dropping and encoding measures
- `SHEETS/dict_data_type.xlsx`
- `TABLES/init_selection_tab.csv`

In [174]:
init_selection_tab = pd.read_excel('SHEETS/dict_data_type.xlsx', sheet_name='init_selection')
init_selection_tab.to_csv('TABLES/init_selection_tab.csv', index=False)
drop_vars = init_selection_tab.loc[init_selection_tab["IS_KEEP"] == False, "variable_name"]
cat_vars = init_selection_tab.loc[init_selection_tab["IS_CATEGORICAL"] == True, "variable_name"]

drop_vars = drop_vars.str.split('_').str[0] # Extract variable label
cat_vars = cat_vars.str.split('_').str[0] # Extract variable labelcat_vars
merged_base_names = merged.columns.to_series().str.split('__').str[-1] # Extract variable label

# Perform special selected drop
drop_vars_mask = merged_base_names.isin(drop_vars)
merged_dropped = merged.drop(
    columns = merged.columns[drop_vars_mask]
)

# Perform special categorical encoding
merged_base_names = merged_dropped.columns.to_series().str.split('__').str[-1]
merged_base_names = merged_base_names.reset_index(drop = True)
for var in cat_vars:
    if var in merged_base_names.values:
        idx = merged_base_names.index[merged_base_names.values == var]
        column_name = merged_dropped.columns[idx]
        merged_dropped[column_name] = merged_dropped[column_name].astype("category")

### Step 3: Remove the columns with more than 30% missingness  

This is a rudamentary drop. Our intention is to make a more granular pass later.

In [175]:
# Step 2a - Remove columns with more than 30% missingness
num_rows = merged.shape[0]
threshold = 0.3 * num_rows

cols_to_drop = merged.columns[merged.isnull().sum() > threshold]
merged_dropped = merged.drop(columns=cols_to_drop)

# Step 2b - Remove rows with no target variable
target_col = 'P_GHB__LBXGH'
before_rows = merged_dropped.shape[0]
merged_dropped = merged_dropped.dropna(subset=[target_col])
after_rows = merged_dropped.shape[0]
rows_dropped = before_rows - after_rows

# Logging the cleaning steps
log_path = Path("LOG/log_cleaning.txt")
with open(log_path, "w") as log:
    log.write("==== Data Cleaning Log ====\n\n")

    # Log columns dropped
    log.write(f"Columns dropped (>30% missingness): {len(cols_to_drop)}\n")
    if len(cols_to_drop) > 0:
        log.write("\n".join(cols_to_drop))
        log.write("\n\n")
    else:
        log.write("None\n\n")

    # Log rows dropped
    log.write(f"Rows dropped with missing target variable ({target_col}): {rows_dropped}\n")

print(f"Dropped {len(cols_to_drop)} columns with >30% missingness.")
print(f"Dropped {rows_dropped} rows with missing target variable ({target_col}).")
print(f"Cleaning log saved to {log_path.resolve()}")

Dropped 1107 columns with >30% missingness.
Dropped 672 rows with missing target variable (P_GHB__LBXGH).
Cleaning log saved to C:\HW\LIFEBOAT\POOL\CSE881\BLEED_FROM_THE_THROAT\CSE881_PROJ\LOG\log_cleaning.txt


### Save to .parquet

In [176]:
directory = "./PROCESSED/DATA"
if not os.path.exists(directory):
    os.makedirs(directory)

save_path = directory + "/merged_and_dropped.parquet"

merged.to_parquet(save_path, index = False)
print(f"Saved merged_and_dropped.parquet to {save_path}")

Saved merged_and_dropped.parquet to ./PROCESSED/DATA/merged_and_dropped.parquet
