# Load Packages

In [1]:
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import MultiLabelBinarizer
import os

# Perform Merge onto `P_GHB.xpt` (Target File)

Merging all files together to base `P_GHB.xpt`:
- P_GHB contains our target, so this will be the base of our merging operations
- ⚠️ Skipping 9 specific files with duplicate `SEQN`:
    - dietary_data: The dietary data largely contains diet information recorded over two tracked days; this information is not particularly insightful for our case, since diabetes is a disease that develops over a long period.
        - `P_DR1IFF.xpt`
        - `P_DR2IFF.xpt`
        - `P_DS1IDS.xpt`
        - `P_DS2IDS.xpt`
        - `P_DSQIDS.xpt`
    - examination_data: These files deal with audio sensor data, which is not useful for our use case.
        - `P_AUXAR.xpt`
        - `P_AUXTYM.xpt`
        - `P_AUXWBR.xpt`
    - questionnaire_data: This data needs to undergo transformation before being usable. We first transform this dataset before merging.
        - `P_RXQ_RX.xpt`

- Skipping additional dietary data files:
    - `P_DR1TOT.xpt`,
    - `P_DR2TOT.xpt`,
    - `P_DS1TOT.xpt`,
    - `P_DS2TOT.xpt`

In [2]:
# Merge every eligible NHANES XPT file onto the P_GHB base table, keyed by SEQN.
root = Path("RAW/DATA")
# Sorted so files are processed (and their columns appended) in the same order
# on every filesystem; rglob itself gives no ordering guarantee.
xpt_files = sorted(root.rglob("*.xpt"))


def _read_xpt(path):
    """Read a SAS XPORT file, retrying with cp1252 when utf-8 decoding fails."""
    try:
        return pd.read_sas(path, format="xport", encoding="utf-8")
    except UnicodeDecodeError:
        return pd.read_sas(path, format="xport", encoding="cp1252")


P_GHB_path = 'RAW/DATA/laboratory_data/P_GHB.xpt'
P_GHB = _read_xpt(P_GHB_path)

# Base table: P_GHB indexed by a nullable-integer SEQN
base = P_GHB.copy()
base['SEQN'] = pd.to_numeric(base['SEQN']).astype('Int64')
base_idxed = base.set_index('SEQN')

used_cols = set(base_idxed.columns) # Tracking used column names for collisions

fileskip_list = [
    # Base
    'P_GHB.xpt',

    # Dietary Data
    'P_DR1IFF.xpt',
    'P_DR2IFF.xpt',
    'P_DR1TOT.xpt',
    'P_DR2TOT.xpt',
    'P_DRXFCD.xpt',
    'DSBI.xpt',
    'DSII.xpt',
    'DSPI.xpt',
    'P_DS1IDS.xpt',
    'P_DS2IDS.xpt',
    'P_DS1TOT.xpt',
    'P_DS2TOT.xpt',
    'P_DSQIDS.xpt',
    'P_DSQTOT.xpt',

    # Audiometry Sensor Data
    'P_AUXAR.xpt',
    'P_AUXTYM.xpt',
    'P_AUXWBR.xpt',

    # Medication Survey Data
    'P_RXQ_RX.xpt'
]

dfs = []  # all other dfs, already indexed by SEQN
log_path = Path("LOG/log_merge.txt")
# The log directory may not exist on a fresh checkout; open(..., "w") would fail.
log_path.parent.mkdir(parents=True, exist_ok=True)

with open(log_path, "w", encoding="utf-8") as log:
    for p in xpt_files:
        filename = p.name
        stem = p.stem
        print(f"Loading: {filename}")

        if filename in fileskip_list:
            print(f"\tSkipping {filename} (manually excluded)")
            continue

        df = _read_xpt(p)

        if 'SEQN' not in df.columns:
            print(f"\tSkipping {filename}, missing SEQN")
            continue

        df['SEQN'] = pd.to_numeric(df['SEQN']).astype('Int64')

        # Files with repeated SEQN values cannot be joined one-to-one; skip and log them.
        n_dup_seqn = int((df['SEQN'].value_counts() > 1).sum())
        if n_dup_seqn:
            msg = f"Skipping {filename}. Found {n_dup_seqn} duplicated SEQN values\n"
            print(f"\t{msg.strip()}")
            log.write(msg)
            continue

        # Prefix every column with the file stem so names stay unique across files
        df = df.set_index('SEQN').add_prefix(f"{stem}__")

        # Check for duplicate columns across files
        overlapping = [col for col in df.columns if col in used_cols]
        if overlapping:
            msg = f"{filename}: Found {len(overlapping)} duplicate columns: {overlapping}\n"
            print(f"\t{msg.strip()}")
            log.write(msg)

        used_cols.update(df.columns) # Update used set

        dfs.append(df)

# ---- One big combine ----
# Concatenate all other files horizontally (align on SEQN once).
# pd.concat raises on an empty list, so fall back to an empty frame on the base index.
if dfs:
    others = pd.concat(dfs, axis=1, copy=False)
else:
    others = pd.DataFrame(index=base_idxed.index)

# Single join to the base
merged = base_idxed.join(others, how='left')
merged = merged.reset_index()

print(f"\nMerging complete. Duplicate column log saved to {log_path.resolve()}")

Loading: P_DEMO.xpt
Loading: DSBI.xpt
	Skipping DSBI.xpt (manually excluded)
Loading: DSII.xpt
	Skipping DSII.xpt (manually excluded)
Loading: DSPI.xpt
	Skipping DSPI.xpt (manually excluded)
Loading: P_DR1IFF.xpt
	Skipping P_DR1IFF.xpt (manually excluded)
Loading: P_DR1TOT.xpt
	Skipping P_DR1TOT.xpt (manually excluded)
Loading: P_DR2IFF.xpt
	Skipping P_DR2IFF.xpt (manually excluded)
Loading: P_DR2TOT.xpt
	Skipping P_DR2TOT.xpt (manually excluded)
Loading: P_DRXFCD.xpt
	Skipping P_DRXFCD.xpt (manually excluded)
Loading: P_DS1IDS.xpt
	Skipping P_DS1IDS.xpt (manually excluded)
Loading: P_DS1TOT.xpt
	Skipping P_DS1TOT.xpt (manually excluded)
Loading: P_DS2IDS.xpt
	Skipping P_DS2IDS.xpt (manually excluded)
Loading: P_DS2TOT.xpt
	Skipping P_DS2TOT.xpt (manually excluded)
Loading: P_DSQIDS.xpt
	Skipping P_DSQIDS.xpt (manually excluded)
Loading: P_DSQTOT.xpt
	Skipping P_DSQTOT.xpt (manually excluded)
Loading: P_AUX.xpt
Loading: P_AUXAR.xpt
	Skipping P_AUXAR.xpt (manually excluded)
Loading: P_A

# Merging Medicine Data from `P_RXQ_RX.xpt`

The idea for this part is to include the kind of diabetes medication a patient might take, which might also alter their A1C. This data can be used in three ways:
* __Predicting Diabetes__  
  This variable, together with fasting glucose, A1C, and the diabetes questionnaire, is used to determine the target variable, and is then dropped to ensure no leakage.  
* __Predicting A1C (natural)__  
  If the target is to understand the natural factors that affect A1C, the rows for any treated patients might be dropped.  
* __Predicting A1C (all factor)__  
  Can be used either as a factor for more detail, or simplified to treated vs. untreated.

**Algorithm**

Step 1 - Import drug class table (manual work in excel)  
Step 2 - Lookup and split the drug combination if any  
Step 3 - From the drug list, binarize it  
Step 4 - Summarize the binarized data to handle multiple records  
Step 5 - Merge with the full data

In [3]:
# convert XPT files to pandas dataframe
def xpt_to_df(file_path):
    """Read a SAS XPORT (.xpt) file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the XPORT file.

    Returns
    -------
    pandas.DataFrame
        The file contents, with text decoded as utf-8, or as cp1252 when
        utf-8 decoding fails.
    """
    try:
        return pd.read_sas(file_path, format='xport', encoding='utf-8')
    except UnicodeDecodeError:
        # Some files hold text that is not valid utf-8; retry instead of crashing.
        return pd.read_sas(file_path, format='xport', encoding='cp1252')

Step 1 - Import drug class table (manual work in excel)  

In [4]:
# Raw prescription-medication questionnaire records.
# NOTE(review): this reads from INPUTS/UNPROCESSED while the merge cell scans
# RAW/DATA — confirm both locations hold the same P_RXQ_RX file.
P_RXQ_RX = xpt_to_df(file_path='INPUTS/UNPROCESSED/P_RXQ_RX.XPT')
# Drug name -> drug class lookup table (columns DRUG_NAME / DRUG_CLASS are used below)
DIAMED_TAB = pd.read_excel(
    io="TABLES/DIAMED_TAB.xlsx",
    sheet_name="DIAMED_TAB",
)

Step 2 - Lookup and split the drug combination if any  

In [5]:
# Lookup: upper-cased drug name -> drug class.
# Names are whitespace-trimmed as well, because find_drug_class strips each
# component before the lookup; an untrimmed key (e.g. a trailing space typed in
# Excel) could never match.
drug_dict = dict(zip(DIAMED_TAB["DRUG_NAME"].str.strip().str.upper(), DIAMED_TAB["DRUG_CLASS"]))

def find_drug_class(drug_name):
    """Return the drug classes found in a (possibly combined) drug name.

    The name is split on ';', each component is trimmed and upper-cased, and
    components present in ``drug_dict`` are mapped to their class. Components
    without a match are ignored. Non-string input (e.g. NaN) yields ``[]``.
    """
    if isinstance(drug_name, str):
        matched_classes = []
        for component in drug_name.split(';'):
            key = component.strip().upper()
            if key in drug_dict:
                matched_classes.append(drug_dict[key])
        return matched_classes
    return []

Step 3 - From the drug list, binarize it  

In [6]:
# Map each prescription record to the list of drug classes found in its drug name
P_RXQ_RX['drug_classes'] = P_RXQ_RX['RXDDRUG'].apply(find_drug_class)

# One-hot encode the list of drug classes
mlb = MultiLabelBinarizer()
encoded = mlb.fit_transform(P_RXQ_RX['drug_classes'])
encoded_df = pd.DataFrame(encoded, columns=mlb.classes_, index=P_RXQ_RX.index)

# Combine with original DataFrame.
# Indicator columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not append duplicate columns.
P_RXQ_RX = pd.concat(
    [P_RXQ_RX.drop(columns=list(mlb.classes_), errors='ignore'), encoded_df],
    axis=1,
)

Step 4 - Summarize the binarized data to handle multiple records  

In [7]:
# One row per SEQN: max over that participant's records marks a drug class as
# present (1) if any record contained it.
cols = ['SEQN'] + list(mlb.classes_)
P_RXQ_RX_summary = P_RXQ_RX[cols].groupby("SEQN", as_index=False).max()

# Make sure the output folder exists before writing; to_csv does not create it.
diamed_csv_path = Path("INPUTS/CSV/DIAMED.csv")
diamed_csv_path.parent.mkdir(parents=True, exist_ok=True)
P_RXQ_RX_summary.to_csv(diamed_csv_path, index=False)

Step 5 - Merge with the full data

In [8]:
# Attach the per-participant drug-class indicators to the merged table.
# Columns from a previous run of this cell are dropped first, so re-running it
# does not produce suffixed (_x / _y) duplicates.
drug_class_cols = [c for c in P_RXQ_RX_summary.columns if c != "SEQN"]
merged = (
    merged
    .drop(columns=drug_class_cols, errors="ignore")
    .merge(
        # Align the key dtype with merged['SEQN'], which the merge cell cast to Int64
        # (presumably SEQN is still float here, as read from the XPT — verify).
        P_RXQ_RX_summary.astype({"SEQN": "Int64"}),
        on="SEQN",
        how="left",
    )
)

# Save merged to .parquet
- Using parquet to keep data types
- Requires a parquet engine (`pyarrow` or `fastparquet`); both are now listed in requirements.txt. Make sure to restart the kernel after installing so the new pyarrow types are picked up.

In [9]:
# Write the merged table to parquet, creating the output folder if needed.
directory = "./PROCESSED/DATA"
# exist_ok=True replaces the separate exists() check and avoids the
# check-then-create race.
os.makedirs(directory, exist_ok=True)

save_path = directory + "/merged.parquet"

merged.to_parquet(save_path, index=False)
print(f"Saved merged.parquet to {save_path}")

Saved merged.parquet to ./PROCESSED/DATA/merged.parquet
