In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import load_dataset
from typing import List, Tuple, Any, Dict, Optional

In [3]:
# Pull the train split of the selected raw NHANES attributes from the
# Hugging Face Hub and materialise it as a pandas DataFrame.
dataset = load_dataset(
    path="rtweera/nhanes-dataset-selected-raw-attributes-v3",
    split="train",
)
df = dataset.to_pandas()

# (rows, columns) of the raw frame, shown as the cell output.
df.shape

README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

nhanes-selected-attributes-raw-v3.parque(…):   0%|          | 0.00/11.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/101316 [00:00<?, ? examples/s]

(101316, 375)

In [4]:
def str_to_list(string: str) -> list:
    """
    Turn a multi-line text block into a list of its non-empty, stripped lines.

    Intended for pasting column names one per line inside a triple-quoted
    string. Leading/trailing whitespace on every line is removed and lines
    that are empty after stripping are skipped, so blank separator lines or
    an empty input never produce '' entries.

    Parameters:
    -----------
    string : str
        Text with one item per line.

    Returns:
    --------
    list
        The stripped, non-empty lines in their original order
        (an empty list when the input holds no items).
    """
    stripped_lines = (line.strip() for line in string.split("\n"))
    # Drop empty entries: a '' item would later be treated as a column name.
    return [line for line in stripped_lines if line]
    

In [5]:
# Maps each aggregated feature name to the questionnaire columns that are
# combined into it (see reduce_nominal_dimensions, which counts matching
# values across the listed columns).
nominal_attr_groups = {
    "CHOL_LIFESTYLE_CHANGES": [
        "BPD110A__questionnaire",
        "BPD110B__questionnaire",
        "BPD110C__questionnaire",
        "BPD120__questionnaire",
        "BPD130__questionnaire",
        "BPD140__questionnaire",
    ],
    "HTN_LIFESTYLE_CHANGES_TOLD": [
        "BPQ040B__questionnaire",
        "BPQ040C__questionnaire",
        "BPQ040D__questionnaire",
        "BPQ040E__questionnaire",
        "BPQ040F__questionnaire",
        "BPQ043A__questionnaire",
        "BPQ043B__questionnaire",
        "BPQ043C__questionnaire",
        "BPQ043D__questionnaire",
    ],
    "HTN_LIFESTYLE_CHANGES_NOW": [
        "BPQ050B__questionnaire",
        "BPQ050C__questionnaire",
        "BPQ050D__questionnaire",
        "BPQ050E__questionnaire",
    ],
    "CHOL_LIFESTYLE_CHANGES_TOLD": [
        "BPQ090A__questionnaire",
        "BPQ090B__questionnaire",
        "BPQ090C__questionnaire",
    ],
    "CHOL_LIFESTYLE_CHANGES_NOW": [
        "BPQ100A__questionnaire",
        "BPQ100B__questionnaire",
        "BPQ100C__questionnaire",
    ],
    "LEG_PAIN": [
        "DIQ140__questionnaire",
        "DIQ150__questionnaire",
    ],
    "PREDIAB_RISK": [
        "DIQ160__questionnaire",
        "DIQ170__questionnaire",
        "DIQ172__questionnaire",
    ],
    "PREDIAB_RISK_REASON": [
        "DIQ175A__questionnaire",
        "DIQ175B__questionnaire",
        "DIQ175C__questionnaire",
        "DIQ175D__questionnaire",
        "DIQ175E__questionnaire",
        "DIQ175F__questionnaire",
        "DIQ175G__questionnaire",
        "DIQ175H__questionnaire",
        "DIQ175I__questionnaire",
        "DIQ175J__questionnaire",
        "DIQ175K__questionnaire",
        "DIQ175L__questionnaire",
        "DIQ175M__questionnaire",
        "DIQ175N__questionnaire",
        "DIQ175O__questionnaire",
        "DIQ175P__questionnaire",
        "DIQ175Q__questionnaire",
        "DIQ175R__questionnaire",
        "DIQ175S__questionnaire",
        "DIQ175T__questionnaire",
        "DIQ175U__questionnaire",
        "DIQ175V__questionnaire",
        "DIQ175W__questionnaire",
        "DIQ175X__questionnaire",
    ],
    "DIAB_FAMILY_HISTORY_INDEX": [
        "HAC5A1__questionnaire",
        "HAC5A10__questionnaire",
        "HAC5A11__questionnaire",
        "HAC5A12__questionnaire",
        "HAC5A2__questionnaire",
        "HAC5A3__questionnaire",
        "HAC5A4__questionnaire",
        "HAC5A5__questionnaire",
        "HAC5A6__questionnaire",
        "HAC5A7__questionnaire",
        "HAC5A8__questionnaire",
        "HAC5A9__questionnaire",
    ],
    "HBP_LIFESTYLE_CHANGES_TOLD": [
        "HAE4B__questionnaire",
        "HAE4C__questionnaire",
        "HAE4D1__questionnaire",
        "HAE4D2__questionnaire",
        "HAE4D3__questionnaire",
        "HAE4D4__questionnaire",
        "HAE4D5__questionnaire",
        "HAE4D6__questionnaire",
    ],
    "HBP_LIFESTYLE_CHANGES_NOW": [
        "HAE5B__questionnaire",
        "HAE5C__questionnaire",
        "HAE5D1__questionnaire",
        "HAE5D2__questionnaire",
        "HAE5D3__questionnaire",
        "HAE5D4__questionnaire",
        "HAE5D5__questionnaire",
        "HAE5D6__questionnaire",
    ],
    "HBC_LIFESTYLE_CHANGES_TOLD": [
        "HAE8A__questionnaire",
        "HAE8B__questionnaire",
        "HAE8C__questionnaire",
    ],
    "HBC_LIFESTYLE_CHANGES_NOW": [
        "HAE9A__questionnaire",
        "HAE9B__questionnaire",
        "HAE9C__questionnaire",
    ],
    "DIET_CHANGE_REASON": [
        "HAM15A__questionnaire",
        "HAM15B__questionnaire",
        "HAM15C__questionnaire",
        "HAM15D__questionnaire",
        "HAM15Y__questionnaire",
    ],
    "CURRENT_TOBACCO_USE": [
        "HAR16__questionnaire",
        "HAR24__questionnaire",
        "HAR27__questionnaire",
        "HAR3__questionnaire",
    ],
}


In [6]:
# Display the mapping to eyeball that every group parsed into clean column names.
nominal_attr_groups

{'CHOL_LIFESTYLE_CHANGES': ['BPD110A__questionnaire',
  'BPD110B__questionnaire',
  'BPD110C__questionnaire',
  'BPD120__questionnaire',
  'BPD130__questionnaire',
  'BPD140__questionnaire'],
 'HTN_LIFESTYLE_CHANGES_TOLD': ['BPQ040B__questionnaire',
  'BPQ040C__questionnaire',
  'BPQ040D__questionnaire',
  'BPQ040E__questionnaire',
  'BPQ040F__questionnaire',
  'BPQ043A__questionnaire',
  'BPQ043B__questionnaire',
  'BPQ043C__questionnaire',
  'BPQ043D__questionnaire'],
 'HTN_LIFESTYLE_CHANGES_NOW': ['BPQ050B__questionnaire',
  'BPQ050C__questionnaire',
  'BPQ050D__questionnaire',
  'BPQ050E__questionnaire'],
 'CHOL_LIFESTYLE_CHANGES_TOLD': ['BPQ090A__questionnaire',
  'BPQ090B__questionnaire',
  'BPQ090C__questionnaire'],
 'CHOL_LIFESTYLE_CHANGES_NOW': ['BPQ100A__questionnaire',
  'BPQ100B__questionnaire',
  'BPQ100C__questionnaire'],
 'LEG_PAIN': ['DIQ140__questionnaire', 'DIQ150__questionnaire'],
 'PREDIAB_RISK': ['DIQ160__questionnaire',
  'DIQ170__questionnaire',
  'DIQ172__questi

In [7]:
def reduce_nominal_dimensions(
    df: pd.DataFrame,
    attr_groups: Dict[str, List[str]],
    count_value: int = 1,
    max_count: Optional[int] = None,
    drop_original: bool = True
) -> pd.DataFrame:
    """
    Reduce dimensionality by combining nominal attributes into count columns.

    For every group, a new column is created holding, per row, the number of
    group columns whose value equals ``count_value``. Missing values never
    match, so a row with no answers at all gets a count of 0.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe (not modified; a copy is returned)
    attr_groups : Dict[str, List[str]]
        Dictionary mapping new column names to lists of original columns to combine.
        A column listed more than once within a group is counted once.
    count_value : int, default=1
        The value to count in the original columns (e.g., 1 means count occurrences of 1)
    max_count : Optional[int], default=None
        Maximum value to cap the count at. If None, no capping is applied
    drop_original : bool, default=True
        Whether to drop the original columns after creating new ones

    Returns:
    --------
    pd.DataFrame
        DataFrame with new combined columns

    Raises:
    -------
    ValueError
        If any column listed in a group is absent from ``df``. The message
        names the group and the missing columns.

    Examples:
    ---------
    >>> # Count occurrences of 1, cap at 5, drop original columns
    >>> df_reduced = reduce_nominal_dimensions(
    ...     df,
    ...     nominal_attr_groups,
    ...     count_value=1,
    ...     max_count=5,
    ...     drop_original=True
    ... )

    >>> # Count occurrences of 2, no cap, keep original columns
    >>> df_reduced = reduce_nominal_dimensions(
    ...     df,
    ...     nominal_attr_groups,
    ...     count_value=2,
    ...     max_count=None,
    ...     drop_original=False
    ... )
    """
    df_result = df.copy()
    columns_to_drop: List[str] = []

    for new_col_name, original_cols in attr_groups.items():
        # De-duplicate while keeping order: selecting the same column twice
        # would count its matches twice.
        group_cols = list(dict.fromkeys(original_cols))

        # Every listed column is required; fail loudly and say which are absent.
        missing_cols = [col for col in group_cols if col not in df.columns]
        if missing_cols:
            raise ValueError(
                f"Cannot build '{new_col_name}' in nominal feature reduction: "
                f"columns missing from dataframe: {missing_cols}"
            )

        # Row-wise number of group columns equal to count_value
        # (NaN compares unequal, so missing answers contribute 0).
        count_series = df[group_cols].eq(count_value).sum(axis=1)

        # Apply max_count cap if specified
        if max_count is not None:
            count_series = count_series.clip(upper=max_count)

        df_result[new_col_name] = count_series
        columns_to_drop.extend(group_cols)

    if drop_original:
        # Never drop a freshly created count column, even if its name
        # coincides with one of the source columns.
        droppable = [
            col for col in dict.fromkeys(columns_to_drop) if col not in attr_groups
        ]
        df_result = df_result.drop(columns=droppable, errors='ignore')

    return df_result

# EXAMPLE USAGE
# Apply to your dataframe
# df_reduced = reduce_nominal_dimensions(
#     df=your_dataframe,
#     attr_groups=nominal_attr_groups,
#     count_value=1,        # Count occurrences of 1 (that is 'yes' values)
#     max_count=5,          # Cap at 5 (or None for no cap)
#     drop_original=True    # Drop original columns
# )

In [8]:
def reduce_dietary_features(
    df: pd.DataFrame,
    drop_original: bool = True
) -> pd.DataFrame:
    """
    Collapse per-food intake-frequency columns into a few group totals,
    then derive ratio and composite-score features from those totals.

    Each group total is the row-wise sum of whichever of its source columns
    are present in ``df``; a group with no source column present is skipped
    silently. Within a row, missing values are skipped by the sum, but a row
    in which every source value is missing yields NaN (``min_count=1``).

    Derived columns (each created only when its inputs exist):
      - REFINED_TO_WHOLE_GRAIN_RATIO = REFINED_CARBS_FREQ / (WHOLE_GRAINS_FREQ + 1)
      - SUGAR_TO_FRUIT_RATIO         = ADDED_SUGARS_FREQ / (FRUIT_INTAKE_FREQ + 1)
      - UNHEALTHY_DIET_SCORE         = sum of the available "unhealthy" totals
      - HEALTHY_DIET_SCORE           = sum of the available "healthy" totals
      - DIET_QUALITY_RATIO           = HEALTHY_DIET_SCORE / (UNHEALTHY_DIET_SCORE + 1)

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe with detailed food frequency columns (not modified)
    drop_original : bool, default=True
        Whether to remove the source columns that were folded into a total

    Returns:
    --------
    pd.DataFrame
        Copy of ``df`` with the aggregated and derived dietary features appended
    """
    # Aggregated feature -> source food-frequency columns (author's groupings).
    group_members = {
        # High glycemic load carbohydrates
        'REFINED_CARBS_FREQ': [
            'HAN5GS__questionnaire',
            'HAN6AS__questionnaire',
            'HAN4DS__questionnaire',
            'HAN5KS__questionnaire',
            'HAN5JS__questionnaire',
            'HAN2BS__questionnaire',
        ],
        # Whole grains and complex carbs
        'WHOLE_GRAINS_FREQ': [
            'HAN5HS__questionnaire',
            'HAN5IS__questionnaire',
        ],
        # Added sugars and sweetened beverages
        'ADDED_SUGARS_FREQ': [
            'HAN6ES__questionnaire',
            'HAN6CS__questionnaire',
            'HAN6BS__questionnaire',
            'HAN1AS__questionnaire',
            'HAN1FS__questionnaire',
        ],
        # Fruits and natural fruit sources
        'FRUIT_INTAKE_FREQ': [
            'HAN3AS__questionnaire',
            'HAN3BS__questionnaire',
            'HAN3DS__questionnaire',
            'HAN3ES__questionnaire',
            'HAN3FS__questionnaire',
            'HAN4ES__questionnaire',
        ],
        # Dairy products
        'DAIRY_FREQ': [
            'HAN1BS__questionnaire',
            'HAN1GS__questionnaire',
            'HAN1IS__questionnaire',
            'HAN1HS__questionnaire',
        ],
        # Alcohol consumption
        'ALCOHOL_FREQ': [
            'HAN6HS__questionnaire',
            'HAN6IS__questionnaire',
            'HAN6JS__questionnaire',
        ],
        # Saturated fats
        'SATURATED_FAT_FREQ': ['HAN7BS__questionnaire'],
        # Diet/low-calorie beverages
        'DIET_BEVERAGES_FREQ': ['HAN6DS__questionnaire'],
    }

    result = df.copy()
    folded_sources = []

    # Group totals: sum whatever source columns the input actually has.
    for feature_name, members in group_members.items():
        present = [name for name in members if name in df.columns]
        if not present:
            continue
        result[feature_name] = df[present].sum(axis=1, min_count=1)
        folded_sources += present

    # Ratios use "denominator + 1" so a zero denominator cannot divide by zero.
    ratio_specs = (
        ('REFINED_TO_WHOLE_GRAIN_RATIO', 'REFINED_CARBS_FREQ', 'WHOLE_GRAINS_FREQ'),
        ('SUGAR_TO_FRUIT_RATIO', 'ADDED_SUGARS_FREQ', 'FRUIT_INTAKE_FREQ'),
    )
    for ratio_name, numerator, denominator in ratio_specs:
        if numerator in result.columns and denominator in result.columns:
            result[ratio_name] = result[numerator] / (result[denominator] + 1)

    # Composite scores: sum of whichever component totals are available.
    # NOTE(review): no group above produces 'VEGETABLE_DISHES_FREQ', so it only
    # contributes when the input frame already carries a column of that name.
    score_specs = (
        ('UNHEALTHY_DIET_SCORE',
         ['REFINED_CARBS_FREQ', 'ADDED_SUGARS_FREQ',
          'SATURATED_FAT_FREQ', 'ALCOHOL_FREQ']),
        ('HEALTHY_DIET_SCORE',
         ['WHOLE_GRAINS_FREQ', 'FRUIT_INTAKE_FREQ', 'VEGETABLE_DISHES_FREQ']),
    )
    for score_name, components in score_specs:
        available = [name for name in components if name in result.columns]
        if available:
            result[score_name] = result[available].sum(axis=1, min_count=1)

    # Overall quality: healthy relative to unhealthy, same +1 guard as above.
    if {'HEALTHY_DIET_SCORE', 'UNHEALTHY_DIET_SCORE'}.issubset(result.columns):
        result['DIET_QUALITY_RATIO'] = (
            result['HEALTHY_DIET_SCORE'] / (result['UNHEALTHY_DIET_SCORE'] + 1)
        )

    if not drop_original:
        return result

    return result.drop(columns=list(set(folded_sources)), errors='ignore')

# EXAMPLE USAGE
# # Reduce dietary features
# df_reduced = reduce_dietary_features(df_sample, drop_original=False)

In [19]:
def _weighted_activity_total(df: pd.DataFrame, weights: Dict[str, float]) -> pd.Series:
    """Row-wise weighted sum of the given columns; NaN where every input is NaN."""
    columns = list(weights)
    total = pd.Series(0.0, index=df.index)
    for col in columns:
        # A missing entry contributes 0 so one unanswered item does not void the row.
        total = total + df[col].fillna(0) * weights[col]
    # Rows with no data at all stay missing instead of reading as "no activity".
    has_any_data = df[columns].notna().any(axis=1)
    return total.where(has_any_data)


def reduce_activity_features(
    df: pd.DataFrame,
    drop_original: bool = True,
    vigorous_weight: int = 2,
    moderate_weight: int = 1
) -> pd.DataFrame:
    """
    Combine the physical-activity columns into two weighted aggregate features.

    Two columns are appended to a copy of ``df``:

    - ``TOTAL_WEEKLY_ACTIVITY_TIME``:
      ``vigorous_weight * PAQ650 + moderate_weight * PAQ665``
    - ``TOTAL_DAILY_ACTIVITY_SCORE``:
      ``vigorous_weight * (PAD615 + PAD660)
      + moderate_weight * (PAD630 + PAD645 + PAD675)``

    Within a row, missing inputs count as 0. If *all* inputs of an aggregate
    are missing for a row, that aggregate is NaN, so "no data" is not confused
    with "no activity". The same rule is applied to both aggregates.

    NOTE(review): the public NHANES codebook appears to define PAQ650/PAQ665
    as yes/no items (1 = yes, 2 = no) rather than durations, and the PAD*
    items may carry "refused"/"don't know" sentinel codes. Confirm the input
    coding before interpreting these aggregates as activity time.

    Parameters:
    -----------
    df : pd.DataFrame
        Input dataframe containing the physical activity columns (not modified).
    drop_original : bool, default=True
        Whether to drop the seven source activity columns from the result.
    vigorous_weight : int, default=2
        Multiplier applied to the vigorous-activity columns.
    moderate_weight : int, default=1
        Multiplier applied to the moderate-activity columns.

    Returns:
    --------
    pd.DataFrame
        Copy of ``df`` with 'TOTAL_WEEKLY_ACTIVITY_TIME' and
        'TOTAL_DAILY_ACTIVITY_SCORE' appended (in that order).

    Raises:
    -------
    ValueError
        If any of the seven required activity columns is absent from ``df``.
    """
    vigorous_col = 'PAQ650__questionnaire'
    moderate_col = 'PAQ665__questionnaire'

    vigorous_detail_cols = [
        'PAD615__questionnaire',  # Vigorous at work
        'PAD660__questionnaire',  # Vigorous recreation
    ]
    moderate_detail_cols = [
        'PAD630__questionnaire',  # Moderate at work
        'PAD645__questionnaire',  # Walking/bicycling for travel
        'PAD675__questionnaire',  # Moderate recreation
    ]

    activity_cols = [vigorous_col, moderate_col,
                     *vigorous_detail_cols, *moderate_detail_cols]

    # All seven columns are required; report exactly which ones are absent.
    missing_cols = [col for col in activity_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns for activity aggregation: {missing_cols}")

    df_result = df.copy()

    df_result['TOTAL_WEEKLY_ACTIVITY_TIME'] = _weighted_activity_total(
        df,
        {vigorous_col: vigorous_weight, moderate_col: moderate_weight},
    )

    # Vigorous columns first, then moderate ones (dicts keep insertion order).
    daily_weights = {col: vigorous_weight for col in vigorous_detail_cols}
    daily_weights.update({col: moderate_weight for col in moderate_detail_cols})
    df_result['TOTAL_DAILY_ACTIVITY_SCORE'] = _weighted_activity_total(df, daily_weights)

    if drop_original:
        df_result = df_result.drop(columns=activity_cols, errors='raise')

    return df_result

In [20]:
def preprocess_pipeline(df: pd.DataFrame)->pd.DataFrame:
    """
    Run the three feature-reduction stages in order and return the result.

    Stages:
      1. reduce_nominal_dimensions - per-group counts of value 1 over the
         columns in ``nominal_attr_groups``, capped at 5.
      2. reduce_dietary_features   - aggregated food-frequency features.
      3. reduce_activity_features  - weighted physical-activity aggregates.

    Every stage drops the source columns it consumed.

    Parameters:
    -----------
    df : pd.DataFrame
        Raw input dataframe.

    Returns:
    --------
    pd.DataFrame
        The feature-engineered dataframe.
    """
    return (
        df
        .pipe(
            reduce_nominal_dimensions,
            attr_groups=nominal_attr_groups,
            count_value=1,       # 1 is the 'yes' code
            max_count=5,         # cap each count at 5
            drop_original=True,
        )
        .pipe(reduce_dietary_features, drop_original=True)
        .pipe(reduce_activity_features, drop_original=True)
    )

In [21]:
# Run the full feature-reduction pipeline on the raw frame, then show the
# resulting (rows, columns) to see how many columns remain.
reduced_df = preprocess_pipeline(df)
reduced_df.shape

(101316, 273)

In [22]:
# Persist the engineered features as a Parquet file in the working directory.
reduced_df.to_parquet("nhanes-feature-engineered.parquet")

In [23]:
# Also export a CSV copy. index=False keeps the row index out of the file:
# the frame presumably still carries the default positional index from
# to_pandas(), which would otherwise be written as an extra unnamed first
# column and come back as "Unnamed: 0" on read.
reduced_df.to_csv("nhanes-feature-engineered.csv", index=False)

In [30]:

# Percentage of rows with no value for the aggregated refined-carb feature.

# 1. Total number of rows
total_count = len(reduced_df["REFINED_CARBS_FREQ"])

# 2. Number of rows where the feature is missing
missing_count = reduced_df["REFINED_CARBS_FREQ"].isna().sum()

100 * (missing_count / total_count)

90.17035808756762