In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
def xpt_to_df(file_path, encoding='utf-8'):
    """Load a SAS transport (XPORT / ``.XPT``) file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the ``.XPT`` file; handed unchanged to ``pandas.read_sas``.
    encoding : str, default 'utf-8'
        Codec used to decode text columns. The default reproduces the
        previously hard-coded behaviour; pass a different codec (for
        example ``'latin-1'``) for files whose text is not valid UTF-8.

    Returns
    -------
    pandas.DataFrame
        Whatever ``pandas.read_sas`` returns for the file.
    """
    # The format is stated explicitly rather than left for pandas to infer.
    return pd.read_sas(file_path, format='xport', encoding=encoding)

In [3]:
# Lookup from NHANES variable codes to descriptive snake_case names, built up
# one section at a time. The commented-out rename in the data-loading cell
# passes it to DataFrame.rename(columns=...).
# NOTE(review): the weight keys use the WT*2YR naming; this notebook loads
# P_-prefixed files, so confirm every code here matches the column names those
# files actually contain.
nhanes_variable_mapping = {}

# --- Demographics and status ---
nhanes_variable_mapping.update({
    'SEQN': 'respondent_sequence_number',
    'SDDSRVYR': 'data_release_cycle',
    'RIDSTATR': 'interview_examination_status',
    'RIAGENDR': 'gender',
    'RIDAGEYR': 'age_years_screening',
    'RIDAGEMN': 'age_months_screening_0_to_24',
    'RIDRETH1': 'race_hispanic_origin',
    'RIDRETH3': 'race_hispanic_origin_with_asian',
    'RIDEXMON': 'six_month_exam_period',
    'RIDEXAGM': 'age_months_exam_0_to_19_years',
    'DMQMILIZ': 'served_active_duty_armed_forces',
    'DMDBORN4': 'country_of_birth',
    'DMDYRUSR': 'length_time_in_us',
    'DMDEDUC2': 'education_level_adults_20_plus',
    'DMDMARTZ': 'marital_status',
    'RIDEXPRG': 'pregnancy_status_at_exam',
    'DMDHHSIZ': 'total_people_in_household',
    'DMDHRGND': 'household_ref_person_gender',
    'DMDHRAGZ': 'household_ref_person_age',
    'DMDHREDZ': 'household_ref_person_education',
    'DMDHRMAZ': 'household_ref_person_marital_status',
    'DMDHSEDZ': 'household_ref_person_spouse_education',
    'WTINT2YR': 'full_sample_2year_interview_weight',
    'WTMEC2YR': 'full_sample_2year_mec_exam_weight',
    'SDMVSTRA': 'masked_variance_pseudo_stratum',
    'SDMVPSU': 'masked_variance_pseudo_psu',
    'INDFMPIR': 'ratio_family_income_to_poverty',
})

# --- Body measurements (each measurement is followed by its comment code) ---
nhanes_variable_mapping.update({
    'BMDSTATS': 'body_measures_component_status',
    'BMXWT': 'weight_kg',
    'BMIWT': 'weight_comment',
    'BMXRECUM': 'recumbent_length_cm',
    'BMIRECUM': 'recumbent_length_comment',
    'BMXHEAD': 'head_circumference_cm',
    'BMIHEAD': 'head_circumference_comment',
    'BMXHT': 'standing_height_cm',
    'BMIHT': 'standing_height_comment',
    'BMXBMI': 'body_mass_index',
    'BMDBMIC': 'bmi_category_children_youth',
    'BMXLEG': 'upper_leg_length_cm',
    'BMILEG': 'upper_leg_length_comment',
    'BMXARML': 'upper_arm_length_cm',
    'BMIARML': 'upper_arm_length_comment',
    'BMXARMC': 'arm_circumference_cm',
    'BMIARMC': 'arm_circumference_comment',
    'BMXWAIST': 'waist_circumference_cm',
    'BMIWAIST': 'waist_circumference_comment',
    'BMXHIP': 'hip_circumference_cm',
    'BMIHIP': 'hip_circumference_comment',
})

In [5]:
# read data
# Each directory is named once so the notebook can be pointed at a different
# data location by editing a single line.
RAW_DIR = 'INPUTS/UNPROCESSED'
CSV_DIR = 'INPUTS/CSV'

BMX_df_raw = xpt_to_df(f'{RAW_DIR}/P_BMX.XPT')
DEMO_df = xpt_to_df(f'{RAW_DIR}/P_DEMO.XPT')
P_RXQ_RX_df = xpt_to_df(f'{RAW_DIR}/P_RXQ_RX.XPT')

# BMX_df = BMX_df_raw.rename(columns=nhanes_variable_mapping)
# BMX_df.head()

# Write a CSV copy of every frame exactly as it was read (no renaming, no
# index column). Each key below becomes the file name inside CSV_DIR.
os.makedirs(CSV_DIR, exist_ok=True)
frames_to_export = {
    'BMX_df': BMX_df_raw,
    'DEMO_df': DEMO_df,
    'P_RXQ_RX_df': P_RXQ_RX_df,
}
for csv_stem, frame in frames_to_export.items():
    frame.to_csv(f'{CSV_DIR}/{csv_stem}.csv', index=False)

In [None]:
# DSII = xpt_to_df('INPUTS/UNPROCESSED/DSII.XPT')

In [None]:
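# TODO: sketch of a weighted GLM call, not runnable yet:
# glm(....., weight=BMX_df['full_sample_2year_mec_exam_weight'], ...)
# NOTE(review): BMX_df is only created by the commented-out rename in the
# data-loading cell, and the weight variable is listed under the demographics
# section of nhanes_variable_mapping, so the weights presumably have to be
# merged in from DEMO_df first. Confirm before enabling.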