## Check and set working directory

I'm not sure this step is necessary; the Colab environment is confusing. I'm also not sure whether we need to mount the drive in every session.

In [None]:
# Check current working directory.
# The `!` prefix runs `pwd` as a shell command and shows its output in the cell.
!pwd

In [None]:
# Move working directory to ds1_nhanes folder
# The relative paths used later in this notebook ('1_raw/...', '2_clean/...')
# are resolved against this directory.
# NOTE(review): the path looks like a mounted Google Drive location — confirm
# the drive is mounted in this session before running this cell.
%cd /content/drive/MyDrive/ds1_nhanes/

## Load Libraries

In [None]:
import pandas as pd
import numpy as np

## Load Data

In [None]:
# Read the day-1 dietary recall "individual foods" file into a DataFrame,
# then print a summary of what was loaded.
dr1iff_path = '1_raw/nhanes_2021_2023/DR1IFF_L.xpt'
df = pd.read_sas(dr1iff_path)
df.info()

In [None]:
# Preview the first rows of the loaded data as a quick sanity check.
df.head()

## Create dictionary for DR1IFF

This dictionary defines each variable. We could rename the columns, but I found that renaming just makes it harder when we have to join across multiple survey waves.

In [None]:
# Build a template with one empty definition per column, then print one
# dictionary-entry line per column name. The printed lines are meant to be
# copied into the next cell and filled in by hand.
column_definitions = dict.fromkeys(df.columns, "")
for name in column_definitions.keys():
    print('"{}": "",  # '.format(name))

In [None]:
# Data dictionary for the DR1IFF (day-1 individual foods) file:
# raw NHANES variable name -> readable snake_case name.
# Nutrient names carry a unit suffix (_gm, _mg, _mcg, _kcal). Units follow the
# NHANES DR1IFF codebook; note that vitamin E, B1, B2 and B6 are reported in mg.
column_definitions = {
  # Survey / interview administration
  "SEQN": "respondent_id",
  "WTDRD1": "weight_day_1_dietary",
  "WTDR2D": "weight_day_2_dietary",
  "DR1ILINE": "food_component_number",
  "DR1DRSTZ": "dietary_recall_status",
  "DR1EXMER": "interviewer_id_code",
  "DRABF": "breast_fed_infant",
  "DRDINT": "number_of_days_intake",
  "DR1DBIH": "days_between_intake_and_interview",
  "DR1DAY": "intake_day_of_week",
  "DR1LANG": "language",
  # Eating occasion and food identification
  "DR1CCMNM": "combination_food_number",
  "DR1CCMTX": "combination_food_type",
  "DR1_020": "time_of_meal",
  "DR1_030Z": "name_of_meal",
  "DR1FS": "food_source",
  "DR1_040Z": "ate_at_home",
  "DR1IFDCD": "usda_food_code",  # Use these to link to FPED
  # Amount consumed, energy and macronutrients
  "DR1IGRMS": "grams",
  "DR1IKCAL": "energy_kcal",
  "DR1IPROT": "protein_gm",
  "DR1ICARB": "carbohydrate_gm",
  "DR1ISUGR": "sugar_gm",
  "DR1IFIBE": "fiber_gm",
  "DR1ITFAT": "fat_gm",
  "DR1ISFAT": "saturated_fat_gm",
  "DR1IMFAT": "monounsaturated_fat_gm",
  "DR1IPFAT": "polyunsaturated_fat_gm",
  "DR1ICHOL": "cholesterol_mg",
  # Vitamins and carotenoids
  "DR1IATOC": "vitamin_e_mg",
  "DR1IATOA": "added_vitamin_e_mg",
  "DR1IRET": "retinol_mcg",
  "DR1IVARA": "vitamin_a_mcg",
  "DR1IACAR": "alpha_carotene_mcg",
  "DR1IBCAR": "beta_carotene_mcg",
  "DR1ICRYP": "beta_cryptoxanthin_mcg",
  "DR1ILYCO": "lycopene_mcg",
  "DR1ILZ": "lutein_zeaxanthin_mcg",
  "DR1IVB1": "vitamin_b1_mg",
  "DR1IVB2": "vitamin_b2_mg",
  "DR1INIAC": "niacin_mg",
  "DR1IVB6": "vitamin_b6_mg",
  "DR1IFOLA": "folate_mcg",
  "DR1IFA": "folic_acid_mcg",
  "DR1IFF": "food_folate_mcg",
  "DR1IFDFE": "folate_dfe_mcg",
  "DR1ICHL": "choline_mg",
  "DR1IVB12": "vitamin_b12_mcg",
  "DR1IB12A": "added_vitamin_b12_mcg",
  "DR1IVC": "vitamin_c_mg",
  "DR1IVD": "vitamin_d_mcg",
  "DR1IVK": "vitamin_k_mcg",
  # Minerals
  "DR1ICALC": "calcium_mg",
  "DR1IPHOS": "phosphorus_mg",
  "DR1IMAGN": "magnesium_mg",
  "DR1IIRON": "iron_mg",
  "DR1IZINC": "zinc_mg",
  "DR1ICOPP": "copper_mg",
  "DR1ISODI": "sodium_mg",
  "DR1IPOTA": "potassium_mg",
  "DR1ISELE": "selenium_mcg",
  # Other components
  "DR1ICAFF": "caffeine_mg",
  "DR1ITHEO": "theobromine_mg",
  "DR1IALCO": "alcohol_gm",
  "DR1IMOIS": "moisture_gm",
  # Individual saturated fatty acids (SFA carbon:double-bond, e.g. 4:0)
  "DR1IS040": "sfa_40_gm",
  "DR1IS060": "sfa_60_gm",
  "DR1IS080": "sfa_80_gm",
  "DR1IS100": "sfa_100_gm",
  "DR1IS120": "sfa_120_gm",
  "DR1IS140": "sfa_140_gm",
  "DR1IS160": "sfa_160_gm",
  "DR1IS180": "sfa_180_gm",
  # Individual monounsaturated fatty acids
  "DR1IM161": "mfa_161_gm",
  "DR1IM181": "mfa_181_gm",
  "DR1IM201": "mfa_201_gm",
  "DR1IM221": "mfa_221_gm",
  # Individual polyunsaturated fatty acids
  "DR1IP182": "pfa_182_gm",
  "DR1IP183": "pfa_183_gm",
  "DR1IP184": "pfa_184_gm",
  "DR1IP204": "pfa_204_gm",
  "DR1IP205": "pfa_205_gm",
  "DR1IP225": "pfa_225_gm",
  "DR1IP226": "pfa_226_gm"
}

In [None]:
# Show the column names in file order; the long-name list built below is
# paired with these names by position.
df.columns

In [None]:
# Make a df with column names and definitions
long_names = [
    'respondent_sequence_number', # respondent id
    'weight_day_1_dietary',
    'weight_day_2_dietary',
    'food_component_number',
    'dietary_recall_status',
    'interviewer_id_code',
    'breast_fed_infant',
    'number_of_days_of_intake',
    'days_between_intake_and_interview',
    'intake_day_of_week',

    'language',
    'combination_food_number',
    'combination_food_type',
    'time_of_meal',
    'name_of_meal',
    'food_source',
    'ate_at_home',
    'usda_food_code', # Use these to link to FPED
    'grams',
    'energy_kcal',

    'protein_gm',
    'carbohydrate_gm',
    'sugar_gm',
    'fiber_gm',
    'fat_gm',
    'saturated_fat_gm',
    'monounsaturated_fat_gm',
    'polyunsaturated_fat_gm',
    'cholesterol_mg',
    'vitamin_e_mcg',

    'added_vitamin_e_mcg',
    'retinol_mcg',
    'vitamin_a_mcg',
    'alpha_carotene_mcg',
    'beta_carotene_mcg',
    'beta_cryptoxanthin_mcg',
    'lycopene_mcg',
    'lutein_zeaxanthin_mcg',
    'vitamin_b1_mcg',
    'vitamin_b2_mcg',

    'niacin_mg',
    'vitamin_b6_mcg',
    'folate_mcg',
    'folic_acid_mcg',
    'food_folate_mcg',
    'folate_dfe_mcg',
    'choline_mg',
    'vitamin_b12_mcg',
    'added_vitamin_b12_mcg',
    'vitamin_c_mg',

    'vitamin_d_mcg',
    'vitamin_k_mcg',
    'calcium_mg',
    'phosphorus_mg',
    'magnesium_mg',
    'iron_mg',
    'zinc_mg',
    'copper_mg',
    'sodium_mg',
    'potassium_mg',

    'selenium_mcg',
    'caffeine_mg',
    'theobromine_mg',
    'alcohol_gm',
    'moisture_gm',
    'sfa_40_gm',
    'sfa_60_gm',
    'sfa_80_gm',
    'sfa_100_gm',
    'sfa_120_gm',

    'sfa_140_gm',
    'sfa_160_gm',
    'sfa_180_gm',
    'mfa_161_gm',
    'mfa_181_gm',
    'mfa_201_gm',
    'mfa_221_gm',
    'pfa_182_gm',
    'pfa_183_gm',
    'pfa_184_gm',

    'pfa_204_gm',
    'pfa_205_gm',
    'pfa_225_gm',
    'pfa_226_gm'
]
print(long_names)

In [None]:
# Pair each raw variable name with its long name (matched by position) and
# preview the resulting lookup table.
definitions = pd.DataFrame(
    data={
        'variable_name': df.columns,
        'definition': long_names,
    }
)
definitions.head()

In [84]:
# Write the variable-definition table to the 2_clean folder as a CSV file.
definitions_csv = '2_clean/dr1_definitions_2021.csv'
definitions.to_csv(definitions_csv)

## Explore Dietary Recall Data

In [None]:
# Print a summary of the recall DataFrame as a starting point for exploration.
df.info()

In [None]:
# Compare the table size with the number of distinct respondents.
# The first printed line is (rows, columns); the second is the count of
# unique SEQN values.
n_rows, n_cols = df.shape
n_respondents = df['SEQN'].nunique()
print((n_rows, n_cols))
print(n_respondents)

There are far more rows than unique respondents. This is because for each respondent, there is one row for each individual food they consumed.

In [None]:
# Count how many distinct food codes appear in the recall data.
n_food_codes = df['DR1IFDCD'].nunique()
print(n_food_codes)

## Join with FPED

To identify the foods from their food codes, we join with the Food Patterns Equivalents Database (FPED). This also links each food to the ~39 USDA food categories that correspond to national dietary recommendations. FPED also converts from sensible units like grams to American units like cups.