# 1. Processing of `FoodData_Central_survey`

Link to the dataset: https://fdc.nal.usda.gov/download-datasets.html

In [None]:
# Needed Dependencies
# !pip install pandas openpyxl

In [None]:
# Imports
import pandas as pd
import os

In [58]:
# Folder that receives the generated files; the raw FoodData Central survey
# export is expected in a sub-folder of it.
save_path = "./data/"
read_path = f"{save_path}FoodData_Central_survey_food_csv_2024-10-31/"

# Alphabetically ordered names of every CSV file found in the export folder.
csv_files = sorted(entry for entry in os.listdir(read_path) if entry.endswith('.csv'))

# Eagerly load every CSV whose file name passes `str.islower()`.
# NOTE(review): `dataframes` is not referenced by any cell visible in this notebook,
# and the tables used below are re-read individually - confirm it is still needed.
dataframes = []
for csv_name in csv_files:
    if csv_name.islower():
        dataframes.append(pd.read_csv(os.path.join(read_path, csv_name)))

# Step 0 - Cleaning individual datasets

In [59]:
# Columns of food_nutrient.csv reported as full of null values in this export;
# they are discarded right after loading.
empty_food_nutrient_columns = [
    'data_points',
    'derivation_id',
    'min',
    'max',
    'median',
    'footnote',
    'min_year_acquired',
]

# Load the food <-> nutrient amounts table without the empty columns.
food_nutrient_ds = (
    pd.read_csv(os.path.join(read_path, 'food_nutrient.csv'))
    .drop(columns=empty_food_nutrient_columns)
)

food_nutrient_ds.head()

Unnamed: 0,id,fdc_id,nutrient_id,amount
0,34136169,2705384,301,125.0
1,34136178,2705384,319,57.0
2,34136185,2705384,337,0.0
3,34136218,2705384,628,0.002
4,34136219,2705384,629,0.0


In [60]:
# Load the nutrient reference table and shape it for the merge with food_nutrient_ds:
#   * nutrient_nbr is the merge key, so it is cast to int to match the type of
#     food_nutrient_ds.nutrient_id; missing numbers become the placeholder -1.
#   * 'rank' is not described in the database documentation
#     (https://fdc.nal.usda.gov/docs/Download_Field_Descriptions_Oct2020.pdf) and
#     'id' is not used for merging, so both are dropped.
#   * 'name' / 'unit_name' are renamed to Nutrient_Name / Nutrient_Unit for readability.
# NOTE(review): the int cast discards any fractional part of nutrient_nbr, so distinct
# nutrient numbers may collapse onto the same key (the merged preview further down shows
# one food_nutrient id paired with Lycopene, cis-Lycopene and trans-Lycopene) - confirm
# this is intended.
nutrient_ds = (
    pd.read_csv(os.path.join(read_path, 'nutrient.csv'))
    .assign(nutrient_nbr=lambda raw: raw['nutrient_nbr'].fillna(-1).astype(int))
    .drop(columns=['rank', 'id'])
    .rename(columns={'name': 'Nutrient_Name', 'unit_name': 'Nutrient_Unit'})
)

nutrient_ds.head()

Unnamed: 0,Nutrient_Name,Nutrient_Unit,nutrient_nbr
0,Energy (Atwater General Factors),KCAL,957
1,Energy (Atwater Specific Factors),KCAL,958
2,Solids,G,201
3,Nitrogen,G,202
4,Protein,G,203


In [61]:
# The food table is only used to attach a human-readable description to each food:
# fdc_id is the join key, description is the payload.
food_columns = ['fdc_id', 'description']

food_ds = pd.read_csv(os.path.join(read_path, 'food.csv')).loc[:, food_columns]

food_ds.head()

Unnamed: 0,fdc_id,description
0,2705383,"Milk, human"
1,2705384,"Milk, NFS"
2,2705385,"Milk, whole"
3,2705386,"Milk, reduced fat (2%)"
4,2705387,"Milk, low fat (1%)"


# Step 1 - Merge `food_nutrient_ds` with `nutrient_ds`


In [62]:
# Attach the nutrient name and unit to every food_nutrient row.
# The join pairs food_nutrient_ds.nutrient_id with nutrient_ds.nutrient_nbr.
# NOTE(review): nutrient_nbr is not guaranteed to be unique in nutrient_ds, in which case
# a single food_nutrient row fans out to several merged rows - confirm this is intended.
merged_foodNutrient_Nutrient = pd.merge(
    food_nutrient_ds,
    nutrient_ds,
    left_on='nutrient_id',
    right_on='nutrient_nbr',
    how='left',
)

# We verify that all the nutrients mentioned in food_nutrient_ds were found in nutrient_ds.
# A left merge leaves the right-hand columns null for rows without a match, so the check
# must look for nulls; testing only for the -1 placeholder (used for nutrients without a
# number) can never detect an unmatched row. The -1 test is kept as an extra guard.
unmatched_nutrient_rows = (
    merged_foodNutrient_Nutrient['nutrient_nbr'].isnull()
    | merged_foodNutrient_Nutrient['nutrient_nbr'].eq(-1)
)
assert not unmatched_nutrient_rows.any(), "There are unmatched nutrient_ids in food_nutrient_ds"

# The nutrient_nbr and nutrient_id columns only served as join keys; drop both.
merged_foodNutrient_Nutrient = merged_foodNutrient_Nutrient.drop(columns=['nutrient_nbr', 'nutrient_id'])
merged_foodNutrient_Nutrient.head()

Unnamed: 0,id,fdc_id,amount,Nutrient_Name,Nutrient_Unit
0,34136169,2705384,125.0,"Calcium, Ca",MG
1,34136178,2705384,57.0,Retinol,UG
2,34136185,2705384,0.0,Lycopene,UG
3,34136185,2705384,0.0,cis-Lycopene,UG
4,34136185,2705384,0.0,trans-Lycopene,UG


# Step 2 - Merge `merged_foodNutrient_Nutrient` with `food_ds` to get food descriptions

In [63]:
# Left-join the food descriptions onto the nutrient rows through the shared fdc_id key.
merged_foodNutrient_Nutrient_food = merged_foodNutrient_Nutrient.merge(food_ds, on='fdc_id', how='left')

# Every fdc_id coming from food_nutrient_ds must have received a description from food_ds.
rows_without_description = merged_foodNutrient_Nutrient_food['description'].isnull()
assert not rows_without_description.any(), "There are unmatched fdc_ids in food_nutrient_ds"

# The 'id' column is now redundant and is dropped.
merged_foodNutrient_Nutrient_food = merged_foodNutrient_Nutrient_food.drop(columns=['id'])

# Export the final long-format dataset to a CSV file.
processed_csv_path = os.path.join(save_path, 'PROCESSED_nutrients_per_aliment.csv')
merged_foodNutrient_Nutrient_food.to_csv(processed_csv_path, index=False)

merged_foodNutrient_Nutrient_food.head()

Unnamed: 0,fdc_id,amount,Nutrient_Name,Nutrient_Unit,description
0,2705384,125.0,"Calcium, Ca",MG,"Milk, NFS"
1,2705384,57.0,Retinol,UG,"Milk, NFS"
2,2705384,0.0,Lycopene,UG,"Milk, NFS"
3,2705384,0.0,cis-Lycopene,UG,"Milk, NFS"
4,2705384,0.0,trans-Lycopene,UG,"Milk, NFS"


In [64]:
# Distinct nutrient names present in the merged dataset, in order of first appearance.
nutrient_name_uniques = merged_foodNutrient_Nutrient_food['Nutrient_Name'].unique()

# Plain-text report: a count header followed by one bullet line per nutrient name.
parse_str = "".join(
    [
        f"Number of Unique Nutrient Names: {len(nutrient_name_uniques)}\n\n",
        "Unique Nutrient Names:\n",
        *(f"- {name}\n" for name in nutrient_name_uniques),
    ]
)

print(parse_str)

# Persist the same report next to the other processed files.
report_path = os.path.join(save_path, 'PROCESSED_nutrient_name_uniques.txt')
with open(report_path, 'w') as f:
    f.write(parse_str)
    

Number of Unique Nutrient Names: 75

Unique Nutrient Names:
- Calcium, Ca
- Retinol
- Lycopene
- cis-Lycopene
- trans-Lycopene
- MUFA 20:1
- PUFA 20:5 n-3 (EPA)
- SFA 14:0
- Sugars, Total
- Total Sugars
- Carotene, beta
- cis-beta-Carotene
- trans-beta-Carotene
- Total lipid (fat)
- Thiamin
- Vitamin B-12, added
- PUFA 18:3
- PUFA 18:2
- Phosphorus, P
- Energy
- Folic acid
- Vitamin E (alpha-tocopherol)
- Tocopherols and tocotrienols
- Vitamin B-6
- Niacin
- Folate, DFE
- Cryptoxanthin, beta
- Fatty acids, total saturated
- SFA 8:0
- Zinc, Zn
- Magnesium, Mg
- Folate, food
- SFA 12:0
- SFA 16:0
- Alcohol, ethyl
- Fatty acids, total monounsaturated
- Copper, Cu
- PUFA 20:4
- Choline, total
- Carbohydrate, by difference
- Carbohydrate, by summation
- Riboflavin
- Carotene, alpha
- Vitamin K (phylloquinone)
- Cholesterol
- Vitamin B-12
- Caffeine
- SFA 6:0
- PUFA 22:6 n-3 (DHA)
- Protein
- PUFA 18:4
- Fiber, total dietary
- MUFA 16:1
- Fatty acids, total polyunsaturated
- PUFA 22:5 n-3 (D

# Step 3 - Filter based on nutrients of interest

In [65]:
# Nutrient names (exactly as spelled in the Nutrient_Name column) to keep for the
# rest of the analysis. The order of this list is the order in which they are processed.
nutrients_of_interest = [
    "Retinol",
    "Lycopene",
    "cis-Lycopene",
    "trans-Lycopene",
    "Carotene, beta",
    "cis-beta-Carotene",
    "trans-beta-Carotene",
    "Vitamin E (alpha-tocopherol)",
    "Tocopherols and tocotrienols",
    "Cryptoxanthin, beta",
    "Choline, total",
    "Carotene, alpha",
    "Vitamin K (phylloquinone)",
    "Zeaxanthin",
    "Lutein",
    "Lutein + zeaxanthin",
    "cis-Lutein/Zeaxanthin",
    "Vitamin D (D2 + D3)",
    "Vitamin A, RAE"
]

# Fail fast, naming the first offending entry, if a requested nutrient is absent
# from the names found in the merged dataset.
known_nutrient_names = set(nutrient_name_uniques)
for nutrient in nutrients_of_interest:
    assert nutrient in known_nutrient_names, f"Nutrient '{nutrient}' not found in the dataset."

print("All nutrients of interest are present in the dataset.")

All nutrients of interest are present in the dataset.


Move to a multi-hot (wide) format: one row per food and one column per nutrient of interest, holding the nutrient amount.

Example:

| fdc_id | amount | Nutrient_Name | Nutrient_Unit | description |
|-------|--------|---------------|---------------|-------------|
| 1     | 57.0   | Retinol       | UG            | Milk, NFS |
| 1     | 0.0    | Lycopene      | UG            | Milk, NFS |
| 1     | 0.0    | cis-Lycopene  | UG            | Milk, NFS |

Becomes:

| fdc_id | description | Retinol (UG) | Lycopene (UG) | cis-Lycopene (UG) |
|--------|-------------|--------------|---------------|--------------------|
| 1      | Milk, NFS   | 57.0         | 0.0           | 0.0                 |


In [83]:
# Keep only the long-format rows that describe one of the nutrients of interest.
filtered_nutrients_db = merged_foodNutrient_Nutrient_food[
    merged_foodNutrient_Nutrient_food['Nutrient_Name'].isin(nutrients_of_interest)
]

# Unit of each nutrient, taken from its first occurrence in the filtered data.
# Resolving it once per nutrient (instead of once per food) means a food that lacks a
# nutrient no longer raises IndexError while the column label is being built, so the
# 0.0 fallback below is actually reachable.
unit_by_nutrient = (
    filtered_nutrients_db
    .drop_duplicates(subset='Nutrient_Name')
    .set_index('Nutrient_Name')['Nutrient_Unit']
)

# Output column label per nutrient: sanitised nutrient name followed by "(UNIT)".
# "/" is intentionally not replaced so that labels stay identical to the ones
# produced so far (e.g. "cis_Lutein/Zeaxanthin (UG)").
nutrient_column_labels = {}
for nutrient in nutrients_of_interest:
    if nutrient not in unit_by_nutrient.index:
        raise KeyError(f"Nutrient '{nutrient}' has no rows in the filtered dataset, so its unit is unknown.")
    crt_nutrient = (
        nutrient
        .replace(" ", "_")
        .replace(",", "")
        .replace("(", "")
        .replace(")", "")
        .replace("+", "plus")
        .replace("-", "_")
    )
    nutrient_column_labels[nutrient] = f"{crt_nutrient} ({unit_by_nutrient[nutrient]})"

# Build one record per food and create the DataFrame once at the end; concatenating
# inside the loop copies the growing frame on every iteration.
# sort=False keeps the foods in order of first appearance.
multihot_records = []
for fdc_id, crt_food_df in filtered_nutrients_db.groupby('fdc_id', sort=False):
    # First reported amount of each nutrient for this food.
    first_amount_by_nutrient = (
        crt_food_df
        .drop_duplicates(subset='Nutrient_Name')
        .set_index('Nutrient_Name')['amount']
    )

    crt_food_dict = {'fdc_id': fdc_id, 'description': crt_food_df['description'].iloc[0]}
    for nutrient, col_name in nutrient_column_labels.items():
        # If the nutrient is not present for this food, set the amount to 0.0
        crt_food_dict[col_name] = first_amount_by_nutrient.get(nutrient, 0.0)

    multihot_records.append(crt_food_dict)

filtered_nutrients_multihot_db = pd.DataFrame(multihot_records)

filtered_nutrients_multihot_db.to_csv(os.path.join(save_path, 'PROCESSED_filtered_nutrients_multihot.csv'), index=False)

filtered_nutrients_multihot_db.head()

Unnamed: 0,fdc_id,description,Retinol (UG),Lycopene (UG),cis_Lycopene (UG),trans_Lycopene (UG),Carotene_beta (UG),cis_beta_Carotene (UG),trans_beta_Carotene (UG),Vitamin_E_alpha_tocopherol (MG),...,Cryptoxanthin_beta (UG),Choline_total (MG),Carotene_alpha (UG),Vitamin_K_phylloquinone (UG),Zeaxanthin (UG),Lutein (UG),Lutein_plus_zeaxanthin (UG),cis_Lutein/Zeaxanthin (UG),Vitamin_D_D2_plus_D3 (UG),Vitamin_A_RAE (UG)
0,2705384,"Milk, NFS",57.0,0.0,0.0,0.0,4.0,4.0,4.0,0.03,...,0.0,17.9,0.0,0.2,0.0,0.0,0.0,0.0,1.1,58.0
1,2705385,"Milk, whole",31.0,0.0,0.0,0.0,7.0,7.0,7.0,0.05,...,0.0,17.8,0.0,0.3,0.0,0.0,0.0,0.0,1.1,32.0
2,2705386,"Milk, reduced fat (2%)",83.0,0.0,0.0,0.0,3.0,3.0,3.0,0.03,...,0.0,18.2,0.0,0.2,0.0,0.0,0.0,0.0,1.1,83.0
3,2705387,"Milk, low fat (1%)",58.0,0.0,0.0,0.0,1.0,1.0,1.0,0.02,...,0.0,17.4,0.0,0.1,0.0,0.0,0.0,0.0,1.1,58.0
4,2705388,"Milk, fat free (skim)",64.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,...,0.0,18.2,0.0,0.0,0.0,0.0,0.0,0.0,1.1,64.0


In [84]:
# Display the whole wide table (one row per food); the rendered output is truncated in the middle.
filtered_nutrients_multihot_db

Unnamed: 0,fdc_id,description,Retinol (UG),Lycopene (UG),cis_Lycopene (UG),trans_Lycopene (UG),Carotene_beta (UG),cis_beta_Carotene (UG),trans_beta_Carotene (UG),Vitamin_E_alpha_tocopherol (MG),...,Cryptoxanthin_beta (UG),Choline_total (MG),Carotene_alpha (UG),Vitamin_K_phylloquinone (UG),Zeaxanthin (UG),Lutein (UG),Lutein_plus_zeaxanthin (UG),cis_Lutein/Zeaxanthin (UG),Vitamin_D_D2_plus_D3 (UG),Vitamin_A_RAE (UG)
0,2705384,"Milk, NFS",57.0,0.0,0.0,0.0,4.0,4.0,4.0,0.03,...,0.0,17.9,0.0,0.2,0.0,0.0,0.0,0.0,1.1,58.0
1,2705385,"Milk, whole",31.0,0.0,0.0,0.0,7.0,7.0,7.0,0.05,...,0.0,17.8,0.0,0.3,0.0,0.0,0.0,0.0,1.1,32.0
2,2705386,"Milk, reduced fat (2%)",83.0,0.0,0.0,0.0,3.0,3.0,3.0,0.03,...,0.0,18.2,0.0,0.2,0.0,0.0,0.0,0.0,1.1,83.0
3,2705387,"Milk, low fat (1%)",58.0,0.0,0.0,0.0,1.0,1.0,1.0,0.02,...,0.0,17.4,0.0,0.1,0.0,0.0,0.0,0.0,1.1,58.0
4,2705388,"Milk, fat free (skim)",64.0,0.0,0.0,0.0,2.0,2.0,2.0,0.00,...,0.0,18.2,0.0,0.0,0.0,0.0,0.0,0.0,1.1,64.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5426,2710810,Vegetables as ingredient in curry,0.0,458.0,458.0,458.0,994.0,994.0,994.0,0.24,...,0.0,14.6,368.0,9.0,225.0,225.0,225.0,225.0,0.0,98.0
5427,2710811,Vegetables as ingredient in soups,0.0,764.0,764.0,764.0,433.0,433.0,433.0,0.35,...,13.0,12.8,37.0,39.5,825.0,825.0,825.0,825.0,0.0,38.0
5428,2710812,Vegetables as ingredient in stews,0.0,1088.0,1088.0,1088.0,2588.0,2588.0,2588.0,0.38,...,6.0,12.5,1042.0,10.9,314.0,314.0,314.0,314.0,0.0,259.0
5429,2710813,Sauce as ingredient in hamburgers,4.0,7237.0,7237.0,7237.0,194.0,194.0,194.0,1.90,...,4.0,20.0,0.0,50.8,112.0,112.0,112.0,112.0,0.1,21.0


# Step 4 - Code assertions to double-check the script's integrity

Watermelon was labeled with the `fdc_id`s [2709241, 2709270].

With the following average nutrient values over these entries:

| Retinol (UG) | Lycopene (UG) | cis_Lycopene (UG) | trans_Lycopene (UG) | Carotene_beta (UG) | cis_beta_Carotene (UG) | trans_beta_Carotene (UG) | Vitamin_E_alpha_tocopherol (MG) | Tocopherols_and_tocotrienols (MG) | Cryptoxanthin_beta (UG) | Choline_total (MG) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 2266  | 2266 | 2266 | 166.5| 166.5 | 166.5 | 0.035 | 0.035 | 39 | 5.85 | 

<br>    <br/>

| Carotene_alpha (UG) | Vitamin_K_phylloquinone (UG) | Zeaxanthin (UG) | Lutein (UG) | Lutein_plus_zeaxanthin (UG) | cis_Lutein/Zeaxanthin (UG) | Vitamin_D_D2_plus_D3 (UG) | Vitamin_A_RAE (UG)  |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 1.5 | 17.5 | 17.5 | 17.5 | 17.5 | 0 | 15.5 |


In [100]:
print("WATERMELON VERIFICATION")

# fdc_ids of the entries that were labeled as watermelon.
codes_watermelon = [2709241, 2709270]

# One single-food slice of the wide table per code.
lines_watermelon = [filtered_nutrients_multihot_db[filtered_nutrients_multihot_db['fdc_id'] == code] for code in codes_watermelon]

# Fail with an explicit message instead of an opaque IndexError when a code is absent.
missing_watermelon = [code for code, rows in zip(codes_watermelon, lines_watermelon) if rows.empty]
assert not missing_watermelon, f"fdc_ids not found in filtered_nutrients_multihot_db: {missing_watermelon}"

# Nutrient values only: the first two columns (fdc_id, description) are skipped.
nutrient_rows_watermelon = [rows.iloc[0, 2:] for rows in lines_watermelon]

# Average over every listed code, so the numerator always follows the code list
# (the divisor already did).
average_watermelon = sum(nutrient_rows_watermelon[1:], nutrient_rows_watermelon[0]) / len(codes_watermelon)
pd.DataFrame(average_watermelon).T

WATERMELON VERIFICATION


Unnamed: 0,Retinol (UG),Lycopene (UG),cis_Lycopene (UG),trans_Lycopene (UG),Carotene_beta (UG),cis_beta_Carotene (UG),trans_beta_Carotene (UG),Vitamin_E_alpha_tocopherol (MG),Tocopherols_and_tocotrienols (MG),Cryptoxanthin_beta (UG),Choline_total (MG),Carotene_alpha (UG),Vitamin_K_phylloquinone (UG),Zeaxanthin (UG),Lutein (UG),Lutein_plus_zeaxanthin (UG),cis_Lutein/Zeaxanthin (UG),Vitamin_D_D2_plus_D3 (UG),Vitamin_A_RAE (UG)
0,0.0,2266.0,2266.0,2266.0,166.5,166.5,166.5,0.035,0.035,39.0,5.85,0.0,1.5,17.5,17.5,17.5,17.5,0.0,15.5


Taramasalata was labeled with the `fdc_id`s [2706332, 2710192, 2710204, 2705414].

With the following average nutrient values over these entries:

| Retinol (UG) | Lycopene (UG) | cis_Lycopene (UG) | trans_Lycopene (UG) | Carotene_beta (UG) | cis_beta_Carotene (UG) | trans_beta_Carotene (UG) | Vitamin_E_alpha_tocopherol (MG) | Tocopherols_and_tocotrienols (MG) | Cryptoxanthin_beta (UG) | Choline_total (MG) |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 77.75 | 0 | 0 | 0 | 1.75 | 1.75 | 1.75 | 18.42 | 18.42 |	1 | 135.025 |


<br>    <br/>

| Carotene_alpha (UG) | Vitamin_K_phylloquinone (UG) | Zeaxanthin (UG) | Lutein (UG) | Lutein_plus_zeaxanthin (UG) | cis_Lutein/Zeaxanthin (UG) | Vitamin_D_D2_plus_D3 (UG) | Vitamin_A_RAE (UG)  |
| --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 42.65 | 165 | 165 | 165 | 165 | 0.925 | 78 |


In [None]:
print("TARAMASALATA VERIFICATION")

# fdc_ids of the entries that were labeled as taramasalata.
codes_taramasalata = [2706332, 2710192, 2710204, 2705414]

# One single-food slice of the wide table per code.
lines_taramasalata = [filtered_nutrients_multihot_db[filtered_nutrients_multihot_db['fdc_id'] == code] for code in codes_taramasalata]

# Fail with an explicit message instead of an opaque IndexError when a code is absent.
missing_taramasalata = [code for code, rows in zip(codes_taramasalata, lines_taramasalata) if rows.empty]
assert not missing_taramasalata, f"fdc_ids not found in filtered_nutrients_multihot_db: {missing_taramasalata}"

# Nutrient values only: the first two columns (fdc_id, description) are skipped.
nutrient_rows_taramasalata = [rows.iloc[0, 2:] for rows in lines_taramasalata]

# Average over every listed code, so the numerator always follows the code list
# (the divisor already did).
average_taramasalata = sum(nutrient_rows_taramasalata[1:], nutrient_rows_taramasalata[0]) / len(codes_taramasalata)
pd.DataFrame(average_taramasalata).T

Unnamed: 0,Retinol (UG),Lycopene (UG),cis_Lycopene (UG),trans_Lycopene (UG),Carotene_beta (UG),cis_beta_Carotene (UG),trans_beta_Carotene (UG),Vitamin_E_alpha_tocopherol (MG),Tocopherols_and_tocotrienols (MG),Cryptoxanthin_beta (UG),Choline_total (MG),Carotene_alpha (UG),Vitamin_K_phylloquinone (UG),Zeaxanthin (UG),Lutein (UG),Lutein_plus_zeaxanthin (UG),cis_Lutein/Zeaxanthin (UG),Vitamin_D_D2_plus_D3 (UG),Vitamin_A_RAE (UG)
0,77.75,0.0,0.0,0.0,1.75,1.75,1.75,18.42,18.42,1.0,135.025,0.0,42.65,165.0,165.0,165.0,165.0,0.925,78.0
