# 2. Identifying unique food items in IASI-DATABASE

In [2]:
# Needed Dependencies
# !pip install pandas openpyxl

In [3]:
# Imports
import pandas as pd

Looking up the following FOOD_CODEs in `./data/FOOD_CODES.csv` returned missing values: ['12012', '17046', '11130', '17045', 'a1362', 'a7021', '12008', '17038', '12042'].

As a result, `FOOD_CODES.csv` has been updated based on the [Food Code Lookup](https://www.mrc-epid.cam.ac.uk/wp-content/uploads/2023/03/food_code_lookup.pdf) to include these missing FOOD_CODEs.


In [None]:

# Load the FOOD_CODE lookup table and normalise the key column to stripped
# strings so that numeric and alphanumeric codes (e.g. 'a1362') compare alike.
food_codes_db = pd.read_csv("./data/FOOD_CODES.csv").assign(
    FOOD_CODE=lambda table: table["FOOD_CODE"].astype(str).str.strip()
)

# Show the resulting dtype of the key column
food_codes_db["FOOD_CODE"].dtype

dtype('O')

In [5]:
# Sanity check: every FOOD_CODE that used to be missing must now be in the lookup table
assert {
    '12012', '17046', '11130', '17045', 'a1362',
    'a7021', '12008', '17038', '12042',
}.issubset(food_codes_db['FOOD_CODE'].values)

In [6]:
# Number of distinct FOOD_CODEs in the lookup table
len(food_codes_db["FOOD_CODE"].dropna().unique())

290

In [7]:
# Load the meal definitions and preview the first rows
meals_db = pd.read_csv("./data/MEAL_ID.csv")
meals_db.head(n=5)

Unnamed: 0,LINE NUMBER,FFQNAME,SGDESC
0,1,BEEF,"Meat - beef - roast, steak, mince, stew or cas..."
1,2,BURGER,Meat - beefburgers
2,3,PORK,"Meat - pork - roast, chops, stew or slices"
3,4,LAMB,"Meat - lamb - roast, chops or stew"
4,5,CHICKEN,Meat - poultry


In [8]:
# Summary statistics of the LINE NUMBER column
meals_db.loc[:, "LINE NUMBER"].describe()

count    130.000000
mean      65.500000
std       37.671829
min        1.000000
25%       33.250000
50%       65.500000
75%       97.750000
max      130.000000
Name: LINE NUMBER, dtype: float64

In [9]:
# Load the portion records, then cast FOOD_CODE to stripped strings (the cast
# happens after reading) so letter-prefixed codes such as 'x0535' and purely
# numeric codes share one comparable string representation.
iasi_db = pd.read_excel('./data/popa2023andreea2013.xlsx').assign(
    FOOD_CODE=lambda portions: portions['FOOD_CODE'].astype(str).str.strip()
)

iasi_db

Unnamed: 0,ID,MEAL_ID,FOOD_CODE,FOOD_PORTION
0,1,0,12012,293.000
1,1,1,18005,0.560
2,1,1,18049,6.930
3,1,2,19029,14.000
4,1,3,18205,4.945
...,...,...,...,...
50340,651,123,13384,24.510
50341,651,126,15077,15.800
50342,651,126,15078,15.800
50343,651,126,15079,15.800


In [10]:
# Total FOOD_PORTION per ID
iasi_db_grouped = iasi_db.groupby('ID')['FOOD_PORTION'].agg('sum')

# IDs whose total is exactly 320.72
iasi_db_grouped.loc[iasi_db_grouped.eq(320.72)]

ID
336    320.72
351    320.72
Name: FOOD_PORTION, dtype: float64

In [11]:
# All portion records for FOOD_CODE 'x0535'
iasi_db.loc[iasi_db['FOOD_CODE'].eq('x0535')]

Unnamed: 0,ID,MEAL_ID,FOOD_CODE,FOOD_PORTION
236,4,10,x0535,1.05
319,5,10,x0535,1.05
531,8,10,x0535,1.05
615,9,10,x0535,1.05
1247,17,10,x0535,2.1
1533,20,10,x0535,1.05
2066,26,10,x0535,1.05
2351,29,10,x0535,1.05
3067,37,10,x0535,1.05
3538,43,10,x0535,1.05


In [12]:
# Distinct FOOD_CODEs in order of first appearance, as a one-column frame
unique_vals_b3 = (
    iasi_db[['FOOD_CODE']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Attach the lookup-table columns; codes without a lookup entry keep NaN there
needed_table = unique_vals_b3.merge(food_codes_db, on='FOOD_CODE', how='left')

# Export the list of codes that still need an fdc_id assigned
needed_table.to_csv('./data/PROCESSED_needs_fdc_id.csv', index=False)

needed_table.head()

Unnamed: 0,FOOD_CODE,DESC
0,12012,"Whole milk, average"
1,18005,"Beef, average, fat, cooked"
2,18049,"Beef, rump steak, grilled, lean"
3,19029,"Beefburgers, chilled/frozen, fried"
4,18205,"Pork, fat, cooked"


In [13]:
# mini_ds = iasi_db[:10]

# # for fc in mini_ds["FOOD_CODE"]:
# #     print(f"Searching for FOOD_CODE: {fc}")

# #     # Search for FOOD_CODE in food_codes_db
# #     for index, row in food_codes_db.iterrows():
# #         if row["FOOD_CODE"] == fc:
# #             print(f"Found matching FOOD_CODE in food_codes_db: {row['FOOD_CODE']}")
# #             break

# for fc in food_codes_db["FOOD_CODE"]:
#     if fc == 12012:
#         print("Found FOOD_CODE 12012 in food_codes_db")
#         break

# pd.merge(mini_ds, food_codes_db, on="FOOD_CODE", how="left")

In [14]:
# Left-join the lookup table onto every portion record via FOOD_CODE
portions_merged_iasi_db = iasi_db.merge(food_codes_db, how="left", on='FOOD_CODE')

portions_merged_iasi_db.head()

Unnamed: 0,ID,MEAL_ID,FOOD_CODE,FOOD_PORTION,DESC
0,1,0,12012,293.0,"Whole milk, average"
1,1,1,18005,0.56,"Beef, average, fat, cooked"
2,1,1,18049,6.93,"Beef, rump steak, grilled, lean"
3,1,2,19029,14.0,"Beefburgers, chilled/frozen, fried"
4,1,3,18205,4.945,"Pork, fat, cooked"


# ADDING NUTRIENTS FROM FDC TO IASI-FOODDATA

* A human expert manually labelled the correspondences between FDC IDs and FETA FOOD_CODEs in `manually_labelled_fdc_to_feta.xlsx`

In [15]:
# Read the manually labelled FOOD_CODE -> fdc_id correspondence table
correspondence_table = pd.read_excel('./data/manually_labelled_fdc_to_feta.xlsx')

# Same key normalisation as the other tables so the later merge on FOOD_CODE matches
correspondence_table['FOOD_CODE'] = correspondence_table['FOOD_CODE'].astype(str).str.strip()

# Columns that may each hold one fdc_id for a FOOD_CODE
fdc_id_columns = ['fdc_id', 'fdc_id.1', 'fdc_id.2', 'fdc_id.3']


def collect_fdc_ids(raw_ids):
    """Return the non-missing ids of one row as a list of ints.

    Returns None when the row has no id at all, so downstream code treats it
    like a FOOD_CODE without correspondence instead of failing on an empty value.
    """
    ids = [int(float(value)) for value in raw_ids if pd.notna(value)]
    return ids if ids else None


# Gather the ids of every row directly into a list (no string round-trip)
correspondence_table['fdc_ids'] = pd.Series(
    [
        collect_fdc_ids(row)
        for row in correspondence_table[fdc_id_columns].itertuples(index=False, name=None)
    ],
    index=correspondence_table.index,
    dtype=object,
)

# Drop the now redundant columns
correspondence_table = correspondence_table.drop(columns=fdc_id_columns + ['DESC'])

print(correspondence_table.head(), end='\n\n\n')

print("Statistics of number of fdc_ids per FOOD_CODE:", end='\n')
# Rows without any id count as 0
print(
    correspondence_table['fdc_ids']
    .map(lambda ids: len(ids) if isinstance(ids, list) else 0)
    .describe()
)

  FOOD_CODE    fdc_ids
0     12012  [2705385]
1     18005  [2710780]
2     18049  [2705834]
3     19029  [2705855]
4     18205  [2705867]


Statistics of number of fdc_ids per FOOD_CODE:
count    201.000000
mean       1.019900
std        0.222715
min        1.000000
25%        1.000000
50%        1.000000
75%        1.000000
max        4.000000
Name: fdc_ids, dtype: float64


In [17]:
# Attach the list of fdc_ids to every portion record; codes without a
# correspondence keep NaN in fdc_ids
partial_merged_df = portions_merged_iasi_db.merge(
    correspondence_table,
    how='left',
    on='FOOD_CODE',
)
partial_merged_df.head()

Unnamed: 0,ID,MEAL_ID,FOOD_CODE,FOOD_PORTION,DESC,fdc_ids
0,1,0,12012,293.0,"Whole milk, average",[2705385]
1,1,1,18005,0.56,"Beef, average, fat, cooked",[2710780]
2,1,1,18049,6.93,"Beef, rump steak, grilled, lean",[2705834]
3,1,2,19029,14.0,"Beefburgers, chilled/frozen, fried",[2705855]
4,1,3,18205,4.945,"Pork, fat, cooked",[2705867]


In [18]:
# Load the per-fdc_id nutrient table; its 'description' column is renamed to
# 'double_check_desc'
nutrients_of_interest = (
    pd.read_csv('./data/PROCESSED_filtered_nutrients_multihot.csv')
    .rename(columns={'description': 'double_check_desc'})
)

nutrients_of_interest.head()

Unnamed: 0,fdc_id,double_check_desc,Retinol (UG),Lycopene (UG),cis_Lycopene (UG),trans_Lycopene (UG),Carotene_beta (UG),cis_beta_Carotene (UG),trans_beta_Carotene (UG),Vitamin_E_alpha_tocopherol (MG),...,Cryptoxanthin_beta (UG),Choline_total (MG),Carotene_alpha (UG),Vitamin_K_phylloquinone (UG),Zeaxanthin (UG),Lutein (UG),Lutein_plus_zeaxanthin (UG),cis_Lutein/Zeaxanthin (UG),Vitamin_D_D2_plus_D3 (UG),Vitamin_A_RAE (UG)
0,2705384,"Milk, NFS",57.0,0.0,0.0,0.0,4.0,4.0,4.0,0.03,...,0.0,17.9,0.0,0.2,0.0,0.0,0.0,0.0,1.1,58.0
1,2705385,"Milk, whole",31.0,0.0,0.0,0.0,7.0,7.0,7.0,0.05,...,0.0,17.8,0.0,0.3,0.0,0.0,0.0,0.0,1.1,32.0
2,2705386,"Milk, reduced fat (2%)",83.0,0.0,0.0,0.0,3.0,3.0,3.0,0.03,...,0.0,18.2,0.0,0.2,0.0,0.0,0.0,0.0,1.1,83.0
3,2705387,"Milk, low fat (1%)",58.0,0.0,0.0,0.0,1.0,1.0,1.0,0.02,...,0.0,17.4,0.0,0.1,0.0,0.0,0.0,0.0,1.1,58.0
4,2705388,"Milk, fat free (skim)",64.0,0.0,0.0,0.0,2.0,2.0,2.0,0.0,...,0.0,18.2,0.0,0.0,0.0,0.0,0.0,0.0,1.1,64.0


In [None]:
# Every column of the nutrient table except the join key is attached to the portions
nutrient_columns = nutrients_of_interest.columns.tolist()
nutrient_columns.remove('fdc_id')

description_column = 'double_check_desc'
value_columns = [col for col in nutrient_columns if col != description_column]

# One row per fdc_id for direct label lookup; on duplicate ids the first row wins
nutrients_by_fdc_id = (
    nutrients_of_interest
    .drop_duplicates(subset='fdc_id', keep='first')
    .set_index('fdc_id')
)


def combine_nutrients(fdc_ids):
    """Return the nutrient record for one combination of fdc_ids.

    Parameters
    ----------
    fdc_ids : sequence of int
        The fdc_ids mapped to a single FOOD_CODE.

    Returns
    -------
    dict
        Maps each entry of ``nutrient_columns`` to its value. Empty when none
        of the ids exists in the nutrient table (the row then stays NaN).

    Notes
    -----
    * One matching id: its row is copied unchanged.
    * Several matching ids: each nutrient is the mean over the matching rows,
      ignoring missing values, and the descriptions of all matching rows are
      joined with '; ' in the order of ``fdc_ids``.
    * Ids absent from the nutrient table are ignored.
    """
    matched_ids = [fdc_id for fdc_id in fdc_ids if fdc_id in nutrients_by_fdc_id.index]
    if not matched_ids:
        return {}

    matched = nutrients_by_fdc_id.loc[matched_ids, nutrient_columns]
    if len(matched_ids) == 1:
        return matched.iloc[0].to_dict()

    combined = matched[value_columns].astype(float).mean(axis=0, skipna=True).to_dict()
    if description_column in nutrient_columns:
        descriptions = matched[description_column].dropna().astype(str)
        combined[description_column] = (
            '; '.join(descriptions) if not descriptions.empty else float('nan')
        )
    return combined


# Many portion rows share the same ids, so each distinct combination is resolved once
nutrients_by_combination = {}
nutrient_records = []
for fdc_ids in partial_merged_df['fdc_ids']:
    if not isinstance(fdc_ids, list):
        # No correspondence for this FOOD_CODE: leave all nutrient columns as NaN
        nutrient_records.append({})
        continue
    combination = tuple(fdc_ids)
    if combination not in nutrients_by_combination:
        nutrients_by_combination[combination] = combine_nutrients(combination)
    nutrient_records.append(nutrients_by_combination[combination])

nutrient_frame = pd.DataFrame(
    nutrient_records,
    columns=nutrient_columns,
    index=partial_merged_df.index,
)

# New frame: the original columns followed by the nutrient columns
raw_fully_merged_df = partial_merged_df.assign(
    **{col: nutrient_frame[col] for col in nutrient_columns}
)

raw_fully_merged_df.head()


Unnamed: 0,ID,MEAL_ID,FOOD_CODE,FOOD_PORTION,DESC,fdc_ids,double_check_desc,Retinol (UG),Lycopene (UG),cis_Lycopene (UG),...,Cryptoxanthin_beta (UG),Choline_total (MG),Carotene_alpha (UG),Vitamin_K_phylloquinone (UG),Zeaxanthin (UG),Lutein (UG),Lutein_plus_zeaxanthin (UG),cis_Lutein/Zeaxanthin (UG),Vitamin_D_D2_plus_D3 (UG),Vitamin_A_RAE (UG)
0,1,0,12012,293.0,"Whole milk, average",[2705385],"Milk, whole",31.0,0.0,0.0,...,0.0,17.8,0.0,0.3,0.0,0.0,0.0,0.0,1.1,32.0
1,1,1,18005,0.56,"Beef, average, fat, cooked",[2710780],Beef as ingredient in recipes,3.0,0.0,0.0,...,0.0,86.1,0.0,1.6,0.0,0.0,0.0,0.0,0.1,3.0
2,1,1,18049,6.93,"Beef, rump steak, grilled, lean",[2705834],"Beef, steak, sirloin, lean only eaten",1.0,0.0,0.0,...,0.0,77.2,0.0,1.6,0.0,0.0,0.0,0.0,0.2,1.0
3,1,2,19029,14.0,"Beefburgers, chilled/frozen, fried",[2705855],"Beef, ground, patty",3.0,0.0,0.0,...,0.0,79.4,0.0,1.9,0.0,0.0,0.0,0.0,0.0,3.0
4,1,3,18205,4.945,"Pork, fat, cooked",[2705867],"Pork, chop, lean and fat eaten",1.0,0.0,0.0,...,0.0,75.7,0.0,0.0,0.0,0.0,0.0,0.0,0.6,1.0


In [20]:
# Display the merged table (the rendered output elides the middle rows and columns)
raw_fully_merged_df

Unnamed: 0,ID,MEAL_ID,FOOD_CODE,FOOD_PORTION,DESC,fdc_ids,double_check_desc,Retinol (UG),Lycopene (UG),cis_Lycopene (UG),...,Cryptoxanthin_beta (UG),Choline_total (MG),Carotene_alpha (UG),Vitamin_K_phylloquinone (UG),Zeaxanthin (UG),Lutein (UG),Lutein_plus_zeaxanthin (UG),cis_Lutein/Zeaxanthin (UG),Vitamin_D_D2_plus_D3 (UG),Vitamin_A_RAE (UG)
0,1,0,12012,293.000,"Whole milk, average",[2705385],"Milk, whole",31.0,0.0,0.0,...,0.0,17.8,0.0,0.3,0.0,0.0,0.0,0.0,1.1,32.0
1,1,1,18005,0.560,"Beef, average, fat, cooked",[2710780],Beef as ingredient in recipes,3.0,0.0,0.0,...,0.0,86.1,0.0,1.6,0.0,0.0,0.0,0.0,0.1,3.0
2,1,1,18049,6.930,"Beef, rump steak, grilled, lean",[2705834],"Beef, steak, sirloin, lean only eaten",1.0,0.0,0.0,...,0.0,77.2,0.0,1.6,0.0,0.0,0.0,0.0,0.2,1.0
3,1,2,19029,14.000,"Beefburgers, chilled/frozen, fried",[2705855],"Beef, ground, patty",3.0,0.0,0.0,...,0.0,79.4,0.0,1.9,0.0,0.0,0.0,0.0,0.0,3.0
4,1,3,18205,4.945,"Pork, fat, cooked",[2705867],"Pork, chop, lean and fat eaten",1.0,0.0,0.0,...,0.0,75.7,0.0,0.0,0.0,0.0,0.0,0.0,0.6,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50340,651,123,13384,24.510,"Tomatoes, raw",[2709719],"Tomatoes, raw",0.0,2813.0,2813.0,...,6.0,7.0,61.0,7.5,103.0,103.0,103.0,103.0,0.0,32.0
50341,651,126,15077,15.800,"Coleslaw, with mayonnaise, retail",[2709815],Coleslaw,4.0,0.0,0.0,...,0.0,11.2,323.0,54.3,53.0,53.0,53.0,53.0,0.0,79.0
50342,651,126,15078,15.800,"Coleslaw, with reduced calorie dressing, retail",[2709816],"Cabbage salad, NFS",0.0,0.0,0.0,...,0.0,8.0,321.0,44.8,41.0,41.0,41.0,41.0,0.0,75.0
50343,651,126,15079,15.800,"Coleslaw, with vinaigrette, retail",[2709816],"Cabbage salad, NFS",0.0,0.0,0.0,...,0.0,8.0,321.0,44.8,41.0,41.0,41.0,41.0,0.0,75.0


In [21]:
# Write the merged portions-with-nutrients table to CSV without the index column
raw_fully_merged_df.to_csv(
    path_or_buf='./data/PROCESSED_iasi_with_nutrients_raw.csv',
    index=False,
)