In [71]:
import pandas as pd
import os

# All input CSVs are resolved relative to the current working directory.
BASE_DIR = os.getcwd()


def _load_dataset(relative_path):
    """Read the CSV found at ``relative_path`` under BASE_DIR into a DataFrame."""
    return pd.read_csv(os.path.join(BASE_DIR, relative_path))


# Load the three source tables used by the cells below.
df_bio_food = _load_dataset("BioFoodComp_cleaned_v2.csv")
df_fdc = _load_dataset("./fdc/fdc_ids_proximal_dataset.csv")
df_fwe = _load_dataset("./fwe/preprocessed.csv")


In [72]:
# Column renames for the BioFoodComp table: source header -> target column name.
PROXIMAL_MAP_BIO = {
    "Food name in English": "Food Product",

    # --- core proximal components ---
    "WATER(g)": "Moisture",
    "PROTCNT(g)": "Protein",
    "FATCE(g)": "Fat",
    "CHOAVLDF(g)": "Carbohydrates",
    "FIBTG(g)": "Dietary Fiber",
    "ASH(g)": "Ash",

    # --- additional, commonly reported fields ---
    # NOTE(review): in the FAO/INFOODS tagname list FIBND and FIBAD appear to
    # denote neutral- and acid-detergent fibre rather than insoluble/soluble
    # fibre -- confirm. The labels are left as-is because a later cell drops
    # these columns by exactly these names.
    "SUGAR(g)": "Sugars",
    "FIBND(g)": "Insoluble Fiber",
    "FIBAD(g)": "Soluble Fiber",

    # --- less common / legacy fibre metrics ---
    "FIBC(g)": "Crude Fiber",
    "CELLU(g)": "Cellulose",
    "LIGN(g)": "Lignin",
    "HEMCEL(g)": "Hemicellulose",
}

In [None]:
# Shape of the BioFoodComp frame as it enters this cell.
print(df_bio_food.shape)

# Rename to the target column names, then discard the fibre sub-fraction
# columns that are not used further.
# The frame is rebound (no inplace=True) and the drop tolerates columns that
# are already gone, so re-running this cell no longer raises KeyError.
# Trade-off: a column that is missing from the CSV is skipped silently.
df_bio_food = (
    df_bio_food
    .rename(columns=PROXIMAL_MAP_BIO)
    .drop(
        columns=['Soluble Fiber', 'Insoluble Fiber', 'Lignin', 'Hemicellulose', 'Cellulose'],
        errors="ignore",
    )
)

# Percentage of missing values per remaining column (displayed as cell output).
df_bio_food.isna().sum() / df_bio_food.shape[0] * 100

(848, 14)


Food Product      0.000000
Moisture         11.556604
Protein           3.537736
Fat              17.806604
Carbohydrates    36.084906
Sugars           79.716981
Dietary Fiber    79.245283
Crude Fiber      44.693396
Ash               3.537736
dtype: float64

In [74]:
# Preview the first five rows of the BioFoodComp frame.
df_bio_food.head(n=5)

Unnamed: 0,Food Product,Moisture,Protein,Fat,Carbohydrates,Sugars,Dietary Fiber,Crude Fiber,Ash
0,"Sorghum, whole grain, white, raw",9.4,9.33,,58.7,,17.214,,1.44
1,"Sorghum, whole grain, red, raw",10.1,6.65,,64.18,,14.0244,,1.52
2,"Quinoa, Blanca de juli, raw",11.39,12.369956,,66.590415,,12.157292,1.7722,2.995018
3,"Quinoa, Kcancolla, raw",10.78,13.534674,,64.657734,,12.588942,2.739054,3.140544
4,"Quinoa, La Molina 89, raw",12.03,13.608959,,60.558548,,14.066403,2.973386,4.803162


In [75]:
# Column renames for the FDC table: source header -> target column name.
# Entries that map a name to itself are listed explicitly so the mapping
# documents every column this notebook refers to.
PROXIMAL_MAP_FDC = {
    # --- nutrient columns ---
    "Water": "Moisture",
    "Nitrogen": "Nitrogen",
    "Ash": "Ash",
    "Carbohydrate, by summation": "Total Carbohydrates (sum)",
    "Carbohydrate, by difference": "Total Carbohydrates",
    "Protein": "Protein",
    "Total lipid (fat)": "Fat",
    "Fiber, total dietary": "Dietary Fiber",
    "Energy": "Energy (kcal)",

    # --- contextual (non-nutrient) columns ---
    "Food Product": "Food Product",
    "Side Stream": "Side Stream",
}

In [76]:
# Shape of the FDC frame as it enters this cell.
print(df_fdc.shape)

# Rename to the target column names, then discard the columns that are not
# used further.
# The frame is rebound (no inplace=True) and the drop tolerates columns that
# are already gone, so re-running this cell no longer raises KeyError.
# Trade-off: a column that is missing from the CSV is skipped silently.
df_fdc = (
    df_fdc
    .rename(columns=PROXIMAL_MAP_FDC)
    .drop(
        columns=['Nitrogen', 'Total Carbohydrates (sum)', 'Energy (kcal)'],
        errors="ignore",
    )
)

# Percentage of missing values per remaining column (displayed as cell output).
df_fdc.isna().sum() / df_fdc.shape[0] * 100


(184, 12)


Moisture                9.239130
Ash                     9.782609
Total Carbohydrates    16.847826
Protein                 1.630435
Fat                     7.608696
Dietary Fiber          35.326087
Food Product            0.000000
Side Stream             0.000000
Sugars                 53.260870
dtype: float64

In [77]:
# Column renames for the FWE table: source header -> target column name.
PROXIMAL_MAP_FWE = {
    # --- contextual columns ---
    "Food Product": "Food Product",
    "Side Stream": "Side Stream",

    # --- core proximal components ---
    "Ash": "Ash",
    "Dry Matter": "Dry Matter",

    # --- fibre-related components ---
    "Fibre, crude": "Crude Fiber",
    "Acid Detergent Fibre (ADF)": "ADF",
    "Cellulose": "Cellulose",
    "Hemicellulose": "Hemicellulose",
    "Pectin": "Pectin",

    # --- carbohydrates ---
    "Sugar, total": "Sugars",
}

In [78]:
# Shape of the FWE frame as it enters this cell.
print(df_fwe.shape)

# Apply the FWE column renames directly on the loaded frame.
df_fwe.rename(columns=PROXIMAL_MAP_FWE, inplace=True)

# Percentage of missing values per column (displayed as cell output).
df_fwe.isna().sum().div(df_fwe.shape[0]).mul(100)

(241, 10)


Food Product      0.000000
Side Stream       0.000000
ADF              51.037344
Ash              14.522822
Cellulose        83.817427
Dry Matter       72.614108
Crude Fiber      46.058091
Hemicellulose    83.817427
Pectin           95.435685
Sugars           97.095436
dtype: float64

In [79]:
# Stack the BioFoodComp and FDC frames row-wise (concat's default axis) with a
# fresh 0..n-1 index; columns present in only one frame are filled with NaN.
# NOTE(review): df_fwe is loaded and renamed above but is not part of this
# merge -- confirm that is intended.
# NOTE(review): the BioFoodComp frame has 'Carbohydrates' while the FDC frame
# has 'Total Carbohydrates', so they stay separate columns -- confirm.
frames_to_stack = [df_bio_food, df_fdc]
df_merged = pd.concat(frames_to_stack, ignore_index=True)

# Write the combined table next to the inputs, without the index column.
combined_csv_path = os.path.join(BASE_DIR, "Combined.csv")
df_merged.to_csv(combined_csv_path, index=False)