In [113]:
### Scraping and data processing for FDA data

import requests
from bs4 import BeautifulSoup
import pandas as pd
import re

In [114]:
# URL of the FDA Food Additive Status List
url = "https://www.fda.gov/food/food-additives-petitions/food-additive-status-list"

# Send a GET request to the webpage
response = requests.get(url)
if response.status_code == 200:
    print("Page successfully retrieved")
else:
    print(f"Failed to retrieve page with status code {response.status_code}")

# Parse the content with BeautifulSoup
soup = BeautifulSoup(response.content, "html.parser")

print(soup)

Page successfully retrieved
<!DOCTYPE html>

<html dir="ltr" lang="en" prefix="og: https://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<script async="" src="https://www.googletagmanager.com/gtag/js?id=G-273DTKB5QW"></script>
<meta content="This Food Additives Status List organizes additives found in many parts of 21 CFR into one alphabetized list." name="description"/>
<meta content="Food Additive Status List" name="dcterms.title"/>
<meta content="Center for Food Safety and Applied Nutrition" name="dcterms.creator"/>
<meta content="This Food Additives Status List organizes additives found in many parts of 21 CFR into one alphabetized list." name="dcterms.description"/>
<meta content="FDA" name="dcterms.publisher"/>
<meta content="Article" name="dcterms.type"/>
<meta content="FDA" name="dcterms.source"/>
<meta content="Industry" name="dcterms.audience"/>
<meta content="U.S. Food and Drug Administration" property="og:site_name"/>
<meta content="Article" property="og:type"/>
<meta content

In [115]:
# Find the main content section
content = soup.find_all('li')

#print(content)

# Filter out the list items that match the specific pattern
additive_list = []

for li in content:
    strong_tag = li.find('strong')
    if strong_tag and "-" in li.get_text():
        additive_list.append(li.get_text(strip=True))

for item in additive_list:
    print(item)





Acacia(gum arabic)- EMUL/STAB, REG, Used as thickener, emulsifier, or stabilizer at=20% of alcoholic beverages-172.780, GRAS/FS, See Reg Part 135, Frozen Desserts; Part 169, Food Dressings and Flavorings; Part 169.179, Vanilla Pwd-184.1330
Acesulfame potassium- NNS, REG, See Regulation -172.800
Acetic acid- B&N/FEED, GRAS/FS, Part 133, Cheese; Part 582.1005, In animal feed practices; 184.1005, 172.814
Acetic anhydride- MISC, REG, In modifying food starch -172.892
Acetone- SOLV, REG, 30 ppm - As residual solvent in spice oleoresins 173.210
Acetone peroxides- BL, REG/FS, GMP, Part 137, Cereal Flours -172.802
Acetyl-(p-nitrophenyl)-sulfanilamide- FEED, REG, See: Sulfanitran
N-Acetyl-L-methionine(free, hydrated, or anhydrous, or sodium or potassium salts) - NUTR, REG, In foods, except infant foods and foods containing added nitrites/nitrates -172.372
Acetylated monoglycerides- EMUL, REG, GMP, Used in food, food processing, food pkg or food stg equipment -172.828
Acidified sodium chlorite s

In [116]:
# create pandas dataframe

fda_additives_df = pd.DataFrame(pd.Series(additive_list), columns=["rawData"])

display(fda_additives_df["rawData"])

# use regex to split after name


TITLE_REGEX = r"(.+?)-\s(.*)"

def split_additive_description(item):
    match = re.match(TITLE_REGEX, item)
    if match:
        return pd.Series([match.group(1).strip(), match.group(2).strip()])
    else:
        return pd.Series([item, ""])  # If no match, return the item as is

# Apply the function to the DataFrame
fda_additives_df[['additive', 'description']] = fda_additives_df["rawData"].apply(split_additive_description)

display(fda_additives_df)


0       Acacia(gum arabic)- EMUL/STAB, REG, Used as th...
1       Acesulfame potassium- NNS, REG, See Regulation...
2       Acetic acid- B&N/FEED, GRAS/FS, Part 133, Chee...
3       Acetic anhydride- MISC, REG, In modifying food...
4       Acetone- SOLV, REG, 30 ppm - As residual solve...
                              ...                        
1433    Zinc methionine sulfate- NUTR, REG, Tablets - ...
1434    Zinc oxide- NUTR/DS, GRAS, GMP - 182.5991, 182...
1435    Zinc stearate- NUTR/DS, GRAS, GMP, Free from c...
1436    Zinc sulfate- NUTR/DS, GRAS, GMP - 182.5997, 1...
1437    Zoalene- FEED, REG, 2 ppm - Residues in uncook...
Name: rawData, Length: 1438, dtype: object

Unnamed: 0,rawData,additive,description
0,"Acacia(gum arabic)- EMUL/STAB, REG, Used as th...",Acacia(gum arabic),"EMUL/STAB, REG, Used as thickener, emulsifier,..."
1,"Acesulfame potassium- NNS, REG, See Regulation...",Acesulfame potassium,"NNS, REG, See Regulation -172.800"
2,"Acetic acid- B&N/FEED, GRAS/FS, Part 133, Chee...",Acetic acid,"B&N/FEED, GRAS/FS, Part 133, Cheese; Part 582...."
3,"Acetic anhydride- MISC, REG, In modifying food...",Acetic anhydride,"MISC, REG, In modifying food starch -172.892"
4,"Acetone- SOLV, REG, 30 ppm - As residual solve...",Acetone,"SOLV, REG, 30 ppm - As residual solvent in spi..."
...,...,...,...
1433,"Zinc methionine sulfate- NUTR, REG, Tablets - ...",Zinc methionine sulfate,"NUTR, REG, Tablets - See 172.399 for specs"
1434,"Zinc oxide- NUTR/DS, GRAS, GMP - 182.5991, 182...",Zinc oxide,"NUTR/DS, GRAS, GMP - 182.5991, 182.8991"
1435,"Zinc stearate- NUTR/DS, GRAS, GMP, Free from c...",Zinc stearate,"NUTR/DS, GRAS, GMP, Free from chick edema fact..."
1436,"Zinc sulfate- NUTR/DS, GRAS, GMP - 182.5997, 1...",Zinc sulfate,"NUTR/DS, GRAS, GMP - 182.5997, 182.8997; GRAS,..."


In [117]:
# look up against reference data dictionary tables

fda_misc_abbreviations_df = pd.read_csv('fda_miscellaneous_abbreviations.csv')
fda_regulatory_status_df = pd.read_csv('fda_regulatory_status.csv')
fda_technical_effects_df = pd.read_csv('fda_technical_effects.csv')

for df in [fda_misc_abbreviations_df, fda_regulatory_status_df, fda_technical_effects_df]:
    
    print(df.head())

  Type Kind Effect or Use of Additive  Unnamed: 2  Unnamed: 3  Unnamed: 4  \
0    &                            and         NaN         NaN         NaN   
1  amt                         amount         NaN         NaN         NaN   
2  art                   artificially         NaN         NaN         NaN   
3  avg                        average         NaN         NaN         NaN   
4   ca           about, approximately         NaN         NaN         NaN   

   Unnamed: 5  Unnamed: 6  
0         NaN         NaN  
1         NaN         NaN  
2         NaN         NaN  
3         NaN         NaN  
4         NaN         NaN  
      Type                   Kind, Effect, or Use of Additive  Unnamed: 2  \
0      BAN  Substances banned prior to the Food Additives ...         NaN   
1       FS  Substances permitted as optional ingredient in...         NaN   
2     GRAS  Generally recognized as safe. Substances in th...         NaN   
3  GRAS/FS  Substances generally recognized as safe in foo...

In [118]:
# enrich, clean and combine data dictionary tables

fda_misc_abbreviations_df = fda_misc_abbreviations_df.iloc[:,:2]
fda_regulatory_status_df = fda_regulatory_status_df.iloc[:,:2]
fda_technical_effects_df = fda_technical_effects_df.iloc[:,:2]

fda_misc_abbreviations_df.columns = ['abbreviation', 'fda_status']
fda_regulatory_status_df.columns = ['abbreviation', 'fda_status']
fda_technical_effects_df.columns = ['abbreviation', 'fda_status']



In [119]:
fda_misc_abbreviations_df["source"] = 'FDA Miscellaneous Abbreviations'
fda_regulatory_status_df["source"] = "FDA Regulatory Status"
fda_technical_effects_df["source"] = "FDA Technical Effects"

fda_data_dictionary = pd.concat([fda_misc_abbreviations_df, fda_regulatory_status_df, fda_technical_effects_df])

display(fda_data_dictionary)

Unnamed: 0,abbreviation,fda_status,source
0,&,and,FDA Miscellaneous Abbreviations
1,amt,amount,FDA Miscellaneous Abbreviations
2,art,artificially,FDA Miscellaneous Abbreviations
3,avg,average,FDA Miscellaneous Abbreviations
4,ca,"about, approximately",FDA Miscellaneous Abbreviations
...,...,...,...
30,SP,"Spices, other natural seasonings & flavorings",FDA Technical Effects
31,SP/ADJ,Spray adjuvant,FDA Technical Effects
32,STAB,Stabilizer,FDA Technical Effects
33,SY/FL,Synthetic flavor,FDA Technical Effects


In [120]:
# Save the extracted data to a file
fda_data_dictionary.to_csv('fda_data_dictionary.csv')

print("Data successfully written to 'fda_data_dictionary.csv'")

fda_additives_df.to_csv('fda_additives_reference.csv')

print("Data successfully written to 'fda_additives_reference.csv'")


Data successfully written to 'fda_data_dictionary.csv'
Data successfully written to 'fda_additives_reference.csv'


In [121]:
# enrich additives df with regulatory info

FDA_REGULATION_CODE_REGEX = r'(?<!\w)(GRAS/FS|REG/FS|GRAS|REG|BAN|ILL|FS|PD|PS)(?!\w)'


fda_additives_df["abbreviation"] = fda_additives_df['description'].str.extract(FDA_REGULATION_CODE_REGEX)

display(fda_additives_df)

Unnamed: 0,rawData,additive,description,abbreviation
0,"Acacia(gum arabic)- EMUL/STAB, REG, Used as th...",Acacia(gum arabic),"EMUL/STAB, REG, Used as thickener, emulsifier,...",REG
1,"Acesulfame potassium- NNS, REG, See Regulation...",Acesulfame potassium,"NNS, REG, See Regulation -172.800",REG
2,"Acetic acid- B&N/FEED, GRAS/FS, Part 133, Chee...",Acetic acid,"B&N/FEED, GRAS/FS, Part 133, Cheese; Part 582....",GRAS/FS
3,"Acetic anhydride- MISC, REG, In modifying food...",Acetic anhydride,"MISC, REG, In modifying food starch -172.892",REG
4,"Acetone- SOLV, REG, 30 ppm - As residual solve...",Acetone,"SOLV, REG, 30 ppm - As residual solvent in spi...",REG
...,...,...,...,...
1433,"Zinc methionine sulfate- NUTR, REG, Tablets - ...",Zinc methionine sulfate,"NUTR, REG, Tablets - See 172.399 for specs",REG
1434,"Zinc oxide- NUTR/DS, GRAS, GMP - 182.5991, 182...",Zinc oxide,"NUTR/DS, GRAS, GMP - 182.5991, 182.8991",GRAS
1435,"Zinc stearate- NUTR/DS, GRAS, GMP, Free from c...",Zinc stearate,"NUTR/DS, GRAS, GMP, Free from chick edema fact...",GRAS
1436,"Zinc sulfate- NUTR/DS, GRAS, GMP - 182.5997, 1...",Zinc sulfate,"NUTR/DS, GRAS, GMP - 182.5997, 182.8997; GRAS,...",GRAS


In [122]:
fda_additives_df = fda_additives_df.merge(fda_data_dictionary[["abbreviation", "fda_status"]], how='left', on='abbreviation')

# create basic safety lookup based on reg status

reg_to_risk_score_data = {
    'abbreviation': ['GRAS/FS','REG/FS','GRAS','REG','BAN','ILL''FS','PD','PS'],
    'Column2': [10, 20, 30, 40]
}

safety_score_df = pd.DataFrame()

display(fda_additives_df)





Unnamed: 0,rawData,additive,description,abbreviation,fda_status
0,"Acacia(gum arabic)- EMUL/STAB, REG, Used as th...",Acacia(gum arabic),"EMUL/STAB, REG, Used as thickener, emulsifier,...",REG,Food additives for which a petition has been f...
1,"Acesulfame potassium- NNS, REG, See Regulation...",Acesulfame potassium,"NNS, REG, See Regulation -172.800",REG,Food additives for which a petition has been f...
2,"Acetic acid- B&N/FEED, GRAS/FS, Part 133, Chee...",Acetic acid,"B&N/FEED, GRAS/FS, Part 133, Cheese; Part 582....",GRAS/FS,Substances generally recognized as safe in foo...
3,"Acetic anhydride- MISC, REG, In modifying food...",Acetic anhydride,"MISC, REG, In modifying food starch -172.892",REG,Food additives for which a petition has been f...
4,"Acetone- SOLV, REG, 30 ppm - As residual solve...",Acetone,"SOLV, REG, 30 ppm - As residual solvent in spi...",REG,Food additives for which a petition has been f...
...,...,...,...,...,...
1433,"Zinc methionine sulfate- NUTR, REG, Tablets - ...",Zinc methionine sulfate,"NUTR, REG, Tablets - See 172.399 for specs",REG,Food additives for which a petition has been f...
1434,"Zinc oxide- NUTR/DS, GRAS, GMP - 182.5991, 182...",Zinc oxide,"NUTR/DS, GRAS, GMP - 182.5991, 182.8991",GRAS,Generally recognized as safe. Substances in th...
1435,"Zinc stearate- NUTR/DS, GRAS, GMP, Free from c...",Zinc stearate,"NUTR/DS, GRAS, GMP, Free from chick edema fact...",GRAS,Generally recognized as safe. Substances in th...
1436,"Zinc sulfate- NUTR/DS, GRAS, GMP - 182.5997, 1...",Zinc sulfate,"NUTR/DS, GRAS, GMP - 182.5997, 182.8997; GRAS,...",GRAS,Generally recognized as safe. Substances in th...


In [123]:
## finding potential danger

# look for keywords in desc?
# lookup against effects and regulatory - in an array? 