In [1]:
import time
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Selenium setup with headless Chrome






In [38]:
def get_nutrition_value(driver,element):
    """Return the number held in the first child node of ``element``.

    A small script is run through ``driver.execute_script`` to read
    ``element.firstChild.textContent``; the stripped text is parsed as a float.

    Args:
        driver: Object exposing ``execute_script(script, *args)`` (a Selenium
            WebDriver in this notebook).
        element: The element passed to the script as ``arguments[0]``.

    Returns:
        The parsed float, or ``0.0`` when the script fails or the text is
        missing or not numeric.
    """
    try:
        outer_text = driver.execute_script(
            "return arguments[0].firstChild.textContent;", element
        )
        return float(outer_text.strip())
    except Exception:
        # Deliberately best-effort: any lookup/parsing problem maps to 0.0.
        # `Exception` (rather than a bare `except`) lets KeyboardInterrupt and
        # SystemExit through, so a long scraping run can still be interrupted.
        return 0.0

In [None]:
# Names of ingredients whose scrape raised an exception; get_ingredient_details adds to this set.
error_ingredients = set()

In [50]:

def get_ingredient_details(ingredient, chromedriver_path='/opt/homebrew/bin/chromedriver', timeout=10):
    """Scrape nutrition facts for one ingredient from its nutritionix.com food page.

    A fresh headless Chrome session is started for the call and always shut
    down again, whether or not the scrape succeeds.

    Args:
        ingredient: Ingredient name; it is percent-encoded into the page URL.
        chromedriver_path: Path to the ChromeDriver executable.
        timeout: Seconds each explicit wait may block before giving up.

    Returns:
        A dict that, on success, holds the keys ``calories``, ``carbs``,
        ``fats``, ``protein``, ``sugar``, ``category``, ``image``,
        ``store_link`` and ``unit``. If any step raises, the exception is
        printed, ``ingredient`` is added to the module-level
        ``error_ingredients`` set, and whatever was collected up to that point
        (possibly an empty dict) is returned.
    """
    # Bound before the try block so the final `return` is valid even when the
    # browser itself fails to start.
    details = {}
    driver = None
    try:
        options = Options()
        # The `headless` property was removed from Options in newer Selenium 4
        # releases; passing the Chrome switch works on old and new versions.
        options.add_argument('--headless')
        service = Service(chromedriver_path)
        driver = webdriver.Chrome(service=service, options=options)

        # Percent-encode the whole name (spaces, '&', '/', non-ASCII, ...).
        url = "https://www.nutritionix.com/food/" + quote(ingredient, safe='') + '/'
        print(url)
        driver.get(url)

        # Explicit waits only: combining them with an implicit wait makes the
        # effective timeout unpredictable.
        wait = WebDriverWait(driver, timeout)

        # Accept the GDPR consent prompt.
        wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, "button.btn.btn-default.btn-xs[ng-click*='gdprCompliance.respond(true)']"))
        ).click()

        # The calorie figure doubles as the signal that the nutrition label has rendered.
        calorie_element = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".nf-calories .nf-pr"))
        )
        details['calories'] = float(calorie_element.text.strip())

        for category,prop in {'carbs':'carbohydrateContent','fats':'fatContent','protein':'proteinContent','sugar':'sugarContent'}.items():
            element = wait.until(
                EC.presence_of_element_located((By.CSS_SELECTOR, f"span[itemprop={prop}]"))
            )
            details[category] = get_nutrition_value(driver,element)

        category_element = wait.until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='rounded-box ng-scope'][@ng-if='vm.category']/div[@class='box-content ng-binding']"))
        )
        # Keep the first word of the last ' > '-separated segment of the category text.
        details['category'] = category_element.text.strip().split(' > ')[-1].split()[0]

        # The image is optional: fall back to the string 'None' when it cannot be found.
        try:
            image_element = wait.until(
                EC.presence_of_element_located((By.XPATH, "//img[@itemprop='image']"))
            )
            details['image'] = image_element.get_attribute('src')
        except Exception:
            details['image'] = 'None'

        amazon_element = wait.until(
            EC.presence_of_element_located((By.XPATH, "//a[@class='amazon']"))
        )
        details['store_link'] = amazon_element.get_attribute('href')

        unit_element = wait.until(
            EC.presence_of_element_located((By.XPATH, "//div[@class='nf-serving-unit-name ']"))
        )
        unit = driver.execute_script(
            "return arguments[0].firstChild.textContent;", unit_element
        )
        # First whitespace-separated token, cut at the first comma.
        details['unit'] = unit.strip().split()[0].split(',')[0]

    except Exception as e:
        print(e)
        error_ingredients.add(ingredient)
    finally:
        # quit() ends the whole session (browser and driver process), so a
        # failed scrape no longer leaves a Chrome instance running.
        if driver is not None:
            driver.quit()

    return details





In [34]:
import pandas as pd

# Load the recipe ingredient table; its 'name' column supplies the ingredient names that are scraped.
df = pd.read_csv('recipe_ingredients.csv')
df['name']

0                             ghee
1                      cumin seeds
2                   cinnamon stick
3              ginger garlic paste
4                            water
                   ...            
9204       dried Italian seasoning
9205                    sour cream
9206                Ranch dressing
9207    shredded mozzarella cheese
9208       shredded Cheddar cheese
Name: name, Length: 9209, dtype: object

In [None]:
# Scrape results keyed by ingredient name; filled in by the scraping loop.
details = {}

In [51]:


# Scrape every ingredient name in the table. Names that already have an entry
# in `details` or are recorded in `error_ingredients` are skipped, so re-running
# this cell continues where the previous run stopped.
# enumerate() supplies the position, replacing a hand-maintained counter.
for i, ingredient in enumerate(df['name']):
    if ingredient not in details and ingredient not in error_ingredients:
        details[ingredient] = get_ingredient_details(ingredient)
    # Progress marker every 500 rows of the input table.
    if i % 500 == 0:
        print(i , ' ingredients done')

0  ingredients done
https://www.nutritionix.com/food/whole%20milk%20yogurt/
https://www.nutritionix.com/food/finely%20grated%20ginger%20root/
500  ingredients done
https://www.nutritionix.com/food/saffron/
Message: 
Stacktrace:
0   chromedriver                        0x0000000102ac6004 chromedriver + 4169732
1   chromedriver                        0x0000000102abdff8 chromedriver + 4136952
2   chromedriver                        0x0000000102713500 chromedriver + 292096
3   chromedriver                        0x00000001027587a0 chromedriver + 575392
4   chromedriver                        0x0000000102793818 chromedriver + 817176
5   chromedriver                        0x000000010274c5e8 chromedriver + 525800
6   chromedriver                        0x000000010274d4b8 chromedriver + 529592
7   chromedriver                        0x0000000102a8c334 chromedriver + 3932980
8   chromedriver                        0x0000000102a90970 chromedriver + 3950960
9   chromedriver                       

In [52]:
# Number of ingredient names that have an entry in `details`.
len(details)

1941

In [54]:
# Number of ingredient names whose scrape raised an exception.
len(error_ingredients)

332

In [59]:
# Spot-check one scraped entry.
details['ghee']

{'calories': 112.0,
 'carbs': 0.0,
 'fats': 13.0,
 'protein': 0.0,
 'sugar': 0.0,
 'category': 'Butters',
 'image': 'https://nix-tag-images.s3.amazonaws.com/2640_thumb.jpg',
 'store_link': 'https://www.amazon.com/gp/search?ie=UTF8&camp=1789&creative=9325&index=grocery&keywords=ghee&linkCode=ur2&tag=nutritionix07-20',
 'unit': 'tbsp'}

In [63]:
import csv

def ingredients_to_csv(ingredients: dict , ingredients_filename:str):
    """Write the scraped ingredient details to a CSV file.

    Args:
        ingredients: Mapping of ingredient name to its details dict.
        ingredients_filename: Path of the CSV file to create or overwrite.

    The header is ``ingredient`` followed by the nine detail fields. Each value
    is looked up by field name, so a dict that is incomplete or was built in a
    different key order still lands under the right header; a missing field is
    written as an empty cell.
    """
    fields = ('calories','carbs','fats','protein','sugar','category','image','store_link','unit')
    # newline='' is what the csv module expects for files it writes; without it
    # extra blank lines can appear on platforms that translate '\n'.
    with open(ingredients_filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(('ingredient',) + fields)
        for ing, values in ingredients.items():
            writer.writerow([ing] + [values.get(field, '') for field in fields])
               


# Write the scraped results to disk.
# NOTE(review): this writes 'ingredients_list.csv' (underscore), while the later cleaning cells
# read and write 'ingredients list.csv' (space) — confirm which file name is intended.
ingredients_to_csv(details,'ingredients_list.csv')

In [66]:
# Entry count again, after writing the CSV.
len(details)

1941

In [8]:
# NOTE(review): `ingredient` is not assigned in this cell; it appears to be whatever value the
# scraping loop's variable last held — confirm this cell is still needed.
details[ingredient]

{'calories': 112,
 'carbs': 0.0,
 'fats': 13.0,
 'protein': 0.0,
 'sugar': 0.0,
 'category': 'Butters',
 'image': 'https://nix-tag-images.s3.amazonaws.com/2640_thumb.jpg',
 'store_link': 'https://www.amazon.com/gp/search?ie=UTF8&camp=1789&creative=9325&index=grocery&keywords=ghee&linkCode=ur2&tag=nutritionix07-20'}

In [2]:
import pandas as pd
# Load the ingredient table for cleaning.
# NOTE(review): the path contains a space ('ingredients list.csv'), while the export cell writes
# 'ingredients_list.csv' — confirm the intended file.
# NOTE(review): the 'Unnamed: 0' column in the output is presumably a row index saved by an
# earlier to_csv call — verify.
df = pd.read_csv('ingredients list.csv')
df.head()

Unnamed: 0,ingredient,calories,carbs,fats,protein,sugar,category,image,store_link,unit
0,ghee,112.0,0.0,13.0,0.0,0.0,Butters,https://nix-tag-images.s3.amazonaws.com/2640_t...,https://www.amazon.com/gp/search?ie=UTF8&camp=...,tbsp
1,cumin seeds,7.9,0.9,0.5,0.4,0.1,Seeds,https://nix-tag-images.s3.amazonaws.com/2180_t...,https://www.amazon.com/gp/search?ie=UTF8&camp=...,tsp
2,cinnamon stick,3.2,1.0,0.0,0.1,0.0,Spices,https://nix-tag-images.s3.amazonaws.com/4778_t...,https://www.amazon.com/gp/search?ie=UTF8&camp=...,stick
3,ginger garlic paste,61.0,3.2,5.4,0.5,0.2,Seasonings,https://d2eawub7utcl6.cloudfront.net/images/ni...,https://www.amazon.com/gp/search?ie=UTF8&camp=...,tbsp
4,water,0.0,0.0,0.0,0.0,0.0,Water,https://nix-tag-images.s3.amazonaws.com/4483_t...,https://www.amazon.com/gp/search?ie=UTF8&camp=...,cup


In [7]:
# Label rows that have no category as 'Unknown'.
df['category'] = df['category'].fillna('Unknown')

In [16]:
# Count the missing values in each column.
# NOTE(review): the execution counts (In [16] here, In [13] and In [15] on the fill cells that
# follow) suggest this ran after those fills — confirm the intended cell order.
df.isna().sum()

ingredient    0
calories      0
carbs         0
fats          0
protein       0
sugar         0
category      0
image         0
store_link    0
unit          0
dtype: int64

In [13]:
# Impute missing nutrient values: the column median for calories, carbs, fats
# and protein; a missing sugar value is set to 0.
for nutrient in ('calories', 'carbs', 'fats', 'protein'):
    nutrient_median = df[nutrient].median()
    df[nutrient] = df[nutrient].fillna(nutrient_median)
df['sugar'] = df['sugar'].fillna(0)

In [15]:
# Text columns: replace missing values with an empty string.
for text_column in ('image', 'store_link', 'unit'):
    df[text_column] = df[text_column].fillna('')

In [17]:
# Persist the cleaned table. index=False keeps the row index out of the file,
# so reading it back does not gain an extra 'Unnamed: 0' column.
df.to_csv('ingredients list.csv', index=False)