# finding all 62 links on main page

In [284]:
import requests
import json
import re

# URL of the 62-cookie recipe page
url = "https://food52.com/story/26826-best-cookie-recipes"

response = requests.get(url)
html_content = response.text

external_urls= re.findall(r'"externalUrl":"(https://food52\.com/(?:recipes|blog)/\d{5}-[^"]+)"', html_content)

external_urls = list(set(external_urls))

keep_exceptions = {"25372","26755","14625", "25558"}

def first5digits(url):
  return url.split('/')[-1][:5]

filtered_urls = [
    url for url in external_urls
    if first5digits(url) in keep_exceptions or not (
        first5digits(url).startswith('1') or
        first5digits(url).startswith('2') or
        first5digits(url) == '34397'
    )
]


print("Found", len(filtered_urls), "external URLs:")
for link in filtered_urls:
    print(link)


Found 62 external URLs:
https://food52.com/recipes/82256-chipotle-chocolate-crinkle-cookies
https://food52.com/recipes/82219-chai-spice-snickerdoodle-recipe
https://food52.com/recipes/83254-mm-cookies-recipe
https://food52.com/recipes/85409-blueberry-pecan-oat-thumbprint-cookies
https://food52.com/recipes/39582-spritz-butter-christmas-cookies
https://food52.com/recipes/31624-parisian-macarons
https://food52.com/recipes/82227-caramelized-white-chocolate-cookies-recipe
https://food52.com/recipes/66073-dorie-greenspan-s-do-almost-anything-vanilla-cookie-dough
https://food52.com/recipes/76919-salted-chocolate-buckwheat-cookie
https://food52.com/recipes/80770-buttermilk-chocolate-chip-cookies
https://food52.com/recipes/40302-homemade-thin-mint-inspired-cookies
https://food52.com/recipes/86477-best-apple-pie-cookies-recipe
https://food52.com/recipes/39132-ovenly-s-secretly-vegan-salted-chocolate-chip-cookies
https://food52.com/recipes/82212-almond-biscotti-recipe
https://food52.com/recipes/8



# choosing one URL to inspect ingredients

In [287]:
import requests
from bs4 import BeautifulSoup

url = filtered_urls[0]
print(url)
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

li_tags = soup.find_all("li")
for li in li_tags:
    print(li.get_text(strip=True))

#there should only be 14 ingredients

https://food52.com/recipes/82256-chipotle-chocolate-crinkle-cookies
Schoolhouse
Dansk
ShopNewBest SellersExclusively at Food52Featured MakersKitchenKitchen Utensils & ToolsKitchen Storage & OrganizationCountertop OrganizationFood StorageKitchen LinensCutting BoardsCoffee & Tea AccessoriesKnivesCookwareFrying Pans & SkilletsBaking & Roasting PansDutch Ovens & Stock PotsSauce & Saute PansCookware SetsTabletopDinnerwareGlasswareTable LinensFlatware & Serving UtensilsServewareOutdoor EntertainingHomeOrganization & StorageCleaning & LaundryBath Towels & AccessoriesVases & Decorative ObjectsClocks & LightingCandles & Candle HoldersPlantersShop All
New
Best Sellers
Exclusively at Food52
Featured Makers
KitchenKitchen Utensils & ToolsKitchen Storage & OrganizationCountertop OrganizationFood StorageKitchen LinensCutting BoardsCoffee & Tea AccessoriesKnives
Kitchen Utensils & Tools
Kitchen Storage & Organization
Countertop Organization
Food Storage
Kitchen Linens
Cutting Boards
Coffee & Tea Acce

# getting all ingredients from one URL

In [288]:
import re

first_link= "https://food52.com/recipes/82256-chipotle-chocolate-crinkle-cookies"
response= requests.get(first_link)
html_content= response.text
pattern= re.compile(r'"recipeIngredient":\s*\[([^\]]+)\]', re.DOTALL)
match= pattern.search(html_content)
matches= pattern.findall(html_content)

ingredients_list = []
if match:
  raw_ingredients= match.group(1)
  ingredients_list = re.findall(r'"([^"]+)"', raw_ingredients)

print(json.dumps(ingredients_list, indent=2))


[
  "1/4 cup (31g) all-purpose flour",
  "1/4 teaspoon baking powder",
  "1/4 teaspoon salt",
  "2  large eggs",
  "2/3 cup (133g) granulated sugar",
  "1 1/4 teaspoon pure Mexican vanilla extract",
  "2 tablespoon unsalted butter",
  "5 ounce (142g) extra-bittersweet chocolate, chopped",
  "2 ounce (57g) unsweetened chocolate, chopped",
  "4 ounce (113g) mini chocolate chips or bittersweet chocolate, finely chopped",
  "1 1/2 tablespoon chipotle chile paste (recipe below)",
  "6  medium chipotle chiles (for chipotle paste\u2014see note) (1 ounce, or 28g)",
  "Granulated sugar",
  "Confectioner's sugar"
]


# format ingredients from one URL into json format

### ---- inspection line by line ----

In [289]:
line = ingredients_list[0]
print(line)

## we want to extract the amount unit and ingredient from this
## the ingredient is all-purpose flour
## the amount is 1/4
## unit is cup

pattern = re.compile(
    r'^(?P<amount>(?:\d+\s)?\d+/\d+|\d+)\s+'
    r'(?P<unit>\w+)\s*'
    r'(?:\([^\)]+\)\s*)?'
    r'(?P<ingredient>.+)$'
)


match = pattern.match(line)

if match:
    amount= match.group("amount")
    unit= match.group("unit")
    ingredient = match.group("ingredient")

    print({
        "amount": amount,
        "unit": unit,
        "ingredient": ingredient
    })

1/4 cup (31g) all-purpose flour
{'amount': '1/4', 'unit': 'cup', 'ingredient': 'all-purpose flour'}


### -- loop through all lines --

In [290]:
ingredients_parsed = []
for line in ingredients_list:
    match= pattern.match(line)
    if match:
        amount = match.group("amount").strip()
        unit = match.group("unit").strip()
        ingredient = match.group("ingredient").strip()
        ingredients_parsed.append({
            "amount": amount,
            "unit": unit,
            "ingredient": ingredient
        })
    else:
        ingredients_parsed.append({
            "amount": "",
            "unit": "",
            "ingredient": line.strip()
        })

recipe_json = {
    "recipes": [
        {
            "name": "chipotle chocolate crinkle cookies",
            "ingredients": ingredients_parsed
        }
    ]
}

print(json.dumps(recipe_json, indent=2))


{
  "recipes": [
    {
      "name": "chipotle chocolate crinkle cookies",
      "ingredients": [
        {
          "amount": "1/4",
          "unit": "cup",
          "ingredient": "all-purpose flour"
        },
        {
          "amount": "1/4",
          "unit": "teaspoon",
          "ingredient": "baking powder"
        },
        {
          "amount": "1/4",
          "unit": "teaspoon",
          "ingredient": "salt"
        },
        {
          "amount": "2",
          "unit": "large",
          "ingredient": "eggs"
        },
        {
          "amount": "2/3",
          "unit": "cup",
          "ingredient": "granulated sugar"
        },
        {
          "amount": "1 1/4",
          "unit": "teaspoon",
          "ingredient": "pure Mexican vanilla extract"
        },
        {
          "amount": "2",
          "unit": "tablespoon",
          "ingredient": "unsalted butter"
        },
        {
          "amount": "5",
          "unit": "ounce",
          "ingredient

## -- get unique ingredients from each site --

In [291]:
unique_ingredients = []
seen_ingredients= set()

for ing in recipe_json["recipes"][0]["ingredients"]:
    key = ing["ingredient"].lower().strip()
    if key not in seen_ingredients:
        unique_ingredients.append(ing)
        seen_ingredients.add(key)

recipe_json["recipes"][0]["ingredients"] = unique_ingredients

import json
print(json.dumps(recipe_json, indent=2))

{
  "recipes": [
    {
      "name": "chipotle chocolate crinkle cookies",
      "ingredients": [
        {
          "amount": "1/4",
          "unit": "cup",
          "ingredient": "all-purpose flour"
        },
        {
          "amount": "1/4",
          "unit": "teaspoon",
          "ingredient": "baking powder"
        },
        {
          "amount": "1/4",
          "unit": "teaspoon",
          "ingredient": "salt"
        },
        {
          "amount": "2",
          "unit": "large",
          "ingredient": "eggs"
        },
        {
          "amount": "2/3",
          "unit": "cup",
          "ingredient": "granulated sugar"
        },
        {
          "amount": "1 1/4",
          "unit": "teaspoon",
          "ingredient": "pure Mexican vanilla extract"
        },
        {
          "amount": "2",
          "unit": "tablespoon",
          "ingredient": "unsalted butter"
        },
        {
          "amount": "5",
          "unit": "ounce",
          "ingredient

## -- will do this for all 62 recipes I have --

## create a function for each part

In [292]:

url2 = filtered_urls[1]
print(url2)

import re

def ingredients_from_url(url):
  response = requests.get(url)
  html_content = response.text
  pattern = re.compile(r'"recipeIngredient":\s*\[([^\]]+)\]', re.DOTALL)
  match=pattern.search(html_content)
  matches= pattern.findall(html_content)
  ingredients_list = []
  if match:
    raw_ingredients = match.group(1)
    ingredients_list = re.findall(r'"([^"]+)"', raw_ingredients)
  print(json.dumps(ingredients_list, indent=2))
  return ingredients_list


def format_ingredients_all_lines(ingredients_list):
  ingredients_parsed = []
  for line in ingredients_list:
      match = pattern.match(line)
      if match:
          amount = match.group("amount").strip()
          unit = match.group("unit").strip()
          ingredient = match.group("ingredient").strip()
          ingredients_parsed.append({
              "amount": amount,
              "unit": unit,
              "ingredient": ingredient
          })
      else:
          ingredients_parsed.append({
              "amount": "",
              "unit": "",
              "ingredient": line.strip()
          })

  recipe_json = {
      "recipes": [
          {
              "name": "chipotle chocolate crinkle cookies",
              "ingredients": ingredients_parsed
          }
      ]
  }

  print(json.dumps(recipe_json, indent=2))
  return recipe_json



def unique_ingredients(recipe_json):
  unique_ingredients = []
  seen_ingredients= set()

  for ing in recipe_json["recipes"][0]["ingredients"]:
      key = ing["ingredient"].lower().strip()
      if key not in seen_ingredients:
          unique_ingredients.append(ing)
          seen_ingredients.add(key)

  recipe_json["recipes"][0]["ingredients"] = unique_ingredients
  print(json.dumps(recipe_json, indent=2))
  return recipe_json


url2 = filtered_urls[1]
print("--Here begins the running of function ingredients from url--")
ingredients_list_new = ingredients_from_url(url2)
print("--All the ingredients of one url are captured--")
print("\n")
print("\n")

print("--Here begins the running of function formatting ingredients all lines--")
recipe_json = format_ingredients_all_lines(ingredients_list_new)
print("--all the ingredients of one url are captured into json format--")
print("\n")
print("\n")

print("--Here begins the running of function unique ingredients--")
recipe_json_unique = unique_ingredients(recipe_json)
recipe_json_unique
print("--Here ends the running of function unique ingredients--")
print("\n")
print("\n")



https://food52.com/recipes/82219-chai-spice-snickerdoodle-recipe
--Here begins the running of function ingredients from url--
[
  "1/4 cup (50g) granulated sugar",
  "1/2 teaspoon ginger powder, ground",
  "1 1/2 teaspoon cardamom, freshly ground",
  "1/2 teaspoon cinnamon, freshly ground",
  "1 pinch cloves, freshly ground",
  "1/2 cup (118ml) canola oil",
  "1 cup (200g) granulated sugar",
  "1/4 cup (59ml) pure maple syrup",
  "3 tablespoon whole milk",
  "2 teaspoon pure vanilla extract",
  "2 cup (250g) flour",
  "1 teaspoon baking soda",
  "1/4 teaspoon salt",
  "1/2 teaspoon cinnamon, ground"
]
--All the ingredients of one url are captured--




--Here begins the running of function formatting ingredients all lines--
{
  "recipes": [
    {
      "name": "chipotle chocolate crinkle cookies",
      "ingredients": [
        {
          "amount": "1/4",
          "unit": "cup",
          "ingredient": "granulated sugar"
        },
        {
          "amount": "1/2",
          "unit

In [294]:
sixty_two_recipes_json = {"recipes": []}

for idx, url in enumerate(filtered_urls):
    print(f"Processing {idx+1}/{len(filtered_urls)}: {url}")

    ingredients_list_new = ingredients_from_url(url)

    recipe_json = format_ingredients_all_lines(ingredients_list_new)

    recipe_json_unique = unique_ingredients(recipe_json)

    recipe_name = url.split("/")[-1].split("-", 1)[-1].replace("-", " ")
    recipe_json_unique["recipes"][0]["name"] = recipe_name

    sixty_two_recipes_json["recipes"].append(recipe_json_unique["recipes"][0])
    print(f"Finished: {recipe_name}\n")

import json
with open("62_cookies_recipes.json", "w") as f:
    json.dump(sixty_two_recipes_json, f, indent=2)

print(f"Total recipes saved: {len(sixty_two_recipes_json['recipes'])}")



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        {
          "amount": "1/2",
          "unit": "cup",
          "ingredient": "whipping cream"
        },
        {
          "amount": "1/4",
          "unit": "cup",
          "ingredient": "unsalted butter, room temperature"
        },
        {
          "amount": "1",
          "unit": "teaspoon",
          "ingredient": "vanilla extract"
        },
        {
          "amount": "1/4",
          "unit": "teaspoon",
          "ingredient": "salt"
        },
        {
          "amount": "250",
          "unit": "gram",
          "ingredient": "finely chopped dark chocolate"
        }
      ]
    }
  ]
}
{
  "recipes": [
    {
      "name": "chipotle chocolate crinkle cookies",
      "ingredients": [
        {
          "amount": "1/2",
          "unit": "cup",
          "ingredient": "unsalted butter, room temperature"
        },
        {
          "amount": "1/4",
          "unit": "cup",
          "ingredie