In [16]:
import requests
import re
from urllib.parse import urlparse


url = "https://www.olivemagazine.com/recipes/collection/best-cookies-recipes/"
response = requests.get(url)
html = response.text

# Find all /recipes/ links
all_links = re.findall(r'<a\s+href="(https://www\.olivemagazine\.com/recipes/[^"]+|/recipes/[^"]+)"', html)
all_links = ["https://www.olivemagazine.com" + link if link.startswith("/") else link for link in all_links]

all_links = list(set(all_links))


exclude_keywords = [
    'kitchen-therapy-projects',
    'best-nutella-recipes',
    'best-peanut-butter-recipes',
    'best-ever-chocolate-recipes','meringues',
    'best-ever-recipes-using-white-chocolate',
    'mindful-therapy-projects','mindful-kitchen-therapy-projects','10-baking-recipes-that-will-transform-your-favourite-chocolate-bars-biscuits-and-spreads'
]


filtered_urls = [link for link in all_links if not any(f'/{kw}/' in link for kw in exclude_keywords)]

print(f"Found {len(filtered_urls)} recipe links:")
for link in filtered_urls:
    print(link)

#triple chocolate cookies --> mindful therapy projects


Found 37 recipe links:
https://www.olivemagazine.com/recipes/baking-and-desserts/biscoff-cookies/
https://www.olivemagazine.com/recipes/baking-and-desserts/apricot-and-white-chocolate-cookies/
https://www.olivemagazine.com/recipes/baking-and-desserts/caramelised-white-chocolate-florentines/
https://www.olivemagazine.com/recipes/baking-and-desserts/tahini-cookies/
https://www.olivemagazine.com/recipes/family/chunky-chocolate-chip-cookies/
https://www.olivemagazine.com/recipes/baking-and-desserts/easter-biscuits/
https://www.olivemagazine.com/recipes/baking-and-desserts/double-chocolate-toblerone-cookies/
https://www.olivemagazine.com/recipes/baking-and-desserts/miso-chocolate-cookies/
https://www.olivemagazine.com/recipes/family/maple-pecan-cookies/
https://www.olivemagazine.com/recipes/baking-and-desserts/oat-and-fig-cookies/
https://www.olivemagazine.com/recipes/baking-and-desserts/cereal-cookies/
https://www.olivemagazine.com/recipes/baking-and-desserts/sugar-cookies/
https://www.oli

In [17]:
import re
import json

first= "https://www.olivemagazine.com/recipes/family/maple-pecan-cookies/"
response= requests.get(first)
html_content= response.text
pattern= re.compile(r'"recipeIngredient":\s*\[([^\]]+)\]', re.DOTALL)
match= pattern.search(html_content)
matches= pattern.findall(html_content)

ingredients_list = []
if match:
  raw_ingredients= match.group(1)
  ingredients_list = re.findall(r'"([^"]+)"', raw_ingredients)

print(json.dumps(ingredients_list, indent=2, ensure_ascii=False))

[
  "150g salted butter softened",
  "125g soft light brown sugar",
  "90ml maple syrup",
  "1 egg lightly beaten",
  "a pinch mixed spice",
  "280g plain flour",
  "½ tsp baking powder",
  "100g pecan halves 12 left whole, the rest chopped"
]


In [18]:
line = ingredients_list[0]
print(line)

## we want to extract the amount unit and ingredient from this
## the ingredient is all-purpose flour
## the amount is 1/4
## unit is cup

pattern = re.compile(r'^(?P<amount>\d+(?:\.\d+)?)\s*(?P<unit>[a-zA-Z]+)?\s*(?P<ingredient>.+)$')


match = pattern.match(line)

if match:
    amount= match.group("amount")
    unit= match.group("unit")
    ingredient = match.group("ingredient")

    print({
        "amount": amount,
        "unit": unit,
        "ingredient": ingredient
    })

150g salted butter softened
{'amount': '150', 'unit': 'g', 'ingredient': 'salted butter softened'}


In [19]:

ingredients_parsed = []
for line in ingredients_list:
    match= pattern.match(line)
    if match:
        amount = match.group("amount").strip()
        unit = match.group("unit").strip()
        ingredient = match.group("ingredient").strip()
        ingredients_parsed.append({
            "amount": amount,
            "unit": unit,
            "ingredient": ingredient
        })
    else:
        ingredients_parsed.append({
            "amount": "",
            "unit": "",
            "ingredient": line.strip()
        })

recipe_json = {
    "recipes": [
        {
            "name": "Maple pecan cookies",
            "ingredients": ingredients_parsed
        }
    ]
}

print(json.dumps(recipe_json, indent=2))


{
  "recipes": [
    {
      "name": "Maple pecan cookies",
      "ingredients": [
        {
          "amount": "150",
          "unit": "g",
          "ingredient": "salted butter softened"
        },
        {
          "amount": "125",
          "unit": "g",
          "ingredient": "soft light brown sugar"
        },
        {
          "amount": "90",
          "unit": "ml",
          "ingredient": "maple syrup"
        },
        {
          "amount": "1",
          "unit": "egg",
          "ingredient": "lightly beaten"
        },
        {
          "amount": "",
          "unit": "",
          "ingredient": "a pinch mixed spice"
        },
        {
          "amount": "280",
          "unit": "g",
          "ingredient": "plain flour"
        },
        {
          "amount": "",
          "unit": "",
          "ingredient": "\u00bd tsp baking powder"
        },
        {
          "amount": "100",
          "unit": "g",
          "ingredient": "pecan halves 12 left whole, the

In [20]:
unique_ingredients = []
seen_ingredients= set()

for ing in recipe_json["recipes"][0]["ingredients"]:
    key = ing["ingredient"].lower().strip()
    if key not in seen_ingredients:
        unique_ingredients.append(ing)
        seen_ingredients.add(key)

recipe_json["recipes"][0]["ingredients"] = unique_ingredients

import json
print(json.dumps(recipe_json, indent=2))

{
  "recipes": [
    {
      "name": "Maple pecan cookies",
      "ingredients": [
        {
          "amount": "150",
          "unit": "g",
          "ingredient": "salted butter softened"
        },
        {
          "amount": "125",
          "unit": "g",
          "ingredient": "soft light brown sugar"
        },
        {
          "amount": "90",
          "unit": "ml",
          "ingredient": "maple syrup"
        },
        {
          "amount": "1",
          "unit": "egg",
          "ingredient": "lightly beaten"
        },
        {
          "amount": "",
          "unit": "",
          "ingredient": "a pinch mixed spice"
        },
        {
          "amount": "280",
          "unit": "g",
          "ingredient": "plain flour"
        },
        {
          "amount": "",
          "unit": "",
          "ingredient": "\u00bd tsp baking powder"
        },
        {
          "amount": "100",
          "unit": "g",
          "ingredient": "pecan halves 12 left whole, the

In [21]:

# url2 = filtered_urls[1]
# print(url2)

import re

def get_recipe_name(url):
    response = requests.get(url)
    html_content = response.text
    name_pattern = re.compile(r'"name":\s*"([^"]+)"')
    match = name_pattern.search(html_content)
    if match:
        return match.group(1)
    return "Unknown Recipe Title"

def ingredients_from_url(url):
  response = requests.get(url)
  html_content = response.text
  pattern = re.compile(r'"recipeIngredient":\s*\[([^\]]+)\]', re.DOTALL)
  match=pattern.search(html_content)
  matches= pattern.findall(html_content)
  ingredients_list = []
  if match:
    raw_ingredients = match.group(1)
    ingredients_list = re.findall(r'"([^"]+)"', raw_ingredients)
  print(json.dumps(ingredients_list, indent=2))
  return ingredients_list


def format_ingredients_all_lines(ingredients_list,url):
  recipe_name = get_recipe_name(url)
  ingredients_parsed = []
  for line in ingredients_list:
      match = pattern.match(line)
      if match:
          amount = match.group("amount").strip() if match.group("amount") else ""
          unit = match.group("unit").strip() if match.group("unit") else ""
          ingredient = match.group("ingredient").strip()
          ingredients_parsed.append({
              "amount": amount,
              "unit": unit,
              "ingredient": ingredient
          })
      else:
          ingredients_parsed.append({
              "amount": "",
              "unit": "",
              "ingredient": line.strip()
          })

  recipe_json = {
      "recipes": [
          {
              "name": recipe_name,
              "ingredients": ingredients_parsed
          }
      ]
  }

  print(json.dumps(recipe_json, indent=2))
  return recipe_json



def unique_ingredients(recipe_json):
  unique_ingredients = []
  seen_ingredients= set()

  for ing in recipe_json["recipes"][0]["ingredients"]:
      key = ing["ingredient"].lower().strip()
      if key not in seen_ingredients:
          unique_ingredients.append(ing)
          seen_ingredients.add(key)

  recipe_json["recipes"][0]["ingredients"] = unique_ingredients
  print(json.dumps(recipe_json, indent=2))
  return recipe_json


url2 = filtered_urls[1]
print("--Here begins the running of function ingredients from url--")
ingredients_list_new = ingredients_from_url(url2)
print("--All the ingredients of one url are captured--")
print("\n")
print("\n")

print("--Here begins the running of function formatting ingredients all lines--")
recipe_json = format_ingredients_all_lines(ingredients_list_new, url2)
print("--all the ingredients of one url are captured into json format--")
print("\n")
print("\n")

print("--Here begins the running of function unique ingredients--")
recipe_json_unique = unique_ingredients(recipe_json)
recipe_json_unique
print("--Here ends the running of function unique ingredients--")
print("\n")
print("\n")



--Here begins the running of function ingredients from url--
[
  "150g butter at room temperature",
  "100g golden caster sugar",
  "50g light muscovado sugar",
  "1 egg",
  "1 tsp vanilla extract",
  "1 tbsp milk",
  "220g self-raising flour",
  "150g white chocolate roughly chopped",
  "100g dried apricots snipped into pieces"
]
--All the ingredients of one url are captured--




--Here begins the running of function formatting ingredients all lines--
{
  "recipes": [
    {
      "name": "Apricot and white chocolate cookies",
      "ingredients": [
        {
          "amount": "150",
          "unit": "g",
          "ingredient": "butter at room temperature"
        },
        {
          "amount": "100",
          "unit": "g",
          "ingredient": "golden caster sugar"
        },
        {
          "amount": "50",
          "unit": "g",
          "ingredient": "light muscovado sugar"
        },
        {
          "amount": "1",
          "unit": "eg",
          "ingredient": "

In [22]:
thirty_seven_recipes_json = {"recipes": []}

for idx, url in enumerate(filtered_urls):
    print(f"Processing {idx+1}/{len(filtered_urls)}: {url}")

    ingredients_list_new = ingredients_from_url(url)

    recipe_json = format_ingredients_all_lines(ingredients_list_new,url)

    recipe_json_unique = unique_ingredients(recipe_json)

    thirty_seven_recipes_json["recipes"].append(recipe_json_unique["recipes"][0])
    print(f"Finished: {recipe_name}\n")

import json
with open("37_cookies_recipes.json", "w") as f:
    json.dump(thirty_seven_recipes_json, f, indent=2)

print(f"Total recipes saved: {len(thirty_seven_recipes_json['recipes'])}")



Processing 1/37: https://www.olivemagazine.com/recipes/baking-and-desserts/biscoff-cookies/
[
  "12 tbsp Biscoff spread",
  "125g salted butter softened",
  "100g caster sugar",
  "100g soft light brown sugar",
  "1 tsp vanilla bean paste",
  "1 medium egg",
  "225g plain flour",
  "1 tbsp cornflour",
  "\u00bd tsp baking powder",
  "100g Lotus biscuits finely chopped"
]
{
  "recipes": [
    {
      "name": "Biscoff cookies",
      "ingredients": [
        {
          "amount": "12",
          "unit": "tbsp",
          "ingredient": "Biscoff spread"
        },
        {
          "amount": "125",
          "unit": "g",
          "ingredient": "salted butter softened"
        },
        {
          "amount": "100",
          "unit": "g",
          "ingredient": "caster sugar"
        },
        {
          "amount": "100",
          "unit": "g",
          "ingredient": "soft light brown sugar"
        },
        {
          "amount": "1",
          "unit": "tsp",
          "ingredient":