In [None]:
import os
import random
import re
import time
from collections import deque
from types import SimpleNamespace
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from langchain_groq import ChatGroq

In [None]:
def _list_text_after_header(header, max_elements=20):
    """Return the text of the list items that follow ``header``.

    Walks at most ``max_elements`` tags after ``header`` in document order.
    A ``<ul>``/``<ol>`` ends the walk and contributes its whole text; loose
    ``<li>`` tags met on the way are collected individually. Returns an
    empty string when nothing list-like is found.
    """
    pieces = []
    node = header.find_next()
    for _ in range(max_elements):
        if node is None:
            break
        if node.name in ('ul', 'ol'):
            pieces.append(node.get_text(separator=' ', strip=True))
            break
        if node.name == 'li':
            pieces.append(node.get_text(separator=' ', strip=True))
        node = node.find_next()
    return " ".join(pieces)


def extract_ingredients_from_html(html_content, recipe_name=None):
    """Extract a recipe's ingredient list from raw HTML using the LLM.

    The HTML is first narrowed down to text that is likely to hold the
    ingredients (elements whose class or id mentions "ingredient", and lists
    following a heading that mentions it). If nothing matches, the page title
    and the first 4000 characters of page text are used instead.

    Args:
        html_content: HTML markup of the recipe page.
        recipe_name: Optional recipe name added to the prompt for context.

    Returns:
        The text content of the LLM response, or a string starting with
        "Error extracting ingredients: " if the LLM call fails.

    Note:
        Relies on a module-level ``llm`` object exposing ``invoke(prompt)``;
        it is not defined in this function (assumed to be created elsewhere).
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Drop script and style blocks so their contents never reach the prompt
    for element in soup(['script', 'style']):
        element.decompose()

    # 'ingredient' is a substring of 'ingredients', so a single keyword covers
    # both spellings; checking both would collect every section twice.
    keyword = 'ingredient'

    def mentions_keyword(value):
        return bool(value) and keyword in value.lower()

    sections = []

    # Elements with ingredient-related classes or IDs.
    # A separator keeps adjacent list items from being fused together.
    for element in soup.find_all(class_=mentions_keyword):
        sections.append(element.get_text(separator=' ', strip=True))

    for element in soup.find_all(id=mentions_keyword):
        sections.append(element.get_text(separator=' ', strip=True))

    # Lists that follow a heading mentioning ingredients
    for header in soup.find_all(['h1', 'h2', 'h3', 'h4']):
        if keyword in header.get_text().lower():
            sections.append(_list_text_after_header(header))

    # Remove empty and exactly repeated sections while keeping document order
    sections = list(dict.fromkeys(text for text in sections if text))

    # Compile the content to send to the LLM
    if sections:
        content_for_llm = "\n".join(sections)
    else:
        # Fallback: use page title and first part of the content.
        # .string is None for an empty <title> or one with nested markup.
        title = (soup.title.string if soup.title else None) or "Recipe"
        content_for_llm = f"{title}\n\n{soup.get_text()[:4000]}"

    # Create prompt for the LLM
    recipe_context = f" for {recipe_name}" if recipe_name else ""
    prompt = f"""
    Extract only the ingredients list{recipe_context} from the following text.
    Format each ingredient on a new line with quantities. Be precise and include only actual ingredients.
    Do not include cooking instructions, equipment, or other non-ingredient information.
    
    Text:
    {content_for_llm}
    """

    # Get ingredients from LLM; failures are reported in the returned string
    # so that callers processing several pages can carry on.
    try:
        response = llm.invoke(prompt)
        return response.content
    except Exception as e:
        return f"Error extracting ingredients: {e}"


In [None]:
def get_links_from_page(html_content, current_url):
    """Return every HTTP(S) link found in the page, as absolute URLs.

    Each ``<a href=...>`` is resolved against ``current_url``; results whose
    scheme is not http/https (javascript:, mailto:, ...) are dropped.
    Document order is kept and duplicates are not removed.
    """
    document = BeautifulSoup(html_content, 'html.parser')

    # Resolve relative hrefs against the page they were found on
    resolved = (
        urljoin(current_url, anchor['href'])
        for anchor in document.find_all('a', href=True)
    )

    return [
        target
        for target in resolved
        if target.startswith(('http://', 'https://'))
    ]

In [None]:
def crawl(website_url, recipe_query=None, max_pages=20, max_recipes=10, delay=1):
    """Crawl a website and collect URLs of pages that look like recipes.

    Starting from ``website_url``, pages are fetched one at a time and their
    same-domain links are queued; links that look like recipe URLs are put at
    the front of the queue. The crawl stops when the queue is empty, when
    ``max_pages`` pages have been visited, or when ``max_recipes`` recipe URLs
    have been collected.

    Args:
        website_url: URL to start crawling from; its host defines the domain
            the crawl is restricted to.
        recipe_query: Optional recipe description. When given, a recipe-like
            page is kept only if ``page_matches_recipe_query`` accepts it.
        max_pages: Maximum number of pages to visit.
        max_recipes: Stop once this many recipe URLs have been found.
        delay: Seconds to sleep before each request.

    Returns:
        List of recipe page URLs in the order they were found.
    """
    print(f"Starting crawl of {website_url}")

    recipe_urls = []
    visited_urls = set()

    # The URL helpers take a leading ``self`` argument and is_valid_url reads
    # ``domain`` and ``visited_urls`` from it. This namespace supplies that
    # state so the helpers can be called as plain functions.
    state = SimpleNamespace(
        domain=urlparse(website_url).netloc,
        visited_urls=visited_urls,
    )

    queue = deque([website_url])
    # Every URL ever queued, for O(1) duplicate checks
    enqueued = {website_url}
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    pages_visited = 0

    while queue and pages_visited < max_pages and len(recipe_urls) < max_recipes:
        current_url = queue.popleft()

        if current_url in visited_urls:
            continue

        visited_urls.add(current_url)
        pages_visited += 1

        print(f"Visiting page {pages_visited}/{max_pages}: {current_url}")
        try:
            # Add a delay to be respectful to the server
            time.sleep(delay)

            # Fetch the page
            response = requests.get(current_url, headers=headers, timeout=10)
            response.raise_for_status()

            # Get links from the page
            links = get_links_from_page(response.text, current_url)

            # Check if current page is a recipe
            if is_likely_recipe_url(state, current_url):
                if not recipe_query:
                    # Without a specific query, add all likely recipe pages
                    recipe_urls.append(current_url)
                    print(f"Found recipe: {current_url}")
                elif page_matches_recipe_query(state, response.text, recipe_query):
                    # The LLM judged the page to match the query
                    recipe_urls.append(current_url)
                    print(f"Found matching recipe: {current_url}")

            # Add new links to the queue
            for link in links:
                if link in enqueued or not is_valid_url(state, link):
                    continue
                enqueued.add(link)
                if is_likely_recipe_url(state, link):
                    # Prioritize links that look like recipes
                    queue.appendleft(link)
                else:
                    queue.append(link)

        except Exception as e:
            # Best-effort crawl: report the failure and move on to the next URL
            print(f"Error processing {current_url}: {e}")

    print(f"Crawl completed. Visited {len(visited_urls)} pages, found {len(recipe_urls)} recipes.")
    return recipe_urls

In [None]:
def find_and_extract_recipe(website_url, recipe_query=None, max_pages=15):
    """Find recipe pages on a website and extract their ingredients.

    Args:
        website_url: URL the crawl starts from.
        recipe_query: Optional recipe description; passed to ``crawl`` to
            filter pages and to the ingredient extractor as the recipe name.
        max_pages: Accepted for interface compatibility; it is not forwarded
            to ``crawl``.

    Returns:
        The string "No recipe pages found on this website." when the crawl
        finds nothing. Otherwise a list with one dict per processed page
        (at most three), each holding 'url', 'title' and 'ingredients'.
        Pages that fail to download are reported and left out of the list.
    """
    # Crawl the website to find recipe pages
    recipe_urls = crawl(website_url, recipe_query)

    if not recipe_urls:
        return "No recipe pages found on this website."

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
    results = []

    for url in recipe_urls[:3]:  # Limit to first 3 recipes for testing
        try:
            print(f"\nExtracting ingredients from: {url}")
            # Fetch the page
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()

            # Get page title for recipe name.
            # .string is None for an empty <title> or one with nested markup.
            soup = BeautifulSoup(response.text, 'html.parser')
            raw_title = soup.title.string if soup.title else None
            title = (raw_title or "").strip() or "Unknown Recipe"

            # Extract ingredients
            ingredients = extract_ingredients_from_html(response.text, recipe_query)

            results.append({
                'url': url,
                'title': title,
                'ingredients': ingredients,
            })

        except Exception as e:
            # Best effort: one failing page must not abort the remaining ones
            print(f"Error processing recipe at {url}: {e}")

    return results

In [None]:
def page_matches_recipe_query(self, page_content, recipe_query):
    """Ask the LLM whether a page is likely a recipe for ``recipe_query``.

    Args:
        self: Unused; kept so existing call sites keep working.
        page_content: HTML markup of the page.
        recipe_query: Description of the recipe being looked for.

    Returns:
        True if the upper-cased LLM answer contains "YES"; False otherwise,
        including when the LLM call raises.

    Note:
        Relies on a module-level ``llm`` object exposing ``invoke(prompt)``;
        it is not defined in this function (assumed to be created elsewhere).
    """
    # Extract page title and a snippet
    soup = BeautifulSoup(page_content, 'html.parser')
    # .string is None for an empty <title> or one with nested markup; fall
    # back to the placeholder so the prompt never shows the literal "None".
    title = (soup.title.string if soup.title else None) or "No title"

    # Only the first 1000 characters of page text go into the prompt
    text_content = soup.get_text(separator=' ', strip=True)
    snippet = text_content[:1000] + "..." if len(text_content) > 1000 else text_content

    prompt = f"""
        I'm looking for a recipe about "{recipe_query}".
        
        Page title: {title}
        
        Page content snippet: 
        {snippet}
        
        Based only on this information, answer with exactly "YES" if this page likely contains a recipe for {recipe_query}, 
        or exactly "NO" if it's unlikely or unrelated. Only respond with YES or NO.
        """

    try:
        response = llm.invoke(prompt)
        result = response.content.strip().upper()
        return "YES" in result
    except Exception as e:
        # Treat an LLM failure as "no match" so the crawl can continue
        print(f"Error using LLM to check page: {e}")
        return False

In [None]:
def is_valid_url(self, url):
    """Tell whether ``url`` is on the crawl's domain and still unvisited.

    Reads ``self.domain`` (compared against the URL's network location) and
    ``self.visited_urls`` (membership test on the full URL).
    """
    host = urlparse(url).netloc
    if host != self.domain:
        return False
    return url not in self.visited_urls

def is_likely_recipe_url(self, url):
    """Guess from the URL text alone whether it points to a recipe.

    Returns True when the lower-cased URL contains any of the recipe-related
    keywords as a substring. ``self`` is not used.
    """
    keywords = (
        'recipe', 'recipes', 'dish', 'meal', 'cook',
        'bake', 'food', 'ingredient', 'cuisine',
    )
    lowered = url.lower()

    for keyword in keywords:
        if keyword in lowered:
            return True
    return False

In [None]:
# Demo run: search one site for recipes matching a query.
# `results` is consumed by the printing cell that follows.
test_website = "https://www.allrecipes.com/"
recipe_query = "chocolate cake"

print(f"Looking for {recipe_query} recipes on {test_website}")
results = find_and_extract_recipe(
    website_url=test_website,
    recipe_query=recipe_query,
    max_pages=10,
)

In [None]:
# find_and_extract_recipe returns a plain message string instead of a list
# when the crawl finds nothing; print it rather than iterating its characters.
if isinstance(results, str):
    print(results)
else:
    for i, recipe in enumerate(results, start=1):
        print(f"\nRecipe {i}: {recipe['title']}")
        print(f"URL: {recipe['url']}")
        print("Ingredients:")
        print(recipe['ingredients'])
        print("-" * 50)