In [37]:
%pip install scrapy selenium requests beautifulsoup4 tqdm

Note: you may need to restart the kernel to use updated packages.


In [38]:
import requests
from bs4 import BeautifulSoup
import json
import re
import urllib3
from urllib.parse import urljoin

# Disable insecure request warnings (use with caution).
# Several scrapers below call requests.get(..., verify=False); without this
# line every such call would emit an InsecureRequestWarning.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# --- Constants ---
# HTTP headers sent with every request in this notebook: a desktop-browser
# User-Agent plus an Accept-Language value that lists Persian (fa-IR) first.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "fa-IR,fa;q=0.9,en-US;q=0.8,en;q=0.7"
}

# Translation table mapping Persian digits to ASCII digits.
PERSIAN_DIGITS = str.maketrans('۰۱۲۳۴۵۶۷۸۹', '0123456789')

# Persian number words and the numeric value each one stands for.
# NOTE: the padding spaces in the keys are kept as-is because other cells
# iterate over this dict; convert_persian_numbers() strips them itself.
PERSIAN_NUMBERS = {
    ' صفر ': 0, 'یک': 1, ' دو ': 2, ' سه ': 3, ' چهار ': 4,
    ' پنج ': 5, ' شش ': 6, ' هفت ': 7, ' هشت ': 8, ' نه' : 9,
    ' ده ': 10, ' یازده ': 11, ' دوازده ': 12, ' سیزده ': 13,
    ' چهارده ': 14, ' پانزده ': 15, ' شانزده ': 16, ' هفده ': 17,
    ' هجده ': 18, ' نوزده ': 19, ' بیست ': 20, ' سی ': 30,
    ' چهل ': 40, ' پنجاه ': 50, ' شصت ': 60, ' هفتاد ': 70,
    ' هشتاد ': 80, ' نود ': 90, ' صد ': 100,
    ' یک چهارم ': 0.25, ' نصف ': 0.5, ' سه چهارم ': 0.75
}

# Known measurement units / quantity phrases used when splitting amounts.
UNITS = [
    'قاشق غذاخوری', 'قاشق چایخوری', 'کیلوگرم', 'گرم',
    'پیمانه', 'عدد', 'لیتر', 'فنجان', 'حبه', 'قاشق',
    'تکه', 'به مقدار لازم', 'مقدار لازم'
]

# --- Utility Functions ---


def convert_persian_numbers(text):
    """Convert Persian digits and number words in a string to ASCII numerals.

    Digits are translated character by character. Number words listed in
    PERSIAN_NUMBERS are replaced only when they stand as whole,
    whitespace-delimited tokens, so a word that merely contains a number word
    is left untouched and the spaces around a replaced word are preserved.
    Multi-word entries (the fractions) are tried before their single-word
    parts.

    Args:
        text: The value to convert.

    Returns:
        The converted string, or ``text`` unchanged if it is not a ``str``.
    """
    if not isinstance(text, str):
        return text  # Return non-strings as is

    # First convert digits.
    text = text.translate(PERSIAN_DIGITS)

    # Longest words first, so "سه چهارم" is replaced as a unit before "سه".
    ordered = sorted(
        PERSIAN_NUMBERS.items(),
        key=lambda item: len(item[0].strip()),
        reverse=True,
    )
    for word, value in ordered:
        parts = word.split()
        if not parts:
            continue
        # (?<!\S) / (?!\S): the match must be bounded by whitespace or the
        # string edges, i.e. be a whole token, without consuming the spaces.
        pattern = (
            r'(?<!\S)'
            + r'\s+'.join(re.escape(part) for part in parts)
            + r'(?!\S)'
        )
        text = re.sub(pattern, str(value), text)
    return text


def clean_title(title):
    """Strip a leading list index from a title and normalise its numbers.

    A leading run of digits (ASCII or Persian) together with any following
    hyphens, dashes, dots, tatweel characters or whitespace is dropped, and
    the remainder is passed through convert_persian_numbers().
    """
    leading_index = re.compile(r"^[\d۰۱۲۳۴۵۶۷۸۹]+[-–.ـ\s]*")
    without_index = leading_index.sub("", title.strip())
    return convert_persian_numbers(without_index.strip())

Isfahan Province, Iran

In [39]:


# Blog article listing Isfahan local foods; fetched by scrape_foods() below.
URL = "https://jainjas.com/Blog/414/%D8%BA%D8%B0%D8%A7%D9%87%D8%A7%DB%8C-%D9%85%D8%AD%D9%84%DB%8C-%D8%A7%D8%B5%D9%81%D9%87%D8%A7%D9%86"

# Category configuration.
# Maps a keyword looked for in an <h2> heading to the meal_type / occasion
# labels applied to every food listed under that heading.
CATEGORY_MAPPING = {
    'گوشتی': {
        'meal_type': ['غذای اصلی'],
        'occasion': ['ناهار', 'شام']
    },
    'حلیم': {
        'meal_type': ['پیش غذا'],
        'occasion': ['صبحانه', 'عصرانه']
    },
    'آش': {
        'meal_type': ['پیش غذا'],
        'occasion': ['صبحانه', 'عصرانه']
    },
    'پلو خورش': {
        # Rice-and-stew dishes are main courses; this entry previously carried
        # the dessert label, apparently copied from the 'شیرین' entry below.
        'meal_type': ['غذای اصلی'],
        'occasion': ['ناهار', 'شام']
    },
    'شیرین': {
        'meal_type': ['دسر'],
        'occasion': ['ناهار', 'شام']
    }
}

def detect_category(text):
    """Return the first CATEGORY_MAPPING keyword found in a heading, or None.

    Keywords are tried in the dict's insertion order against the lower-cased
    heading text.
    """
    lowered = text.lower()
    return next(
        (keyword for keyword in CATEGORY_MAPPING if keyword in lowered),
        None,
    )


def parse_amount_unit(amount_text):
    """Split an ingredient amount string into ``(amount, unit)``.

    Args:
        amount_text: Raw text of the amount cell, e.g. "۲ قاشق غذاخوری".

    Returns:
        A 2-tuple ``(amount, unit)``:
        * ``("مقدار لازم", "مقدار لازم")`` when the text contains one of the
          "as needed" phrases;
        * ``(value, unit)`` when a PERSIAN_NUMBERS word is found in the text;
        * ``(float, unit)`` when the text contains a number, where "a/b" is
          read as a fraction and "," as a decimal separator;
        * ``("مقدار لازم", converted_text)`` when no number can be parsed.
    """
    amount_text = convert_persian_numbers(amount_text.strip())

    if any(phrase in amount_text for phrase in ['کافی', 'لازم', 'مقدار']):
        return ("مقدار لازم", "مقدار لازم")

    for word, value in PERSIAN_NUMBERS.items():
        if word in amount_text:
            unit = amount_text.replace(word, '').strip()
            return (value, unit)

    # Collect the first contiguous run of numeric characters.
    numeric_chars = []
    for c in amount_text:
        if c.isdigit() or c in ',./':
            numeric_chars.append(c)
        elif numeric_chars:
            break

    numeric_part = ''.join(numeric_chars)
    # Remove only the run that was just extracted. Replacing every occurrence
    # would also eat the same digits later in the text
    # ("1 بسته 100 گرمی" used to become "بسته 00 گرمی").
    unit_part = amount_text.replace(numeric_part, '', 1).strip()

    if '/' in numeric_part:
        parts = numeric_part.split('/')
        if len(parts) == 2:
            try:
                value = float(parts[0]) / float(parts[1])
                return (value, unit_part)
            except (ValueError, ZeroDivisionError):
                # Not a usable fraction; fall through to the plain-number path.
                pass

    numeric_part = numeric_part.replace(',', '.')

    try:
        return (float(numeric_part), unit_part)
    except ValueError:
        return ("مقدار لازم", amount_text)


def parse_ingredients(table):
    """Turn a two-column ingredient table into a list of ingredient dicts.

    Rows that do not have exactly two <td> cells are ignored. The first cell
    is the ingredient name (non-breaking spaces and zero-width non-joiners
    become regular spaces); the second cell is parsed by parse_amount_unit().
    """
    parsed = []
    for tr in table.find_all('tr'):
        cells = tr.find_all('td')
        if len(cells) == 2:
            name_cell, amount_cell = cells
            raw_name = name_cell.get_text(strip=True)
            name = raw_name.replace('\xa0', ' ').replace('‌', ' ').strip()

            amount, unit = parse_amount_unit(amount_cell.get_text(strip=True))
            parsed.append({"name": name, "amount": amount, "unit": unit})
    return parsed


def scrape_foods():
    """Scrape Isfahan local foods from the article at ``URL``.

    The page is walked heading by heading: an <h2> selects the current
    category via detect_category(); each following <h3> whose id starts with
    "sec-" is a food. For every food the next <table> supplies the
    ingredients, the <p> siblings after that table supply numbered
    instruction steps, and the <img> tags that follow the heading supply the
    images. Foods without ingredients are dropped.

    Returns:
        A list of food dicts, or an empty list if anything fails (the error
        is printed, not raised).
    """
    try:
        # A timeout keeps a stalled server from hanging the cell forever.
        response = requests.get(URL, headers=headers, timeout=30)
        # Fail on 4xx/5xx instead of silently parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        foods = []
        current_config = None

        for element in soup.find_all(['h2', 'h3']):
            # Process category headings
            if element.name == 'h2':
                detected = detect_category(element.get_text(strip=True))
                if detected:
                    current_config = CATEGORY_MAPPING[detected]
                continue

            # Process food items
            if element.name == 'h3' and element.get('id', '').startswith('sec-'):
                # Foods that appear before any recognised category are skipped.
                if not current_config:
                    continue

                title = element.get_text(strip=True)\
                    .replace('\xa0', ' ')\
                    .replace('‌', ' ')\
                    .strip()

                if not title or title == "انواع آش در اصفهان":
                    continue

                # Get related elements
                ingredients_table = element.find_next('table')
                instructions = []
                images = []

                # Process instructions: siblings after the table, up to the
                # next heading.
                next_element = ingredients_table.find_next_sibling() if ingredients_table else None
                while next_element and next_element.name not in ['h2', 'h3']:
                    if next_element.name == 'p':
                        instruction = next_element.get_text(strip=True)
                        if instruction:
                            # Extract steps using regex (supports "1.", "2-", "3)", etc.)
                            steps = re.findall(
                                r'\d+[\.\-)]\s*(.*?)(?=\s*\d+[\.\-)]|$)',
                                instruction,
                                flags=re.DOTALL
                            )
                            for step in steps:
                                cleaned_step = step.strip()
                                if cleaned_step:
                                    instructions.append(cleaned_step)
                    next_element = next_element.find_next_sibling()

                # Process images: every <img> whose nearest preceding <h3> is
                # this heading.
                img = element.find_next('img')
                while img and img.find_previous('h3') == element:
                    if 'src' in img.attrs:
                        images.append(img['src'])
                    img = img.find_next('img')

                food = {
                    "title": title,
                    "location": {
                        "province": "اصفهان",
                        "city": "اصفهان",
                        "coordinates": {
                            "latitude": 32.6539,
                            "longitude": 51.6660
                        }
                    },
                    "ingredients": parse_ingredients(ingredients_table) if ingredients_table else [],
                    "instructions": instructions,
                    "meal_type": current_config['meal_type'],
                    "occasion": current_config['occasion'],
                    "images": {
                        # First image is the finished dish; the rest are
                        # numbered step images.
                        "تصویر نهایی": images[0] if images else "",
                        **{f"{i} مرحله": url for i, url in enumerate(images[1:], start=1)}
                    }
                }

                if food["ingredients"]:
                    foods.append(food)

        return foods

    except Exception as e:
        print(f"Scraping failed: {str(e)}")
        return []


# Run the Isfahan scraper and persist its result as UTF-8 JSON.
if __name__ == "__main__":
    foods_data = scrape_foods()

    # ensure_ascii=False keeps the Persian text readable in the output file.
    serialized = json.dumps(foods_data, ensure_ascii=False, indent=2)
    with open("isfahan_foods.json", "w", encoding="utf-8") as f:
        f.write(serialized)

    print(f"Successfully saved {len(foods_data)} food entries")

Successfully saved 28 food entries


Fars Province (Shiraz), Iran

In [40]:

def scrape_recipe_urls():
    """Collect recipe links whose text contains 'طرز تهیه' from the blog page.

    Returns:
        A list of absolute URLs, de-duplicated with first-seen order
        preserved, or an empty list if the request or parsing fails (the
        error is printed, not raised).
    """
    try:
        url = "https://blog.okcs.com/shiraz-foods-desserts/"
        # A timeout keeps a stalled server from hanging the cell forever.
        response = requests.get(url, headers=headers, verify=False, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        recipe_urls = []

        # Find all links containing 'طرز تهیه' in their text.
        # `string=` is the current name of the deprecated `text=` argument.
        for a in soup.find_all('a', string=lambda s: s and 'طرز تهیه' in s):
            href = a.get('href')
            if href:
                # urljoin leaves absolute URLs untouched and resolves relative
                # ones against the page they were found on.
                recipe_urls.append(urljoin(url, href))

        # Remove duplicates while preserving order (dicts keep insertion order).
        return list(dict.fromkeys(recipe_urls))

    except Exception as e:
        print(f"Error scraping recipe URLs: {str(e)}")
        return []




def parse_amount_unit(amount_text):
    """Split an ingredient amount string into ``(amount, unit)``.

    Args:
        amount_text: Raw amount text, e.g. "۲ قاشق غذاخوری".

    Returns:
        A 2-tuple ``(amount, unit)``:
        * ``("مقدار لازم", "مقدار لازم")`` when the text contains one of the
          "as needed" phrases;
        * ``(value, unit)`` when a PERSIAN_NUMBERS word is found in the text;
        * ``(float, unit)`` when the text contains a number, where "a/b" is
          read as a fraction and "," as a decimal separator;
        * ``("مقدار لازم", converted_text)`` when no number can be parsed.
    """
    amount_text = convert_persian_numbers(amount_text.strip())

    if any(phrase in amount_text for phrase in ['کافی', 'لازم', 'مقدار']):
        return ("مقدار لازم", "مقدار لازم")

    for word, value in PERSIAN_NUMBERS.items():
        if word in amount_text:
            unit = amount_text.replace(word, '').strip()
            return (value, unit)

    # Collect the first contiguous run of numeric characters.
    numeric_chars = []
    for c in amount_text:
        if c.isdigit() or c in ',./':
            numeric_chars.append(c)
        elif numeric_chars:
            break
    numeric_part = ''.join(numeric_chars)
    # Remove only the run that was just extracted; replacing every occurrence
    # would also strip the same digits from the rest of the text.
    unit_part = amount_text.replace(numeric_part, '', 1).strip()

    if '/' in numeric_part:
        parts = numeric_part.split('/')
        if len(parts) == 2:
            try:
                value = float(parts[0]) / float(parts[1])
                return (value, unit_part)
            except (ValueError, ZeroDivisionError):
                # Not a usable fraction; fall through to the plain-number path.
                pass

    numeric_part = numeric_part.replace(',', '.')
    try:
        return (float(numeric_part), unit_part)
    except ValueError:
        return ("مقدار لازم", amount_text)


def parse_ingredients_from_table(table):
    """Read ingredient dicts from a two-column table.

    A first row containing "مواد اولیه" is treated as a header and skipped.
    Rows without exactly two <td> cells are ignored.
    """
    all_rows = table.find_all('tr')
    has_header = bool(all_rows) and "مواد اولیه" in all_rows[0].get_text()
    body_rows = all_rows[1:] if has_header else all_rows

    collected = []
    for tr in body_rows:
        cells = tr.find_all('td')
        if len(cells) == 2:
            name_cell, amount_cell = cells
            # Normalise non-breaking spaces and zero-width non-joiners.
            raw_name = name_cell.get_text(strip=True)
            name = raw_name.replace('\xa0', ' ').replace('‌', ' ').strip()
            amount, unit = parse_amount_unit(amount_cell.get_text(strip=True))
            collected.append({"name": name, "amount": amount, "unit": unit})
    return collected


def parse_ingredients_from_ul(ul):
    """Read ingredient dicts from "name: amount" list items.

    Items without a colon are skipped; the text is split at the first colon.
    """
    collected = []
    for item in ul.find_all("li"):
        line = item.get_text(strip=True)
        name_text, colon, amount_text = line.partition(":")
        if not colon:
            continue
        amount, unit = parse_amount_unit(amount_text.strip())
        collected.append({
            "name": name_text.strip(),
            "amount": amount,
            "unit": unit,
        })
    return collected


def scrape_recipe_page(url):
    """Scrape title, ingredients, instruction headings and image of a recipe.

    Args:
        url: Absolute URL of the recipe page.

    Returns:
        A dict with keys "title", "ingredients", "instructions" and "image".
        On any failure the error is printed and a dict with placeholder /
        empty values is returned instead of raising.
    """
    try:
        # A timeout keeps a stalled server from hanging the whole run.
        response = requests.get(url, headers=headers, verify=False, timeout=30)
        # Fail on 4xx/5xx instead of parsing an error page as a recipe.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # The first <strong> on the page is taken as the food title.
        title_tag = soup.find("strong")
        food_title = title_tag.get_text(strip=True) if title_tag else "نامشخص"

        # Find image at top of post
        image_url = ""
        featured_img = soup.find("img", class_="attachment-post-thumbnail")
        if featured_img and featured_img.has_attr("src"):
            image_url = featured_img["src"]

        # Prefer a table mentioning "مواد اولیه"; otherwise fall back to the
        # first <ul> whose items look like "name: amount".
        ingredients = []
        table_found = None
        for t in soup.find_all("table"):
            if "مواد اولیه" in t.get_text():
                table_found = t
                break
        if table_found:
            ingredients = parse_ingredients_from_table(table_found)
        else:
            for ul in soup.find_all("ul"):
                if any(':' in li.get_text() for li in ul.find_all("li")):
                    ingredients = parse_ingredients_from_ul(ul)
                    if ingredients:
                        break

        # Instruction steps are the <h3><span> headings starting with "مرحله".
        instructions = []
        for h3 in soup.find_all("h3"):
            span = h3.find("span")
            if span:
                text = span.get_text(strip=True)
                if text.startswith("مرحله"):
                    instructions.append(text)

        return {
            "title": food_title,
            "ingredients": ingredients,
            "instructions": instructions,
            "image": image_url
        }
    except Exception as e:
        print(f"Error scraping recipe page {url}: {str(e)}")
        return {"title": "نامشخص", "ingredients": [], "instructions": [], "image": ""}


def scrape_shiraz_foods(recipe_urls):
    """Scrape every recipe URL and wrap each result in the shared food schema.

    Location, meal_type and occasion are fixed values for Shiraz; title,
    ingredients, instructions and image come from scrape_recipe_page().
    """
    def _to_food_item(recipe_data):
        # Build a fresh record (no shared mutable sub-objects between foods).
        return {
            "title": recipe_data.get("title", "نامشخص"),
            "location": {
                "province": "فارس",
                "city": "شیراز",
                "coordinates": {
                    "latitude": 29.5926,
                    "longitude": 52.5836
                }
            },
            "ingredients": recipe_data.get("ingredients", []),
            "instructions": recipe_data.get("instructions", []),
            "meal_type": ["اصلی", "دسر", "پیش غذا"],
            "occasion": ["شام", "ناهار", "صبحانه"],
            "images": {
                "تصویر نهایی": recipe_data.get("image", "")
            }
        }

    return [_to_food_item(scrape_recipe_page(page_url)) for page_url in recipe_urls]


# Run the Shiraz scraper and persist its result as UTF-8 JSON.
if __name__ == "__main__":
    recipe_urls = scrape_recipe_urls()
    if not recipe_urls:
        # Only report here. Calling exit() inside a notebook cell would
        # terminate the kernel and take all later cells with it.
        print("No recipe URLs found. Exiting.")
    else:
        print(f"Found {len(recipe_urls)} recipe URLs to scrape")

        foods_data = scrape_shiraz_foods(recipe_urls)
        with open("shiraz_foods.json", "w", encoding="utf-8") as f:
            json.dump(foods_data, f, ensure_ascii=False, indent=2)
        print(f"Successfully saved {len(foods_data)} food entries.")

  for a in soup.find_all('a', text=lambda text: text and 'طرز تهیه' in text):


Found 32 recipe URLs to scrape
Successfully saved 32 food entries.


Hormozgan Province, Iran

In [41]:

# Listing page for Hormozgan foods: fetched by scrape_hormozgan_foods() and
# used there as the base for resolving recipe links.
# NOTE: a later cell rebinds MAIN_PAGE_URL to a different site.
MAIN_PAGE_URL = "https://blog.okala.com/hormozgan-cuisine/"

def parse_amount_unit(amount_text):
    """Split an ingredient amount string into ``(amount, unit)``.

    Args:
        amount_text: Raw amount text, e.g. "۲ قاشق غذاخوری".

    Returns:
        A 2-tuple ``(amount, unit)``:
        * ``("مقدار لازم", "مقدار لازم")`` when the text contains one of the
          "as needed" phrases;
        * ``(value, unit)`` when a PERSIAN_NUMBERS word is found in the text;
        * ``(float, unit)`` when the text starts with a number, where "a/b"
          is read as a fraction and "," as a decimal separator;
        * ``("مقدار لازم", converted_text)`` otherwise.
    """
    amount_text = convert_persian_numbers(amount_text.strip())

    if any(word in amount_text for word in ["کافی", "لازم", "مقدار"]):
        return ("مقدار لازم", "مقدار لازم")

    for word, value in PERSIAN_NUMBERS.items():
        if word in amount_text:
            unit = amount_text.replace(word, '').strip()
            return (value, unit)

    # "," is part of the number class so that "1,5" reaches the
    # comma-to-dot conversion below instead of being cut off after "1".
    match = re.match(r'([\d.,/]+)\s*(.*)', amount_text)
    if match:
        number, unit = match.groups()
        if '/' in number:
            parts = number.split('/')
            if len(parts) == 2:
                try:
                    value = float(parts[0]) / float(parts[1])
                    return (value, unit)
                except (ValueError, ZeroDivisionError):
                    # Not a usable fraction; try it as a plain number.
                    pass
        number = number.replace(',', '.')
        try:
            return (float(number), unit)
        except ValueError:
            pass

    return ("مقدار لازم", amount_text)


def parse_ingredients_from_table(table):
    """Read ingredient dicts from the two-cell rows of a table.

    Rows that do not have exactly two <td> cells are ignored.
    """
    collected = []
    cell_rows = (tr.find_all('td') for tr in table.find_all('tr'))
    for cells in cell_rows:
        if len(cells) != 2:
            continue
        name_cell, amount_cell = cells
        name = name_cell.get_text(strip=True)
        amount, unit = parse_amount_unit(amount_cell.get_text(strip=True))
        collected.append({"name": name, "amount": amount, "unit": unit})
    return collected


def scrape_recipe_page(url):
    """Scrape ingredients and instruction headings from a recipe page.

    Args:
        url: Absolute URL of the recipe page.

    Returns:
        A dict with keys "ingredients" and "instructions". On any failure the
        error is printed and both lists are returned empty.
    """
    try:
        # A timeout keeps a stalled server from hanging the whole run.
        response = requests.get(url, headers=headers, verify=False, timeout=30)
        # Fail on 4xx/5xx instead of parsing an error page as a recipe.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        # The ingredient table is the first one mentioning both "مواد" and "لازم".
        ingredients = []
        for table in soup.find_all("table"):
            table_text = table.get_text()
            if "مواد" in table_text and "لازم" in table_text:
                ingredients = parse_ingredients_from_table(table)
                break

        # Instruction steps are the <h3> headings starting with "مرحله".
        instructions = []
        for h3 in soup.find_all("h3"):
            text = h3.get_text(strip=True)
            if text.startswith("مرحله"):
                instructions.append(text)

        return {
            "ingredients": ingredients,
            "instructions": instructions
        }
    except Exception as e:
        print(f"Error while scraping recipe page {url}: {e}")
        return {
            "ingredients": [],
            "instructions": []
        }


def scrape_hormozgan_foods():
    """Scrape Hormozgan local foods from MAIN_PAGE_URL.

    Every <h2> that contains a <span> is treated as a food title. The next
    link after the heading is followed (only if it stays on blog.okala.com)
    to fetch ingredients and instructions. Images come from a hard-coded
    list, assigned in order to the foods that are actually saved.

    Returns:
        A list of food dicts. If the main page fails, the error is printed
        and whatever was collected so far is returned.
    """
    foods = []
    # Images are matched to foods purely by position in this list.
    image_urls = [
        "https://blog.okala.com/wp-content/uploads/2023/12/pudini-kooseh-1-1.jpg",
        "https://blog.okala.com/wp-content/uploads/2023/12/muflek-hormozgan-1-1.jpg",
        "https://blog.okala.com/wp-content/uploads/2023/12/mahyave-ba-mahi-khoshk-1-1.jpg",
        "https://blog.okala.com/wp-content/uploads/2023/12/koofte-mahi-moomgh-3-1.jpg",
        "https://blog.okala.com/wp-content/uploads/2023/12/katoogh-mahi-1-1.jpg",
        "https://blog.okala.com/wp-content/uploads/2023/12/mahi-gariz-kababi-1.jpg",
        "https://blog.okala.com/wp-content/uploads/2024/02/dopiyaze-meygoo.jpg"
    ]

    try:
        # A timeout keeps a stalled server from hanging the cell forever.
        response = requests.get(
            MAIN_PAGE_URL, headers=headers, verify=False, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        h2_tags = soup.find_all("h2")
        food_index = 0

        for h2 in h2_tags:
            span = h2.find("span")
            if not span:
                continue

            title_raw = h2.get_text(strip=True)
            title_clean = clean_title(title_raw)

            # Fall back to an empty image once the hard-coded list runs out;
            # indexing past its end used to raise IndexError and abort the
            # loop for every remaining food.
            image_url = image_urls[food_index] if food_index < len(image_urls) else ""

            a_tag = h2.find_next("a", href=True)
            if not a_tag:
                continue

            relative_url = a_tag["href"]
            recipe_url = urljoin(MAIN_PAGE_URL, relative_url)

            # Ignore links that lead off the blog.
            if not recipe_url.startswith("https://blog.okala.com"):
                continue

            recipe_data = scrape_recipe_page(recipe_url)

            food_item = {
                "title": title_clean,
                "location": {
                    "province": "هرمزگان",
                    "city": "بندرعباس",
                    "coordinates": {
                        "latitude": 27.1963,
                        "longitude": 56.2884
                    }
                },
                "ingredients": recipe_data.get("ingredients", []),
                "instructions": recipe_data.get("instructions", []),
                "meal_type": ["اصلی", "دسر", "پیش غذا"],
                "occasion": ["شام", "ناهار", "صبحانه"],
                "images": {
                    "تصویر نهایی": image_url
                }
            }
            foods.append(food_item)
            # Advance only for saved foods so skipped headings do not consume
            # an image.
            food_index += 1

    except Exception as e:
        print(f"Error while scraping main page: {e}")
    return foods


# Run the Hormozgan scraper and persist its result as UTF-8 JSON.
if __name__ == "__main__":
    foods_data = scrape_hormozgan_foods()
    # ensure_ascii=False keeps the Persian text readable in the output file.
    serialized = json.dumps(foods_data, ensure_ascii=False, indent=2)
    with open("hormozgan_foods.json", "w", encoding="utf-8") as f:
        f.write(serialized)
    print(f"{len(foods_data)} Hormozgan local foods successfully saved.")

6 Hormozgan local foods successfully saved.


Chaharmahal and Bakhtiari Province, Iran


In [42]:


# Article fetched by this cell's scrape_foods().
# NOTE: this rebinds the MAIN_PAGE_URL defined by the Hormozgan cell above.
MAIN_PAGE_URL = "https://www.bartarinha.ir/بخش-آشپزی-15/1152168-محبوب-ترین-غذا-های-محلی-ایران"

def parse_ingredient(li_text):
    """Parse one "name: amount" list item into an ingredient dict.

    Args:
        li_text: Text of the <li>, e.g. "برنج: ۲ پیمانه".

    Returns:
        A dict with keys "name", "amount" and "unit". "amount" is an int or
        float when the amount starts with a parseable number; otherwise it is
        the raw amount text ("مقدار لازم" when the item has no ": ") and
        "unit" is empty.
    """
    li_text = convert_persian_numbers(li_text.strip())
    if ': ' in li_text:
        name, amount = li_text.split(': ', 1)
    else:
        name, amount = li_text, "مقدار لازم"

    # Convert numeric values
    numeric = None
    unit = ""
    amount_match = re.match(r"([\d.]+)(.*)", amount.strip())
    if amount_match:
        number_text = amount_match.group(1)
        try:
            numeric = float(number_text) if '.' in number_text else int(number_text)
        except ValueError:
            # The character class also matches things like "." or "1.2.3".
            # Treat those as non-numeric instead of letting the exception
            # abort the whole scrape.
            numeric = None
        else:
            unit = amount_match.group(2).strip()

    return {
        "name": name.strip(),
        "amount": numeric if numeric is not None else amount,
        "unit": unit
    }


def _new_chaharmahal_food(title_text):
    """Build an empty food record for Chaharmahal and Bakhtiari."""
    return {
        "title": clean_title(title_text),
        "location": {
            "province": "چهارمحال و بختیاری",
            "city": "شهرکرد",
            "coordinates": {
                "latitude": 32.3265,
                "longitude": 50.8644
            }
        },
        "ingredients": [],
        "instructions": [],
        "meal_type": ["اصلی", "دسر", "پیش غذا"],
        "occasion": ["شام", "ناهار", "صبحانه"],
        "images": {"تصویر نهایی": ""}
    }


def scrape_foods():
    """Scrape Chaharmahal and Bakhtiari foods from MAIN_PAGE_URL.

    The page is walked element by element as a small state machine: a
    <strong> starts a food; later <strong> tags mark its ingredient list
    ("مواد"), the start of its recipe ("طرز" / "نکات") or, if they match
    neither "مواد" nor "طرز", the title of the next food.

    Every food is created from one template. Previously the first food was
    built from a different copy of the record (occasions stored under
    meal_type, different coordinates) than all later ones.

    Returns:
        A list of food dicts. On failure the error is printed and whatever
        was collected so far is returned.
    """
    foods = []
    current_food = None
    in_recipe = False

    try:
        # A timeout keeps a stalled server from hanging the cell forever.
        response = requests.get(
            MAIN_PAGE_URL, headers=headers, verify=False, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        for element in soup.find_all(True):
            # Start new food entry
            if element.name == 'strong' and not current_food:
                current_food = _new_chaharmahal_food(element.get_text())
                continue

            if current_food:
                # Get image (only the first one after the title is kept)
                if element.name == 'img' and not current_food['images']['تصویر نهایی']:
                    current_food['images']['تصویر نهایی'] = element.get(
                        'src', '')

                # Get ingredients
                elif element.name == 'strong' and 'مواد' in element.get_text():
                    ul = element.find_next('ul')
                    if ul:
                        current_food['ingredients'] = [parse_ingredient(
                            li.get_text()) for li in ul.find_all('li')]

                # Start recipe collection
                elif element.name == 'strong' and ('طرز' in element.get_text() or 'نکات' in element.get_text()):
                    in_recipe = True
                    # SPECIAL CASE: If it's "نکات مهم", stop immediately
                    if 'نکات مهم' in element.get_text():
                        in_recipe = False
                        continue

                # Collect recipe steps
                elif in_recipe and element.name == 'p':
                    current_food['instructions'].append(
                        element.get_text(strip=True))

                # End recipe collection on new food title
                elif element.name == 'strong' and not ('مواد' in element.get_text() or 'طرز' in element.get_text()):
                    foods.append(current_food)
                    current_food = _new_chaharmahal_food(element.get_text())
                    in_recipe = False

        # Add the last food item
        if current_food:
            foods.append(current_food)

    except Exception as e:
        print(f"Error: {e}")

    return foods


# Run the Chaharmahal scraper and persist its result as UTF-8 JSON.
if __name__ == "__main__":
    foods_data = scrape_foods()
    # ensure_ascii=False keeps the Persian text readable in the output file.
    serialized = json.dumps(foods_data, ensure_ascii=False, indent=2)
    with open("chaharmahal_foods.json", "w", encoding="utf-8") as f:
        f.write(serialized)
    print(f"Successfully saved {len(foods_data)} food items.")

Successfully saved 3 food items.


Khuzestan Province, Iran

In [43]:

def parse_amount_unit(text):
    """Split a one-line ingredient such as "2 قاشق غذاخوری روغن" into parts.

    NOTE: unlike the same-named helpers in earlier cells, this version takes
    the whole ingredient line and returns a 3-tuple.

    Args:
        text: Full text of the ingredient list item.

    Returns:
        A 3-tuple ``(amount, unit, name)``:
        * ``("مقدار لازم", phrase, name)`` when the text contains an
          "as needed" phrase (``phrase`` is the variant that was found);
        * ``(value, unit, name)`` when a PERSIAN_NUMBERS word or a leading
          number is found; ``unit`` is the longest matching entry of UNITS,
          else the first word after the number;
        * ``("مقدار لازم", "مقدار لازم", original_text)`` when nothing can be
          parsed.
    """
    original_text = text
    text = convert_persian_numbers(text.strip())

    # Handle "مقدار لازم" cases
    if any(phrase in text for phrase in ['به مقدار لازم', 'مقدار لازم']):
        # The "به " prefix is optional as a whole. The previous pattern
        # ("به? مقدار لازم") only made the single letter optional and so never
        # removed the bare "مقدار لازم" form.
        name = re.sub(r'(?:به\s+)?مقدار لازم', '', text,
                      flags=re.IGNORECASE).strip()
        unit = 'به مقدار لازم' if 'به مقدار لازم' in text else 'مقدار لازم'
        return ("مقدار لازم", unit, name)

    # Check for Persian number words with word boundaries (longest first)
    for word, value in sorted(PERSIAN_NUMBERS.items(), key=lambda x: -len(x[0])):
        if re.search(rf'\b{word}\b', text):
            parts = re.split(rf'\b{word}\b', text, 1)
            remaining = parts[-1].strip()
            for unit in sorted(UNITS, key=lambda x: -len(x)):
                if remaining.startswith(unit):
                    return (value, unit, remaining[len(unit):].strip())
            return (value, remaining, '')

    # Extract numeric values with proper decimal handling
    num_match = re.match(r'^(\d+/\d+|\d+[\.,]?\d*)\s*', text)
    if num_match:
        num_str = num_match.group(1).replace(',', '.')
        remaining = text[len(num_str):].strip()

        try:
            if '/' in num_str:
                numerator, denominator = map(float, num_str.split('/'))
                amount = numerator / denominator
            else:
                amount = float(num_str)
        except (ValueError, ZeroDivisionError):
            # e.g. "1/0": report the line as unparsed rather than crashing.
            return ("مقدار لازم", "مقدار لازم", original_text)

        # Find longest matching unit
        for unit in sorted(UNITS, key=lambda x: -len(x)):
            if remaining.startswith(unit):
                return (amount, unit, remaining[len(unit):].strip())

        # If no unit found, take first word as unit
        if remaining:
            unit_match = re.match(r'^(\S+)', remaining)
            if unit_match:
                unit = unit_match.group(1)
                return (amount, unit, remaining[len(unit):].strip())

        return (amount, '', remaining)

    return ("مقدار لازم", "مقدار لازم", original_text)


def scrape_khuzestan_foods():
    """Scrape Khuzestan local foods from the kojaro.com article.

    Headings, images, lists and paragraphs are walked in document order.
    Every <h3> after the first starts a new food; the walk stops at the <h2>
    containing 'طعم غذاهای محلی خوزستان'. Within a food, the <h4> containing
    'مواد لازم' introduces the ingredient list and the <h4> containing
    'طرز تهیه' starts instruction capture.

    Returns:
        A list of food dicts, or an empty list if anything fails (the error
        is printed, not raised).
    """
    BASE_URL = "https://www.kojaro.com/food/117017-khouzestan-traditional-food--part-1/"
    try:
        # A timeout keeps a stalled server from hanging the cell forever.
        response = requests.get(
            BASE_URL, headers=headers, verify=False, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        foods = []
        current_food = None
        capture_instructions = False
        skip_first = True

        elements = soup.find_all(['h2', 'h3', 'h4', 'img', 'ul', 'p'])

        for element in elements:
            if element.name == 'h2' and 'طعم غذاهای محلی خوزستان' in element.get_text():
                break

            if element.name == 'h3':
                # The first <h3> on the page is skipped.
                if skip_first:
                    skip_first = False
                    continue

                if current_food:
                    foods.append(current_food)

                # A new food must not inherit the previous food's capture
                # state; otherwise its intro paragraphs are recorded as
                # instructions when it has no 'طرز تهیه' heading of its own.
                capture_instructions = False

                current_food = {
                    "title": element.get_text(strip=True),
                    "location": {
                        # Fixed spelling: was "خورستان".
                        "province": "خوزستان",
                        "city": "اهواز",
                        "coordinates": {
                            "latitude": 31.318327,
                            "longitude": 48.670620
                        }
                    },
                    "ingredients": [],
                    "instructions": [],
                    "meal_type": ["اصلی", "دسر", "پیش غذا"],
                    "occasion": ["شام", "ناهار", "صبحانه"],
                    "images": {"تصویر نهایی": ""}
                }

            elif current_food:
                if element.name == 'img' and not current_food['images']["تصویر نهایی"]:
                    src = element.get('src', '')
                    # Skip images without a src: urljoin(BASE_URL, '') returns
                    # the page URL, which used to be stored as the image and
                    # blocked every later, real image.
                    if src:
                        current_food['images']["تصویر نهایی"] = urljoin(BASE_URL, src)

                if element.name == 'h4' and 'مواد لازم' in element.get_text():
                    ul = element.find_next('ul')
                    if ul:
                        for li in ul.find_all('li'):
                            text = li.get_text(strip=True)
                            amount, unit, name = parse_amount_unit(text)

                            # Fallback for unparsed names
                            if not name and unit not in ['مقدار لازم', 'به مقدار لازم']:
                                name = text

                            current_food['ingredients'].append({
                                "name": name.strip(),
                                "amount": amount,
                                "unit": unit.strip()
                            })

                if element.name == 'h4' and 'طرز تهیه' in element.get_text():
                    capture_instructions = True
                    current_food['instructions'] = []
                elif capture_instructions:
                    if element.name == 'p':
                        paragraph = element.get_text(strip=True)
                        # Split sentences while preserving numbering: a chunk
                        # starting with "<n>." opens a new step, anything else
                        # is appended to the current one.
                        sentences = []
                        buffer = []
                        for part in re.split(r'(?<=[\.\d])\s+', paragraph):
                            if re.match(r'^\d+\.', part):
                                if buffer:
                                    sentences.append(' '.join(buffer))
                                buffer = [part]
                            else:
                                buffer.append(part)
                        if buffer:
                            sentences.append(' '.join(buffer))
                        current_food['instructions'].extend(sentences)
                    elif element.name in ['h3', 'h4']:
                        capture_instructions = False

        if current_food:
            foods.append(current_food)

        # Post-process ingredients
        for food in foods:
            for ing in food['ingredients']:
                # Cleanup for "مقدار لازم" cases
                if ing['unit'] in ['مقدار لازم', 'به مقدار لازم']:
                    ing['name'] = ing['name'].replace(
                        'به مقدار لازم', '').strip()
                    ing['name'] = ing['name'].replace('مقدار لازم', '').strip()

                # Remove any remaining numbers from name
                ing['name'] = re.sub(r'^\d+\s*', '', ing['name']).strip()

        return foods

    except Exception as e:
        print(f"Error: {str(e)}")
        return []


# Run the Khuzestan scraper and persist its result as UTF-8 JSON.
if __name__ == "__main__":
    khuzestan_foods = scrape_khuzestan_foods()

    # ensure_ascii=False keeps the Persian text readable in the output file.
    serialized = json.dumps(khuzestan_foods, ensure_ascii=False, indent=2)
    with open("khuzestan_foods.json", "w", encoding="utf-8") as f:
        f.write(serialized)

    print(f"Saved {len(khuzestan_foods)} Khuzestan food entries.")

Saved 79 Khuzestan food entries.


Bushehr Province, Iran

In [44]:


# Cookpad search-results page for "بوشهر": fetched by scrape_bushehr_cookpad()
# and used there as the base for resolving recipe links.
COOKPAD_BUSHEHR_URL = "https://cookpad.com/ir/جستجو/بوشهر"

def parse_ingredient(li):
    """Parse one ingredient list item into a name/amount/unit record.

    The quantity text is read from the first ``<bdi class="font-semibold">``
    descendant and the ingredient name from the first ``<span>``.  The
    quantity is passed through ``convert_persian_numbers`` and then split into
    a leading number and the text that follows it.

    Args:
        li: Element exposing BeautifulSoup's ``find`` API.

    Returns:
        dict with the keys:
            ``name``   - ingredient name, ``""`` when no ``<span>`` is found.
            ``amount`` - ``int`` (no decimal point) or ``float`` when the
                         quantity starts with a valid number; otherwise the
                         converted quantity text unchanged.  A missing
                         quantity tag is treated as the text "مقدار لازم".
            ``unit``   - text after the number, ``""`` when no number was
                         parsed.
    """
    amount_tag = li.find('bdi', class_='font-semibold')
    name_tag = li.find('span')

    amount = amount_tag.get_text(strip=True) if amount_tag else "مقدار لازم"
    name = name_tag.get_text(strip=True) if name_tag else ""

    # Normalise digits / number words before looking for a leading number.
    amount = convert_persian_numbers(amount)

    # Default: no usable number, so keep the whole text and leave unit empty.
    numeric = amount
    unit = ""

    amount_match = re.match(r"([\d.]+)(.*)", amount)
    if amount_match:
        number_text = amount_match.group(1)
        try:
            numeric = float(number_text) if '.' in number_text else int(number_text)
        except ValueError:
            # The pattern also accepts tokens such as "." or "1.2.3" that are
            # not numbers.  Keep the raw text instead of raising, because an
            # exception here would abort the caller's entire scrape.
            numeric = amount
        else:
            unit = amount_match.group(2).strip()

    return {
        "name": name,
        "amount": numeric,
        "unit": unit
    }


def scrape_bushehr_cookpad(max_links=30, timeout=30):
    """Scrape Bushehr recipes linked from the Cookpad listing page.

    Fetches ``COOKPAD_BUSHEHR_URL``, takes the first ``max_links`` recipe
    links found on it, and builds one record per distinct cleaned title.
    Links whose title was already seen are skipped but still count towards
    ``max_links``.  A recipe page that cannot be downloaded or parsed is
    reported and skipped so that the remaining links are still processed.

    Args:
        max_links: Maximum number of listing links to consider; ``None``
            means no limit.
        timeout: Seconds passed to every ``requests.get`` call.  Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        list[dict]: One record per scraped recipe with the keys ``title``,
        ``location``, ``ingredients``, ``instructions``, ``meal_type``,
        ``occasion`` and ``images``.  Empty when the listing page itself
        cannot be fetched.
    """
    foods = []
    seen_titles = set()  # Cleaned titles already handled, to avoid duplicates

    # NOTE(review): verify=False disables TLS certificate checks for these
    # requests; kept as in the rest of the notebook, confirm it is still needed.
    try:
        main_response = requests.get(
            COOKPAD_BUSHEHR_URL, headers=headers, verify=False, timeout=timeout)
        # Do not parse an HTTP error page as if it were the listing.
        main_response.raise_for_status()
        main_soup = BeautifulSoup(main_response.content, "html.parser")
    except Exception as e:
        print(f"Error: {e}")
        return foods

    # Recipe links: anchors with this class whose href starts with "/ir/".
    food_links = main_soup.select('a.block-link__main[href^="/ir/"]')

    for link in food_links[:max_links]:
        # The listing's link text doubles as the recipe title.
        title = clean_title(link.get_text(strip=True))

        if title in seen_titles:
            print(f"Skipping duplicate title: {title}")
            continue
        seen_titles.add(title)

        food_url = urljoin(COOKPAD_BUSHEHR_URL, link['href'])

        try:
            food_response = requests.get(
                food_url, headers=headers, verify=False, timeout=timeout)
            food_response.raise_for_status()
            food_soup = BeautifulSoup(food_response.content, "html.parser")

            food_data = {
                "title": title,  # Use pre-cleaned title
                "location": {
                    "province": "بوشهر",
                    "city": "بوشهر",
                    "coordinates": {
                        "latitude": 28.9145,
                        "longitude": 50.8279
                    }
                },
                "ingredients": [],
                "instructions": [],
                # Every record gets the same full option lists; presumably
                # they are narrowed down in a later annotation step - confirm.
                "meal_type": ["اصلی", "دسر","پیش غذا"],
                "occasion": ["شام","ناهار","صبحانه"],
                "images": {"تصویر نهایی": ""}
            }

            # Main image: the <img> marked fetchpriority="high".
            main_img = food_soup.find('img', {'fetchpriority': 'high'})
            if main_img:
                food_data["images"]["تصویر نهایی"] = main_img.get('src', '')

            # Ingredients: every <li> inside the ingredient-list container.
            ingredient_list = food_soup.find('div', class_='ingredient-list')
            if ingredient_list:
                food_data["ingredients"] = [parse_ingredient(li)
                                            for li in ingredient_list.find_all('li')]

            # Steps: text of the dir="auto" div inside each <li class="step">.
            steps_section = food_soup.find('ol', class_='list-none')
            if steps_section:
                for step in steps_section.find_all('li', class_='step'):
                    text_div = step.find('div', dir='auto')
                    if text_div:
                        food_data["instructions"].append(
                            text_div.get_text(strip=True))
        except Exception as e:
            # Best effort: report this recipe and carry on with the next link
            # instead of discarding everything that is still to be scraped.
            print(f"Error scraping {food_url}: {e}")
            continue

        foods.append(food_data)

    return foods


if __name__ == "__main__":
    # Scrape the Bushehr recipes and store them as indented UTF-8 JSON
    # (ensure_ascii=False writes non-ASCII characters unescaped).
    bushehr_foods = scrape_bushehr_cookpad()

    with open("bushehr_foods.json", mode="w", encoding="utf-8") as f:
        f.write(json.dumps(bushehr_foods, ensure_ascii=False, indent=2))

    print(f"Successfully saved {len(bushehr_foods)} Bushehr foods.")

Skipping duplicate title: قلیه میگو بوشهری
Skipping duplicate title: خورش ماهی بوشهری
Successfully saved 28 Bushehr foods.


Kohgiluyeh and Boyer-Ahmad Province, Iran


In [45]:



def scrape_blogfa_foods(url="https://aadabkb79.blogfa.com/post/5", timeout=30):
    """Scrape recipe entries from a Blogfa post.

    Every ``<h1>`` after the first one is treated as a recipe title.  The
    instructions are collected from the ``<p>`` elements of the first
    ``<div>`` that follows the heading: paragraphs containing an image are
    ignored and the remaining text is split on ``.`` into sentences.
    ``ingredients`` and the image URL are left empty, and every record gets
    the same fixed location.

    Args:
        url: Post to scrape; defaults to the post used by this notebook.
        timeout: Seconds passed to ``requests.get``.  Without a timeout a
            stalled connection would block the notebook forever.

    Returns:
        list[dict]: One record per heading with the keys ``title``,
        ``location``, ``ingredients``, ``instructions``, ``meal_type``,
        ``occasion`` and ``images``.  An empty list when any error occurs
        (the error is printed).
    """
    try:
        # NOTE(review): verify=False disables TLS certificate checks; kept as
        # in the rest of the notebook, confirm it is still needed.
        response = requests.get(
            url, headers=headers, verify=False, timeout=timeout)
        # Do not parse an HTTP error page as if it were the post.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')

        foods = []

        # The first <h1> is not a recipe heading (presumably the page/blog
        # title - confirm), so it is skipped.
        for h1 in soup.find_all('h1')[1:]:
            food_item = {
                "title": h1.get_text(strip=True),
                "location": {
                    "province": "کهگیلویه و بویراحمد",
                    "city": "یاسوج",
                    "coordinates": {
                        "latitude": 30.6638,
                        "longitude": 51.5949
                    }
                },
                "ingredients": [],
                "instructions": [],
                "meal_type": ["اصلی", "دسر", "پیش غذا"],
                "occasion": ["شام", "ناهار", "صبحانه"],
                "images": {"تصویر نهایی": ""}
            }

            # The recipe body is the first <div> after the heading.
            recipe_div = h1.find_next('div')

            if recipe_div:
                instructions = []
                for p in recipe_div.find_all('p'):
                    if p.find('img'):
                        continue  # Skip paragraphs containing images
                    text = p.get_text(strip=True)
                    # One instruction per non-empty, '.'-separated sentence;
                    # an empty paragraph contributes nothing.
                    instructions.extend(
                        s.strip() for s in text.split('.') if s.strip())

                food_item['instructions'] = instructions

                # Image extraction is not performed here, so the image field
                # keeps its empty default.

            foods.append(food_item)

        return foods

    except Exception as e:
        print(f"Error: {str(e)}")
        return []


if __name__ == "__main__":
    # Scrape the Blogfa post and store the records as indented UTF-8 JSON
    # (ensure_ascii=False writes non-ASCII characters unescaped).
    food_data = scrape_blogfa_foods()

    with open("kohgiluyeh_foods.json", mode="w", encoding="utf-8") as f:
        f.write(json.dumps(food_data, ensure_ascii=False, indent=2))

    print(f"Successfully saved {len(food_data)} food entries.")

Successfully saved 9 food entries.


In [46]:
import os

# Per-province JSON files to merge, in the order they are appended.
input_json_files = [
    "isfahan_foods.json",
    "shiraz_foods.json",
    "hormozgan_foods.json",
    "chaharmahal_foods.json",
    "khuzestan_foods.json",
    "bushehr_foods.json",
    "kohgiluyeh_foods.json",
]

output_json_file = "Local_Foods.json"

combined_food_data = []
total_entries = 0

print("Starting concatenation process...")

# Append the content of every input file that exists and holds a JSON list;
# anything else is reported and skipped.
for file_path in input_json_files:
    if not os.path.exists(file_path):
        print(f"Warning: File not found - {file_path}. Skipping.")
        continue

    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            print(f"Reading data from {file_path}...")
            data = json.load(f)

        if not isinstance(data, list):
            print(
                f"Warning: Content of {file_path} is not a list. Skipping.")
            continue

        combined_food_data.extend(data)
        print(f"-> Added {len(data)} entries from {file_path}")
        total_entries += len(data)

    except json.JSONDecodeError:
        print(
            f"Error: Could not decode JSON from {file_path}. File might be corrupted or empty. Skipping.")
    except Exception as e:
        print(
            f"An error occurred while reading {file_path}: {e}. Skipping.")

# Write the merged list only when at least one entry was collected.
if not combined_food_data:
    print("\nNo data was collected from the input files. Output file not created.")
else:
    try:
        with open(output_json_file, 'w', encoding='utf-8') as f:
            json.dump(combined_food_data, f, ensure_ascii=False, indent=2)
        print(
            f"\nSuccessfully combined a total of {total_entries} food entries into {output_json_file}.")
    except Exception as e:
        print(
            f"\nError writing the final combined file {output_json_file}: {e}")

Starting concatenation process...
Reading data from isfahan_foods.json...
-> Added 28 entries from isfahan_foods.json
Reading data from shiraz_foods.json...
-> Added 32 entries from shiraz_foods.json
Reading data from hormozgan_foods.json...
-> Added 6 entries from hormozgan_foods.json
Reading data from chaharmahal_foods.json...
-> Added 3 entries from chaharmahal_foods.json
Reading data from khuzestan_foods.json...
-> Added 79 entries from khuzestan_foods.json
Reading data from bushehr_foods.json...
-> Added 28 entries from bushehr_foods.json
Reading data from kohgiluyeh_foods.json...
-> Added 9 entries from kohgiluyeh_foods.json

Successfully combined a total of 185 food entries into Local_Foods.json.


Cleaning the Label Studio Output

In [None]:
import json
import re  
import copy  

# File that is read, and file the filtered copy is written to.
input_json_file = 'LabelStudio_Output.json'
output_json_file = 'Filered_Data.json'

# Top-level keys stripped from every entry that is kept.
KEYS_TO_REMOVE = {
    "id", "annotator", "annotation_id",
    "created_at", "updated_at", "lead_time",
    # Removed like the others, but only after it was used for filtering.
    "quality",
}



def count_words(text):
    """Return how many whitespace-separated words ``text`` contains.

    Any value that is not a ``str`` (``None``, lists, dicts, numbers, ...)
    counts as zero words.
    """
    # str.split() without a separator splits on runs of whitespace and never
    # yields empty tokens, so surrounding or repeated whitespace is harmless.
    return len(text.split()) if isinstance(text, str) else 0


# Filter the annotated export, save the cleaned copy, then print word-count
# statistics for the entries that were kept.
original_data = []
processed_data = []

try:
    print(f"Loading data from '{input_json_file}'...")
    with open(input_json_file, 'r', encoding='utf-8') as f:
        original_data = json.load(f)

    # A single JSON object is treated as a one-entry list.
    if isinstance(original_data, dict):
        original_data = [original_data]
    elif not isinstance(original_data, list):
        # Raise instead of calling exit(): inside a notebook exit() shuts the
        # kernel down (losing all state), and in plain Python its SystemExit
        # bypasses the handlers below.  The error is reported by the generic
        # handler at the end of this cell.
        raise TypeError(
            f"Expected JSON root to be an object or a list, but got {type(original_data)}")

    print(f"Loaded {len(original_data)} entries.")

    print("Filtering out 'بد' quality entries and removing metadata...")
    original_items_count = len(original_data)
    filtered_out_count = 0

    for index, item in enumerate(original_data):
        if not isinstance(item, dict):
            print(
                f"Warning: Skipping item at original index {index} because it's not a dictionary (type: {type(item)}). Value: {item}")
            filtered_out_count += 1
            continue

        # Entries whose quality is "بد" are dropped.
        if item.get("quality") == "بد":
            filtered_out_count += 1
            continue

        # Shallow copy is enough: only top-level keys are removed, and the
        # loaded entry itself stays untouched.
        item_for_output = item.copy()
        for key_to_remove in KEYS_TO_REMOVE:
            item_for_output.pop(key_to_remove, None)

        processed_data.append(item_for_output)

    print(
        f"Filtering complete. Kept {len(processed_data)} entries, removed {filtered_out_count} entries.")

    print(f"Saving filtered and cleaned data to '{output_json_file}'...")
    with open(output_json_file, 'w', encoding='utf-8') as f:
        json.dump(processed_data, f, ensure_ascii=False, indent=2)
    print("Save complete.")

    print("\n--- Analyzing Final Data ---")
    final_total_word_count = 0
    final_total_instruction_words = 0
    final_instruction_set_count = 0

    if not processed_data:
        print("No data remaining after filtering to analyze.")
    else:
        for item in processed_data:
            if not isinstance(item, dict):
                continue

            # Words counted per entry: title, province, city, ingredient
            # names and units, instruction steps, meal_type and occasion.
            item_word_count = 0
            item_word_count += count_words(item.get("title"))

            location = item.get("location")
            if isinstance(location, dict):
                item_word_count += count_words(location.get("province"))
                item_word_count += count_words(location.get("city"))

            ingredients = item.get("ingredients")
            if isinstance(ingredients, list):
                for ingredient in ingredients:
                    if isinstance(ingredient, dict):
                        item_word_count += count_words(ingredient.get("name"))
                        item_word_count += count_words(ingredient.get("unit"))

            instructions = item.get("instructions")
            current_instruction_words = 0
            has_valid_instructions = False
            if isinstance(instructions, list):
                for instruction_step in instructions:
                    step_words = count_words(instruction_step)
                    item_word_count += step_words
                    current_instruction_words += step_words
                # Only entries with at least one instruction word take part
                # in the average computed below.
                if current_instruction_words > 0:
                    has_valid_instructions = True

            # NOTE(review): meal_type is counted only when it is a plain
            # string and occasion only when it is a dict holding a "choices"
            # list; any other shape adds 0 words - confirm this matches the
            # export format.
            item_word_count += count_words(item.get("meal_type"))

            occasion = item.get("occasion")
            if isinstance(occasion, dict):
                choices = occasion.get("choices")
                if isinstance(choices, list):
                    for choice in choices:
                        item_word_count += count_words(choice)

            final_total_word_count += item_word_count

            if has_valid_instructions:
                final_total_instruction_words += current_instruction_words
                final_instruction_set_count += 1

        # Mean instruction words per entry that has instructions (0 if none).
        mean_instruction_length = (final_total_instruction_words / final_instruction_set_count) \
            if final_instruction_set_count > 0 else 0

        print(
            f"Total word count in '{output_json_file}': {final_total_word_count}")
        print(
            f"Average instruction length in '{output_json_file}' (words per recipe with instructions): {mean_instruction_length:.2f}")
        print(
            f"Number of recipes with instructions considered for average: {final_instruction_set_count}")


# --- Error Handling ---
except FileNotFoundError:
    print(f"Error: Input file '{input_json_file}' not found.")
except json.JSONDecodeError as e:
    print(
        f"Error: Could not decode JSON from '{input_json_file}'. Invalid JSON format. Details: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {type(e).__name__} - {e}")
    import traceback
    traceback.print_exc()  # Print detailed traceback for unexpected errors

Loading data from 'LabelStudio_Output.json'...
Loaded 185 entries.
Filtering out 'بد' quality entries and removing metadata...
Filtering complete. Kept 170 entries, removed 15 entries.
Saving filtered and cleaned data to 'Filered_Data.json'...
Save complete.

--- Analyzing Final Data ---
Total word count in 'Filered_Data.json': 26529
Average instruction length in 'Filered_Data.json' (words per recipe with instructions): 119.46
Number of recipes with instructions considered for average: 170
