In [None]:
from bs4 import BeautifulSoup
import requests
import time
from tqdm import tqdm, trange
from openpyxl import Workbook

In [None]:
# Base address of the paginated recipe listing; ScrapeLinks appends the page
# number directly to this string.
url = "https://www.lamaistas.lt/visi-receptai/"
# Text file that collects the recipe links, one link per line.
urls_file = "urls.txt"
# Excel workbook that receives the scraped recipe fields.
output_data = "data.xlsx"

In [None]:
# Number of listing pages to crawl (presumably total recipes / recipes per
# listing page -- TODO confirm the 44). Ceiling division is used instead of
# round() so that a partially filled last page can never be dropped; for the
# current figures the result is still 578.
n = -(-25419 // 44)


def ScrapeLinks(url, output_file):
    """Collect recipe links from the paginated listing into a text file.

    Pages ``{url}1`` .. ``{url}{n}`` are downloaded one after another. Every
    ``href`` containing ``/receptas/`` is appended to ``output_file``. After
    the last page the file is rewritten without duplicates and without blank
    lines, keeping the links in first-seen order.

    Args:
        url: Listing address to which the page number is appended.
        output_file: Path of the UTF-8 text file that receives one link per
            line. Existing content is kept and de-duplicated together with
            the newly found links.

    A page that fails (network error, HTTP error status, parse error) is
    reported on stdout and skipped; it does not abort the crawl.
    """
    headers = {"User-Agent": "Mozilla/5.0"}

    # Make sure the file exists so the de-duplication step below also works
    # when no page produced a single link.
    with open(output_file, "a", encoding="utf-8"):
        pass

    for i in tqdm(range(1, n + 1), desc="Scraping pages"):
        new_url = f"{url}{i}"
        try:
            # The timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(new_url, headers=headers, timeout=10)
            # A 4xx/5xx answer is an error page, not a listing.
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")

            links = [
                a["href"]
                for a in soup.find_all("a", href=True)
                if "/receptas/" in a["href"]
            ]
            # Open the file once per page (not once per link); results of
            # finished pages are on disk even if a later page fails.
            with open(output_file, "a", encoding="utf-8") as f:
                f.writelines(link + "\n" for link in links)
        except Exception as e:
            # Exception rather than a bare except: Ctrl-C / a kernel
            # interrupt must still be able to stop the crawl.
            print(f"Error in page {i}: {e}")
        # Pause between requests so the site is not hammered.
        time.sleep(0.3)

    # Delete duplicates. dict.fromkeys keeps the first occurrence of each link
    # in its original position, so the file content is reproducible.
    with open(output_file, "r", encoding="utf-8") as f:
        unique_links = list(dict.fromkeys(line.strip() for line in f if line.strip()))

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(link + "\n" for link in unique_links)

In [None]:
# Crawl the listing pages and store the de-duplicated recipe links in urls_file.
# The recorded run below took about 7.5 minutes for 578 pages.
ScrapeLinks(url, urls_file)

Scraping pages: 100%|██████████| 578/578 [07:26<00:00,  1.29it/s]


In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed

def clean(text):
    """Flatten ``text`` to a single trimmed line.

    Line feeds become spaces, carriage returns are removed and surrounding
    whitespace is stripped. Any falsy input (``None``, ``""``) yields ``""``.
    """
    if text:
        single_line = text.replace("\n", " ")
        without_cr = single_line.replace("\r", "")
        return without_cr.strip()
    return ""

def scrape_single_url(url, headers):
    """Download one recipe page and extract its fields.

    Args:
        url: Address of the recipe page.
        headers: HTTP headers passed on to ``requests.get``.

    Returns:
        ``[title, authors_desc, portions, prep_time, ingredients, steps]`` as
        single-line strings. Elements missing from the page become ``""``;
        ingredients and steps are joined with ``"; "``. Returns ``None`` when
        the download (including an HTTP error status) or the parsing fails;
        the failure is printed together with the URL.
    """
    def text_of(tag):
        # find() returns None for a missing element; report it as empty text.
        return tag.text.strip() if tag else ""

    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Treat 4xx/5xx answers as failures. Without this check an error page
        # would be parsed and returned as a row consisting of empty strings.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Title
        title = text_of(soup.find("div", class_="recipeTitleSegment"))

        # Author description
        authors_desc = text_of(soup.find("span", class_="authorsDescription full"))

        # Portions (inner line breaks are removed without inserting a space)
        portions = text_of(soup.find(class_="portionContainer")).replace("\n", "")

        # Preparation time
        prep_time = text_of(soup.find("span", class_="info"))

        # Ingredients: amounts and names are collected separately and paired
        # by position.
        # NOTE(review): empty ingredient names are dropped before pairing while
        # amounts are not filtered, so pairs could shift if the page contains
        # empty ingredient spans -- confirm against the site's markup.
        amounts = [a.text.strip() for a in soup.find_all(class_="amount")]
        names = [i.text.strip() for i in soup.find_all("span", class_="ingredient")]
        names = [name for name in names if name]
        # zip stops at the shorter list, so surplus amounts or names are ignored.
        ingredients = "; ".join(
            amount + " " + name for amount, name in zip(amounts, names)
        )

        # Steps: text block inside each description container, if present.
        step_tags = [
            d.find("div", class_="text")
            for d in soup.find_all("div", class_="description")
        ]
        steps = "; ".join(t.text.strip() for t in step_tags if t)

        # Clean: collapse every field to a single line.
        return [
            clean(title),
            clean(authors_desc),
            clean(portions),
            clean(prep_time),
            clean(ingredients),
            clean(steps),
        ]

    except Exception as e:
        print(f"Error in url {url.strip()}: {e}")
        return None


def ScrapeData(urls_file, output_xlsx, max_workers=20):
    """Scrape every recipe URL listed in a file and save the rows to Excel.

    Args:
        urls_file: UTF-8 text file with one recipe URL per line; blank lines
            are ignored.
        output_xlsx: Path of the workbook to write.
        max_workers: Number of pages downloaded in parallel.

    The sheet starts with a header row followed by one row per successfully
    scraped page, in completion order (not in file order). Pages for which
    ``scrape_single_url`` returns nothing are skipped.

    The workbook is saved even when the run is interrupted or fails part-way,
    so the rows collected up to that point are not lost; the original error
    is re-raised afterwards.
    """
    with open(urls_file, "r", encoding="utf-8") as f:
        urls = [u.strip() for u in f if u.strip()]

    headers = {"User-Agent": "Mozilla/5.0"}

    wb = Workbook()
    ws = wb.active
    ws.append(["title", "authors_desc", "portions", "prep_time", "ingredients", "steps"])

    try:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(scrape_single_url, url, headers) for url in urls]
            try:
                for future in tqdm(as_completed(futures), total=len(futures), desc="Scraping data"):
                    result = future.result()
                    if result:
                        ws.append(result)
            except BaseException:
                # Drop the downloads that have not started yet; otherwise
                # leaving the pool would wait for the entire queued backlog.
                for pending in futures:
                    pending.cancel()
                raise
    finally:
        # Persist whatever was collected, also after an error or interrupt.
        wb.save(output_xlsx)

In [None]:
# Scrape every collected recipe link into the Excel workbook.
# The recorded run below needed roughly 52 minutes for 25,419 links.
ScrapeData(urls_file, output_data)

Scraping data:  19%|█▉        | 4888/25419 [09:12<52:12,  6.55it/s]

Error in url https://www.lamaistas.lt/receptas/mesos-pyragas-su-idaru-30495: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))


Scraping data: 100%|██████████| 25419/25419 [51:50<00:00,  8.17it/s]


In [None]:
# Colab-specific step: `google.colab` is presumably only importable inside a
# Colab runtime, so skip this cell when running the notebook elsewhere.
from google.colab import files
# Hand the finished workbook to Colab's download helper.
files.download(output_data)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>