# Pipeline 1: Listing IDs with parallel scrapers

## Prerequisites

In [None]:
import sys
from pathlib import Path
import os
import random

# The project root is taken to be the parent of the current working directory.
project_root = Path.cwd().parent
print(f"Project root: {project_root}")

# Put the root itself (not only its `sources` folder) at the front of the
# import path, so that `sources.*` imports can be resolved from here.
sys.path.insert(0, str(project_root))
print(f"Added to Python path: {project_root}")

# Set the QE_* environment variables for this session in one go.
os.environ.update(
    {
        "QE_ENV": "dev",
        "QE_CONF_FOLDER": "sources/resources",
    }
)
print(f"Added environment variables: QE_ENV={os.environ['QE_ENV']}, QE_CONF_FOLDER={os.environ['QE_CONF_FOLDER']}")

In [None]:
from sources.datamodel.listing_id import ListingId
from sources.logging import logging_utils
from sources.storage.abstract_storage import Storage
from sources.scrapers.immobiliare.scraper_ids import ImmobiliareIdScraper
from sources.config.config_manager import ConfigManager

## Configuration

In [None]:
# Query-string fragments that can be appended to a search URL.
# NOTE(review): ORDER presumably sorts results by date, newest first, and
# NO_AUCTION presumably filters auctions out server-side -- confirm on the site.
ORDER = "&criterio=data&ordine=desc"
NO_AUCTION = "&noAste=1"

# Base search URL; every entry of URLS is derived from it.
URL = "https://www.immobiliare.it/vendita-case/milano/"

# Each tuple lists the `idMZona` values queried together by a single scraper.
ZONE_GROUPS = [
    (10071, 10316, 10295),
    (10317, 10294, 10319),
    (10320, 10293, 10068),
    (10067, 10066, 10321),
    (10292, 10065),
    (10064, 10055),
    (10072, 10296),
    (10318, 10070),
    (10069, 10059, 10057),
    (10056, 10047, 10054),
    (10053, 10061, 10060),
    (10046, 10050, 10049),
]


def build_search_url(zone_ids, suffix=ORDER, base_url=URL):
    """Return the search URL for one group of zone ids.

    Args:
        zone_ids: Iterable of zone identifiers; each one becomes an
            ``idMZona[]=<id>`` query parameter, in the given order.
        suffix: Query-string fragment appended verbatim after the zone
            parameters (defaults to ``ORDER``).
        base_url: URL the query string is attached to (defaults to ``URL``).

    Returns:
        The complete URL as a string.
    """
    zone_query = "&".join(f"idMZona[]={zone_id}" for zone_id in zone_ids)
    return f"{base_url}?{zone_query}{suffix}"


# NO_AUCTION is intentionally not appended: this run is used to check that
# auctions are skipped by the scraper itself. To filter them in the query
# instead, build the URLs with suffix=ORDER + NO_AUCTION.
URLS = [build_search_url(zone_ids) for zone_ids in ZONE_GROUPS]

In [None]:
# Configure logging from the YAML file first, then get this notebook's logger.
logging_utils.setup_logging(config_path='sources/resources/logging.yaml')
logger = logging_utils.get_logger(__name__)

# A single storage object for ListingId data, built from the storage config;
# the same instance is handed to every scraper below.
settings = ConfigManager().get_storage_config()
storage: Storage = Storage.create_storage(data_type=ListingId, config=settings)

# Build one scraper per search URL (browser not headless).
# For debugging, a page suffix such as "&pag=5" can be appended to each URL.
scrapers = []
for search_url in URLS:
    scrapers.append(
        ImmobiliareIdScraper(storage, scrape_url=search_url, headless=False)
    )

# Randomise the processing order; shuffle() reorders the list in place.
random.shuffle(scrapers)

## Start the pipeline!

In [None]:
import concurrent.futures as cf
import time

# Throttling settings, chosen to be respectful to the server.
max_workers = 3  # Conservative cap on concurrent scrapers for immobiliare.it
stagger_delay = 10  # Upper bound, in seconds, of the random pause before a scraper starts
listing_limit = 600  # Default `limit` handed to each scraper's scrape() call


def run_scraper_with_delay(scraper, delay=1, limit=None):
    """Pause for a random interval, then run ``scraper.scrape``.

    The pause is drawn uniformly between 0 and ``delay`` seconds, so jobs
    that become runnable at the same moment do not all hit the server at
    once. It is a random jitter, not a fixed interval between scrapers.

    Args:
        scraper: Object exposing ``scrape(limit=...)``.
        delay: Maximum pause in seconds. Should be non-negative; a negative
            sleep duration is rejected by ``time.sleep``.
        limit: Value forwarded as ``scrape(limit=...)``. When ``None``, the
            module-level ``listing_limit`` is used (looked up at call time).

    Returns:
        Whatever ``scraper.scrape`` returns.
    """
    time.sleep(random.uniform(0, delay))
    effective_limit = listing_limit if limit is None else limit
    return scraper.scrape(limit=effective_limit)


# Fan the scrapers out over a small thread pool. Each future is mapped to the
# 1-based position of its scraper in `scrapers`, so that every log line says
# which job it refers to.
with cf.ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {
        executor.submit(run_scraper_with_delay, scraper, stagger_delay): position
        for position, scraper in enumerate(scrapers, start=1)
    }
    total = len(futures)
    failures = 0

    # Handle jobs in completion order rather than submission order.
    for future in cf.as_completed(futures):
        position = futures[future]
        try:
            # The return value is not needed here; result() is still called
            # because it re-raises any exception raised inside the worker.
            future.result()
        except Exception as e:
            # Boundary handler: one failing scraper must not stop the others.
            # logger.exception records the traceback along with the message.
            failures += 1
            logger.exception(f"Scraper {position}/{total} failed: {e}")
        else:
            logger.info(f"Scraper {position}/{total} completed")

logger.info(f"Scraping finished: {total - failures} succeeded, {failures} failed")