# **INAIL Publications Scraper con Docling + VLM Integration**

In [1]:
# SETUP
%pip install -q google-colab-selenium docling "docling[vlm]" transformers torch pillow accelerate

import google_colab_selenium as gs
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from urllib.parse import urljoin, quote_plus
from pathlib import Path
from datetime import datetime
from typing import Dict, Any, Optional, List
import unicodedata
import re
import time
import json
import random

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.4/251.4 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.5/164.5 kB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m113.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.7/42.7 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m6.7 MB/s[0m eta 

**SELENIUM DRIVER SETUP WITH AUTO-RECOVERY**

This section initializes the Selenium Chrome WebDriver with robust error
handling. The `recreate_driver()` function enables automatic driver recovery if the session crashes or times out during long scraping operations, ensuring continuity without manual intervention.

In [22]:
# Helper that rebuilds the Selenium driver when the current session dies
def recreate_driver():
    """Dispose of the current global Chrome driver (if any) and start a new one.

    The new headless driver is stored in the module-level ``driver`` variable
    and is also returned to the caller.

    Returns:
        The freshly created ``gs.Chrome`` instance.
    """
    global driver
    try:
        driver.quit()
    except Exception:
        # Best effort: on the very first call ``driver`` does not exist yet
        # (NameError), and a crashed session may refuse to quit cleanly.
        # Catching Exception instead of a bare except lets Ctrl-C through.
        pass

    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')

    driver = gs.Chrome(options=options)
    driver.set_page_load_timeout(180)
    driver.implicitly_wait(20)

    print("[INFO] Driver ricreato")
    return driver

# Create the initial driver (recreate_driver also stores it in the global ``driver``)
driver = recreate_driver()

<IPython.core.display.Javascript object>

[INFO] ♻️ Driver ricreato


**SELENIUM DRIVER INITIALIZATION**

Initialize Chrome WebDriver with Colab-optimized options (headless mode, security bypasses, timeouts). Base URL set for INAIL publications catalog.

In [23]:
print("[SETUP] Inizializzazione driver Selenium...")

# A driver may already be bound to ``driver`` (the previous cell creates one).
# Close it before replacing it, otherwise its Chrome process stays alive in
# the Colab VM with no handle left to quit it.
_previous_driver = globals().get("driver")
if _previous_driver is not None:
    try:
        _previous_driver.quit()
    except Exception as exc:
        print(f"[SETUP] Chiusura driver precedente fallita: {exc}")

options = Options()
for _chrome_flag in (
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    '--disable-software-rasterizer',
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
):
    options.add_argument(_chrome_flag)

driver = gs.Chrome(options=options)
driver.set_page_load_timeout(180)  # 3 minutes for a page load
driver.implicitly_wait(20)  # 20 s implicit wait on element lookups

print("[SETUP] Driver inizializzato\n")

# Entry point of the INAIL publications catalogue; search URLs are built on it.
BASE_CATALOG_URL = "https://www.inail.it/portale/it/inail-comunica/pubblicazioni/catalogo-generale.html"

[SETUP] Inizializzazione driver Selenium...


<IPython.core.display.Javascript object>

[SETUP] ✅ Driver inizializzato



**GOOGLE DRIVE INTEGRATION**

Functions for mounting Google Drive, backing up scraped data, and restoring
from previous sessions. Enables persistent storage across Colab sessions.

In [5]:
def mount_google_drive() -> bool:
    """Mount Google Drive at ``/content/drive`` through ``google.colab``.

    Returns:
        True when the mount call completed, False when ``google.colab`` is not
        importable or mounting raised. The reason is printed with the warning.
    """
    try:
        # Imported lazily: the module only exists inside a Colab runtime.
        from google.colab import drive
        drive.mount('/content/drive', force_remount=False)
        print("[SUCCESS] Drive montato")
        return True
    except Exception as exc:
        # Exception (not a bare except) so Ctrl-C during the auth prompt is
        # not mistaken for "Drive unavailable".
        print(f"[WARNING] Drive non disponibile: {exc}")
        return False

def backup_to_drive() -> bool:
    """Copy ``INAILConfig.OUTPUT_DIR`` into ``INAILConfig.DRIVE_BACKUP_DIR``.

    Returns:
        True when the tree was copied, False when there is nothing to back up,
        Drive could not be mounted, or the copy raised.
    """
    import shutil
    source = INAILConfig.OUTPUT_DIR
    dest = INAILConfig.DRIVE_BACKUP_DIR

    if not source.exists():
        return False

    try:
        # Abort when the mount fails. Copying anyway would write to a plain
        # local directory and report a backup that never reached Drive.
        # NOTE(review): presumably DRIVE_BACKUP_DIR lives under /content/drive - confirm.
        if not Path('/content/drive').exists() and not mount_google_drive():
            print("[ERROR] Backup fallito: Drive non montato")
            return False
        shutil.copytree(str(source), str(dest), dirs_exist_ok=True)
        print("[SUCCESS] Backup su Drive completato")
        return True
    except Exception as e:
        print(f"[ERROR] Backup fallito: {e}")
        return False

def restore_from_drive():
    """Copy the Drive backup tree back into the local output directory.

    Reads ``INAILConfig.DRIVE_BACKUP_DIR`` (source) and
    ``INAILConfig.OUTPUT_DIR`` (destination). Tries to mount Drive first when
    ``/content/drive`` is missing.

    Returns:
        True when the backup was copied, False when no backup exists or the
        copy raised.
    """
    import shutil
    backup_dir = INAILConfig.DRIVE_BACKUP_DIR
    local_dir = INAILConfig.OUTPUT_DIR

    try:
        drive_root = Path('/content/drive')
        if not drive_root.exists():
            mount_google_drive()

        if backup_dir.exists():
            shutil.copytree(str(backup_dir), str(local_dir), dirs_exist_ok=True)
            print(f"[SUCCESS] Dati ripristinati da Drive")
            return True

        print("[INFO] Nessun backup trovato")
        return False
    except Exception as err:
        print(f"[ERROR] Ripristino fallito: {err}")
        return False

**DOCLING INITIALIZATION**

In [6]:
def initialize_docling_converter():
    """Build the Docling ``DocumentConverter`` used for PDF processing.

    The PDF pipeline generates picture images at scale 2.0 with OCR switched
    on and Docling's own picture description switched off. A status line that
    reports ``INAILConfig.VLM_ENABLED`` is printed before returning.

    Returns:
        The configured ``DocumentConverter``.
    """
    from docling.datamodel.pipeline_options import PdfPipelineOptions
    from docling.datamodel.base_models import InputFormat
    from docling.document_converter import DocumentConverter, PdfFormatOption

    pdf_options = PdfPipelineOptions()
    pdf_options.generate_picture_images = True
    pdf_options.images_scale = 2.0
    pdf_options.do_ocr = True
    pdf_options.do_picture_description = False

    pdf_format = PdfFormatOption(pipeline_options=pdf_options)
    docling_converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

    vlm_state = 'ON' if INAILConfig.VLM_ENABLED else 'OFF'
    print(f"[INFO] Docling OK | VLM: {vlm_state}")
    return docling_converter

**DOCUMENT PROCESSING HELPER FUNCTIONS**

Core utilities for PDF download, image/table extraction from Docling output,
document structure analysis (headings/tables/figures with metadata), and
optional VLM-based image description generation using SmolVLM transformer.

In [7]:
def download_pdf_inail(pdf_url: str, output_path: Path) -> bool:
    """Download ``pdf_url`` to ``output_path`` with ``wget``.

    ``wget`` is run with an argument list, not through a shell. The URL comes
    from a scraped page, so it must never be interpolated into a shell command
    line, and the output path may contain spaces.

    Args:
        pdf_url: Absolute URL of the PDF.
        output_path: Destination file.

    Returns:
        True when wget exited successfully and left a non-empty file, False
        otherwise. On failure any empty or partial file is removed.
    """
    import subprocess

    try:
        completed = subprocess.run(
            ["wget", "-q", "-O", str(output_path), "--", pdf_url],
            check=False,
        )
        downloaded = (
            completed.returncode == 0
            and output_path.exists()
            and output_path.stat().st_size > 0
        )
    except OSError:
        # wget is missing or the path cannot be inspected.
        downloaded = False

    if not downloaded:
        # ``wget -O`` creates the target before transferring, so a failed run
        # leaves an empty or truncated file behind.
        try:
            output_path.unlink()
        except OSError:
            pass

    return downloaded

def export_images_from_document(doc, doc_id: str) -> List[Dict[str, Any]]:
    """Save every picture found in a Docling document as a PNG file.

    Files go to ``INAILConfig.IMAGES_DIR / doc_id`` and are named
    ``figure_<n>.png``, where ``n`` counts the pictures saved so far.

    Args:
        doc: Docling document; it is only walked when it has ``iterate_items``.
        doc_id: Sub-directory name for this document's images.

    Returns:
        One dict per saved image with ``path``, ``filename``, ``caption``,
        ``position`` and a ``vlm_description`` placeholder set to None.
    """
    target_dir = INAILConfig.IMAGES_DIR / doc_id
    target_dir.mkdir(parents=True, exist_ok=True)
    saved = []

    try:
        from docling_core.types.doc import PictureItem

        items = doc.iterate_items() if hasattr(doc, 'iterate_items') else ()
        for element, _level in items:
            if not isinstance(element, PictureItem):
                continue

            # Numbering follows successful exports, so a failed picture does
            # not consume a number.
            img_filename = f"figure_{len(saved) + 1}.png"
            img_path = target_dir / img_filename

            try:
                element.get_image(doc).save(str(img_path), "PNG")
                # NOTE(review): the caption is read from an attribute named
                # ``caption`` with 'N/A' as fallback - confirm PictureItem
                # actually exposes that attribute.
                record = {
                    'path': str(img_path),
                    'filename': img_filename,
                    'caption': getattr(element, 'caption', 'N/A'),
                    'position': len(saved),
                    'vlm_description': None
                }
                saved.append(record)
                print(f"  [+] {img_filename}")
            except Exception as err:
                print(f"  [!] Errore figura: {err}")

        print(f"[OK] {len(saved)} immagini esportate")
    except Exception as err:
        print(f"[ERROR] Export immagini: {err}")

    return saved

def export_tables_to_files(tables: List[Dict], doc_id: str) -> List[str]:
    """Write one plain-text report per table under ``INAILConfig.TABLES_DIR / doc_id``.

    Args:
        tables: Table dicts; each needs ``table_id`` and ``text_content``,
            and may carry ``caption``, ``potential_columns``, ``num_rows`` and
            ``position``.
        doc_id: Sub-directory name for this document's tables.

    Returns:
        Paths (as strings) of the reports written successfully. A table whose
        report fails is logged and skipped.
    """
    if not tables:
        return []

    target_dir = INAILConfig.TABLES_DIR / doc_id
    target_dir.mkdir(parents=True, exist_ok=True)

    heavy_rule = "=" * 80
    light_rule = "-" * 80
    written_paths = []

    for entry in tables:
        name = entry['table_id']
        report_path = target_dir / f"{name}.txt"

        try:
            with open(report_path, 'w', encoding='utf-8') as out:
                out.write(f"{heavy_rule}\nTABELLA: {name}\n{heavy_rule}\n\n")
                out.write(f"CAPTION: {entry.get('caption', 'N/A')}\n\n")

                if entry.get('potential_columns'):
                    out.write("COLONNE IDENTIFICATE:\n")
                    out.writelines(
                        f"  {number}. {column}\n"
                        for number, column in enumerate(entry['potential_columns'], 1)
                    )
                    out.write("\n")

                out.write(
                    "METADATI STRUTTURALI:\n"
                    f"  - Righe: {entry.get('num_rows', 'N/A')}\n"
                    f"  - Posizione: {entry.get('position', 'N/A')}\n"
                )
                out.write(f"  - Colonne: {len(entry.get('potential_columns', []))}\n\n")

                out.write(f"CONTENUTO:\n{light_rule}\n")
                out.write(entry['text_content'])
                out.write(f"\n{light_rule}\n")

            written_paths.append(str(report_path))
            print(f"  [+] {report_path.name}")
        except Exception as err:
            print(f"  [!] Errore tabella: {err}")

    return written_paths

def _is_table_separator(line: str) -> bool:
    """Return True for a Markdown table delimiter row such as ``|---|:--:|``."""
    row = line.strip()
    return (
        '|' in row
        and ('-' in row or '─' in row)
        and not row.strip('|-─: \t')
    )


def _record_table(structure: Dict[str, Any], lines: List[str],
                  rows: List[str], start_idx: int) -> None:
    """Append one table entry to ``structure`` if ``rows`` really form a table."""
    # A run of lines that merely contain '|' is not a table unless it has a
    # delimiter row.
    if not any(_is_table_separator(row) for row in rows):
        return

    content_rows = [row for row in rows if not _is_table_separator(row)]

    # Caption heuristic: a line mentioning "Tabella" in the 5 lines above.
    caption = 'N/A'
    for j in range(max(0, start_idx - 5), start_idx):
        candidate = lines[j].strip()
        if candidate and 'Tabella' in candidate:
            caption = candidate
            break

    potential_columns = []
    if content_rows:
        cells = [c.strip() for c in content_rows[0].split('|') if c.strip()]
        potential_columns = [col for col in cells if col not in ['-', '─']]

    structure['num_tables'] += 1
    structure['tables'].append({
        'table_id': f'table_{structure["num_tables"]}',
        'caption': caption,
        'text_content': '\n'.join(rows),
        'num_rows': len(content_rows),
        'num_columns': len(potential_columns),
        'potential_columns': potential_columns,
        'position': start_idx
    })


def analyze_document_structure(doc) -> Dict[str, Any]:
    """Extract headings, tables and figures from a document's Markdown export.

    The analysis is purely textual: ``doc.export_to_markdown()`` is split into
    lines and scanned three times.

    * Headings: lines whose stripped form starts with ``#``; the level is the
      number of leading ``#`` after stripping, so indented headings are
      measured correctly.
    * Tables: maximal runs of consecutive lines containing ``|`` that include
      a delimiter row. The header row is part of the table, column names come
      from the first non-delimiter row, ``num_rows`` counts non-delimiter
      rows, and a table that ends the document is recorded too.
    * Figures: ``<!-- image -->`` placeholders or ``![...](...)`` lines; the
      caption is the first non-empty, non-heading line within the next 4 lines.

    Args:
        doc: Any object exposing ``export_to_markdown()``.

    Returns:
        Dict with ``num_tables``, ``num_figures``, ``num_headings`` and the
        lists ``headings``, ``tables``, ``figures``. If the analysis raises,
        the error is printed and whatever was collected so far is returned.
    """
    structure = {
        'num_tables': 0,
        'num_figures': 0,
        'num_headings': 0,
        'headings': [],
        'tables': [],
        'figures': []
    }

    try:
        lines = doc.export_to_markdown().split('\n')

        # Headings
        for i, line in enumerate(lines):
            stripped = line.strip()
            if stripped.startswith('#'):
                title = stripped.lstrip('#')
                level = len(stripped) - len(title)
                structure['num_headings'] += 1
                structure['headings'].append({
                    'type': f'heading_level_{level}',
                    'text': title.strip()[:200],
                    'position': i,
                    'level': level
                })

        # Tables: the trailing '' sentinel flushes a table that ends the text.
        table_rows = []
        table_start_idx = -1
        for i, line in enumerate(lines + ['']):
            if '|' in line:
                if not table_rows:
                    table_start_idx = i
                table_rows.append(line)
            elif table_rows:
                _record_table(structure, lines, table_rows, table_start_idx)
                table_rows = []

        # Figures
        for i, line in enumerate(lines):
            if '<!-- image -->' in line.lower() or (line.strip().startswith('![') and '](' in line):
                structure['num_figures'] += 1
                caption = 'N/A'
                for j in range(i + 1, min(len(lines), i + 5)):
                    candidate = lines[j].strip()
                    if candidate and not candidate.startswith('#'):
                        caption = candidate
                        break

                structure['figures'].append({
                    'figure_id': f'figure_{structure["num_figures"]}',
                    'caption': caption,
                    'position': i,
                    'vlm_description': None
                })

    except Exception as e:
        print(f"[!] Errore analisi: {e}")

    return structure

def add_vlm_descriptions_to_images(exported_images: List[Dict], doc_id: str) -> List[Dict]:
    """Fill ``vlm_description`` for each exported image using the configured VLM.

    Does nothing when the list is empty or ``INAILConfig.VLM_ENABLED`` is
    false. Otherwise loads ``INAILConfig.VLM_MODEL``, describes each image
    with ``INAILConfig.VLM_PROMPT`` and stores the text in place. A failure on
    one image sets its description to None; a failure while loading the model
    is printed and leaves the list untouched.

    Args:
        exported_images: Image dicts with at least ``path`` and ``filename``.
        doc_id: Not used by the body; kept for the callers.

    Returns:
        The same ``exported_images`` list.
    """
    if not exported_images:
        return exported_images
    if not INAILConfig.VLM_ENABLED:
        return exported_images

    try:
        from transformers import AutoProcessor, AutoModelForVision2Seq
        from PIL import Image
        import torch

        print(f"\n[VLM] Caricamento {INAILConfig.VLM_MODEL}...")

        # NOTE(review): trust_remote_code=True executes code shipped with the
        # model repository - make sure VLM_MODEL points to a trusted source.
        vlm_processor = AutoProcessor.from_pretrained(INAILConfig.VLM_MODEL, trust_remote_code=True)
        weights_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        vlm_model = AutoModelForVision2Seq.from_pretrained(
            INAILConfig.VLM_MODEL,
            torch_dtype=weights_dtype,
            device_map="auto",
            trust_remote_code=True
        )

        total = len(exported_images)
        for position, entry in enumerate(exported_images, 1):
            try:
                print(f"  [{position}/{total}] {entry['filename']}")
                picture = Image.open(entry['path']).convert('RGB')

                chat = [{
                    "role": "user",
                    "content": [
                        {"type": "image"},
                        {"type": "text", "text": INAILConfig.VLM_PROMPT}
                    ]
                }]

                chat_prompt = vlm_processor.apply_chat_template(chat, add_generation_prompt=True)
                model_inputs = vlm_processor(
                    text=chat_prompt, images=[picture], return_tensors="pt"
                ).to(vlm_model.device)

                with torch.no_grad():
                    generated = vlm_model.generate(**model_inputs, max_new_tokens=300, do_sample=False)

                decoded = vlm_processor.decode(generated[0], skip_special_tokens=True)
                # Keep only what follows the last "Assistant:" marker, if any.
                answer = decoded.rsplit("Assistant:", 1)[-1].strip()

                entry['vlm_description'] = answer
                print(f"    ✓ {answer[:80]}...")

            except Exception as err:
                print(f"    ✗ {err}")
                entry['vlm_description'] = None

        # Release the model before returning.
        del vlm_model, vlm_processor
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        described = sum(1 for entry in exported_images if entry.get('vlm_description'))
        print(f"[VLM] Completato: {described}/{len(exported_images)}\n")

    except Exception as err:
        print(f"[ERROR] VLM fallito: {err}")

    return exported_images

**CORE SCRAPING FUNCTION**

Main scraper for individual INAIL publications. Extracts web metadata
(title, date, abstract), downloads PDF, processes with Docling (text,
tables, figures), optionally applies VLM for image descriptions, and saves
structured JSON output with all content and metadata.

In [8]:
def scrape_inail_publication(driver, publication_url: str, converter,
                             save_pdf: bool = True, enable_vlm: Optional[bool] = None) -> Dict[str, Any]:
    """Scrape one INAIL publication page and, if present, process its PDF.

    Steps: load the page with Selenium, read title/abstract/date and the
    first download link, download the PDF into ``INAILConfig.PDF_DIR``,
    convert it with ``converter``, analyse structure, export images and
    tables, and optionally add VLM descriptions to the figures.

    Args:
        driver: Selenium driver used to load the page.
        publication_url: URL of the publication detail page.
        converter: Object with ``convert(path)`` returning a result that has
            a ``document`` attribute.
        save_pdf: When False the downloaded PDF is deleted after processing.
        enable_vlm: Overrides ``INAILConfig.VLM_ENABLED`` when not None. Note
            that ``add_vlm_descriptions_to_images`` checks
            ``INAILConfig.VLM_ENABLED`` itself, so True has no effect while
            the config flag is off.

    Returns:
        On success a dict with ``scraping_metadata``, ``web_metadata``,
        ``document_content`` (None when no PDF was processed), ``status``
        ``'success'``, ``has_pdf`` and ``pdf_processed``. On any exception a
        dict with ``scraping_metadata``, ``status`` ``'error'`` and
        ``error_message``.
    """
    use_vlm = enable_vlm if enable_vlm is not None else INAILConfig.VLM_ENABLED

    print(f"\n{'='*80}")
    print(f"SCRAPING PUBBLICAZIONE")
    print(f"{'='*80}\n")

    scraping_metadata = {
        'url': publication_url,
        'timestamp': datetime.now().isoformat(),
        'scraper_version': '4.0_TIMEOUT_FIXED',
        'docling_used': True,
        'vlm_enabled': use_vlm
    }

    try:
        driver.get(publication_url)
        time.sleep(3)  # fixed pause to let the page finish rendering

        page = BeautifulSoup(driver.page_source, "lxml")

        # Web metadata
        title_elem = page.find("h2", class_="h1")
        title = title_elem.get_text(strip=True) if title_elem else "N/A"

        # The abstract is taken from the second "text-20" paragraph.
        descr_blocks = page.find_all("p", class_="text-20")
        abstract = descr_blocks[1].get_text(strip=True) if len(descr_blocks) > 1 else ""

        data_elem = page.find("strong", class_="js-date-value")
        data_pub = data_elem.get_text(strip=True).split(", ")[0] if data_elem else "N/A"

        link_pdf = page.find("ul", class_="list-download")
        pdf_url = None
        if link_pdf:
            a_tag = link_pdf.find("a", href=True)
            if a_tag:
                pdf_url = urljoin("https://www.inail.it", a_tag["href"])

        print(f"[+] {title[:60]}...")
        print(f"[+] Data: {data_pub}")

        # PDF processing
        pdf_data = None
        pdf_local_path = None

        if pdf_url:
            # File name and doc id both derive from the first 50 title chars.
            safe_filename = re.sub(r'[^\w\-_.]', '_', title[:50]) + '.pdf'
            pdf_local_path = INAILConfig.PDF_DIR / safe_filename
            doc_id = safe_filename.replace('.pdf', '')

            if download_pdf_inail(pdf_url, pdf_local_path):
                print(f"[INFO] Processing PDF...")

                result = converter.convert(str(pdf_local_path))
                doc = result.document

                markdown_text = doc.export_to_markdown()
                plain_text = doc.export_to_text()
                structure_info = analyze_document_structure(doc)
                exported_images = export_images_from_document(doc, doc_id)

                if use_vlm and exported_images:
                    exported_images = add_vlm_descriptions_to_images(exported_images, doc_id)
                    # Figures and exported images are matched by index.
                    for idx, fig_info in enumerate(structure_info.get('figures', [])):
                        if idx < len(exported_images):
                            vlm_desc = exported_images[idx].get('vlm_description')
                            if vlm_desc:
                                fig_info['vlm_description'] = vlm_desc

                exported_tables = export_tables_to_files(structure_info['tables'], doc_id)

                pdf_data = {
                    'markdown_content': markdown_text,
                    'plain_text': plain_text,
                    'num_pages': len(doc.pages) if hasattr(doc, 'pages') else 'N/A',
                    'num_tables': structure_info['num_tables'],
                    'num_figures': structure_info['num_figures'],
                    'num_headings': structure_info['num_headings'],
                    'headings': structure_info['headings'],
                    'tables': structure_info['tables'],
                    'figures': structure_info['figures'],
                    'exported_images': exported_images,
                    'exported_tables': exported_tables
                }

                print(f"[OK] {pdf_data['num_pages']} pag | {pdf_data['num_tables']} tab | {pdf_data['num_figures']} fig")

                if not save_pdf:
                    pdf_local_path.unlink()
                    pdf_local_path = None
            else:
                # Do not report a local path for a file that was never
                # downloaded successfully.
                print("[WARNING] Download PDF fallito")
                pdf_local_path = None

        return {
            'scraping_metadata': scraping_metadata,
            'web_metadata': {
                'title': title,
                'abstract': abstract,
                'data_pubblicazione': data_pub,
                'pdf_url': pdf_url,
                'pdf_local_path': str(pdf_local_path) if pdf_local_path else None
            },
            'document_content': pdf_data,
            'status': 'success',
            'has_pdf': pdf_url is not None,
            'pdf_processed': pdf_data is not None
        }

    except Exception as e:
        print(f"[ERROR] {e}")
        return {
            'scraping_metadata': scraping_metadata,
            'status': 'error',
            'error_message': str(e)
        }

def save_to_json(data: Dict[str, Any], output_name: Optional[str] = None) -> Path:
    """Write ``data`` as indented UTF-8 JSON inside ``INAILConfig.JSON_DIR``.

    Args:
        data: Scraping result to serialise.
        output_name: File name to use. When None a name is built from the
            first 50 characters of ``data['web_metadata']['title']``
            (``'unknown'`` if absent) plus a timestamp.

    Returns:
        Path of the file written.
    """
    if output_name is None:
        web_meta = data.get('web_metadata', {})
        slug = re.sub(r'[^\w\-_.]', '_', web_meta.get('title', 'unknown')[:50])
        stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_name = f"{slug}_{stamp}.json"

    destination = INAILConfig.JSON_DIR / output_name

    with open(destination, 'w', encoding='utf-8') as handle:
        json.dump(data, handle, ensure_ascii=False, indent=2)

    return destination

**SEARCH & LINK EXTRACTION WITH ROBUST ERROR HANDLING**

URL building, query normalization, and publication link extraction from INAIL
catalog with automatic retry logic (4 attempts, 20s timeout per page),
intelligent failure detection (stops after 2 consecutive empty pages), and
automatic driver recovery on connection errors. Includes rate limiting and
duplicate filtering.

In [24]:
# Search & link extraction with automatic retry
def normalize_query(q: str) -> str:
    """Normalise a free-text search query before it is URL-encoded.

    Applies Unicode NFKC normalisation, trims, lower-cases, collapses runs of
    whitespace to single spaces and maps typographic apostrophes (U+2019,
    U+2018) to the ASCII apostrophe. NFKC leaves those characters untouched,
    so "dell’amianto" and "dell'amianto" would otherwise be different queries.

    Args:
        q: Raw query text.

    Returns:
        The normalised query.
    """
    q = unicodedata.normalize("NFKC", q).strip().lower()
    collapsed = " ".join(q.split())
    return collapsed.replace("\u2019", "'").replace("\u2018", "'")

def validate_date(date_str: str) -> bool:
    """Return True when ``date_str`` is a real calendar date in ``dd/mm/yyyy`` form.

    The shape is checked first (two digits, two digits, four digits, nothing
    else - not even a trailing newline), then the value is parsed so that
    impossible dates such as ``31/02/2024`` or ``99/99/9999`` are rejected.

    Args:
        date_str: Candidate date text; non-string values yield False.
    """
    if not isinstance(date_str, str) or not re.fullmatch(r"\d{2}/\d{2}/\d{4}", date_str):
        return False
    try:
        datetime.strptime(date_str, "%d/%m/%Y")
    except ValueError:
        return False
    return True

def build_inail_search_url(query: str = None, page: int = 1,
                          start_date: str = None, end_date: str = None) -> str:
    """Compose a catalogue search URL on top of ``BASE_CATALOG_URL``.

    Args:
        query: Free text; normalised and URL-encoded as ``text`` when not blank.
        page: 1-based result page, always emitted as ``page``.
        start_date: Emitted as ``startDate`` only if it passes ``validate_date``.
        end_date: Emitted as ``endDate`` only if it passes ``validate_date``.

    Returns:
        The full search URL.

    Raises:
        ValueError: If ``page`` is lower than 1.
    """
    if page < 1:
        raise ValueError("Pagina >= 1")

    pairs = []
    if query and query.strip():
        pairs.append(("text", quote_plus(normalize_query(query))))

    # Dates that fail validation are left out of the URL without an error.
    for key, value in (("startDate", start_date), ("endDate", end_date)):
        if value and validate_date(value):
            pairs.append((key, quote_plus(value)))

    pairs.append(("page", page))

    query_string = "&".join(f"{key}={value}" for key, value in pairs)
    return f"{BASE_CATALOG_URL}?{query_string}"


# HELPER: rebuild the Selenium driver when the session dies
def recreate_driver():
    """Quit the current global driver (best effort) and start a new one.

    This definition replaces the ``recreate_driver`` declared in the driver
    setup cell earlier in the notebook; compared with that one it also passes
    ``--disable-gpu`` and pauses 5 seconds after start-up. The new driver is
    stored in the module-level ``driver`` variable and returned.

    Returns:
        The freshly created ``gs.Chrome`` instance.
    """
    global driver

    try:
        driver.quit()
    except Exception:
        # ``driver`` may be undefined or the session already dead; either way
        # there is nothing left to close. Exception (not a bare except) keeps
        # Ctrl-C working.
        pass

    print("\n[INFO] Ricreo driver Selenium...")

    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--disable-gpu')
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')

    driver = gs.Chrome(options=options)
    driver.set_page_load_timeout(180)  # 3 minutes
    driver.implicitly_wait(20)

    time.sleep(5)  # let the new session settle
    print("[INFO] Driver ricreato\n")

    return driver


def extract_cards_with_wait(driver, timeout: int = 20, max_retries: int = 4) -> List[Dict[str, str]]:
    """Collect publication cards (title + absolute URL) from the loaded page.

    Each attempt waits for the page, pauses 3 s for rendering and reads every
    ``h3.card-title a[href]`` link. The first attempt that yields at least one
    card returns immediately.

    Args:
        driver: Selenium driver already pointed at a catalogue page.
        timeout: Seconds given to the explicit wait of each attempt.
        max_retries: Maximum number of attempts.

    Returns:
        List of ``{"titolo": ..., "url": ...}`` dicts, or an empty list when
        every attempt failed or found nothing.
    """
    for attempt in range(max_retries):
        try:
            print(f"    [Tentativo {attempt + 1}/{max_retries}]", end=" ")

            # The selector also lists ``body``, so this wait is satisfied as
            # soon as the document has a body, with or without cards.
            WebDriverWait(driver, timeout).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, "h3.card-title a[href], body"))
            )

            time.sleep(3)  # rendering pause

            results = []
            cards_elements = driver.find_elements(By.CSS_SELECTOR, "h3.card-title a[href]")

            for a in cards_elements:
                try:
                    href = a.get_attribute("href")
                    text = a.text.strip()
                    if href and text:
                        results.append({
                            "titolo": text,
                            "url": urljoin("https://www.inail.it", href)
                        })
                except Exception:
                    # Skip elements that cannot be read. Exception (not a bare
                    # except) so Ctrl-C still reaches the caller.
                    continue

            if results:
                print(f"✓ {len(results)} card")
                return results
            else:
                print("✗ Nessuna card")

        except TimeoutException:
            print(f"✗ Timeout")
            if attempt < max_retries - 1:
                wait_time = random.uniform(10, 15)
                print(f"    [Pausa {wait_time:.1f}s]")
                time.sleep(wait_time)
                try:
                    driver.refresh()
                    time.sleep(5)
                except Exception:
                    # A failed refresh is not fatal; the next attempt retries.
                    pass

        except Exception as e:
            print(f"✗ Errore: {str(e)[:50]}")
            if attempt < max_retries - 1:
                time.sleep(8)

    print("    [FALLITO]")
    return []


def get_inail_publication_links(driver, query: str = None, max_pages: int = 1,
                               start_date: str = None, end_date: str = None) -> List[Dict[str, str]]:
    """Walk the catalogue result pages and collect unique publication links.

    For each page the search URL is loaded and the cards are extracted. The
    walk stops early after ``MAX_FAILURES`` consecutive empty or failing
    pages, or on Ctrl-C. When loading raises an error that looks like a
    driver problem, the driver is recreated and the same page is retried, at
    most ``MAX_DRIVER_RESTARTS`` times per page.

    Note: a recreated driver replaces only the local ``driver`` here;
    ``recreate_driver`` additionally publishes it as the global ``driver``.
    The object the caller passed in is not updated.

    Args:
        driver: Selenium driver.
        query: Optional free-text filter.
        max_pages: Number of result pages to visit, starting from 1.
        start_date: Optional ``dd/mm/yyyy`` lower bound.
        end_date: Optional ``dd/mm/yyyy`` upper bound.

    Returns:
        List of ``{"titolo": ..., "url": ...}`` dicts without duplicate URLs.
    """
    all_links = []
    seen = set()
    consecutive_failures = 0
    MAX_FAILURES = 2
    MAX_DRIVER_RESTARTS = 2  # per page; keeps the retry loop bounded

    print(f"\n{'='*80}")
    print(f"ESTRAZIONE LINK (VERSIONE FIXED)")
    print(f"{'='*80}")
    print(f"Query: {query if query else '(tutte)'} | Max pagine: {max_pages}\n")

    for page_num in range(1, max_pages + 1):
        url = build_inail_search_url(query=query, page=page_num,
                                     start_date=start_date, end_date=end_date)

        print(f"[Pagina {page_num}/{max_pages}]")

        driver_restarts = 0
        stop = False

        # Retry loop for the current page: ``continue`` re-runs this page,
        # ``break`` moves on to the next one (or stops when ``stop`` is set).
        while True:
            try:
                driver.get(url)

                if page_num > 1:
                    delay = random.uniform(6, 10)
                    print(f"  [Pausa {delay:.1f}s]")
                    time.sleep(delay)

                cards = extract_cards_with_wait(driver, timeout=20)

                if not cards:
                    consecutive_failures += 1
                    print(f"  Pagina vuota ({consecutive_failures}/{MAX_FAILURES})")

                    if consecutive_failures >= MAX_FAILURES:
                        print(f"\n  STOP: Troppe pagine vuote")
                        stop = True
                    else:
                        time.sleep(random.uniform(10, 15))
                    break

                consecutive_failures = 0
                new_count = 0

                for card in cards:
                    if card["url"] not in seen:
                        seen.add(card["url"])
                        all_links.append(card)
                        new_count += 1

                print(f"  +{new_count} nuove (totale: {len(all_links)})")
                time.sleep(random.uniform(2, 4))
                break

            except KeyboardInterrupt:
                print(f"\n INTERRUZIONE MANUALE")
                stop = True
                break

            except Exception as e:
                consecutive_failures += 1
                error_msg = str(e)
                print(f"  Errore: {error_msg[:100]}")

                # Recreate the driver on timeout / connection-type errors.
                driver_problem = any(
                    keyword in error_msg.lower()
                    for keyword in ['timeout', 'connection', 'disconnected', 'chrome']
                )
                if driver_problem and driver_restarts < MAX_DRIVER_RESTARTS:
                    print(f"  Problema con driver, ricreo...")
                    try:
                        driver = recreate_driver()
                    except Exception as recreate_error:
                        print(f"  Impossibile ricreare driver: {recreate_error}")
                        stop = True
                        break
                    driver_restarts += 1
                    consecutive_failures = 0  # reset after a successful restart
                    print(f"  Ritento pagina {page_num}...")
                    continue  # retry this same page

                # Not a driver problem, or the restart budget is exhausted.
                if consecutive_failures >= MAX_FAILURES:
                    print(f"\n  STOP: Troppi errori consecutivi")
                    stop = True
                else:
                    time.sleep(random.uniform(12, 18))
                break

        if stop:
            break

    print(f"\n{'='*80}")
    print(f"Totale: {len(all_links)} pubblicazioni")
    print(f"{'='*80}\n")

    return all_links

**BATCH SCRAPING**

Mass scraping of INAIL publications. Retrieves publication links,
processes each document (PDF download, Docling parsing, optional VLM), saves
individual JSON files and aggregated batch file, handles interruptions gracefully,
includes rate limiting and comprehensive statistics (success/failure counts,
total tables/figures extracted).

In [25]:
def batch_scrape_inail_publications(driver, converter, query: str = None, max_pages: int = 5,
                                   max_documents: Optional[int] = None, enable_vlm: bool = False,
                                   save_pdfs: bool = True, delay_between_docs: int = 4,
                                   start_date: Optional[str] = None,
                                   end_date: Optional[str] = None) -> List[Dict[str, Any]]:
    """Scrape every publication returned by a catalogue search.

    Collects the publication links, scrapes each one, writes one JSON per
    successful document and finally an aggregated
    ``batch_inail_<query>.json`` in ``INAILConfig.JSON_DIR``. Failures are
    counted and printed but not returned.

    Args:
        driver: Selenium driver.
        converter: Docling converter passed on to ``scrape_inail_publication``.
        query: Optional free-text filter.
        max_pages: Number of result pages to visit.
        max_documents: Optional cap on the documents scraped (a falsy value
            means no cap).
        enable_vlm: Forwarded to ``scrape_inail_publication``.
        save_pdfs: Keep downloaded PDFs on disk.
        delay_between_docs: Lower bound, in seconds, of the random pause
            between documents (upper bound is 2 s more).
        start_date: Optional ``dd/mm/yyyy`` lower bound for the search.
        end_date: Optional ``dd/mm/yyyy`` upper bound for the search.

    Returns:
        The results whose ``status`` is ``'success'``.
    """
    publication_links = get_inail_publication_links(driver, query=query, max_pages=max_pages,
                                                    start_date=start_date, end_date=end_date)

    if not publication_links:
        print("[WARNING] Nessuna pubblicazione trovata")
        return []

    # Link extraction may have replaced a dead session via recreate_driver(),
    # which quits the old driver and publishes the new one only as the global
    # ``driver``. If the handle we hold no longer answers, switch to that one.
    try:
        _ = driver.current_url
    except Exception:
        replacement = globals().get("driver")
        if replacement is not None and replacement is not driver:
            print("[INFO] Driver sostituito durante l'estrazione link, uso quello nuovo")
            driver = replacement

    if max_documents:
        publication_links = publication_links[:max_documents]
        print(f"[INFO] Limitato a {max_documents} documenti\n")

    results = []
    failed = []

    print(f"{'='*80}")
    print(f"SCRAPING {len(publication_links)} DOCUMENTI")
    print(f"{'='*80}\n")

    for idx, pub in enumerate(publication_links, 1):
        print(f"\n[{idx}/{len(publication_links)}] {pub['titolo'][:50]}...")

        try:
            result = scrape_inail_publication(
                driver=driver,
                publication_url=pub['url'],
                converter=converter,
                save_pdf=save_pdfs,
                enable_vlm=enable_vlm
            )

            if result['status'] == 'success':
                results.append(result)
                save_to_json(result)
                print(f"  OK")
            else:
                failed.append({'url': pub['url'], 'title': pub['titolo'],
                              'error': result.get('error_message', 'Unknown')})
                print(f"  FAIL")

        except KeyboardInterrupt:
            print(f"\nINTERRUZIONE - Salvati: {len(results)}")
            break

        except Exception as e:
            print(f"  Eccezione: {str(e)[:80]}")
            failed.append({'url': pub['url'], 'title': pub['titolo'], 'error': str(e)})

        # Rate limiting between documents (skipped after the last one).
        if idx < len(publication_links):
            delay = random.uniform(delay_between_docs, delay_between_docs + 2)
            time.sleep(delay)

    print(f"\n{'='*80}")
    print(f"COMPLETATO: {len(results)} | {len(failed)}")
    print(f"{'='*80}\n")

    # Save the aggregated batch
    if results:
        safe_query = re.sub(r'[^\w\-_.]', '_', query[:30]) if query else "tutte"
        batch_file = f"batch_inail_{safe_query}.json"
        batch_path = INAILConfig.JSON_DIR / batch_file

        with open(batch_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        print(f"Batch salvato: {batch_file}")

        # Stats (documents without processed PDF content are skipped)
        total_tables = sum(r['document_content']['num_tables']
                          for r in results if r.get('document_content'))
        total_figures = sum(r['document_content']['num_figures']
                           for r in results if r.get('document_content'))

        print(f"\nStatistiche:")
        print(f"  - Tabelle: {total_tables}")
        print(f"  - Figure: {total_figures}")

    return results

**INTERACTIVE BATCH SCRAPER INTERFACE**

User-friendly interface for batch scraping with search query, date
range, pagination limits, document cap, and VLM toggle. Displays configuration
summary and requires confirmation before execution.

In [26]:
def interactive_batch_scraping_inail(driver, converter):
    """Ask the user for batch-scraping options, then run the batch scraper.

    The function prompts (via ``input``) for a search query, an optional date
    range, the maximum number of result pages, an optional document cap and
    the VLM toggle. It then prints a summary and asks for confirmation.

    Args:
        driver: Passed through unchanged to ``batch_scrape_inail_publications``.
        converter: Passed through unchanged to ``batch_scrape_inail_publications``.

    Returns:
        The value returned by ``batch_scrape_inail_publications``, or ``None``
        when the user does not answer ``y`` at the confirmation prompt.
    """

    def _ask_positive_int(prompt, default):
        """Read a positive integer; blank or invalid input yields ``default``."""
        raw = input(prompt).strip()
        if not raw:
            return default
        fallback = default if default is not None else "nessun limite"
        try:
            value = int(raw)
        except ValueError:
            # Only conversion errors are handled here: Ctrl-C / EOF must still
            # abort the prompt instead of being turned into a default value.
            print(f"[WARN] Valore non valido {raw!r} - uso: {fallback}")
            return default
        if value < 1:
            print(f"[WARN] Valore non valido {raw!r} - uso: {fallback}")
            return default
        return value

    print("\n" + "="*80)
    print("INAIL BATCH SCRAPER")
    print("="*80 + "\n")

    query = input("Query (Enter = tutte): ").strip() or None

    # Dates that fail validation are discarded rather than aborting the prompt.
    start_date = input("Data inizio (gg/mm/aaaa) o Enter: ").strip()
    if start_date and not validate_date(start_date):
        start_date = None

    end_date = input("Data fine (gg/mm/aaaa) o Enter: ").strip()
    if end_date and not validate_date(end_date):
        end_date = None

    max_pages = _ask_positive_int("Max pagine (default 5): ", 5)
    max_docs = _ask_positive_int("Limite docs (Enter = tutti): ", None)

    vlm_input = input("VLM? (y/n, default: n): ").strip().lower()
    enable_vlm = (vlm_input == 'y')

    print(f"\n{'='*80}")
    print("RIEPILOGO:")
    print(f"{'='*80}")
    print(f"Query: {query if query else '(tutte)'}")
    print(f"Date: {start_date or 'any'} → {end_date or 'any'}")
    print(f"Pagine: {max_pages}")
    print(f"Docs: {max_docs if max_docs else '(tutti)'}")
    print(f"VLM: {'ON' if enable_vlm else 'OFF'}")
    print(f"{'='*80}\n")

    confirm = input("Procedere? (y/n): ").strip().lower()
    if confirm != 'y':
        print("[CANCELLED]")
        return None

    # NOTE(review): start_date / end_date are shown in the summary but are not
    # forwarded below; confirm whether batch_scrape_inail_publications accepts
    # a date filter before wiring them in.
    results = batch_scrape_inail_publications(
        driver=driver,
        converter=converter,
        query=query,
        max_pages=max_pages,
        max_documents=max_docs,
        enable_vlm=enable_vlm,
        save_pdfs=True,
        delay_between_docs=4
    )

    return results

**INCREMENTAL SCRAPING ENGINE**

Resumes scraping from existing batch JSON by loading previously scraped URLs,
extracting new publication links from specified page range, filtering out
duplicates, processing only new documents with Docling/VLM, and appending
results to existing batch file. Enables efficient long-running scraping
sessions across multiple Colab runs without re-downloading processed content.

In [27]:
def load_existing_scraped_urls(json_path: Path) -> set:
    """Collect the publication URLs already stored in a batch JSON file.

    The file is expected to hold a list of result records (or a single record)
    whose URL lives at ``record['scraping_metadata']['url']``. Entries that do
    not have that shape are skipped individually, so one malformed record does
    not hide the URLs of all the others.

    Args:
        json_path: Path of the batch JSON file.

    Returns:
        A set of URL strings; empty when the file is missing, unreadable or
        contains no usable records.
    """
    if not json_path.exists():
        return set()

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"[WARN] Impossibile leggere {json_path.name}: {exc}")
        return set()

    # append_to_json accepts a file holding one bare record, so accept it here too.
    records = data if isinstance(data, list) else [data]

    scraped_urls = set()
    for item in records:
        if not isinstance(item, dict):
            continue
        metadata = item.get('scraping_metadata')
        url = metadata.get('url') if isinstance(metadata, dict) else None
        if isinstance(url, str) and url:
            scraped_urls.add(url)

    return scraped_urls

def append_to_json(new_data: Dict[str, Any], json_path: Path):
    """Append one record to the JSON list stored at ``json_path``.

    An existing list gets the record appended; an existing non-list value is
    wrapped together with the new record into a two-element list; a missing
    file is created with a one-element list.

    If the existing file cannot be read or parsed, it is moved aside to
    ``<name>.corrupt`` (best effort) before a fresh list is started, so earlier
    data is not silently destroyed. The new content is written to a temporary
    sibling file and then moved over the target, so a failed or interrupted
    write leaves the previous file untouched.

    Args:
        new_data: JSON-serialisable record to append.
        json_path: Path of the target JSON file.

    Raises:
        TypeError / ValueError: if ``new_data`` is not JSON-serialisable.
        OSError: if the target location cannot be written.
    """
    records = [new_data]

    if json_path.exists():
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                existing_data = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            backup_path = json_path.with_name(json_path.name + '.corrupt')
            try:
                json_path.replace(backup_path)
                print(f"[WARN] {json_path.name} illeggibile ({exc}); copia salvata in {backup_path.name}")
            except OSError:
                print(f"[WARN] {json_path.name} illeggibile ({exc}); il file verrà sovrascritto")
        else:
            if isinstance(existing_data, list):
                records = existing_data + [new_data]
            else:
                records = [existing_data, new_data]

    # Write to a sibling temp file first, then swap it in.
    tmp_path = json_path.with_name(json_path.name + '.tmp')
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(records, f, ensure_ascii=False, indent=2)
        tmp_path.replace(json_path)
    except BaseException:
        # Do not leave a half-written temp file behind; the target is untouched.
        tmp_path.unlink(missing_ok=True)
        raise

def incremental_batch_scraping_inail(driver, converter, existing_json_filename: str,
                                    query: str = None, start_page: int = 1, max_pages: int = 5,
                                    max_documents: Optional[int] = None, enable_vlm: bool = False,
                                    delay_between_docs: int = 4) -> List[Dict[str, Any]]:
    """Scrape only the publications not yet present in an existing batch JSON.

    Phase 1 walks result pages ``start_page`` .. ``start_page + max_pages - 1``
    and collects cards whose URL is neither a duplicate within this run nor in
    the batch file. Phase 2 scrapes each new publication and, on success,
    appends the result to the batch file immediately.

    Args:
        driver: Selenium driver used for page loads; passed to the helpers.
        converter: Passed through to ``scrape_inail_publication``.
        existing_json_filename: File name inside ``INAILConfig.JSON_DIR``.
        query: Search query forwarded to ``build_inail_search_url``.
        start_page: First result page to visit.
        max_pages: Number of result pages to visit.
        max_documents: When truthy, at most this many new documents are scraped.
        enable_vlm: Forwarded to ``scrape_inail_publication``.
        delay_between_docs: Lower bound (seconds) of the random pause between
            documents; the upper bound is two seconds more.

    Returns:
        The successfully scraped results of this run (possibly empty).
    """
    json_path = INAILConfig.JSON_DIR / existing_json_filename
    already_scraped = load_existing_scraped_urls(json_path)

    print(f"\n{'='*80}")
    print(f"SCRAPING INCREMENTALE")
    print(f"{'='*80}")
    print(f"JSON: {existing_json_filename}")
    print(f"Già processati: {len(already_scraped)}")
    print(f"Pagine: {start_page} → {start_page + max_pages - 1}")
    print(f"{'='*80}\n")

    # Phase 1: collect links from the requested page range.
    all_links = []
    seen = set()
    consecutive_failures = 0
    pages_read = 0  # pages that actually yielded cards
    MAX_FAILURES = 2

    end_page = start_page + max_pages - 1

    for page_num in range(start_page, end_page + 1):
        url = build_inail_search_url(query=query, page=page_num)

        print(f"[Pagina {page_num}/{end_page}]")

        try:
            driver.get(url)

            if page_num > start_page:
                delay = random.uniform(6, 10)
                time.sleep(delay)

            cards = extract_cards_with_wait(driver, timeout=20)

            if not cards:
                # Two empty/failed pages in a row end the page walk.
                consecutive_failures += 1
                if consecutive_failures >= MAX_FAILURES:
                    print(f"  STOP")
                    break
                continue

            consecutive_failures = 0
            pages_read += 1
            new_count = 0
            skipped_count = 0

            for card in cards:
                if card["url"] not in seen:
                    seen.add(card["url"])

                    if card["url"] in already_scraped:
                        skipped_count += 1
                    else:
                        all_links.append(card)
                        new_count += 1

            print(f"  Nuovi: {new_count} |  Skip: {skipped_count}")
            time.sleep(random.uniform(2, 4))

        except Exception as e:
            consecutive_failures += 1
            print(f"  Errore: {str(e)[:80]}")
            if consecutive_failures >= MAX_FAILURES:
                break

    print(f"\n[RESULT] Nuove pubblicazioni: {len(all_links)}\n")

    if not all_links:
        if pages_read == 0:
            # Nothing could be read, so "everything already processed" would be misleading.
            print("[WARN] Nessuna pagina letta: impossibile verificare nuove pubblicazioni")
        else:
            print("[INFO] Tutti già processati!")
        return []

    if max_documents and len(all_links) > max_documents:
        all_links = all_links[:max_documents]

    # Phase 2: scrape the new documents.
    new_results = []

    print(f"{'='*80}")
    print(f"SCRAPING {len(all_links)} NUOVI DOCUMENTI")
    print(f"{'='*80}\n")

    for idx, pub in enumerate(all_links, 1):
        print(f"\n[{idx}/{len(all_links)}] {pub['titolo'][:50]}...")

        try:
            result = scrape_inail_publication(
                driver=driver,
                publication_url=pub['url'],
                converter=converter,
                save_pdf=True,
                enable_vlm=enable_vlm
            )

            if result['status'] == 'success':
                # Persist first, so the returned list only holds saved results.
                save_to_json(result)
                append_to_json(result, json_path)
                new_results.append(result)
                print(f"  Aggiunto a {json_path.name}")
            else:
                # Report failures instead of dropping them silently.
                print(f"  FAIL: {str(result.get('error_message', 'Unknown'))[:80]}")

        except KeyboardInterrupt:
            # Results so far are already on disk; stop cleanly and report them.
            print(f"\nINTERRUZIONE - Salvati: {len(new_results)}")
            break

        except Exception as e:
            print(f"  Eccezione: {str(e)[:80]}")

        if idx < len(all_links):
            time.sleep(random.uniform(delay_between_docs, delay_between_docs + 2))

    print(f"\n{'='*80}")
    print(f"INCREMENTALE COMPLETATO")
    print(f"{'='*80}")
    print(f"Nuovi: {len(new_results)}")
    print(f"Totale in JSON: {len(already_scraped) + len(new_results)}")
    print(f"{'='*80}\n")

    return new_results

**MAIN ENTRY POINT**

Scraper initialization, Drive backup/restore, and execution mode
selection (interactive batch, quick test, backup only, restore only). Sets up
directories, initializes Docling converter, presents mode menu, executes chosen
workflow, and offers auto-backup on completion. Entry point for all scraping
operations.

In [28]:
def main():
    """Interactive entry point: set up, pick a mode, run it, offer a backup.

    Creates the output directories, calls ``restore_from_drive()``, initialises
    the Docling converter and then asks for a mode: ``1`` interactive batch,
    ``3`` backup only, ``4`` restore only, anything else the quick run.
    The scraping modes use the notebook-level ``driver`` variable, which is
    not a parameter of this function.

    Returns:
        ``None`` for the backup-only / restore-only modes, otherwise whatever
        the selected scraping function returned.
    """
    rule = "=" * 80

    print("\n" + rule)
    print("INAIL SCRAPER - SETUP")
    print(rule + "\n")

    INAILConfig.setup_directories()

    print("\n[STEP 1] Controllo backup Drive...")
    restore_from_drive()

    print("\n[STEP 2] Inizializzazione Docling...")
    converter = initialize_docling_converter()

    menu_lines = (
        "MODALITÀ:",
        rule,
        "1. Scraping interattivo (nuovo batch)",
        "2. Scraping rapido (2 pag, 5 docs, no VLM)",
        "3. Solo backup su Drive",
        "4. Solo ripristino da Drive",
    )
    print("\n" + rule)
    for menu_line in menu_lines:
        print(menu_line)
    print(rule + "\n")

    # Blank input selects the quick run.
    choice = input("Scegli (1-4, default 2): ").strip() or "2"

    # Drive-only modes finish here without scraping.
    if choice == "3":
        backup_to_drive()
        return None
    if choice == "4":
        restore_from_drive()
        return None

    if choice == "1":
        results = interactive_batch_scraping_inail(driver, converter)
    else:
        # Any other answer runs the quick test.
        print("\n[TEST RAPIDO] 2 pagine, 5 docs, NO VLM\n")
        quick_options = dict(
            query=None,
            max_pages=2,
            max_documents=5,
            enable_vlm=False,
            delay_between_docs=3,
        )
        results = batch_scrape_inail_publications(
            driver=driver, converter=converter, **quick_options
        )

    # Nothing scraped: no backup prompt.
    if not results or len(results) == 0:
        return results

    wants_backup = input("\nBackup su Drive? (y/n): ").strip().lower() == 'y'
    if wants_backup:
        backup_to_drive()

    return results

**UTILITY FUNCTIONS**

Convenience helpers for quick Drive backup/restore operations and local file
inspection. Provides one-line commands for common tasks (quick_backup,
quick_restore) and detailed listing of local JSON files with batch/individual
file separation and size reporting.

In [29]:
def quick_backup():
    """Quick backup shortcut: call ``mount_google_drive()`` and then ``backup_to_drive()``."""
    mount_google_drive()
    backup_to_drive()

def quick_restore():
    """Quick restore shortcut: call ``mount_google_drive()`` and then ``restore_from_drive()``."""
    mount_google_drive()
    restore_from_drive()

def list_local_files():
    """Print an overview of the ``*.json`` files in ``INAILConfig.JSON_DIR``.

    Files whose name starts with ``batch_`` are listed one by one with their
    size in KB; all remaining JSON files are only counted. When the directory
    does not exist a single informational line is printed instead.
    """
    json_dir = INAILConfig.JSON_DIR

    # Guard clause: nothing to list without the directory.
    if not json_dir.exists():
        print("[INFO] Nessun file trovato")
        return

    json_files = list(json_dir.glob("*.json"))

    # Split into aggregated batch files and per-document files in one pass.
    aggregated, individual = [], []
    for path in json_files:
        bucket = aggregated if path.name.startswith('batch_') else individual
        bucket.append(path)

    rule = '=' * 80
    print("\n" + rule)
    print(f"FILE JSON LOCALI ({len(json_files)} totali)")
    print(rule)

    if aggregated:
        print("\n File aggregati:")
        for path in aggregated:
            size_kb = path.stat().st_size / 1024
            print(f"  • {path.name} ({size_kb:.1f} KB)")

    if individual:
        print(f"\n File singoli: {len(individual)}")

**SCRIPT EXECUTION**

In [13]:
if __name__ == "__main__":
    # Run the interactive entry point, then print a closing summary.
    results = main()

    print("\n" + "="*80, "SCRAPING TERMINATO", "="*80, sep="\n")

    # Output locations are only reported when something was scraped.
    if results:
        print(
            f"✅ Documenti: {len(results)}",
            f"📁 Output: {INAILConfig.OUTPUT_DIR}",
            f"☁️  Drive: {INAILConfig.DRIVE_BACKUP_DIR}",
            sep="\n",
        )

    # Reminder of the helper commands available in the notebook.
    print(
        "\nComandi utili:",
        "  • quick_backup()     - Backup su Drive",
        "  • quick_restore()    - Ripristina da Drive",
        "  • list_local_files() - Mostra file locali",
        "="*80 + "\n",
        sep="\n",
    )


INAIL SCRAPER - SETUP

[INFO] Directory create: inail_scraped_data

[STEP 1] Controllo backup Drive...
Mounted at /content/drive
[SUCCESS] ✅ Drive montato
[SUCCESS] ✅ Dati ripristinati da Drive

[STEP 2] Inizializzazione Docling...
[INFO] Docling OK | VLM: ON

MODALITÀ:
1. Scraping interattivo (nuovo batch)
2. Scraping rapido (2 pag, 5 docs, no VLM)
3. Solo backup su Drive
4. Solo ripristino da Drive

Scegli (1-4, default 2): 1

INAIL BATCH SCRAPER

Query (Enter = tutte): 
Data inizio (gg/mm/aaaa) o Enter: 
Data fine (gg/mm/aaaa) o Enter: 
Max pagine (default 5): 4
Limite docs (Enter = tutti): 
VLM? (y/n, default: n): y

RIEPILOGO:
Query: (tutte)
Date: any → any
Pagine: 4
Docs: (tutti)
VLM: ON

Procedere? (y/n): y

ESTRAZIONE LINK (VERSIONE FIXED)
Query: (tutte) | Max pagine: 4

[Pagina 1/4]
    [Tentativo 1/4] ✓ 10 card
  ✅ +10 nuove (totale: 10)
[Pagina 2/4]
  [Pausa 6.2s]
    [Tentativo 1/4] ✓ 10 card
  ✅ +10 nuove (totale: 20)
[Pagina 3/4]
  [Pausa 9.3s]
    [Tentativo 1/4] ✓ 10 c

[32m[INFO] 2025-10-26 08:58:35,858 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-10-26 08:58:35,863 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-10-26 08:58:37,598 [RapidOCR] download_file.py:82: Download size: 13.83MB[0m
[32m[INFO] 2025-10-26 08:58:45,278 [RapidOCR] download_file.py:95: Successfully saved to: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-10-26 08:58:45,280 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-10-26 08:58:45,508 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-10-26 08:58:45,509 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/cls/ch_ptocr_mobile_v2.0_cls

  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
  [+] figure_41.png
  [+] figure_42.png
  [+] figure_43.png
  [+] figure_44.png
  [+] figure_45.png
  [+] figure_46.png
  [+] figure_47.png
  [+] figure_48.png
  [+] figure_49.png
  [+] figure_50.png
  [+] fig

`torch_dtype` is deprecated! Use `dtype` instead!


  [1/52] figure_1.png
    ✓ This technical document image is a diagram, chart, or schematic of a machine too...
  [2/52] figure_2.png
    ✓ This technical document image is a diagram, chart, or schematic of a process or ...
  [3/52] figure_3.png
    ✓ This technical document image is divided into four sections, each showcasing dif...
  [4/52] figure_4.png
    ✓ This technical document image is a diagram, chart, and photo that shows the rela...
  [5/52] figure_5.png
    ✓ This image is a technical document that is divided into several sections. The ma...
  [6/52] figure_6.png
    ✓ The technical document image is a collage of four different scenes, each represe...
  [7/52] figure_7.png
    ✓ This image is a satellite image of a rural area. The central focus is a large, d...
  [8/52] figure_8.png
    ✓ This image is a detailed map of a specific area, likely a factory or industrial ...
  [9/52] figure_9.png
    ✓ The image is a collage of five smaller images, each depicting a different sc



  [+] figure_1.png
[OK] 1 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/1] figure_1.png
    ✓ The image is a technical document titled "INCILI 2025" in a blue background. The...
[VLM] Completato: 1/1

[OK] 4 pag | 0 tab | 1 fig
  ✅ OK

[3/40] Il burnout: un fenomeno occupazionale...

SCRAPING PUBBLICAZIONE

[+] Il burnout: un fenomeno occupazionale...
[+] Data: 17/10/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
[OK] 3 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/3] figure_1.png
    ✓ This technical document image is for academic research. The main content type is...
  [2/3] figure_2.png
    ✓ The image depicts a cracked glass window, likely from a window or door, with the...
  [3/3] figure_3.png
    ✓ The image is a technical document image that is divided into several sections. T...
[VLM] Completato: 3/3

  [+] table_1.txt
[OK] 4 pag | 1 tab | 3 fig
  ✅ OK

[4/40] La scheda di dati di sicurezza nel quadro normativ...

SCRAPING PUBBLICAZIONE

[+] La scheda di dati di sicurezza nel quadro normativo del rego...
[+] Data: 16/10/2025
[INFO] Processing PDF...




  [+] figure_1.png
[OK] 1 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/1] figure_1.png
    ✓ This technical document image is for academic research. The main content type is...
[VLM] Completato: 1/1

  [+] table_1.txt
  [+] table_2.txt
[OK] 4 pag | 2 tab | 1 fig
  ✅ OK

[5/40] Procedura per l’esecuzione dell’esame visivo di su...

SCRAPING PUBBLICAZIONE

[+] Procedura per l’esecuzione dell’esame visivo di superfici es...
[+] Data: 13/10/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
[OK] 8 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/8] figure_1.png
    ✓ INCILL...
  [2/8] figure_2.png
    ✓ This image is a technical document image for a company called "Collana Salute E ...
  [3/8] figure_3.png
    ✓ This technical document image is a diagram, chart, or schematic of a product or ...
  [4/8] figure_4.png
    ✓ The image depicts a construction site with a focus on a large, industrial machin...
  [5/8] figure_5.png
    ✓ The technical document image in the provided description is a close-up view of a...
  [6/8] figure_6.png
    ✓ The image is a technical document titled "VISIONE DEI DIEI" (Italian for "Vision...
  [7/8] figure_7.png
    ✓ This technical document image is a detailed diagram, chart, or schematic of a sp...
  [8/8] figure_8.png
    ✓ This technical document image is a detailed diagram, chart, or schematic that re...
[VLM] Completato: 8/8

  [+] table_1.txt
  [+] table_2.txt
  [+] table_3.txt
[OK] 36 pag | 3 tab | 8 fig
  ✅ OK

[6/40] La Direttiva europea 2023/2668: contenuti e novità...

SCRAPING PU



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
[OK] 20 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/20] figure_1.png
    ✓ This technical document image is a diagram, chart, or schematic of a process or ...
  [2/20] figure_2.png
    ✓ The image is a technical document titled "2025 Colla Salute e Sicurezza" which t...
  [3/20] figure_3.png
    ✓ This technical document image is a detailed diagram, chart, or schematic of a ma...
  [4/20] figure_4.png
    ✓ This technical document image is a diagram, chart, or photo of a pile of rocks. ...
  [5/20] figure_5.png
    ✓ This technical document image is a diagram, chart, or photo representing a diamo...
  [6/20] figure_6.png
    ✓ The image depicts an industrial structure undergoing significant renovation. The...
  [7/20] figure_7.png
    ✓ This image is a bar chart titled "1920-1945." The chart is divided into four dis...
  [8/20] figure_8.png
    ✓ The image is a map of Europe that uses a color-coded scale to represent differen...
  [9/20] figure_9.png
    ✓ The image is a detailed map of the United States, showing the average estima



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
[OK] 28 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/28] figure_1.png
    ✓ The word "INCIL" is written in large, white letters with a serif font. The lette...
  [2/28] figure_2.png
    ✓ This technical document image is a diagram, chart, and schematic of a process or...
  [3/28] figure_3.png
    ✓ This technical document image is a diagram, chart, or schematic of a process or ...
  [4/28] figure_4.png
    ✓ This technical document image is a diagram, chart, or schematic depicting a proc...
  [5/28] figure_5.png
    ✓ This is a technical document image. The main content type is a diagram, chart, o...
  [6/28] figure_6.png
    ✓ The given image is a technical document titled "Figure 1.2: Number totali di imp...
  [7/28] figure_7.png
    ✓ The image is a map of Italy, specifically the region of Ateco. The map is color-...
  [8/28] figure_8.png
    ✓ The image is a bar chart titled “Figure 1.4”. The chart is divided into four sec...
  [9/28] figure_9.png
    ✓ This image is a bar chart titled "50-249" with a legend at the bottom. The c



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
  [+] figure_41.png
  [+] figure_42.png
[OK] 42 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/42] figure_1.png
    ✓ This technical document image is a detailed diagram, chart, or photo of various ...
  [2/42] figure_2.png
    ✓ The image is a technical document titled "Incla" in the format of a diagram or c...
  [3/42] figure_3.png
    ✓ The image is a bar chart titled "Menomagnito Permanente Estito Mortal" showing t...
  [4/42] figure_4.png
    ✓ This image is a bar chart titled "Menomancionale permanente" and titled "Estiso ...
  [5/42] figure_5.png
    ✓ The image is a technical document image that contains a table with three main se...
  [6/42] figure_6.png
    ✓ The image is a technical document that depicts a pie chart. The chart is titled ...
  [7/42] figure_7.png
    ✓ The image is a circular chart that visually represents the percentage of employe...
  [8/42] figure_8.png
    ✓ The image is a bar chart that visually represents the distribution of data acros...
  [9/42] figure_9.png
    ✓ The image is a pie chart that visually represents the distribution of data a



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
[OK] 7 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/7] figure_1.png
    ✓ The image is a technical document titled "INCIL" in white, uppercase letters on ...
  [2/7] figure_2.png
    ✓ This technical document image is a diagram, chart, or photo depicting a schemati...
  [3/7] figure_3.png
    ✓ Figure 2 of Figure 1....
  [4/7] figure_4.png
    ✓ This is a detailed technical document image....
  [5/7] figure_5.png
    ✓ The given image is a technical document titled "Figure 3: Esperimento 2" by a re...
  [6/7] figure_6.png
    ✓ The image is a technical document titled "Imagine 2" and is divided into three m...
  [7/7] figure_7.png
    ✓ The image is a bar chart titled "Imagine 3." The chart is divided into four dist...
[VLM] Completato: 7/7

[OK] 4 pag | 0 tab | 7 fig
  ✅ OK

[10/40] Bio-ritmo ospedali - Metodologia per la valutazion...

SCRAPING PUBBLICAZIONE

[+] Bio-ritmo ospedali - Metodologia per la valutazione del risc...
[+] Data: 30/09/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
  [+] figure_41.png
  [+] figure_42.png
  [+] figure_43.png
  [+] figure_44.png
[OK] 44 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/44] figure_1.png
    ✓ This technical document image is a diagram, chart, or schematic of a laboratory ...
  [2/44] figure_2.png
    ✓ This technical document image is a diagram, chart, or schematic of a machine too...
  [3/44] figure_3.png
    ✓ This image is a bar chart titled "Inforinti mortali non mortali." The x-axis rep...
  [4/44] figure_4.png
    ✓ The image is a technical document titled "Probabilità alta" (Probability at alta...
  [5/44] figure_5.png
    ✓ The image depicts a mathematical equation involving a matrix and a matrix-vector...
  [6/44] figure_6.png
    ✓ This document is a table that lists the SCHEDA RACCOLTA DATI in two columns. The...
  [7/44] figure_7.png
    ✓ The document is a table that categorizes various types of workplace hazards. It ...
  [8/44] figure_8.png
    ✓ This technical document image is a bar chart. The chart is divided into two main...
  [9/44] figure_9.png
    ✓ This technical document image is a detailed diagram, chart, or schematic of 



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
  [+] figure_41.png
  [+] figure_42.png
  [+] figure_43.png
  [+] figure_44.png
  [+] figure_45.png
  [+] figure_46.png
  [+] figure_47.png
  [+] figure_48.png
  [+] figure_49.png
  [+] figure_50.png
  [+] fig



  [1/70] figure_1.png
    ✓ INCail is a brand name that is used to identify a company or organization. It is...
  [2/70] figure_2.png
    ✓ The image is a technical document that appears to be a diagram or chart related ...
  [3/70] figure_3.png
    ✓ This technical document image is a diagram, chart, or schematic of a process or ...
  [4/70] figure_4.png
    ✓ FOCUS...
  [5/70] figure_5.png
    ✓ Figure 10....
  [6/70] figure_6.png
    ✓ Figure 1b....
  [7/70] figure_7.png
    ✓ The image is a technical document titled "Distribuzione % delle segnalazioni per...
  [8/70] figure_8.png
    ✓ The image is a technical document titled "Distribuzione % delle segnalazioni per...
  [9/70] figure_9.png
    ✓ The image is a bar chart that visually represents the distribution of different ...
  [10/70] figure_10.png
    ✓ The image is a technical document titled "Distribuzione del segnalazione rilevat...
  [11/70] figure_11.png
    ✓ ### Image Description

The image is a pie chart titled "Distrib



  [+] figure_1.png
  [+] figure_2.png
[OK] 2 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/2] figure_1.png
    ✓ This technical document image is a blue and white document titled "INCLI 2025" i...
  [2/2] figure_2.png
    ✓ This technical document image is a detailed, step-by-step diagram of a chemical ...
[VLM] Completato: 2/2

  [+] table_1.txt
  [+] table_2.txt
[OK] 4 pag | 2 tab | 2 fig
  ✅ OK

[13/40] Reinserimento e integrazione lavorativa delle pers...

SCRAPING PUBBLICAZIONE

[+] Reinserimento e integrazione lavorativa delle persone con di...
[+] Data: 24/09/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
[OK] 14 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/14] figure_1.png
    ✓ This technical document image is a logo, specifically a "INCIL" logo. The logo i...
  [2/14] figure_2.png
    ✓ The image is a technical document titled "Con Inail, Riconicio, Dal Mio, Lavoro"...
  [3/14] figure_3.png
    ✓ The image is a technical document titled "Direzione centrali pianificazioni e co...
  [4/14] figure_4.png
    ✓ The image is a technical document titled "Con Inail, Ricominco Dal Mio" which tr...
  [5/14] figure_5.png
    ✓ The image depicts a blue tractor in a lush green field. The tractor is positione...
  [6/14] figure_6.png
    ✓ The image depicts a woman standing in a barn, smiling, and posing for a photo. S...
  [7/14] figure_7.png
    ✓ The image depicts a woman interacting with two cows in what appears to be a farm...
  [8/14] figure_8.png
    ✓ The image depicts a young woman seated in a wheelchair, smiling at the camera. S...
  [9/14] figure_9.png
    ✓ The image depicts a man and a woman seated in a wheelchair. The man is weari



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
[OK] 40 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/40] figure_1.png
    ✓ The image is a technical document titled "2025" with a green header and a blue h...
  [2/40] figure_2.png
    ✓ This technical document image is of a blue rectangular brochure for INCIL. The t...
  [3/40] figure_3.png
    ✓ The technical document image is a diagram, chart, or schematic depicting the com...
  [4/40] figure_4.png
    ✓ The image is a technical document, likely from a document related to workplace s...
  [5/40] figure_5.png
    ✓ The technical document image is a diagram, chart, or photo depicting a specific ...
  [6/40] figure_6.png
    ✓ The image is a bar chart titled "Figura 1: Esperienza / Azioni dell’operativa." ...
  [7/40] figure_7.png
    ✓ This image is a technical document image. The main content type is a diagram or ...
  [8/40] figure_8.png
    ✓ The image is a bar chart titled "Valutazione del grâdimento - valori %." The x-a...
  [9/40] figure_9.png
    ✓ Figure 3 shows the percentage of employees who reported having been injured 



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
[OK] 6 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/6] figure_1.png
    ✓ INCILL...
  [2/6] figure_2.png
    ✓ MalProf...
  [3/6] figure_3.png
    ✓ The image is a technical document titled "Conferenza dei Presidenti delle Region...
  [4/6] figure_4.png
    ✓ Percentual di cial cia in sanità delle malattie p Sicilio su the totale delle ma...
  [5/6] figure_5.png
    ✓ This technical document image is a detailed representation of a figure from the ...
  [6/6] figure_6.png
    ✓ The image is a pie chart titled “Agenti dell’area “fattori psicosociali” nel ser...
[VLM] Completato: 6/6

  [+] table_1.txt
  [+] table_2.txt
  [+] table_3.txt
  [+] table_4.txt
[OK] 6 pag | 4 tab | 6 fig
  ✅ OK

[16/40] Infor.MO, Rapporto Inail - Regioni sulle cause deg...

SCRAPING PUBBLICAZIONE

[+] Infor.MO, Rapporto Inail - Regioni sulle cause degli infortu...
[+] Data: 4/09/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
  [+] figure_41.png
  [+] figure_42.png
  [+] figure_43.png
  [+] figure_44.png
  [+] figure_45.png
  [+] figure_46.png
  [+] figure_47.png
  [+] figure_48.png
  [+] figure_49.png
  [+] figure_50.png
  [+] fig



  [1/68] figure_1.png
    ✓ This technical document image is a detailed, high-resolution image that appears ...
  [2/68] figure_2.png
    ✓ The image is a technical document titled "Incil - Rapporto Inail - Regioni sulle...
  [3/68] figure_3.png
    ✓ The image is a technical document image that is divided into several distinct se...
  [4/68] figure_4.png
    ✓ The image is a bar chart titled “Principali incident per gli infortuni morti, an...
  [5/68] figure_5.png
    ✓ The image is a line graph depicting the percentage of deaths due to preventable ...
  [6/68] figure_6.png
    ✓ Figure 4 shows the percentage of employees who reported experiencing at least on...
  [7/68] figure_7.png
    ✓ The image is a bar chart that compares the percentage of people who have experie...
  [8/68] figure_8.png
    ✓ The chart shows the percentage of Americans who say they have been affected by t...
  [9/68] figure_9.png
    ✓ The image is a bar chart that compares the distribution of a particular type



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
[OK] 3 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/3] figure_1.png
    ✓ This technical document image is for academic research. The main content type is...
  [2/3] figure_2.png
    ✓ This technical document image is a diagram, chart, or photo depicting a key visu...
  [3/3] figure_3.png
    ✓ This technical document image is a detailed description of a document titled "Wo...
[VLM] Completato: 3/3

[OK] 4 pag | 0 tab | 3 fig
  ✅ OK

[18/40] Punture da imenotteri: conosciamo la portata del f...

SCRAPING PUBBLICAZIONE

[+] Punture da imenotteri: conosciamo la portata del fenomeno?...
[+] Data: 2/09/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
[OK] 6 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/6] figure_1.png
    ✓ The image depicts a honey bee on a white daisy. The bee is in the process of col...
  [2/6] figure_2.png
    ✓ The image depicts a honey bee on a white background. The bee is primarily yellow...
  [3/6] figure_3.png
    ✓ The image is a technical document titled "INCLAL" in white capital letters on a ...
  [4/6] figure_4.png
    ✓ The chart depicts the distribution of the percentage of employees who are employ...
  [5/6] figure_5.png
    ✓ The chart depicts the number of infarcts (15) and the number of patients (21) wh...
  [6/6] figure_6.png
    ✓ This image shows a technical document that is divided into several sections. The...
[VLM] Completato: 6/6

  [+] table_1.txt
  [+] table_2.txt
  [+] table_3.txt
[OK] 6 pag | 3 tab | 6 fig
  ✅ OK

[19/40] Protesi & Ausili per lo sport...

SCRAPING PUBBLICAZIONE

[+] Protesi & Ausili per lo sport...
[+] Data: 29/08/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
[OK] 14 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/14] figure_1.png
    ✓ The image depicts a tennis player in a wheelchair on a tennis court. The player ...
  [2/14] figure_2.png
    ✓ The technical document image is a detailed diagram or chart that provides a comp...
  [3/14] figure_3.png
    ✓ The technical document image in detail is a diagram, chart, or photo depicting a...
  [4/14] figure_4.png
    ✓ The image depicts a skier in mid-action, likely in the middle of a jump or a dow...
  [5/14] figure_5.png
    ✓ The technical document image is a diagram, chart, or photo depicting a man with ...
  [6/14] figure_6.png
    ✓ The technical document image is a diagram, chart, or photo depicting a hockey ri...
  [7/14] figure_7.png
    ✓ The image depicts a man in a blue tank top with white and green accents, which a...
  [8/14] figure_8.png
    ✓ The image depicts a group of women playing a volleyball game in a gymnasium. The...
  [9/14] figure_9.png
    ✓ The image depicts a technical document image of a diver in an underwater env



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
[OK] 4 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/4] figure_1.png
    ✓ This technical document image is for academic research. The main content type is...
  [2/4] figure_2.png
    ✓ This image is a schematic of a workplace, likely for a chemical or industrial se...
  [3/4] figure_3.png
    ✓ This image is a technical document image. The main content type is a diagram, ch...
  [4/4] figure_4.png
    ✓ Figure 3: Schematic of Processes of Processed Lava Fires from a Process Scheme o...
[VLM] Completato: 4/4

  [+] table_1.txt
[OK] 4 pag | 1 tab | 4 fig
  ✅ OK

[21/40] Integrazione di fonti e modelli nell’analisi degli...

SCRAPING PUBBLICAZIONE

[+] Integrazione di fonti e modelli nell’analisi degli infortuni...
[+] Data: 7/08/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
[OK] 2 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/2] figure_1.png
    ✓ This technical document image is for academic research. The main content type is...
  [2/2] figure_2.png
    ✓ Figure 1 shows a detailed diagram or chart that provides information about a pro...
[VLM] Completato: 2/2

  [+] table_1.txt
[OK] 4 pag | 1 tab | 2 fig
  ✅ OK

[22/40] L' accertamento tecnico per la sicurezza delle mac...

SCRAPING PUBBLICAZIONE

[+] L' accertamento tecnico per la sicurezza delle macchine uten...
[+] Data: 4/08/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
  [+] figure_41.png
[OK] 41 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/41] figure_1.png
    ✓ This technical document image in the image is a diagram, chart, or photo of a ma...
  [2/41] figure_2.png
    ✓ This technical document image is a detailed diagram, chart, or schematic of a ma...
  [3/41] figure_3.png
    ✓ The technical document image is a figure from a scientific or industrial report....
  [4/41] figure_4.png
    ✓ This image shows a technical document image of a machine, likely a boiler or sim...
  [5/41] figure_5.png
    ✓ This technical document image is a black and white photograph of a piece of mach...
  [6/41] figure_6.png
    ✓ This technical document image is a detailed, black and white, 3D-rendered image ...
  [7/41] figure_7.png
    ✓ The image is a black and white technical document image. The main content is a d...
  [8/41] figure_8.png
    ✓ This technical document image is a black and white diagram, chart, photo, schema...
  [9/41] figure_9.png
  [10/41] figure_10.png
    ✓ This technical document image is a black and white, 



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
[OK] 4 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/4] figure_1.png
    ✓ This technical document image is for academic research. The main content type is...
  [2/4] figure_2.png
    ✓ This image is a technical document image, which is a diagram, chart, or schemati...
  [3/4] figure_3.png
    ✓ This technical document image is a diagram, chart, or schematic depicting a proc...
  [4/4] figure_4.png
    ✓ The image is a technical document page from a document titled "VLEP/VLB." The pa...
[VLM] Completato: 4/4

[OK] 4 pag | 0 tab | 4 fig
  ✅ OK

[24/40] Industria della produzione del cemento e Radioprot...

SCRAPING PUBBLICAZIONE

[+] Industria della produzione del cemento e Radioprotezione...
[+] Data: 28/07/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
[OK] 4 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/4] figure_1.png
    ✓ This technical document image is for academic research. The main content type is...
  [2/4] figure_2.png
    ✓ This technical document image in detail for academic research. The image is a di...
  [3/4] figure_3.png
    ✓ This image depicts a schematic diagram of a production process, specifically a t...
  [4/4] figure_4.png
    ✓ This image is a technical document image from a book on workplace safety or indu...
[VLM] Completato: 4/4

  [+] table_1.txt
  [+] table_2.txt
[OK] 4 pag | 2 tab | 4 fig
  ✅ OK

[25/40] Salute e sicurezza negli ambienti di vita e di lav...

SCRAPING PUBBLICAZIONE

[+] Salute e sicurezza negli ambienti di vita e di lavoro...
[+] Data: 14/07/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
  [+] figure_41.png
  [+] figure_42.png
  [+] figure_43.png
  [+] figure_44.png
  [+] figure_45.png
  [+] figure_46.png
  [+] figure_47.png
  [+] figure_48.png
  [+] figure_49.png
  [+] figure_50.png
  [+] fig



  [1/56] figure_1.png
    ✓ This technical document image is a detailed diagram, chart, or schematic of a ma...
  [2/56] figure_2.png
    ✓ The symbol \(c\) is a circle with a dashed outline....
  [3/56] figure_3.png
    ✓ The image is a technical document titled "Figura 1: Processo di selezione dagli ...
  [4/56] figure_4.png
    ✓ This technical document image in figure 2 of a larger document is a diagram, cha...
  [5/56] figure_5.png
    ✓ This is a detailed, technical document image. The image is a bar chart that visu...
  [6/56] figure_6.png
    ✓ The graph is a bar chart that visually represents the number of fatalities and i...
  [7/56] figure_7.png
    ✓ The image is a bar chart titled "Infuentoni SEVESO per classificazione ATECO 200...
  [8/56] figure_8.png
    ✓ The data are from a report by the Institute for Industrial Safety (IIS) and the ...
  [9/56] figure_9.png
    ✓ The image is a bar chart that shows the results of a survey about the safety of ...
  [10/56] figure_10.p



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
[OK] 3 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/3] figure_1.png
    ✓ 2025 INCIL 2025 technical document image....
  [2/3] figure_2.png
    ✓ The image is a technical document titled "Figure 1: La catena della sopravivenza...
  [3/3] figure_3.png
    ✓ The image is a technical document from a company named Parla Registra. It is a d...
[VLM] Completato: 3/3

[OK] 2 pag | 0 tab | 3 fig
  ✅ OK

[27/40] Il ruolo della differenza di sesso e di genere nel...

SCRAPING PUBBLICAZIONE

[+] Il ruolo della differenza di sesso e di genere nell’esposizi...
[+] Data: 30/05/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
[OK] 3 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/3] figure_1.png
    ✓ This technical document image is for academic research. The main content type is...
  [2/3] figure_2.png
    ✓ This image is a technical document titled "Figure 1 Etchetattura di un sensibili...
  [3/3] figure_3.png
    ✓ This document shows a diagram and a schematic of a scenario involving a circular...
[VLM] Completato: 3/3

[OK] 2 pag | 0 tab | 3 fig
  ✅ OK

[28/40] Stima dei potenziali lavoratori esposti ad acrilon...

SCRAPING PUBBLICAZIONE

[+] Stima dei potenziali lavoratori esposti ad acrilonitrile sul...
[+] Data: 27/05/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
[OK] 2 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/2] figure_1.png
    ✓ This technical document image is for academic research. The main content type is...
  [2/2] figure_2.png
    ✓ This image is a data visualization image from the Center for Industrial Safety a...
[VLM] Completato: 2/2

  [+] table_1.txt
[OK] 4 pag | 1 tab | 2 fig
  ✅ OK

[29/40] Utilizzo di fibre sostitutive dell’amianto di nuov...

SCRAPING PUBBLICAZIONE

[+] Utilizzo di fibre sostitutive dell’amianto di nuova generazi...
[+] Data: 27/05/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
[OK] 4 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/4] figure_1.png
    ✓ This technical document image is for academic research. The main content type is...
  [2/4] figure_2.png
    ✓ ### Detailed Description

The given image is a technical diagram for classifying...
  [3/4] figure_3.png
    ✓ This figure shows a medical chart titled "Figure 2. Nota R e nota Q dell'Allegat...
  [4/4] figure_4.png
    ✓ This technical document image is a two-page collage consisting of two identical ...
[VLM] Completato: 4/4

[OK] 4 pag | 0 tab | 4 fig
  ✅ OK

[30/40] Conference Proceedings...

SCRAPING PUBBLICAZIONE

[+] Conference Proceedings...
[+] Data: 23/05/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
  [+] figure_41.png
  [+] figure_42.png
  [+] figure_43.png
  [+] figure_44.png
  [+] figure_45.png
  [+] figure_46.png
  [+] figure_47.png
  [+] figure_48.png
  [+] figure_49.png
  [+] figure_50.png
  [+] fig



  [1/303] figure_1.png
    ✓ This image is a technical document image. The main content type is a brown color...
  [2/303] figure_2.png
    ✓ This technical document image is a detailed diagram, chart, or schematic depicti...
  [3/303] figure_3.png
    ✓ This image is a technical document image that appears to be a diagram or schemat...
  [4/303] figure_4.png
    ✓ This technical document image is a diagram, chart, or schematic of a company’s i...
  [5/303] figure_5.png
    ✓ This technical document image is a composite of several panels, each depicting a...
  [6/303] figure_6.png
    ✓ This image is a technical document image that depicts the data from a graph or c...
  [7/303] figure_7.png
    ✓ Human controller Control action Feedback Automated controller Control action Fee...
  [8/303] figure_8.png
    ✓ CONTROLLER is connected with CONTROL PROCESS which is then connected with FEEDBA...
  [9/303] figure_9.png
    ✓ The image is a technical document that is divided into several sect



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
  [+] figure_41.png
  [+] figure_42.png
  [+] figure_43.png
  [+] figure_44.png
  [+] figure_45.png
  [+] figure_46.png
  [+] figure_47.png
  [+] figure_48.png
  [+] figure_49.png
  [+] figure_50.png
  [+] fig



  [1/203] figure_1.png
    ✓ This image is a technical document image. The main content type is a brown color...
  [2/203] figure_2.png
    ✓ The image is a technical document titled "Consiglio Nazionale degli Incegneri" (...
  [3/203] figure_3.png
    ✓ The image is a technical document image that contains various visual elements an...
  [4/203] figure_4.png
    ✓ This technical document image is a diagram, chart, or schematic of a company’s i...
  [5/203] figure_5.png
    ✓ INCILL...
  [6/203] figure_6.png
    ✓ The image depicts a circular emblem with a red background and white text. The em...
  [7/203] figure_7.png
    ✓ The image depicts a technical document titled "CONSIGLO Nazionale DECLIGI INEGEN...
  [8/203] figure_8.png
    ✓ This technical document image consists of a blue circle with the word "mic" writ...
  [9/203] figure_9.png
    ✓ The image is a technical document image that includes a series of photographs, c...
  [10/203] figure_10.png
    ✓ The group is a group of pe




⚠️ INTERRUZIONE - Salvati: 31

COMPLETATO: ✅ 31 | ❌ 0

📦 Batch salvato: batch_inail_tutte.json

Statistiche:
  - Tabelle: 653
  - Figure: 1058

Backup su Drive? (y/n): y
[SUCCESS] ✅ Backup su Drive completato

SCRAPING TERMINATO
✅ Documenti: 31
📁 Output: inail_scraped_data
☁️  Drive: /content/drive/MyDrive/INAIL_Thesis_Data

Comandi utili:
  • quick_backup()     - Backup su Drive
  • quick_restore()    - Ripristina da Drive
  • list_local_files() - Mostra file locali



**ENHANCED VLM WITH TEXT FILE OUTPUT**

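This cell provides a modified version of the VLM function that also saves each
image description as a standalone `.txt` file alongside its PNG image (e.g.,
`figure_1.png` → `figure_1.txt`). Each file contains the image metadata, the
VLM model name, a timestamp, and the full description, so the text can be
indexed in a vector database without parsing JSON. This facilitates direct
file-based retrieval during embedding generation. The cell also defines
`create_vector_db_manifest`, which gathers markdown text, tables, and image
descriptions into a single indexing manifest, and `verify_vector_db_readiness`,
which checks that the exported files are in place.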

In [30]:
# MODIFICHE PER SALVARE DESCRIZIONI VLM COME FILE TESTUALI

def _write_vlm_description_file(txt_path: Path, img_data: Dict, description: str) -> None:
    """Write one VLM description, preceded by a metadata header, to ``txt_path``.

    The file holds a banner with the image filename, an ``IMAGE METADATA``
    section (filename, caption, position, model name, timestamp) and a
    ``VLM ANALYSIS`` section containing the full description.

    Args:
        txt_path: Destination file; it is overwritten if it already exists.
        img_data: Image record; ``filename`` is required, ``caption`` and
            ``position`` fall back to ``'N/A'``.
        description: Text generated by the model for this image.
    """
    banner = "=" * 80
    rule = "-" * 80
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write(banner + "\n")
        f.write(f"VLM DESCRIPTION - {img_data['filename']}\n")
        f.write(banner + "\n\n")

        # Image metadata
        f.write("IMAGE METADATA:\n")
        f.write(f"  - Filename: {img_data['filename']}\n")
        f.write(f"  - Caption: {img_data.get('caption', 'N/A')}\n")
        f.write(f"  - Position in document: {img_data.get('position', 'N/A')}\n")
        f.write(f"  - Model: {INAILConfig.VLM_MODEL}\n")
        f.write(f"  - Timestamp: {datetime.now().isoformat()}\n\n")

        # VLM description
        f.write("VLM ANALYSIS:\n")
        f.write(rule + "\n")
        f.write(description)
        f.write("\n" + rule + "\n")


def add_vlm_descriptions_to_images(exported_images: List[Dict], doc_id: str) -> List[Dict]:
    """Describe each exported image with the configured VLM and save the text.

    For every record the generated description is stored under
    ``vlm_description`` and also written to a ``.txt`` file next to the image
    (``figure_1.png`` -> ``figure_1.txt``), whose path is stored under
    ``vlm_description_file``. If anything fails for an image, both keys are
    set to ``None`` for that record and processing continues with the next.

    Args:
        exported_images: Image records; each needs ``filename`` and ``path``.
            The records are updated in place.
        doc_id: Not used by this function; kept so existing callers keep working.

    Returns:
        The same ``exported_images`` list. It is returned untouched when it is
        empty or when ``INAILConfig.VLM_ENABLED`` is false.
    """
    if not exported_images or not INAILConfig.VLM_ENABLED:
        return exported_images

    # Pre-bind so the ``finally`` clause can run even if an import fails.
    model = processor = torch = None
    try:
        from transformers import AutoProcessor, AutoModelForVision2Seq
        from PIL import Image
        import torch

        print(f"\n[VLM] Caricamento {INAILConfig.VLM_MODEL}...")

        processor = AutoProcessor.from_pretrained(INAILConfig.VLM_MODEL, trust_remote_code=True)
        model = AutoModelForVision2Seq.from_pretrained(
            INAILConfig.VLM_MODEL,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
            device_map="auto",
            trust_remote_code=True
        )

        total = len(exported_images)
        for idx, img_data in enumerate(exported_images, 1):
            try:
                print(f"  [{idx}/{total}] {img_data['filename']}")

                # Image.open is lazy and keeps the file open; convert() returns
                # an in-memory copy, so the handle can be closed right away.
                with Image.open(img_data['path']) as raw_image:
                    image = raw_image.convert('RGB')

                messages = [{
                    "role": "user",
                    "content": [
                        {"type": "image"},
                        {"type": "text", "text": INAILConfig.VLM_PROMPT}
                    ]
                }]

                prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
                inputs = processor(text=prompt, images=[image], return_tensors="pt").to(model.device)

                with torch.no_grad():
                    outputs = model.generate(**inputs, max_new_tokens=300, do_sample=False)

                # The decoded text may echo the prompt; keep only what follows
                # the last "Assistant:" marker when one is present.
                full_output = processor.decode(outputs[0], skip_special_tokens=True)
                description = full_output.split("Assistant:")[-1].strip() if "Assistant:" in full_output else full_output.strip()

                img_data['vlm_description'] = description

                # figure_1.png -> figure_1.txt
                txt_path = Path(img_data['path']).with_suffix('.txt')
                _write_vlm_description_file(txt_path, img_data, description)

                img_data['vlm_description_file'] = str(txt_path)
                print(f"    ✓ Salvato: {txt_path.name}")
                # Collapse whitespace so a multi-line description stays on one log line.
                preview = " ".join(description.split())[:80]
                print(f"    ✓ {preview}...")

            except Exception as e:
                print(f"    ✗ {e}")
                img_data['vlm_description'] = None
                img_data['vlm_description_file'] = None

        success = sum(1 for img in exported_images if img.get('vlm_description'))
        print(f"[VLM] Completato: {success}/{len(exported_images)}\n")

    except Exception as e:
        print(f"[ERROR] VLM fallito: {e}")

    finally:
        # Release the model on every exit path, including a user interrupt,
        # so the weights do not stay referenced by this frame.
        model = processor = None
        try:
            if torch is not None and torch.cuda.is_available():
                torch.cuda.empty_cache()
        except Exception as e:
            print(f"[ERROR] VLM fallito: {e}")

    return exported_images


# Crea manifest per vector database

def _build_vector_entry(doc: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Convert one scraped-document record into a manifest entry.

    Args:
        doc: A record as stored in the scraper's JSON output.

    Returns:
        The manifest entry, or ``None`` when the record is not marked
        ``status == 'success'``, has no ``document_content``, or produces no
        indexable chunk.
    """
    if doc.get('status') != 'success':
        return None

    doc_content = doc.get('document_content')
    if not doc_content:
        return None

    # ``or {}`` also covers keys that are present but hold None.
    web_meta = doc.get('web_metadata') or {}
    raw_title = web_meta.get('title')
    id_source = 'unknown' if raw_title is None else str(raw_title)
    doc_id = re.sub(r'[^\w\-_.]', '_', id_source[:50])

    vector_entry = {
        'document_id': doc_id,
        'title': 'N/A' if raw_title is None else raw_title,
        'publication_date': web_meta.get('data_pubblicazione', 'N/A'),
        'source_url': (doc.get('scraping_metadata') or {}).get('url'),
        # One item is appended per chunk, so a type repeats once per table/image.
        'content_types': [],
        'indexable_content': []
    }

    # 1) Main text (Markdown)
    if doc_content.get('markdown_content'):
        vector_entry['content_types'].append('markdown_text')
        vector_entry['indexable_content'].append({
            'type': 'markdown_text',
            'content': doc_content['markdown_content'],
            'metadata': {
                'num_pages': doc_content.get('num_pages', 'N/A'),
                'num_headings': doc_content.get('num_headings', 0)
            }
        })

    # 2) Tables: caption, column names and raw text merged into one chunk
    for table_info in doc_content.get('tables') or []:
        vector_entry['content_types'].append('table')

        columns = table_info.get('potential_columns') or []
        table_text = f"TABELLA: {table_info.get('caption', 'Senza titolo')}\n\n"
        if columns:
            table_text += "COLONNE: " + ", ".join(map(str, columns)) + "\n\n"
        table_text += "CONTENUTO:\n" + (table_info.get('text_content') or '')

        vector_entry['indexable_content'].append({
            'type': 'table',
            'content': table_text,
            'metadata': {
                'table_id': table_info.get('table_id'),
                'caption': table_info.get('caption'),
                'num_rows': table_info.get('num_rows'),
                'num_columns': table_info.get('num_columns'),
                'columns': table_info.get('potential_columns', []),
                'position': table_info.get('position')
            }
        })

    # 3) Images: only those that actually received a VLM description
    for img_info in doc_content.get('exported_images') or []:
        if not img_info.get('vlm_description'):
            continue
        vector_entry['content_types'].append('image_vlm')

        img_text = f"IMMAGINE: {img_info.get('filename')}\n"
        img_text += f"CAPTION: {img_info.get('caption', 'N/A')}\n\n"
        img_text += f"DESCRIZIONE VISIVA:\n{img_info['vlm_description']}"

        vector_entry['indexable_content'].append({
            'type': 'image_vlm',
            'content': img_text,
            'metadata': {
                'filename': img_info.get('filename'),
                'caption': img_info.get('caption'),
                'vlm_description_file': img_info.get('vlm_description_file'),
                'image_path': img_info.get('path'),
                'position': img_info.get('position')
            }
        })

    return vector_entry if vector_entry['indexable_content'] else None


def create_vector_db_manifest(json_dir: Optional[Path] = None) -> Path:
    """Build a JSON manifest of all indexable content for a vector database.

    Every ``*.json`` file in ``json_dir`` is read, except earlier manifests
    (names starting with ``vector_db_manifest``). A file may hold a single
    record or a list of records. Each successful record contributes its
    markdown text, its tables and its VLM-described images as chunks.

    A file that cannot be read is reported and skipped. A record that cannot
    be converted is reported and skipped on its own, so the remaining records
    of the same batch file are still processed.

    Args:
        json_dir: Directory containing the scraper's JSON output, as a
            ``Path`` or a string. Defaults to ``INAILConfig.JSON_DIR``.

    Returns:
        Path of the written ``vector_db_manifest_<timestamp>.json`` file.
    """
    json_dir = Path(INAILConfig.JSON_DIR if json_dir is None else json_dir)

    manifest = {
        'created_at': datetime.now().isoformat(),
        'purpose': 'Vector database indexing manifest',
        'total_documents': 0,
        'documents': []
    }

    # glob() yields entries in arbitrary order; sort for a reproducible manifest.
    json_files = sorted(
        f for f in json_dir.glob("*.json")
        if not f.name.startswith('vector_db_manifest')
    )

    print(f"\n{'='*80}")
    print(f"CREAZIONE MANIFEST PER VECTOR DB")
    print(f"{'='*80}\n")

    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            print(f"  ✗ Errore processing {json_file.name}: {e}")
            continue

        # Handle both single-record files and batch files
        docs_to_process = data if isinstance(data, list) else [data]

        for doc in docs_to_process:
            try:
                vector_entry = _build_vector_entry(doc)
            except Exception as e:
                print(f"  ✗ Errore processing {json_file.name}: {e}")
                continue

            if vector_entry is None:
                continue

            manifest['documents'].append(vector_entry)
            manifest['total_documents'] += 1
            print(f"  ✓ {vector_entry['document_id']}: {len(vector_entry['indexable_content'])} chunks")

    # Save manifest
    manifest_path = json_dir / f"vector_db_manifest_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)

    print(f"\n{'='*80}")
    print(f"MANIFEST CREATO")
    print(f"{'='*80}")
    print(f"File: {manifest_path.name}")
    print(f"Documenti: {manifest['total_documents']}")

    # Statistics: count chunks per type in a single pass
    chunk_counts = {'markdown_text': 0, 'table': 0, 'image_vlm': 0}
    total_chunks = 0
    for entry in manifest['documents']:
        for chunk in entry['indexable_content']:
            total_chunks += 1
            chunk_counts[chunk['type']] = chunk_counts.get(chunk['type'], 0) + 1

    print(f"Chunks totali: {total_chunks}")
    print(f"   - Testi markdown: {chunk_counts['markdown_text']}")
    print(f"   - Tabelle: {chunk_counts['table']}")
    print(f"   - Immagini VLM: {chunk_counts['image_vlm']}")
    print(f"{'='*80}\n")

    return manifest_path

# FUNZIONE UTILITY: Verifica organizzazione file per vector DB

def verify_vector_db_readiness():
    """
    Check that the scraped artefacts are ready for vector indexing.

    For every document folder under ``INAILConfig.IMAGES_DIR`` the number of
    ``*.png`` images is compared with the number of ``*.txt`` description
    files; a difference is recorded as an issue. The number of table ``*.txt``
    files and of per-document JSON files is printed for information only.

    Returns:
        bool: True when no image/description count mismatch was found.
    """
    print(f"\n{'='*80}")
    print(f"VERIFICA PREPARAZIONE VECTOR DB")
    print(f"{'='*80}\n")

    issues = []

    # Images vs. VLM descriptions, one folder per document
    images_dir = INAILConfig.IMAGES_DIR
    if images_dir.exists():
        # sorted() keeps the report order stable between runs
        for doc_dir in sorted(images_dir.iterdir()):
            if not doc_dir.is_dir():
                continue

            png_files = list(doc_dir.glob("*.png"))
            txt_files = list(doc_dir.glob("*.txt"))

            print(f"{doc_dir.name}:")
            print(f"   - Immagini: {len(png_files)}")
            print(f"   - Descrizioni VLM: {len(txt_files)}")

            if len(png_files) != len(txt_files):
                issues.append(f"{doc_dir.name}: {len(png_files)} immagini ma {len(txt_files)} descrizioni")
                print(f"   MISMATCH!")
            else:
                print(f"   OK")

    # Extracted tables (informational)
    tables_dir = INAILConfig.TABLES_DIR
    if tables_dir.exists():
        total_tables = sum(1 for _ in tables_dir.rglob("*.txt"))
        print(f"\n Tabelle estratte: {total_tables}")

    # Per-document JSON files. Manifests are written into the same directory,
    # so they are excluded to avoid inflating the document count.
    json_files = [
        f for f in INAILConfig.JSON_DIR.glob("*.json")
        if not f.name.startswith('vector_db_manifest')
    ]
    print(f"\n📄 File JSON: {len(json_files)}")

    if issues:
        print(f"\n PROBLEMI RILEVATI:")
        for issue in issues:
            print(f"   - {issue}")
    else:
        print(f"\n Tutto pronto per indicizzazione vettoriale!")

    print(f"{'='*80}\n")

    return len(issues) == 0


# Interactive entry point: setup, then a menu (scraping, Drive backup/restore, manifest creation, readiness check)

def main():
    """
    Interactive entry point of the scraper.

    Creates the output directories, attempts a restore from Drive, initialises
    the Docling converter and then asks which mode to run. Modes 3-6 are
    maintenance actions and return None; mode 1 runs the interactive batch
    scraper and any other answer runs the quick test scrape. When a scrape
    produced results the user is offered a Drive backup.

    Returns:
        The list of scraping results, or None for the maintenance modes.
    """
    rule = "="*80

    print("\n" + rule)
    print("INAIL SCRAPER - SETUP")
    print(rule + "\n")

    INAILConfig.setup_directories()

    print("\n[STEP 1] Controllo backup Drive...")
    restore_from_drive()

    print("\n[STEP 2] Inizializzazione Docling...")
    converter = initialize_docling_converter()

    print("\n" + rule)
    print("MODALITÀ:")
    print(rule)
    menu_lines = (
        "1. Scraping interattivo (nuovo batch)",
        "2. Scraping rapido (2 pag, 5 docs, no VLM)",
        "3. Solo backup su Drive",
        "4. Solo ripristino da Drive",
        "5. Crea manifest per Vector DB",
        "6. Verifica preparazione Vector DB",
    )
    for menu_line in menu_lines:
        print(menu_line)
    print(rule + "\n")

    answer = input("Scegli (1-6, default 2): ").strip()
    choice = answer if answer else "2"

    # Maintenance modes: run the action and stop, there is nothing to return
    if choice == "3":
        backup_to_drive()
        return None
    if choice == "4":
        restore_from_drive()
        return None
    if choice == "5":
        manifest_path = create_vector_db_manifest()
        print(f"\n Manifest salvato: {manifest_path}")
        return None
    if choice == "6":
        verify_vector_db_readiness()
        return None

    # Scraping modes: `driver` is the notebook-level Selenium driver
    if choice == "1":
        results = interactive_batch_scraping_inail(driver, converter)
    else:
        print("\n[TEST RAPIDO] 2 pagine, 5 docs, NO VLM\n")
        quick_settings = dict(
            query=None,
            max_pages=2,
            max_documents=5,
            enable_vlm=False,
            delay_between_docs=3,
        )
        results = batch_scrape_inail_publications(
            driver=driver,
            converter=converter,
            **quick_settings
        )

    # Offer a backup only when something was actually scraped
    if results and len(results) > 0:
        wants_backup = input("\nBackup su Drive? (y/n): ").strip().lower() == 'y'
        if wants_backup:
            backup_to_drive()

    return results

In [16]:
# ============================================================================
# SCRIPT DI RECUPERO COMPLETO - CON BACKUP DRIVE AUTOMATICO
# ============================================================================

import json
from pathlib import Path
from datetime import datetime
from typing import Dict, List, Any
import re
import shutil

# Directory configuration (same paths as the scraper)
# Root of everything stored locally
OUTPUT_DIR = Path('./inail_scraped_data')
# JSON results; vector DB manifests are written here as well
JSON_DIR = OUTPUT_DIR / 'json'
# One sub-directory per document, holding *.png images and their *.txt VLM descriptions
IMAGES_DIR = OUTPUT_DIR / 'images'
# Extracted tables, counted as *.txt files by the verification step
TABLES_DIR = OUTPUT_DIR / 'tables'
# Backup/restore location on the mounted Google Drive
DRIVE_BACKUP_DIR = Path('/content/drive/MyDrive/INAIL_Thesis_Data')

# ============================================================================
# GOOGLE DRIVE FUNCTIONS
# ============================================================================

def mount_google_drive():
    """
    Mount Google Drive at /content/drive (Colab only).

    The mount call is skipped when /content/drive already exists.

    Returns:
        bool: True when the mount point already exists or the mount call
        completed; False when the Colab API cannot be imported or the mount
        raised an error.
    """
    try:
        from google.colab import drive
        if not Path('/content/drive').exists():
            drive.mount('/content/drive', force_remount=False)
            print("[SUCCESS] ✅ Drive montato")
        else:
            print("[INFO] Drive già montato")
        return True
    except Exception as e:
        # `Exception` instead of a bare except: a Ctrl-C / kernel interrupt
        # during the interactive authorisation must not be swallowed.
        print("[WARNING] ⚠️ Drive non disponibile")
        print(f"          ({type(e).__name__}: {e})")
        return False


def backup_to_drive():
    """
    Copy the whole local output directory to the Drive backup directory.

    Existing files on Drive are overwritten, files that exist only on Drive
    are kept (``copytree`` with ``dirs_exist_ok=True``).

    Returns:
        bool: True when the copy completed; False when there is nothing to
        back up, Drive could not be mounted, or the copy failed.
    """
    source = OUTPUT_DIR
    dest = DRIVE_BACKUP_DIR

    if not source.exists():
        print("[ERROR] Nessun dato da backuppare")
        return False

    try:
        # Abort when the mount fails: otherwise copytree would silently create
        # a plain local /content/drive/... folder and report a successful backup.
        if not Path('/content/drive').exists() and not mount_google_drive():
            print("[ERROR] ❌ Backup fallito: Google Drive non montato")
            return False

        print(f"\n{'='*80}")
        print(f"BACKUP SU GOOGLE DRIVE")
        print(f"{'='*80}")
        print(f"Origine: {source}")
        print(f"Destinazione: {dest}\n")

        # Recursive copy, merging into an existing backup
        shutil.copytree(str(source), str(dest), dirs_exist_ok=True)

        # Count the files now present in the backup (includes older ones)
        total_files = sum(1 for entry in dest.rglob('*') if entry.is_file())

        print(f"✅ Backup completato!")
        print(f"📦 File totali: {total_files}")
        print(f"📁 Path Drive: {dest}")
        print(f"{'='*80}\n")

        return True
    except Exception as e:
        print(f"[ERROR] ❌ Backup fallito: {e}")
        return False


def restore_from_drive():
    """
    Copy the Drive backup back into the local output directory.

    Tries to mount Drive when /content/drive does not exist yet. Files are
    merged into the local tree (``dirs_exist_ok=True``).

    Returns:
        bool: True when the copy completed; False when no backup directory
        exists on Drive or any step raised an exception.
    """
    backup_root = DRIVE_BACKUP_DIR
    local_root = OUTPUT_DIR
    rule = '=' * 80

    try:
        drive_mountpoint = Path('/content/drive')
        if not drive_mountpoint.exists():
            mount_google_drive()

        if backup_root.exists():
            print("\n" + rule)
            print("RIPRISTINO DA GOOGLE DRIVE")
            print(rule)

            shutil.copytree(str(backup_root), str(local_root), dirs_exist_ok=True)

            # Everything now present locally, not only the files just copied
            restored_files = [p for p in local_root.rglob('*') if p.is_file()]

            print("✅ Ripristino completato!")
            print(f"📦 File ripristinati: {len(restored_files)}")
            print(rule + "\n")

            return True

        print("[INFO] ℹ️ Nessun backup trovato su Drive")
        return False
    except Exception as e:
        print(f"[ERROR] ❌ Ripristino fallito: {e}")
        return False


# ============================================================================
# RECOVERY FUNCTIONS
# ============================================================================

def _write_recovered_vlm_txt(txt_path, img_info, img_path, vlm_desc, source_json_name):
    """Write one recovered VLM description, preceded by a metadata header, to ``txt_path``."""
    with open(txt_path, 'w', encoding='utf-8') as out:
        out.write("="*80 + "\n")
        out.write(f"VLM DESCRIPTION - {img_info.get('filename', 'unknown')}\n")
        out.write("="*80 + "\n\n")

        # Metadata
        out.write("IMAGE METADATA:\n")
        out.write(f"  - Filename: {img_info.get('filename', 'N/A')}\n")
        out.write(f"  - Caption: {img_info.get('caption', 'N/A')}\n")
        out.write(f"  - Position: {img_info.get('position', 'N/A')}\n")
        out.write(f"  - Image path: {img_path}\n")
        out.write(f"  - Recovered from JSON: {source_json_name}\n")
        out.write(f"  - Recovery timestamp: {datetime.now().isoformat()}\n\n")

        # VLM description; str() so a non-string value cannot leave a
        # half-written file behind
        out.write("VLM ANALYSIS:\n")
        out.write("-"*80 + "\n")
        out.write(str(vlm_desc))
        out.write("\n" + "-"*80 + "\n")


def recover_vlm_descriptions_from_json(json_dir: Path = None):
    """
    Re-create missing VLM description ``.txt`` files from the saved JSON results.

    Every ``*.json`` file in ``json_dir`` (manifests excluded) is read; a file
    may contain one document or a list of documents. For each image of a
    successful document that carries a ``vlm_description`` and a ``path``, a
    ``.txt`` file is written next to the image path unless it already exists.

    Args:
        json_dir: Directory holding the JSON results. Defaults to ``JSON_DIR``.

    Returns:
        int: Number of ``.txt`` files created by this call (0 when the
        directory does not exist).
    """
    if json_dir is None:
        json_dir = JSON_DIR

    print(f"\n{'='*80}")
    print(f"RECUPERO DESCRIZIONI VLM DA JSON ESISTENTI")
    print(f"{'='*80}\n")

    if not json_dir.exists():
        print("[ERROR] ❌ Directory JSON non trovata!")
        return 0

    # Manifests live in the same directory but are not scraping results
    json_files = sorted(
        f for f in json_dir.glob("*.json")
        if not f.name.startswith('vector_db_manifest')
    )

    print(f"📄 Trovati {len(json_files)} file JSON\n")

    total_images = 0
    recovered_descriptions = 0
    skipped_no_vlm = 0
    skipped_no_path = 0
    skipped_existing = 0

    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Handle both single-document and batch files
            docs_to_process = data if isinstance(data, list) else [data]

            for doc in docs_to_process:
                if doc.get('status') != 'success':
                    continue

                doc_content = doc.get('document_content')
                if not doc_content:
                    continue

                exported_images = doc_content.get('exported_images', [])
                if not exported_images:
                    continue

                print(f"📁 {json_file.name}")

                for img_info in exported_images:
                    total_images += 1
                    filename = img_info.get('filename', 'unknown')

                    vlm_desc = img_info.get('vlm_description')
                    if not vlm_desc:
                        skipped_no_vlm += 1
                        print(f"   ⏭️  {filename}: No VLM")
                        continue

                    # Without a path there is no place to put the .txt file;
                    # count it so the summary adds up to the image total.
                    img_path_str = img_info.get('path')
                    if not img_path_str:
                        skipped_no_path += 1
                        print(f"   ⏭️  {filename}: No path")
                        continue

                    img_path = Path(img_path_str)
                    txt_path = img_path.with_suffix('.txt')

                    # Never overwrite an existing description
                    if txt_path.exists():
                        skipped_existing += 1
                        print(f"   ✓ {filename}: Già esiste")
                        continue

                    txt_path.parent.mkdir(parents=True, exist_ok=True)
                    _write_recovered_vlm_txt(txt_path, img_info, img_path, vlm_desc, json_file.name)

                    recovered_descriptions += 1
                    print(f"   ✅ {filename}: Recuperato")

        except Exception as e:
            # Name the file so the failing input can be found
            print(f"   ❌ Errore {json_file.name}: {str(e)[:80]}")

    print(f"\n{'='*80}")
    print(f"RECUPERO COMPLETATO")
    print(f"{'='*80}")
    print(f"📊 Immagini totali: {total_images}")
    print(f"✅ Nuove descrizioni create: {recovered_descriptions}")
    print(f"📝 Già esistenti: {skipped_existing}")
    print(f"⏭️  Senza VLM: {skipped_no_vlm}")
    if skipped_no_path:
        print(f"⏭️  Senza path: {skipped_no_path}")
    print(f"{'='*80}\n")

    return recovered_descriptions


def _manifest_entry_from_doc(doc):
    """Build the manifest entry for one scraped document; None when it is not indexable."""
    if doc.get('status') != 'success':
        return None

    doc_content = doc.get('document_content')
    if not doc_content:
        return None

    web_meta = doc.get('web_metadata') or {}
    title_for_id = web_meta.get('title', 'unknown')
    if title_for_id is None:  # explicit null title in the JSON
        title_for_id = 'unknown'
    doc_id = re.sub(r'[^\w\-_.]', '_', title_for_id[:50])

    entry = {
        'document_id': doc_id,
        'title': web_meta.get('title', 'N/A'),
        'publication_date': web_meta.get('data_pubblicazione', 'N/A'),
        'source_url': (doc.get('scraping_metadata') or {}).get('url'),
        'pdf_url': web_meta.get('pdf_url'),
        'content_types': [],
        'indexable_content': []
    }

    def note_type(content_type):
        # content_types lists each kind once, in first-seen order
        if content_type not in entry['content_types']:
            entry['content_types'].append(content_type)

    # 1. Main text (Markdown)
    if doc_content.get('markdown_content'):
        note_type('markdown_text')
        entry['indexable_content'].append({
            'type': 'markdown_text',
            'content': doc_content['markdown_content'],
            'metadata': {
                'num_pages': doc_content.get('num_pages', 'N/A'),
                'num_headings': doc_content.get('num_headings', 0)
            }
        })

    # 2. Tables with metadata
    for table_info in doc_content.get('tables') or []:
        note_type('table')

        table_text = f"TABELLA: {table_info.get('caption', 'Senza titolo')}\n\n"

        columns = table_info.get('potential_columns')
        if columns:
            # str() so numeric column labels do not break the join
            table_text += "COLONNE: " + ", ".join(str(col) for col in columns) + "\n\n"

        table_text += "CONTENUTO:\n" + (table_info.get('text_content') or '')

        entry['indexable_content'].append({
            'type': 'table',
            'content': table_text,
            'metadata': {
                'table_id': table_info.get('table_id'),
                'caption': table_info.get('caption'),
                'num_rows': table_info.get('num_rows'),
                'num_columns': table_info.get('num_columns'),
                'columns': table_info.get('potential_columns', []),
                'position': table_info.get('position')
            }
        })

    # 3. Images that have a VLM description
    for img_info in doc_content.get('exported_images') or []:
        if not img_info.get('vlm_description'):
            continue
        note_type('image_vlm')

        img_text = f"IMMAGINE: {img_info.get('filename')}\n"
        img_text += f"CAPTION: {img_info.get('caption', 'N/A')}\n\n"
        img_text += f"DESCRIZIONE VISIVA:\n{img_info['vlm_description']}"

        # Reference the .txt file only when the image has a path and the file
        # exists; Path('').with_suffix() would raise ValueError.
        vlm_file = None
        img_path_str = img_info.get('path')
        if img_path_str:
            txt_path = Path(img_path_str).with_suffix('.txt')
            if txt_path.exists():
                vlm_file = str(txt_path)

        entry['indexable_content'].append({
            'type': 'image_vlm',
            'content': img_text,
            'metadata': {
                'filename': img_info.get('filename'),
                'caption': img_info.get('caption'),
                'vlm_description_file': vlm_file,
                'image_path': img_info.get('path'),
                'position': img_info.get('position')
            }
        })

    return entry if entry['indexable_content'] else None


def create_vector_db_manifest(json_dir: Path = None) -> Path:
    """
    Aggregate all scraped documents into one manifest JSON for vector indexing.

    Every ``*.json`` file in ``json_dir`` (earlier manifests excluded) is read;
    a file may contain one document or a list. Each successful document with
    content contributes one entry whose ``indexable_content`` holds the
    markdown text, one chunk per table and one chunk per image that has a VLM
    description. The manifest is written to ``json_dir`` under a timestamped
    name.

    Args:
        json_dir: Directory with the JSON results. Defaults to ``JSON_DIR``.

    Returns:
        Path: Location of the manifest file that was written.
    """
    if json_dir is None:
        json_dir = JSON_DIR

    manifest = {
        'created_at': datetime.now().isoformat(),
        'purpose': 'Vector database indexing manifest',
        'version': '1.0',
        'total_documents': 0,
        'documents': []
    }

    json_files = sorted(
        f for f in json_dir.glob("*.json")
        if not f.name.startswith('vector_db_manifest')
    )

    print(f"\n{'='*80}")
    print(f"CREAZIONE MANIFEST PER VECTOR DB")
    print(f"{'='*80}\n")

    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            print(f"  ✗ Errore {json_file.name}: {e}")
            continue

        # Handle both single-document and batch files
        docs_to_process = data if isinstance(data, list) else [data]

        for doc in docs_to_process:
            # Per-document isolation: one malformed record must not drop the
            # remaining documents of a batch file.
            try:
                vector_entry = _manifest_entry_from_doc(doc)
            except Exception as e:
                print(f"  ✗ Errore {json_file.name}: {e}")
                continue

            if vector_entry is None:
                continue

            manifest['documents'].append(vector_entry)
            manifest['total_documents'] += 1
            print(f"  ✓ {vector_entry['document_id']}: {len(vector_entry['indexable_content'])} chunks")

    # Save the manifest
    json_dir.mkdir(parents=True, exist_ok=True)
    manifest_path = json_dir / f"vector_db_manifest_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"

    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(manifest, f, ensure_ascii=False, indent=2)

    print(f"\n{'='*80}")
    print(f"MANIFEST CREATO")
    print(f"{'='*80}")
    print(f"📄 File: {manifest_path.name}")
    print(f"📊 Documenti: {manifest['total_documents']}")

    # Chunk statistics, gathered in a single pass
    chunks_by_type = {}
    for doc in manifest['documents']:
        for chunk in doc['indexable_content']:
            chunks_by_type[chunk['type']] = chunks_by_type.get(chunk['type'], 0) + 1

    total_chunks = sum(chunks_by_type.values())

    print(f"📦 Chunks totali: {total_chunks}")
    print(f"   - Testi markdown: {chunks_by_type.get('markdown_text', 0)}")
    print(f"   - Tabelle: {chunks_by_type.get('table', 0)}")
    print(f"   - Immagini VLM: {chunks_by_type.get('image_vlm', 0)}")
    print(f"{'='*80}\n")

    return manifest_path


def verify_recovery():
    """
    Report the state of the local data and the VLM description coverage.

    Prints the number of JSON result files, images, description files, table
    files and manifests. Coverage is the share of ``*.png`` images that have a
    ``.txt`` file with the same stem next to them, so stray ``.txt`` files
    cannot hide images that are still missing a description.

    Returns:
        bool: True when every image has a description file (also True when
        there are no images at all).
    """
    print(f"\n{'='*80}")
    print(f"VERIFICA STATO DATI")
    print(f"{'='*80}\n")

    # JSON result files (manifests excluded)
    json_files = list(JSON_DIR.glob("*.json")) if JSON_DIR.exists() else []
    json_files = [f for f in json_files if not f.name.startswith('vector_db_manifest')]
    print(f"📄 File JSON: {len(json_files)}")

    # Images and descriptions
    total_png = 0
    total_txt = 0
    described_png = 0  # images whose sibling .txt actually exists

    if IMAGES_DIR.exists():
        for doc_dir in IMAGES_DIR.iterdir():
            if doc_dir.is_dir():
                png_files = list(doc_dir.glob("*.png"))
                total_png += len(png_files)
                total_txt += sum(1 for _ in doc_dir.glob("*.txt"))
                described_png += sum(1 for png in png_files if png.with_suffix('.txt').exists())

    print(f"🖼️  Immagini totali: {total_png}")
    print(f"📝 Descrizioni VLM: {total_txt}")

    if total_png > 0:
        coverage = (described_png / total_png) * 100
        print(f"📊 Coverage VLM: {coverage:.1f}%")

        if described_png < total_png:
            missing = total_png - described_png
            print(f"\n⚠️  Mancano {missing} descrizioni VLM")
            print(f"   → Esegui: recover_vlm_descriptions_from_json()")
        else:
            print(f"\n✅ Tutte le immagini hanno descrizioni VLM!")

    # Tables
    if TABLES_DIR.exists():
        total_tables = sum(1 for _ in TABLES_DIR.rglob("*.txt"))
        print(f"📊 Tabelle estratte: {total_tables}")

    # Manifests
    manifest_files = list(JSON_DIR.glob("vector_db_manifest*.json")) if JSON_DIR.exists() else []
    if manifest_files:
        print(f"📋 Manifest esistenti: {len(manifest_files)}")
        latest = max(manifest_files, key=lambda p: p.stat().st_mtime)
        print(f"   Ultimo: {latest.name}")
    else:
        print(f"📋 Manifest: ❌ Nessuno")
        print(f"   → Esegui: create_vector_db_manifest()")

    print(f"{'='*80}\n")

    return described_png == total_png


# ============================================================================
# PIPELINE COMPLETA
# ============================================================================

def full_recovery_pipeline(auto_backup: bool = True):
    """
    Run the whole recovery sequence and report the outcome.

    Steps: initial status check, recovery of the VLM description files from
    the JSON results, manifest creation, final status check and, when
    ``auto_backup`` is true, a backup to Google Drive.

    Args:
        auto_backup: Copy the output directory to Drive after the final check.
    """
    rule = '=' * 80

    print("\n" + rule)
    print("🚀 PIPELINE COMPLETA DI RECOVERY")
    print(rule + "\n")

    # Step 0: initial check
    print("📊 STEP 0: Verifica stato iniziale")
    verify_recovery()

    # Step 1: recover the VLM descriptions
    print("\n📝 STEP 1: Recovery descrizioni VLM")
    recovered = recover_vlm_descriptions_from_json()
    step1_outcome = (
        f"\n✅ Recuperate {recovered} nuove descrizioni!\n"
        if recovered > 0
        else "\nℹ️  Nessuna nuova descrizione da recuperare\n"
    )
    print(step1_outcome)

    # Step 2: build the manifest
    print("\n📋 STEP 2: Creazione manifest per Vector DB")
    manifest_path = create_vector_db_manifest()
    print(f"✅ Manifest salvato: {manifest_path}\n")

    # Step 3: final check, its result drives the closing message
    print("\n✅ STEP 3: Verifica finale")
    all_ok = verify_recovery()

    # Step 4: optional Drive backup
    if auto_backup:
        print("\n☁️  STEP 4: Backup su Google Drive")
        if backup_to_drive():
            print("✅ Tutto salvato su Drive!")
        else:
            print("⚠️  Backup non riuscito (ma dati locali OK)")

    print("\n" + rule)
    print("🎉 PIPELINE COMPLETATA!")
    print(rule)

    if not all_ok:
        print("⚠️  Alcune descrizioni VLM potrebbero mancare")
        print("   Controlla i log sopra per dettagli")
    else:
        print("✅ Tutti i dati sono pronti per il Vector DB")
        print(f"📁 Path locale: {OUTPUT_DIR}")
        if auto_backup:
            print(f"☁️  Path Drive: {DRIVE_BACKUP_DIR}")

    print(rule + "\n")


# ============================================================================
# ESECUZIONE INTERATTIVA
# ============================================================================

# Interactive driver of the recovery script: shows what the script does,
# asks which action to run and dispatches to the matching function.
if __name__ == "__main__":
    print("\n" + "="*80)
    print("🔧 SCRIPT DI RECOVERY INAIL + DRIVE BACKUP")
    print("="*80)
    print("\nQuesto script:")
    print("1. ✅ Recupera descrizioni VLM dai JSON")
    print("2. ✅ Crea file .txt per ogni immagine")
    print("3. ✅ Genera manifest per Vector DB")
    print("4. ☁️  Backup automatico su Google Drive")
    print("\n⚠️  Esegui DOPO che lo scraping è terminato!")
    print("="*80 + "\n")

    # Menu
    print("OPZIONI:")
    print("1. Pipeline completa (Recovery + Manifest + Backup Drive)")
    print("2. Solo recovery descrizioni VLM")
    print("3. Solo creazione manifest")
    print("4. Solo backup su Drive")
    print("5. Solo ripristino da Drive")
    print("6. Verifica stato")
    print("="*80 + "\n")

    choice = input("Scegli (1-6, default 1): ").strip() or "1"
    choice_handled = True

    if choice == "1":
        # Full pipeline
        mount_google_drive()
        full_recovery_pipeline(auto_backup=True)

    elif choice == "2":
        recovered = recover_vlm_descriptions_from_json()
        print(f"\n✅ Recuperate {recovered} descrizioni")

    elif choice == "3":
        manifest_path = create_vector_db_manifest()
        print(f"\n✅ Manifest: {manifest_path}")

    elif choice == "4":
        mount_google_drive()
        backup_to_drive()

    elif choice == "5":
        mount_google_drive()
        restore_from_drive()

    elif choice == "6":
        verify_recovery()

    else:
        # An unknown answer used to fall through and still print "FATTO"
        choice_handled = False
        print(f"\n⚠️  Scelta non valida: {choice!r} (attese: 1-6)")

    if choice_handled:
        print("\n✅ FATTO!\n")


🔧 SCRIPT DI RECOVERY INAIL + DRIVE BACKUP

Questo script:
1. ✅ Recupera descrizioni VLM dai JSON
2. ✅ Crea file .txt per ogni immagine
3. ✅ Genera manifest per Vector DB
4. ☁️  Backup automatico su Google Drive

⚠️  Esegui DOPO che lo scraping è terminato!

OPZIONI:
1. Pipeline completa (Recovery + Manifest + Backup Drive)
2. Solo recovery descrizioni VLM
3. Solo creazione manifest
4. Solo backup su Drive
5. Solo ripristino da Drive
6. Verifica stato



KeyboardInterrupt: Interrupted by user

**INAIL SCRAPER - PRODUCTION VERSION WITH FULL AUTOMATION**

Comprehensive web scraper for the INAIL publications database with
enterprise-grade features: fully incremental scraping that automatically skips
already-processed documents; automatic Google Drive backup/restore on startup
and during execution; VLM image descriptions saved as standalone .txt files for
vector indexing; table extraction with rich metadata (columns, captions,
position); robust error handling with automatic driver recovery; persistent
state management across Colab sessions; and vector database preparation
utilities. It implements a 7-step workflow: (1) auto-restore from Drive,
(2) load processed URLs, (3) extract links with retry logic, (4) filter new
documents, (5) process with Docling + VLM, (6) auto-backup every N documents,
(7) final backup.



In [31]:
# CONFIGURAZIONE
class INAILConfig:
    """Static configuration of the scraper: storage locations and VLM settings."""

    # Local storage layout
    OUTPUT_DIR = Path('./inail_scraped_data')
    PDF_DIR = OUTPUT_DIR.joinpath('pdfs')
    JSON_DIR = OUTPUT_DIR.joinpath('json')
    IMAGES_DIR = OUTPUT_DIR.joinpath('images')
    TABLES_DIR = OUTPUT_DIR.joinpath('tables')
    # Backup location on the mounted Google Drive
    DRIVE_BACKUP_DIR = Path('/content/drive/MyDrive/INAIL_Thesis_Data')

    # Vision-language model used to describe the exported images
    VLM_ENABLED = True
    VLM_MODEL = "HuggingFaceTB/SmolVLM-256M-Instruct"
    VLM_PROMPT = """Describe this technical document image in detail for academic research. Focus on:
- Main content type (diagram, chart, photo, schematic)
- Key visual elements and their relationships
- Any visible text, labels, or numerical data
- Technical details relevant to workplace safety or industrial processes
Be precise and comprehensive."""

    @classmethod
    def setup_directories(cls):
        """Create the local output directories (parents included) if they are missing."""
        required_dirs = (
            cls.OUTPUT_DIR,
            cls.PDF_DIR,
            cls.JSON_DIR,
            cls.IMAGES_DIR,
            cls.TABLES_DIR,
        )
        for required_dir in required_dirs:
            required_dir.mkdir(parents=True, exist_ok=True)
        print(f"[INFO] Directory create: {cls.OUTPUT_DIR}")

# MANTIENI le funzioni Drive invariate

# add_vlm_descriptions_to_images con salvataggio .txt

def add_vlm_descriptions_to_images(exported_images: List[Dict], doc_id: str) -> List[Dict]:
    """
    Add a VLM-generated description to every exported image.

    For each image dict the model configured in ``INAILConfig.VLM_MODEL`` is
    prompted with ``INAILConfig.VLM_PROMPT``; the answer is stored under
    ``'vlm_description'`` and also written to a ``.txt`` file next to the
    image, whose path is stored under ``'vlm_description_file'``. Both keys
    are set to None for an image whose processing raised. The dicts are
    modified in place.

    Args:
        exported_images: Image dicts; ``'filename'`` and ``'path'`` are read.
        doc_id: Not used by this function.

    Returns:
        The same ``exported_images`` object (returned untouched when it is
        empty or VLM is disabled).
    """
    # Keep the emptiness test first: it short-circuits before the config is read
    if not exported_images or not INAILConfig.VLM_ENABLED:
        return exported_images

    try:
        from transformers import AutoProcessor, AutoModelForVision2Seq
        from PIL import Image
        import torch

        print(f"\n[VLM] Caricamento {INAILConfig.VLM_MODEL}...")

        processor = AutoProcessor.from_pretrained(INAILConfig.VLM_MODEL, trust_remote_code=True)

        # Half precision only when a GPU is available
        if torch.cuda.is_available():
            weights_dtype = torch.float16
        else:
            weights_dtype = torch.float32

        model = AutoModelForVision2Seq.from_pretrained(
            INAILConfig.VLM_MODEL,
            torch_dtype=weights_dtype,
            device_map="auto",
            trust_remote_code=True
        )

        position = 0
        for img_data in exported_images:
            position += 1
            try:
                print(f"  [{position}/{len(exported_images)}] {img_data['filename']}")
                image = Image.open(img_data['path']).convert('RGB')

                user_turn = {
                    "role": "user",
                    "content": [
                        {"type": "image"},
                        {"type": "text", "text": INAILConfig.VLM_PROMPT}
                    ]
                }
                messages = [user_turn]

                prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
                inputs = processor(text=prompt, images=[image], return_tensors="pt").to(model.device)

                with torch.no_grad():
                    outputs = model.generate(**inputs, max_new_tokens=300, do_sample=False)

                full_output = processor.decode(outputs[0], skip_special_tokens=True)
                # Text after the last "Assistant:" marker, or the whole output
                # when the marker is absent
                description = full_output.rpartition("Assistant:")[2].strip()

                # Keep the description in the image dict
                img_data['vlm_description'] = description

                # ...and as a .txt file next to the image
                sidecar_path = Path(img_data['path']).with_suffix('.txt')
                heavy_rule = "=" * 80
                light_rule = "-" * 80
                report_parts = [
                    heavy_rule + "\n",
                    f"VLM DESCRIPTION - {img_data['filename']}\n",
                    heavy_rule + "\n\n",
                    "IMAGE METADATA:\n",
                    f"  - Filename: {img_data['filename']}\n",
                    f"  - Caption: {img_data.get('caption', 'N/A')}\n",
                    f"  - Position: {img_data.get('position', 'N/A')}\n",
                    f"  - Model: {INAILConfig.VLM_MODEL}\n",
                    f"  - Timestamp: {datetime.now().isoformat()}\n\n",
                    "VLM ANALYSIS:\n",
                    light_rule + "\n",
                    description,
                    "\n" + light_rule + "\n",
                ]
                with open(sidecar_path, 'w', encoding='utf-8') as sidecar:
                    sidecar.writelines(report_parts)

                img_data['vlm_description_file'] = str(sidecar_path)
                print(f"    ✓ {description[:80]}...")

            except Exception as e:
                # A failing image must not stop the remaining ones
                print(f"    ✗ {e}")
                img_data['vlm_description'] = None
                img_data['vlm_description_file'] = None

        # Release the model before returning
        del model, processor
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        described = [img for img in exported_images if img.get('vlm_description')]
        print(f"[VLM] Completato: {len(described)}/{len(exported_images)}\n")

    except Exception as e:
        print(f"[ERROR] VLM fallito: {e}")

    return exported_images


# Funzione per caricare URL già processati
def load_already_scraped_urls() -> set:
    """
    Collect the URLs of all documents that already have a JSON result.

    Looks in the local JSON directory and, when present, in the ``json``
    folder of the Drive backup. Manifest files are ignored. A file may hold
    one document or a list of documents; the URL is read from
    ``scraping_metadata.url``.

    NOTE(review): documents are collected regardless of their ``status``, so a
    failed document that was saved is treated as already scraped - confirm
    that this is intended.

    Returns:
        set: The URLs found (empty when nothing has been scraped yet).
    """
    scraped_urls = set()

    # Local results first, then the Drive backup; missing directories are skipped
    search_dirs = [INAILConfig.JSON_DIR, INAILConfig.DRIVE_BACKUP_DIR / 'json']

    for json_dir in search_dirs:
        if not json_dir.exists():
            continue

        for json_file in json_dir.glob("*.json"):
            if json_file.name.startswith('vector_db_manifest'):
                continue

            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, ValueError) as e:
                # ValueError covers JSONDecodeError and UnicodeDecodeError.
                # Unreadable files are still skipped, but no longer silently,
                # and interrupts are no longer swallowed.
                print(f"[WARNING] JSON ignorato ({json_file.name}): {e}")
                continue

            # Handle both single-document and batch files
            docs = data if isinstance(data, list) else [data]

            for doc in docs:
                # Skip malformed entries instead of dropping the rest of the file
                if not isinstance(doc, dict):
                    continue
                scraping_meta = doc.get('scraping_metadata')
                if not isinstance(scraping_meta, dict):
                    continue
                url = scraping_meta.get('url')
                if url:
                    scraped_urls.add(url)

    return scraped_urls


# Batch scrape that automatically skips already-processed documents
def batch_scrape_inail_publications_incremental(
    driver, converter,
    query: Optional[str] = None,
    max_pages: int = 5,
    max_documents: Optional[int] = None,
    enable_vlm: bool = False,
    save_pdfs: bool = True,
    delay_between_docs: int = 4,
    auto_backup_every: int = 5  # Backup every N successfully saved documents
) -> List[Dict[str, Any]]:
    """Scrape only the publications that have not been processed yet.

    Steps: load the already-scraped URLs, collect the publication links,
    drop the ones already known, scrape the rest one by one, and back up to
    Drive periodically and at the end.

    Args:
        driver: Selenium driver forwarded to the link extractor and scraper.
        converter: Docling converter forwarded to the scraper.
        query: Search query forwarded to the link extractor (None = no query).
        max_pages: Maximum number of result pages to read links from.
        max_documents: Cap on new documents to process; None or 0 = no cap.
        enable_vlm: Forwarded to the scraper as ``enable_vlm``.
        save_pdfs: Forwarded to the scraper as ``save_pdf``.
        delay_between_docs: Lower bound in seconds of the random pause between
            documents (the upper bound is this value + 2).
        auto_backup_every: Run a Drive backup every N successfully saved
            documents; 0 or None disables the periodic backup.

    Returns:
        The scrape results whose ``status`` is ``'success'`` and that were
        saved to JSON. Failures are only reported in the printed summary.
    """

    # 1) Load the URLs handled in previous runs
    already_scraped = load_already_scraped_urls()
    print(f"\n{'='*80}")
    print("SCRAPING INCREMENTALE")
    print(f"{'='*80}")
    print(f"📊 Documenti già processati: {len(already_scraped)}")
    print(f"{'='*80}\n")

    # 2) Extract all publication links
    publication_links = get_inail_publication_links(driver, query=query, max_pages=max_pages)

    if not publication_links:
        print("[WARNING] Nessuna pubblicazione trovata")
        return []

    # 3) Keep only the new ones
    new_links = [pub for pub in publication_links if pub['url'] not in already_scraped]

    print(f"\n{'='*80}")
    print("FILTRO DOCUMENTI")
    print(f"{'='*80}")
    print(f"Totali trovati: {len(publication_links)}")
    print(f"Già processati: {len(publication_links) - len(new_links)}")
    print(f"Nuovi da processare: {len(new_links)}")
    print(f"{'='*80}\n")

    if not new_links:
        print("Tutti i documenti sono già stati processati!")
        return []

    if max_documents:
        new_links = new_links[:max_documents]
        print(f"[INFO] Limitato a {max_documents} nuovi documenti\n")

    # 4) Scraping with automatic backup
    results = []
    failed = []
    interrupted = False
    total = len(new_links)

    print(f"{'='*80}")
    print(f"SCRAPING {total} NUOVI DOCUMENTI")
    print(f"{'='*80}\n")

    # The outer try also covers the pause between documents, so a Ctrl+C
    # during time.sleep() still reaches the final backup below.
    try:
        for idx, pub in enumerate(new_links, 1):
            print(f"\n[{idx}/{total}] {pub['titolo'][:50]}...")

            saved = False
            try:
                result = scrape_inail_publication(
                    driver=driver,
                    publication_url=pub['url'],
                    converter=converter,
                    save_pdf=save_pdfs,
                    enable_vlm=enable_vlm
                )

                if result['status'] == 'success':
                    # Persist first: a document whose JSON could not be
                    # written must be reported as failed, not as a result.
                    save_to_json(result)
                    results.append(result)
                    saved = True
                    print("  OK")
                else:
                    failed.append({'url': pub['url'], 'title': pub['titolo'],
                                  'error': result.get('error_message', 'Unknown')})
                    print("  FAIL")

            except Exception as e:
                print(f"  Eccezione: {str(e)[:80]}")
                failed.append({'url': pub['url'], 'title': pub['titolo'], 'error': str(e)})

            # Periodic backup, counted on successes so that a failure at
            # position N does not skip the backup. Kept outside the scrape
            # try-block: a backup error must not mark a saved doc as failed.
            if (saved and auto_backup_every and auto_backup_every > 0
                    and len(results) % auto_backup_every == 0):
                print(f"\n  ☁️  Auto-backup ({idx}/{total})...")
                try:
                    backup_to_drive()
                except Exception as e:
                    print(f"  [WARNING] Auto-backup fallito: {str(e)[:80]}")

            if idx < total:
                delay = random.uniform(delay_between_docs, delay_between_docs + 2)
                time.sleep(delay)

    except KeyboardInterrupt:
        interrupted = True
        print(f"\n INTERRUZIONE - Salvati: {len(results)}")

    # 5) Final backup: exactly once, also after an interruption
    if results or interrupted:
        print("💾 Backup finale..." if interrupted else "\n☁️  Backup finale su Drive...")
        backup_to_drive()

    print(f"\n{'='*80}")
    print(f"COMPLETATO: {len(results)} nuovi | {len(failed)} falliti")
    print(f"Totale documenti ora: {len(already_scraped) + len(results)}")
    print(f"{'='*80}\n")

    return results


def _prompt_int(prompt: str, default: Optional[int]) -> Optional[int]:
    """Ask for an integer on stdin; return `default` on empty or invalid input."""
    raw = input(prompt).strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError:
        print(f"[WARNING] Valore non numerico '{raw}', uso il default")
        return default


# MAIN with automatic restore

def main():
    """Interactive entry point for the incremental INAIL scraper.

    Sets up the output directories, mounts Google Drive and restores the
    previous data when the mount succeeds, initializes Docling, then shows a
    menu (scrape / status / backup only / restore only).

    Note: ``driver`` is not created here; it is read from the notebook's
    global namespace, so the driver setup cell must have been run first.

    Returns:
        The list returned by the incremental batch scrape, or None when a
        non-scraping menu option is chosen or the user does not confirm.
    """
    print("\n" + "="*80)
    print("INAIL SCRAPER INCREMENTALE - VERSIONE FINALE")
    print("="*80 + "\n")

    # Create the output directories
    INAILConfig.setup_directories()

    # Mount Drive and, if that works, restore automatically
    print("\n[STEP 1] Setup Google Drive...")
    if mount_google_drive():
        print("\n[STEP 2] Ripristino dati da Drive...")
        restore_from_drive()

    # Show statistics
    already_scraped = load_already_scraped_urls()
    print(f"\n[INFO] 📊 Documenti già processati: {len(already_scraped)}")

    # Initialize Docling
    print("\n[STEP 3] Inizializzazione Docling...")
    converter = initialize_docling_converter()

    # Menu
    print("\n" + "="*80)
    print("MODALITÀ:")
    print("="*80)
    print("1. 🚀 Scraping incrementale (skippa già fatti)")
    print("2. 📊 Verifica stato (quanti documenti ho)")
    print("3. ☁️  Solo backup su Drive")
    print("4. 🔄 Solo ripristino da Drive")
    print("="*80 + "\n")

    choice = input("Scegli (1-4, default 1): ").strip() or "1"

    if choice == "3":
        backup_to_drive()
        return None
    elif choice == "4":
        restore_from_drive()
        return None
    elif choice == "2":
        print(f"\n📊 STATO ATTUALE:")
        print(f"   - Documenti processati: {len(already_scraped)}")
        print(f"   - Path locale: {INAILConfig.OUTPUT_DIR}")
        print(f"   - Path Drive: {INAILConfig.DRIVE_BACKUP_DIR}")
        return None

    # Any other answer falls through to the incremental scraping flow.
    print("\n[SETUP] Configurazione scraping incrementale\n")

    query = input("Query (Enter = tutte): ").strip() or None

    # Both numeric prompts tolerate bad input instead of crashing the run.
    max_pages = _prompt_int("Max pagine (default 5): ", 5)
    max_docs = _prompt_int("Limite nuovi docs (Enter = tutti): ", None)

    vlm_input = input("VLM? (y/n, default: y): ").strip().lower()
    enable_vlm = (vlm_input != 'n')  # Anything other than 'n' enables the VLM

    print(f"\n{'='*80}")
    print("RIEPILOGO:")
    print(f"{'='*80}")
    print(f"Query: {query if query else '(tutte)'}")
    print(f"Pagine: {max_pages}")
    print(f"Nuovi docs max: {max_docs if max_docs else '(tutti)'}")
    print(f"VLM: {'ON' if enable_vlm else 'OFF'}")
    print(f"Già processati: {len(already_scraped)} (verranno skippati)")
    print(f"{'='*80}\n")

    confirm = input("Procedere? (y/n): ").strip().lower()
    if confirm != 'y':
        print("[CANCELLED]")
        return None

    # Run the incremental scrape
    results = batch_scrape_inail_publications_incremental(
        driver=driver,
        converter=converter,
        query=query,
        max_pages=max_pages,
        max_documents=max_docs,
        enable_vlm=enable_vlm,
        save_pdfs=True,
        delay_between_docs=4,
        auto_backup_every=5  # Backup every 5 documents
    )

    return results


# ENTRY POINT
if __name__ == "__main__":
    # main() hands back the list of newly scraped documents, or None when a
    # non-scraping menu option was chosen or the run was not confirmed.
    results = main()

    _rule = "="*80
    print("\n" + _rule)
    print("SCRAPING TERMINATO")
    print(_rule)

    if results:
        print(f"✅ Nuovi documenti: {len(results)}")

    # Re-read the JSON files so the total includes what this run just saved.
    total_docs = len(load_already_scraped_urls())
    _summary = [
        f"📊 Documenti totali: {total_docs}",
        f"📁 Output locale: {INAILConfig.OUTPUT_DIR}",
        f"☁️  Backup Drive: {INAILConfig.DRIVE_BACKUP_DIR}",
        _rule + "\n",
    ]
    print("\n".join(_summary))


INAIL SCRAPER INCREMENTALE - VERSIONE FINALE

[INFO] Directory create: inail_scraped_data

[STEP 1] Setup Google Drive...
[INFO] Drive già montato

[STEP 2] Ripristino dati da Drive...

RIPRISTINO DA GOOGLE DRIVE
✅ Ripristino completato!
📦 File ripristinati: 2858


[INFO] 📊 Documenti già processati: 31

[STEP 3] Inizializzazione Docling...
[INFO] Docling OK | VLM: ON

MODALITÀ:
1. 🚀 Scraping incrementale (skippa già fatti)
2. 📊 Verifica stato (quanti documenti ho)
3. ☁️  Solo backup su Drive
4. 🔄 Solo ripristino da Drive

Scegli (1-4, default 1): 1

[SETUP] Configurazione scraping incrementale

Query (Enter = tutte): 
Max pagine (default 5): 5
Limite nuovi docs (Enter = tutti): 
VLM? (y/n, default: y): y

RIEPILOGO:
Query: (tutte)
Pagine: 5
Nuovi docs max: (tutti)
VLM: ON
Già processati: 31 (verranno skippati)

Procedere? (y/n): y

SCRAPING INCREMENTALE
📊 Documenti già processati: 31


ESTRAZIONE LINK (VERSIONE FIXED)
Query: (tutte) | Max pagine: 5

[Pagina 1/5]
    [Tentativo 1/4] ✓ 

[32m[INFO] 2025-10-26 16:55:58,840 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-10-26 16:55:58,845 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-10-26 16:56:00,234 [RapidOCR] download_file.py:82: Download size: 13.83MB[0m
[32m[INFO] 2025-10-26 16:56:00,368 [RapidOCR] download_file.py:95: Successfully saved to: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-10-26 16:56:00,370 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-10-26 16:56:00,590 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-10-26 16:56:00,591 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/cls/ch_ptocr_mobile_v2.0_cls

  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
  [+] figure_41.png
  [+] figure_42.png
  [+] figure_43.png
  [+] figure_44.png
  [+] figure_45.png
  [+] figure_46.png
  [+] figure_47.png
  [+] figure_48.png
  [+] figure_49.png
  [+] figure_50.png
  [+] fig

`torch_dtype` is deprecated! Use `dtype` instead!


  [1/189] figure_1.png
    ✓ This technical document image is a diagram, chart, or schematic depicting the co...
  [2/189] figure_2.png
    ✓ The image is a technical document titled "CONSIGLI Nazionale Degli Ingegneri" (G...
  [3/189] figure_3.png
    ✓ The image depicts an illustration of a workplace scene, likely from a technical ...
  [4/189] figure_4.png
    ✓ This technical document image is a diagram, chart, or schematic of a product or ...
  [5/189] figure_5.png
    ✓ INCIL...
  [6/189] figure_6.png
    ✓ The image depicts a logo, which is a circular emblem typically used in business ...
  [7/189] figure_7.png
    ✓ The image is a technical document titled "CONSIGLO Nazionale DEGLI INEGNERI" whi...
  [8/189] figure_8.png
    ✓ This technical document image consists of a blue circle with the word "mic" writ...
  [9/189] figure_9.png
    ✓ The image is a technical document image that consists of multiple panels and ima...
  [10/189] figure_10.png
    ✓ The image is a technical do



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
[OK] 28 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/28] figure_1.png
    ✓ The logo is for INCail, a company that produces and sells industrial safety equi...
  [2/28] figure_2.png
    ✓ SMART DPI...
  [3/28] figure_3.png
    ✓ The image is an image of a technical document, likely a report or a report on sa...
  [4/28] figure_4.png
    ✓ This technical document image is a diagram, chart, or schematic depicting the pr...
  [5/28] figure_5.png
    ✓ **Image Description:**

The image is a technical document titled "SELTECH DPI" w...
  [6/28] figure_6.png
    ✓ The image is a technical document that depicts a pie chart. The pie chart is div...
  [7/28] figure_7.png
    ✓ The chart depicts a data spectrum, with two distinct parts. The left part is a y...
  [8/28] figure_8.png
    ✓ The image depicts a simple, horizontal bar chart with three distinct colors: gre...
  [9/28] figure_9.png
    ✓ The diagram shows a winding arc from left to right. The diagram includes three s...
  [10/28] figure_10.png
    ✓ A blue tree with branches and lea



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
[OK] 24 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/24] figure_1.png
    ✓ Incl. Italy...
  [2/24] figure_2.png
    ✓ This technical document image is in a dark blue color. The diagram is made up of...
  [3/24] figure_3.png
    ✓ The image depicts a woman in a boat, likely a kayak, seated in a white boat. She...
  [4/24] figure_4.png
    ✓ The image depicts a man in a gray long-sleeve shirt standing in front of a red P...
  [5/24] figure_5.png
    ✓ The image depicts a man in a blue shirt with headphones on, standing in what app...
  [6/24] figure_6.png
    ✓ The image depicts a scene within a farm setting. The primary focus is on a woman...
  [7/24] figure_7.png
    ✓ The image depicts a man and a woman engaged in a technical task in what appears ...
  [8/24] figure_8.png
    ✓ The image depicts a man sitting at a desk in an office setting. He is wearing a ...
  [9/24] figure_9.png
    ✓ This technical document image depicts a man wearing a blue Asics Sweater and a w...
  [10/24] figure_10.png
    ✓ The image depicts a man in a wh



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
  [+] figure_41.png
  [+] figure_42.png
  [+] figure_43.png
  [+] figure_44.png
  [+] figure_45.png
  [+] figure_46.png
  [+] figure_47.png
  [+] figure_48.png
  [+] figure_49.png
  [+] figure_50.png
  [+] fig



  [1/60] figure_1.png
    ✓ This technical document image is a diagram, chart, or schematic of a company’s i...
  [2/60] figure_2.png
    ✓ SICURI E CONNESSI N3...
  [3/60] figure_3.png
    ✓ This image is a technical document image that includes several key visual elemen...
  [4/60] figure_4.png
    ✓ The image is a technical document titled "INAIL & GATTO PERRY 3" which is part o...
  [5/60] figure_5.png
    ✓ PRESENTATIONZONE...
  [6/60] figure_6.png
    ✓ This technical document image is a photograph of a paw print....
  [7/60] figure_7.png
    ✓ This technical document image is a diagram, chart, or schematic depicting the pa...
  [8/60] figure_8.png
    ✓ This technical document image depicts a book cover with a title at the top: "Nai...
  [9/60] figure_9.png
    ✓ This technical document image is a detailed breakdown of a comic strip. The cont...
  [10/60] figure_10.png
    ✓ The technical document image is a cartoon-style illustration that depicts a woma...
  [11/60] figure_11.p



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
  [+] figure_41.png
  [+] figure_42.png
[OK] 42 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/42] figure_1.png
    ✓ The image is a technical document titled "2025 Collana Sanita" which appears to ...
  [2/42] figure_2.png
    ✓ This technical document image is a detailed diagram, chart, or schematic of a sp...
  [3/42] figure_3.png
    ✓ The image is a technical document image that appears to be a diagram or chart re...
  [4/42] figure_4.png
    ✓ This image is a technical document image that appears to be a diagram or chart r...
  [5/42] figure_5.png
    ✓ The image is an artistic depiction of a man in an ornate frame. The frame is fra...
  [6/42] figure_6.png
    ✓ The document is a technical document titled "De Noxis Paludum E Fluxuvilis, Eoru...
  [7/42] figure_7.png
    ✓ The technical document image depicts a man in a formal suit, wearing glasses and...
  [8/42] figure_8.png
    ✓ This technical document image is a detailed illustration of a diagram, chart, or...
  [9/42] figure_9.png
    ✓ The technical document image is a black and white photograph of an older man



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
[OK] 18 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/18] figure_1.png
    ✓ The document is a technical document titled "Collana Sanità 2025" (Spanish for "...
  [2/18] figure_2.png
    ✓ This technical document image is a diagram, chart, or schematic of a process or ...
  [3/18] figure_3.png
    ✓ This technical document image is a diagram, chart, or schematic depicting a proc...
  [4/18] figure_4.png
    ✓ B b...
  [5/18] figure_5.png
    ✓ \( C \)...
  [6/18] figure_6.png
    ✓ D2...
  [7/18] figure_7.png
    ✓ This technical document image in detail for academic research. Focus on:
- Main ...
  [8/18] figure_8.png
    ✓ H...
  [9/18] figure_9.png
    ✓ L...
  [10/18] figure_10.png
    ✓ This technical document image in detail for academic research. Focus on:
- Main ...
  [11/18] figure_11.png
    ✓ This technical document image in detail for academic research. Focus on:
- Main ...
  [12/18] figure_12.png
    ✓ 0 and o...
  [13/18] figure_13.png
    ✓ P2...
  [14/18] figure_14.png
    ✓ This technical document image in detail for



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
[OK] 15 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/15] figure_1.png
    ✓ INCail...
  [2/15] figure_2.png
    ✓ This technical document image is for academic research. It is a diagram, chart, ...
  [3/15] figure_3.png
    ✓ This technical document image is a diagram, chart, and schematic illustrating th...
  [4/15] figure_4.png
    ✓ This is a technical diagram or chart that shows the process of a process called ...
  [5/15] figure_5.png
    ✓ This document is a technical document that is designed to provide detailed infor...
  [6/15] figure_6.png
    ✓ The image is a technical document titled "Follow Up Azioni Intrapresse". The doc...
  [7/15] figure_7.png
    ✓ This technical document image in detail description is a diagram, chart, photo, ...
  [8/15] figure_8.png
    ✓ Figure 6 is a process diagram that follows several steps. The main components ar...
  [9/15] figure_9.png
    ✓ The given image is a technical document from a company called ITU. The document ...
  [10/15] figure_10.png
    ✓ This technical document image is a b



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
[OK] 35 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/35] figure_1.png
    ✓ The image is a technical document titled "2025" with a blue background and a gre...
  [2/35] figure_2.png
    ✓ This technical document image is of a blue rectangular brochure. The title at th...
  [3/35] figure_3.png
    ✓ The image is a figure from a report on the circularity gap in the economy in Ams...
  [4/35] figure_4.png
    ✓ The image is a table that presents the data on the cost of per capita per day in...
  [5/35] figure_5.png
    ✓ This technical document image is a color-coded map of the United States, illustr...
  [6/35] figure_6.png
    ✓ Figure 4: Panorama globale delle colltivazioni per biocarburanti, alimentazione ...
  [7/35] figure_7.png
    ✓ This technical document image in detail for academic research. Focus on:
- Main ...
  [8/35] figure_8.png
    ✓ This technical document image depicts a linear diagram with four interconnected ...
  [9/35] figure_9.png
    ✓ This image is a technical document image from a report titled "Figure 7." Th



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
[OK] 5 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/5] figure_1.png
    ✓ This technical document image is a logo that includes the word "INCIL" in large,...
  [2/5] figure_2.png
    ✓ This technical document image is from a document titled "2025" and is a part of ...
  [3/5] figure_3.png
    ✓ This technical document image is for a company called INCIL. The main content ty...
  [4/5] figure_4.png
    ✓ APPENDICIUS...
  [5/5] figure_5.png
    ✓ The image is a technical document titled “Area Contesto del lavoro”. It is divid...
[VLM] Completato: 5/5

  [+] table_1.txt
  [+] table_2.txt
  [+] table_3.txt
  [+] table_4.txt
  [+] table_5.txt
  [+] table_6.txt
  [+] table_7.txt
  [+] table_8.txt
  [+] table_9.txt
  [+] table_10.txt
  [+] table_11.txt
  [+] table_12.txt
  [+] table_13.txt
  [+] table_14.txt
  [+] table_15.txt
  [+] table_16.txt
  [+] table_17.txt
  [+] table_18.txt
  [+] table_19.txt
  [+] table_20.txt
  [+] table_21.txt
  [+] table_22.txt
  [+] table_23.txt
  [+] table_24.txt
  [+] table_25.txt
  [+] table_26.txt
  [+] 



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
  [+] figure_41.png
  [+] figure_42.png
  [+] figure_43.png
  [+] figure_44.png
  [+] figure_45.png
  [+] figure_46.png
  [+] figure_47.png
  [+] figure_48.png
  [+] figure_49.png
  [+] figure_50.png
  [+] fig



  [1/131] figure_1.png
    ✓ INCail...
  [2/131] figure_2.png
    ✓ This image is a technical document image that appears to be from a document rela...
  [3/131] figure_3.png
    ✓ This technical document image is of a brochure for INCIL, a company that produce...
  [4/131] figure_4.png
    ✓ This technical document image in outline form depicts a figure 1 of a work area ...
  [5/131] figure_5.png
    ✓ This technical document image in Figure 2 depicts a diagram and chart related to...
  [6/131] figure_6.png
    ✓ Figure 3 shows a diagram with several key elements and their relationships. The ...
  [7/131] figure_7.png
    ✓ Figure 4 depicts a scene involving a pedestrian crossing area. The scene is divi...
  [8/131] figure_8.png
    ✓ This figure shows a barricade in silhouette with a horizontal line across the mi...
  [9/131] figure_9.png
    ✓ This technical document image depicts a set of fencing elements, including a rec...
  [10/131] figure_10.png
    ✓ This document depicts a fi



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
[OK] 3 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/3] figure_1.png
    ✓ This technical document image is for academic research. The main content type is...
  [2/3] figure_2.png
    ✓ This image is a technical document image that is designed to be used for academi...
  [3/3] figure_3.png
    ✓ This image is a technical document image that is divided into several sections. ...
[VLM] Completato: 3/3

  [+] table_1.txt
[OK] 4 pag | 1 tab | 3 fig
  ✅ OK

[12/19] Soluzioni basate su modelli AI: chatbot specializz...

SCRAPING PUBBLICAZIONE

[+] Soluzioni basate su modelli AI: chatbot specializzato nel re...
[+] Data: 9/04/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
[OK] 2 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/2] figure_1.png
    ✓ This technical document image is for academic research. The main content type is...
  [2/2] figure_2.png
    ✓ The image is a technical document image. It is a bar chart that shows the values...
[VLM] Completato: 2/2

  [+] table_1.txt
  [+] table_2.txt
  [+] table_3.txt
[OK] 4 pag | 3 tab | 2 fig
  ✅ OK

[13/19] Rischi correlati ad eccesso di confidenza da parte...

SCRAPING PUBBLICAZIONE

[+] Rischi correlati ad eccesso di confidenza da parte degli ope...
[+] Data: 27/03/2025
[INFO] Processing PDF...




  [+] figure_1.png
[OK] 1 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/1] figure_1.png
    ✓ The image is a technical document titled "INCIL" in white, uppercase letters on ...
[VLM] Completato: 1/1

[OK] 2 pag | 0 tab | 1 fig
  ✅ OK

[14/19] Antifungino-resistenza in agricoltura: un allarme ...

SCRAPING PUBBLICAZIONE

[+] Antifungino-resistenza in agricoltura: un allarme globale...
[+] Data: 27/03/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
[OK] 3 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/3] figure_1.png
    ✓ This technical document image is for academic research. The main content type is...
  [2/3] figure_2.png
    ✓ This image is a table that shows the list of fungi that are most common in a lab...
  [3/3] figure_3.png
    ✓ This technical document image is divided into two parts. The top part contains a...
[VLM] Completato: 3/3

  [+] table_1.txt
[OK] 2 pag | 1 tab | 3 fig
  ✅ OK

[15/19] Prodotti fitosanitari ad uso non professionale - C...

SCRAPING PUBBLICAZIONE

[+] Prodotti fitosanitari ad uso non professionale - Cosa sono e...
[+] Data: 7/03/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
  [+] figure_41.png
  [+] figure_42.png
  [+] figure_43.png
[OK] 43 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/43] figure_1.png
    ✓ INCILL is a color-coded label that uses a solid brown background with white, gre...
  [2/43] figure_2.png
    ✓ This technical document image is a diagram, chart, or schematic of a machine or ...
  [3/43] figure_3.png
    ✓ This image is an illustration of a document titled "Alcune normative di settore:...
  [4/43] figure_4.png
    ✓ This image shows a diagram with a green box on the right side. The box contains ...
  [5/43] figure_5.png
    ✓ This is a screenshot of a webpage about the Ministry of Agriculture in Italy....
  [6/43] figure_6.png
    ✓ The image depicts a scene from a cityscape, likely representing a suburban or su...
  [7/43] figure_7.png
    ✓ The image is a technical document titled "PERICOLO PER LA SALUTE" which translat...
  [8/43] figure_8.png
    ✓ This technical document image is a diagram, chart, or schematic depicting a bee ...
  [9/43] figure_9.png
    ✓ The image depicts a scene from a workplace, specifically in a room with a large



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
[OK] 4 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/4] figure_1.png
    ✓ This technical document image is for academic research. The main content type is...
  [2/4] figure_2.png
    ✓ The image is a technical document titled "Ovale R = fraganeo organica" which is ...
  [3/4] figure_3.png
    ✓ This technical document image is a diagram, chart, or schematic of a specific to...
  [4/4] figure_4.png
    ✓ Figure 3: Soignetti sanizionabile e possibili sconcere ai lungi la catena di app...
[VLM] Completato: 4/4

  [+] table_1.txt
[OK] 4 pag | 1 tab | 4 fig
  ✅ OK

[17/19] Esposizione ai gas di scarico dei motori diesel: v...

SCRAPING PUBBLICAZIONE

[+] Esposizione ai gas di scarico dei motori diesel: valore limi...
[+] Data: 25/02/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
[OK] 2 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/2] figure_1.png
    ✓ This technical document image is for academic research. The main content type is...
  [2/2] figure_2.png
    ✓ The document is a bar chart titled "Concentrazione media di carbono elemanate pe...
[VLM] Completato: 2/2

[OK] 4 pag | 0 tab | 2 fig
  ✅ OK

[18/19] Registro Nazionale dei Mesoteliomi: ottavo rapport...

SCRAPING PUBBLICAZIONE

[+] Registro Nazionale dei Mesoteliomi: ottavo rapporto...
[+] Data: 21/02/2025
[INFO] Processing PDF...




  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
  [+] figure_6.png
  [+] figure_7.png
  [+] figure_8.png
  [+] figure_9.png
  [+] figure_10.png
  [+] figure_11.png
  [+] figure_12.png
  [+] figure_13.png
  [+] figure_14.png
  [+] figure_15.png
  [+] figure_16.png
  [+] figure_17.png
  [+] figure_18.png
  [+] figure_19.png
  [+] figure_20.png
  [+] figure_21.png
  [+] figure_22.png
  [+] figure_23.png
  [+] figure_24.png
  [+] figure_25.png
  [+] figure_26.png
  [+] figure_27.png
  [+] figure_28.png
  [+] figure_29.png
  [+] figure_30.png
  [+] figure_31.png
  [+] figure_32.png
  [+] figure_33.png
  [+] figure_34.png
  [+] figure_35.png
  [+] figure_36.png
  [+] figure_37.png
  [+] figure_38.png
  [+] figure_39.png
  [+] figure_40.png
  [+] figure_41.png
  [+] figure_42.png
  [+] figure_43.png
  [+] figure_44.png
  [+] figure_45.png
  [+] figure_46.png
  [+] figure_47.png
  [+] figure_48.png
  [+] figure_49.png
  [+] figure_50.png
  [+] fig



  [1/156] figure_1.png
    ✓ The image is a technical document titled "Ottavo Rapporto 2024" which appears to...
  [2/156] figure_2.png
    ✓ The image is a technical document titled "INCIL" in white text on a blue backgro...
  [3/156] figure_3.png
    ✓ The image is a technical document titled "Dimensione dell'archivio numero di cas...
  [4/156] figure_4.png
    ✓ Figure 2 shows a map of Italy with a specific diagnostic area in the region of R...
  [5/156] figure_5.png
    ✓ The image is a technical document titled "Percentsual di casi esterometologia ce...
  [6/156] figure_6.png
    ✓ Figure 4 shows the percentage of patients with a specific diagnosis per period o...
  [7/156] figure_7.png
    ✓ The image is a technical document image that is divided into four sections. Each...
  [8/156] figure_8.png
    ✓ Figure 6 shows the number of workers employed in the construction industry in It...
  [9/156] figure_9.png
    ✓ The image is a bar chart titled "Percentuale del numero di casi di 



  [+] figure_1.png
  [+] figure_2.png
  [+] figure_3.png
  [+] figure_4.png
  [+] figure_5.png
[OK] 5 immagini esportate

[VLM] Caricamento HuggingFaceTB/SmolVLM-256M-Instruct...




  [1/5] figure_1.png
    ✓ The INCIL 2025 technical document is a rectangular, blue poster with white text ...
  [2/5] figure_2.png
    ✓ This technical document image is a diagram, chart, or photo depicting a process ...
  [3/5] figure_3.png
    ✓ The image is a technical document titled "Ricaria Modo 2: Normalization of a 32 ...
  [4/5] figure_4.png
    ✓ This technical document image in detail for academic research....
  [5/5] figure_5.png
    ✓ This technical document image is a schematic of a car's electrical system. The d...
[VLM] Completato: 5/5

[OK] 2 pag | 0 tab | 5 fig
  ✅ OK

☁️  Backup finale su Drive...

BACKUP SU GOOGLE DRIVE
Origine: inail_scraped_data
Destinazione: /content/drive/MyDrive/INAIL_Thesis_Data

✅ Backup completato!
📦 File totali: 4860
📁 Path Drive: /content/drive/MyDrive/INAIL_Thesis_Data


COMPLETATO: ✅ 19 nuovi | ❌ 0 falliti
📊 Totale documenti ora: 50


SCRAPING TERMINATO
✅ Nuovi documenti: 19
📊 Documenti totali: 50
📁 Output locale: inail_scraped_data
☁️ 