# **OSHA Publications Scraper with Docling + VLM Integration**

In [None]:
import re
import requests
from bs4 import BeautifulSoup
import json
from pathlib import Path
from datetime import datetime
from typing import Dict, Optional, Any, List
import tempfile
import os
import base64
from io import BytesIO

# Installazione dipendenze
!pip install docling pymupdf pillow -q

from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions, PictureDescriptionVlmOptions
from docling_core.types.doc import ImageRefMode

# Per visualizzazione in Colab
from IPython.display import display, HTML, Image as IPImage
import pandas as pd

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.4/251.4 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m164.5/164.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.7/42.7 kB[0m [31m3.9 MB/s[0m eta 

### Global Configuration Settings

This configuration class centralizes all scraper parameters, including directory paths for output files (PDFs, JSON metadata, extracted images and tables), HTTP request settings, and Vision Language Model (VLM) configuration for image description enrichment. We use this setup in order to easily modify parameters without searching the code and maintain consistency across all functions.


In [None]:
class ScraperConfig:
    """Central configuration shared by every scraper function.

    Groups the HTTP request settings, the output directory layout and the
    Vision Language Model (VLM) options so they can be changed in one place.
    """

    # HTTP request settings
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    TIMEOUT = 30  # passed as the ``timeout`` argument of requests.get

    # Output directories, all rooted at OUTPUT_DIR
    OUTPUT_DIR = Path('./osha_scraped_data')
    PDF_DIR = OUTPUT_DIR / 'pdfs'
    JSON_DIR = OUTPUT_DIR / 'json'
    IMAGES_DIR = OUTPUT_DIR / 'images'
    TABLES_DIR = OUTPUT_DIR / 'tables'

    # VLM Configuration
    VLM_ENABLED = True  # enables VLM enrichment of pictures
    VLM_MODEL = "HuggingFaceTB/SmolVLM-256M-Instruct"
    VLM_PROMPT = "Describe this image in detail. Focus on charts, diagrams, tables, and any visual data. Be technical and precise."

    @classmethod
    def setup_directories(cls):
        """Create every output directory if it does not exist yet.

        Missing parent directories are created as well, so the layout keeps
        working when ``OUTPUT_DIR`` is reconfigured to a nested path. Calling
        this method repeatedly is safe.
        """
        output_dirs = (
            cls.OUTPUT_DIR,
            cls.PDF_DIR,
            cls.JSON_DIR,
            cls.IMAGES_DIR,
            cls.TABLES_DIR,
        )
        for directory in output_dirs:
            directory.mkdir(parents=True, exist_ok=True)

### Docling Document Converter Initialization

This function initializes the Docling DocumentConverter with configurable pipeline options for PDF processing. The options chosen are high-resolution image extraction (2x scale) and OCR for scanned documents; optionally, a Vision Language Model (SmolVLM) is activated to automatically generate textual descriptions of the figures and charts found in the documents.


In [None]:
def initialize_docling_converter(enable_vlm: bool = True):
    """Build a Docling ``DocumentConverter`` configured for PDF input.

    Picture image generation (scale 2.0) and OCR are always switched on.
    Picture description through the VLM is added only when both
    ``enable_vlm`` and ``ScraperConfig.VLM_ENABLED`` are true.

    Args:
        enable_vlm: Per-call switch for the VLM picture description step.

    Returns:
        The configured ``DocumentConverter`` instance.
    """
    pdf_options = PdfPipelineOptions(
        generate_picture_images=True,  # keep picture images so they can be exported
        images_scale=2.0,
        do_ocr=True,
    )

    use_vlm = enable_vlm and ScraperConfig.VLM_ENABLED
    if use_vlm:
        print(f"VLM abilitato: {ScraperConfig.VLM_MODEL}")
        # Turn on picture description and point it at the configured model/prompt.
        pdf_options.do_picture_description = True
        pdf_options.picture_description_options = PictureDescriptionVlmOptions(
            repo_id=ScraperConfig.VLM_MODEL,
            prompt=ScraperConfig.VLM_PROMPT,
        )

    pdf_format = PdfFormatOption(pipeline_options=pdf_options)
    return DocumentConverter(format_options={InputFormat.PDF: pdf_format})

### PDF Download Function

This function handles the HTTP download of PDF files from OSHA publication URLs. It uses streaming download (chunk-based) to efficiently handle large files, includes proper HTTP headers to avoid bot detection, implements timeout protection, and provides robust error handling with informative status messages.


In [None]:
def download_pdf(pdf_url: str, output_path: Path) -> bool:
    """Download a PDF to ``output_path`` using a streamed HTTP request.

    The body is written chunk by chunk to a sibling ``.part`` file that is
    renamed to ``output_path`` only after the transfer completed, so an
    interrupted download never leaves a truncated PDF at the destination.

    Args:
        pdf_url: URL of the PDF to fetch.
        output_path: Destination file path.

    Returns:
        True when the file was downloaded, False when the HTTP request failed.
    """
    headers = {'User-Agent': ScraperConfig.USER_AGENT}
    partial_path = output_path.with_name(output_path.name + '.part')

    try:
        # The context manager releases the connection even if the transfer
        # aborts; with stream=True it is otherwise kept open until the body
        # has been fully consumed.
        with requests.get(
            pdf_url,
            headers=headers,
            timeout=ScraperConfig.TIMEOUT,
            stream=True,  # download in chunks instead of loading the whole file in memory
        ) as response:
            response.raise_for_status()

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)

        partial_path.replace(output_path)

    except requests.exceptions.RequestException as e:
        print(f"Errore download PDF: {e}")
        return False

    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        partial_path.unlink(missing_ok=True)

    print(f"PDF scaricato: {output_path.name}")
    return True


### Image and Table Export Functions

These utility functions extract and save images and tables from processed documents as separate files. Images are saved as PNG files in organized subdirectories, while tables are exported as TXT files containing the caption, column names, and full content. Both functions handle multiple extraction methods for compatibility across Docling versions.


In [None]:
# Exports the images trying more than one approach: first doc.iterate_items(), otherwise it iterates over doc.pictures
# The result is one dictionary per image containing path, filename and caption for later reference
# The document itself is passed to get_image() so that the image reference can be resolved

def export_images_from_document(doc, doc_id: str) -> List[Dict[str, Any]]:
    """Save every picture of a Docling document as a PNG file.

    Files are written to ``ScraperConfig.IMAGES_DIR / doc_id``. Pictures are
    taken from ``doc.iterate_items()`` when available (files numbered from 1),
    otherwise from ``doc.pictures`` (files numbered from 0, as before).

    Args:
        doc: Docling document returned by the converter.
        doc_id: Name of the sub-directory that receives the images.

    Returns:
        One dict per saved image with the keys ``path``, ``filename`` and
        ``caption``. Pictures that cannot be rendered are skipped; the list is
        empty when the export fails as a whole.
    """
    images_dir = ScraperConfig.IMAGES_DIR / doc_id
    images_dir.mkdir(parents=True, exist_ok=True)

    exported_images = []

    try:
        from docling_core.types.doc import PictureItem

        if hasattr(doc, 'iterate_items'):
            print("[DEBUG] Usando iterate_items()")

            picture_counter = 0
            for element, _level in doc.iterate_items():
                if not isinstance(element, PictureItem):
                    continue
                picture_counter += 1
                record = _export_single_picture(
                    element, doc, images_dir, picture_counter,
                    "Errore salvataggio figura",
                )
                if record is not None:
                    exported_images.append(record)

        # Fallback: use doc.pictures when iterate_items is not available
        elif hasattr(doc, 'pictures') and doc.pictures:
            print("[DEBUG] Usando doc.pictures con get_image(doc)")

            for idx, picture in enumerate(doc.pictures):
                if not hasattr(picture, 'get_image'):
                    continue
                record = _export_single_picture(
                    picture, doc, images_dir, idx, "Errore figura",
                )
                if record is not None:
                    exported_images.append(record)

        else:
            print("[WARNING] Nessun metodo di iterazione disponibile")

        print(f"[RESULT] {len(exported_images)} immagini esportate in: {images_dir}")

    except Exception as e:
        print(f"[ERROR] Export immagini fallito: {e}")
        import traceback
        traceback.print_exc()

    return exported_images


def _export_single_picture(picture, doc, images_dir, number: int,
                           error_label: str) -> Optional[Dict[str, Any]]:
    """Render one picture to ``figure_<number>.png``; return its record or None."""
    img_filename = f"figure_{number}.png"
    img_path = images_dir / img_filename

    try:
        # The document must be passed so the image reference can be resolved.
        pil_image = picture.get_image(doc)
        if pil_image is None:
            # get_image() may return None when no image data is attached.
            print(f"  [!] {error_label} {number}: immagine non disponibile")
            return None

        pil_image.save(str(img_path), "PNG")
        record = {
            'path': str(img_path),
            'filename': img_filename,
            'caption': _picture_caption(picture, doc),
        }
    except Exception as e:
        print(f"  [!] {error_label} {number}: {e}")
        return None

    print(f"  [+] Salvata: {img_filename}")
    return record


def _picture_caption(picture, doc) -> Any:
    """Return the caption of a picture, or 'N/A' when none can be resolved."""
    # NOTE(review): docling-core pictures are expected to expose
    # caption_text(doc); confirm against the installed version. When that
    # method is missing or fails, the original attribute lookup is used.
    caption_text = getattr(picture, 'caption_text', None)
    if callable(caption_text):
        try:
            text = caption_text(doc)
        except Exception:
            text = None  # best effort: fall through to the attribute lookup
        if text:
            return text
    return getattr(picture, 'caption', 'N/A')


def export_tables_to_files(tables: List[Dict], doc_id: str) -> List[str]:
    """Write each table to ``ScraperConfig.TABLES_DIR / doc_id / <table_id>.txt``.

    Every file holds the caption, the column names (only when the table has a
    non-empty ``potential_columns`` entry) and the table text, encoded as
    UTF-8 so that non-ASCII characters are preserved.

    Args:
        tables: Table dicts with at least ``table_id``, ``caption`` and
            ``text_content``.
        doc_id: Name of the sub-directory that receives the files.

    Returns:
        Paths (as strings) of the files written successfully; tables whose
        write fails are reported and skipped.
    """
    if not tables:
        return []

    target_dir = ScraperConfig.TABLES_DIR / doc_id
    target_dir.mkdir(parents=True, exist_ok=True)

    written_paths = []

    for entry in tables:
        table_id = entry['table_id']
        destination = target_dir / f"{table_id}.txt"

        try:
            with open(destination, 'w', encoding='utf-8') as handle:
                handle.write(f"Caption: {entry['caption']}\n\n")
                if entry.get('potential_columns'):
                    column_names = ', '.join(entry['potential_columns'])
                    handle.write(f"Columns: {column_names}\n\n")
                handle.write("Content:\n")
                handle.write(entry['text_content'])
        except Exception as e:
            print(f" Errore salvataggio {table_id}: {e}")
            continue

        written_paths.append(str(destination))
        print(f" Tabella salvata: {destination.name}")

    if written_paths:
        print(f"{len(written_paths)} tabelle esportate in: {target_dir}")

    return written_paths

### Main PDF Processing with Docling

This is the core processing function that orchestrates the complete document analysis pipeline. It uses Docling to convert PDFs into structured data, extracts text in both Markdown and plain formats, analyzes document structure (headings, tables, figures), exports images and tables as separate files, enriches figures with VLM-generated descriptions when available, and compiles comprehensive metadata including conversion quality scores.


In [None]:
def process_pdf_with_docling(pdf_path: Path, converter: DocumentConverter, doc_id: str) -> Optional[Dict[str, Any]]:
    """Convert a PDF with Docling and collect its content and metadata.

    Steps, in order: conversion, Markdown and plain-text export, structure
    analysis, image export, optional VLM enrichment of the figures
    (when ``ScraperConfig.VLM_ENABLED``), table export, quality scores.

    Args:
        pdf_path: Local path of the PDF.
        converter: Configured Docling converter.
        doc_id: Identifier used for the image/table output sub-directories.

    Returns:
        A dict with the exported text, counts, structure lists, exported file
        paths, stats and conversion quality, or None when any step raises.
    """
    try:
        print(f"Processing con Docling: {pdf_path.name}...")

        # Entry point of Docling: everything else works on the resulting document.
        conversion = converter.convert(str(pdf_path))
        document = conversion.document

        md_text = document.export_to_markdown()
        raw_text = document.export_to_text()

        layout = analyze_document_structure(document)
        image_records = export_images_from_document(document, doc_id)

        if ScraperConfig.VLM_ENABLED:
            layout['figures'] = enrich_figures_with_vlm_annotations(
                document,
                layout['figures']
            )

        table_files = export_tables_to_files(layout['tables'], doc_id)

        page_count = len(document.pages) if hasattr(document, 'pages') else 'N/A'
        has_structure = layout['num_tables'] > 0 or layout['num_figures'] > 0

        document_data = {
            'markdown_content': md_text,
            'plain_text': raw_text,

            'num_pages': page_count,
            'num_tables': layout['num_tables'],
            'num_figures': layout['num_figures'],
            'num_headings': layout['num_headings'],

            'headings': layout['headings'],
            'tables': layout['tables'],
            'figures': layout['figures'],

            'exported_images': image_records,
            'exported_tables': table_files,

            'stats': {
                'total_chars': len(raw_text),
                'total_chars_markdown': len(md_text),
                'has_structured_content': has_structure
            },

            'conversion_quality': get_conversion_quality(conversion)
        }

        summary_lines = [
            "Docling processing completato:",
            f"  - {document_data['num_pages']} pagine",
            f"  - {document_data['num_tables']} tabelle",
            f"  - {document_data['num_figures']} figure",
            f"  - {document_data['num_headings']} headings",
            f"  - {len(image_records)} immagini esportate",
            f"  - {len(table_files)} tabelle esportate",
        ]
        for summary_line in summary_lines:
            print(summary_line)

        return document_data

    except Exception as e:
        print(f"Errore processing Docling: {e}")
        import traceback
        traceback.print_exc()
        return None


def enrich_figures_with_vlm_annotations(doc, figures_metadata: List[Dict]) -> List[Dict]:
    """Attach the descriptions generated during conversion to the figures.

    The VLM is not called here: the function only reads description
    annotations already stored on the document. Descriptions are collected
    once, one slot per picture, and the i-th description is given to the i-th
    figure, so each figure gets its own text instead of the last description
    found in the document.

    Args:
        doc: Docling document returned by the converter.
        figures_metadata: Figure dicts (each with a ``figure_id`` key).

    Returns:
        Copies of the figure dicts; those with a description gain
        ``vlm_description`` and ``vlm_model``. On any error the input list is
        returned unchanged.
    """
    try:
        descriptions = _collect_vlm_descriptions(doc)

        enriched_figures = []
        for idx, fig_meta in enumerate(figures_metadata):
            enriched_fig = fig_meta.copy()

            # assumes figures_metadata follows the same order as the document
            # pictures -- TODO confirm against the structure analysis output
            description = descriptions[idx] if idx < len(descriptions) else None
            if description:
                enriched_fig['vlm_description'] = description
                enriched_fig['vlm_model'] = ScraperConfig.VLM_MODEL
                print(f" Descrizione VLM trovata per {fig_meta['figure_id']}")

            enriched_figures.append(enriched_fig)

        return enriched_figures

    except Exception as e:
        print(f"Avviso durante enrichment VLM: {e}")
        return figures_metadata


def _collect_vlm_descriptions(doc) -> List[Optional[str]]:
    """Return one description (or None) per picture, in document order."""
    pictures = getattr(doc, 'pictures', None)
    if pictures:
        # Keep the None slots so positions stay aligned with the pictures.
        return [_first_vlm_description(picture) for picture in pictures]

    # Legacy layout: any main_text item may carry a description annotation.
    legacy_items = getattr(doc, 'main_text', None) or []
    candidates = (_first_vlm_description(item) for item in legacy_items)
    return [text for text in candidates if text]


def _first_vlm_description(item) -> Optional[str]:
    """Return the first non-empty description annotation text of an item."""
    for annotation in getattr(item, 'annotations', None) or []:
        # Matched by class name so no specific Docling class has to be imported.
        if 'Description' not in type(annotation).__name__:
            continue
        text = getattr(annotation, 'text', None) or getattr(annotation, 'description', None)
        if text:
            return text
    return None


### Document Structure Analysis and Metadata Extraction

These utility functions analyze the internal structure of Docling documents to identify and extract headings, tables, and figures. The system implements a multi-tier fallback strategy: first attempting to parse Docling's internal representation, then falling back to regex-based Markdown parsing if needed. Additional helper functions extract detailed metadata from tables (columns, captions) and figures, and retrieve conversion quality scores when available.


In [None]:
# Main function: tries to extract the document structure through Docling's internal API
def analyze_document_structure(doc) -> Dict[str, Any]:
    """Count and describe the tables, figures and headings of a document.

    Items are read from ``doc.main_text`` or, when that attribute is missing,
    from ``doc.body.children``. Each item is classified by keywords found in
    its ``label`` (or its class name when it has no label). The Markdown-based
    extraction is used instead when no items are accessible, when nothing is
    recognised, or when the analysis raises.

    Args:
        doc: Docling document returned by the converter.

    Returns:
        A dict with the counters ``num_tables``, ``num_figures``,
        ``num_headings`` and the lists ``tables``, ``figures``, ``headings``.
    """
    structure = {
        'num_tables': 0, 'num_figures': 0, 'num_headings': 0,
        'headings': [], 'tables': [], 'figures': []
    }

    try:
        # Cascade of attempts: main_text first, then body.children.
        items = None
        if hasattr(doc, 'main_text'):
            items = doc.main_text
        elif hasattr(doc, 'body') and hasattr(doc.body, 'children'):
            items = doc.body.children

        # Fallback when the internal structure is not accessible or empty.
        if items is None or (hasattr(items, '__len__') and len(items) == 0):
            print("Struttura interna non disponibile, usando metodo alternativo")
            return extract_structure_from_markdown(doc)

        items_processed = 0
        for idx, item in enumerate(items):
            items_processed += 1
            item_label = getattr(item, 'label', None) or type(item).__name__
            item_label_lower = str(item_label).lower()
            # A text attribute set to None would make the slice below raise.
            item_text = getattr(item, 'text', '') or ''

            # Each list is extended before its counter is bumped, so the
            # counters never run ahead of the lists if extraction raises.
            if 'table' in item_label_lower:
                structure['tables'].append(extract_table_metadata(item, idx))
                structure['num_tables'] += 1
            elif any(kw in item_label_lower for kw in ['picture', 'figure', 'image']):
                structure['figures'].append(extract_figure_metadata(item, idx))
                structure['num_figures'] += 1
            elif any(kw in item_label_lower for kw in ['heading', 'title', 'section']):
                structure['headings'].append({
                    'type': item_label, 'text': item_text[:200], 'position': idx
                })
                structure['num_headings'] += 1

        # Nothing recognised: analyse the Markdown export instead.
        if all(structure[k] == 0 for k in ['num_tables', 'num_figures', 'num_headings']):
            print(f"Nessun elemento trovato in {items_processed} items, fallback markdown")
            return extract_structure_from_markdown(doc)

    except Exception as e:
        print(f"Errore analisi struttura: {e}")
        try:
            return extract_structure_from_markdown(doc)
        except Exception as fallback_error:
            # Report the failure and return whatever was collected so far.
            print(f"Errore fallback markdown: {fallback_error}")

    return structure


def extract_structure_from_markdown(doc) -> Dict[str, Any]:
    """Fallback: derive headings, tables and figures from the Markdown export.

    Recognition rules (line based):
      * heading: the stripped line starts with ``#``; the level is the number
        of leading ``#`` characters;
      * table: starts at a line containing ``|`` together with ``-`` or ``─``
        (the separator row) and lasts while lines contain ``|``; a directly
        preceding line containing ``|`` is kept as the header row;
      * figure: a line containing ``<!-- image -->`` or starting with ``![``
        and containing ``](``.

    Args:
        doc: Object exposing ``export_to_markdown()``.

    Returns:
        A dict with the counters ``num_tables``, ``num_figures``,
        ``num_headings`` and the lists ``tables``, ``figures``, ``headings``.
        Errors are reported and the partial result is returned.
    """
    structure = {
        'num_tables': 0, 'num_figures': 0, 'num_headings': 0,
        'headings': [], 'tables': [], 'figures': []
    }

    try:
        markdown = doc.export_to_markdown()
        lines = markdown.split('\n')
        in_table = False
        current_table_lines = []
        table_start_idx = -1

        for i, line in enumerate(lines):
            stripped = line.strip()

            # Headings: measured on the stripped line so that indentation
            # does not yield level 0 with the '#' left inside the text.
            if stripped.startswith('#'):
                heading_body = stripped.lstrip('#')
                level = len(stripped) - len(heading_body)
                structure['num_headings'] += 1
                structure['headings'].append({
                    'type': f'heading_level_{level}',
                    'text': heading_body.strip()[:200],
                    'position': i,
                    'level': level
                })

            # Tables: a row with | and a separator character opens a table.
            if not in_table and '|' in line and ('-' in line or '─' in line):
                in_table = True
                table_start_idx = i
                current_table_lines = []
                # The trigger is normally the separator row, so the header
                # row sits on the previous line and would otherwise be lost.
                if i > 0 and '|' in lines[i - 1] and not lines[i - 1].strip().startswith('#'):
                    table_start_idx = i - 1
                    current_table_lines.append(lines[i - 1])

            # Accumulate rows until a line that is not part of the table.
            if in_table:
                if '|' in line:
                    current_table_lines.append(line)
                else:
                    if current_table_lines:
                        structure['num_tables'] += 1
                        structure['tables'].append({
                            'table_id': f'table_{structure["num_tables"]-1}',
                            'caption': _find_table_caption(lines, table_start_idx),
                            'text_content': '\n'.join(current_table_lines),
                            'num_rows': len(current_table_lines),
                            'position': table_start_idx
                        })
                    in_table = False
                    current_table_lines = []

            # Figures (<!-- image --> or ![alt](url))
            if '<!-- image -->' in line.lower() or (stripped.startswith('![') and '](' in line):
                structure['num_figures'] += 1
                caption = 'N/A'
                # The caption is the first non-empty, non-heading line among
                # the 3 lines following the figure.
                for j in range(i + 1, min(len(lines), i + 4)):
                    candidate = lines[j].strip()
                    if candidate and not candidate.startswith('#'):
                        caption = candidate
                        break
                structure['figures'].append({
                    'figure_id': f'figure_{structure["num_figures"]-1}',
                    'caption': caption[:200],
                    'position': i
                })

        # A table still open when the document ends is never closed by the
        # loop, so it is stored here.
        if in_table and current_table_lines:
            structure['num_tables'] += 1
            structure['tables'].append({
                'table_id': f'table_{structure["num_tables"]-1}',
                'caption': 'End of document table',
                'text_content': '\n'.join(current_table_lines),
                'num_rows': len(current_table_lines),
                'position': table_start_idx
            })

        print(f"Struttura da Markdown: {structure['num_headings']} headings, {structure['num_tables']} tabelle, {structure['num_figures']} figure")

    except Exception as e:
        print(f"Errore fallback markdown: {e}")

    return structure


def _find_table_caption(lines: List[str], table_start_idx: int) -> str:
    """Return the nearest non-empty, non-heading line in the 3 lines above a table."""
    # Scan upwards so the line closest to the table wins over earlier text.
    for j in range(table_start_idx - 1, max(0, table_start_idx - 3) - 1, -1):
        candidate = lines[j].strip()
        if candidate and not candidate.startswith('#'):
            return candidate
    return 'N/A'

# Extracts detailed metadata from a Docling table item
def extract_table_metadata(table_item, index: int) -> Dict[str, Any]:
    """Build the metadata dict of one table item.

    Args:
        table_item: Item whose ``caption``, ``text`` and (optionally)
            ``export_to_markdown`` attributes are read.
        index: Position of the item, used to build ``table_id``.

    Returns:
        A dict with ``table_id``, ``caption`` and ``text_content``, plus
        ``markdown_content`` (when the item can export Markdown),
        ``potential_columns`` and ``num_rows_estimated``. The derived keys
        may be missing when their extraction fails.
    """
    table_data = {
        'table_id': f"table_{index}",
        'caption': getattr(table_item, 'caption', 'N/A'),
        # A text attribute set to None is normalised to '' so that the split
        # below and the later file export always receive a string.
        'text_content': getattr(table_item, 'text', '') or '',
    }
    try:
        if hasattr(table_item, 'export_to_markdown'):
            # Direct Markdown export of the table, used when the item offers it.
            table_data['markdown_content'] = table_item.export_to_markdown()
        text_lines = table_data['text_content'].split('\n')
        # Assumes the first row holds the column names (Markdown convention).
        table_data['potential_columns'] = [c.strip() for c in text_lines[0].split('|') if c.strip()]
        table_data['num_rows_estimated'] = len(text_lines)
    except Exception as e:
        print(f"Errore metadati tabella {index}: {e}")
    return table_data

# Builds and returns the metadata dictionary of a single figure
def extract_figure_metadata(figure_item, index: int) -> Dict[str, Any]:
    """Describe one figure item.

    Args:
        figure_item: Item whose ``caption`` and ``text`` attributes are read
            ('N/A' is used for a missing attribute).
        index: Position of the item, used to build ``figure_id``.

    Returns:
        A dict with ``figure_id``, ``caption``, ``alt_text`` and the
        placeholders ``vlm_description`` and ``image_path`` set to None.
    """
    caption_value = getattr(figure_item, 'caption', 'N/A')
    alt_value = getattr(figure_item, 'text', 'N/A')

    return dict(
        figure_id=f"figure_{index}",
        caption=caption_value,
        alt_text=alt_value,
        vlm_description=None,
        image_path=None,
    )

# Extracts the quality scores introduced in newer Docling versions
def get_conversion_quality(result) -> Dict[str, Any]:
    """Summarise the confidence scores attached to a conversion result.

    Args:
        result: Conversion result; its ``confidence`` attribute is read when
            present and truthy.

    Returns:
        A dict that always holds ``status`` ('completed') and
        ``has_confidence_scores``. When confidence data exists it also holds
        ``mean_grade``, ``low_grade``, ``layout_score`` and ``ocr_score`` as
        strings ('N/A' for a missing attribute).
    """
    # Even if everything below fails, the caller still gets the status.
    summary = {'status': 'completed', 'has_confidence_scores': False}
    score_names = ('mean_grade', 'low_grade', 'layout_score', 'ocr_score')

    try:
        if hasattr(result, 'confidence') and result.confidence:
            summary['has_confidence_scores'] = True
            for score_name in score_names:
                summary[score_name] = str(getattr(result.confidence, score_name, 'N/A'))
    except Exception as e:
        print(f" Errore quality scores: {e}")

    return summary


Up to this point, the work has been set up with 3 levels of fallback which ensure that it always functions, even with different Docling versions or unusual documents. This guarantees we extract the maximum information possible: It extracts everything—text, metadata, coordinates (positions), and captions. Furthermore, we have complete error handling—it never crashes, providing partial data at worst.

### Main Scraping Function

This is the top-level function that coordinates the complete scraping and processing workflow for OSHA publications. It manages three distinct phases:


1. Web scraping to extract metadata from the publication webpage using BeautifulSoup with custom CSS selectors for OSHA's specific HTML structure
2. PDF download and Docling-based document processing with VLM enrichment.
3. Data consolidation into a comprehensive JSON structure.

The function implements robust error handling, returns structured metadata including scraping timestamp and version tracking, and provides boolean flags for workflow control. Optional PDF retention allows for disk space management in large-scale batch operations.


In [None]:
def scrape_osha_publication(
    publication_url: str,
    converter: DocumentConverter,
    save_pdf: bool = True
) -> Dict[str, Any]:
    """Scrape one publication page and process its PDF with Docling.

    Phases: (1) download the page and read title, keywords, description,
    date and PDF link through the page-specific selectors; (2) download the
    PDF and process it with Docling; (3) assemble everything in one dict.

    Args:
        publication_url: URL of the publication page.
        converter: Configured Docling converter.
        save_pdf: When False the downloaded PDF is deleted after processing.

    Returns:
        On success a dict with ``scraping_metadata``, ``web_metadata``,
        ``document_content``, ``status`` ('success'), ``has_pdf`` and
        ``pdf_processed``. On any error a dict with ``scraping_metadata``,
        ``status`` ('error') and ``error_message``.
    """
    # Local import: keeps this function self-contained without touching the
    # notebook's import cell.
    from urllib.parse import urljoin

    print(f"\n{'='*80}")   # visual separator
    print(f"Scraping pubblicazione: {publication_url}")
    print(f"{'='*80}\n")

    scraping_metadata = {
        'url': publication_url,
        'timestamp': datetime.now().isoformat(),
        'scraper_version': '3.0',
        'docling_used': True,
        'vlm_enabled': ScraperConfig.VLM_ENABLED
    }

    try:
        headers = {'User-Agent': ScraperConfig.USER_AGENT}
        response = requests.get(publication_url, headers=headers, timeout=ScraperConfig.TIMEOUT)
        response.raise_for_status()

        page = BeautifulSoup(response.content, 'lxml')

        print("Estrazione metadati dalla pagina web...")

        title_elem = page.find('h1')
        title = title_elem.text.strip() if title_elem else 'N/A'

        # Blank lines between list entries would otherwise become empty keywords.
        keywords_elem = page.find('ul', class_='field__items')
        keywords = (
            [kw.strip() for kw in keywords_elem.text.split('\n') if kw.strip()]
            if keywords_elem else []
        )

        descr_blocks = page.find_all(id=re.compile('^tmgmt-'))
        description = ' '.join(p.get_text(strip=True, separator=' ') for p in descr_blocks)
        description = re.sub(r'\s+', ' ', description.replace('\xa0', ' ')).strip()

        date_elem = page.find('p', class_='datetime-style')
        publication_date = date_elem.text.strip() if date_elem else 'N/A'

        link_pdf_elem = page.find('div', class_='download-pdf')
        pdf_anchor = link_pdf_elem.find('a') if link_pdf_elem else None
        # .get() avoids a KeyError (and a failed scrape) on an anchor without href.
        pdf_href = pdf_anchor.get('href') if pdf_anchor else None
        # A relative href is resolved against the final page URL; an absolute
        # one is returned unchanged by urljoin.
        pdf_url = urljoin(response.url or publication_url, pdf_href) if pdf_href else None

        print(f"Titolo: {title}")
        print(f"Data: {publication_date}")
        print(f"Keywords: {', '.join(keywords)}")

        pdf_data = None
        pdf_local_path = None

        # The PDF is processed only when the page offers one.
        if pdf_url:
            # doc_id is built first so that a '.pdf' inside the title cannot
            # be stripped from it by mistake.
            doc_id = re.sub(r'[^\w\-_.]', '_', title[:50])
            safe_filename = doc_id + '.pdf'
            pdf_local_path = ScraperConfig.PDF_DIR / safe_filename

            if download_pdf(pdf_url, pdf_local_path):
                pdf_data = process_pdf_with_docling(pdf_local_path, converter, doc_id)

                if not save_pdf:
                    # Optional clean-up right after processing, to limit disk
                    # usage when many documents are handled in a batch.
                    pdf_local_path.unlink()
                    pdf_local_path = None
        else:
            print("Nessun PDF disponibile per questa pubblicazione")

        publication_data = {
            'scraping_metadata': scraping_metadata,

            'web_metadata': {
                'title': title,
                'publication_date': publication_date,
                'keywords': keywords,
                'description': description,
                'pdf_url': pdf_url,
                'pdf_local_path': str(pdf_local_path) if pdf_local_path else None
            },

            'document_content': pdf_data if pdf_data else None,

            'status': 'success',
            'has_pdf': pdf_url is not None,
            'pdf_processed': pdf_data is not None
        }

        print(f"\n Scraping completato con successo!")
        return publication_data

    except Exception as e:
        print(f"\n Errore durante lo scraping: {e}")
        return {
            'scraping_metadata': scraping_metadata,
            'status': 'error',
            'error_message': str(e)
        }

### Saving and Visualization Functions

These utility functions ensure consistent data storage and reproducible presentation of results within Jupyter or Colab environments.
The save_to_json function serializes the full scraping output into a structured JSON file. Filenames are automatically generated using the publication title and a timestamp, while UTF-8 encoding guarantees compatibility with multilingual text. The resulting files are indented for improved readability and downstream inspection.

The display_results_in_colab function generates an interactive HTML summary designed to facilitate exploratory analysis and quality assessment. It presents key processing statistics, a hierarchical overview of the document structure, extracted tables and figures accompanied by VLM-generated descriptions, a Markdown rendering of the content, and organized export options for all derived files.

This combination of machine-readable storage (JSON) and human-readable visualization (HTML) supports both automated integration in data workflows and transparent evaluation of extraction outcomes.

In [None]:
def save_to_json(data: Dict[str, Any], output_name: Optional[str] = None) -> Path:
    """Serialise ``data`` to a JSON file inside ``ScraperConfig.JSON_DIR``.

    Args:
        data: Scraping result to store.
        output_name: File name to use. When None it is built from the first
            50 characters of ``data['web_metadata']['title']`` ('unknown' if
            absent), sanitised, plus a ``YYYYmmdd_HHMMSS`` timestamp.

    Returns:
        Path of the written file.
    """
    if output_name is None:
        # Safe access to the nested dictionary holding the web page metadata.
        title = data.get('web_metadata', {}).get('title', 'unknown')
        safe_title = re.sub(r'[^\w\-_.]', '_', title[:50])
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        output_name = f"{safe_title}_{timestamp}.json"

    output_path = ScraperConfig.JSON_DIR / output_name
    # Create the target directory on demand, like the image/table exporters
    # do, so saving also works when setup_directories() was never called.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable;
        # indent=2 pretty-prints the output.
        json.dump(data, f, ensure_ascii=False, indent=2)

    print(f"Dati salvati in: {output_path}")
    return output_path


def display_results_in_colab(result: Dict[str, Any], json_path: Path):
    """Show a summary of one scraping result in the notebook output.

    Renders an HTML panel with the JSON file path and, when the result is a
    success with document content, a second HTML panel with the main
    statistics followed by plain-text listings of headings (first 10), tables,
    figures, an 800-character Markdown preview and the exported files.

    Args:
        result: Scraping result dictionary (reads 'status', 'document_content'
            and 'web_metadata').
        json_path: Path of the JSON file generated for this result.
    """
    # Local import so the cell does not depend on a file-level import of html.
    from html import escape

    def esc(value: Any) -> str:
        # Scraped text and paths go into raw HTML: escape them so stray
        # markup in a title/keyword cannot break or inject into the panel.
        return escape(str(value))

    print(f"\n{'='*80}")
    print("VISUALIZZAZIONE RISULTATI")
    print(f"{'='*80}\n")

    # 1) Where to find the generated JSON file
    display(HTML(f"""
    <div style="padding: 20px; background-color: #f0f0f0; border-radius: 10px; margin: 10px 0;">
        <h3>File JSON Generato</h3>
        <p><strong>Path:</strong> <code>{esc(json_path)}</code></p>
        <p>Per scaricare: Click destro sul file nel pannello Files → Download</p>
    </div>
    """))

    # 2) Main information (only for successful results that carry content)
    if result.get('status') == 'success' and result.get('document_content'):
        doc_content = result['document_content']
        web_meta = result['web_metadata']

        display(HTML(f"""
        <div style="padding: 20px; background-color: #e8f5e9; border-radius: 10px; margin: 10px 0;">
            <h3>Documento Processato con Successo</h3>
            <p><strong>Titolo:</strong> {esc(web_meta['title'])}</p>
            <p><strong>Data:</strong> {esc(web_meta['publication_date'])}</p>
            <p><strong>Keywords:</strong> {esc(', '.join(web_meta['keywords']))}</p>
            <hr>
            <p><strong>Pagine:</strong> {doc_content['num_pages']}</p>
            <p><strong>Caratteri:</strong> {doc_content['stats']['total_chars']:,}</p>
            <p><strong>Headings:</strong> {doc_content['num_headings']}</p>
            <p><strong>Tabelle:</strong> {doc_content['num_tables']}</p>
            <p><strong>Figure:</strong> {doc_content['num_figures']}</p>
            <p><strong>Immagini esportate:</strong> {len(doc_content.get('exported_images', []))}</p>
            <p><strong>Tabelle esportate:</strong> {len(doc_content.get('exported_tables', []))}</p>
        </div>
        """))

        # 3) Document outline: first 10 headings, indented by level
        if doc_content['headings']:
            print("\n Struttura del Documento (Headings):\n")
            for h in doc_content['headings'][:10]:
                level = h.get('level', 2)
                indent = "  " * (level - 1)
                print(f"{indent}{'#' * level} {h['text'][:80]}")

        # 4) Tables with caption and up to 5 candidate column names
        if doc_content['tables']:
            print(f"\n Tabelle Trovate ({len(doc_content['tables'])}):\n")
            for table in doc_content['tables']:
                print(f"  • {table['table_id']}: {table['caption']}")
                if table.get('potential_columns'):
                    print(f"    Colonne: {', '.join(table['potential_columns'][:5])}")

        # 5) Figures with the first 150 characters of any VLM description
        if doc_content['figures']:
            print(f"\n Figure Trovate ({len(doc_content['figures'])}):\n")
            for fig in doc_content['figures']:
                print(f"  • {fig['figure_id']}: {fig['caption']}")
                if fig.get('vlm_description'):
                    print(f"  VLM: {fig['vlm_description'][:150]}...")
                    print()

        # 6) Markdown preview
        print(f"\n{'='*80}")
        print("ANTEPRIMA MARKDOWN (prime 800 caratteri)")
        print(f"{'='*80}\n")
        print(doc_content['markdown_content'][:800])
        print("\n[...]\n")

        # 7) Exported files (first 5 images, all tables)
        if doc_content.get('exported_images') or doc_content.get('exported_tables'):
            print(f"\n{'='*80}")
            print("FILE ESPORTATI")
            print(f"{'='*80}\n")

            if doc_content.get('exported_images'):
                print(f"Immagini ({len(doc_content['exported_images'])}):")
                for img in doc_content['exported_images'][:5]:
                    print(f"  - {img.get('filename', img.get('path', 'N/A'))}")

            if doc_content.get('exported_tables'):
                print(f"\n Tabelle ({len(doc_content['exported_tables'])}):")
                for table_path in doc_content['exported_tables']:
                    print(f"  - {Path(table_path).name}")

### Main Execution Block and Usage Example

This is the entry point of the scraping system, demonstrating a complete end-to-end workflow execution.

In [None]:
if __name__ == "__main__":
    # Create the output directories
    ScraperConfig.setup_directories()

    # Initialize the Docling converter with VLM enabled
    converter = initialize_docling_converter(enable_vlm=True)

    # Test URL
    test_url = 'https://osha.europa.eu/it/publications/health-and-social-care-workers-free-musculoskeletal-disorders-awareness-campaign'
    # Alternative test URL: https://osha.europa.eu/it/publications/tms-pros-programme-supporting-msds-prevention-health-and-social-care-sector
    # Run the scraping for the single test publication
    result = scrape_osha_publication(
        publication_url=test_url,
        converter=converter,
        save_pdf=True
    )

    # Save the result as JSON
    json_path = save_to_json(result)

    # Show the result summary in the notebook output
    display_results_in_colab(result, json_path)

    print(f"\n{'='*80}")
    print("PROCESSO COMPLETATO!")
    print(f"{'='*80}")
    print(f"\n Trova tutti i file in:")
    print(f"  - JSON: {ScraperConfig.JSON_DIR}")
    print(f"  - PDF: {ScraperConfig.PDF_DIR}")
    print(f"  - Immagini: {ScraperConfig.IMAGES_DIR}")
    print(f"  - Tabelle: {ScraperConfig.TABLES_DIR}")
    print(f"\n Usa il pannello Files (sinistra) per navigare e scaricare i file")

VLM abilitato: HuggingFaceTB/SmolVLM-256M-Instruct

Scraping pubblicazione: https://osha.europa.eu/it/publications/health-and-social-care-workers-free-musculoskeletal-disorders-awareness-campaign

Estrazione metadati dalla pagina web...
Titolo: «Per lavoratori del settore dell’assistenza socio-sanitaria senza disturbi muscolo-scheletrici»: una campagna di sensibilizzazione
Data: 03/10/2025
Keywords: Assistenza sociosanitaria, Disturbi muscoloscheletrici
PDF scaricato: _Per_lavoratori_del_settore_dell_assistenza_socio-.pdf
Processing con Docling: _Per_lavoratori_del_settore_dell_assistenza_socio-.pdf...


[32m[INFO] 2025-10-22 16:25:35,344 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-10-22 16:25:35,385 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-10-22 16:25:35,386 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-10-22 16:25:35,595 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-10-22 16:25:35,599 [RapidOCR] download_file.py:60: File exists and is valid: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2025-10-22 16:25:35,600 [RapidOCR] torch.py:54: Using /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_ptocr_mobile_v2.0_cls_infer.pth[0m
[32m[INFO] 2025-10-22 16:25:35,688 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-10-22 16:25:35,763 [RapidOCR] downloa

Nessun elemento trovato in 139 items, fallback markdown
Struttura da Markdown: 24 headings, 1 tabelle, 8 figure
[DEBUG] Usando iterate_items()
  [+] Salvata: figure_1.png
  [+] Salvata: figure_2.png
  [+] Salvata: figure_3.png
  [+] Salvata: figure_4.png
  [+] Salvata: figure_5.png
  [+] Salvata: figure_6.png
  [+] Salvata: figure_7.png
  [+] Salvata: figure_8.png
[RESULT] 8 immagini esportate in: osha_scraped_data/images/_Per_lavoratori_del_settore_dell_assistenza_socio-
 Tabella salvata: table_0.txt
1 tabelle esportate in: osha_scraped_data/tables/_Per_lavoratori_del_settore_dell_assistenza_socio-
Docling processing completato:
  - 8 pagine
  - 1 tabelle
  - 8 figure
  - 24 headings
  - 8 immagini esportate
  - 1 tabelle esportate

 Scraping completato con successo!
Dati salvati in: osha_scraped_data/json/_Per_lavoratori_del_settore_dell_assistenza_socio-_20251022_163453.json

VISUALIZZAZIONE RISULTATI




 Struttura del Documento (Headings):

  ## 'FOR HEALTH AND SOCIAL CARE WORKERS FREE FROM MUSCULOSKELETAL DISORDERS' - AN AW
  ## Introduction
  ## Description of the case
  ## Aim
  ## What was done and how
  ## Preparation of the plan
  ## Focus on the residential care sector
  ## Campaign material 14
  ##  Videos
  ##  Posters

 Tabelle Trovate (1):

  • table_0: 20 Impressions are the number of times a post is seen on the platform.

 Figure Trovate (8):

  • figure_0: N/A
  • figure_1: N/A
  • figure_2: <!-- image -->
  • figure_3: <!-- image -->
  • figure_4: These posters were available in PDF and in paper format. In the case of the latter, they were made available to their networks by the various members of the CNSST working group.
  • figure_5: N/A
  • figure_6: N/A
  • figure_7: N/A

ANTEPRIMA MARKDOWN (prime 800 caratteri)

<!-- image -->

## 'FOR HEALTH AND SOCIAL CARE WORKERS FREE FROM MUSCULOSKELETAL DISORDERS' - AN AWARENESS CAMPAIGN

## Introduction

Work-related muscu

### Keyword-Based Publication Discovery

Builds OSHA search URLs from user queries and extracts publication links across multiple result pages. Implements anti-detection measures (realistic headers, progressive delays, session cookies) and dual parsing strategies to reliably collect unique publication URLs while avoiding rate limiting. Includes automatic pagination handling and duplicate removal.


In [None]:
from urllib.parse import quote_plus
import unicodedata
import time
import json

def build_osha_search_url(query: str, lang: str, sort: str = "field_publication_date") -> str:
    """Build the OSHA publications search URL for a free-text query.

    The query is NFKC-normalized, trimmed, lower-cased, has runs of whitespace
    collapsed to single spaces and typographic apostrophes folded to ASCII
    before being URL-encoded.

    Args:
        query: Keyword or phrase to search for; must be a non-empty string.
        lang: Site language, either ``"it"`` or ``"en"``.
        sort: Value for the ``sort_by`` query parameter.

    Returns:
        The full search URL.

    Raises:
        ValueError: If ``query`` is not a non-empty string or ``lang`` is not
            one of the supported languages.
    """
    if not isinstance(query, str) or not query.strip():
        raise ValueError("La query deve essere una stringa non vuota.")
    if lang not in {"it", "en"}:
        raise ValueError("Il parametro 'lang' deve essere 'it' oppure 'en'.")

    # Normalization: unicode form, trim, lower-case, collapse whitespace.
    q = unicodedata.normalize("NFKC", query).strip().lower()
    q = " ".join(q.split())
    # NFKC leaves curly quotes untouched, so map U+2018/U+2019 to "'" explicitly
    # (the previous replace("'", "'") was a no-op).
    q = q.translate(str.maketrans({"\u2018": "'", "\u2019": "'"}))

    encoded_query = quote_plus(q)
    base_url = f"https://osha.europa.eu/{lang}/publications"
    return f"{base_url}?search_api_fulltext={encoded_query}&sort_by={sort}"


# Matches hrefs that end in /<lang>/publications/<slug> (publication pages only).
_PUBLICATION_HREF_RE = re.compile(r'/(en|it)/publications/[a-z0-9-]+$')


def _absolute_osha_url(href: str) -> str:
    """Resolve *href* against the OSHA site root; absolute URLs pass through."""
    from urllib.parse import urljoin  # local import: not imported at file level

    return urljoin("https://osha.europa.eu/", href)


def _collect_page_publications(soup, seen_urls: set):
    """Extract unseen publications from one parsed result page.

    Tries ``<article class="node--type-publication">`` elements first and, only
    if none exist, falls back to anchors whose href looks like a publication
    URL. URLs of accepted entries are added to *seen_urls*.

    Returns:
        ``(results_found, page_pubs)`` where ``results_found`` tells whether
        either strategy matched anything on the page (even if every match was
        a duplicate) and ``page_pubs`` is the list of new ``{'title', 'url'}``.
    """
    page_pubs = []

    # STRATEGY 1: article elements with class node--type-publication
    articles = soup.find_all('article', class_='node--type-publication')
    if articles:
        print(f"  [+] Metodo 1: Trovati {len(articles)} articles")
        for article in articles:
            heading = article.find('h2', class_='node__title') or article.find('h2')
            if not heading:
                continue
            a_tag = heading.find('a')
            if not (a_tag and a_tag.get('href')):
                continue
            full_url = _absolute_osha_url(a_tag['href'])
            if full_url in seen_urls:
                continue  # duplicate
            page_pubs.append({
                'title': a_tag.text.strip(),
                'url': full_url
            })
            seen_urls.add(full_url)
        return True, page_pubs

    # STRATEGY 2: direct search for publication-looking links
    all_links = soup.find_all('a', href=_PUBLICATION_HREF_RE)
    if not all_links:
        return False, page_pubs

    print(f"  [+] Metodo 2: Trovati {len(all_links)} link")
    for a_tag in all_links:
        full_url = _absolute_osha_url(a_tag['href'])
        if full_url in seen_urls:
            continue  # duplicate

        # Use the link text as title; if it is too short, look for a heading
        # in the closest enclosing article/div.
        title = a_tag.text.strip()
        if not title or len(title) < 10:
            parent = a_tag.find_parent(['article', 'div'])
            if parent:
                heading = parent.find(['h2', 'h3'])
                if heading:
                    title = heading.text.strip()

        # Entries without a usable title (>= 10 chars) are dropped.
        if title and len(title) >= 10:
            page_pubs.append({
                'title': title,
                'url': full_url
            })
            seen_urls.add(full_url)

    return True, page_pubs


def get_osha_publication_links(search_url: str, max_pages: int = 5) -> List[Dict[str, str]]:
    """Collect unique publication links from OSHA search result pages.

    Fetches up to *max_pages* result pages (``page=N`` is appended from the
    second page on), waiting progressively longer before each additional page,
    and stops early on HTTP 403, on a request error, when a page yields no
    results, or when a page contributes fewer than 5 new publications. The
    parsed HTML of the first page is written to ``debug_osha_page.html``.

    Args:
        search_url: URL of the first result page.
        max_pages: Maximum number of result pages to fetch.

    Returns:
        List of ``{'title': ..., 'url': ...}`` dictionaries, without duplicate URLs.
    """
    publications = []
    seen_urls = set()  # URLs already collected, for de-duplication
    # NOTE(review): 'br' is advertised explicitly below; confirm that a brotli
    # decoder is installed wherever this runs, otherwise a brotli-compressed
    # response may not be decoded.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept-Encoding': 'gzip, deflate, br',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'none',
        'Cache-Control': 'max-age=0'
    }

    print(f"\n{'='*80}")
    print(f"ESTRAZIONE LINK PUBBLICAZIONI")
    print(f"{'='*80}\n")
    print(f"Search URL: {search_url}")
    print(f"Max pagine: {max_pages}\n")

    # One session for all pages so cookies persist between requests; the
    # context manager guarantees the underlying connections are released.
    with requests.Session() as session:
        session.headers.update(headers)

        for page_num in range(max_pages):
            # Build the paginated URL
            if page_num == 0:
                url = search_url
            else:
                separator = '&' if '?' in search_url else '?'
                url = f"{search_url}{separator}page={page_num}"

            print(f"[Pagina {page_num + 1}/{max_pages}] Fetching: {url}")

            # Progressive delay to reduce the chance of a 403
            if page_num > 0:
                delay = 5 + (page_num * 2)  # 7, 9, 11 seconds...
                print(f"  [~] Attesa {delay} secondi per evitare rate limit...")
                time.sleep(delay)

            try:
                response = session.get(url, timeout=30)

                # A 403 means the server blocked us: keep what we have and stop.
                if response.status_code == 403:
                    print(f"  [!] 403 Forbidden - Server ha bloccato la richiesta")
                    print(f"  [INFO] Aumenta delay o riprova più tardi")
                    print(f"  [INFO] Pubblicazioni raccolte finora: {len(publications)}")
                    break

                response.raise_for_status()
                soup = BeautifulSoup(response.content, 'lxml')

                # DEBUG: dump the first page for offline inspection
                if page_num == 0:
                    with open('debug_osha_page.html', 'w', encoding='utf-8') as f:
                        f.write(soup.prettify())

                results_found, page_pubs = _collect_page_publications(soup, seen_urls)

                if not results_found:
                    print(f"  [!] Nessun risultato in pagina {page_num + 1} - fine")
                    break

                # Report what this page contributed
                print(f"  [FOUND] {len(page_pubs)} nuove pubblicazioni:")
                for pub in page_pubs:
                    print(f"    - {pub['title'][:75]}...")
                    publications.append(pub)

                # Fewer than 5 new results: treat this as the last page
                if len(page_pubs) < 5:
                    print(f"  [INFO] Pagina con pochi risultati - probabilmente ultima pagina")
                    break

            except requests.exceptions.RequestException as e:
                print(f"  [!] Errore su pagina {page_num + 1}: {e}")
                break

    print(f"\n[RESULT] Totale: {len(publications)} pubblicazioni uniche trovate")
    return publications


### Batch Processing Orchestrator

Main pipeline that processes multiple OSHA publications sequentially. Handles link extraction, document scraping with Docling, individual and aggregate JSON export, error recovery, rate limiting, and aggregate statistics (total pages/tables/figures across all documents). Ensures efficient resource usage by reusing the Docling converter instance.


In [None]:
def _sum_document_stat(results: List[Dict[str, Any]], key: str) -> int:
    """Sum the numeric ``document_content[key]`` values, skipping placeholders such as 'N/A'."""
    total = 0
    for item in results:
        value = (item.get('document_content') or {}).get(key)
        if isinstance(value, (int, float)) and not isinstance(value, bool):
            total += value
    return total


def batch_scrape_osha_publications(
    search_url: str,
    max_pages: int = 5,
    max_documents: Optional[int] = None,
    output_file: Optional[str] = None,
    save_pdfs: bool = True,
    delay_between_docs: int = 3
) -> List[Dict[str, Any]]:
    """Scrape every publication returned by an OSHA search, one after another.

    Collects the publication links, scrapes each one with a single shared
    Docling converter, saves one JSON file per successful document, optionally
    writes all successful results to one aggregate JSON file, and prints a
    summary with aggregate page/table/figure counts.

    Args:
        search_url: URL of the first search result page.
        max_pages: Maximum number of result pages to read links from.
        max_documents: If truthy, process only the first N links.
        output_file: If given, name of the aggregate JSON file written inside
            ``ScraperConfig.JSON_DIR``.
        save_pdfs: Forwarded to ``scrape_osha_publication`` as ``save_pdf``.
        delay_between_docs: Seconds to wait between two documents.

    Returns:
        The list of successful scraping results (empty if no links were found).
    """
    # Setup
    ScraperConfig.setup_directories()
    converter = initialize_docling_converter(enable_vlm=ScraperConfig.VLM_ENABLED)

    # 1) Collect all links
    publication_links = get_osha_publication_links(search_url, max_pages)

    if not publication_links:
        print("[WARNING] Nessuna pubblicazione trovata!")
        return []

    # Limit the number of documents if requested
    if max_documents:
        publication_links = publication_links[:max_documents]
        print(f"\n[INFO] Limitato a {max_documents} documenti")

    # 2) Scrape each publication
    results = []
    failed = []

    print(f"\n{'='*80}")
    print(f"INIZIO BATCH SCRAPING - {len(publication_links)} DOCUMENTI")
    print(f"{'='*80}\n")

    for idx, pub in enumerate(publication_links, 1):
        print(f"\n[{idx}/{len(publication_links)}] Processing: {pub['title'][:60]}...")
        print(f"URL: {pub['url']}")

        try:
            # Scrape with Docling
            result = scrape_osha_publication(
                publication_url=pub['url'],
                converter=converter,
                save_pdf=save_pdfs
            )

            if result['status'] == 'success':
                results.append(result)

                # Save the per-document JSON
                json_path = save_to_json(result)
                print(f"  [+] Salvato: {json_path.name}")
            else:
                failed.append({
                    'url': pub['url'],
                    'title': pub['title'],
                    'error': result.get('error_message', 'Unknown error')
                })
                print(f"  [!] Fallito: {result.get('error_message', 'Unknown')}")

        except Exception as e:
            # Boundary of the batch: one broken document must not abort the run.
            print(f"  [!] Eccezione: {e}")
            failed.append({
                'url': pub['url'],
                'title': pub['title'],
                'error': str(e)
            })

        # Rate limiting - applied after failures too, so an exception does not
        # make the next request hit the server immediately.
        if idx < len(publication_links):
            print(f"  [~] Pausa {delay_between_docs} secondi...")
            time.sleep(delay_between_docs)

    # 3) Save aggregate results (optional)
    if output_file:
        output_path = ScraperConfig.JSON_DIR / output_file
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, ensure_ascii=False, indent=2)
        print(f"\n[SAVED] Risultati aggregati in: {output_path}")

    # 4) Summary
    print(f"\n{'='*80}")
    print(f"BATCH SCRAPING COMPLETATO")
    print(f"{'='*80}")
    print(f"Successi: {len(results)}/{len(publication_links)}")
    print(f"Falliti: {len(failed)}/{len(publication_links)}")

    if failed:
        print(f"\nDocumenti falliti:")
        for fail in failed:
            print(f"  - {fail['title'][:60]}: {fail['error']}")

    # Aggregate statistics; non-numeric placeholders (e.g. 'N/A') are skipped
    # for all three counters, not only for the page count.
    total_pages = _sum_document_stat(results, 'num_pages')
    total_tables = _sum_document_stat(results, 'num_tables')
    total_figures = _sum_document_stat(results, 'num_figures')

    print(f"\nStatistiche aggregate:")
    print(f"  - Pagine totali: {total_pages}")
    print(f"  - Tabelle totali: {total_tables}")
    print(f"  - Figure totali: {total_figures}")

    return results

### Interactive Batch Scraping Interface

User-friendly CLI wrapper for batch scraping. Collects search parameters through prompts (keyword, language, page/document limits), validates inputs, shows confirmation dialog, and executes the full pipeline. Auto-generates descriptive output filenames. Perfect for exploratory scraping sessions in notebooks.


In [None]:
def interactive_batch_scraping():
    """Prompt for search parameters and run the batch scraping pipeline.

    Asks for the query, the language ('it' or 'en', re-prompting until valid),
    the maximum number of result pages and an optional document limit, shows
    the resulting search URL and asks for confirmation before scraping.

    Returns:
        The list returned by ``batch_scrape_osha_publications``, or ``None``
        when the query is empty or the user does not confirm.
    """
    print("\n" + "="*80)
    print("OSHA BATCH SCRAPER CON DOCLING")
    print("="*80 + "\n")

    # Query
    query = input("Inserisci la parola chiave o frase da cercare: ").strip()
    if not query:
        print("[ERROR] Query vuota!")
        return

    # Language: keep asking until a supported value is entered
    lang = input("Lingua ('it' per italiano, 'en' per inglese): ").strip().lower()
    while lang not in {"it", "en"}:
        print("[ERROR] Lingua non valida!")
        lang = input("Lingua ('it' o 'en'): ").strip().lower()

    # Number of result pages (empty or invalid input -> default 5)
    try:
        max_pages = int(input("Numero massimo di pagine di risultati (default 5): ").strip() or "5")
    except ValueError:
        max_pages = 5
        print(f"[INFO] Usando default: {max_pages} pagine")

    # Document limit (empty -> no limit). A non-numeric value falls back to
    # "no limit" instead of crashing, mirroring the page-count prompt above.
    max_docs_input = input("Limite documenti da processare (Enter = tutti): ").strip()
    try:
        max_docs = int(max_docs_input) if max_docs_input else None
    except ValueError:
        max_docs = None
        print("[INFO] Valore non valido - nessun limite documenti")

    # Build the search URL
    search_url = build_osha_search_url(query, lang=lang)
    print(f"\n[INFO] URL di ricerca: {search_url}\n")

    # Confirmation
    confirm = input("Procedere con lo scraping? (y/n): ").strip().lower()
    if confirm != 'y':
        print("[CANCELLED] Operazione annullata")
        return

    # Aggregate output file name derived from the (sanitized) query and language
    safe_query = re.sub(r'[^\w\-_.]', '_', query[:30])
    output_file = f"batch_{safe_query}_{lang}.json"

    # Run the batch scraping
    results = batch_scrape_osha_publications(
        search_url=search_url,
        max_pages=max_pages,
        max_documents=max_docs,
        output_file=output_file,
        save_pdfs=True,
        delay_between_docs=3
    )

    print(f"\n[SUCCESS] Batch scraping completato!")
    print(f"Risultati salvati in: {ScraperConfig.JSON_DIR}")

    return results

### Script Entry Point

Launches the interactive scraping interface when the script runs standalone.

In [None]:
if __name__ == "__main__":
    # Interactive mode - prompts the user for the search parameters
    interactive_batch_scraping()



OSHA BATCH SCRAPER CON DOCLING

Inserisci la parola chiave o frase da cercare: work safety
Lingua ('it' per italiano, 'en' per inglese): en
Numero massimo di pagine di risultati (default 5): 2
Limite documenti da processare (Enter = tutti): 

[INFO] URL di ricerca: https://osha.europa.eu/en/publications?search_api_fulltext=work+safety&sort_by=field_publication_date

Procedere con lo scraping? (y/n): y
VLM abilitato: HuggingFaceTB/SmolVLM-256M-Instruct

ESTRAZIONE LINK PUBBLICAZIONI

Search URL: https://osha.europa.eu/en/publications?search_api_fulltext=work+safety&sort_by=field_publication_date
Max pagine: 2

[Pagina 1/2] Fetching: https://osha.europa.eu/en/publications?search_api_fulltext=work+safety&sort_by=field_publication_date
  [DEBUG] HTML salvato in debug_osha_page.html
  [+] Metodo 3: Trovati 24 link (ricerca generica)
    - Musculoskeletal health and risk factors in the HeSCare sector – a review of exis...
    - OSH Pulse 2025: Occupational safety and health in the era of cl

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
[32m[INFO] 2025-10-23 13:59:51,937 [RapidOCR] base.py:22: Using engine_name: torch[0m
[32m[INFO] 2025-10-23 13:59:51,946 [RapidOCR] download_file.py:68: Initiating download: https://www.modelscope.cn/models/RapidAI/RapidOCR/resolve/v3.4.0/torch/PP-OCRv4/det/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-10-23 13:59:54,666 [RapidOCR] download_file.py:82: Download size: 13.83MB[0m
[32m[INFO] 2025-10-23 13:59:54,879 [RapidOCR] download_file.py:95: Successfully saved to: /usr/local/lib/python3.12/dist-packages/rapidocr/models/ch_PP-OCRv4_det_infer.pth[0m
[32m[INFO] 2025-10-23 13

Nessun elemento trovato in 1213 items, fallback markdown
Struttura da Markdown: 212 headings, 15 tabelle, 17 figure
[DEBUG] Usando iterate_items()
  [+] Salvata: figure_1.png
  [+] Salvata: figure_2.png
  [+] Salvata: figure_3.png
  [+] Salvata: figure_4.png
  [+] Salvata: figure_5.png
  [+] Salvata: figure_6.png
  [+] Salvata: figure_7.png
  [+] Salvata: figure_8.png
  [+] Salvata: figure_9.png
  [+] Salvata: figure_10.png
  [+] Salvata: figure_11.png
  [+] Salvata: figure_12.png
  [+] Salvata: figure_13.png
  [+] Salvata: figure_14.png
  [+] Salvata: figure_15.png
  [+] Salvata: figure_16.png
  [+] Salvata: figure_17.png
[RESULT] 17 immagini esportate in: osha_scraped_data/images/Musculoskeletal_health_and_risk_factors_in_the_HeS
 Tabella salvata: table_0.txt
 Tabella salvata: table_1.txt
 Tabella salvata: table_2.txt
 Tabella salvata: table_3.txt
 Tabella salvata: table_4.txt
 Tabella salvata: table_5.txt
 Tabella salvata: table_6.txt
 Tabella salvata: table_7.txt
 Tabella salvata: 



Nessun elemento trovato in 637 items, fallback markdown
Struttura da Markdown: 74 headings, 45 tabelle, 61 figure
[DEBUG] Usando iterate_items()
  [+] Salvata: figure_1.png
  [+] Salvata: figure_2.png
  [+] Salvata: figure_3.png
  [+] Salvata: figure_4.png
  [+] Salvata: figure_5.png
  [+] Salvata: figure_6.png
  [+] Salvata: figure_7.png
  [+] Salvata: figure_8.png
  [+] Salvata: figure_9.png
  [+] Salvata: figure_10.png
  [+] Salvata: figure_11.png
  [+] Salvata: figure_12.png
  [+] Salvata: figure_13.png
  [+] Salvata: figure_14.png
  [+] Salvata: figure_15.png
  [+] Salvata: figure_16.png
  [+] Salvata: figure_17.png
  [+] Salvata: figure_18.png
  [+] Salvata: figure_19.png
  [+] Salvata: figure_20.png
  [+] Salvata: figure_21.png
  [+] Salvata: figure_22.png
  [+] Salvata: figure_23.png
  [+] Salvata: figure_24.png
  [+] Salvata: figure_25.png
  [+] Salvata: figure_26.png
  [+] Salvata: figure_27.png
  [+] Salvata: figure_28.png
  [+] Salvata: figure_29.png
  [+] Salvata: figure_3



Nessun elemento trovato in 267 items, fallback markdown
Struttura da Markdown: 33 headings, 0 tabelle, 2 figure
[DEBUG] Usando iterate_items()
  [+] Salvata: figure_1.png
  [+] Salvata: figure_2.png
[RESULT] 2 immagini esportate in: osha_scraped_data/images/Strategies_and_legislation_on_psychosocial_risks_i
Docling processing completato:
  - 27 pagine
  - 0 tabelle
  - 2 figure
  - 33 headings
  - 2 immagini esportate
  - 0 tabelle esportate

 Scraping completato con successo!
Dati salvati in: osha_scraped_data/json/Strategies_and_legislation_on_psychosocial_risks_i_20251023_151011.json
  [+] Salvato: Strategies_and_legislation_on_psychosocial_risks_i_20251023_151011.json
  [~] Pausa 3 secondi...

[4/8] Processing: Digital platform work and occupational safety and health: ov...
URL: https://osha.europa.eu/en/publications/digital-platform-work-and-occupational-safety-and-health-overview-regulation-policies-practices-and-research

Scraping pubblicazione: https://osha.europa.eu/en/publica



Nessun elemento trovato in 570 items, fallback markdown
Struttura da Markdown: 78 headings, 7 tabelle, 7 figure
[DEBUG] Usando iterate_items()
  [+] Salvata: figure_1.png
  [+] Salvata: figure_2.png
  [+] Salvata: figure_3.png
  [+] Salvata: figure_4.png
  [+] Salvata: figure_5.png
  [+] Salvata: figure_6.png
  [+] Salvata: figure_7.png
[RESULT] 7 immagini esportate in: osha_scraped_data/images/Digital_platform_work_and_occupational_safety_and_
 Tabella salvata: table_0.txt
 Tabella salvata: table_1.txt
 Tabella salvata: table_2.txt
 Tabella salvata: table_3.txt
 Tabella salvata: table_4.txt
 Tabella salvata: table_5.txt
 Tabella salvata: table_6.txt
7 tabelle esportate in: osha_scraped_data/tables/Digital_platform_work_and_occupational_safety_and_
Docling processing completato:
  - 56 pagine
  - 7 tabelle
  - 7 figure
  - 78 headings
  - 7 immagini esportate
  - 7 tabelle esportate

 Scraping completato con successo!
Dati salvati in: osha_scraped_data/json/Digital_platform_work_and_oc



Nessun elemento trovato in 223 items, fallback markdown
Struttura da Markdown: 28 headings, 1 tabelle, 6 figure
[DEBUG] Usando iterate_items()
  [+] Salvata: figure_1.png
  [+] Salvata: figure_2.png
  [+] Salvata: figure_3.png
  [+] Salvata: figure_4.png
  [+] Salvata: figure_5.png
  [+] Salvata: figure_6.png
[RESULT] 6 immagini esportate in: osha_scraped_data/images/Summary_-_Digital_platform_work_and_occupational_s
 Tabella salvata: table_0.txt
1 tabelle esportate in: osha_scraped_data/tables/Summary_-_Digital_platform_work_and_occupational_s
Docling processing completato:
  - 25 pagine
  - 1 tabelle
  - 6 figure
  - 28 headings
  - 6 immagini esportate
  - 1 tabelle esportate

 Scraping completato con successo!
Dati salvati in: osha_scraped_data/json/Summary_-_Digital_platform_work_and_occupational_s_20251023_151422.json
  [+] Salvato: Summary_-_Digital_platform_work_and_occupational_s_20251023_151422.json
  [~] Pausa 3 secondi...

[6/8] Processing: Occupational safety and health ri



Nessun elemento trovato in 119 items, fallback markdown
Struttura da Markdown: 24 headings, 1 tabelle, 3 figure
[DEBUG] Usando iterate_items()
  [+] Salvata: figure_1.png
  [+] Salvata: figure_2.png
  [+] Salvata: figure_3.png
[RESULT] 3 immagini esportate in: osha_scraped_data/images/Occupational_safety_and_health_risks_of_remote_pro
 Tabella salvata: table_0.txt
1 tabelle esportate in: osha_scraped_data/tables/Occupational_safety_and_health_risks_of_remote_pro
Docling processing completato:
  - 13 pagine
  - 1 tabelle
  - 3 figure
  - 24 headings
  - 3 immagini esportate
  - 1 tabelle esportate

 Scraping completato con successo!
Dati salvati in: osha_scraped_data/json/Occupational_safety_and_health_risks_of_remote_pro_20251023_153202.json
  [+] Salvato: Occupational_safety_and_health_risks_of_remote_pro_20251023_153202.json
  [~] Pausa 3 secondi...

[7/8] Processing: Occupational safety and health risks of parcel delivery work...
URL: https://osha.europa.eu/en/publications/occupatio



Nessun elemento trovato in 138 items, fallback markdown
Struttura da Markdown: 14 headings, 0 tabelle, 2 figure
[DEBUG] Usando iterate_items()
  [+] Salvata: figure_1.png
  [+] Salvata: figure_2.png
[RESULT] 2 immagini esportate in: osha_scraped_data/images/Occupational_safety_and_health_risks_of_parcel_del
Docling processing completato:
  - 14 pagine
  - 0 tabelle
  - 2 figure
  - 14 headings
  - 2 immagini esportate
  - 0 tabelle esportate

 Scraping completato con successo!
Dati salvati in: osha_scraped_data/json/Occupational_safety_and_health_risks_of_parcel_del_20251023_155401.json
  [+] Salvato: Occupational_safety_and_health_risks_of_parcel_del_20251023_155401.json
  [~] Pausa 3 secondi...

[8/8] Processing: Occupational safety and health risks of online content revie...
URL: https://osha.europa.eu/en/publications/occupational-safety-and-health-risks-online-content-review-work-provided-through-digital-labour-platforms

Scraping pubblicazione: https://osha.europa.eu/en/publicatio

### Incremental Scraping - Process Only New Documents

Smart differential update system that compares search results against existing JSON archives, identifies new publications, and processes only those not previously scraped. Supports check-only mode for dry-run verification. Safely appends new data while preserving existing records. Returns detailed statistics (existing/new/failed counts). Interactive interface included for periodic archive updates with minimal overhead.


In [None]:
def _collect_known_urls(records: List[Any]) -> set:
    """Return every URL under which the archived records can be recognised.

    All three storage locations are registered for each record (not just the
    first non-empty one), so a record is matched no matter which of them
    holds the publication URL.
    """
    known_urls = set()
    for item in records:
        if not isinstance(item, dict):
            # Tolerate stray non-record entries instead of crashing on .get()
            continue
        candidates = (
            (item.get('scraping_metadata') or {}).get('url'),
            (item.get('web_metadata') or {}).get('pdf_url'),
            item.get('url'),
        )
        known_urls.update(url for url in candidates if url)
    return known_urls


def _incremental_stats(
    existing: int,
    new_found: int,
    new_processed: int = 0,
    failed: Optional[List[Dict[str, Any]]] = None
) -> Dict[str, Any]:
    """Build the statistics dict returned by incremental_batch_scraping."""
    failed = failed or []
    return {
        'existing': existing,
        'new_found': new_found,
        'new_processed': new_processed,
        'failed': len(failed),
        'failed_details': failed
    }


def incremental_batch_scraping(
    search_url: str,
    output_file: str,
    max_pages: int = 5,
    max_new_documents: Optional[int] = None,
    save_pdfs: bool = True,
    check_only: bool = False,
    delay_between_docs: int = 3
) -> Dict[str, Any]:
    """Scrape only the publications that are not yet in the aggregated archive.

    The search results are compared against the records stored in
    ``ScraperConfig.JSON_DIR / output_file``; publications whose URL is not
    already known are listed and, after an interactive y/n confirmation,
    scraped and appended to the archive.

    Args:
        search_url: Search page URL passed to ``get_osha_publication_links``.
        output_file: File name of the aggregated JSON archive inside
            ``ScraperConfig.JSON_DIR``.
        max_pages: Maximum number of result pages to inspect.
        max_new_documents: Maximum number of new publications to process in
            this run; ``None`` or a non-positive value means no limit.
        save_pdfs: Forwarded to ``scrape_osha_publication`` as ``save_pdf``.
        check_only: When True, only report the new publications and return
            without processing anything or touching the archive.
        delay_between_docs: Seconds to wait between two documents.

    Returns:
        A dict with the keys ``existing`` (records already in the archive),
        ``new_found`` (all new publications found, before any limit),
        ``new_processed``, ``failed`` and ``failed_details`` (one dict with
        ``url``, ``title`` and ``error`` per failed publication).
    """
    # Local import: `time` is needed for the pause between documents and is
    # not among the imports visible at the top of the notebook.
    import time

    output_path = ScraperConfig.JSON_DIR / output_file

    # 1) Load the existing archive
    existing_data = []
    archive_corrupt = False

    if output_path.exists():
        try:
            with open(output_path, 'r', encoding='utf-8') as f:
                loaded = json.load(f)
        except json.JSONDecodeError as e:
            archive_corrupt = True
            print(f"[ERROR] File JSON corrotto: {e}")
            print(f"[INFO] Procedo come se il file non esistesse")
        else:
            # A single record saved on its own is wrapped so it is preserved
            existing_data = loaded if isinstance(loaded, list) else [loaded]
            print(f"\n[INFO] Caricato file esistente: {output_path}")
            print(f"[INFO] Documenti già processati: {len(existing_data)}")
    else:
        print(f"\n[INFO] File {output_file} non trovato - creazione nuovo archivio")

    existing_urls = _collect_known_urls(existing_data)

    # 2) Make sure the output directories exist
    ScraperConfig.setup_directories()

    # 3) Collect the publication links from the search results
    print(f"\n{'='*80}")
    print(f"CONTROLLO NUOVE PUBBLICAZIONI")
    print(f"{'='*80}\n")

    all_links = get_osha_publication_links(search_url, max_pages)

    if not all_links:
        print("[WARNING] Nessuna pubblicazione trovata nella ricerca")
        return _incremental_stats(len(existing_data), 0)

    # 4) Keep only the links that are not in the archive yet
    new_links = [link for link in all_links if link['url'] not in existing_urls]
    total_new = len(new_links)

    print(f"\n{'='*80}")
    print(f"ANALISI PUBBLICAZIONI")
    print(f"{'='*80}")
    print(f"Totale trovate: {len(all_links)}")
    print(f"Già esistenti: {len(all_links) - total_new}")
    print(f"Nuove da processare: {total_new}")

    if not new_links:
        print("\n[SUCCESS] Archivio già aggiornato - nessun nuovo documento!")
        return _incremental_stats(len(existing_data), 0)

    # Show the full list of new publications
    print(f"\n[NEW] Pubblicazioni nuove trovate:")
    for idx, link in enumerate(new_links, 1):
        print(f"  {idx}. {link['title'][:70]}...")
        print(f"     {link['url']}")

    # 5) Check-only mode: report and stop
    if check_only:
        print(f"\n[CHECK-ONLY] Modalità verifica - nessun processing eseguito")
        return _incremental_stats(len(existing_data), total_new)

    # Apply the limit before asking, so the prompt shows the real workload
    if max_new_documents is not None and max_new_documents > 0:
        new_links = new_links[:max_new_documents]
        print(f"[INFO] Limitato a {max_new_documents} documenti")

    # 6) Ask for confirmation
    print(f"\n{'='*80}")
    confirm = input(f"Processare {len(new_links)} nuovi documenti? (y/n): ").strip().lower()
    if confirm != 'y':
        print("[CANCELLED] Operazione annullata")
        return _incremental_stats(len(existing_data), total_new)

    # The converter is only needed for actual processing, so it is built
    # here rather than before the check-only / cancel exits.
    converter = initialize_docling_converter(enable_vlm=ScraperConfig.VLM_ENABLED)

    # 7) Process the new documents
    new_results = []
    failed = []

    print(f"\n{'='*80}")
    print(f"PROCESSING NUOVI DOCUMENTI - {len(new_links)}")
    print(f"{'='*80}\n")

    for idx, link in enumerate(new_links, 1):
        print(f"\n[{idx}/{len(new_links)}] Processing: {link['title'][:60]}...")

        try:
            result = scrape_osha_publication(
                publication_url=link['url'],
                converter=converter,
                save_pdf=save_pdfs
            )

            if result['status'] == 'success':
                new_results.append(result)

                # Save the per-document JSON
                json_path = save_to_json(result)
                print(f"  [+] Salvato: {json_path.name}")
            else:
                failed.append({
                    'url': link['url'],
                    'title': link['title'],
                    'error': result.get('error_message', 'Unknown')
                })
                print(f"  [!] Fallito: {result.get('error_message')}")

        except Exception as e:
            # Boundary handler: one broken document must not abort the batch
            print(f"  [!] Eccezione: {e}")
            failed.append({
                'url': link['url'],
                'title': link['title'],
                'error': str(e)
            })

        # Rate limiting applies after failures too, so it lives outside the try
        if idx < len(new_links):
            print(f"  [~] Pausa {delay_between_docs} secondi...")
            time.sleep(delay_between_docs)

    # 8) Update the aggregated archive
    if new_results:
        if archive_corrupt:
            # Keep the unreadable archive instead of silently overwriting it
            backup_path = output_path.with_name(output_path.name + '.corrupt')
            try:
                output_path.replace(backup_path)
                print(f"[INFO] File corrotto conservato come: {backup_path}")
            except OSError as backup_error:
                print(f"[WARNING] Backup del file corrotto non riuscito: {backup_error}")

        updated_data = existing_data + new_results

        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(updated_data, f, ensure_ascii=False, indent=2)

        print(f"\n[SAVED] File aggregato aggiornato: {output_path}")
        print(f"        Totale documenti: {len(updated_data)}")
        print(f"        Nuovi aggiunti: {len(new_results)}")

    # 9) Final summary
    print(f"\n{'='*80}")
    print(f"UPDATE INCREMENTALE COMPLETATO")
    print(f"{'='*80}")
    print(f"Documenti esistenti: {len(existing_data)}")
    print(f"Nuovi trovati: {total_new}")
    print(f"Nuovi processati: {len(new_results)}")
    print(f"Falliti: {len(failed)}")

    if failed:
        print(f"\nDocumenti falliti:")
        for fail in failed:
            print(f"  - {fail['title'][:60]}: {fail['error']}")

    return _incremental_stats(
        len(existing_data),
        total_new,
        new_processed=len(new_results),
        failed=failed
    )


def interactive_incremental_scraping():
    """
    Prompt-driven front end for the incremental update.

    Asks for the search query, the language, the archive file name, the
    number of result pages and whether to run in check-only mode, then
    delegates to incremental_batch_scraping (PDF saving on, 5-second pause
    between documents).

    Returns the statistics dict produced by incremental_batch_scraping, or
    None when the query is empty.
    """
    separator = "=" * 80
    print(f"\n{separator}")
    print("OSHA INCREMENTAL UPDATE - Solo Nuovi Documenti")
    print(f"{separator}\n")

    # Search query (mandatory)
    search_terms = input("Parola chiave o frase da cercare: ").strip()
    if search_terms == "":
        print("[ERROR] Query vuota!")
        return None

    # Language: keep asking until one of the supported codes is entered
    while True:
        language = input("Lingua ('it' o 'en'): ").strip().lower()
        if language in ("it", "en"):
            break
        print("[ERROR] Lingua non valida!")

    # Archive file name, defaulting to one derived from query and language
    slug = re.sub(r'[^\w\-_.]', '_', search_terms[:30])
    fallback_name = f"osha_{slug}_{language}.json"
    typed_name = input(f"Nome file JSON (default: {fallback_name}): ").strip()
    archive_name = typed_name if typed_name else fallback_name

    # Number of result pages; empty or non-numeric input falls back to 5
    typed_pages = input("Max pagine di risultati (default 5): ").strip()
    try:
        page_limit = int(typed_pages) if typed_pages else 5
    except ValueError:
        page_limit = 5

    # Check-only mode is enabled only by an explicit 'y'
    verify_only = input("Solo verifica senza aggiornare? (y/n, default: n): ").strip().lower() == 'y'

    # Build the search URL
    target_url = build_osha_search_url(search_terms, lang=language)
    print(f"\n[INFO] URL: {target_url}\n")

    # Run the incremental update
    outcome = incremental_batch_scraping(
        search_url=target_url,
        output_file=archive_name,
        max_pages=page_limit,
        check_only=verify_only,
        save_pdfs=True,
        delay_between_docs=5
    )

    print("\n[DONE] Operazione completata!")
    return outcome

### Check-Only Mode (Solo Verifica)

In [None]:
# Dry run: report which publications are new without processing any of them
stats = incremental_batch_scraping(
    search_url=build_osha_search_url("AI e lavoro", lang="it"),
    output_file="osha_ai_lavoro.json",
    max_pages=3,
    check_only=True  # verification only: nothing is scraped and the archive is not updated
)

print(f"Nuovi documenti disponibili: {stats['new_found']}")


### Auto-Update Mode (Processing Automatico)

In [None]:
# Process the new publications and append them to the aggregated archive.
# NOTE: incremental_batch_scraping still asks for a y/n confirmation via
# input() before it starts processing.
stats = incremental_batch_scraping(
    search_url=build_osha_search_url("workplace safety", lang="en"),
    output_file="osha_workplace_safety_en.json",
    max_pages=5,
    max_new_documents=10,  # Process at most 10 of the new publications
    check_only=False,
    save_pdfs=False  # Forwarded to scrape_osha_publication as save_pdf
)

### Interactive Mode

In [None]:
if __name__ == "__main__":
    # Start the prompt-driven incremental update
    interactive_incremental_scraping()
