# **arXiv publications**

In [23]:
import re
import os
import time
import json
import random
import requests
from urllib.parse import urljoin, quote_plus
from bs4 import BeautifulSoup
import fitz  # PyMuPDF

**Generic function to handle pdf**

Downloads the PDF at a given URL and extracts its text content as a single string.

In [31]:
def scarica_e_leggi_pdf(pdf_url):
    """Download a PDF and return its text as one whitespace-normalized string.

    The document is parsed directly from the downloaded bytes, so nothing is
    written to disk and nothing is left behind if parsing fails.

    Args:
        pdf_url: URL of the PDF to download. Falsy values (None, "") are
            accepted and short-circuit to None.

    Returns:
        The text of all pages joined by spaces, with every run of whitespace
        collapsed to a single space and both ends stripped. None when no URL
        is given or the server does not answer with HTTP 200.

    Raises:
        Whatever ``requests.get`` or ``fitz.open`` raise (network failure,
        timeout, unreadable PDF); these errors are not caught here.
    """
    if not pdf_url:
        return None

    # A timeout keeps one stalled download from blocking the whole crawl.
    response = requests.get(pdf_url, timeout=60)
    if response.status_code != 200:
        print(f"Errore nel download del PDF: {response.status_code}")
        return None

    # Open the PDF from memory instead of going through a temporary file.
    with fitz.open(stream=response.content, filetype="pdf") as doc:
        testo = [page.get_text() for page in doc]

    # Collapse newlines, tabs and repeated spaces into single spaces.
    return re.sub(r'\s+', ' ', " ".join(testo)).strip()


**Search URL construction**

Builds a search URL for the arXiv catalog from a free-text query (single or multi-word, trimmed and URL-encoded), a zero-based page index, and a page size.

In [32]:
def build_arxiv_search_url(query: str, page: int = 0, size: int = 50):
    """Build the arXiv search URL for one page of results.

    Args:
        query: Free-text search terms; surrounding whitespace is trimmed and
            the remainder is URL-encoded.
        page: Zero-based page index.
        size: Number of results per page; also used to compute the offset.

    Returns:
        The search URL as a string, with ``start`` set to ``page * size``.
    """
    offset = page * size
    params = [
        ("query", quote_plus(query.strip())),
        ("searchtype", "all"),
        ("abstracts", "show"),
        ("order", "-announced_date_first"),
        ("size", size),
        ("start", offset),
    ]
    query_string = "&".join(f"{key}={value}" for key, value in params)
    return "https://arxiv.org/search/?" + query_string


**Link and base metadata extraction**

Parses each results page to collect all publication cards and extracts each card’s title, absolute URL, and abstract.

In [33]:
def get_arxiv_links(query: str, max_pages: int = 1):
    """Collect title, URL and abstract for the results of an arXiv search.

    Pages are fetched one after another. A page that cannot be fetched
    (network error, timeout or non-200 status) is reported and skipped; the
    first page that contains no result entries stops the crawl.

    Args:
        query: Free-text search terms.
        max_pages: Number of result pages to request, starting from page 0.

    Returns:
        A list of dicts with the keys ``"titolo"``, ``"url"`` and
        ``"abstract"``; missing title or abstract are reported as ``"N/A"``.
        Entries without a link are omitted.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    all_links = []

    for p in range(max_pages):
        if p > 0:
            # Pause between consecutive requests, including after a failed
            # page, and never after the last one.
            time.sleep(random.uniform(1, 2))

        url = build_arxiv_search_url(query, page=p)
        print(f" Pagina {p+1}/{max_pages}: {url}")

        try:
            # The timeout prevents a stalled connection from hanging the run.
            res = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as e:
            # Treat transport failures like HTTP errors: report and move on.
            print(f"Errore nella pagina {p+1}: {e}")
            continue
        if res.status_code != 200:
            print(f"Errore nella pagina {p+1}: {res.status_code}")
            continue

        soup = BeautifulSoup(res.content, 'lxml')
        results = soup.find_all('li', class_='arxiv-result')

        if not results:
            print("Nessun risultato trovato.")
            break

        for r in results:
            link_elem = r.find('p', class_='list-title')
            anchor = link_elem.find('a') if link_elem else None
            # .get avoids a KeyError on an anchor that carries no href.
            href = anchor.get('href') if anchor else None
            if not href:
                continue

            title_elem = r.find('p', class_='title')
            abs_elem = r.find('span', class_='abstract-full')
            all_links.append({
                "titolo": title_elem.text.strip() if title_elem else "N/A",
                "url": href.strip(),
                "abstract": abs_elem.text.strip() if abs_elem else "N/A"
            })

    return all_links


**Article-level details extraction**

Given an article page URL, extracts core metadata (title, authors, submission date, abstract, PDF link) and, when available, the PDF’s full text.

In [34]:
def scrape_arxiv_details(article_url):
    """Fetch an arXiv article page and extract its metadata and PDF text.

    Args:
        article_url: URL of the article (abstract) page.

    Returns:
        A dict with the keys ``"titolo"``, ``"autori"``, ``"data"``,
        ``"abstract"``, ``"pdf_url"``, ``"testo_pdf"`` and
        ``"url_pubblicazione"``. Text fields that cannot be found on the page
        are ``"N/A"``; ``"pdf_url"`` and ``"testo_pdf"`` are None when no PDF
        link is found.

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status, so that an error page is never stored as a record
            full of "N/A" values.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}
    p = requests.get(article_url, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing the error page.
    p.raise_for_status()
    page = BeautifulSoup(p.content, 'lxml')

    # Title
    title_elem = page.find('h1', class_='title')
    titolo = title_elem.text.replace('Title:', '').strip() if title_elem else "N/A"

    # Authors
    authors_elem = page.find('div', class_='authors')
    autori = authors_elem.text.replace('Authors:', '').strip() if authors_elem else "N/A"

    # Date
    date_elem = page.find('div', class_='dateline')
    data = date_elem.text.strip().replace('Submitted ', '') if date_elem else "N/A"

    # Abstract
    abs_elem = page.find('blockquote', class_='abstract')
    abstract = abs_elem.text.replace('Abstract:', '').strip() if abs_elem else "N/A"

    # PDF link: first try an anchor whose label is exactly "pdf".
    pdf_a = page.find('a', string=lambda s: s and s.strip().lower() == 'pdf')
    if pdf_a is None:
        # Fallback for pages whose link label differs: take the first anchor
        # pointing into /pdf/.
        # NOTE(review): arXiv's current label appears to be "View PDF" - confirm.
        pdf_a = page.find('a', href=re.compile(r'^(https?://arxiv\.org)?/pdf/'))
    pdf_href = pdf_a.get('href') if pdf_a else None
    pdf_url = urljoin("https://arxiv.org", pdf_href) if pdf_href else None

    # PDF text extraction (skipped when there is no link)
    testo_pdf = scarica_e_leggi_pdf(pdf_url) if pdf_url else None

    return {
        "titolo": titolo,
        "autori": autori,
        "data": data,
        "abstract": abstract,
        "pdf_url": pdf_url,
        "testo_pdf": testo_pdf,
        "url_pubblicazione": article_url
    }

**JSON saving**

In [35]:
def salva_in_json(record, filename):
    """Append one record to the JSON array stored in ``filename``.

    The file is created if it does not exist, and an existing empty file is
    treated as an empty list. The new content is written to a sibling
    ``<filename>.tmp`` file and then moved over the target, so a failed or
    interrupted write leaves the existing file untouched.

    Args:
        record: JSON-serializable object to append.
        filename: Path of the JSON file holding a list of records.

    Raises:
        json.JSONDecodeError: if the existing file holds invalid JSON.
        TypeError: if ``record`` is not JSON-serializable.
    """
    data = []
    if os.path.exists(filename):
        with open(filename, 'r', encoding='utf-8') as f:
            content = f.read()
        # A zero-length or whitespace-only file is treated as "no records yet".
        if content.strip():
            data = json.loads(content)
    data.append(record)

    tmp_name = f"{filename}.tmp"
    try:
        with open(tmp_name, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        # Swap the finished file into place in a single step.
        os.replace(tmp_name, filename)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the partial temp file, keep the
        # previous target file, and let the error propagate.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
        raise


**Interactive function**

Prompts the user for a keyword or phrase and the number of result pages to analyze, builds the corresponding search URLs, scrapes each result, and appends it to a JSON file.

In [36]:
def scrape_arxiv_interattivo(output_file="arxiv_risultati.json"):
    """Interactively search arXiv and save every result's details to JSON.

    Asks for a non-empty query and a page count (anything that is not an
    integer >= 1 falls back to a single page), collects the result links,
    then scrapes and stores each article in turn. A failure on one article
    is printed and does not stop the loop.

    Args:
        output_file: Path of the JSON file the records are appended to.
    """
    print("RICERCA PUBBLICAZIONI ARXIV")

    # Keep asking until the user types something other than whitespace.
    query = input(" Inserisci la parola o frase da cercare: ").strip()
    while query == "":
        query = input("La query non può essere vuota. Riprova: ").strip()

    # Non-numeric input and values below 1 share the same fallback.
    raw_pages = input(" Inserisci il numero di pagine da analizzare: ").strip()
    try:
        max_pages = int(raw_pages)
    except ValueError:
        max_pages = 0
    if max_pages < 1:
        max_pages = 1
        print("Numero non valido. Analizzerò solo 1 pagina.")

    print(" Cerco le pubblicazioni, attendi...\n")
    links = get_arxiv_links(query, max_pages=max_pages)
    total = len(links)
    print(f"\nTrovate {total} pubblicazioni totali.\n")

    for position, entry in enumerate(links, start=1):
        print(f"[{position}/{total}] Estraggo: {entry['titolo']}")
        try:
            details = scrape_arxiv_details(entry['url'])
            salva_in_json(details, output_file)
            print(f" Salvato: {entry['titolo']}\n")
        except Exception as err:
            print(f" Errore su {entry['url']}: {err}")
        # Random pause between articles.
        time.sleep(random.uniform(2, 4))

    print(f" Tutti i dati salvati in '{output_file}'.")

**Test the function**

In [37]:
# Launch the interactive scraper with its default output file
# ('arxiv_risultati.json'); it prompts for the query and the page count.
scrape_arxiv_interattivo()

RICERCA PUBBLICAZIONI ARXIV
 Inserisci la parola o frase da cercare: mental health
 Inserisci il numero di pagine da analizzare: 2
 Cerco le pubblicazioni, attendi...

 Pagina 1/2: https://arxiv.org/search/?query=mental+health&searchtype=all&abstracts=show&order=-announced_date_first&size=50&start=0
 Pagina 2/2: https://arxiv.org/search/?query=mental+health&searchtype=all&abstracts=show&order=-announced_date_first&size=50&start=50

Trovate 100 pubblicazioni totali.

[1/100] Estraggo: A Case for Leveraging Generative AI to Expand and Enhance Training in the Provision of Mental Health Services
 Salvato: A Case for Leveraging Generative AI to Expand and Enhance Training in the Provision of Mental Health Services

[2/100] Estraggo: Position: AI Will Transform Neuropsychology Through Mental Health Digital Twins for Dynamic Mental Health Care, Especially for ADHD
 Salvato: Position: AI Will Transform Neuropsychology Through Mental Health Digital Twins for Dynamic Mental Health Care, Especial

KeyboardInterrupt: 

Esecuzione interrotta volontariamente dopo i primi 10 risultati, giusto per verificare che funzionasse: le informazioni vengono scaricate correttamente.