In [None]:
# Install necessary libraries. pdfplumber is excellent for robust PDF text and table extraction.
!pip install requests pdfplumber

import requests
import xml.etree.ElementTree as ET
import pdfplumber
import re
from pathlib import Path
import os
from collections import defaultdict
import html

print("Setup Complete. Libraries are ready.")

Setup Complete. Libraries are ready.


In [None]:
# arXiv API Ingestor

def fetch_recent_quant_ph_papers(max_results=20, timeout=30):
    """
    Fetch the most recently submitted papers from the arXiv quant-ph category.

    Args:
        max_results: Maximum number of entries to request from the arXiv API.
        timeout: Seconds to wait for the API before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list of dicts with the keys 'id', 'title' and 'pdf_url', in the
        order returned by the API (sorted by submission date, descending).
        An empty list is returned when the request fails or the response
        body is not well-formed XML.
    """
    base_url = 'http://export.arxiv.org/api/query?'
    query = f'search_query=cat:quant-ph&sortBy=submittedDate&sortOrder=descending&max_results={max_results}'

    print(f"Fetching {max_results} recent papers from quant-ph...")
    try:
        response = requests.get(base_url + query, timeout=timeout)
        response.raise_for_status()  # Raises an exception for bad status codes
    except requests.exceptions.RequestException as e:
        print(f"Error fetching from arXiv: {e}")
        return []

    # A truncated or non-XML body (e.g. an HTML error page) must not crash the cell.
    try:
        root = ET.fromstring(response.content)
    except ET.ParseError as e:
        print(f"Error parsing arXiv response: {e}")
        return []

    # The feed is Atom XML, so every element lookup needs the Atom namespace.
    namespace = {'arxiv': 'http://www.w3.org/2005/Atom'}
    papers = []

    for entry in root.findall('arxiv:entry', namespace):
        id_element = entry.find('arxiv:id', namespace)
        title_element = entry.find('arxiv:title', namespace)
        pdf_link = entry.find('arxiv:link[@title="pdf"]', namespace)

        # Compare against None explicitly: childless Elements are falsy.
        if id_element is None or not id_element.text:
            print("  -> Skipping an entry without an id.")
            continue
        paper_id = id_element.text.strip().split('/abs/')[-1]

        if pdf_link is None or 'href' not in pdf_link.attrib:
            print(f"  -> Skipping {paper_id}: no PDF link in the feed.")
            continue

        # Titles are hard-wrapped in the feed with indented continuation
        # lines; collapse every whitespace run to a single space.
        raw_title = title_element.text if title_element is not None else None
        title = ' '.join((raw_title or '').split())

        papers.append({
            'id': paper_id,
            'title': title,
            'pdf_url': pdf_link.attrib['href']
        })

    print(f"Successfully fetched {len(papers)} papers.")
    return papers

# --- Run the ingestor and preview what came back ---
papers_to_analyze = fetch_recent_quant_ph_papers()
# Print at most the first ten papers as a 1-based numbered list.
for position, paper in enumerate(papers_to_analyze[:10], start=1):
    print(f"{position}. {paper['title']} ({paper['id']})")

Fetching 20 recent papers from quant-ph...
Successfully fetched 20 papers.
1. Strong-to-Weak Symmetry Breaking Phases in Steady States of Quantum   Operations (2509.09669v1)
2. Bogoliubov quasi-particles in superconductors are integer-charged   particles inapplicable for braiding quantum information (2509.09663v1)
3. Towards A High-Performance Quantum Data Center Network Architecture (2509.09653v1)
4. Resource quantification for programming low-depth quantum circuits (2509.09642v1)
5. Work statistics of sudden Quantum quenches: A random matrix theory   perspective on Gaussianity and its deviations (2509.09640v1)
6. Reconstructing the Hamiltonian from the local density of states using   neural networks (2509.09604v1)
7. Fault-tolerant transformations of spacetime codes (2509.09603v1)
8. PT symmetry-enriched non-unitary criticality (2509.09587v1)
9. Quantum signatures of proper time in optical ion clocks (2509.09573v1)
10. Vacuum electromagnetic field correlations between two moving poin

In [None]:
# PDF Processing Engine

# Local cache folder for downloaded PDFs (relative to the working directory).
# It is created on first run and reused as-is on later runs.
PDF_DIR = Path('downloaded_pdfs')
PDF_DIR.mkdir(parents=False, exist_ok=True)

def download_and_extract_text(paper, timeout=60):
    """
    Download a paper's PDF (unless already cached) and extract its text.

    Args:
        paper: Dict with at least the keys 'id' and 'pdf_url'.
        timeout: Seconds to wait for the download before giving up.

    Returns:
        A tuple (pdf_path, full_text):
          * (None, "") when the PDF could not be downloaded or saved;
          * (pdf_path, "") when the file exists but text extraction failed;
          * (pdf_path, text) otherwise, where text is the text of every page
            that yielded any, each followed by a newline.
    """
    # Old-style arXiv ids contain a slash (e.g. "quant-ph/0301001v1"), which
    # would otherwise point into a sub-directory that does not exist.
    safe_name = re.sub(r'[^A-Za-z0-9._-]', '_', paper['id'])
    pdf_path = PDF_DIR / f"{safe_name}.pdf"

    # Download the file if it doesn't exist
    if not pdf_path.exists():
        try:
            print(f"Downloading {paper['id']}...")
            response = requests.get(paper['pdf_url'], timeout=timeout)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"  -> Failed to download {paper['id']}: {e}")
            return None, ""

        # Do not cache bodies that are not PDFs (e.g. an HTML error page):
        # a cached bad file would make every later run fail for this paper.
        if b'%PDF' not in response.content[:1024]:
            print(f"  -> Failed to download {paper['id']}: response is not a PDF")
            return None, ""

        # Write to a temporary name and rename afterwards, so an interrupted
        # write never leaves a truncated file that looks like a cache hit.
        tmp_path = pdf_path.with_name(pdf_path.name + ".part")
        try:
            PDF_DIR.mkdir(parents=True, exist_ok=True)
            tmp_path.write_bytes(response.content)
            tmp_path.replace(pdf_path)
        except OSError as e:
            print(f"  -> Failed to save {paper['id']}: {e}")
            try:
                tmp_path.unlink()
            except OSError:
                pass  # nothing to clean up
            return None, ""

    # Extract text using pdfplumber
    page_texts = []
    try:
        with pdfplumber.open(pdf_path) as pdf:
            print(f"  -> Extracting text from {paper['id']}...")
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:  # pages without extractable text are skipped
                    page_texts.append(page_text)
    except Exception as e:
        # Best effort: any parser failure skips this paper rather than
        # aborting the whole run.
        print(f"  -> Failed to extract text from {paper['id']}: {e}")
        return pdf_path, ""

    full_text = "".join(page_text + "\n" for page_text in page_texts)
    print(f"  -> Extracted {len(full_text)} characters.")
    return pdf_path, full_text

# --- We will call this function inside the main loop later ---
# This block just defines the capability.

In [None]:
# The Heuristic Scoring Engine

# This helper function is used by Tiers 1 & 2
def get_evidence_snippets(text, keyword, window=150):
    """
    Return HTML snippets showing each whole-word occurrence of a keyword.

    Args:
        text: The text to search.
        keyword: Literal keyword or phrase, matched case-insensitively on
            word boundaries.
        window: Number of characters of context kept on each side of a match.

    Returns:
        A list with one string per match, of the form
        "...<before><strong><match></strong><after>...". The surrounding text
        and the match are HTML-escaped; only the <strong> tags are real
        markup, so the result can be inserted into an HTML page as-is.
        An empty list is returned for empty text or an empty keyword.
    """
    # An empty keyword would otherwise match at every word boundary.
    if not text or not keyword:
        return []

    pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
    snippets = []
    for match in pattern.finditer(text):
        start = max(0, match.start() - window)
        end = min(len(text), match.end() + window)
        # Escape the three pieces separately and add the tags afterwards;
        # escaping after inserting <strong> would escape the tags themselves.
        before = html.escape(text[start:match.start()])
        hit = html.escape(match.group(0))
        after = html.escape(text[match.end():end])
        snippets.append(f"...{before}<strong>{hit}</strong>{after}...")
    return snippets

def _highlight_keyword_html(text, keyword):
    """Return `text` HTML-escaped, with whole-word matches of `keyword` wrapped in <strong>."""
    pattern = re.compile(r'\b' + re.escape(keyword) + r'\b', re.IGNORECASE)
    pieces = []
    cursor = 0
    for match in pattern.finditer(text):
        pieces.append(html.escape(text[cursor:match.start()]))
        pieces.append(f"<strong>{html.escape(match.group(0))}</strong>")
        cursor = match.end()
    pieces.append(html.escape(text[cursor:]))
    return "".join(pieces)


def score_paper(text):
    """
    Score how likely a paper's text is to contain benchmark results.

    Three heuristic tiers add to the score:
      * Tier 1: weighted whole-word keyword counts over the full text.
      * Tier 2: the same "strong" keywords counted again, at 1.5x weight,
        inside text that follows a results-like heading word.
      * Tier 3: a small base score per table/figure declaration, plus a
        bonus when the text that follows a declaration contains a
        high-value keyword (1.5x for tables).

    Args:
        text: Full extracted text of the paper.

    Returns:
        A tuple (score, evidence). `score` is an int (fractions truncated).
        `evidence` is a defaultdict(list) mapping a human-readable reason to
        a list of HTML snippets in which only <strong> tags are live markup.
        Empty or missing text yields (0, <empty evidence>).
    """
    score = 0
    evidence = defaultdict(list)

    if not text:
        return 0, evidence

    # --- Tier 1: Weighted Keyword Search ---
    # NOTE(review): overlapping keywords ("fidelity" / "gate fidelity") are
    # each counted, so the longer phrase effectively scores twice.
    strong_keywords = {
        "state-of-the-art": 25, "fidelity": 15, "benchmark": 20,
        "quantum volume": 25, "error rate": 15, "gate fidelity": 20,
        "cross-entropy": 15
    }
    platform_keywords = {
        "IBMQ": 10, "Sycamore": 10, "Quantinuum": 10, "IonQ": 10,
        "Rigetti": 10
    }
    all_keywords = {**strong_keywords, **platform_keywords}

    for keyword, weight in all_keywords.items():
        count = len(re.findall(r'\b' + re.escape(keyword) + r'\b', text, re.IGNORECASE))
        if count > 0:
            score += weight * count
            # Get the first snippet as evidence
            evidence[f"Tier 1: Found '{keyword}' ({count}x)"].extend(get_evidence_snippets(text, keyword, 75)[:1])

    # --- Tier 2: Contextual Analysis  ---
    # Capture the text that follows a heading-like word, lazily, up to (but
    # not including) a blank line followed by the next likely heading, or
    # the end of the text.
    # NOTE(review): the heading word is matched anywhere, not only at the
    # start of a line, so running prose such as "the results show" also
    # opens a section.
    section_finder_regex = r'(?:results|experiment|discussion|conclusion|benchmark)\s+(.*?)(?=\n\s*\n\s*(?:[IVX\d]+\.|references|acknowledgments|appendix)|\Z)'
    results_sections = re.findall(section_finder_regex, text, re.DOTALL | re.IGNORECASE)
    context_text = " ".join(results_sections)

    if context_text:
        for keyword, weight in strong_keywords.items():
            count = len(re.findall(r'\b' + re.escape(keyword) + r'\b', context_text, re.IGNORECASE))
            if count > 0:
                score += (weight * 1.5) * count # 50% bonus for being in a key section
                evidence[f"Tier 2: Found '{keyword}' in Results/Experiment"].extend(get_evidence_snippets(context_text, keyword, 75)[:1])

    # --- Tier 3: SMART Structural Analysis ---
    # Find a declaration (e.g. "Table 1") and capture the text that follows
    # it, up to 5 lines. The leading \b keeps words that merely end in
    # "table" or "figure" ("stable 2", "configure 3") from being counted.
    caption_regex = r'\b(Table\s+[IVX\d]+|Figure\s+\d+|Fig\.\s+\d+)\.?\s*((?:[^\n]+\n?){1,5})'

    # Keywords specifically sought after within captions. These indicate data and comparison.
    caption_keywords = {
        "benchmark": 50, "comparison": 40, "performance": 40,
        "fidelity": 30, "error rate": 30, "summary of results": 50
    }

    captions_found = re.findall(caption_regex, text, re.IGNORECASE)

    if captions_found:
        # Base score for the raw count: a high density of tables/figures is a useful signal.
        num_tables = sum(1 for entity, _ in captions_found if 'table' in entity.lower())
        num_figures = len(captions_found) - num_tables
        score += num_tables * 5
        score += num_figures * 2
        evidence["Tier 3: Base Count"].append(f"Found {num_tables} Tables and {num_figures} Figures.")

        # The "smart" part: look for high-value keywords in the caption text itself.
        for entity, caption_text in captions_found: # e.g., entity="Table 1", caption_text="Summary of..."
            for keyword, weight in caption_keywords.items():
                if re.search(r'\b' + re.escape(keyword) + r'\b', caption_text, re.IGNORECASE):
                    # Keywords in a TABLE caption earn a 50% bonus.
                    final_weight = weight * 1.5 if 'table' in entity.lower() else weight
                    score += final_weight

                    # Flatten the caption to one line, then escape it and add
                    # the highlight tags in one pass so the tags stay live.
                    clean_caption = ' '.join(caption_text.strip().splitlines())
                    highlighted_caption = _highlight_keyword_html(clean_caption, keyword)

                    evidence_key = f"Tier 3 SMART: Found '{keyword}' in caption for {entity.strip()}"
                    evidence[evidence_key].append(highlighted_caption)

                    # Only the first matching keyword counts, to avoid score inflation from a single caption.
                    break

    return int(score), evidence

In [None]:
# Digest Generator and Main Execution Loop

def generate_html_digest(scored_papers, filename="benchmark_digest.html"):
    """
    Write a static HTML report of the scored papers, highest score first.

    Args:
        scored_papers: Iterable of dicts with the keys 'id', 'title',
            'score' and 'evidence' (a mapping of reason -> list of snippets).
            Snippets are inserted as-is, so they must already be safe HTML.
        filename: Path of the HTML file to write (UTF-8).

    Returns:
        None. The report is written to `filename` and a confirmation line
        is printed.
    """
    # Sort papers by score, descending
    sorted_papers = sorted(scored_papers, key=lambda p: p['score'], reverse=True)

    # The bar lengths are relative to the best score; compute it once.
    max_score = sorted_papers[0]['score'] if sorted_papers else 0

    # Collect fragments and join once instead of growing one big string.
    # The charset declaration matches the UTF-8 encoding used when writing,
    # so non-ASCII titles and snippets render correctly.
    parts = ["""<!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>Metriq Benchmark Digest</title>
        <style>
            body { font-family: sans-serif; margin: 2em; }
            .paper { border: 1px solid #ccc; padding: 1em; margin-bottom: 1em; border-radius: 5px; }
            h2 { font-size: 1.2em; }
            .score { font-size: 1.5em; font-weight: bold; color: #0056b3; }
            .score-bar { background-color: #e9ecef; border-radius: 3px; }
            .score-bar-inner { background-color: #007bff; height: 10px; border-radius: 3px; }
            .evidence-list { list-style-type: none; padding-left: 0; }
            .evidence-list li { background-color: #f0f0f0; padding: 0.5em; margin-top: 0.5em; border-radius: 3px; font-family: monospace; font-size: 0.9em;}
            strong { color: #d9534f; }
        </style>
    </head>
    <body>
        <h1>Metriq Benchmark Digest</h1>
        <p>Generated for recent quant-ph submissions. Top papers ranked by their likelihood of containing benchmark results.</p>
    """]

    for paper in sorted_papers:
        # Clamp to [0, 100]: a zero/negative best score or a negative paper
        # score must not produce a negative or undefined CSS width.
        if max_score > 0:
            score_percentage = min(100.0, max(0.0, paper['score'] / max_score * 100))
        else:
            score_percentage = 0.0

        paper_id = html.escape(str(paper['id']))

        parts.append(f"""
        <div class="paper">
            <h2><a href="http://arxiv.org/abs/{paper_id}" target="_blank">{html.escape(paper['title'])}</a></h2>
            <p>arXiv ID: {paper_id}</p>
            <p class="score">Likelihood Score: {paper['score']}</p>
            <div class="score-bar"><div class="score-bar-inner" style="width: {score_percentage:.1f}%;"></div></div>
            <h3>Evidence:</h3>
            <ul class="evidence-list">
        """)
        for reason, snippets in paper['evidence'].items():
            for snippet in snippets:
                # Snippets carry their own <strong> markup and are not re-escaped.
                parts.append(f"<li><b>{html.escape(reason)}:</b> {snippet}</li>")

        parts.append("</ul></div>")

    parts.append("</body></html>")

    with open(filename, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

    print(f"\nDigest generated! See the file '{filename}' in the file browser on the left.")

if __name__ == "__main__":
    # Reuse the list fetched for the preview above so the digest covers the
    # same papers that were displayed and the arXiv API is not queried twice.
    # Only fetch again if that earlier request came back empty.
    papers = papers_to_analyze or fetch_recent_quant_ph_papers(max_results=20) # keeping it small for a demo

    scored_results = []

    # Process and score each paper
    for paper in papers:
        print("-" * 20)
        # The returned path is not needed here; only the extracted text is scored.
        _, text = download_and_extract_text(paper)
        if text:
            score, evidence = score_paper(text)
            print(f"  -> Final Score for {paper['id']}: {score}")
            paper_data = paper.copy()
            paper_data['score'] = score
            paper_data['evidence'] = evidence
            scored_results.append(paper_data)
        else:
            print(f"  -> Skipping {paper['id']} due to processing error.")

    # Generate the final report
    if scored_results:
        generate_html_digest(scored_results)
    else:
        # Say so explicitly instead of finishing silently without a report.
        print("\nNo papers could be scored; no digest was generated.")

Fetching 20 recent papers from quant-ph...
Successfully fetched 20 papers.
--------------------
  -> Extracting text from 2509.09669v1...
  -> Extracted 141869 characters.
  -> Final Score for 2509.09669v1: -2347
--------------------
  -> Extracting text from 2509.09663v1...
  -> Extracted 36575 characters.
  -> Final Score for 2509.09663v1: -119
--------------------
  -> Extracting text from 2509.09653v1...
  -> Extracted 28818 characters.
  -> Final Score for 2509.09653v1: 670
--------------------
  -> Extracting text from 2509.09642v1...
  -> Extracted 65592 characters.
  -> Final Score for 2509.09642v1: -660
--------------------
  -> Extracting text from 2509.09640v1...
  -> Extracted 51013 characters.
  -> Final Score for 2509.09640v1: -318
--------------------
  -> Extracting text from 2509.09604v1...
  -> Extracted 42299 characters.
  -> Final Score for 2509.09604v1: 3
--------------------
  -> Extracting text from 2509.09603v1...
  -> Extracted 195290 characters.
  -> Final Sco



  -> Extracted 49086 characters.
  -> Final Score for 2509.09538v1: -56
--------------------
  -> Extracting text from 2509.09517v1...
  -> Extracted 83519 characters.
  -> Final Score for 2509.09517v1: -245
--------------------
  -> Extracting text from 2509.09477v1...
  -> Extracted 27238 characters.
  -> Final Score for 2509.09477v1: 4
--------------------
  -> Extracting text from 2509.09476v1...
  -> Extracted 62728 characters.
  -> Final Score for 2509.09476v1: -204
--------------------
  -> Extracting text from 2509.09465v1...
  -> Extracted 86300 characters.
  -> Final Score for 2509.09465v1: 38
--------------------
  -> Extracting text from 2509.09464v1...




  -> Extracted 96338 characters.
  -> Final Score for 2509.09464v1: 936
--------------------
  -> Extracting text from 2509.09432v1...
  -> Extracted 49948 characters.
  -> Final Score for 2509.09432v1: 170
--------------------
  -> Extracting text from 2509.09423v1...
  -> Extracted 36582 characters.
  -> Final Score for 2509.09423v1: -25
--------------------
  -> Extracting text from 2509.09421v1...
  -> Extracted 83154 characters.
  -> Final Score for 2509.09421v1: -184
--------------------
  -> Extracting text from 2509.09402v1...
  -> Extracted 32077 characters.
  -> Final Score for 2509.09402v1: -76

Digest generated! See the file 'benchmark_digest.html' in the file browser on the left.
