In [3]:
import os
import json
import aiohttp
import asyncio
from pathlib import Path
from tqdm import tqdm
import nest_asyncio

jsonl_path = "litqa-v2-public.jsonl"
output_dir = Path("./litqa_pdfs")
output_dir.mkdir(parents=True, exist_ok=True)

def extract_dois(jsonl_file):
    dois = set()
    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            entry = json.loads(line)
            sources = entry.get("sources", [])
            for src in sources:
                if "doi.org" in src:
                    dois.add(src.strip())
    return list(dois)

async def fetch_pdf(session, doi_url, save_path):
    headers = {"User-Agent": "Mozilla/5.0"}
    
    if save_path.exists():
        return "exists"

    try:
        async with session.get(doi_url, headers=headers, timeout=30) as resp:
            text = await resp.text()
            if ".pdf" in text:
                start = text.find("https://")
                end = text.find(".pdf", start)
                if start != -1 and end != -1:
                    pdf_url = text[start:end + 4]

                    async with session.get(pdf_url, headers=headers, timeout=30) as pdf_resp:
                        content = await pdf_resp.read()
                        with open(save_path, "wb") as f:
                            f.write(content)
                        return "downloaded"
    except Exception as e:
        return f"error: {e}"

    return "not_found"

async def download_all(dois):
    async with aiohttp.ClientSession() as session:
        results = []
        for doi in tqdm(dois):
            filename = doi.split("/")[-1] + ".pdf"
            path = output_dir / filename
            result = await fetch_pdf(session, doi, path)
            results.append((doi, result))
        return results

dois = extract_dois(jsonl_path)
nest_asyncio.apply()
results = await download_all(dois)

with open("download_log.txt", "w") as log:
    for doi, status in results:
        log.write(f"{doi}\t{status}\n")

print("All pdf saved in ./litqa_pdfs/")

100%|██████████| 191/191 [03:08<00:00,  1.02it/s]

All pdf saved in ./litqa_pdfs/





In [4]:
import os
import json
import aiohttp
import asyncio
import urllib.parse
from pathlib import Path
from tqdm import tqdm
import nest_asyncio

# File paths
jsonl_path = "litqa-v2-public.jsonl"
output_dir = Path("./litqa_pdfs")
output_dir.mkdir(parents=True, exist_ok=True)

# Map DOI or URL to a safe filename
def safe_filename(doi_or_url: str) -> str:
    return urllib.parse.quote_plus(doi_or_url) + ".pdf"

# Extract all unique source DOIs from jsonl file
def extract_dois(jsonl_file):
    dois = set()
    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            entry = json.loads(line)
            for src in entry.get("sources", []):
                if "doi.org" in src or "arxiv.org" in src:
                    dois.add(src.strip())
    return sorted(list(dois))

# Try downloading PDF from a given DOI/URL
async def fetch_pdf(session, doi_url, save_path):
    headers = {"User-Agent": "Mozilla/5.0"}

    if save_path.exists() and save_path.stat().st_size > 10_000:
        return "exists"

    try:
        # Special case: arXiv
        if "arxiv.org" in doi_url:
            arxiv_id = doi_url.split("/")[-1]
            pdf_url = f"https://arxiv.org/pdf/{arxiv_id}.pdf"
            async with session.get(pdf_url, headers=headers, timeout=30) as resp:
                if resp.status == 200 and "application/pdf" in resp.headers.get("Content-Type", ""):
                    content = await resp.read()
                    with open(save_path, "wb") as f:
                        f.write(content)
                    return "arxiv_downloaded"
                return f"arxiv_fail_{resp.status}"

        # Fallback: generic DOI
        async with session.get(doi_url, headers=headers, timeout=30, allow_redirects=True) as resp:
            if resp.status == 200 and "application/pdf" in resp.headers.get("Content-Type", ""):
                content = await resp.read()
                with open(save_path, "wb") as f:
                    f.write(content)
                return "pdf_downloaded"
            else:
                return f"not_pdf_or_fail_{resp.status}"

    except Exception as e:
        return f"error: {str(e)}"

# Main loop for downloading all papers
async def download_all(dois):
    results = []
    async with aiohttp.ClientSession() as session:
        for doi in tqdm(dois):
            filename = safe_filename(doi)
            path = output_dir / filename
            result = await fetch_pdf(session, doi, path)
            results.append((doi, filename, result))
    return results

# Run
dois = extract_dois(jsonl_path)
nest_asyncio.apply()
results = await download_all(dois)

# Save download log
with open("download_log.txt", "w") as f:
    for doi, fname, status in results:
        f.write(f"{doi}\t{fname}\t{status}\n")

print(f"Finished downloading {len(results)} documents.")


100%|██████████| 191/191 [01:26<00:00,  2.20it/s]

Finished downloading 191 documents.





In [5]:
import os
import json
import aiohttp
import asyncio
import urllib.parse
from pathlib import Path
from tqdm import tqdm
import nest_asyncio

# Configuration
jsonl_path = "litqa-v2-public.jsonl"
output_dir = Path("./litqa_pdfs")
email = "xc392@cam.ac.uk"
output_dir.mkdir(parents=True, exist_ok=True)

# Helpers
def safe_filename(doi: str) -> str:
    return urllib.parse.quote_plus(doi) + ".pdf"

def extract_dois(jsonl_file):
    dois = set()
    with open(jsonl_file, "r", encoding="utf-8") as f:
        for line in f:
            entry = json.loads(line)
            for src in entry.get("sources", []):
                if "doi.org" in src:
                    dois.add(src.strip().split("doi.org/")[-1])
    return sorted(list(dois))

# Download via Unpaywall
async def get_pdf_url_from_unpaywall(session, doi: str) -> str | None:
    api_url = f"https://api.unpaywall.org/v2/{doi}?email={email}"
    try:
        async with session.get(api_url, timeout=15) as resp:
            if resp.status != 200:
                return None
            data = await resp.json()
            pdf_info = data.get("best_oa_location", {})
            return pdf_info.get("url_for_pdf", None)
    except Exception:
        return None

async def download_pdf(session, url: str, save_path: Path) -> bool:
    try:
        async with session.get(url, timeout=30) as resp:
            if resp.status == 200 and "application/pdf" in resp.headers.get("Content-Type", ""):
                content = await resp.read()
                with open(save_path, "wb") as f:
                    f.write(content)
                return True
    except Exception:
        pass
    return False

# Main download loop
async def download_all_via_unpaywall(dois):
    results = []
    async with aiohttp.ClientSession() as session:
        for doi in tqdm(dois):
            filename = safe_filename(doi)
            save_path = output_dir / filename

            if save_path.exists() and save_path.stat().st_size > 10_000:
                results.append((doi, "exists"))
                continue

            pdf_url = await get_pdf_url_from_unpaywall(session, doi)
            if not pdf_url:
                results.append((doi, "no_pdf"))
                continue

            success = await download_pdf(session, pdf_url, save_path)
            results.append((doi, "downloaded" if success else "fail_download"))
    return results

# Run
dois = extract_dois(jsonl_path)
nest_asyncio.apply()
results = await download_all_via_unpaywall(dois)

# Save log
with open("download_log_unpaywall.txt", "w") as f:
    for doi, status in results:
        f.write(f"{doi}\t{status}\n")

print(f"Finished. Downloaded {sum(1 for _, s in results if s == 'downloaded')} PDFs.")


100%|██████████| 190/190 [02:59<00:00,  1.06it/s]

Finished. Downloaded 58 PDFs.





In [None]:
import os
import json5
import aiohttp
import asyncio
import urllib.parse
from pathlib import Path
from tqdm import tqdm
import nest_asyncio

# Config
split_file = "2024-10-16_litqa2-splits.json5"
output_dir = Path("./litqa_pdfs")
email = "xc392@cam.ac.uk"
output_dir.mkdir(parents=True, exist_ok=True)

# Parse JSON5
with open(split_file, "r", encoding="utf-8") as f:
    split_data = json5.load(f)

eval_dois = sorted(set(split_data.get("eval", {}).get("dois", [])))
print(f"✅ Found {len(eval_dois)} eval DOIs")

# Unpaywall Downloader
async def get_pdf_url_from_unpaywall(session, doi: str) -> str | None:
    api_url = f"https://api.unpaywall.org/v2/{doi}?email={email}"
    try:
        async with session.get(api_url, timeout=15) as resp:
            if resp.status != 200:
                return None
            data = await resp.json()
            pdf_info = data.get("best_oa_location", {})
            return pdf_info.get("url_for_pdf", None)
    except Exception:
        return None

async def download_pdf(session, url: str, save_path: Path) -> bool:
    try:
        async with session.get(url, timeout=30) as resp:
            if resp.status == 200 and "application/pdf" in resp.headers.get("Content-Type", ""):
                content = await resp.read()
                with open(save_path, "wb") as f:
                    f.write(content)
                return True
    except Exception:
        pass
    return False

async def download_eval_dois(dois):
    results = []
    async with aiohttp.ClientSession() as session:
        for doi in tqdm(dois):
            filename = urllib.parse.quote_plus(doi) + ".pdf"
            save_path = output_dir / filename

            if save_path.exists() and save_path.stat().st_size > 10_000:
                results.append((doi, "exists"))
                continue

            pdf_url = await get_pdf_url_from_unpaywall(session, doi)
            if not pdf_url:
                results.append((doi, "no_pdf"))
                continue

            success = await download_pdf(session, pdf_url, save_path)
            results.append((doi, "downloaded" if success else "fail_download"))
    return results

# Run It
nest_asyncio.apply()
results = await download_eval_dois(eval_dois)

# Log It
with open("download_eval_log.txt", "w") as log_file:
    for doi, status in results:
        log_file.write(f"{doi}\t{status}\n")

print(f"Finished downloading. {sum(1 for _, s in results if s == 'downloaded')} PDFs downloaded.")


If download all the pdfs of the eval dois, there are 5456 to be downloaded, that's too many! So we only download those DOIs that are actually cited by the question used by eval split.

In [1]:
import os
import json
import json5
import aiohttp
import asyncio
import urllib.parse
from pathlib import Path
from tqdm import tqdm
import nest_asyncio

# Input files
split_file = "2024-10-16_litqa2-splits.json5"
jsonl_file = "litqa-v2-public.jsonl"

# Output directory
output_dir = Path("./litqa_pdfs")
output_dir.mkdir(parents=True, exist_ok=True)

# Contact email for Unpaywall
email = "xc392@cam.ac.uk"

# Extract eval question IDs from JSON5 split file
with open(split_file, "r", encoding="utf-8") as f:
    split_data = json5.load(f)
eval_ids = set(split_data.get("eval", {}).get("question_ids", []))

# Extract DOIs from jsonl questions matching eval_ids
eval_dois = set()
with open(jsonl_file, "r", encoding="utf-8") as f:
    for line in f:
        entry = json.loads(line)
        if entry.get("id") in eval_ids:
            for src in entry.get("sources", []):
                if "doi.org" in src:
                    doi = src.strip().split("doi.org/")[-1]
                    eval_dois.add(doi)

print(f"Found {len(eval_dois)} DOIs for eval questions")

# Use Unpaywall API to resolve free PDF URLs
async def get_pdf_url_from_unpaywall(session, doi: str) -> str | None:
    api_url = f"https://api.unpaywall.org/v2/{doi}?email={email}"
    try:
        async with session.get(api_url, timeout=15) as resp:
            if resp.status != 200:
                return None
            data = await resp.json()
            pdf_info = data.get("best_oa_location", {})
            return pdf_info.get("url_for_pdf", None)
    except Exception:
        return None

# Download the PDF file from a given URL
async def download_pdf(session, url: str, save_path: Path) -> bool:
    try:
        async with session.get(url, timeout=30) as resp:
            if resp.status == 200 and "application/pdf" in resp.headers.get("Content-Type", ""):
                content = await resp.read()
                with open(save_path, "wb") as f:
                    f.write(content)
                return True
    except Exception:
        pass
    return False

# Batch download all DOIs using aiohttp
async def download_eval_dois(dois):
    results = []
    async with aiohttp.ClientSession() as session:
        for doi in tqdm(sorted(dois)):
            filename = urllib.parse.quote_plus(doi) + ".pdf"
            save_path = output_dir / filename

            if save_path.exists() and save_path.stat().st_size > 10_000:
                results.append((doi, "exists"))
                continue

            pdf_url = await get_pdf_url_from_unpaywall(session, doi)
            if not pdf_url:
                results.append((doi, "no_pdf"))
                continue

            success = await download_pdf(session, pdf_url, save_path)
            results.append((doi, "downloaded" if success else "fail_download"))
    return results

# Run the download pipeline
nest_asyncio.apply()
results = await download_eval_dois(eval_dois)

# Log download results
with open("download_eval_log.txt", "w") as log_file:
    for doi, status in results:
        log_file.write(f"{doi}\t{status}\n")

print(f"Finished downloading. {sum(1 for _, s in results if s == 'downloaded')} PDFs downloaded.")


Found 40 DOIs for eval questions


100%|██████████| 40/40 [03:30<00:00,  5.25s/it]

Finished downloading. 12 PDFs downloaded.





In [None]:
# Get manifest.json
import urllib.parse
import json
from pathlib import Path

pdf_dir = Path("litqa_pdfs")
manifest = {}

for pdf_path in pdf_dir.glob("*.pdf"):
    decoded_doi = urllib.parse.unquote_plus(pdf_path.stem)
    manifest[decoded_doi] = {
        "doi": decoded_doi,
        "file_path": str(pdf_path.name)
    }

with open("manifest.jsonl", "w") as f:
    json.dump(manifest, f, indent=2)

print(f"Generated manifest with {len(manifest)} entries.")

Generated manifest with 14 entries.


In [None]:
import os
import json
import json5
import aiohttp
import asyncio
import urllib.parse
from pathlib import Path
from tqdm import tqdm
import nest_asyncio

# Input files
split_file = "2024-10-16_litqa2-splits.json5"
jsonl_file = "litqa-v2-public.jsonl"

# Output directory
output_dir = Path("./litqa_pdfs")
output_dir.mkdir(parents=True, exist_ok=True)

# Output file for manual download list
manual_json_path = "manual_download_needed.json"

# Contact email for Unpaywall API
email = "xc392@cam.ac.uk"

# Load train + eval question IDs from JSON5 split file
with open(split_file, "r", encoding="utf-8") as f:
    split_data = json5.load(f)
train_ids = set(split_data.get("train", {}).get("question_ids", []))
eval_ids = set(split_data.get("eval", {}).get("question_ids", []))
combined_ids = train_ids | eval_ids
print(f"Total questions to process (train + eval): {len(combined_ids)}")

# Extract DOIs from jsonl for the selected questions
combined_doi_map = {}  # key: question_id, value: set of DOIs
with open(jsonl_file, "r", encoding="utf-8") as f:
    for line in f:
        entry = json.loads(line)
        qid = entry.get("id")
        if qid in combined_ids:
            for src in entry.get("sources", []):
                if "doi.org" in src:
                    doi = src.strip().split("doi.org/")[-1]
                    combined_doi_map.setdefault(qid, set()).add(doi)

# Flatten to unique DOIs
combined_dois = set(doi for dois in combined_doi_map.values() for doi in dois)
print(f"Found {len(combined_dois)} unique DOIs for train + eval questions")

# Resolve a DOI to a PDF URL via Unpaywall
async def get_pdf_url_from_unpaywall(session, doi: str) -> str | None:
    api_url = f"https://api.unpaywall.org/v2/{doi}?email={email}"
    try:
        async with session.get(api_url, timeout=15) as resp:
            if resp.status != 200:
                return None
            data = await resp.json()
            pdf_info = data.get("best_oa_location", {})
            return pdf_info.get("url_for_pdf", None)
    except Exception:
        return None

# Download a PDF from a URL to a specified path
async def download_pdf(session, url: str, save_path: Path) -> bool:
    try:
        async with session.get(url, timeout=30) as resp:
            if resp.status == 200 and "application/pdf" in resp.headers.get("Content-Type", ""):
                content = await resp.read()
                with open(save_path, "wb") as f:
                    f.write(content)
                return True
    except Exception:
        pass
    return False

# Download all DOIs and track failures for manual handling
async def download_all_dois(dois, doi_to_qids):
    results = []
    manual_entries = []
    async with aiohttp.ClientSession() as session:
        for doi in tqdm(sorted(dois)):
            filename = urllib.parse.quote_plus(doi) + ".pdf"
            save_path = output_dir / filename

            if save_path.exists() and save_path.stat().st_size > 10_000:
                results.append((doi, "exists"))
                continue

            pdf_url = await get_pdf_url_from_unpaywall(session, doi)
            if not pdf_url:
                results.append((doi, "no_pdf"))
                for qid in doi_to_qids.get(doi, []):
                    manual_entries.append({
                        "question_id": qid,
                        "doi": doi,
                        "download_url": None
                    })
                continue

            success = await download_pdf(session, pdf_url, save_path)
            status = "downloaded" if success else "fail_download"
            results.append((doi, status))
            if not success:
                for qid in doi_to_qids.get(doi, []):
                    manual_entries.append({
                        "question_id": qid,
                        "doi": doi,
                        "download_url": pdf_url
                    })
    return results, manual_entries

# Build DOI to question ID reverse mapping
doi_to_qids = {}
for qid, dois in combined_doi_map.items():
    for doi in dois:
        doi_to_qids.setdefault(doi, []).append(qid)

# Run the download process
nest_asyncio.apply()
results, manual_entries = await download_all_dois(combined_dois, doi_to_qids)

# Save log file
with open("download_eval_log.txt", "w") as log_file:
    for doi, status in results:
        log_file.write(f"{doi}\t{status}\n")

# Save failed or missing download information
with open(manual_json_path, "w", encoding="utf-8") as f:
    json.dump(manual_entries, f, indent=2)

print(f"Finished downloading. {sum(1 for _, s in results if s == 'downloaded')} PDFs downloaded.")
print(f"Manual download list saved to {manual_json_path} with {len(manual_entries)} entries.")

In [5]:
import os
import json
import aiohttp
import asyncio
import urllib.parse
from pathlib import Path
from tqdm import tqdm
import nest_asyncio

# File paths
jsonl_file = "litqa-v2-public.jsonl"
output_dir = Path("./litqa_pdfs")
manual_output_file = "manual_download_needed.json"

# Contact email for Unpaywall
email = "xc392@cam.ac.uk"

# Ensure output directory exists
output_dir.mkdir(parents=True, exist_ok=True)

# Extract all (question_id, doi) pairs from litqa-v2-public.jsonl
question_doi_pairs = []
with open(jsonl_file, "r", encoding="utf-8") as f:
    for line in f:
        entry = json.loads(line)
        qid = entry.get("id")
        for src in entry.get("sources", []):
            if "doi.org" in src:
                doi = src.strip().split("doi.org/")[-1]
                question_doi_pairs.append((qid, doi))

# Use Unpaywall API to resolve PDF URLs
async def get_pdf_url_from_unpaywall(session, doi: str) -> str | None:
    api_url = f"https://api.unpaywall.org/v2/{doi}?email={email}"
    try:
        async with session.get(api_url, timeout=15) as resp:
            if resp.status != 200:
                return None
            data = await resp.json()
            pdf_info = data.get("best_oa_location", {})
            return pdf_info.get("url_for_pdf", None)
    except Exception:
        return None

# Download the PDF from resolved URL
async def download_pdf(session, url: str, save_path: Path) -> bool:
    try:
        async with session.get(url, timeout=30) as resp:
            if resp.status == 200 and "application/pdf" in resp.headers.get("Content-Type", ""):
                content = await resp.read()
                with open(save_path, "wb") as f:
                    f.write(content)
                return True
    except Exception:
        pass
    return False

# Batch download all DOIs, record manual entries if failed
async def download_all():
    results = []
    manual_entries = []
    async with aiohttp.ClientSession() as session:
        for qid, doi in tqdm(question_doi_pairs):
            filename = urllib.parse.quote_plus(doi) + ".pdf"
            save_path = output_dir / filename

            if save_path.exists() and save_path.stat().st_size > 10_000:
                results.append((qid, doi, "exists"))
                continue

            pdf_url = await get_pdf_url_from_unpaywall(session, doi)
            if not pdf_url:
                results.append((qid, doi, "no_pdf"))
                manual_entries.append({
                    "question_id": qid,
                    "doi": doi,
                    "download_url": None
                })
                continue

            success = await download_pdf(session, pdf_url, save_path)
            status = "downloaded" if success else "fail_download"
            results.append((qid, doi, status))

            if not success:
                manual_entries.append({
                    "question_id": qid,
                    "doi": doi,
                    "download_url": pdf_url
                })
    return results, manual_entries

# Run the download pipeline
nest_asyncio.apply()
results, manual_entries = asyncio.run(download_all())

# Save failed entries for manual download
with open(manual_output_file, "w", encoding="utf-8") as f:
    json.dump(manual_entries, f, indent=2, ensure_ascii=False)

# Optional: print summary
print(f"Total attempted: {len(results)}")
print(f"Downloaded: {sum(1 for _, _, s in results if s == 'downloaded')}")
print(f"Failed (manual): {len(manual_entries)}")


100%|██████████| 205/205 [02:43<00:00,  1.26it/s]

Total attempted: 205
Downloaded: 58
Failed (manual): 147





In [154]:
doi = '10.1126/science.abk2432'
from urllib.parse import quote_plus
filename = quote_plus(doi) + '.pdf'
print(filename)
print("https://doi.org/" + doi)

10.1126%2Fscience.abk2432.pdf
https://doi.org/10.1126/science.abk2432
