In [None]:
import os
import re

import dotenv
import google.generativeai as genai
from google.genai import types

# Load variables from a .env file into the process environment so that the
# os.getenv("GEMINI_API_KEY") lookup in the next cell can find the key.
dotenv.load_dotenv()

True

In [21]:
# Landing page of the paper to process.
paper_url = "https://arxiv.org/abs/2406.18518"

# Configure the Gemini client once; `model` is reused by the extraction cells below.
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
model = genai.GenerativeModel("gemini-2.5-flash")

# Gemini prompt to get PDF URL from the provided paper URL
prompt = f"""
Extract the direct PDF download link for this paper URL:
{paper_url}
Only return a valid downloadable link.
"""

response = model.generate_content(
    contents=prompt
)

# The model answers in free text, so the link may arrive wrapped in Markdown,
# backticks or a sentence. Pull out the first http(s) URL instead of trusting
# the raw reply, and fail with a readable message when there is none.
raw_answer = response.text.strip()
url_match = re.search(r"https?://[^\s<>()\[\]`\"']+", raw_answer)
if url_match is None:
    raise ValueError(f"Gemini did not return a usable PDF URL: {raw_answer!r}")

# Drop sentence punctuation that may trail the link in a prose answer.
pdf_url = url_match.group(0).rstrip(".,;")
print("PDF URL:", pdf_url)

import requests

pdf_filename = "paper.pdf"
# Download under a temporary name first: later cells read paper.pdf, so a
# failed or interrupted transfer must never leave a truncated file there.
partial_filename = pdf_filename + ".part"

# (connect, read) timeouts in seconds; without them requests can block forever.
with requests.get(pdf_url, stream=True, timeout=(10, 60)) as r:
    r.raise_for_status()
    with open(partial_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:
                f.write(chunk)

# Rename only after the whole body was written successfully.
os.replace(partial_filename, pdf_filename)

print("PDF downloaded as:", pdf_filename)


PDF URL: https://arxiv.org/pdf/2406.18518.pdf
PDF downloaded as: paper.pdf


In [22]:
def extract_references_from_pdf(pdf_path):
    """Ask Gemini for the paper's references as Markdown.

    The file at ``pdf_path`` is read as raw bytes and sent inline, together
    with a fixed instruction prompt, to the notebook-level ``model`` in a
    single user turn.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of the model's response with surrounding whitespace stripped.
    """
    instruction = "".join([
        "Read this PDF research paper. ",
        "Extract all references and their links in Markdown format. ",
        "Place main reference entries first, and external/resource links (if any) at the bottom. ",
        "Format: Each reference as a Markdown list item, link as [text](url).",
    ])

    with open(pdf_path, "rb") as pdf_file:
        document = {"mime_type": "application/pdf", "data": pdf_file.read()}

    # One user turn: the instruction text followed by the inline PDF payload.
    user_turn = {
        "role": "user",
        "parts": [{"text": instruction}, {"inline_data": document}],
    }
    reply = model.generate_content(contents=[user_turn])
    return reply.text.strip()

# Extract the references from the downloaded paper and store them as Markdown.
pdf_filename = "paper.pdf"
references_md = extract_references_from_pdf(pdf_filename)

with open("Reference.md", "w", encoding="utf-8") as f:
    # Heading first, then the Markdown list returned by the model.
    f.writelines(["# References\n\n", references_md])

print("Extracted references saved to Reference.md")


Extracted references saved to Reference.md


In [23]:
def extract_datasets_from_pdf(pdf_path):
    """Ask Gemini which datasets/benchmarks the paper uses, as Markdown.

    The PDF at ``pdf_path`` is read as bytes and attached inline to a single
    user message sent through the notebook-level ``model``.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The model's response text with leading/trailing whitespace removed.
    """
    with open(pdf_path, "rb") as source:
        raw_pdf = source.read()

    # Prompt for Gemini: tune the wording for your PDFs / research area.
    request_text = (
        "Read this PDF research paper. "
        "List ALL public/open-source datasets or benchmarks used or most relevant to this work. "
        "For each, include a short description, and if possible, a direct link. "
        "Format the answer as a Markdown list."
    )

    text_part = {"text": request_text}
    pdf_part = {"inline_data": {"mime_type": "application/pdf", "data": raw_pdf}}
    message = {"role": "user", "parts": [text_part, pdf_part]}

    answer = model.generate_content(contents=[message])
    return answer.text.strip()

# Usage: run the dataset extraction on the downloaded paper and save the result.
datasets_md = extract_datasets_from_pdf(pdf_filename)
with open("dataset.md", "w", encoding="utf-8") as f:
    # Heading first, then the Markdown list returned by the model.
    f.writelines(["# Datasets / Benchmarks\n\n", datasets_md])
print("Related datasets saved to dataset.md")


Related datasets saved to dataset.md


In [26]:
def extract_code_repos_from_pdf(pdf_path):
    """Ask Gemini for code repositories related to the paper, as Markdown.

    The PDF at ``pdf_path`` is read as bytes and sent inline, together with a
    fixed instruction prompt, to the notebook-level ``model`` in one user turn.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The model's response text with surrounding whitespace stripped.
    """
    with open(pdf_path, "rb") as f:
        pdf_bytes = f.read()

    # Adjacent literals are concatenated verbatim, so every sentence except the
    # last must end with a space; otherwise the prompt runs sentences together.
    prompt = (
        "Read this PDF research paper. "
        "Identify all official or relevant open-source code repositories (e.g., GitHub, Hugging Face, etc) mentioned, used, or best suited for this research. "
        "At last, consider the best repo, which can more effectively fulfill the provided research paper requirement and extract and provide the code from it. "
        "Return your answer as a Markdown list."
    )

    response = model.generate_content(
        contents=[
            {
                "role": "user",
                "parts": [
                    {"text": prompt},
                    {
                        "inline_data": {
                            "mime_type": "application/pdf",
                            "data": pdf_bytes
                        }
                    }
                ]
            }
        ]
    )
    return response.text.strip()

# Usage: run the repository extraction on the downloaded paper and save the result.
code_repos_md = extract_code_repos_from_pdf(pdf_filename)
with open("code-repo.md", "w", encoding="utf-8") as f:
    # Heading first, then the Markdown list returned by the model.
    f.writelines(["# Code Repositories\n\n", code_repos_md])
print("Related code repositories saved to code-repo.md")


Related code repositories saved to code-repo.md
