### Paper Collection from arXiv, Semantic Scholar and OpenAlex

### Import some required library

In [1]:
import os
import re
import json
import arxiv
import yaml
import logging
import argparse
import datetime
import requests
import pandas as pd
from datetime import datetime, timedelta

#### Collect Paper from arXiv

In [4]:
# arxiv_scraper.py
import arxiv
import json
from datetime import datetime, timedelta

def fetch_arxiv_papers(query, max_results=300, time_range="month"):
    end_date = datetime.now()
    if time_range == "week":
        start_date = end_date - timedelta(days=7)
    elif time_range == "months":
        start_date = end_date - timedelta(days=30)
    else:
        start_date = end_date - timedelta(days=30)

    time_filter = f"submittedDate:[{start_date.strftime('%Y%m%d')}+TO+{end_date.strftime('%Y%m%d')}]"
    full_query = f"{query}+AND+{time_filter}"

    search = arxiv.Search(
        query=full_query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate
    )

    results = []
    for r in search.results():
        paper = {
            "source": "arxiv",
            "paper_id": r.get_short_id(),
            "title": r.title,
            "authors": [str(a) for a in r.authors],
            "abstract": r.summary,
            "publish_date": str(r.published.date()),
            "url": f"http://arxiv.org/abs/{r.get_short_id()}",
            "categories": r.categories
        }
        results.append(paper)

    return results

if __name__ == "__main__":
    topics = [
    "artificial intelligence",
    "machine learning",
    "computer vision",
    "natural language processing",
    "robotics and human-computer interaction"
    ]

    all_papers = []

    for t in topics:
        papers = fetch_arxiv_papers(t, max_results=300, time_range="month")
        all_papers.extend(papers)
        print(f"Fetched {len(papers)} from arXiv for '{t}'")

    with open("arxiv_papers.jsonl", "w", encoding="utf-8") as f:
        for p in all_papers:
            f.write(json.dumps(p, ensure_ascii=False) + "\n")

    print(f"Saved total {len(all_papers)} papers from arXiv.")


  for r in search.results():


Fetched 300 from arXiv for 'artificial intelligence'
Fetched 300 from arXiv for 'machine learning'
Fetched 300 from arXiv for 'computer vision'
Fetched 300 from arXiv for 'natural language processing'
Fetched 300 from arXiv for 'robotics and human-computer interaction'
Saved total 1500 papers from arXiv.


#### Collect Paper from Semantic Scholar

In [5]:
!pip install semanticscholar


Collecting semanticscholar
  Downloading semanticscholar-0.11.0-py3-none-any.whl.metadata (3.8 kB)
Collecting tenacity (from semanticscholar)
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Downloading semanticscholar-0.11.0-py3-none-any.whl (26 kB)
Downloading tenacity-9.1.2-py3-none-any.whl (28 kB)
Installing collected packages: tenacity, semanticscholar

   -------------------- ------------------- 1/2 [semanticscholar]
   -------------------- ------------------- 1/2 [semanticscholar]
   ---------------------------------------- 2/2 [semanticscholar]

Successfully installed semanticscholar-0.11.0 tenacity-9.1.2


In [8]:
# semantic_scholar_scraper.py
import json
import time
from semanticscholar import SemanticScholar

sch = SemanticScholar()


def fetch_s2_papers(query, limit=200):
    print(f"Searching Semantic Scholar for '{query}' ...")

    results = sch.search_paper(
        query=query,
        limit=limit,
        fields=[
            "paperId",
            "title",
            "abstract",
            "authors",
            "year",
            "url",
            "venue",
            "fieldsOfStudy",
            "citationCount"
        ]
    )

    papers = []

    for p in results:
        paper = {
            "source": "semantic_scholar",
            "paper_id": p.paperId,
            "title": p.title,
            "authors": [a.name for a in p.authors] if p.authors else [],
            "abstract": p.abstract or "",
            "publish_date": str(p.year) if p.year else "",
            "venue": p.venue or "",
            "fields_of_study": p.fieldsOfStudy or [],
            "citation_count": p.citationCount if p.citationCount is not None else 0,
            "url": p.url or ""
        }
        papers.append(paper)

    return papers


if __name__ == "__main__":

    topics = [
        "artificial intelligence",
        "machine learning",
        "computer vision",
        "natural language processing",
        "robotics and human-computer interaction"
    ]

    all_papers = []

    for t in topics:
        s2papers = fetch_s2_papers(t, limit=100)
        all_papers.extend(s2papers)
        print(f"Got {len(s2papers)} papers from Semantic Scholar for '{t}'")

        time.sleep(1)

    output_file = "semantic_scholar_papers.jsonl"
    with open(output_file, "w", encoding="utf-8") as f:
        for p in all_papers:
            f.write(json.dumps(p, ensure_ascii=False) + "\n")

    print(f"Saved total {len(all_papers)} papers to {output_file}")


Searching Semantic Scholar for 'artificial intelligence' ...
Got 1000 papers from Semantic Scholar for 'artificial intelligence'
Searching Semantic Scholar for 'machine learning' ...
Got 1000 papers from Semantic Scholar for 'machine learning'
Searching Semantic Scholar for 'computer vision' ...
Got 1000 papers from Semantic Scholar for 'computer vision'
Searching Semantic Scholar for 'natural language processing' ...
Got 1000 papers from Semantic Scholar for 'natural language processing'
Searching Semantic Scholar for 'robotics and human-computer interaction' ...
Got 1000 papers from Semantic Scholar for 'robotics and human-computer interaction'
Saved total 5000 papers to semantic_scholar_papers.jsonl


#### Collect Paper from OpenAlex

In [None]:
# openalex_scraper.py (robust version)
import json
import time
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

BASE_URL = "https://api.openalex.org/works"


def build_session():
    session = requests.Session()

    retry = Retry(
        total=5,
        backoff_factor=1.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
        raise_on_status=False
    )

    adapter = HTTPAdapter(max_retries=retry)
    session.mount("https://", adapter)
    session.mount("http://", adapter)

    return session


def fetch_openalex_papers(query, limit=200, per_page=50):
    print(f"Searching OpenAlex for '{query}' ...")

    session = build_session()
    papers = []

    page = 1
    collected = 0

    while collected < limit:
        params = {
            "search": query,
            "per-page": per_page,
            "page": page,
            "mailto": "research@example.com"
        }

        try:
            resp = session.get(BASE_URL, params=params, timeout=30)
            resp.raise_for_status()
            data = resp.json()
        except requests.exceptions.RequestException as e:
            print(f"[WARN] OpenAlex request failed (page={page}): {e}")
            print("       Sleeping 5s and skipping this page...")
            time.sleep(5)
            page += 1
            continue

        results = data.get("results", [])
        if not results:
            break

        for r in results:
            if collected >= limit:
                break

            paper = {
                "source": "openalex",
                "paper_id": r.get("id", ""),
                "title": r.get("title", ""),
                "abstract": r.get("abstract") or "",
                "authors": [
                    a["author"]["display_name"]
                    for a in r.get("authorships", [])
                    if "author" in a
                ],
                "publish_date": r.get("publication_year", ""),
                "venue": r.get("host_venue", {}).get("display_name", ""),
                "citation_count": r.get("cited_by_count", 0),
                "fields_of_study": [
                    c["display_name"]
                    for c in r.get("concepts", [])
                    if c.get("level", 10) <= 1
                ],
                "url": r.get("id", "")
            }

            papers.append(paper)
            collected += 1

        page += 1
        time.sleep(1.5)

    return papers


if __name__ == "__main__":

    topics = [
        "artificial intelligence",
        "machine learning",
        "computer vision",
        "natural language processing",
        "robotics human computer interaction"
    ]

    all_papers = []

    for t in topics:
        papers = fetch_openalex_papers(t, limit=300)
        all_papers.extend(papers)
        print(f"Got {len(papers)} papers from OpenAlex for '{t}'")
        time.sleep(3)

    output_file = "openalex_papers.jsonl"
    with open(output_file, "w", encoding="utf-8") as f:
        for p in all_papers:
            f.write(json.dumps(p, ensure_ascii=False) + "\n")

    print(f"Saved total {len(all_papers)} papers to {output_file}")


Searching OpenAlex for 'artificial intelligence' ...
Got 300 papers from OpenAlex for 'artificial intelligence'
Searching OpenAlex for 'machine learning' ...
Got 300 papers from OpenAlex for 'machine learning'
Searching OpenAlex for 'computer vision' ...
Got 300 papers from OpenAlex for 'computer vision'
Searching OpenAlex for 'natural language processing' ...
Got 300 papers from OpenAlex for 'natural language processing'
Searching OpenAlex for 'robotics human computer interaction' ...
Got 300 papers from OpenAlex for 'robotics human computer interaction'
Saved total 1500 papers to openalex_papers.jsonl


In [13]:
# openalex_scraper.py
import json
import time
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


BASE_URL = "https://api.openalex.org/works"


def build_session():
    session = requests.Session()

    retry = Retry(
        total=5,
        backoff_factor=1.5,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
        raise_on_status=False
    )

    adapter = HTTPAdapter(max_retries=retry)
    session.mount("https://", adapter)
    session.mount("http://", adapter)

    return session


def reconstruct_abstract(abstract_index):
    if not abstract_index:
        return ""

    position_to_word = {}
    for word, positions in abstract_index.items():
        for pos in positions:
            position_to_word[pos] = word

    abstract_words = [
        position_to_word[pos]
        for pos in sorted(position_to_word.keys())
    ]

    return " ".join(abstract_words)

def fetch_openalex_papers(query, limit=200, per_page=50):
    print(f"Searching OpenAlex for '{query}' ...")

    session = build_session()
    papers = []

    page = 1
    collected = 0

    while collected < limit:
        params = {
            "search": query,
            "per-page": per_page,
            "page": page,
            "mailto": "research@example.com"  # OpenAlex 官方推荐
        }

        try:
            resp = session.get(BASE_URL, params=params, timeout=30)
            resp.raise_for_status()
            data = resp.json()
        except requests.exceptions.RequestException as e:
            print(f"[WARN] OpenAlex request failed (page={page}): {e}")
            print("       Sleep 5s and skip this page.")
            time.sleep(5)
            page += 1
            continue

        results = data.get("results", [])
        if not results:
            break

        for r in results:
            if collected >= limit:
                break

            abstract = reconstruct_abstract(
                r.get("abstract_inverted_index", {})
            )

            paper = {
                "source": "openalex",
                "paper_id": r.get("id", ""),
                "title": r.get("title", ""),
                "abstract": abstract,
                "abstract_source": "openalex" if abstract else "",
                "authors": [
                    a["author"]["display_name"]
                    for a in r.get("authorships", [])
                    if "author" in a
                ],
                "publish_year": r.get("publication_year", ""),
                "venue": r.get("host_venue", {}).get("display_name", ""),
                "citation_count": r.get("cited_by_count", 0),
                "fields_of_study": [
                    c["display_name"]
                    for c in r.get("concepts", [])
                    if c.get("level", 10) <= 1
                ],
                "url": r.get("id", ""),
            }

            papers.append(paper)
            collected += 1

        page += 1
        time.sleep(1.5) 
    return papers


if __name__ == "__main__":

    topics = [
        "artificial intelligence",
        "machine learning",
        "computer vision",
        "natural language processing",
        "robotics human computer interaction"
    ]

    all_papers = []

    for topic in topics:
        papers = fetch_openalex_papers(topic, limit=300)
        all_papers.extend(papers)
        print(f"Got {len(papers)} papers from OpenAlex for '{topic}'")
        time.sleep(3) 

    output_file = "openalex_papers.jsonl"
    with open(output_file, "w", encoding="utf-8") as f:
        for paper in all_papers:
            f.write(json.dumps(paper, ensure_ascii=False) + "\n")

    print(f"\nSaved {len(all_papers)} papers to {output_file}")


Searching OpenAlex for 'artificial intelligence' ...
Got 300 papers from OpenAlex for 'artificial intelligence'
Searching OpenAlex for 'machine learning' ...
Got 300 papers from OpenAlex for 'machine learning'
Searching OpenAlex for 'computer vision' ...
Got 300 papers from OpenAlex for 'computer vision'
Searching OpenAlex for 'natural language processing' ...
Got 300 papers from OpenAlex for 'natural language processing'
Searching OpenAlex for 'robotics human computer interaction' ...
Got 300 papers from OpenAlex for 'robotics human computer interaction'

Saved 1500 papers to openalex_papers.jsonl


#### Merge all collected paper into one .jsonl file

In [14]:
# merge_jsonl.py
import json

files = [
    "arxiv_papers.jsonl",
    "semantic_scholar_papers.jsonl",
    "openalex_papers.jsonl"
]

merged_file = "merged_papers.jsonl"
seen_ids = set()
count = 0

with open(merged_file, "w", encoding="utf-8") as fout:
    for file in files:
        with open(file, "r", encoding="utf-8") as fin:
            for line in fin:
                data = json.loads(line.strip())
                pid = data.get("paper_id") or data.get("title")
                if pid not in seen_ids:
                    seen_ids.add(pid)
                    fout.write(json.dumps(data, ensure_ascii=False) + "\n")
                    count += 1

print(f"Merged total {count} unique records into {merged_file}.")


Merged total 7397 unique records into merged_papers.jsonl.
