# Process of Fetching Papers

## Overview

We need to fetch papers and their relevant metadata to later capture them as entities and link them to companies and technologies.

We use the OpenAlex API for this because it records many useful properties of research papers.

For each publication year from 2019 to 2024, we fetch the 1,000 most-cited papers that have an abstract. For each paper we request the following properties:

1. OpenAlex ID
2. DOI
3. title
4. publication_date
5. cited_by_count
6. authorships
7. keywords
8. topics 
9. abstract

In [1]:
import requests
import os
import dotenv
import json

dotenv.load_dotenv()

# OpenAlex API endpoint for works (papers).
url = "https://api.openalex.org/works"
# Contact address sent as the `mailto` query parameter, read from the MAIL
# environment variable. When it is unset this is None and requests leaves
# the parameter out of the query string.
mail_to = os.getenv("MAIL")

years = [2024, 2023, 2022, 2021, 2020, 2019]
all_papers = []

per_page = 200
# Number of papers (with an abstract) to keep per publication year.
papers_per_year = 1000
# Seconds to wait for the server; without a timeout a stalled connection
# would block the notebook indefinitely.
request_timeout = 30

for year in years:
    paper_count = 0
    page = 1
    while paper_count < papers_per_year:
        params = {
            "filter": f"publication_year:{year}",
            "mailto": mail_to,
            "sort": "cited_by_count:desc",
            "page": page,
            "per-page": per_page,
            "select": "id, doi, title, publication_date, authorships, cited_by_count, keywords, topics, abstract_inverted_index"
        }

        try:
            response = requests.get(url, params=params, timeout=request_timeout)
        except requests.RequestException as exc:
            # Same policy as for a non-200 answer: report and move on to the next year.
            print(f"Error fetching data for year {year}: {exc}")
            break

        if response.status_code != 200:
            print(f"Error fetching data for year {year}: {response.status_code}")
            break

        papers = response.json().get("results", [])
        if not papers:
            # No more results for this year.
            break

        # Keep only papers that come with an abstract; the cleaning step
        # reconstructs the text from this inverted index.
        # NOTE(review): OpenAlex may be able to apply this filter server-side,
        # which would save pages - confirm against the API docs.
        papers_with_abstract = [
            p for p in papers if p.get("abstract_inverted_index") is not None
        ]
        # Never take more than what is still missing for this year.
        remaining_needed = papers_per_year - paper_count
        papers_to_add = papers_with_abstract[:remaining_needed]
        paper_count += len(papers_to_add)
        all_papers.extend(papers_to_add)

        page += 1

    if paper_count < papers_per_year:
        # Make a short year visible instead of silently writing a smaller dataset.
        print(f"Warning: only {paper_count} papers with abstracts collected for year {year}")

print(len(all_papers), "papers found")

raw_path = "../data/papers-data/raw_papers_v2.jsonl"
# Create the target folder on first run so the write below cannot fail on a missing directory.
os.makedirs(os.path.dirname(raw_path), exist_ok=True)
with open(raw_path, "w", encoding="utf-8") as f:
    # One JSON document per line (JSON Lines).
    for p in all_papers:
        f.write(json.dumps(p, ensure_ascii=False) + "\n")

print("Raw papers saved to", raw_path)


    

6000 papers found
Raw papers saved to ../data/papers-data/raw_papers_v2.jsonl


We now have 6,000 papers: the 1,000 most-cited papers of each year. Each entry is a rather large JSON object that contains a lot of data we do not need for our use case, so we now clean and restructure the data into a more useful format.

In [4]:
import json
from copy import deepcopy
from typing import List, Dict, Any

def inv_index(abstract_index: Dict[str, List[int]]) -> str:
    """
    Rebuild the plain-text abstract from its inverted index.

    Args:
        abstract_index: Maps each word to the list of positions at which
            it occurs in the abstract.

    Returns:
        The words joined by single spaces in ascending position order.
        An empty index yields an empty string.
    """
    # Flip the mapping to position -> word. If two words claim the same
    # position, the one that comes later in the index wins.
    word_at: Dict[int, str] = {
        position: token
        for token, positions in abstract_index.items()
        for position in positions
    }
    # Positions are unique keys, so sorting the items orders by position only.
    ordered_words = [token for _, token in sorted(word_at.items())]
    return " ".join(ordered_words)

def clean_paper(paper: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a cleaned, slimmed-down copy of one raw paper record.

    The input is never modified. On the copy:

    * ``authorships`` is reduced to position, display name and ORCID.
    * ``institutions`` is added: the de-duplicated institutions of all
      authors, without their ``lineage`` field.
    * ``keywords`` is reduced to id, display name and score.
    * ``topics`` is reduced to display name and score.
    * ``subfields``, ``fields`` and ``domains`` are added: the
      de-duplicated classification entries found on the topics.
    * ``abstract`` is added and ``abstract_inverted_index`` removed when
      the index is a dict; otherwise both are left as they are.

    Keys that are missing or explicitly null are treated as empty, so an
    incomplete record produces empty lists rather than an exception.

    Args:
        paper: One raw paper record as stored in the raw JSONL file.

    Returns:
        The cleaned copy of the record.
    """
    paper = deepcopy(paper)

    cleaned_authors: List[Dict[str, Any]] = []
    institutions: Dict[str, Dict[str, Any]] = {}

    # `or []` / `or {}` rather than a .get() default: the default only applies
    # when the key is absent, not when it is present with a null value.
    for authorship in paper.get("authorships") or []:
        author = authorship.get("author") or {}
        # slim author
        cleaned_authors.append(
            {
                "author_position": authorship.get("author_position"),
                "display_name": author.get("display_name"),
                "orcid": author.get("orcid"),
            }
        )

        # collect institutions, de-duplicated across all authors of the paper
        for institution in authorship.get("institutions") or []:
            # drop lineage field
            slim_institution = {k: v for k, v in institution.items() if k != "lineage"}
            # Fall back to the name so an entry without an id is still kept.
            dedup_key = slim_institution.get("id") or slim_institution.get("display_name")
            if dedup_key is None:
                # Nothing to identify this entry by.
                continue
            institutions[dedup_key] = slim_institution

    paper["authorships"] = cleaned_authors
    paper["institutions"] = list(institutions.values())

    # keywords
    paper["keywords"] = [
        {
            "id": keyword.get("id"),
            "display_name": keyword.get("display_name"),
            "score": keyword.get("score"),
        }
        for keyword in paper.get("keywords") or []
    ]

    # topics
    topics_raw = paper.get("topics") or []
    paper["topics"] = [
        {
            "display_name": topic.get("display_name"),
            "score": topic.get("score"),
        }
        for topic in topics_raw
    ]

    # Classification levels carried by each topic. Several topics can share
    # the same subfield/field/domain, so each level is de-duplicated by id
    # with a helper dictionary.
    for level in ("subfield", "field", "domain"):
        unique_entries: Dict[str, Dict[str, Any]] = {}
        for topic in topics_raw:
            entry = topic.get(level)
            if not entry or entry.get("id") is None:
                continue
            unique_entries[entry["id"]] = {
                "id": entry["id"],
                "display_name": entry.get("display_name"),
            }
        paper[f"{level}s"] = list(unique_entries.values())

    # Reconstruct abstract from inverted index, if present
    abstract_index = paper.get("abstract_inverted_index")
    if isinstance(abstract_index, dict):
        paper["abstract"] = inv_index(abstract_index)
        # The raw index is redundant once the text has been rebuilt.
        paper.pop("abstract_inverted_index", None)

    return paper

def stream_jsonl(path: str):
    """Lazily yield one parsed JSON object per non-blank line of a JSONL file.

    The file at ``path`` is read as UTF-8 and stays open only while the
    generator is being consumed. Lines that are empty or contain only
    whitespace are skipped.
    """
    with open(path, encoding="utf-8") as handle:
        yield from (json.loads(raw_line) for raw_line in handle if raw_line.strip())
                
def write_jsonl(records, path: str):
    """Write each record as one JSON line to ``path``, replacing any existing file.

    ``records`` may be any iterable, including a generator; it is consumed
    once. The file is written as UTF-8 and non-ASCII characters are kept
    as-is rather than escaped.
    """
    with open(path, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(item, ensure_ascii=False) + "\n" for item in records
        )


In [5]:
# Input: the raw records written by the fetch step. Output: the cleaned records.
raw_path = "../data/papers-data/raw_papers_v2.jsonl"
cleaned_path = "../data/papers-data/cleaned_papers_v2.jsonl"

# Reading, cleaning and writing are chained lazily, so papers are processed
# one at a time instead of being loaded into a list first.
cleaned_gen = map(clean_paper, stream_jsonl(raw_path))
write_jsonl(cleaned_gen, cleaned_path)

We have now extracted the relevant properties from the API response.

This is the structure of our JSON Data

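🔑 Top-Level Fields

| Field | Type | Description |
| --- | --- | --- |
| id | string | Unique identifier for the paper |
| doi | string | DOI of the paper |
| title | string | Title of the paper |
| publication_date | string | Date of publication |
| cited_by_count | integer | Number of times the paper has been cited |
| abstract | string | Plain-text abstract reconstructed from the inverted index |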

👨‍🔬 Authorships
An array of author objects:

```json
"authorships": [
  {
    "author_position": "first" | "middle" | "last",
    "display_name": "Full name",
    "orcid": "ORCID identifier" or null
  }
]
```
🏷️ Keywords
An array of keyword objects describing the paper:

```json
"keywords": [
  {
    "id": "keyword_id",
    "display_name": "Keyword name",
    "score": float
  }
]
```
📚 Topics
An array of general topics the paper is related to:

```json
"topics": [
  {
    "display_name": "Topic name",
    "score": float
  }
]
```
🏛️ Institutions
An array of affiliated institutions:

```json
"institutions": [
  {
    "id": "institution_id",
    "display_name": "Institution name",
    "ror": "ROR ID",
    "country_code": "e.g. US, DE",
    "type": "e.g. education, company"
  }
]
```
🧪 Classification Fields
Each of these is an array of objects with id and display_name.

Subfields (e.g. "Machine Learning", "Biophysics")

```json
"subfields": [ { "id": "...", "display_name": "..." } ]
```

Fields (e.g. "Computer Science", "Biology")

```json
"fields": [ { "id": "...", "display_name": "..." } ]
```

Domains (e.g. "STEM", "Social Sciences")

```json
"domains": [ { "id": "...", "display_name": "..." } ]
```
