# Fetching papers

Fetching the papers with the code below took around 2.5 minutes.

In [1]:
from pyalex import Works, Authors, Source, Institutions, Topics
import pandas as pd
import requests
import time 

# OpenAlex endpoint that lists works (papers).
BASE_URL = "https://api.openalex.org/works"

# Boolean search expression made of three parenthesised groups joined by AND:
#   1. AI / machine-learning terms
#   2. pricing, promotion and marketing terms
#   3. retail and distribution-channel terms
# A paper has to match at least one term from every group.
query = """
(
  "artificial intelligence" OR AI OR "machine learning" OR ML OR "deep learning" OR DL OR 
  "natural language processing" OR NLP OR "predictive modeling" OR "data mining" OR 
  "data science" OR "neural networks" OR "transformer models" OR "language models" OR 
  "recommendation systems" OR "generative AI" OR "unsupervised learning" OR "supervised learning"
)
AND
(
  pricing OR promotion OR discount OR "price optimization" OR "dynamic pricing" OR 
  "sales prediction" OR "revenue management" OR "price elasticity" OR "demand forecasting" OR 
  marketing OR "campaign optimization" OR "consumer behavior" OR "targeting" OR "personalization"
)
AND
(
  retail OR supermarket OR "large-scale distribution" OR GDO OR "grocery stores" OR 
  "e-commerce" OR ecommerce OR "supply chain" OR "consumer goods" OR "FMCG" OR 
  "wholesale" OR "shopping behavior" OR "omnichannel" OR "online retail" OR "brick and mortar"
)
"""

# Query-string parameters sent with every request.
params = dict(
    search=query,
    # Restrict results to works published from 2014 through 2024.
    filter="from_publication_date:2014-01-01,to_publication_date:2024-12-31",
    per_page=200,
    # "*" asks for the first page of a cursor-paged result set.
    cursor="*",
)
# Download the matching works page by page using cursor paging.
# Results accumulate in `all_results`; on any failure the loop stops and keeps
# whatever was fetched so far.
all_results = []
max_pages = 50  # Safety limit on the number of requests; set to None to fetch everything
request_timeout = 30  # Seconds to wait for the API before giving up on a request

# Page through a copy so `params` keeps its initial cursor and the download
# starts from the first page again whenever this code is re-run.
page_params = dict(params)
page_number = 0
total_matches = None  # Number of matching works reported by the API, when available

while max_pages is None or page_number < max_pages:
    page_number += 1
    print(f"Fetching page {page_number}")
    try:
        # Without a timeout a stalled connection would block the cell forever.
        response = requests.get(BASE_URL, params=page_params, timeout=request_timeout)
    except requests.exceptions.RequestException as exc:
        # Network failure or timeout: stop, same as for an HTTP error status.
        print("Error:", exc)
        break
    if response.status_code != 200:
        print("Error:", response.status_code, response.text)
        break

    data = response.json()
    all_results.extend(data.get("results", []))

    meta = data.get("meta", {})
    total_matches = meta.get("count", total_matches)
    next_cursor = meta.get("next_cursor")
    if not next_cursor:
        break  # No more pages
    page_params["cursor"] = next_cursor
    time.sleep(1)  # Optional: Avoid rate-limiting (max 10 requests/sec)

print(f"Total papers fetched: {len(all_results)}")

# Make an incomplete download visible instead of silently analysing a subset.
if total_matches is not None and len(all_results) < total_matches:
    print(
        f"Warning: only {len(all_results)} of {total_matches} matching papers were fetched "
        "(page limit reached or a request failed)."
    )


Fetching page 1
Fetching page 2
Fetching page 3
Fetching page 4
Fetching page 5
Fetching page 6
Fetching page 7
Fetching page 8
Fetching page 9
Fetching page 10
Fetching page 11
Fetching page 12
Fetching page 13
Fetching page 14
Fetching page 15
Fetching page 16
Fetching page 17
Fetching page 18
Fetching page 19
Fetching page 20
Fetching page 21
Fetching page 22
Fetching page 23
Fetching page 24
Fetching page 25
Fetching page 26
Fetching page 27
Fetching page 28
Fetching page 29
Fetching page 30
Fetching page 31
Fetching page 32
Fetching page 33
Fetching page 34
Fetching page 35
Fetching page 36
Fetching page 37
Fetching page 38
Fetching page 39
Fetching page 40
Fetching page 41
Fetching page 42
Fetching page 43
Fetching page 44
Fetching page 45
Fetching page 46
Fetching page 47
Fetching page 48
Fetching page 49
Fetching page 50
Total papers fetched: 10000


In [5]:
import json

# Check the first paper: pretty-print its full record.
# Guard against an empty result list so an unsuccessful fetch does not end in
# an IndexError here.
if all_results:
    first_paper = all_results[0]
    print(json.dumps(first_paper, indent=2))
else:
    print("No papers found.")


{
  "id": "https://openalex.org/W3047327247",
  "doi": "https://doi.org/10.1080/13675567.2020.1803246",
  "title": "Machine learning demand forecasting and supply chain performance",
  "display_name": "Machine learning demand forecasting and supply chain performance",
  "relevance_score": 851.8783,
  "publication_year": 2020,
  "publication_date": "2020-08-04",
  "ids": {
    "openalex": "https://openalex.org/W3047327247",
    "doi": "https://doi.org/10.1080/13675567.2020.1803246",
    "mag": "3047327247"
  },
  "language": "en",
  "primary_location": {
    "is_oa": true,
    "landing_page_url": "https://doi.org/10.1080/13675567.2020.1803246",
    "pdf_url": null,
    "source": {
      "id": "https://openalex.org/S4210220312",
      "display_name": "International Journal of Logistics Research and Applications",
      "issn_l": "1367-5567",
      "issn": [
        "1367-5567",
        "1469-848X"
      ],
      "is_oa": false,
      "is_in_doaj": false,
      "is_indexed_in_scopus": tru

In [6]:
# Show the publication year of the first paper, provided the list is not empty
if not all_results:
    print("No papers found.")
else:
    first_paper = all_results[0]
    # Fall back to "N/A" when the record has no "publication_year" key.
    pub_year = first_paper.get("publication_year", "N/A")
    print(f"Publication year of the first paper: {pub_year}")


Publication year of the first paper: 2020


In [7]:
import json

import pandas as pd

# Destination files for the two exports.
file_path_json = "/Users/dionnespaltman/Desktop/Luiss /Data Science in Action/Project/openalex_results.json"
file_path_csv = "/Users/dionnespaltman/Desktop/Luiss /Data Science in Action/Project/openalex_results.csv"

# Save data in both CSV and JSON formats

# Convert to DataFrame for flat/tabular structure (good for CSV export)
df = pd.DataFrame(all_results)

# Save as CSV
# CSV is ideal for quick inspection, Excel use, or working with pandas.
# Nested fields (e.g. 'primary_location', 'authorships') are not expanded into
# columns: each one ends up in a single cell as the text form of the Python
# dict/list it holds, so use the JSON file when the nested structure is needed.
df.to_csv(file_path_csv, index=False)
print("Data saved to openalex_results.csv")

# Save as JSON
# JSON retains the full nested structure of each entry (e.g., authorship, abstract index, locations),
# which is essential for any downstream processing like recommendation systems or NLP tasks.
# The encoding is stated explicitly so the file does not depend on the platform default.
with open(file_path_json, "w", encoding="utf-8") as f:
    json.dump(all_results, f, indent=2)
print("Data saved to openalex_results.json")


Data saved to openalex_results.csv
Data saved to openalex_results.json
