In [2]:
import requests, time 
import pandas as pd

In [3]:
def fetch_biorxiv_sciedu(date_from="2017-01-01", date_to="2025-09-19", max_pages=200,
                         category="scientific_communication_and_education"):
    """Download bioRxiv preprint metadata for one category into a DataFrame.

    Pages through ``https://api.biorxiv.org/details/biorxiv/{date_from}/{date_to}/{cursor}``
    in steps of 100 records until the API returns an empty ``collection`` or
    ``max_pages`` pages have been requested.

    Parameters
    ----------
    date_from, date_to : str
        Bounds of the date interval, interpolated verbatim into the URL path.
    max_pages : int
        Upper bound on the number of pages (requests) to fetch.
    category : str or None
        Value sent as the ``category`` query-string parameter. Pass ``None``
        to send no category filter at all.

    Returns
    -------
    pandas.DataFrame
        Columns ``doi``, ``title``, ``authors_raw``, ``author_corresponding``,
        ``date``, ``category``, ``abstract``; only the first row seen for each
        DOI is kept. The original row index is preserved (it is not reset), so
        it may contain gaps. An empty result still carries all seven columns.

    Raises
    ------
    requests.HTTPError
        Propagated from ``raise_for_status()`` on a non-success response.
    """
    page_size = 100  # the cursor is advanced by this many records per request
    # (output column, key in the API record) -- order defines the column order.
    field_map = (
        ("doi", "doi"),
        ("title", "title"),
        ("authors_raw", "authors"),
        ("author_corresponding", "author_corresponding"),
        ("date", "date"),
        ("category", "category"),
        ("abstract", "abstract"),
    )
    params = {"category": category} if category is not None else None

    rows, cursor = [], 0
    while cursor < max_pages * page_size:
        url = f"https://api.biorxiv.org/details/biorxiv/{date_from}/{date_to}/{cursor}"
        r = requests.get(url, params=params, timeout=30)
        r.raise_for_status()
        # Only "collection" holds records; a missing or null value means we are done.
        items = r.json().get("collection") or []
        if not items:
            break
        rows.extend(
            {column: it.get(api_key) for column, api_key in field_map}
            for it in items
        )
        cursor += page_size
        time.sleep(0.2)  # brief pause between consecutive requests

    # Pin the columns so an empty result still has a "doi" column; without this
    # drop_duplicates(subset=["doi"]) raises KeyError when nothing was returned.
    df = pd.DataFrame(rows, columns=[column for column, _ in field_map])
    return df.drop_duplicates(subset=["doi"])


In [4]:

# Fetch with the function's default arguments, then print the frame's
# dimensions followed by its first rows on the same output stream.
df = fetch_biorxiv_sciedu()
print(f"{df.shape} {df.head()}")


(1395, 7)               doi                                              title  \
0  10.1101/055681  Biogeographic Ancestry and Socioeconomic Outco...   
1  10.1101/058511  Learning from critical care management of shee...   
4  10.1101/061200  A signal detection theoretic argument against ...   
5  10.1101/070631  Gender disparity in computational biology rese...   
6  10.1101/069468  How do Research Faculty in the Biosciences Eva...   

                                         authors_raw author_corresponding  \
0         Kirkegaard, E. O. W.; Wang, M.; Fuerst, J.         John  Fuerst   
1                                      Chemonges, S.      Saul  Chemonges   
4  Miuccio, M. T.; Liu, K.-y.; Lau, H.; Peters, M...   Megan A. K. Peters   
5                       Bonham, K. S.; Stefan, M. I.      Kevin S. Bonham   
6                                         Kassis, T.      Timothy  Kassis   

         date                                category  \
0  2017-03-01  scientific communicati

In [5]:
# Print a structural summary of the fetched frame: index range, per-column
# non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1395 entries, 0 to 1869
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   doi                   1395 non-null   object
 1   title                 1395 non-null   object
 2   authors_raw           1395 non-null   object
 3   author_corresponding  1395 non-null   object
 4   date                  1395 non-null   object
 5   category              1395 non-null   object
 6   abstract              1395 non-null   object
dtypes: object(7)
memory usage: 87.2+ KB


In [6]:
# Persist the fetched records to a CSV file in the working directory;
# index=False leaves the DataFrame's row index out of the file.
df.to_csv("biorxiv_sciedu.csv", index=False)

In [7]:
# Per-column count of missing entries, reported twice: once via isnull()
# and once via isna(). sep="\n" puts each header on its own line above
# the corresponding counts.
print("\nMissing values:", df.isnull().sum(), sep="\n")
print("\n NAN value:", df.isna().sum(), sep="\n")


Missing values:
doi                     0
title                   0
authors_raw             0
author_corresponding    0
date                    0
category                0
abstract                0
dtype: int64

 NAN value:
doi                     0
title                   0
authors_raw             0
author_corresponding    0
date                    0
category                0
abstract                0
dtype: int64
