In [None]:
import pandas as pd
import requests
from tqdm import tqdm

# Load the PMC-Patients table and key it by patient_id in one chained step.
df = pd.read_csv("PMC-Patients.csv").set_index('patient_id')

def is_open_access(pmid, timeout=10):
    """Return True if Europe PMC flags the article with this PMID as open access.

    Queries the Europe PMC REST search endpoint with ``EXT_ID:<pmid>`` and
    inspects the ``isOpenAccess`` field of the first hit.

    Args:
        pmid: External article identifier (PMID) to look up.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a single stalled connection would block the
            whole batch indefinitely.

    Returns:
        bool: True only when the first result has ``isOpenAccess == "Y"``.
        False for a non-200 response, an empty or malformed payload, or a
        failed request, so one bad lookup does not abort a long batch run.
    """
    url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    # Let requests build and URL-encode the query string.
    params = {"query": f"EXT_ID:{pmid}", "resultType": "core", "format": "json"}

    try:
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code != 200:
            return False
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure, timeout, or a body that is not valid JSON:
        # treated like a non-200 response, i.e. "not known to be open access".
        return False

    if not isinstance(data, dict):
        return False

    # "resultList" may be absent, or present without a "result" entry.
    results = (data.get("resultList") or {}).get("result") or []
    if not results:
        return False
    return results[0].get("isOpenAccess", "N") == "Y"

# Maximum number of rows to check; a value <= 0 means "process every row".
limit = 10

# Restrict the work to the first `limit` rows (or the whole frame).
candidates = df.head(limit) if limit > 0 else df

# Record the positions of rows whose PMID is open access. Selecting by
# position from the original frame (instead of rebuilding a frame from
# iterrows() Series) keeps the column dtypes, keeps the columns even when
# nothing matches, and keeps the index name so reset_index() below yields a
# 'patient_id' column rather than a generic 'index' column.
keep_positions = [
    pos
    for pos, pmid in enumerate(tqdm(candidates['PMID'], desc="Processing PMIDs"))
    if is_open_access(pmid)
]

# Move patient_id back into a regular column for the export step.
filtered_df = candidates.iloc[keep_positions].reset_index()


Check the DataFrame

In [None]:
# Bare last expression: rendered as this cell's output for a visual check.
filtered_df

Run this to save the DataFrame as a CSV file

Naming convention for the output file:\
oa: Open Access\
num_rows: The number of rows included in this dataset

In [None]:
# Row count of the filtered frame; embedded in the output file name.
num_rows = filtered_df.shape[0]
# File name follows the pattern PMC-Patients-oa-<num_rows>.csv.
output_csv_file = f"PMC-Patients-oa-{num_rows}.csv"
# index=False: do not write the positional index as a column.
filtered_df.to_csv(output_csv_file, index=False)