In [21]:
import pandas as pd
import requests
from tqdm import tqdm

df = pd.read_csv("PMC-Patients.csv")
df = df.set_index('patient_id')

def is_open_access(pmid):
    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:{pmid}&resultType=core&format=json"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        if "resultList" in data and data["resultList"]["result"]:
            result = data["resultList"]["result"][0]
            return result.get("isOpenAccess", "N") == "Y"
    return False

valid_rows = []
limit = 10

with tqdm(total=min(len(df), limit if limit > 0 else len(df)), desc="Processing PMIDs") as pbar:
    for i, (patient_id, row) in enumerate(df.iterrows()):
        if limit > 0 and i >= limit:
            break
        pmid = row['PMID']
        if is_open_access(pmid):
            valid_rows.append(row)
        pbar.update(1)

filtered_df = pd.DataFrame(valid_rows)

filtered_df.reset_index(inplace=True)


Processing PMIDs:   0%|          | 0/10 [02:14<?, ?it/s]


KeyboardInterrupt: 

Check the Dataframe

In [7]:
filtered_df

Unnamed: 0,index,patient_uid,PMID,file_path,title,patient,age,gender,relevant_articles,similar_patients
0,0,7665777-1,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,This 60-year-old male was hospitalized due to ...,"[[60.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-2': 2, '7665777-3': 2, '7665777-4': ..."
1,1,7665777-2,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,A 39-year-old man was hospitalized due to an i...,"[[39.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-3': 2, '7665777-4': ..."
2,2,7665777-3,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,One week after a positive COVID-19 result this...,"[[57.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-2': 2, '7665777-4': ..."
3,3,7665777-4,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,This 69-year-old male was admitted to the ICU ...,"[[69.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-2': 2, '7665777-3': ..."
4,4,7665777-5,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,This 57-year-old male was admitted to the ICU ...,"[[57.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-2': 2, '7665777-3': ..."
5,5,7665777-6,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,This 52-year-old male tested COVID-19 positive...,"[[52.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-2': 2, '7665777-3': ..."
6,6,7665777-7,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,Paramedics found this 59-year-old female with ...,"[[59.0, 'year']]",F,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-2': 2, '7665777-3': ..."
7,7,7665777-8,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,This 33-year-old female patient had typical CO...,"[[33.0, 'year']]",F,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-2': 2, '7665777-3': ..."
8,8,7665777-9,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,This 66-year-old male patient was admitted to ...,"[[66.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-2': 2, '7665777-3': ..."
9,9,7665777-10,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,A 66-year-old male started to present symptoms...,"[[66.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-2': 2, '7665777-3': ..."


Run this to save the dataframe as csv file

Naming convention for the output file:\
oa: Open Access\
num_rows: The number of rows which are included in this dataset

In [None]:
num_rows = len(filtered_df)
output_csv_file = f"PMC-Patients-oa-{num_rows}.csv"
filtered_df.to_csv(output_csv_file, index=False)

In [13]:
import pandas as pd
import requests
from tqdm import tqdm

df = pd.read_csv("PMC-Patients.csv")
df = df.set_index('patient_id')

def check_access_get_url(pmid):
    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:{pmid}&resultType=core&format=json"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        if "resultList" in data and data["resultList"]["result"]:
            result = data["resultList"]["result"][0]
            if result.get("isOpenAccess", "N") == "Y":
                return data["resultList"]["result"][0]["fullTextUrlList"]["fullTextUrl"][0]["url"]
    return None

valid_rows = []
limit = 100
valid_pmids = []

with tqdm(total=min(len(df), limit if limit > 0 else len(df)), desc="Processing PMIDs") as pbar:
    for i, (patient_id, row) in enumerate(df.iterrows()):
        if limit > 0 and i >= limit:
            break
        pmid = row['PMID']
        pdf_url = check_access_get_url(pmid)
        if pdf_url is not None:
            row["url"] = pdf_url
            valid_rows.append(row)
        pbar.update(1)

filtered_df = pd.DataFrame(valid_rows)

filtered_df.reset_index(inplace=True)


Processing PMIDs: 100%|██████████| 100/100 [00:27<00:00,  3.70it/s]


In [19]:
df.head()


Unnamed: 0_level_0,patient_uid,PMID,file_path,title,patient,age,gender,relevant_articles,similar_patients
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,7665777-1,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,This 60-year-old male was hospitalized due to ...,"[[60.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-2': 2, '7665777-3': 2, '7665777-4': ..."
1,7665777-2,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,A 39-year-old man was hospitalized due to an i...,"[[39.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-3': 2, '7665777-4': ..."
2,7665777-3,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,One week after a positive COVID-19 result this...,"[[57.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-2': 2, '7665777-4': ..."
3,7665777-4,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,This 69-year-old male was admitted to the ICU ...,"[[69.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-2': 2, '7665777-3': ..."
4,7665777-5,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,This 57-year-old male was admitted to the ICU ...,"[[57.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-2': 2, '7665777-3': ..."


In [14]:
len(filtered_df)

100

In [10]:
! pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.1-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m167.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:04[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.1


In [16]:
import requests
import fitz  # PyMuPDF
import io

# Funktion zum Auslesen des Texts aus der PDF-URL
def extract_text_from_pdf(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            pdf_file = io.BytesIO(response.content)
            pdf_document = fitz.open(stream=pdf_file, filetype="pdf")
            text = ""
            for page in pdf_document:
                text += page.get_text()
            pdf_document.close()
            return text
        else:
            return f"Failed to download PDF (status code: {response.status_code})"
    except Exception as e:
        return str(e)

# Fortschrittsanzeige bei der Anwendung der Funktion
tqdm.pandas(desc="Extracting text from PDFs")

# Erstelle eine neue Spalte 'pdf_text' im DataFrame mit Fortschrittsanzeige
filtered_df['pdf_text'] = filtered_df['url'].progress_apply(extract_text_from_pdf)

# Ausgabe des DataFrames
print(filtered_df)

Extracting text from PDFs: 100%|██████████| 100/100 [05:39<00:00,  3.40s/it]

    index patient_uid      PMID                         file_path  \
0       0   7665777-1  33492400  comm/PMC007xxxxxx/PMC7665777.xml   
1       1   7665777-2  33492400  comm/PMC007xxxxxx/PMC7665777.xml   
2       2   7665777-3  33492400  comm/PMC007xxxxxx/PMC7665777.xml   
3       3   7665777-4  33492400  comm/PMC007xxxxxx/PMC7665777.xml   
4       4   7665777-5  33492400  comm/PMC007xxxxxx/PMC7665777.xml   
..    ...         ...       ...                               ...   
95     95   8699977-1  34943306  comm/PMC008xxxxxx/PMC8699977.xml   
96     96   8699977-2  34943306  comm/PMC008xxxxxx/PMC8699977.xml   
97     97   8699977-3  34943306  comm/PMC008xxxxxx/PMC8699977.xml   
98     98   8700017-1  34943285  comm/PMC008xxxxxx/PMC8700017.xml   
99     99   8700032-1  34940005  comm/PMC008xxxxxx/PMC8700032.xml   

                                                title  \
0   Early Physical Therapist Interventions for Pat...   
1   Early Physical Therapist Interventions for Pat...   





In [20]:
filtered_df.head()

Unnamed: 0,index,patient_uid,PMID,file_path,title,patient,age,gender,relevant_articles,similar_patients,url,pdf_text
0,0,7665777-1,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,This 60-year-old male was hospitalized due to ...,"[[60.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-2': 2, '7665777-3': 2, '7665777-4': ...",https://academic.oup.com/ptj/article-pdf/101/1...,Failed to download PDF (status code: 403)
1,1,7665777-2,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,A 39-year-old man was hospitalized due to an i...,"[[39.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-3': 2, '7665777-4': ...",https://academic.oup.com/ptj/article-pdf/101/1...,Failed to download PDF (status code: 403)
2,2,7665777-3,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,One week after a positive COVID-19 result this...,"[[57.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-2': 2, '7665777-4': ...",https://academic.oup.com/ptj/article-pdf/101/1...,Failed to download PDF (status code: 403)
3,3,7665777-4,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,This 69-year-old male was admitted to the ICU ...,"[[69.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-2': 2, '7665777-3': ...",https://academic.oup.com/ptj/article-pdf/101/1...,Failed to download PDF (status code: 403)
4,4,7665777-5,33492400,comm/PMC007xxxxxx/PMC7665777.xml,Early Physical Therapist Interventions for Pat...,This 57-year-old male was admitted to the ICU ...,"[[57.0, 'year']]",M,"{'32320506': 1, '32293716': 1, '23219649': 1, ...","{'7665777-1': 2, '7665777-2': 2, '7665777-3': ...",https://academic.oup.com/ptj/article-pdf/101/1...,Failed to download PDF (status code: 403)


In [2]:
import pandas as pd

filtered_df = pd.read_csv("PMC-Patients-oa-9995.csv")
article_ids = filtered_df["PMID"].tolist()
linked_ids_scores = filtered_df["relevant_articles"].tolist()

articles_PMID = set()
articles_PMID.update(article_ids)

In [3]:
import ast
for id_score in linked_ids_scores:
    data_dict = ast.literal_eval(id_score)
    for article_id in data_dict.keys():
        articles_PMID.add(article_id)



# Ausgabe des Dictionary
print(len(articles_PMID))

153252


In [4]:
articles = pd.DataFrame(list(articles_PMID), columns=["PMID"])

In [5]:
articles.head()

Unnamed: 0,PMID
0,32066827
1,7559895
2,27213173
3,11387725
4,18817572


In [6]:
def is_open_access(pmid):
    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:{pmid}&resultType=core&format=json"
    response = requests.get(url)
    
    if response.status_code == 200:
        data = response.json()
        if "resultList" in data and data["resultList"]["result"]:
            result = data["resultList"]["result"][0]
            if result.get("isOpenAccess", "N") == "Y":
                return data["resultList"]["result"][0]["fullTextUrlList"]["fullTextUrl"][0]["url"]
        return None

In [12]:
import requests
from tqdm import tqdm

tqdm.pandas(desc="Checking Open Access for PMIDs")

# Anwendung der Funktion auf die ersten 100 PMIDs und Speicherung der URLs in der neuen Spalte 'pdf_url'
articles['pdf_url'] = articles['PMID'].head(1000).progress_apply(is_open_access)

# Ausgabe des DataFrames
print(articles)

Checking Open Access for PMIDs: 100%|██████████| 1000/1000 [03:55<00:00,  4.25it/s]

            PMID                                            pdf_url
0       32066827  https://www.nature.com/articles/s41598-020-597...
1        7559895                                               None
2       27213173  https://nn.neurology.org/content/nnn/3/3/e228....
3       11387725                                               None
4       18817572  https://ojrd.biomedcentral.com/counter/pdf/10....
...          ...                                                ...
153247   5099723                                                NaN
153248  23020820                                                NaN
153249  25990654                                                NaN
153250  29443960                                                NaN
153251   8153934                                                NaN

[153252 rows x 2 columns]





In [13]:
first_1000 = articles.head(1000)

In [14]:
first_1000

Unnamed: 0,PMID,pdf_url
0,32066827,https://www.nature.com/articles/s41598-020-597...
1,7559895,
2,27213173,https://nn.neurology.org/content/nnn/3/3/e228....
3,11387725,
4,18817572,https://ojrd.biomedcentral.com/counter/pdf/10....
...,...,...
995,33628522,https://downloads.hindawi.com/journals/cricc/2...
996,23254794,
997,7922453,
998,14675689,


In [18]:
import fitz  # PyMuPDF
import io
def extract_text_from_pdf(url):
    if url is None:  # Überprüfen, ob die URL None ist
        return None
    try:
        response = requests.get(url)
        if response.status_code == 200:
            pdf_file = io.BytesIO(response.content)
            pdf_document = fitz.open(stream=pdf_file, filetype="pdf")
            text = ""
            for page in pdf_document:
                text += page.get_text()
            pdf_document.close()
            return text
        else:
            return f"Failed to download PDF (status code: {response.status_code})"
    except Exception as e:
        return str(e)

In [20]:
tqdm.pandas(desc="Extracting text from PDFs")

# Anwendung der Funktion auf die ersten 100 URLs und Speicherung der Texte in einer neuen Spalte 'pdf_text'
first_1000['pdf_text'] = first_1000['pdf_url'].head(1000).progress_apply(extract_text_from_pdf)

# Ausgabe des DataFrames
first_1000.head(100)

Extracting text from PDFs:   0%|          | 0/1000 [00:00<?, ?it/s]

Extracting text from PDFs: 100%|██████████| 1000/1000 [11:38<00:00,  1.43it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_1000['pdf_text'] = first_1000['pdf_url'].head(1000).progress_apply(extract_text_from_pdf)


Unnamed: 0,PMID,pdf_url,pdf_text
0,32066827,https://www.nature.com/articles/s41598-020-597...,1\nScientific Reports | (2020) 10:2759...
1,7559895,,
2,27213173,https://nn.neurology.org/content/nnn/3/3/e228....,Failed to download PDF (status code: 403)
3,11387725,,
4,18817572,https://ojrd.biomedcentral.com/counter/pdf/10....,BioMed Central\nPage 1 of 13\n(page number not...
...,...,...,...
95,27433909,,
96,28760285,,
97,29244647,,
98,4160738,,


In [21]:
first_1000.to_csv("/home/dominik/Documents/Informatik/1_Semester/medLLM/data/pmc_patients/1000_articles.csv")

In [25]:
articles_pdfs = first_1000[first_1000["pdf_url"].notna()]

In [26]:
# Selektiere die Zeilen, die entweder "Failed to download PDF" oder "Failed to open stream" enthalten
failed_downloads = articles_pdfs[articles_pdfs["pdf_text"].str.contains("Failed to download PDF|Failed to open stream", na=False)]

# Zähle die Zeilen
num_failed_downloads = failed_downloads.shape[0]

# Gib die Anzahl der Zeilen aus
print(f"Anzahl der Zeilen mit 'Failed to download PDF' oder 'Failed to open stream': {num_failed_downloads}")

Anzahl der Zeilen mit 'Failed to download PDF' oder 'Failed to open stream': 182


In [27]:
len(articles_pdfs) - num_failed_downloads

149

1000 articles; 331 available, 182 PDF to text failed because of access/streamfailures etc, 149 pdf to text conversions successful