In [1]:
import ast
import fitz
import io
import pandas as pd
from tqdm import tqdm
import requests

In [7]:
df = pd.read_csv("/home/dominik/Documents/Informatik/1_Semester/medLLM/data/pmc_patients/PMC-Patients.csv")
article_ids = df["PMID"].head(100).tolist()
linked_ids_scores = df["relevant_articles"].head(100).tolist()

articles_PMID = set()
articles_PMID.update(article_ids)

In [8]:
for id_score in linked_ids_scores:
    data_dict = ast.literal_eval(id_score)
    for article_id in data_dict.keys():
        articles_PMID.add(article_id)


print(len(articles_PMID))

1497


In [9]:
articles = pd.DataFrame(list(articles_PMID), columns=["PMID"])
articles.head()

Unnamed: 0,PMID
0,25417072
1,34957196
2,34706170
3,34957243
4,25428188


In [10]:
def get_full_text_url(response_dict_list: list):
    for file_type_dict in response_dict_list:
        if file_type_dict.get("documentStyle") == "pdf" and file_type_dict.get("availabilityCode") == "OA":
            return file_type_dict.get("url", None)
    return None

In [34]:
def get_article_data(pmid):
    url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:{pmid}&resultType=core&format=json"
    response = requests.get(url)
    article_data = {}
    if response.status_code == 200:
        data = response.json()
        if "resultList" in data and data["resultList"]["result"]:
            data_result = data["resultList"]["result"][0]
            article_data["PMID"] = pmid
            article_data["title"] = data_result.get("title")
            article_data["doi"] = data_result.get("doi")
            article_data["authors"] = data_result.get("authorString")
            article_data["year"] = data_result.get("pubYear")
            article_data["journal"] = (
                    data_result.get("journalInfo", {})
                    .get("journal", {})
                    .get("title", None)
                )
            article_data["abstract"] = data_result.get("abstractText")
            result = data["resultList"]["result"][0]
            if result.get("isOpenAccess", "N") == "Y":
                article_data["full_text_url"] = get_full_text_url(data["resultList"]["result"][0]["fullTextUrlList"]["fullTextUrl"])
        return article_data

In [21]:
tqdm.pandas()


articles = articles['PMID'].progress_apply(get_article_data)
print(articles)
print(articles.keys())

KeyError: 'PMID'

In [23]:
type(articles)

pandas.core.series.Series

In [25]:
articles.describe()

count                                                  1497
unique                                                 1497
top       {'PMID': '26055964', 'title': 'Gradual loss of...
freq                                                      1
Name: PMID, dtype: object

In [20]:
articles_df = pd.DataFrame(articles)
articles_df

Unnamed: 0,PMID
0,"{'PMID': '25417072', 'title': 'New oral antico..."
1,"{'PMID': '34957196', 'title': 'Optic Foraminot..."
2,"{'PMID': '34706170', 'title': 'Waning Immunity..."
3,"{'PMID': '34957243', 'title': 'Case Report: A ..."
4,"{'PMID': '25428188', 'title': 'Enteric disease..."
...,...
1492,"{'PMID': '18268431', 'title': 'Congenital hypo..."
1493,"{'PMID': '30058742', 'title': 'Suicide as a re..."
1494,"{'PMID': '30299888', 'title': 'GENETICS IN END..."
1495,"{'PMID': '23737188', 'title': 'Post-transplant..."


In [26]:
articles_list = articles.tolist()

In [27]:
articles_df = pd.DataFrame(articles_list)

In [29]:
articles_df.head()
articles_df.to_csv("/home/dominik/Documents/Informatik/1_Semester/medLLM/data/pmc_patients/100_article_data_final_state.csv")

In [55]:
first_1000 = articles_df[articles_df["full_text_url"].notna()].head(50)

In [56]:
import time 

def extract_text_from_pdf(url):
    if url is None:
        return None
    try:
        #time.sleep(5)
        session = requests.Session()  # Persistente Sitzung
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
            'Referer': 'https://example.com',  # Generische URL oder die Seite, von der du den Link hast
        }
        response = session.get(url, headers=headers)
        if response.status_code == 200:
            pdf_file = io.BytesIO(response.content)
            pdf_document = fitz.open(stream=pdf_file, filetype="pdf")
            text = ""
            for page in pdf_document:
                text += page.get_text()
            pdf_document.close()
            return text
        else:
            return f"Failed to download PDF (status code: {response.status_code})"
    except Exception as e:
        return str(e)

In [57]:
tqdm.pandas(desc="Extracting text from PDFs")

first_1000['full_text'] = first_1000['full_text_url'].progress_apply(extract_text_from_pdf)

first_1000.head(10)

Extracting text from PDFs: 100%|██████████| 50/50 [00:44<00:00,  1.11it/s]


Unnamed: 0,PMID,title,doi,authors,year,journal,abstract,full_text_url,full_text
1,34957196,Optic Foraminotomy for Clipping of Superior Ca...,10.3389/fsurg.2021.681115,"Baldoncini M, Luzzi S, Giotta Lucifero A, Flor...",2021,Frontiers in surgery,<b>Background:</b> Carotid-ophthalmic aneurysm...,https://www.frontiersin.org/articles/10.3389/f...,ORIGINAL RESEARCH\npublished: 09 December 2021...
2,34706170,Waning Immunity after the BNT162b2 Vaccine in ...,10.1056/nejmoa2114228,"Goldberg Y, Mandel M, Bar-On YM, Bodenheimer O...",2021,The New England journal of medicine,"<h4>Background</h4>In December 2020, Israel be...",https://europepmc.org/articles/PMC8609604?pdf=...,The new engl and jour nal of medicine\nn engl ...
3,34957243,Case Report: A Giant Left-Ventricular Intramur...,10.3389/fcvm.2021.753627,"Hua M, Gao Y, Li J, Tong F, Li X, Zhang H.",2021,Frontiers in cardiovascular medicine,"In this report, we present a case study of an ...",https://www.frontiersin.org/articles/10.3389/f...,CASE REPORT\npublished: 09 December 2021\ndoi:...
11,32563547,Autoinflammatory and autoimmune conditions at ...,10.1016/j.jaut.2020.102506,"Rodríguez Y, Novelli L, Rojas M, De Santis M, ...",2020,Journal of autoimmunity,Coronavirus disease 2019 (COVID-19) has been c...,https://europepmc.org/articles/PMC7296326?pdf=...,\n \nSince January 2020 Elsevier has created ...
25,34614329,Myocarditis after Covid-19 Vaccination in a La...,10.1056/nejmoa2110737,"Witberg G, Barda N, Hoss S, Richter I, Wiessma...",2021,The New England journal of medicine,<h4>Background</h4>Reports have suggested an a...,https://www.nejm.org/doi/pdf/10.1056/NEJMoa211...,Failed to download PDF (status code: 403)
30,27378956,Mental Pain and Suicide: A Systematic Review o...,10.3389/fpsyt.2016.00108,"Verrocchio MC, Verrocchio MC, Carrozzino D, Ma...",2016,Frontiers in psychiatry,"<h4>Background</h4>Mental pain, defined as a s...",https://www.frontiersin.org/articles/10.3389/f...,June 2016 | Volume 7 | Article 108\n1\nRev...
34,30712880,Prenatal exome sequencing analysis in fetal st...,10.1016/s0140-6736(18)31940-8,"Lord J, McMullan DJ, Eberhardt RY, Rinck G, Ha...",2019,"Lancet (London, England)","<h4>Background</h4>Fetal structural anomalies,...",http://www.thelancet.com/article/S014067361831...,Failed to download PDF (status code: 403)
35,34942917,Ischemic Stroke in a Patient with Stable CADAS...,10.3390/brainsci11121615,"Cruciani A, Pilato F, Rossi M, Motolese F, Di ...",2021,Brain sciences,<h4>Background</h4>SARS-CoV-2 infection has be...,https://www.mdpi.com/2076-3425/11/12/1615/pdf?...,brain\nsciences\nCase Report\nIschemic Stroke ...
37,33587810,Prolonged elevation of D-dimer levels in conva...,10.1111/jth.15267,"Townsend L, Fogarty H, Dyer A, Martin-Loeches ...",2021,Journal of thrombosis and haemostasis : JTH,"<h4>Background</h4>Persistent fatigue, breathl...",http://www.jthjournal.org/article/S15387836220...,Failed to download PDF (status code: 403)
38,33282175,Repetitive Transcranial Magnetic Stimulation i...,10.1177/2045125320973790,"Hett D, Marwaha S.",2020,Therapeutic advances in psychopharmacology,Bipolar disorder (BD) is a debilitating mood d...,https://europepmc.org/articles/PMC7682206?pdf=...,https://doi.org/10.1177/2045125320973790 \nhtt...


In [59]:
# Filtert die Zeilen, die "Failed to download PDF" in der 'full_text' Spalte enthalten
failed_rows = first_1000[first_1000['full_text'].str.contains('Failed to download PDF', na=False)]

# Ausgabe der Zeilen
print(len(failed_rows))


9


In [None]:
first_1000.to_csv("/home/dominik/Documents/Informatik/1_Semester/medLLM/data/pmc_patients/1000_articles.csv")

In [None]:
articles_pdfs = first_1000[first_1000["pdf_url"].notna()]
failed_downloads = articles_pdfs[articles_pdfs["pdf_text"].str.contains("Failed to download PDF|Failed to open stream", na=False)]
num_failed_downloads = failed_downloads.shape[0]

print(f"Number of rows containing 'Failed to download PDF' or 'Failed to open stream': {num_failed_downloads}")

1000 articles; 331 available, 182 PDF to text failed because of access/streamfailures etc, 149 pdf to text conversions successful