In [9]:
import os
import pandas as pd

# get all full texts
list_of_files = []
length = 0
for index in range(1, 13):
    file_path = f"../data/pmc_patients/processed/full_texts_{index}.csv"
    if os.path.isfile(file_path):
        df = pd.read_csv(file_path)
        print(len(df))
        df.drop_duplicates(subset=["PMID"], keep="first", inplace=True)
        print(len(df))
        length += len(df)
        list_of_files.append(df)

print(length)

2945
2943
2945
2945
2945
2942
2945
2942
2944
2918
2944
2080
2944
2941
2944
2918
2944
2080
2944
1587
2944
1567
2944
1578
29441


In [10]:
# combine all full texts
df = pd.concat(list_of_files)
print(len(df))

# remove duplicates
df.drop_duplicates(subset=["PMID"], keep="first", inplace=True)
print(len(df))

# delete rows with empty full text
df = df.dropna(subset=["full_text"])
print(len(df))

# remove rows with failed downloads
df = df[~df.full_text.str.contains("Failed to download")]

print(len(df))
# update index
df = df.reset_index(drop=True)

29441
21500
21474
16889


In [11]:
df.to_csv("../data/pmc_patients/processed/full_texts_combined.csv")

## Remove patients with no full text available

In [13]:
print(len(df))
# get all full text paper ids
full_text_paper_ids = df["PMID"].tolist()

# get all relevant articles
list_of_articles = pd.read_csv("../data/pmc_patients/Summary_data/list_of_articles.csv")
print(len(list_of_articles))

# drop articles that are not in the list of full text papers
list_of_articles = list_of_articles[list_of_articles["article"].isin(full_text_paper_ids)]

print(len(list_of_articles))
list_of_articles.reset_index(drop=False, inplace=True)
list_of_articles.to_csv("../data/pmc_patients/Summary_data/list_of_articles_with_full_text.csv", index=False)

16889
128836
10178


In [14]:
import ast

patients = pd.read_csv("../data/pmc_patients/Summary_data/patient_relevant_articles_map.csv")
list_of_papers = list_of_articles["article"].tolist()
def flatten_list(input_list):
    return [item for sublist in input_list for item in sublist]

# drop articles that are not in the list of full text papers
def filter_relevant_articles(relevant_articles):
    articles = ast.literal_eval(relevant_articles)
    return [article for article in articles if int(article) in list_of_papers]

patients["relevant_articles"] = patients["relevant_articles"].apply(filter_relevant_articles)
# drop row if relevant articles are empty
patients = patients[patients["relevant_articles"].apply(len) > 0]
patients.reset_index(drop=False, inplace=True)
patients.to_csv("../data/pmc_patients/Summary_data/patient_relevant_articles_map_with_full_text.csv", index=False)

In [15]:
patients

Unnamed: 0,index,patient,relevant_articles
0,0,This 60-year-old male was hospitalized due to ...,"[30427933, 12493078, 29208005]"
1,1,A 39-year-old man was hospitalized due to an i...,"[30427933, 12493078, 29208005]"
2,2,One week after a positive COVID-19 result this...,"[30427933, 12493078, 29208005]"
3,3,This 69-year-old male was admitted to the ICU ...,"[30427933, 12493078, 29208005]"
4,4,This 57-year-old male was admitted to the ICU ...,"[30427933, 12493078, 29208005]"
...,...,...,...
6007,9988,A 63-year-old woman with metastatic breast car...,[28814897]
6008,9989,"A 6 years old, neutered male Lhasa Apso was pr...","[30756087, 29057987, 29166400]"
6009,9990,"An 8 years old, neutered male mixed breed dog ...","[30756087, 29057987, 29166400]"
6010,9991,A 4 years old spayed female Doberman Pinscher ...,"[30756087, 29057987, 29166400]"


# Test paper access

In [17]:
import pandas as pd

full_text_papers = pd.read_csv("../data/pmc_patients/processed/full_texts_combined.csv")
full_text_papers_ids = pd.read_csv("../data/pmc_patients/Summary_data/list_of_articles_with_full_text.csv")

In [18]:
predictions = [89,120,181]

# input: list of paper indices
# map to paper ids
paper_ids = full_text_papers_ids[full_text_papers_ids["index"].isin(predictions)]
print(paper_ids)

papers = full_text_papers[full_text_papers["PMID"].isin(paper_ids["article"])]
papers

    index   article
5      89  22427859
8     120  16318699
13    181  19753282


Unnamed: 0.1,Unnamed: 0,PMID,full_text
289,289,16318699,Botulinum neurotoxins (BoNTs) are proteases th...
1064,1064,19753282,21\nCASE REPORT\nMJM 2009 12(1):21-24\nCopyrig...
2353,2353,22427859,Interferon-Gamma Release Assay Performance in\...


## create new maps for reduced dataset

In [11]:
import pandas as pd
import ast

short_description_df = pd.read_csv("../data/extracted_symptoms_on6k.csv")
short_description_df.head()

Unnamed: 0.1,Unnamed: 0,index,patient,relevant_articles,extracted_symptoms
0,0,0,This 60-year-old male was hospitalized due to ...,"['30427933', '12493078', '29208005']","['fever', 'cough']"
1,1,1,A 39-year-old man was hospitalized due to an i...,"['30427933', '12493078', '29208005']",['fever']
2,2,2,One week after a positive COVID-19 result this...,"['30427933', '12493078', '29208005']",[]
3,3,3,This 69-year-old male was admitted to the ICU ...,"['30427933', '12493078', '29208005']",[]
4,4,4,This 57-year-old male was admitted to the ICU ...,"['30427933', '12493078', '29208005']","['fever', 'cough']"


In [20]:
list_of_articles = short_description_df['relevant_articles'].tolist()
list_of_articles = [ast.literal_eval(string) for string in list_of_articles]
list_of_articles = flatten_list(list_of_articles)

set_of_articles = set(list_of_articles)
list_of_articles = list(set_of_articles)
dict_of_articles = {article: index for index, article in enumerate(list_of_articles)}
df = pd.DataFrame(list_of_articles, columns=['article'])
df.to_csv('list_of_articles.csv', index=True)
df

Unnamed: 0,article
0,29484096
1,11207055
2,23077705
3,28752316
4,24040190
...,...
10173,28680095
10174,28149297
10175,26674602
10176,20809268


In [22]:


patient_relevant_articles_map = pd.read_csv("../data/extracted_symptoms_on6k.csv")
patient_relevant_articles_map['relevant_articles'] = patient_relevant_articles_map['relevant_articles'].apply(ast.literal_eval)

for index, row in patient_relevant_articles_map.iterrows():
    value = row['relevant_articles']
    #value = ast.literal_eval(value)
    new_list = []
    for paper in value:
        new_list.append(dict_of_articles[paper])
    patient_relevant_articles_map.at[index, 'relevant_articles'] = new_list
patient_relevant_articles_map.to_csv('patient_relevant_articles_ids_map_with_full_text.csv', index=False)
patient_relevant_articles_map

Unnamed: 0.1,Unnamed: 0,index,patient,relevant_articles,extracted_symptoms
0,0,0,This 60-year-old male was hospitalized due to ...,"[7926, 5730, 10140]","['fever', 'cough']"
1,1,1,A 39-year-old man was hospitalized due to an i...,"[7926, 5730, 10140]",['fever']
2,2,2,One week after a positive COVID-19 result this...,"[7926, 5730, 10140]",[]
3,3,3,This 69-year-old male was admitted to the ICU ...,"[7926, 5730, 10140]",[]
4,4,4,This 57-year-old male was admitted to the ICU ...,"[7926, 5730, 10140]","['fever', 'cough']"
...,...,...,...,...,...
6007,6007,9988,A 63-year-old woman with metastatic breast car...,[1901],[]
6008,6008,9989,"A 6 years old, neutered male Lhasa Apso was pr...","[10141, 7524, 365]",['weakness']
6009,6009,9990,"An 8 years old, neutered male mixed breed dog ...","[10141, 7524, 365]",[]
6010,6010,9991,A 4 years old spayed female Doberman Pinscher ...,"[10141, 7524, 365]",['weakness']


In [27]:
# drow rows where relevant articles are empty
patient_relevant_articles_map = patient_relevant_articles_map[patient_relevant_articles_map["extracted_symptoms"].apply(len) > 2]
patient_relevant_articles_map.reset_index(drop=True, inplace=True)
patient_relevant_articles_map.to_csv('patient_relevant_articles_ids_map_full_text_short_description.csv', index=False)
patient_relevant_articles_map

Unnamed: 0.1,Unnamed: 0,index,patient,relevant_articles,extracted_symptoms
0,0,0,This 60-year-old male was hospitalized due to ...,"[7926, 5730, 10140]","['fever', 'cough']"
1,1,1,A 39-year-old man was hospitalized due to an i...,"[7926, 5730, 10140]",['fever']
2,4,4,This 57-year-old male was admitted to the ICU ...,"[7926, 5730, 10140]","['fever', 'cough']"
3,5,5,This 52-year-old male tested COVID-19 positive...,"[7926, 5730, 10140]",['fever']
4,7,7,This 33-year-old female patient had typical CO...,"[7926, 5730, 10140]","['fever', 'headache', 'delirium', 'delirium']"
...,...,...,...,...,...
3651,6004,9982,A 21-year-old female presented with fatigue an...,[6378],"['fatigue', 'palpitations', 'palpitations', 'f..."
3652,6005,9983,A 39-year-old female presented with fatigue an...,[6378],"['fatigue', 'jaundice']"
3653,6006,9984,A 38-year-old female presented with marked jau...,[6378],"['jaundice', 'tremors', 'palpitations']"
3654,6008,9989,"A 6 years old, neutered male Lhasa Apso was pr...","[10141, 7524, 365]",['weakness']
