In [2]:
import pandas as pd

# Load the dataset
file_path = '../PMC-Patients-oa-9995.csv'
data = pd.read_csv(file_path)

# Extract and limit the length of summaries
def extract_and_trim(description, max_length=300):
    if isinstance(description, str):
        summary = description.split('.')[0].strip()  # Extract first sentence
        return summary[:max_length].strip()  # Trim to max_length
    return None

data['summary'] = data['patient'].apply(lambda x: extract_and_trim(x, max_length=300))

# Save the trimmed summaries
output_file_path = 'patients_short_summaries.csv'
data[['summary']].to_csv(output_file_path, index=False)

print(f"Short summaries with limited length saved to: {output_file_path}")

Short summaries with limited length saved to: patients_short_summaries.csv


In [1]:
import pandas as pd

# Load the dataset
file_path =  '../PMC-Patients-oa-9995.csv'
data = pd.read_csv(file_path)

# Extract and trim the patient descriptions
trimmed_data = data[['patient']].copy()
trimmed_data['patient'] = trimmed_data['patient'].str.strip()

# Save the trimmed descriptions to a new CSV file
data['description'] = trimmed_data
output_file_path = 'processed_patient_descriptions.csv'  
data[['description']].to_csv(output_file_path, index=False)

print(f"Trimmed patient descriptions saved to: {output_file_path}")


Trimmed patient descriptions saved to: processed_patient_descriptions.csv


In [2]:
patient_articles_df = data[['patient', 'relevant_articles']]
patient_articles_df

Unnamed: 0,patient,relevant_articles
0,This 60-year-old male was hospitalized due to ...,"{'32320506': 1, '32293716': 1, '23219649': 1, ..."
1,A 39-year-old man was hospitalized due to an i...,"{'32320506': 1, '32293716': 1, '23219649': 1, ..."
2,One week after a positive COVID-19 result this...,"{'32320506': 1, '32293716': 1, '23219649': 1, ..."
3,This 69-year-old male was admitted to the ICU ...,"{'32320506': 1, '32293716': 1, '23219649': 1, ..."
4,This 57-year-old male was admitted to the ICU ...,"{'32320506': 1, '32293716': 1, '23219649': 1, ..."
...,...,...
9988,A 63-year-old woman with metastatic breast car...,"{'15756132': 1, '26178006': 1, '28814897': 1, ..."
9989,"A 6 years old, neutered male Lhasa Apso was pr...","{'21889166': 1, '30756087': 1, '27616317': 1, ..."
9990,"An 8 years old, neutered male mixed breed dog ...","{'21889166': 1, '30756087': 1, '27616317': 1, ..."
9991,A 4 years old spayed female Doberman Pinscher ...,"{'21889166': 1, '30756087': 1, '27616317': 1, ..."


In [3]:
import ast
for index, row in patient_articles_df.iterrows():
    value = row['relevant_articles']
    value = ast.literal_eval(value)
    relevant_articles = [k for k, v in value.items() if v == 1]
    patient_articles_df.at[index, 'relevant_articles'] = relevant_articles

patient_articles_df
#patient_articles_df.to_csv('patient_relevant_articles_map.csv', index=False)

Unnamed: 0,patient,relevant_articles
0,This 60-year-old male was hospitalized due to ...,"[32320506, 32293716, 23219649, 30339549, 17470..."
1,A 39-year-old man was hospitalized due to an i...,"[32320506, 32293716, 23219649, 30339549, 17470..."
2,One week after a positive COVID-19 result this...,"[32320506, 32293716, 23219649, 30339549, 17470..."
3,This 69-year-old male was admitted to the ICU ...,"[32320506, 32293716, 23219649, 30339549, 17470..."
4,This 57-year-old male was admitted to the ICU ...,"[32320506, 32293716, 23219649, 30339549, 17470..."
...,...,...
9988,A 63-year-old woman with metastatic breast car...,"[15756132, 26178006, 28814897, 4820982, 185963..."
9989,"A 6 years old, neutered male Lhasa Apso was pr...","[21889166, 30756087, 27616317, 8410225, 171530..."
9990,"An 8 years old, neutered male mixed breed dog ...","[21889166, 30756087, 27616317, 8410225, 171530..."
9991,A 4 years old spayed female Doberman Pinscher ...,"[21889166, 30756087, 27616317, 8410225, 171530..."


In [8]:
list_of_articles = patient_articles_df['relevant_articles'].tolist()

def flatten_list(input_list):
    return [item for sublist in input_list for item in sublist]

list_of_articles = flatten_list(list_of_articles)
print(len(list_of_articles))
# get unique articles
set_of_articles = set(list_of_articles)
print(len(set_of_articles))
# convert back to list
list_of_articles = list(set_of_articles)
print(len(list_of_articles))
dict_of_articles = {article: index for index, article in enumerate(list_of_articles)}
# safe to csv
df = pd.DataFrame(list_of_articles, columns=['article'])
df.to_csv('list_of_articles.csv', index=False)
df

181404
128836
128836


Unnamed: 0,article
0,18977609
1,19405787
2,11737198
3,29759122
4,19247035
...,...
128831,27288794
128832,17983378
128833,1033657
128834,19067792


In [9]:
patient_relevant_articles_map = pd.read_csv("patient_relevant_articles_map_with_full_text.csv")
patient_relevant_articles_map['relevant_articles'] = patient_relevant_articles_map['relevant_articles'].apply(ast.literal_eval)

for index, row in patient_relevant_articles_map.iterrows():
    value = row['relevant_articles']
    #value = ast.literal_eval(value)
    new_list = []
    for paper in value:
        new_list.append(dict_of_articles[paper])
    patient_relevant_articles_map.at[index, 'relevant_articles'] = new_list
patient_relevant_articles_map.to_csv('patient_relevant_articles_ids_map_with_full_text.csv', index=False)
patient_relevant_articles_map

Unnamed: 0,patient,relevant_articles
0,This 60-year-old male was hospitalized due to ...,"[105070, 55722, 110382, 31323, 33968, 124035, ..."
1,A 39-year-old man was hospitalized due to an i...,"[105070, 55722, 110382, 31323, 33968, 124035, ..."
2,One week after a positive COVID-19 result this...,"[105070, 55722, 110382, 31323, 33968, 124035, ..."
3,This 69-year-old male was admitted to the ICU ...,"[105070, 55722, 110382, 31323, 33968, 124035, ..."
4,This 57-year-old male was admitted to the ICU ...,"[105070, 55722, 110382, 31323, 33968, 124035, ..."
...,...,...
9988,A 63-year-old woman with metastatic breast car...,"[24318, 125514, 46154, 124944, 124806, 102274,..."
9989,"A 6 years old, neutered male Lhasa Apso was pr...","[60122, 128583, 20064, 13703, 102365, 91978, 5..."
9990,"An 8 years old, neutered male mixed breed dog ...","[60122, 128583, 20064, 13703, 102365, 91978, 5..."
9991,A 4 years old spayed female Doberman Pinscher ...,"[60122, 128583, 20064, 13703, 102365, 91978, 5..."


In [42]:
# check for Ellipsis
dataframe = pd.read_csv('patient_relevant_articles_ids_map.csv')

dataframe['relevant_articles'] = dataframe['relevant_articles'].apply(ast.literal_eval)
for index, row in dataframe.iterrows():
    value = row['relevant_articles']
    value

Unnamed: 0,0,"This 60-year-old male was hospitalized due to moderate ARDS from COVID-19 with symptoms of fever, dry cough, and dyspnea. We encountered several difficulties during physical therapy on the acute ward. First, any change of position or deep breathing triggered coughing attacks that induced oxygen desaturation and dyspnea. To avoid rapid deterioration and respiratory failure, we instructed and performed position changes very slowly and step-by-step. In this way, a position change to the 135° prone position () took around 30 minutes. This approach was well tolerated and increased oxygen saturation, for example, on day 5 with 6 L/min of oxygen from 93% to 97%. Second, we had to adapt the breathing exercises to avoid prolonged coughing and oxygen desaturation. Accordingly, we instructed the patient to stop every deep breath before the need to cough and to hold inspiration for better air distribution. In this manner, the patient performed the breathing exercises well and managed to increase his oxygen saturation. Third, the patient had difficulty maintaining sufficient oxygen saturation during physical activity. However, with close monitoring and frequent breaks, he managed to perform strength and walking exercises at a low level without any significant deoxygenation. Exercise progression was low on days 1 to 5, but then increased daily until hospital discharge to a rehabilitation clinic on day 10.","[127082, 55475, 107059, 14066, 73070, 110067, 27856, 116854, 54626, 59682, 888, 17284, 47046, 22894, 127168, 51499, 83941, 119320, 83722, 44126, 108724, 3736, 57729, 15386, 43828, 48305, 96863]"
0,1,A 39-year-old man was hospitalized due to an i...,"[127082, 55475, 107059, 14066, 73070, 110067, ..."
1,2,One week after a positive COVID-19 result this...,"[127082, 55475, 107059, 14066, 73070, 110067, ..."
2,3,This 69-year-old male was admitted to the ICU ...,"[127082, 55475, 107059, 14066, 73070, 110067, ..."
3,4,This 57-year-old male was admitted to the ICU ...,"[127082, 55475, 107059, 14066, 73070, 110067, ..."
4,5,This 52-year-old male tested COVID-19 positive...,"[127082, 55475, 107059, 14066, 73070, 110067, ..."
...,...,...,...
9987,9988,A 63-year-old woman with metastatic breast car...,"[78656, 549, 11668, 42513, 110902, 17059, 1205..."
9988,9989,"A 6 years old, neutered male Lhasa Apso was pr...","[20284, 47250, 28066, 109854, 126970, 7727, 52..."
9989,9990,"An 8 years old, neutered male mixed breed dog ...","[20284, 47250, 28066, 109854, 126970, 7727, 52..."
9990,9991,A 4 years old spayed female Doberman Pinscher ...,"[20284, 47250, 28066, 109854, 126970, 7727, 52..."
