In [2]:
import json
import os
import pandas as pd
from llama_index.core.schema import Document

In [3]:
# Notebook configuration: input and output folders, both relative to the
# directory the notebook is run from.
source_directory, target_directory = (
    "./pubmed_outputs",    # folder whose files are read as PubMed JSON exports
    "./pubmed_abstracts",  # folder that receives the combined CSV file
)

In [4]:
# Make sure the output folder exists before anything is written to it;
# exist_ok=True keeps the cell safe to re-run.
# NOTE(review): this literal names the same folder as `target_directory`
# ("./pubmed_abstracts") - keep the two in sync if either one changes.
os.makedirs(name="pubmed_abstracts", exist_ok=True)

In [5]:
# Empty table with one column per field extracted from each article.
abstracts_df = pd.DataFrame(
    data=None,
    columns=["title", "abstract", "year"],
)

In [6]:
# Gather every article from every JSON export in `source_directory` into one
# table. Rows are collected in a plain list and the DataFrame is built once at
# the end: this avoids growing the frame one row at a time, and it means
# re-running this cell rebuilds `abstracts_df` from scratch instead of
# appending the same articles a second time.
abstract_records = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# reproducible from run to run.
for file in sorted(os.listdir(source_directory)):
    file_path = os.path.join(source_directory, file)
    try:
        # Read as UTF-8 explicitly rather than relying on the platform default
        # encoding. Opening inside the `try` lets an unreadable entry (for
        # example a stray sub-directory) be reported and skipped like any
        # other bad file instead of aborting the whole loop.
        with open(file_path, "r", encoding="utf-8") as f:
            json_data = json.load(f)

        # Each file is expected to hold a list of article dicts; missing
        # fields fall back to an empty string.
        for data in json_data:
            abstract_records.append(
                {
                    "title": data.get("title", ""),
                    "abstract": data.get("abstract", ""),
                    "year": data.get("year", ""),
                }
            )
    except Exception as e:
        # Best-effort load: report the problem and continue with the next file.
        # Articles already collected from this file before the error are kept.
        print(f"Error occurred while processing file {file}: {e}")

# `columns=` fixes the column order and keeps the three columns present even
# when no article could be loaded.
abstracts_df = pd.DataFrame(abstract_records, columns=["title", "abstract", "year"])

In [7]:
# Write the combined table to <target_directory>/pubmed_abstracts.csv.
# The file path gets its own variable so that `target_directory` keeps
# pointing at the folder: rebinding it to the file path would make a second
# run of this cell build ".../pubmed_abstracts.csv/pubmed_abstracts.csv".
abstracts_csv_path = os.path.join(target_directory, "pubmed_abstracts.csv")
# index=False: the row index carries no information, so leave it out of the file.
abstracts_df.to_csv(abstracts_csv_path, index=False)

In [8]:
# Show the combined table as the cell output to check what was loaded.
abstracts_df

Unnamed: 0,title,abstract,year
0,Choline metabolism in regulating inflammatory ...,Anxiety and depression caused by inflammatory ...,2024
1,Selective corticotropin-releasing factor 1 rec...,Corticotropin-releasing factor (CRF) is a horm...,2016
2,Behavior and the cholinergic parameters in olf...,Olfactory bulbectomy (OBX) in rodents induces ...,2016
3,The role of the cholinergic system in the sign...,In comparison to studies of the involvement of...,2013
4,Effects of cholinergic system of dorsal hippoc...,Some investigations have shown that the glutam...,2011
...,...,...,...
830,Glucocorticoid receptor dysregulation underlie...,Prenatal environmental insults increase the ri...,2022
831,Association between the loudness dependence of...,Although serotonergic dysfunction is significa...,2022
832,Identification of cerebrospinal fluid and seru...,Psychotic disorders are currently diagnosed by...,2022
833,Mismatch negativity as an index of target enga...,Serotonin type-3 receptor (5-HT<sub>3</sub>R) ...,2022


In [9]:
def dataframe_to_documents(df):
    """Build one ``Document`` per row of ``df``.

    Args:
        df: DataFrame with an "abstract" column; "title" and "year" columns
            are optional.

    Returns:
        A list of ``Document`` objects in row order. Each document's text is
        the row's "abstract" value and its metadata holds the row's "title"
        and "year" (an empty string when that column is absent).

    Raises:
        KeyError: if ``df`` has rows but no "abstract" column.
    """
    return [
        Document(
            text=record["abstract"],
            metadata={
                "title": record.get("title", ""),
                "year": record.get("year", ""),
            },
        )
        for _label, record in df.iterrows()
    ]

In [10]:
# Convert the abstracts table into a list of Document objects, one per row.
documents = dataframe_to_documents(abstracts_df)