In [1]:
import joblib
import pandas as pd
import random
import json
import re
import spacy
nlp = spacy.load("en_core_web_sm")
import string
import os
from tqdm.notebook import tqdm
from openai import OpenAI
import numpy as np
from multiprocessing import Pool
from tqdm import trange



In [2]:
# 加载.joblib文件
old_data = joblib.load('/home/gy237/project/Biomedical_datasets/total_pubmed/abstracts_1-1252.joblib')
print(old_data.columns)
# ['pmid', 'title', 'abstract', 'journal', 'pubdate', 'authors','mesh_terms']

Index(['pmid', 'title', 'abstract', 'journal', 'pubdate', 'authors',
       'mesh_terms', 'pub_year', 'pub_month'],
      dtype='object')


In [4]:
new_data = joblib.load('/home/gy237/project/Biomedical_datasets/total_pubmed/abstracts_1253-1575.joblib')
print(new_data.columns)

Index(['pmid', 'title', 'abstract', 'journal', 'authors', 'mesh_terms',
       'pub_year', 'pub_month'],
      dtype='object')


In [5]:
combined_df = pd.concat([old_data, new_data], axis=0)
print('去重前：', len(combined_df))
data = combined_df.drop_duplicates(subset=['pmid'], keep='last')
print('去重后：', len(data))
print(data.columns)
# 去重前： 39390916
# 去重后： 37814430

去重前： 39390916
去重后： 37814430
Index(['pmid', 'title', 'abstract', 'journal', 'pubdate', 'authors',
       'mesh_terms', 'pub_year', 'pub_month'],
      dtype='object')


In [6]:
# 将abstracts分句并进行保存
def ends_with_punctuation(text):
    return text[-1] in string.punctuation if text else False

def split_abstracts(abstract):
    doc = nlp(abstract)
    abstracts = [sent.text for sent in doc.sents]
    assert len(abstracts) != 0, abstract
        
    new_abstracts = []
    for j in abstracts:
        j = j.strip().split('\n')   # 有些内部会有\n
        for k in j:
            if len(k.split(' ')) > 5 and ends_with_punctuation(k):
                new_abstracts.append(k)
    return new_abstracts

def save_abstracts(data, name):
    for index, row in data.iterrows():
        pmid = row['pmid']
        abstract = row['abstract']
        doc = split_abstracts(abstract)
        
        # 以pmid为文件名创建txt文件
        os.makedirs(f"/home/gy237/project/Biomedical_datasets/total_pubmed/{name}", exist_ok=True)
        with open(f"/home/gy237/project/Biomedical_datasets/total_pubmed/{name}/{pmid}.txt", "w", encoding="utf-8") as file:
            for j in doc:
                file.write(j + '\n')


In [7]:
def generate(prompt, question):
    # return 'Y'    # use, when you check the diagnoses list
    client = OpenAI(api_key="sk-svcacct-Pqi-bFBAdqeGHBZO56dX8kbGpYm05g3dV920DDxOc7LNGpts6jpeYvRSwRDaSF15xT3BlbkFJd5kbvd-ja5Jh7jXMv6_OpXZKviW84lRFT3DGkPGgz43himYNc_7VzYwMWDhcn9FAA")
        
    chat_return = client.chat.completions.create(model='gpt-4o-mini',temperature=0.0, messages=[{"role": "system", "content": prompt}, {"role":"user", "content": question}])

    result = chat_return.choices[0].message.content
    return result

def process_chunk(index_list, abstract_list, prompt_list):
    result = []
    for i in trange(len(index_list)):
        flag = True
        count = 0
        # while flag:
        yn = generate(prompt_list[i], abstract_list[i])
            # count += 1
            # if yn in ['Y', 'N']:
            #     flag = False
            # elif count > 2:
            #     print(f'Error, {abstract_list[i]}')
            #     flag = False
        result.append({'id': index_list[i], 'yn': yn})
    return result

def filter(prompt, batch, num_tasks):
    index_list = batch.index.tolist()
    abstracts = batch['abstract'].tolist()
    prompt_list = [prompt]*len(index_list)
    
    index_list = np.array_split(index_list, num_tasks)
    abstracts_list = np.array_split(abstracts, num_tasks)
    prompt_list = np.array_split(prompt_list, num_tasks)

    with Pool(num_tasks) as pool:
        results = pool.starmap(process_chunk, zip(index_list, abstracts_list, prompt_list))
    
    for result in results:
        for i in result:
            batch.loc[i['id'], 'yn'] = i['yn']
        
    return batch

In [32]:
prompt = '''I will provide you with the abstract of an article. Your task is to determine if it contains any mentions of Datasets, Repositories, or Knowledge Bases. If any of these are mentioned, respond with Y and the mentions; otherwise, respond with N. Please note that you should lean towards outputting Y, as manual verification will be conducted later.
Below are the definitions and examples of Datasets, Repositories, and Knowledge Bases for your reference:
Dataset: A collection of data.
Repository: A data hosting site that collects, manages, and stores datasets for secondary use in research.
Knowledge Base: A collection of data or information about a particular subject. A knowledge base is typically curated.'''

In [8]:
prompt = '''**Task:**
You will be provided with the abstract of an article. Your goal is to determine whether it mentions any Datasets, Repositories, or Knowledge Bases. Importantly, each mention must include a specific name (e.g., a named dataset, repository, or knowledge base).
**Output Format:**
If any of these are mentioned, respond with: Y
If none are mentioned, respond with: N
**Guidelines:**
Err on the side of caution and lean towards responding with "Y" when in doubt, as manual verification will follow.
**Definitions for Reference:**
Dataset: A structured collection of data. Surveys, interviews and questionnaires can be considered as dataset.
Dataset Examples: "2020-2021 Minimum Data Set 3.0", "Medicare datasets", "the Current Population Survey (CPS)", "the Kansas City Cardiomyopathy Questionnaire (KCCQ)"
Repository: A platform or site that collects, manages, and stores datasets for secondary use in research.
Repository Examples: "GenBank", "the Protein Data Bank (PDB)", "Scopus", "the Human Genome Project data", "the ReNDiS database", "the Canadian Cancer Registry", "International clinical guidelines"
Knowledge Base: A curated collection of information or data about a specific topic.
Knowledge Base Examples: "LinkedOmicsKB", "the Clinical Trial Knowledge Base", "mirtronDB"'''

In [9]:
batch = data.sample(n=300, random_state=42)
print(len(batch))
df_filtered = batch[batch['abstract'].apply(lambda x: x.strip()) != '']
print(len(df_filtered))

df_filtered = filter(prompt, df_filtered, 10)
df_filtered.to_json('/home/gy237/project/Biomedical_datasets/total_pubmed/df_filtered.json', orient='records', lines=True, indent=4)

df_filtered = df_filtered[df_filtered['yn']!='N']
print(len(df_filtered))

# batch = df_filtered.sample(n=200, random_state=42)
# save_abstracts(batch, 'total_PubMed_sample_to_decide_key_words/Batch_2')

300
219


100%|██████████| 22/22 [00:08<00:00,  2.59it/s]
100%|██████████| 21/21 [00:08<00:00,  2.46it/s]
100%|██████████| 22/22 [00:08<00:00,  2.51it/s]
100%|██████████| 22/22 [00:09<00:00,  2.40it/s]
100%|██████████| 22/22 [00:09<00:00,  2.37it/s]
100%|██████████| 22/22 [00:09<00:00,  2.33it/s]
100%|██████████| 22/22 [00:10<00:00,  2.20it/s]
100%|██████████| 22/22 [00:10<00:00,  2.17it/s]
100%|██████████| 22/22 [00:10<00:00,  2.13it/s]
100%|██████████| 22/22 [00:12<00:00,  1.81it/s]


6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  batch.loc[i['id'], 'yn'] = i['yn']


In [None]:
general_repositories = [
        " data", "dataset", "database", "repository", "survey", "questionnaire", "PubMed"," NCBI ","Scopus","Kaggle","knowledge base",
        "Mendeley","Education Resources Information Center","Science Open","Web of Science","Cochrane Library","EMbase",
        "Chinese Biomedical Literature Database","Medline",
        "Kaggle", "Google Dataset Search", "Zenodo", "Dryad", "Figshare",
        "Open Data Portal", "World Bank Open Data", "UN Data", "Data.gov", 
        "DataHub", "IEEE DataPort", "Mendeley Data", 
        "Open Science Framework", "AWS Open Data Registry", "Harvard Dataverse",
        "ICPSR","re3data.org", "PLOS Data Repository", "UK Data Service", 
        "Humanitarian Data Exchange", "UCI Machine Learning Repository", 
        "Statista", "Quandl", "OpenStreetMap", "EarthData", "Global Health Observatory", 
        "GBIF", "GCMD", "Eurostat", "OECD Data", 
        "Climate Data Store", "FAOSTAT", "Geonames", "NCEI", 
        "Copernicus Open Access Hub", "Landsat Data Repository", 
        "National Cancer Institute Genomic Data Commons", 
        "Census Bureau Data", "IMF Data", "OpenAIRE", 
        "NCES", "PERSEE", "StatBank Denmark", 
        "Australian Data Archive", "China Statistical Yearbook", 
        "Open Knowledge Foundation CKAN", "BigQuery Public Datasets", 
        "UNESCO Institute for Statistics Data Centre", "Linked Data Platform"
]

life_science_repository = [
        "ClinicalTrials", "clinical trials",
        "GenBank","Gene Expression Omnibus","PubChem","Protein Data Bank","UniProt",
        "Genotype-Tissue Expression","Bioproject","dbSNP","ClinVar","PhysioNet","National Alzheimer's Coordinating Center",
        "Sequence Read Archive","LINCS","ImmPort","dbGaP","The Cancer Imaging Archive","CellChat",
        "FlyBase","BioPortal","Mouse Genome Informatics","National COVID Cohort Collaborative","Saccharomyces Genome Database",
        "Genomic Data Commons","PeptideAtlas","WormBase","International Mouse Phenotyping Consortium",
        "BindingDB","CHILDES","NITRC","Rat Genome Database","Immunological Genome Project",
        "Investigational New Drug Applications","ICPSR","HOMD","AphasiaBank","OpenNeuro",
        "Metabolomics","4D-Nucleome","NDEx","Mouse Phenome Database","BioLINCC",
        "National Sleep Research Resource","Xenbase",
        "NIH Genetic Testing Registry","BMRB","Kids First Data Resource",
        "Monarch Initiative","dbVar","ZFIN",
]
total_target = general_repositories + life_science_repository
print(len(total_target))

In [None]:
import pandas as pd
from tqdm import tqdm
import multiprocessing
import numpy as np

# 创建一个空的DataFrame用于存储匹配结果
filtered_data = pd.DataFrame()
key_dic = {}

# 将数据切割成n个部分
def process_chunk(chunk, total_target):
    chunk_filtered_data = pd.DataFrame()
    chunk_key_dic = {}

    for keyword in total_target:
        matched_rows = chunk[chunk['abstract'].str.contains(keyword, case=False, na=False)]
        chunk_key_dic[keyword] = len(matched_rows)
        chunk_filtered_data = pd.concat([chunk_filtered_data, matched_rows])

    # 去重
    chunk_filtered_data = chunk_filtered_data.drop_duplicates(subset=['pmid'], keep='last')

    return chunk_filtered_data, chunk_key_dic

def parallel_process(data, total_target, num_chunks=20):
    print(len(data))
    # 将数据切割成多个块
    chunks = np.array_split(data, num_chunks)
    
    # 使用多进程处理每个块
    with multiprocessing.Pool(processes=num_chunks) as pool:
        results = pool.starmap(process_chunk, [(chunk, total_target) for chunk in chunks])

    # 合并结果
    final_filtered_data = pd.DataFrame()
    final_key_dic = {}

    for chunk_filtered_data, chunk_key_dic in results:
        final_filtered_data = pd.concat([final_filtered_data, chunk_filtered_data])
        for key, value in chunk_key_dic.items():
            if key in final_key_dic:
                final_key_dic[key] += value
            else:
                final_key_dic[key] = value

    # 去重
    final_filtered_data = final_filtered_data.drop_duplicates(subset=['pmid'], keep='last')

    return final_filtered_data, final_key_dic

# 假设data和total_target已经准备好
filtered_data, key_dic = parallel_process(data, total_target, num_chunks=10)

# 打印结果
print(len(filtered_data))
print(key_dic)

In [None]:
# joblib.dump(filtered_data, '/home/gy237/project/Biomedical_datasets/total_pubmed/abstracts_filtere_1-1575.joblib')

In [None]:
# with open('/home/gy237/project/Biomedical_datasets/total_pubmed/abstracts_filtere_1-1575.json', 'w', encoding='utf-8') as f:
#     json.dump(key_dic, f, ensure_ascii=False, indent=4)

In [3]:
filtered_df = joblib.load('/home/gy237/project/Biomedical_datasets/total_pubmed/abstracts_filtere_1-1575.joblib')
print(len(filtered_df))

10501984


In [None]:
f_list = [
    '/home/gy237/project/Biomedical_datasets/total_pubmed/Batch_1/agreement',
    '/home/gy237/project/Biomedical_datasets/total_pubmed/Batch_2/Batch_2',
    '/home/gy237/project/Biomedical_datasets/total_pubmed/Batch_3',
]

folder_list = [
    "/home/gy237/project/Biomedical_datasets/total_pubmed/Gui_Batches",
    "/home/gy237/project/Biomedical_datasets/total_pubmed/Kalpana_Batches"
]

for i in folder_list:
    f_list.extend([f'{i}/{j}' for j in os.listdir(i)])
print(len(f_list))

exist_pmids = []
for i in f_list:
    exist_pmids.extend(os.listdir(i))
exist_pmids = [i.split('.')[0] for i in exist_pmids]
print('filtered_df:', len(filtered_df))
print('exist_pmids:', len(set(exist_pmids)))


filtered_df_left = filtered_df[~filtered_df['pmid'].isin(exist_pmids)]
print('filtered_df_left :', len(filtered_df_left))


batch = filtered_df_left.sample(n=100, random_state=42)
save_abstracts(batch, 'Kalpana_Batches/Batch_10')