In [2]:
import joblib
import pandas as pd
import random
import json
import re
import spacy
nlp = spacy.load("en_core_web_sm")
import string
import os
from tqdm.notebook import tqdm
from openai import OpenAI
import numpy as np
from multiprocessing import Pool
from tqdm import trange

In [3]:
data_26406687 = joblib.load('/home/gy237/project/Biomedical_datasets/total_pubmed/Sampled_from_total_PubMed_specific_name_12_16/abstracts_nonull_1-1575.joblib')
print(len(data_26406687))
print(data_26406687.columns)

26406687
Index(['pmid', 'title', 'abstract', 'journal', 'pubdate', 'authors',
       'mesh_terms', 'pub_year', 'pub_month'],
      dtype='object')


In [22]:
# 将abstracts分句并进行保存
def ends_with_punctuation(text):
    return text[-1] in string.punctuation if text else False

def split_abstracts(abstract):
    doc = nlp(abstract)
    abstracts = [sent.text for sent in doc.sents]
    assert len(abstracts) != 0, abstract
        
    new_abstracts = []
    for j in abstracts:
        j = j.strip().split('\n')   # 有些内部会有\n
        for k in j:
            if len(k.split(' ')) > 5 and ends_with_punctuation(k):
                new_abstracts.append(k)
    return new_abstracts

def save_abstracts(data, folder):
    os.makedirs(folder, exist_ok=True)
    for index, row in data.iterrows():
        pmid = row['pmid']
        abstract = row['abstract']
        yn = row['yn']
        doc = split_abstracts(abstract)
        
        # 以pmid为文件名创建txt文件
        os.makedirs(f"{folder}", exist_ok=True)
        with open(f"{folder}/{pmid}.txt", "w", encoding="utf-8") as file:
            for j in doc:
                file.write(j + '\n')
            file.write(f'{yn} (No need to annotate, just for notice).\n')


In [None]:
def generate(prompt, question):
    # return 'Y'    # use, when you check the diagnoses list
    client = OpenAI()
        
    chat_return = client.chat.completions.create(model='gpt-4o-mini',temperature=0.0, messages=[{"role": "system", "content": prompt}, {"role":"user", "content": question}])

    result = chat_return.choices[0].message.content
    return result

def process_chunk(index_list, abstract_list, prompt_list):
    result = []
    for i in trange(len(index_list)):
        flag = True
        count = 0
        # while flag:
        yn = generate(prompt_list[i], abstract_list[i])
            # count += 1
            # if yn in ['Y', 'N']:
            #     flag = False
            # elif count > 2:
            #     print(f'Error, {abstract_list[i]}')
            #     flag = False
        result.append({'id': index_list[i], 'yn': yn})
    return result

def filter(prompt, batch, num_tasks):
    index_list = batch.index.tolist()
    abstracts = batch['abstract'].tolist()
    prompt_list = [prompt]*len(index_list)
    
    index_list = np.array_split(index_list, num_tasks)
    abstracts_list = np.array_split(abstracts, num_tasks)
    prompt_list = np.array_split(prompt_list, num_tasks)

    with Pool(num_tasks) as pool:
        results = pool.starmap(process_chunk, zip(index_list, abstracts_list, prompt_list))
    
    for result in results:
        for i in result:
            batch.loc[i['id'], 'yn'] = i['yn']
        
    return batch

In [17]:
prompt = '''**Task:**
You will be provided with the abstract of an article. Your goal is to determine whether it mentions any Datasets, Repositories, or Knowledge Bases. Importantly, each mention must include a specific name.
**Output Format:**
If any names of these are mentioned, respond with: Y and the names
If none names are mentioned, respond with: N
**Guidelines:**
1. When determining your response, respond with “Y” when any named entity that includes, provides or refers to data is mentioned. 
2. Please note that you should also respond "Y" even if you don't recognize the name.
3. If you encounter any hesitation or need to think, you should respond with “Y”.
**Definitions for Reference:**
Specific name: A name that allows the mentions to be identified, usually containing capital letters.
Examples: 'yeast dataset' and 'a dataset of 150 COVID-19 RCT abstracts' are not specific names.
Dataset: A structured collection of data. Note that any named surveys, interviews and questionnaires can be considered as Dataset when they have specific names.
Dataset Examples: "2020-2021 Minimum Data Set 3.0", "Medicare datasets", "2001 Participation and Activity Limitation Survey (PALS)", "the Kansas City Cardiomyopathy Questionnaire (KCCQ)"
Repository: A platform or site that collects, manages, and stores datasets for secondary use in research. Note that search platforms can be considered as Repository when they have specific names.
Repository Examples: "GenBank", "ClinicalTrials", "the Protein Data Bank (PDB)", "PubMed", "the Human Genome Project data", "the ReNDiS database", "the Canadian Cancer Registry", "International clinical guidelines"
Knowledge Base: A curated collection of information or data about a specific topic.
Knowledge Base Examples: "LinkedOmicsKB", "the Clinical Trial Knowledge Base", "mirtronDB"
**Abstract:**
'''

# Test Accuracy

In [4]:
# 加载之前的200个annotated pmid and abstracts
batch1 = '/home/gy237/project/Biomedical_datasets/total_pubmed/Batch_1/agreement'
batch2 = '/home/gy237/project//Biomedical_datasets/total_pubmed/Batch_2/Batch_2'
true = '/home/gy237/project/Biomedical_datasets/total_pubmed/Sampled_from_total_PubMed_specific_name_12_16/specific_name_pmid.txt'

with open(true, 'r') as file:
    true_pmid = file.readlines()
true_pmid = [i.strip() for i in true_pmid]

pmid1 = os.listdir(batch1)
pmid2 = os.listdir(batch2)

pmids = [i.split('.')[0] for i in pmid1] + [i.split('.')[0] for i in pmid2]
assert set(true_pmid).issubset(pmids)


df = pd.DataFrame(columns=["pmid", "abstract"])
for i in pmid1:
    with open(f"{batch1}/{i}", 'r') as f:
        text = f.readlines()
        text = [j.strip() for j in text]
        text = ' '.join(text)
    i = i.split('.')[0]
    df.loc[len(df), 'pmid'] = i
    df.loc[len(df)-1, 'abstract'] = text

for i in pmid2:
    with open(f"{batch2}/{i}", 'r') as f:
        text = f.readlines()
        text = [j.strip() for j in text]
        text = ' '.join(text)
    i = i.split('.')[0]
    df.loc[len(df), 'pmid'] = i
    df.loc[len(df)-1, 'abstract'] = text

In [62]:
# Test accuracy
# 使用交互式API
df_yn = filter(prompt, df, 10)

yn_pmid = df_yn[df_yn['yn'] != 'N']['pmid'].tolist()
print(len(yn_pmid))
print(len(true_pmid))

error = []
for i in true_pmid:
    if i not in yn_pmid:
        j = f'{i}.txt'
        error.append(j)
print(len(error))
print(f'Accuracy: {len(true_pmid)-len(error)}/{len(true_pmid)}')

# gpt-4o-mini accuracy 16/27
# gpt-4o accuracy 27/27

100%|██████████| 20/20 [00:08<00:00,  2.49it/s]
100%|██████████| 20/20 [00:08<00:00,  2.49it/s]
100%|██████████| 20/20 [00:08<00:00,  2.39it/s]
100%|██████████| 20/20 [00:08<00:00,  2.36it/s]
100%|██████████| 20/20 [00:08<00:00,  2.27it/s]
100%|██████████| 20/20 [00:09<00:00,  2.21it/s]
100%|██████████| 20/20 [00:09<00:00,  2.14it/s]
100%|██████████| 20/20 [00:10<00:00,  1.95it/s]
100%|██████████| 20/20 [00:10<00:00,  1.82it/s]
100%|██████████| 20/20 [00:12<00:00,  1.55it/s]


32
27
0
Accuracy: 27/27


# Sample Batch

In [13]:
batch = data_26406687.sample(n=6500, random_state=42)
batch = batch.reset_index(drop=True)
print(len(batch))
print(batch.columns)

batch_1 = joblib.load('/home/gy237/project/Biomedical_datasets/total_pubmed/Sampled_from_total_PubMed_specific_name_12_16/Batch_1_sample_5000_12_16/Batch_1_sample_5000_12_16.joblib')
print(len(batch_1))

filtered_df = batch[~batch['pmid'].isin(batch_1['pmid'])]
print(len(filtered_df))

6500
Index(['pmid', 'title', 'abstract', 'journal', 'pubdate', 'authors',
       'mesh_terms', 'pub_year', 'pub_month'],
      dtype='object')
5000
1500


In [18]:
batch_yn = filter(prompt, filtered_df, 20)
print(len(batch_yn[batch_yn['yn'] != 'N']))

100%|██████████| 75/75 [00:27<00:00,  2.73it/s]
100%|██████████| 75/75 [00:27<00:00,  2.69it/s]
 79%|███████▊  | 59/75 [00:28<00:07,  2.26it/s]
100%|██████████| 75/75 [00:28<00:00,  2.59it/s]
100%|██████████| 75/75 [00:29<00:00,  2.57it/s]
100%|██████████| 75/75 [00:30<00:00,  2.49it/s]
100%|██████████| 75/75 [00:30<00:00,  2.44it/s]
100%|██████████| 75/75 [00:30<00:00,  2.43it/s]
100%|██████████| 75/75 [00:31<00:00,  2.39it/s]
100%|██████████| 75/75 [00:30<00:00,  2.43it/s]
100%|██████████| 75/75 [00:32<00:00,  2.32it/s]
100%|██████████| 75/75 [00:32<00:00,  2.29it/s]
100%|██████████| 75/75 [00:32<00:00,  2.29it/s]
100%|██████████| 75/75 [00:33<00:00,  2.25it/s]
100%|██████████| 75/75 [00:33<00:00,  2.25it/s]
100%|██████████| 75/75 [00:34<00:00,  2.18it/s]
100%|██████████| 75/75 [00:34<00:00,  2.15it/s]
100%|██████████| 75/75 [00:35<00:00,  2.14it/s]
100%|██████████| 75/75 [00:36<00:00,  2.08it/s]
100%|██████████| 75/75 [00:47<00:00,  1.59it/s]
A value is trying to be set on a copy of

In [23]:
# 保存文件
name = 'Batch_3_sample_6500_12_16'

joblib.dump(batch_yn, f'/home/gy237/project/Biomedical_datasets/total_pubmed/Sampled_from_total_PubMed_specific_name_12_16/{name}/{name}.joblib')

print(len(batch_yn))
batch_y = batch_yn[batch_yn['yn'] != 'N']
print(len(batch_y))

# 指定第一组的数量
group1_size = 56
# 随机打乱索引
shuffled_indices = np.random.permutation(batch_y.index)
# 分成两组
group1_indices = shuffled_indices[:group1_size]
group2_indices = shuffled_indices[group1_size:]
group1 = batch_y.loc[group1_indices]
group2 = batch_y.loc[group2_indices]

save_abstracts(group1, f'/home/gy237/project/Biomedical_datasets/total_pubmed/Sampled_from_total_PubMed_specific_name_12_16/{name}/{name}')
save_abstracts(group2, f'/home/gy237/project/Biomedical_datasets/total_pubmed/Sampled_from_total_PubMed_specific_name_12_16/The_rest/{name}_number_{len(group2)}')

1500
59


# Batch API

In [63]:
# batch = data.sample(n=5000, random_state=42)
# batch = batch.reset_index(drop=True)
# print(len(batch))
# print(batch.columns)

output_file = '/home/gy237/project/Biomedical_datasets/total_pubmed/Sampled_from_total_PubMed_specific_name_12_16/sample_5000_to_openai.jsonl'

df = df.drop_duplicates(subset=['pmid'], keep='last')
df = df.reset_index(drop=True)
for i in range(len(df)):
    dic = {"custom_id": df.loc[i, 'pmid'], "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-4o", "messages": [{"role": "system", "content": prompt},{"role": "user", "content": df.loc[i, 'abstract']}],"max_tokens": 16000}}
    with open(output_file,'a', encoding='utf-8') as file:
        file.write(json.dumps(dic) + '\n')

# for i in range(len(batch)):
#     dic = {"custom_id": batch.loc[i, 'pmid'], "method": "POST", "url": "/v1/chat/completions", "body": {"model": "gpt-4o", "messages": [{"role": "system", "content": prompt},{"role": "user", "content": batch.loc[i, 'abstract']}],"max_tokens": 49000}}
#     with open(output_file,'a', encoding='utf-8') as file:
#         file.write(json.dumps(dic) + '\n')

In [84]:
name = "Filter out pmids with specific names"
client = OpenAI()
# upload your batch input file
batch_input_file = client.files.create(
  file=open(f"/home/gy237/project/Biomedical_datasets/total_pubmed/Sampled_from_total_PubMed_specific_name_12_16/sample_5000_to_openai.jsonl", "rb"),
  purpose="batch"
)

# create the batch, only the description can change
batch_input_file_id = batch_input_file.id
client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": f'{name}'
    }
)

Batch(id='batch_6760a2410a0081909b888c8d085a0fd4', completion_window='24h', created_at=1734386241, endpoint='/v1/chat/completions', input_file_id='file-JgfzLMAaS5ZVLvwQFs8Akg', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1734472641, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'Filter out pmids with specific names'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0))

In [102]:
batch_api = client.batches.retrieve("batch_67609fd318208190b4b6c8d6523160be")
# client.batches.cancel("batch_abc123")
print(batch_api)
print(batch_api.status)

Batch(id='batch_67609fd318208190b4b6c8d6523160be', completion_window='24h', created_at=1734385619, endpoint='/v1/chat/completions', input_file_id='file-G2MBddu6JjdmvCRcjcUnaR', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1734386639, error_file_id=None, errors=None, expired_at=None, expires_at=1734472019, failed_at=None, finalizing_at=1734386619, in_progress_at=1734385619, metadata={'description': 'Filter out pmids with specific names'}, output_file_id='file-4D9n2fxX9U64CgjRJGWzGB', request_counts=BatchRequestCounts(completed=184, failed=0, total=184))


# curl https://api.openai.com/v1/files/file-4D9n2fxX9U64CgjRJGWzGB/content \
# -H "Authorization: Bearer $OPENAI_API_KEY" > /home/gy237/project/Biomedical_datasets/total_pubmed/Sampled_from_total_PubMed_specific_name_12_16/batch_output_2.jsonl

Batch(id='batch_67609fd318208190b4b6c8d6523160be', completion_window='24h', created_at=1734385619, endpoint='/v1/chat/completions', input_file_id='file-G2MBddu6JjdmvCRcjcUnaR', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1734386639, error_file_id=None, errors=None, expired_at=None, expires_at=1734472019, failed_at=None, finalizing_at=1734386619, in_progress_at=1734385619, metadata={'description': 'Filter out pmids with specific names'}, output_file_id='file-4D9n2fxX9U64CgjRJGWzGB', request_counts=BatchRequestCounts(completed=184, failed=0, total=184))
completed


In [9]:
with open('/home/gy237/project/Biomedical_datasets/total_pubmed/Sampled_from_total_PubMed_specific_name_12_16/batch_output.jsonl', 'r', encoding='utf-8') as file:
    data = []
    for line in file:
        data.append(json.loads(line.strip()))

print(len(data))
print(data[0])


# merge 回去
for i in data:
    pmid = i['custom_id']
    yn = i['response']['body']['choices'][0]['message']['content']

    # 计算cost
    prompt_tokens = i['response']['body']['usage']['prompt_tokens']
    completion_tokens = i['response']['body']['usage']['completion_tokens']
    cost = (prompt_tokens*1.25 + completion_tokens*5)/1000000

    df.loc[df['pmid'] == pmid, 'yn'] = yn
    df.loc[df['pmid'] == pmid, 'cost'] = cost

# for i in data:
#     pmid = i['custom_id']
#     yn = i['response']['body']['choices'][0]['message']['content']
#     batch.loc[batch['pmid'] == pmid, 'yn'] = yn


yn_pmid = df[df['yn'] != 'N']['pmid'].tolist()
print(len(yn_pmid))
print(len(true_pmid))

error = []
for i in true_pmid:
    if i not in yn_pmid:
        error.append(i)
print(len(error))
print(error)
print(f'Accuracy: {len(true_pmid)-len(error)}/{len(true_pmid)}')

total_cost = df['cost'].sum()
print("Total cost:", total_cost)
# 184 abstracts cost 0.1705$ using Batch API

184
{'id': 'batch_req_6760964fbd608190bbee0810f56c3e14', 'custom_id': '8523925', 'response': {'status_code': 200, 'request_id': '0c82e98f0f99937797fc449e5c620cad', 'body': {'id': 'chatcmpl-AfCZ7z5AS6bT9FDa84Mg7DEuJ85bl', 'object': 'chat.completion', 'created': 1734383113, 'model': 'gpt-4o-2024-08-06', 'choices': [{'index': 0, 'message': {'role': 'assistant', 'content': 'N', 'refusal': None}, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 685, 'completion_tokens': 1, 'total_tokens': 686, 'prompt_tokens_details': {'cached_tokens': 0, 'audio_tokens': 0}, 'completion_tokens_details': {'reasoning_tokens': 0, 'audio_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}}, 'system_fingerprint': 'fp_9faba9f038'}}, 'error': None}
32
27
1
['19269992']
Accuracy: 26/27
Total cost: 0.17054249999999999


In [None]:
# joblib.dump(filtered_data, '/home/gy237/project/Biomedical_datasets/total_pubmed/abstracts_filtere_1-1575.joblib')

In [5]:
import hashlib

def calculate_file_hash(file_path, algorithm='sha256', buffer_size=65536):
    """
    计算文件的哈希值。
    
    参数:
        file_path (str): 文件路径。
        algorithm (str): 哈希算法 ('md5', 'sha1', 'sha256'等)。
        buffer_size (int): 每次读取的字节数，默认为 64KB。
    
    返回:
        str: 文件的哈希值。
    """
    try:
        # 根据算法名称创建哈希对象
        hash_func = hashlib.new(algorithm)
        
        # 按块读取文件并更新哈希
        with open(file_path, 'rb') as f:
            while chunk := f.read(buffer_size):
                hash_func.update(chunk)
        
        # 返回十六进制哈希值
        return hash_func.hexdigest()
    except Exception as e:
        print(f"计算哈希值时出错: {e}")
        return None

# 示例用法
file_path = "Batch_1_sample_5000_12_16/Batch_1_sample_5000_12_16.joblib"
hash_value = calculate_file_hash(file_path, algorithm='sha256')
if hash_value:
    print(f"文件的 SHA-256 哈希值: {hash_value}")

文件的 SHA-256 哈希值: 4218dd64a7f04a96e0d25a1f1232b7b904af1936c2b963f573d6da3dade4f8ea


In [3]:
import joblib
data_26406687 = joblib.load('/home/gy237/project/Biomedical_datasets/total_pubmed/Sampled_from_total_PubMed_specific_name_12_16/Batch_1_sample_5000_12_16.joblib')
print(len(data_26406687))
print(data_26406687.columns)

5000
Index(['pmid', 'title', 'abstract', 'journal', 'pubdate', 'authors',
       'mesh_terms', 'pub_year', 'pub_month', 'yn'],
      dtype='object')
