In [1]:
## Import the text content of an existing paper and process the text by chapter

In [1]:
# 1 Import the full text content and save the results separately

import csv
import os
import pickle


# --- Paths -------------------------------------------------------------------
# Root of the PDF-extraction output; each paper has its own sub-directory that
# contains a `pickle/` folder with one pickle file per section.
pdf_output_dir = './Diabetes-PDFs-Outputs/'
csv_file_path = "./Diabetes-PDFs-Outputs/all_papers.csv"
output_full_dict_path = "./data_process/ori_data/full_text_dict.pkl"
output_abstract_dict_path = "./data_process/ori_data/abstract_dict.pkl"
output_section1_dict_path = "./data_process/ori_data/introduction_dict.pkl"
output_section2_dict_path = "./data_process/ori_data/method_dict.pkl"
output_section3_dict_path = "./data_process/ori_data/result_dict.pkl"

# section 1, 2, and 3 represent introduction, method, and results respectively.
# Maps the CSV column that names a section's pickle file to the key used for
# that section in each per-paper record (insertion order is the record order).
section_columns = {
    'abstract': 'abstract',
    'section 1': 'intro',
    'section 2': 'method',
    'section 3': 'result',
}

# All dictionaries are keyed by the generated paper id ("paper_1", "paper_2", ...).
full_text_dict = {}
abstract_dict = {}
section1_dict = {}
section2_dict = {}
section3_dict = {}


def load_section_pickle(pdf_name, pickle_name):
    """Load one section from ``<pdf_output_dir>/<pdf_name>/pickle/<pickle_name>``.

    Args:
        pdf_name: Value of the CSV's 'PDF name' column (the paper's sub-directory).
        pickle_name: File name of the section pickle, taken from the CSV row.

    Returns:
        Whatever object was pickled for that section (presumably the section
        text; verify against the extraction step).
    """
    section_path = os.path.join(pdf_output_dir, pdf_name, 'pickle', pickle_name)
    # NOTE(security): pickle.load can execute arbitrary code; only run this on
    # pickle files produced by the trusted extraction step.
    with open(section_path, 'rb') as file:
        return pickle.load(file)


# newline='' is what the csv module expects so that quoted fields containing
# line breaks are parsed correctly.
with open(csv_file_path, 'r', encoding='utf-8', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for idx, row in enumerate(reader):
        # Paper ids are 1-based and follow the row order of the CSV.
        paper_id = f"paper_{idx + 1}"

        full_text = {
            key: load_section_pickle(row['PDF name'], row[column])
            for column, key in section_columns.items()
        }
        full_text['paper_name'] = row['article']

        full_text_dict[paper_id] = full_text
        abstract_dict[paper_id] = full_text['abstract']
        section1_dict[paper_id] = full_text['intro']
        section2_dict[paper_id] = full_text['method']
        section3_dict[paper_id] = full_text['result']


outputs = [
    (output_full_dict_path, full_text_dict),
    (output_abstract_dict_path, abstract_dict),
    (output_section1_dict_path, section1_dict),
    (output_section2_dict_path, section2_dict),
    (output_section3_dict_path, section3_dict),
]
for output_path, payload in outputs:
    # Create the target directory on first use so a fresh checkout can run this cell.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'wb') as file:
        pickle.dump(payload, file)

print("successful。")


文件已成功保存。


In [2]:
## Call the hosted Llama 3 70B API instead of loading the 70B model locally to build the demo

In [2]:

from openai import OpenAI

import os

# The API key is read from the environment so that no credential is stored in
# (or leaked through) the notebook. Any key that was previously committed in
# this cell should be treated as compromised and rotated.
_nvidia_api_key = os.environ.get("NVIDIA_API_KEY")
if not _nvidia_api_key:
    raise RuntimeError(
        "Set the NVIDIA_API_KEY environment variable before creating the client."
    )

# OpenAI-compatible client pointed at the NVIDIA endpoint.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=_nvidia_api_key,
)

In [None]:
# test

# completion = client.chat.completions.create(
#   model="meta/llama3-70b-instruct",
#   messages=[{"role":"user","content":"generate a paper about English, the paper contains more than 50 words"}],
#   temperature=0.5,
#   top_p=1,
#   max_tokens=8000, # 8192 max
#   stream=True
# )

# abs_str = ''
# for chunk in completion:
#   if chunk.choices[0].delta.content is not None:
#     abs_str += chunk.choices[0].delta.content.replace('\n', ' ')
  
# print(abs_str)

In [6]:
# !pip install tiktoken

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
[0m

Construct summary function

Summarize each chapter. To keep the input within the model's context-length limit, build a function that summarizes the text segment by segment.

Function: count the tokens in the text with the token calculator; when the count exceeds the token limit, split the text into segments (not stages) and summarize each segment separately.

In [13]:
# generate abstract
import tiktoken

# Process the text in segments. When the token length exceeds the set length, segment it and cut it into text lists
## It will affect the contextual learning

def split_text(text, max_tokens=5000, model="gpt-3.5-turbo"):
    """Split ``text`` into consecutive chunks of at most ``max_tokens`` tokens.

    Tokens are counted with the encoding that ``tiktoken.encoding_for_model``
    returns for ``model``.

    Args:
        text: The string to split.
        max_tokens: Maximum number of tokens per chunk; must be positive.
        model: Model name used to select the tiktoken encoding.

    Returns:
        ``[text]`` unchanged when it already fits within ``max_tokens``;
        otherwise a list of strings, each decoded from one consecutive slice
        of at most ``max_tokens`` tokens.

    Raises:
        ValueError: If ``max_tokens`` is not positive. Without this check the
            chunking never advances on non-empty text and never terminates.
    """
    if max_tokens <= 0:
        raise ValueError(f"max_tokens must be a positive integer, got {max_tokens!r}")

    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)

    if len(tokens) <= max_tokens:
        return [text]

    # Chunks are cut purely on token counts, so a boundary can fall in the
    # middle of a sentence (this is the context loss the cell comment warns about).
    return [
        encoding.decode(tokens[start:start + max_tokens])
        for start in range(0, len(tokens), max_tokens)
    ]

# Summarize the small paragraphs divided into chapters one by one
def summarize_text(text, client, section, model="meta/llama3-70b-instruct"):
    """Summarize ``text`` segment by segment with a streaming chat completion.

    The text is first cut by ``split_text`` into segments of at most 3000
    tokens; each segment is sent to the model in its own request.

    Args:
        text: The section text to summarize.
        client: Client object exposing ``chat.completions.create``.
        section: Value interpolated into the prompt as the section name.
        model: Model identifier passed to the completion request.

    Returns:
        A list with one summary string per segment, in segment order. Newlines
        in the streamed content are replaced by spaces.
    """
    segment_summaries = []

    for segment in split_text(text, max_tokens=3000):
        prompt = f""" 
You are an expert in the field of medicine with significant contributions to diabetes treatment. You are now tasked with researching and summarizing different sections of a scientific paper, highlighting the core content of the article. You need to consider the characteristics of each section of the paper to provide varying levels of content summarization.

The main sections and themes are:

Abstract: Briefly summarize the study's purpose, methods, results, and conclusions.
Introduction: Provide background information, research questions, and study objectives.
Methods: Describe the study design, experimental methods, and data collection techniques.
Results: Present the research findings and data analysis results.
Below is the text passage from the section {section}: {segment}

Please summarize the content according to the following requirements:

1. Strictly adhere to the original content of the section for summarization.
2. Limit the summary to within 300 words.
3. Use a review writing tone to appropriately refine the related text.
4. Do not include any words or descriptions unrelated to the original text and summary.
"""

        stream = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            top_p=1,
            max_tokens=4500,  # Allow some buffer for response tokens
            stream=True
        )
        # Streamed chunks without content (delta.content is None) are skipped.
        pieces = [
            chunk.choices[0].delta.content.replace('\n', ' ')
            for chunk in stream
            if chunk.choices[0].delta.content is not None
        ]
        segment_summaries.append("".join(pieces))

    return segment_summaries

# Merge the paragraph summary list of the previous single chapter, and splice it by splitting spaces
def combine_summaries(summaries):
    """Join the per-segment summaries into one string, separated by single spaces."""
    return " ".join(summaries)



# Extract the method and result of the text corresponding to each paperid, considering extracting from the abstract + introduction
def extract_info(abstract, introduction, model="meta/llama3-70b-instruct", api_client=None):
    """Ask the model for a short methods/results extraction of one paper.

    Args:
        abstract: Abstract text, interpolated into the prompt.
        introduction: Introduction text, interpolated into the prompt.
        model: Model identifier passed to the completion request.
        api_client: Optional client exposing ``chat.completions.create``. When
            omitted, the notebook-level ``client`` is used, which keeps existing
            calls working while allowing the client to be passed explicitly
            like the other helpers in this notebook.

    Returns:
        The streamed model reply as one string, with newlines replaced by
        spaces. The reply is returned verbatim: it is not parsed or validated
        as JSON here.
    """
    # Resolve the client lazily so the global is only needed when no client is given.
    active_client = client if api_client is None else api_client

    prompt = f""" 
    You are an expert in medical research, focusing on extracting key elements from academic papers. Your task is to summarize the core content related to methods and results from the given sections of a medical paper.

Main sections and their purposes:

1. Abstract: Summarizes the purpose, methods, results, and conclusions of the study.
2. Introduction: Provides background information, research problem, and study objectives.

The content of the abstract is: {abstract}
The content of the introduction is: {introduction}

Please summarize the following:

1. Methods: Extract and describe the research design, experimental methods, and data collection techniques used in the study.
2. Results: Extract and describe the key findings and data analysis results.
Requirements:

1. Base the summary strictly on the original text content.
2. Each summary should be within 50 words.
3. Use a review-like tone to appropriately polish the text.
4. Avoid any words or descriptions unrelated to the original text and summary.
5. Please output in json format

Output Example:
"Method": "xx", 'result':"xx" 
"""

    completion = active_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.5,
        top_p=1,
        max_tokens=300,  # Allow some buffer for response tokens
        stream=True
    )
    # Streamed chunks without content (delta.content is None) are skipped.
    info = "".join(
        chunk.choices[0].delta.content.replace('\n', ' ')
        for chunk in completion
        if chunk.choices[0].delta.content is not None
    )

    return info


# Merge all the content to generate the final summary.
def gen_sum(text, client, section, model="meta/llama3-70b-instruct"):
    """Produce one final summary of ``text`` with a streaming chat completion.

    Args:
        text: The (already combined) text to summarize.
        client: Client object exposing ``chat.completions.create``.
        section: Value interpolated into the prompt as the section name. The
            value ``'all'`` previously had its own branch, but that branch built
            exactly the same prompt as every other value, so a single prompt
            is used here.
        model: Model identifier passed to the completion request.

    Returns:
        The streamed model reply as one string, with newlines replaced by spaces.
    """
    prompt = f""" 
    You are an expert in the field of medicine with significant contributions to diabetes treatment. You are now tasked with researching and summarizing different sections of a scientific paper, highlighting the core content of the article. You need to consider the characteristics of each section of the paper to provide varying levels of content summarization.

    The main sections and themes are:

    Abstract: Briefly summarize the study's purpose, methods, results, and conclusions.
    Introduction: Provide background information, research questions, and study objectives.
    Methods: Describe the study design, experimental methods, and data collection techniques.
    Results: Present the research findings and data analysis results.
    Below is the text passage from the section {section}: {text}

    Please summarize the content according to the following requirements:

    1. Strictly adhere to the original content of the section for summarization.
    2. Limit the summary to within 300 words.
    3. Use a review writing tone to appropriately refine the related text.
    4. Do not include any words or descriptions unrelated to the original text and summary.
    """

    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.5,
        top_p=1,
        max_tokens=4000,  # Allow some buffer for response tokens
        stream=True
    )
    # Streamed chunks without content (delta.content is None) are skipped.
    summary = "".join(
        chunk.choices[0].delta.content.replace('\n', ' ')
        for chunk in completion
        if chunk.choices[0].delta.content is not None
    )

    return summary


In [14]:
# Use llama3 to generate summaries of no more than 100 words for abstracts and introductions for topic classification
import pickle

pkl_file_path = './data_process/ori_data/full_text_dict.pkl'
# NOTE(security): pickle.load can execute arbitrary code; only load files
# produced by this project's own preprocessing step.
with open(pkl_file_path, 'rb') as file:
    paper_dict = pickle.load(file)


# Keyed by paper id; the value holds that paper's section texts.
# Only the method section is exercised in this test cell.
paper_info = {}
for key, value in paper_dict.items():
    paper_info[key] = {
        "methods": value['method']  # only test method here
    }

# The encoding does not change between iterations, so build it once.
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')

abs_dict = {}
for key, value in paper_info.items():
    text = value['methods'] * 5 # Expand the text length for testing
    while True:
        # Summarize segment by segment. The third argument is the section name
        # that ends up in the prompt, so pass 'method' rather than the paper id.
        summaries = summarize_text(text, client, 'method')
        text = '. '.join(summaries) # Get multiple summaries and splice the content

        # Repeat the summarization until the spliced summaries fit the token budget.
        numtokens = len(encoding.encode(text))
        if numtokens < 8000:
            break

    final_summary = combine_summaries(summaries)

    # The method section of each paper continues to be summarized to make it smooth
    final_summary = gen_sum(final_summary, client, section='method')

    print(final_summary)
    print('-' * 100)
    abs_dict[key] = final_summary

Here is a summary of the Methods section:  This study is part of the CORDIOPREV trial, a randomized controlled trial that included 1002 patients with coronary heart disease (CHD) who followed one of two healthy diets (Mediterranean or low-fat) for 7 years. The patients were recruited from hospitals in Cordoba and Jaen, Spain, between 2009 and 2012. Inclusion criteria included patients aged 20-75 years with established CHD, no clinical events in the last 6 months, and no other serious illnesses. A total of 422 patients without type 2 diabetes (T2DM) at the beginning of the study were included, with 106 developing T2DM after 5 years (incident-T2DM group) and 316 not developing T2DM (non-T2DM group).  Dietary data was collected using a validated 137-item semi-quantitative food-frequency questionnaire (FFQ) at baseline and 1-year follow-up. The nutrient density of the total diet was assessed using the NRF9.3 score, which is based on 12 nutrients: 9 to encourage and 3 to limit. The NRF9.3 s

In [6]:
# Operation 1: Extract the method and result corresponding to each paperid
import pickle
import json

pkl_file_path = './data_process/ori_data/full_text_dict.pkl'
with open(pkl_file_path, 'rb') as file:
    paper_dict = pickle.load(file)

# paper id -> raw model reply produced by extract_info for that paper.
paper_information = {}

for paperid, sections in paper_dict.items():
    # extract_info works from the abstract and the introduction of each paper.
    gen_result = extract_info(sections['abstract'], sections['intro'])
    print(gen_result)
    paper_information[paperid] = gen_result


# ensure_ascii=False keeps any non-ASCII characters of the replies readable in the file.
with open('paper_information.json', 'w', encoding='utf-8') as json_file:
    json.dump(paper_information, json_file, ensure_ascii=False, indent=4)

Here is the summary of the methods and results:  ``` {   "Method": "The study used a Mediterranean and a low-fat diet intervention in patients with coronary heart disease (CHD) without T2DM at baseline. Data were collected using Food Frequency Questionnaires at baseline and after 1 year of intervention.",   "Result": "Patients with greater improvement in NRF9.3 had over 50% less risk of developing T2DM compared with the lowest tertile (HR 2.10, 95%, CI = 1.12–3.56). Incident-T2DM showed less improvement in NRF9.3 than non-T2DM (p = 0.010)." } ```  Let me know if this meets your requirements!
Here is the summary:  { "Method": "The study randomly selected 120 T2DM patients, divided into T2DM and T2DM + AIS groups, and measured 12 serum biomarkers. Multivariate logistic regression and ROC curve analysis were performed to identify risk factors for AIS.", "Result": "The study found that hs-CRP and Lp-PLA2 were independent risk factors for AIS in T2DM patients, with good predictive performan

In [7]:
# Construct rag vector database

import faiss
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import pickle

# Mean Pooling - Take attention mask into account for correct averaging
def meanpooling(output, mask):
    """Average the token embeddings of ``output``, ignoring masked-out positions.

    Args:
        output: Model output whose first element holds all token embeddings
            (presumably shaped (batch, tokens, hidden); confirm against the model).
        mask: Attention mask; it is expanded to the embeddings' shape and used
            as per-token weights.

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total, with
        the divisor clamped to at least 1e-9 to avoid division by zero.
    """
    token_embeddings = output[0]
    weights = mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_sum = torch.sum(token_embeddings * weights, 1)
    weight_total = torch.clamp(weights.sum(1), min=1e-9)
    return weighted_sum / weight_total

# Use PubMedBERT model
# Embedding model, loaded from a local directory
# (the commented name is the alternative identifier "neuml/pubmedbert-base-embeddings").
model_name = "pubmedbert-base-embeddings"  # "neuml/pubmedbert-base-embeddings"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


# Per-paper section dictionary written by the first cell of this notebook.
papers_path = './data_process/ori_data/full_text_dict.pkl'
with open(papers_path, 'rb') as papers_file:
    papers = pickle.load(papers_file)

# Convert text to vector
def text_to_vector(text):
    """Embed ``text`` with the notebook-level tokenizer and model.

    The input is tokenized with padding and truncation to at most 512 tokens,
    run through the model without gradient tracking, and mean-pooled with
    ``meanpooling`` using the attention mask.

    Returns:
        The pooled embedding converted to a NumPy array.
    """
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        model_output = model(**encoded)
    pooled = meanpooling(model_output, encoded['attention_mask'])
    return pooled.numpy()

# Split long text
def split_text_for_embedding(text, max_length=512):
    """Cut ``text`` into chunks of at most ``max_length`` tokenizer tokens.

    This helper is deliberately NOT named ``split_text``: the summary cell
    earlier in this notebook defines ``split_text(text, max_tokens, model)``
    (tiktoken based) and ``summarize_text`` calls it with ``max_tokens=...``.
    Re-using the name here replaced that function, so any summary cell run
    after this one failed with a TypeError on the ``max_tokens`` keyword.

    Args:
        text: The section text to split.
        max_length: Maximum number of tokens (as produced by the notebook-level
            ``tokenizer``) per chunk.

    Returns:
        A list of strings, each rebuilt from one consecutive slice of tokens.
        Empty text yields an empty list.
    """
    tokens = tokenizer.tokenize(text)
    # NOTE(review): text_to_vector truncates at max_length=512 including any
    # special tokens the tokenizer adds, so a full 512-token chunk may lose its
    # last tokens when embedded — confirm whether a smaller chunk size is wanted.
    return [
        tokenizer.convert_tokens_to_string(tokens[start:start + max_length])
        for start in range(0, len(tokens), max_length)
    ]

# Build multiple indexes, one for each chapter
index_dict = {}         # section name -> FAISS index holding that section's chunk vectors
text_id_dict = {}       # section name -> chunk ids, in the order vectors were added
text_content_dict = {}  # section name -> chunk texts, aligned with text_id_dict
original_id_map = {}  # Used to map segment text ID to original ID

for paper_id, sections in papers.items():
    for section_name, text in sections.items():
        # 'paper_name' is metadata, not a section to index.
        if section_name == "paper_name":
            continue
        if section_name not in index_dict:
            index_dict[section_name] = faiss.IndexFlatL2(768)  # 768 is the default dimension of PubMedBERT vector
            text_id_dict[section_name] = []
            text_content_dict[section_name] = []

        text_chunks = split_text_for_embedding(text)
        for idx, chunk in enumerate(text_chunks):
            vector = text_to_vector(chunk)
            index_dict[section_name].add(vector)
            # Position in the id/content lists equals the vector's position in the index.
            chunk_id = f"{paper_id}_{section_name}_{idx}"
            text_id_dict[section_name].append(chunk_id)
            text_content_dict[section_name].append(chunk)
            original_id_map[chunk_id] = paper_id


# Persist one set of files per section. The id map written for every section is
# the complete map (all sections); the query cell merges them again on load.
for section_name in index_dict:
    faiss.write_index(index_dict[section_name], f"text_vectors_{section_name}.index")
    np.save(f"text_ids_{section_name}.npy", text_id_dict[section_name])
    with open(f"text_content_{section_name}.pkl", 'wb') as f:
        pickle.dump(text_content_dict[section_name], f)
    with open(f"original_id_map_{section_name}.pkl", 'wb') as f:
        pickle.dump(original_id_map, f)


  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Set up the query function and relevance sorting function

import faiss
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel
import pickle
from FlagEmbedding import FlagReranker

# Mean Pooling - Take attention mask into account for correct averaging
def meanpooling(output, mask):
    """Mask-aware mean of the token embeddings in ``output[0]``.

    Args:
        output: Model output; its first element contains all token embeddings.
        mask: Attention mask, broadcast to the embeddings' shape so that
            positions with mask 0 do not contribute to the average.

    Returns:
        Sum of the masked embeddings over dimension 1 divided by the summed
        mask (clamped to a minimum of 1e-9 so an all-zero mask cannot divide by zero).
    """
    token_embeddings = output[0]
    expanded_mask = mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    numerator = torch.sum(token_embeddings * expanded_mask, 1)
    denominator = torch.clamp(expanded_mask.sum(1), min=1e-9)
    return numerator / denominator


# Same embedding model as used to build the indexes, loaded from a local directory
# (the commented name is the alternative identifier "neuml/pubmedbert-base-embeddings").
model_name = "pubmedbert-base-embeddings"  # "neuml/pubmedbert-base-embeddings"
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


def text_to_vector(text):
    """Return the mean-pooled embedding of ``text`` as a NumPy array.

    Uses the notebook-level ``tokenizer`` (padding on, truncation to at most
    512 tokens) and ``model`` (run under ``torch.no_grad()``), then pools the
    token embeddings with ``meanpooling`` and the attention mask.
    """
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        model_output = model(**encoded)
    attention_mask = encoded['attention_mask']
    return meanpooling(model_output, attention_mask).numpy()

# Initialize the re-ranking model
# Cross-encoder used to re-score retrieved chunks against the query
# (loaded from a local directory; the commented name is 'BAAI/bge-reranker-base').
reranker = FlagReranker("bge-reranker-base", use_fp16=True)  # 'BAAI/bge-reranker-base'


section_names = ["abstract", "intro", "method", "result"]

# Per-section artefacts written by the index-building cell.
index_dict = {
    name: faiss.read_index(f"text_vectors_{name}.index")
    for name in section_names
}
text_id_dict = {
    name: np.load(f"text_ids_{name}.npy", allow_pickle=True)
    for name in section_names
}
text_content_dict = {}
original_id_map = {}

for section_name in section_names:
    with open(f"text_content_{section_name}.pkl", 'rb') as f:
        text_content_dict[section_name] = pickle.load(f)
    # Every per-section map file is merged into one chunk-id -> paper-id lookup.
    with open(f"original_id_map_{section_name}.pkl", 'rb') as f:
        original_id_map.update(pickle.load(f))

# Query function
def query_vector(query, k):
    """Retrieve the nearest chunks to ``query`` from every section index.

    Args:
        query: Query text; it is embedded with ``text_to_vector``.
        k: Maximum number of neighbours to retrieve per section.

    Returns:
        A dict mapping each section name in ``index_dict`` to a list of
        ``(paper_id, chunk_text, distance)`` tuples in the order returned by
        the index search. A section whose index is empty, or any call with
        ``k <= 0``, yields an empty list instead of reaching ``index.search``
        with a non-positive ``k``.
    """
    # Named differently from the function itself to avoid shadowing it locally.
    query_embedding = text_to_vector(query)
    results = {}

    for section_name, index in index_dict.items():
        # Never ask for more neighbours than the index holds.
        k_search = min(k, index.ntotal)
        if k_search <= 0:
            results[section_name] = []
            continue

        distances, indices = index.search(query_embedding, k_search)
        chunk_ids = text_id_dict[section_name]
        chunk_texts = text_content_dict[section_name]
        # Row 0 holds the neighbours of the single query vector.
        results[section_name] = [
            (original_id_map[chunk_ids[position]], chunk_texts[position], distance)
            for position, distance in zip(indices[0], distances[0])
        ]

    return results

# Re-sorting function, in addition to outputting sorting results by chapter, 
# can also merge the total sorting results and summarize the results from multipe ways
def rerank_results(query, results):
    """Re-score retrieved chunks with the reranker and rank papers overall.

    Args:
        query: The query text.
        results: Output of ``query_vector``: section name -> list of
            ``(paper_id, chunk_text, distance)`` tuples.

    Returns:
        A tuple ``(final_results, sorted_average_scores)``:
        ``final_results`` maps each section name to ``(paper_id, chunk_text,
        score)`` tuples sorted by descending relevance score, keeping only the
        best-scoring chunk of each paper; ``sorted_average_scores`` is a list
        of ``(paper_id, mean_score)`` pairs, the mean being taken over the
        sections in which the paper appears, sorted in descending order.
    """
    reranked_results = {}

    for section_name, section_results in results.items():
        # Nothing retrieved for this section: do not call the reranker on an empty batch.
        if not section_results:
            reranked_results[section_name] = []
            continue

        pairs = [[query, result[1]] for result in section_results]  # Using text snippets for relevance judgment
        scores = reranker.compute_score(pairs, normalize=True)  # Calculating relevance scores

        # A single pair may come back as a bare scalar (Python float or NumPy
        # float, depending on the reranker version); always iterate over a list.
        if np.isscalar(scores):
            scores = [scores]

        scored_results = [
            (result[0], result[1], score)
            for result, score in zip(section_results, scores)
        ]
        # Sort by relevance score in descending order
        reranked_results[section_name] = sorted(scored_results, key=lambda item: item[2], reverse=True)

    # Merge fragments of the same original document, keeping only the first
    # (i.e. highest-scoring) one per section.
    final_results = {}
    for section_name, section_results in reranked_results.items():
        seen_ids = set()
        final_section_results = []
        for result in section_results:
            original_id = result[0]
            if original_id not in seen_ids:
                final_section_results.append(result)
                seen_ids.add(original_id)
        final_results[section_name] = final_section_results

    # Calculate the average score for each paper across sections
    paper_scores = {}
    for section_results in final_results.values():
        for result in section_results:
            paper_scores.setdefault(result[0], []).append(result[2])

    average_scores = {paper_id: np.mean(scores) for paper_id, scores in paper_scores.items()}
    sorted_average_scores = sorted(average_scores.items(), key=lambda item: item[1], reverse=True)

    return final_results, sorted_average_scores


2024-07-31 00:53:53.311953: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-31 00:53:53.335761: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# query = "A systematic review and meta-analysis was conducted to assess the relationship between the common dietary antioxidants vitamin C, vitamin E, and beta-carotene, and the incidence of cardiovascular diseases."
# initial_results = query_vector(query, k=50) 
# final_results, sorted_average_scores = rerank_results(query, initial_results) # Sorting: Top-k of 4 chapters, compared with query

# for section_name, section_results in final_results.items():
#     print(f"Top results for section {section_name}:")
#     for result in section_results:
#         print(f"Original ID: {result[0]}, Text: {result[1]}, Score: {result[2]}")

# print("\nCombined results based on average scores:")
# for paper_id, avg_score in sorted_average_scores:
#     print(f"Paper ID: {paper_id}, Average Score: {avg_score}")

Top results for section abstract:
Original ID: paper_6, Text: a systematic review and meta - analysis was conducted to assess the relationship between the common dietary antioxidants vitamin c, vitamin e, and β - carotene and type 2 diabetes ( t2d ) and related traits. medline, embase, and the cochrane library were searched for relevant publications up until may 2023. studies were eligible if they had a cohort, case – control, or randomized controlled trial ( rct ) design and examined dietary intake, supplementation, or circulating levels of these antioxidants as exposure, and insulin resistance, β - cell function, or t2d incidence as outcomes. summary relative risks ( rr ) or mean differences ( md ) with 95 % confidence intervals ( ci ) were estimated using random - effects models. the certainty of the evidence was assessed with the grading of recommendations, assessment, development and evaluations framework. among 6190 screened records, 25 prospective observational studies and 15 rc

In [10]:
# Construct the subject content and save it as json
import json

# Topic taxonomy: main theme -> list of sub-theme descriptions.
# The theme-mapping cell sends every sub-theme string verbatim as a retrieval
# query, so these strings are data, not documentation: editing one changes
# which chunks are retrieved and how they are scored.
themes = {
    "Epigenetics": [
        "DNA Methylation",
        "Histone acetylation",
        "Histone Modifications",
        "RNA Interference: non-coding RNAs"
    ],
    "Genetics": [
        # NOTE(review): "TCF7L2" is listed three times in the string below and
        # "KCNQ11" may be a typo (KCNQ1?) — confirm before changing the query text.
        "Gene: TCF7L2, PPARG, KCNJ11, IRS2, HNF1A, GCK, CAPN10, TCF7L2, IRS1, IRS-2, WFS-1, HNF1B, HNF4A, TCF7L2, ABCC8, GLUT2, GCGR, FTO, MC4R, HHEX, SLC30A8, CDKN2A/B, IGF2BP2, CDKAL1, KCNQ11, NOTCH2-ADAM30...",
        "extranuclear inheritance: family history"
    ],
    # NOTE(review): "Metabolism" overlaps with "Metabolites and Metabolic
    # Pathways" further down — confirm both themes are intended.
    "Metabolism": [
        "carbohydrate metabolites (glucose and fructose, lipid metabolites such as phospholipids, sphingomyelins, and triglycerides)",
        "amino acid metabolites (branched-chain amino acids (BCAAs), aromatic amino acids, glycine, and glutamine)"
    ],
    "Organelles Stress": [
        "Endoplasmic Reticulum (ER) Stress: Stress in the ER can lead to improper protein folding, affecting insulin production and sensitivity.",
        "Mitochondrial Stress: Dysfunctional mitochondria can impair cellular energy balance and contribute to insulin resistance.",
        "Peroxisomes Stress: Oxidative stress and metabolic disturbances affecting insulin signaling."
    ],
    "Gut Microbiome": [
        "Gut Dysbiosis: Imbalance in gut microbial communities linked to metabolic diseases, including T2D. (Bacteroides fragilis, Lactobacillus fermentum, Roseburia intestinalis, L. plantarum, L. casei, and Akkermansia muciniphila)",
        "Leaky Gut: Increased intestinal permeability, leading to systemic inflammation and insulin resistance.",
        "Gut Inflammation: Inflammatory responses in the gut influencing overall metabolic health.",
        "oxidative stress"
    ],
    "Metabolites and Metabolic Pathways": [
        "Carbohydrates: Metabolites like glucose and fructose that play a role in energy balance and insulin response.",
        "Lipids: Metabolic products such as phospholipids, sphingolipids, and triglycerides associated with insulin resistance.",
        "Amino Acids: Certain amino acids, including branched-chain amino acids (BCAAs), that may impact glucose metabolism.",
        "microbial metabolites: xanthurenate, creatine, urate, xanthine, 2-hydroxyhippurate, 3-(4-hydroxyphenyl)lactate, and 2-hydroxybutyrate"
    ],
    "Environmental Pollutants": [
        "Persistent Organic Pollutants (POPs): Chemicals like DDT, TCDD, and PCBs affecting endocrine function.",
        "Air Pollutants: Particulates such as PM2.5 and PM10 linked to increased T2D risk.",
        "Endocrine Disrupting Chemicals (EDCs): Compounds like phthalates and BPA influencing hormone regulation.",
        "Heavy Metals: Elements like cadmium (Cd), lead (Pb), arsenic (As), and mercury (Hg) that can impair metabolic health.",
        "residential noise levels",
        "neighborhood walkability",
        "greenery",
        "area-level socio-economic deprivation"
    ],
    "Diet and Lifestyle": [
        "Refined Carbohydrates: High intake of refined sugars and carbs leading to spikes in blood glucose.",
        "Saturated Fat: Diets high in saturated fats contributing to insulin resistance.",
        "Processed Foods: Consumption of highly processed foods linked to poor metabolic health.",
        "Smoking: Tobacco use increasing the risk of insulin resistance and T2D.",
        "Physical Inactivity: Lack of exercise contributing to obesity and impaired glucose metabolism.",
        "Sleep duration",
        "Antioxidants",
        "micronutrients (Vitamin B,C,D,E,... and minerals)",
        "overweight, obesity, high fat level, large waist circumference",
        "mental health: stress, depression, anxious"
    ]
}

# Persist the taxonomy so the theme-mapping cell can reload it from disk.
# ensure_ascii=False writes non-ASCII characters as-is (UTF-8) instead of \u escapes.
with open('paper_themes.json', 'w', encoding='utf-8') as f:
    json.dump(themes, f, ensure_ascii=False, indent=4)


In [11]:
import json

# The file is written as UTF-8 by the previous cell, so read it back the same
# way instead of relying on the platform's default encoding.
with open("paper_themes.json", "r", encoding="utf-8") as f:
    themes = json.load(f)

# Build a topic query and get related papers
# paper id -> {'main_theme', 'sub_theme', 'score'} of the best-scoring sub-theme
# for which that paper was the top-ranked result. Papers that are never ranked
# first for any sub-theme do not appear in the map.
paper_theme_map = {}

for theme, subthemes in themes.items():
    for subtheme in subthemes:
        results = query_vector(subtheme, k=50)
        _, sorted_scores = rerank_results(subtheme, results)

        # No chunk was retrieved for this sub-theme (e.g. empty indexes): skip it
        # rather than failing on sorted_scores[0].
        if not sorted_scores:
            continue

        # Get the top1 paper ID and corresponding score
        top_paper_id, top_score = sorted_scores[0]
        # Plain float keeps the stored value JSON-serializable whatever numeric
        # type the reranker/NumPy returned.
        top_score = float(top_score)

        # Keep the sub-theme only if the paper has none yet or the new score is higher.
        current = paper_theme_map.get(top_paper_id)
        if current is None or top_score > current['score']:
            paper_theme_map[top_paper_id] = {'main_theme': theme, 'sub_theme': subtheme, 'score': top_score}


with open("paper_theme_map.json", "w", encoding="utf-8") as f:
    json.dump(paper_theme_map, f, indent=4)

print(json.dumps(paper_theme_map, indent=4))


{
    "paper_4": {
        "main_theme": "Environmental Pollutants",
        "sub_theme": "Air Pollutants: Particulates such as PM2.5 and PM10 linked to increased T2D risk.",
        "score": 0.3575504105134501
    },
    "paper_2": {
        "main_theme": "Epigenetics",
        "sub_theme": "Histone acetylation",
        "score": 0.0006556566748274668
    },
    "paper_3": {
        "main_theme": "Diet and Lifestyle",
        "sub_theme": "overweight, obesity, high fat level, large waist circumference",
        "score": 0.21734743806362997
    },
    "paper_6": {
        "main_theme": "Diet and Lifestyle",
        "sub_theme": "Smoking: Tobacco use increasing the risk of insulin resistance and T2D.",
        "score": 0.6089117101984429
    },
    "paper_5": {
        "main_theme": "Diet and Lifestyle",
        "sub_theme": "Physical Inactivity: Lack of exercise contributing to obesity and impaired glucose metabolism.",
        "score": 0.17593499173509752
    },
    "paper_1": {
     

In [None]:
# Complete the topic construction and merge the final chapter content
## Merge different chapters of the paper and save the content of each chapter
import pickle
import tiktoken
import json
from openai import OpenAI

pkl_file_path = './data_process/ori_data/full_text_dict.pkl'
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this project.
with open(pkl_file_path, 'rb') as file:
    paper_dict = pickle.load(file)

# Build the per-paper section dictionary. Over-long sections are handled
# further down by summarising in several rounds until they fit the token budget.
paper_info = {}
for key, value in paper_dict.items():  # key is the paper id, value holds that paper's section texts
    paper_info[key] = {
        "abstract": value['abstract'],
        "intro": value['intro'],
        "methods": value['method'],
        "results": value['result']
    }

abs_dict = {}
section_order = ["abstract", "intro", "methods", "results"]
section_summaries = {section: [] for section in section_order}

# Token budget a section summary must fit in, and the maximum number of
# summarisation rounds spent trying to reach it (keeps the loop bounded).
MAX_SUMMARY_TOKENS = 8000
MAX_SUMMARY_ROUNDS = 5

# The tokenizer does not depend on the paper or section, so build it once.
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')

for paper_id, sections in paper_info.items():
    for section, text in sections.items():
        for _ in range(MAX_SUMMARY_ROUNDS):
            summaries = summarize_text(text, client, paper_id)
            text = '. '.join(summaries)

            numtokens = len(encoding.encode(text))
            if numtokens < MAX_SUMMARY_TOKENS:
                break
        else:
            # Never got under the budget: continue with the last round instead of looping forever.
            print(f"Warning: {paper_id}/{section} still has {numtokens} tokens "
                  f"after {MAX_SUMMARY_ROUNDS} summarisation rounds")

        final_summary = combine_summaries(summaries)

        # Summarise each section once more so that it reads smoothly.
        final_summary = gen_sum(final_summary, client, section)
        # Store the paper id next to the summary: the merge step reads
        # entry['paper_id'] and entry['summary'] from this file.
        section_summaries[section].append({"paper_id": paper_id, "summary": final_summary})

# Save the merged results of the chapters, that is, four chapters, each chapter is a list, saved in the order of paperid
with open('section_summaries.json', 'w', encoding='utf-8') as json_file:
    json.dump(section_summaries, json_file, ensure_ascii=False, indent=4)




In [14]:
# Complete the topic construction and merge the final chapter content
## Merge different chapters of the paper and save the content of each chapter

import pickle
import tiktoken
from openai import OpenAI
import json

pkl_file_path = './data_process/ori_data/full_text_dict.pkl'
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this project.
with open(pkl_file_path, 'rb') as file:
    paper_dict = pickle.load(file)


# Re-key the section texts of every paper under the names used for the survey sections.
paper_info = {}
for key, value in paper_dict.items():  # key is the paper id
    paper_info[key] = {
        "abstract": value['abstract'],
        "intro": value['intro'],
        "methods": value['method'],
        "results": value['result']
    }

abs_dict = {}
section_order = ["abstract", "intro", "methods", "results"]
section_summaries = {section: [] for section in section_order}

# Token budget a section summary must fit in, and the maximum number of
# summarisation rounds spent trying to reach it (keeps the loop bounded).
MAX_SUMMARY_TOKENS = 8000
MAX_SUMMARY_ROUNDS = 5

# The tokenizer does not depend on the paper or section, so build it once.
encoding = tiktoken.encoding_for_model('gpt-3.5-turbo')

for paper_id, sections in paper_info.items():
    for section, text in sections.items():
        for _ in range(MAX_SUMMARY_ROUNDS):
            summaries = summarize_text(text, client, paper_id)
            text = '. '.join(summaries)

            numtokens = len(encoding.encode(text))
            if numtokens < MAX_SUMMARY_TOKENS:
                break
        else:
            # Never got under the budget: continue with the last round instead of looping forever.
            print(f"Warning: {paper_id}/{section} still has {numtokens} tokens "
                  f"after {MAX_SUMMARY_ROUNDS} summarisation rounds")

        final_summary = combine_summaries(summaries)

        # Summarise each section once more so that it reads smoothly.
        final_summary = gen_sum(final_summary, client, section)
        section_summaries[section].append({"paper_id": paper_id, "summary": final_summary})

# Save the merged results of the chapters, that is, four chapters, each chapter is a list, saved in the order of paperid
# Save section_summaries as a json file (utf-8, matching how the file is read back later)
with open('section_summaries.json', 'w', encoding='utf-8') as json_file:
    json.dump(section_summaries, json_file, indent=4)


print(json.dumps(section_summaries, indent=4))

{
    "abstract": [
        {
            "paper_id": "paper_1",
            "summary": "Here is a summary of the provided text passage, broken down into the main sections of the paper:  **Abstract:** This study examines the relationship between changes in Nutrient-Rich Food Index 9.3 (NRF9.3) and the risk of developing type 2 diabetes mellitus (T2DM) in patients with coronary heart disease (CHD). The findings suggest that improved diet quality, as measured by NRF9.3, is associated with a lower risk of T2DM.  **Introduction:** The increasing incidence of T2DM in Western countries highlights the need for nutritional interventions promoting high-quality dietary patterns. This study aims to investigate whether changes in NRF9.3 are related to the risk of developing T2DM in patients with CHD.  **Methods:** The study was conducted within the context of two healthy dietary interventions (Mediterranean and low-fat diets) using data from the CORDIOPREV study. Patients without T2DM at baseline 

In [41]:
import json

# Read the per-section summaries of every paper
with open('section_summaries.json', 'r', encoding='utf-8') as json_file:
    section_summaries = json.load(json_file)

# Read the theme assigned to each paper
with open('paper_theme_map.json', 'r', encoding='utf-8') as json_file:
    paper_theme_map = json.load(json_file)

# Define chapter order
section_order = ['abstract', 'intro', 'methods', 'results']

# Merge the contents of each chapter and generate a final review
final_combined_summaries = {}
for section in section_order:
    section_parts = []
    for entry in section_summaries[section]:
        summary = entry['summary']
        paper_id = entry['paper_id']
        # The theme map only contains papers that ranked first for some
        # sub-theme, so a paper may be missing; fall back to 'N/A' instead of
        # failing with KeyError.
        theme = paper_theme_map.get(paper_id, {})
        section_parts.append(
            f"Main Theme: {theme.get('main_theme', 'N/A')}\n"
            f"Sub Theme: {theme.get('sub_theme', 'N/A')}\n"
            f"{summary}\n"
        )
    combined_text = ''.join(section_parts)
    final_combined_summaries[section] = gen_sum(combined_text, client, section)  # Generate a summary of each chapter's contents


# Merge the contents of all chapters to generate the first review
survey_parts = []
for key, item in final_combined_summaries.items():
    # if key == 'results':continue # Results are not concatenated and are used directly for subsequent processing
    survey_parts.append(key + '\n' + item + '\n')
overall_survey = ''.join(survey_parts)

print(overall_survey)


abstract
Here is a summary of the provided text passages, broken down into the main sections of the paper:  **Diet and Lifestyle: Saturated Fat**  This study examines the relationship between changes in Nutrient-Rich Food Index 9.3 (NRF9.3) and the risk of developing type 2 diabetes mellitus (T2DM) in patients with coronary heart disease (CHD). The findings suggest that improved diet quality, as measured by NRF9.3, is associated with a lower risk of T2DM. The study was conducted within the context of two healthy dietary interventions (Mediterranean and low-fat diets) using data from the CORDIOPREV study. After 5 years of follow-up, patients with greater improvement in NRF9.3 had over 50% less risk of developing T2DM compared to the lowest tertile.  **Epigenetics: Histone Acetylation**  This study investigated the risk factors for acute ischemic stroke (AIS) in patients with type 2 diabetes mellitus (T2DM). The results showed that levels of fasting blood glucose, homocysteine, high-sens

In [46]:
def refine_overall_survey(overall_survey, client, model="meta/llama3-70b-instruct"):
    """Ask the chat model to polish the assembled survey and return its reply.

    Args:
        overall_survey: Survey text in which each section name is followed by
            that section's content.
        client: Object exposing ``chat.completions.create(...)``; the call is
            made with ``stream=True`` and the result is iterated chunk by chunk.
        model: Name of the chat model to query.

    Returns:
        The streamed reply joined into a single string, with every newline in
        the reply replaced by a space.
    """
    # Prompt sent to the model; it embeds the survey and spells out the
    # expected output layout.
    prompt = f"""
Refine the following research survey, ensuring that the text is coherent and flows smoothly. 
Maintain the given structure where each section starts with the section name followed by the section content. 
Here is the survey:\n\n
{overall_survey}"

The output format must be following:

abstract\n
{"The summary of abstract"}\n
introduction\n
{"The summary of introduction"}\n
method\n
{"The summary of method"}\n
Results\n
{"The summary of Results"}\n

Remember:
1. Output the final text without anthing other words
2. Try not to change the word count of the original survey
"""
    completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            top_p=1,
            max_tokens=4800,  # Allow some buffer for response tokens
            stream=True
        )

    pieces = []
    for chunk in completion:
        # A streamed chunk may arrive with an empty ``choices`` list (for
        # example a usage-only chunk); skip it rather than index into it.
        if not chunk.choices:
            continue
        content = chunk.choices[0].delta.content
        if content is not None:
            pieces.append(content.replace('\n', ' '))

    return ''.join(pieces)

In [48]:
# Polish the merged survey text; no conclusion section is part of it yet.
refined_survey = refine_overall_survey(overall_survey=overall_survey, client=client)
print(refined_survey)

Here is the refined research survey:   abstract This research survey examines the relationship between various factors and the risk of developing type 2 diabetes mellitus (T2DM), including the association between improved diet quality and T2DM risk in patients with coronary heart disease, risk factors for acute ischemic stroke in patients with T2DM, the link between sugar-sweetened beverages intake and metabolic syndrome, and the emerging evidence on youth-onset type 2 diabetes.  introduction Type 2 Diabetes Mellitus (T2DM) is a chronic disease characterized by abnormal carbohydrate metabolism, with diets high in saturated fats contributing to insulin resistance, while high-quality dietary patterns can prevent T2DM incidence. Non-communicable diseases pose a significant burden on global health systems and economies, particularly in low- and middle-income countries, with childhood obesity and type 2 diabetes being significant global health concerns.  method The study employed a randomiz

In [57]:
# Generate a conclusion. 
# Combine the topics of each article, the methods and results of each article, 
# and the existing survey to generate the content of the conclusion chapter.

import json

def _read_json(path):
    """Return the parsed content of the UTF-8 encoded JSON file at ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Inputs of the conclusion step, all read from JSON files in the working directory.
methods_results = _read_json('paper_information.json')
paper_theme_map = _read_json('paper_theme_map.json')
section_summaries = _read_json('section_summaries.json')

# Generate conclusion prompt for each paper
def generate_conclusion_prompt(paper_id, methods_results, paper_theme_map, section_summaries_intro):
    """Build the prompt asking the model to conclude a single paper.

    Args:
        paper_id: Identifier used to look the paper up in the other arguments.
        methods_results: Methods/results information, inserted into the prompt as-is.
        paper_theme_map: Mapping of paper id to a dict with 'main_theme' and
            'sub_theme'; missing papers or keys are rendered as 'N/A'.
        section_summaries_intro: Iterable of dicts with 'paper_id' and 'summary';
            the first entry matching ``paper_id`` supplies the introduction.

    Returns:
        The prompt text. The introduction part is empty when no entry matches.
    """
    theme_entry = paper_theme_map.get(paper_id, {})

    # First matching introduction summary wins; default to an empty string.
    intro_text = ''
    for entry in section_summaries_intro:
        if entry['paper_id'] == paper_id:
            intro_text = entry['summary']
            break

    main_theme = theme_entry.get('main_theme', 'N/A')
    sub_theme = theme_entry.get('sub_theme', 'N/A')

    return f"""
    Main Theme: {main_theme}
    Sub Theme: {sub_theme}

    Introduction:
    {intro_text}

    Methods & Results:
    {methods_results}

    Please provide a conclusion summarizing the above information.
    The output words must be less than 100 words.

    """


# Clean the generated conclusion content
def clean_conclusion(conclusion):
    """Strip boilerplate the model puts in front of a generated conclusion.

    Two kinds of lead-in are removed, and only at the very start of the text:
    a short label or preamble ending in a colon ("Here is a conclusion ...:",
    "Conclusion:"), and an opening "In conclusion" with or without a trailing
    comma or colon. Colons later in the text are left alone, because cutting
    at the first colon anywhere would discard real content such as
    "the risk factors were: ...".

    Args:
        conclusion: Raw conclusion text returned by the model.

    Returns:
        The text without the leading boilerplate and without surrounding
        whitespace.
    """
    import re  # local import so the function does not rely on the cell's import list

    text = conclusion.strip()
    # The preamble must sit inside the first sentence: no '.' or ':' before its closing colon.
    text = re.sub(r"^(?:here\s+is|here's|conclusion)\b[^:.]*:\s*", '', text, flags=re.IGNORECASE)
    text = re.sub(r'^in\s+conclusion\b[,:]?\s*', '', text, flags=re.IGNORECASE)
    return text


# Prepare to generate conclusion content
section_summaries_intro = section_summaries.get('intro', [])

# One model-written conclusion per paper, in the order of methods_results.
conclusions = []
for paper_id, methods_info in methods_results.items():
    prompt = generate_conclusion_prompt(paper_id, methods_info, paper_theme_map, section_summaries_intro)

    completion = client.chat.completions.create(
            model="meta/llama3-70b-instruct",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.5,
            top_p=1,
            max_tokens=200,  # Allow some buffer for response tokens
            stream=True
        )

    conclusion_parts = []
    for chunk in completion:
        # A streamed chunk may arrive with an empty ``choices`` list; skip it
        # rather than index into it.
        if not chunk.choices:
            continue
        content = chunk.choices[0].delta.content
        if content is not None:
            conclusion_parts.append(content.replace('\n', ' '))

    conclusions.append(''.join(conclusion_parts))

# Concatenate the results to generate the final conclusion section.
# A space separates the papers so that one conclusion's last sentence does not
# run straight into the next one's first word.
final_conclusion = ' '.join(clean_conclusion(conclusion) for conclusion in conclusions)
print(final_conclusion)

Here is a conclusion summarizing the study:  In conclusion, this study demonstrates the importance of diet quality in preventing Type 2 Diabetes Mellitus (T2DM) in patients with coronary heart disease. The Nutrient-Rich Food Index 9.3 (NRF9.3) was found to be a reliable tool in measuring diet quality and predicting T2DM incidence. Patients who showed greater improvement in NRF9.3 had a significantly lower risk of developing T2DM. These findings highlight the need for dietary interventions that focus on nutrient-dense foods to prevent T2DM and support the use of NRF9.3 as a valuable tool in assessing diet quality.
Here is a conclusion summarizing the study:  In conclusion, this study investigated the risk factors associated with inflammation for acute ischemic stroke (AIS) in patients with type 2 diabetes mellitus (T2DM). The results showed that high-sensitivity C-reactive protein (hs-CRP) and lipoprotein-associated phospholipase A2 (Lp-PLA2) are independent risk factors for AIS in T2DM

In [63]:
# Clean the generated conclusion content
def clean_conclusion(conclusion):
    """Strip boilerplate the model puts in front of a generated conclusion.

    Two kinds of lead-in are removed, and only at the very start of the text:
    a short label or preamble ending in a colon ("Here is a conclusion ...:",
    "Conclusion:"), and an opening "In conclusion" with or without a trailing
    comma or colon. Colons later in the text are left alone, because cutting
    at the first colon anywhere would discard real content such as
    "the risk factors were: ...".

    Args:
        conclusion: Raw conclusion text returned by the model.

    Returns:
        The text without the leading boilerplate and without surrounding
        whitespace.
    """
    import re  # local import so the function does not rely on the cell's import list

    text = conclusion.strip()
    # The preamble must sit inside the first sentence: no '.' or ':' before its closing colon.
    text = re.sub(r"^(?:here\s+is|here's|conclusion)\b[^:.]*:\s*", '', text, flags=re.IGNORECASE)
    text = re.sub(r'^in\s+conclusion\b[,:]?\s*', '', text, flags=re.IGNORECASE)
    return text


# Join the cleaned per-paper conclusions with a space so that the end of one
# conclusion does not run straight into the first word of the next.
final_conclusion = ' '.join(clean_conclusion(conclusion) for conclusion in conclusions)
print(final_conclusion)

this study demonstrates the importance of diet quality in preventing Type 2 Diabetes Mellitus (T2DM) in patients with coronary heart disease. The Nutrient-Rich Food Index 9.3 (NRF9.3) was found to be a reliable tool in measuring diet quality and predicting T2DM incidence. Patients who showed greater improvement in NRF9.3 had a significantly lower risk of developing T2DM. These findings highlight the need for dietary interventions that focus on nutrient-dense foods to prevent T2DM and support the use of NRF9.3 as a valuable tool in assessing diet quality.this study investigated the risk factors associated with inflammation for acute ischemic stroke (AIS) in patients with type 2 diabetes mellitus (T2DM). The results showed that high-sensitivity C-reactive protein (hs-CRP) and lipoprotein-associated phospholipase A2 (Lp-PLA2) are independent risk factors for AIS in T2DM patients, with good predictive performance. These findings provide a basis for the prevention and treatment of stroke in

In [65]:
# Final survey: the refined text, a 'conclusion' heading line, then the conclusion text.
output_survey = '\n'.join([refined_survey, 'conclusion', final_conclusion])
print(output_survey)


Here is the refined research survey:   abstract This research survey examines the relationship between various factors and the risk of developing type 2 diabetes mellitus (T2DM), including the association between improved diet quality and T2DM risk in patients with coronary heart disease, risk factors for acute ischemic stroke in patients with T2DM, the link between sugar-sweetened beverages intake and metabolic syndrome, and the emerging evidence on youth-onset type 2 diabetes.  introduction Type 2 Diabetes Mellitus (T2DM) is a chronic disease characterized by abnormal carbohydrate metabolism, with diets high in saturated fats contributing to insulin resistance, while high-quality dietary patterns can prevent T2DM incidence. Non-communicable diseases pose a significant burden on global health systems and economies, particularly in low- and middle-income countries, with childhood obesity and type 2 diabetes being significant global health concerns.  method The study employed a randomiz