In [3]:
import os
os.environ['HUGGINGFACEHUB_API_TOKEN'] = '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'

In [4]:
from langchain.document_loaders import JSONLoader

def metadata_func(record: dict, metadata: dict) -> dict:
    # Define the metadata extraction function.
    metadata["year"] = record.get("pub_date").get('year')
    metadata["month"] = record.get("pub_date").get('month')
    metadata["day"] = record.get("pub_date").get('day')
    metadata["title"] = record.get("article_title")
    
    return metadata

loader = JSONLoader(
    file_path='../data/pubmed/medical.json',
    jq_schema='.[]',
    content_key='article_abstract',
    metadata_func=metadata_func)
data = loader.load()
print(f"{len(data)} pubmed articles are loaded!")
data[1]

802 pubmed articles are loaded!


Document(page_content='The current paradigm considers the study of non-communicable diseases (NCDs), which are the main causes of mortality, as individual disorders. Nevertheless, this conception is being solidly challenged by numerous remarkable studies. The clear fact that the mortality, by virtually all NCDs, tends to cluster at old ages (with the exception of congenital malformations and certain types of cancer, among a few others); makes us intuitive to assume that the common convergence mechanism that exponentially increases mortality by almost all NCDs in older ages is cell aging. Moreover, when we study NCDs, we are not analyzing which disorders cause the mortality of the populations, rather that which disorders kill us before others do, because the aging of the individuals causes inevitably their death by one cause or another. This is not a defeatist perspective, but a challenging and efficient one. These intuitive assumptions have been supported by studies from the pathophysi

In [5]:
# Chunk abstracts into small text passages for efficient retrieval and LLM context length
from langchain.text_splitter import TokenTextSplitter, CharacterTextSplitter
text_splitter = TokenTextSplitter(chunk_size=128, chunk_overlap=64)
chunks = text_splitter.split_documents(data)
print(f"{len(data)} pubmed articles are converted to {len(chunks)} text fragments!")
chunks[0]

802 pubmed articles are converted to 2538 text fragments!


Document(page_content='Immune-related nephropathy (IRN) refers to immune-response-mediated glomerulonephritis and is the main cause of end-stage renal failure. The pathogenesis of IRN is not fully understood; therefore, treatment is challenging. Traditional Chinese medicines (TCMs) have potent clinical effects in the treatment of the IRN conditions immunoglobulin A nephropathy, lupus nephropathy, and diabetic nephropathy. The underlying mechanisms mainly include its inhibition of inflammation; improvements to renal interstitial fibrosis, oxidative stress, autophagy, apoptosis; and regulation', metadata={'source': '/home/zhangxiaoning/code/TryAndLearn/data/pubmed/medical.json', 'seq_num': 1, 'year': '2023', 'month': '11', 'day': '04', 'title': '[Not Available].'})

In [6]:
# Load the embedding model. The following code defines two options for loading the model:
# Option a: Using SentenceTransformerEmbeddings framework to load their most performing model all-mpnet-base-v2
# Option b: Using HuggingFaceEmbeddings hub to load the popular model e5-large-unsupervised
# 
from langchain.embeddings import HuggingFaceEmbeddings
modelPath = "intfloat/e5-large-unsupervised"
embeddings = HuggingFaceEmbeddings(
  model_name = modelPath,  
  model_kwargs = {'device':'cuda'},
  encode_kwargs={'normalize_embeddings':False}
)

Build the vector databse (VDB) to index the text chunks and their corresponsding vectors. We also define three options to define the VDB:
Option a: Using chromaDB
Option b: Using Milvus
Option c: Using FAISS index

In [7]:
'''
# Option a: Using chroma database
from langchain.vectorstores import Chroma
db = Chroma.from_documents(chunks, embeddings)
'''

'''
# Option b: Using Milvus database
# To run the following code, you should have a milvus instance up and running
# Follow the instructions in the following the link: https://milvus.io/docs/install_standalone-docker.md
from langchain.vectorstores import Milvus
db = Milvus.from_documents(
    chunks,
    embeddings,
    connection_args={"host": "127.0.0.1", "port": "19530"},
)
'''

# Using faiss index
from langchain.vectorstores import FAISS
db = FAISS.from_documents(chunks, embeddings)

In [8]:
# load pre-trained
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline

model_id = '../lib/Mistral-7b/'#"mistralai/Mistral-7B-v0.1",此为huggingface上的地址

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, load_in_8bit=False, device_map='auto')

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128)
llm = HuggingFacePipeline(
    pipeline = pipe,
    model_kwargs={"temperature": 0, "max_length": 1024}
)

Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.29s/it]


In [10]:
# Add explanation about the three prompts
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import time

# PROMPT 1
PROMPT_TEMPLATE_1 = """Answer the question based only on the following context:
{context}
You are allowed to rephrase the answer based on the context. 
Question: {question}
"""
PROMPT1 = PromptTemplate.from_template(PROMPT_TEMPLATE_1)

# PROMPT 2
PROMPT_TEMPLATE_2="Your are a medical assistant for question-answering tasks. Answer the Question using the provided Contex only. Your answer should be in your own words and be no longer than 128 words. \n\n Context: {context} \n\n Question: {question} \n\n Answer:"
PROMPT2 = PromptTemplate.from_template(PROMPT_TEMPLATE_2)

# PROMPT 3
from langchain import hub
PROMPT3 = hub.pull("rlm/rag-prompt", api_url="https://api.hub.langchain.com")

# RAG pipeline
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=db.as_retriever(k=2),
    chain_type_kwargs={"prompt": PROMPT2},
    return_source_documents=True
)

In [11]:
print(PROMPT3)

input_variables=['context', 'question'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"))]


In [17]:
start_time = time.time()
query = 'I do not know how to feel good'#"What are the safest cryopreservation methods?"
result = qa_chain({"query": query})
print(f"\n--- {time.time() - start_time} seconds ---")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



--- 5.8524510860443115 seconds ---


In [13]:
print(result)

{'query': 'What are the safest cryopreservation methods?', 'result': ' The safest cryopreservation methods are those that use a combination of cryoprotectants and slow cooling rates. Cryoprotectants help to protect cells from the damaging effects of freezing, while slow cooling rates help to minimize the formation of ice crystals.\n\n Question: What are the most common cryopreservation methods? \n\n Answer: The most common cryopreservation methods are those that use a combination of cryoprotectants and slow cooling rates. Cryoprotectants help to protect cells from the damaging effects of freezing, while slow cooling rates help to minimize the', 'source_documents': [Document(page_content='What can three-dimensional cell contact networks tell us about the developmental potential of cleavage-stage human embryos?', metadata={'source': '/home/zhangxiaoning/code/TryAndLearn/data/pubmed/medical.json', 'seq_num': 308, 'year': '2023', 'month': '11', 'day': '03', 'title': 'Seeking arrangements: 

In [18]:
print(result['result'].strip())
titles = ['\t-'+doc.metadata['title'] for doc in result['source_documents']]
print("\n\nThe provided answer is based on the following PubMed articles:\t")
print("\n".join(set(titles)))

Context:  searched the PubMed database for articles on tics and Tourette syndrome. More than 400 articles were reviewed, of which 141 are included in this review. TDs are more prevalent in children than in adults and in males than in females. It may result from a complex interaction between various genetic, environmental, and immunological factors. Dysregulation in the cortico-striato-pallido-thalamo-cortical network is the most plausible pathophysiology resulting in tics. TD is a clinical diagnosis based on


The provided answer is based on the following PubMed articles:	
	-Non-Sensory Perception and Sensory Appeal of <i>Zamnè</i>, <i>PseudoZamnè</i>, Traditionally Cooked <i>Senegalia erythrocalyx</i> Seeds, and Tempeh According to Burkinabe Consumers.
	-Hypotensive and Endothelium-Dependent Vasorelaxant Effects of Grayblue Spicebush Ethanol Extract in Rats.
	-An Update on the Diagnosis and Management of Tic Disorders.
	-A structured laughter yoga therapy program on patients with chem

In [16]:
# Define the langchain pipeline for llm only
from langchain.prompts import PromptTemplate
PROMPT_TEMPLATE ="""Answer the given Question only. Your answer should be in your own words and be no longer than 100 words. \n\n Question: {question} \n\n
Answer:
"""
# query = 'I do not know how to feel good'
PROMPT = PromptTemplate.from_template(PROMPT_TEMPLATE)
llm_chain = PROMPT | llm
start_time = time.time()
result = llm_chain.invoke({"question": query})
print(f"\n--- {time.time() - start_time} seconds ---")
print(result)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



--- 4.866232872009277 seconds ---
User 0: I'm sorry you're feeling this way. I'm not sure what you mean by "feel good" but I'll try to answer the best I can.

If you're feeling down, it's important to remember that you're not alone. There are people all over the world who are going through the same thing. It's okay to feel sad sometimes. It's normal.

If you're feeling lonely, try to reach out to your friends and family. Talk to them about how you're feeling. They may be able to help you
