# PubMed Research Assistant Chatbot Using LLM

## Initial Settings and Credentials 

In [None]:
%%bash

pip install haystack-ai
pip install pymed
pip install huggingface_hub

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook
# (presumably required to access the hosted models used below — confirm).
notebook_login()

## PubMed Fetcher

### Run `export NCBI_API_KEY="your_actual_api_key"` (with your actual API key) in the terminal before launching the notebook

### max_results can be adjusted to specify how many articles you want to retrieve for each query

In [None]:
from pymed import PubMed
from typing import List
from haystack import component
from haystack import Document
import os

# Read the NCBI API key from the environment. os.getenv returns None when the
# variable is not set; in that case requests are simply sent without a key.
api_key = os.getenv("NCBI_API_KEY")

pubmed = PubMed(tool="PubMed_ChatBot", email="hxb294@case.edu")
pubmed.api_key = api_key  # kept so any code reading this attribute still works
if api_key:
  # pymed builds the query string of every request from `pubmed.parameters`;
  # a bare `api_key` attribute is never read by the library, so the key has to
  # be registered there to actually reach the NCBI E-utilities endpoints.
  pubmed.parameters["api_key"] = api_key

# CHANGE: maximum number of articles retrieved for each query
max_results = 1

def documentize(article):
  """Convert a pymed article into a Haystack Document.

  The article's abstract becomes the document content; its title and keywords
  are stored in the document's meta under 'title' and 'keywords'.
  """
  # pymed book articles expose no `keywords` attribute; fall back to an empty
  # list so one book result does not raise AttributeError and abort the fetch.
  keywords = getattr(article, 'keywords', [])
  return Document(content=article.abstract, meta={'title': article.title, 'keywords': keywords})

@component
class PubMedFetcher():
  """Haystack component that looks up PubMed articles for LLM-generated keywords."""

  @component.output_types(articles=List[Document])
  def run(self, queries: list[str]):
    """Fetch articles for every search term found in the first entry of `queries`.

    Args:
      queries: Text replies; only the first entry is used, and it is split
        into one search term per line.

    Returns:
      A dict with a single key 'articles' holding the list of Documents
      collected for all search terms (empty when nothing could be fetched).
    """
    articles = []
    if not queries:
      # Nothing to search for; avoid an IndexError on queries[0].
      return {'articles': articles}

    # One search term per line; drop surrounding whitespace and blank lines so
    # empty strings are never sent to PubMed.
    cleaned_queries = [line.strip() for line in queries[0].splitlines() if line.strip()]

    for query in cleaned_queries:
      # Best-effort per query: a failure for one term is reported and the
      # remaining terms are still tried.
      try:
        response = pubmed.query(query, max_results)
        documents = [documentize(article) for article in response]
      except Exception as e:
        print(e)
        print(f"Couldn't fetch articles for query: {query}")
      else:
        articles.extend(documents)

    return {'articles': articles}

## LLM Setup

In [None]:
from haystack.components.generators import HuggingFaceTGIGenerator

# CHANGE: model behind `keyword_llm`, which receives the keyword prompt
keyword_model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
keyword_llm = HuggingFaceTGIGenerator(model=keyword_model_name)
keyword_llm.warm_up()

# CHANGE: model behind `llm`, which receives the final answer prompt
answer_model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
llm = HuggingFaceTGIGenerator(model=answer_model_name)
llm.warm_up()

## Templates

### Change num_keywords value to set the number of keywords dynamically

In [None]:
# CHANGE: number of keywords the LLM is asked to produce
num_keywords = 1

# This is an f-string, so `num_keywords` is baked into the text right here.
# Braces meant for Jinja therefore have to be doubled twice:
# `{{{{ question }}}}` renders as `{{ question }}`, the placeholder that is
# filled in later. Two braces alone would collapse to the literal text
# `{ question }` and the question would never be substituted.
keyword_prompt_template = f"""
Your task is to convert the following question into {num_keywords} keywords that can be used to find relevant medical research papers on PubMed.
Here is an example:
question: "What are the latest treatments for major depressive disorder?"
keywords:
Antidepressive Agents
Depressive Disorder, Major
Treatment-Resistant depression
---
question: {{{{ question }}}}
keywords:
"""

# Jinja template for the final answer prompt. `question` is the user's
# question; `articles` is iterated, and for each item its content plus the
# 'keywords' and 'title' entries of its meta are written into the prompt.
prompt_template = """
Answer the question truthfully based on the given documents.
If the documents don't contain an answer, use your existing knowledge base.
q: {{ question }}
Articles:
{% for article in articles %}
  {{article.content}}
  keywords: {{article.meta['keywords']}}
  title: {{article.meta['title']}}
{% endfor %}
"""

## RAG Pipeline

In [None]:
from haystack import Pipeline
from haystack.components.builders.prompt_builder import PromptBuilder

# Build the two prompt builders and the PubMed fetcher used by the pipeline.
keyword_prompt_builder = PromptBuilder(template=keyword_prompt_template)
prompt_builder = PromptBuilder(template=prompt_template)

fetcher = PubMedFetcher()

pipe = Pipeline()

# Register every component under the name the connections below refer to.
for _name, _node in (
  ("keyword_prompt_builder", keyword_prompt_builder),
  ("keyword_llm", keyword_llm),
  ("pubmed_fetcher", fetcher),
  ("prompt_builder", prompt_builder),
  ("llm", llm),
):
  pipe.add_component(_name, _node)

# Data flow: keyword prompt -> keyword LLM -> PubMed fetcher -> answer prompt -> answer LLM.
for _sender, _receiver in (
  ("keyword_prompt_builder.prompt", "keyword_llm.prompt"),
  ("keyword_llm.replies", "pubmed_fetcher.queries"),
  ("pubmed_fetcher.articles", "prompt_builder.articles"),
  ("prompt_builder.prompt", "llm.prompt"),
):
  pipe.connect(_sender, _receiver)

## Seek and

In [None]:
# CHANGE: value passed to the `llm` component as the `max_new_tokens` generation kwarg
max_new_tokens = 100

def ask(question):
  """Run the pipeline for `question`, then print the question and the first reply of the `llm` component."""
  # The question feeds both prompt builders; the token limit is read from the
  # module-level `max_new_tokens` at call time.
  pipeline_inputs = {
    "keyword_prompt_builder": {"question": question},
    "prompt_builder": {"question": question},
    "llm": {"generation_kwargs": {"max_new_tokens": max_new_tokens}},
  }
  result = pipe.run(data=pipeline_inputs)
  print(question)
  first_reply = result['llm']['replies'][0]
  print(first_reply)

## ye shall find

In [None]:
# Example: run the pipeline on a sample question; `ask` prints the question and the answer.
ask("How are mRNA vaccines being used for cancer treatment?")