In [None]:
!pip install langchain biopython pubmed_parser pinecone-client openai tiktoken langchain_pinecone gradio

In [None]:
# Import necessary libraries for the notebook
import json
import os, time

import gradio as gr
import pubmed_parser as pp
from Bio import Entrez
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import DirectoryLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

In [None]:
# Create the Pinecone client used by the index cells below; the API key is
# taken from the PINECONE_API_KEY environment variable (KeyError if unset).
clientPine = Pinecone(
    api_key=os.environ['PINECONE_API_KEY'],
)

### Data Fetching and Processing
* Fetch and process PubMed data for a given query: search PubMed for matching PubMed IDs (PMIDs), then fetch each publication by PMID and keep only the specified fields.
* Uses the Biopython Entrez API to run the PubMed search and `pubmed_parser` to fetch the details for each PMID. Results are reduced to the relevant keys (pmid, title, abstract) and saved locally as data.json.
* These functions lay the groundwork for analyzing or utilizing PubMed data, which is critical for the subsequent data manipulation and embedding steps.

In [None]:
def fetch_pubmed_data(pmid, keys):
    """Fetch one PubMed record and keep only the requested fields.

    Args:
        pmid: PubMed identifier handed to ``pp.parse_xml_web``.
        keys: Field names to extract. Each name is lower-cased for the lookup
            in the parsed record, but the returned dict uses it as given.

    Returns:
        dict: ``{key: value}`` for every entry of ``keys``; the value is
        ``None`` when the parsed record has no such field.
    """
    print(pmid)  # progress trace: one line per fetched PMID
    record = pp.parse_xml_web(pmid=pmid)  # synchronous call
    selected = {}
    for key in keys:
        selected[key] = record.get(key.lower(), None)
    return selected

def process_pmids(pmids, keys):
    """Fetch the requested fields for every PMID, one after another.

    Args:
        pmids: Iterable of PubMed identifiers.
        keys: Field names forwarded to ``fetch_pubmed_data`` for each PMID.

    Returns:
        list: One ``fetch_pubmed_data`` result per PMID, in input order.
    """
    collected = []
    for pmid in pmids:
        collected.append(fetch_pubmed_data(pmid, keys))
    return collected

def parse_data(query):
    """Search PubMed for ``query``, fetch each hit, and save the records.

    The search is limited to 50 PMIDs (``retmax=50``). For every PMID the
    ``pmid``, ``title`` and ``abstract`` fields are collected via
    ``process_pmids`` and the full list is written to ``data.json`` in the
    current working directory.

    Args:
        query (str): PubMed search term passed to ``Entrez.esearch``.

    Returns:
        list: One dict per PMID with the keys ``pmid``, ``title`` and
        ``abstract``; empty when the search returns no IDs.

    Raises:
        KeyError: If the ``DEFAULT_EMAIL`` environment variable is not set.
    """
    Entrez.email = os.environ['DEFAULT_EMAIL']
    # The ``with`` block closes the handle on exit; no explicit close() needed.
    with Entrez.esearch(db='pubmed', term=query, retmax=50) as handle:
        # Fall back to an empty list so a response without 'IdList' yields
        # no records instead of crashing the fetch loop on None.
        pmid_list = Entrez.read(handle).get('IdList') or []

    keys = ['pmid', 'title', 'abstract']
    results = process_pmids(pmid_list, keys)
    print(f"Fetched {len(results)} PubMed records")

    # Write real JSON (the file is named data.json) rather than one Python
    # repr per line, so the file can be read back with json.load.
    with open('data.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2, default=str)
    return results

### Data Parsing and Extraction
Executes the data-fetching function with a specific query and extracts the abstracts and PMIDs from the results (titles stay in the fetched records). The abstracts are then used to generate embeddings, which are stored in a Pinecone index.

In [None]:
# Fetch the records, then drop any whose abstract is missing or empty:
# fetch_pubmed_data fills absent fields with None, and such a record has no
# text to embed or to return from retrieval. Filtering `data` itself keeps
# `data`, `texts` and `pmids` aligned index-for-index.
data = [
    record
    for record in parse_data(query='"meta analysis"[Publication Type]')
    if record.get('abstract')
]

texts = [record['abstract'] for record in data]
pmids = [record['pmid'] for record in data]

### Embedding Text Data

Embeds the extracted text data (abstracts) using an OpenAI model. The resulting embeddings are then stored in a Pinecone index for efficient retrieval and similarity search.

In [None]:
# Embed the abstracts with the same model the PineconeVectorStore cells use
# to embed queries ('text-embedding-3-small'). Vectors produced by different
# embedding models are not comparable, so documents and queries must share
# one model for similarity search to be meaningful.
response = OpenAIEmbeddings(
  model='text-embedding-3-small'
)
embedded = response.embed_documents(texts)

### Index Creation and Management
Checks for the existence of a specific index and creates it if it doesn't exist. Uses the Pinecone client to manage indexes, setting up a new index with specified dimensions and metrics if necessary, and ensures the index is ready before proceeding. 

In [None]:
index_name = "rag-example"

# list_indexes() returns index descriptions rather than bare name strings, so
# the membership test has to run against .names(); otherwise the check is
# always true and create_index is attempted even when the index exists.
if index_name not in clientPine.list_indexes().names():

    clientPine.create_index(
        name=index_name,
        # The index dimension must match the embedding vectors being stored.
        dimension=len(embedded[0]),
        metric='cosine',
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ))

    # Poll once per second until the new index reports ready.
    while not clientPine.describe_index(index_name).status['ready']:
        time.sleep(1)

### Data Upsert into Index
Inserts or updates data in the vector index, associating PMIDs with their respective embedded vectors and metadata.

In [None]:
index = clientPine.Index(index_name)

# One vector per record: the PMID is the id, the embedding is the value, and
# title/abstract travel along as metadata. Fields that are None are left out
# because Pinecone does not accept null metadata values.
upsert_data = [{
    'id': item['pmid'],
    "metadata": {
        field: item[field]
        for field in ('title', 'abstract')
        if item[field] is not None
    },
    "values": embedding,
    } for item, embedding in zip(data, embedded)
]

if upsert_data:
    print("Sample upsert data:", upsert_data[0])
    index.upsert(vectors=upsert_data, namespace='example')
else:
    # Nothing was fetched/embedded; skip the request instead of failing on it.
    print("No records to upsert")

### Initialize the ChatGPT chatbot
Here, we set up the language model (ChatOpenAI) with predefined limits and stopping conditions, which are crucial for controlling the model's output during interactions.

In [None]:
# GPT-4 chat model: replies are capped at 488 tokens and the "\nQ:" / "\nA:"
# stop sequences are forwarded to the model through model_kwargs.
llm = ChatOpenAI(
    model_name='gpt-4',
    max_tokens=488,
    model_kwargs={"stop": ["\nQ:", "\nA:"]},
)

Set up directory loaders for handling CSV files.

In [None]:
# Loader registry keyed by file extension. The CSV loader targets every file
# matching "**/*_all.csv" under the "flattened_data" directory.
loaders = {
    '.csv': DirectoryLoader(
        path="flattened_data",
        glob="**/*_all.csv",
    ),
}

### LangChain Pinecone vector store instantiation
This cell connects LangChain to the existing Pinecone index as a vector store, using an OpenAI embedding model to embed queries. It is essential for efficient retrieval of abstracts by vector similarity.

In [None]:
# LangChain view of the Pinecone index. The namespace must match the one the
# vectors were upserted under ('example'); without it the store searches the
# default namespace and finds nothing. 'abstract' is the metadata field that
# holds each document's text.
vectorstore = PineconeVectorStore(index_name='rag-example',
                                  embedding=OpenAIEmbeddings(model='text-embedding-3-small'),
                                  text_key='abstract',
                                  namespace='example')

In [None]:
# Delimiters wrapped around the system prompt.
B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n"


def get_prompt(instruction, examples, new_system_prompt):
    """Assemble the full prompt template string.

    Args:
        instruction (str): Instruction section placed after the system prompt.
        examples (str): Trailing section, appended after a newline.
        new_system_prompt (str): System prompt text, wrapped in B_SYS/E_SYS.

    Returns:
        str: ``B_SYS + new_system_prompt + E_SYS + instruction + "\\n" + examples``.
    """
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template = SYSTEM_PROMPT + instruction + "\n" + examples
    return prompt_template


# NOTE(review): the first sentence ends with "designed to assist with." and
# looks truncated; the intended subject should be filled in by the author.
sys_prompt = """\
You are a helpful, respectful and honest assistant designed to assist with. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""


# Newlines are written as "\n"; the previous "/n" put literal "/n" text into
# the prompt sent to the model. {context} is filled in by the QA chain.
instruction = """CONTEXT:\n\n {context}\n
"""

# {question} is filled in with the user's query.
examples = """
Q: {question}
A: """
template = get_prompt(instruction, examples, sys_prompt)
print(template)

In [None]:
# Wrap the assembled template for LangChain; it declares the two
# placeholders, {context} and {question}, as its input variables.
QA_CHAIN_PROMPT = PromptTemplate(
    template=template,
    input_variables=["context", "question"],
)

In [None]:
# Metadata field that stores each document's text in the index.
text_field = "abstract"

# Rebuild the vector store handle used by the retrieval cells below. The
# namespace must match the one the vectors were upserted under ('example');
# without it the store searches the default namespace and finds nothing.
vectorstore = PineconeVectorStore(index_name='rag-example',
    embedding=OpenAIEmbeddings(model='text-embedding-3-small'),
    text_key=text_field,
    namespace='example'
)

In [None]:
# Sanity-check retrieval: show the 3 stored documents closest to the query.
query = "Who can help me with AI questions? "
vectorstore.similarity_search(query, k=3)

In [None]:
# Rebind `llm` to a gpt-3.5-turbo chat model with temperature 0.0; the API
# key is read from the OPENAI_API_KEY environment variable.
llm = ChatOpenAI(
    model_name='gpt-3.5-turbo',
    temperature=0.0,
    openai_api_key=os.environ['OPENAI_API_KEY'],
)

# Retrieval QA chain using the "stuff" chain type with its default prompt.
doc_retriever = vectorstore.as_retriever()
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=doc_retriever,
)

In [None]:
# Rebind `llm` once more, this time to the OpenAI LLM wrapper at
# temperature 0, and build the chain that uses the custom QA_CHAIN_PROMPT.
llm = OpenAI(temperature=0)
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
)

In [None]:
# Run the sample question through the custom-prompt chain and display the answer.
query = "Who can help me with AI questions? "
answer = qa_chain.run(query)
answer

In [None]:
def chat_response(msg, history):
    """Answer a chat message by running it through the retrieval QA chain.

    Args:
        msg (str): The user's message; sent to ``qa_chain`` as the query.
        history: Prior conversation supplied by the caller. Not used here.

    Returns:
        The ``"result"`` entry of the chain's output.
    """
    chain_output = qa_chain({"query": msg})
    return chain_output["result"]




In [None]:
# Set up a Gradio chat interface for the application. `gr` is gradio, which
# is imported with the other dependencies in the notebook's import cell;
# chat_response is called with (message, history) for each user turn.
demo = gr.ChatInterface(chat_response)

# Start the web UI.
demo.launch()