In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import EmbeddingsFilter
from langchain.text_splitter import CharacterTextSplitter
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.vectorstores import OpenSearchVectorSearch
from langchain_core.vectorstores import VectorStoreRetriever, VectorStore
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_core.documents import Document
from typing import List
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.messages import AIMessage, HumanMessage
from typing import Dict
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableBranch
from langchain_community.chat_models import ChatOpenAI
import sys
import time
from copy import deepcopy
from tqdm.auto import tqdm
import os
import json
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

In [None]:
# OpenSearch connection settings. Every value can be overridden through the
# environment (or the .env file loaded in the imports cell) so that real
# credentials never have to be committed with the notebook; the fallbacks are
# the values this notebook has always used.
host = os.environ.get("OPENSEARCH_HOST", "localhost")
port = int(os.environ.get("OPENSEARCH_PORT", "9200"))
auth = (
    os.environ.get("OPENSEARCH_USER", "admin"),
    os.environ.get("OPENSEARCH_PASSWORD", "admin"),
)

In [None]:
# Identifier of the LLM to evaluate. An id containing "gpt" is sent to the
# OpenAI-compatible chat endpoint; any other id is loaded through transformers.
# model_id = "mistralai/Mistral-7B-Instruct-v0.2"
model_id = "gpt-3.5-turbo"

# Secrets and the API endpoint are read from the environment
# (the imports cell loads a .env file beforehand).
hf_token = os.environ.get("HF_TOKEN")
openai_api_key = os.environ.get("OPENAI_API_KEY")
openai_api_base = os.environ.get("OPENAI_API_BASE")

# NOTE: the model object is named `hf` in both branches because the chain cells
# further down reference that name, even when it holds the OpenAI chat model.
if "gpt" in model_id:
    hf = ChatOpenAI(
        # Use the selected id instead of a fixed literal, so that picking another
        # "gpt" model really changes the model being queried and stays in sync
        # with the eval_answers_{model_id}.json file name used later.
        model_name=model_id,
        openai_api_key=openai_api_key,
        openai_api_base=openai_api_base,
        temperature=0.7,
        max_tokens=200
    )
else:
    tokenizer = AutoTokenizer.from_pretrained(
        pretrained_model_name_or_path=model_id,
        token=hf_token
    )
    # Pass device_map="cuda" here to load the weights onto a GPU.
    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path=model_id,
        token=hf_token,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
    )
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        batch_size=1,
        num_workers=1,
        # Hard-coded pad token id; presumably the EOS id of the tokenizer
        # selected above -- verify when switching to another model.
        pad_token_id=2,
        # Limit on the total sequence length (max_new_tokens=800 would be the
        # alternative that only limits the generated part).
        max_length=1000
    )
    # Sampling settings applied on every generation call.
    hf = HuggingFacePipeline(
        pipeline=pipe,
        pipeline_kwargs={
            "do_sample": True,
            "temperature": 0.7,
            "top_k": 50,
            "top_p": 0.95,
            "num_return_sequences": 1,
        },
    )

In [None]:
def format_docs(docs: List[Document]):
    """Return the ``page_content`` of all documents joined by blank lines."""
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# Embedding model shared by the vector store and by both embedding-based
# filters of the compression pipeline below.
model_name = "BAAI/bge-small-en-v1.5"
model_kwargs = {"device": "cpu"}  # use "cuda:0" to embed on a GPU
encode_kwargs = {"normalize_embeddings": True}
embedder = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

# The endpoint is built from the `host` / `port` settings of the configuration
# cell instead of repeating the address as a literal, so there is a single
# place to change when the cluster moves.
vectorstore: VectorStore = OpenSearchVectorSearch(
    f"https://{host}:{port}",
    embedding_function=embedder,
    index_name="pubmed_abstracts",
    vector_dim=384,  # presumably the embedding width of the model above -- confirm
    http_compress=True,
    http_auth=auth,
    use_ssl=True,
    # Certificate and hostname checks are switched off (and the related warning
    # silenced); only acceptable for a local / self-signed development cluster.
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# Base retriever: similarity search asking for 5 hits and passing min_score=0.4
# through to the vector store.
retriever: VectorStoreRetriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"min_score": 0.4, "k": 5}
)

# Post-processing applied to the retrieved documents, in this order:
#   1. split each document into pieces of about 300 characters at ". ",
#   2. run the embeddings-based redundancy filter over the pieces,
#   3. run the embeddings-based relevance filter (similarity_threshold=0.76).
splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0, separator=". ")
redundant_filter = EmbeddingsRedundantFilter(embeddings=embedder)
relevant_filter = EmbeddingsFilter(embeddings=embedder, similarity_threshold=0.76)

pipeline_compressor = DocumentCompressorPipeline(
    transformers=[splitter, redundant_filter, relevant_filter]
)

# Retriever used by the chains: base retrieval followed by the pipeline above.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline_compressor, base_retriever=retriever
)

In [None]:
# System message of the answering prompt; the retrieved documents are
# substituted for {context}.
SYSTEM_TEMPLATE = """
CONTEXT: {context}
Answer the following query with the above given CONTEXT. Answer honestly and correctly using accurate scientific terminology, if you don't know the answer, you must say so. Do not create questions for yourself.
"""

# Answering prompt = system message with the context, followed by the
# conversation supplied under the "messages" key.
question_answering_prompt = ChatPromptTemplate.from_messages(
    [("system", SYSTEM_TEMPLATE), MessagesPlaceholder(variable_name="messages")]
)

# Chain that places the retrieved documents into the prompt and queries the LLM.
document_chain = create_stuff_documents_chain(hf, question_answering_prompt)

def parse_retriever_input(params: Dict):
    """Return the ``content`` of the last entry in ``params["messages"]``."""
    messages = params["messages"]
    latest_message = messages[-1]
    return latest_message.content


def debug_print(this):
    """Print the type of ``this``, then every item it yields next to that item's type."""
    print(type(this))
    for element in this:
        element_type = type(element)
        print(element, element_type)

# Prompt that asks the LLM to turn the conversation so far into one search query.
query_transform_prompt = ChatPromptTemplate.from_messages(
    [
        MessagesPlaceholder(variable_name="messages"),
        (
            "user",
            "Given the above conversation, generate a search query to look up in order to get information relevant to the conversation. Only respond with the query, nothing else.",
        ),
    ]
)

# Exactly one message: its content is handed to the retriever unchanged.
_single_message_retrieval = (lambda x: x["messages"][-1].content) | compression_retriever

# Otherwise: the LLM rewrites the conversation into a query, which is then retrieved.
_rewritten_query_retrieval = (
    query_transform_prompt | hf | StrOutputParser() | compression_retriever
)

query_transforming_retriever_chain = RunnableBranch(
    (lambda x: len(x.get("messages", [])) == 1, _single_message_retrieval),
    _rewritten_query_retrieval,
).with_config(run_name="chat_retriever_chain")

# Full pipeline: add the retrieved documents under "context", then the model's
# reply under "answer".
conversational_retrieval_chain = RunnablePassthrough.assign(
    context=query_transforming_retriever_chain
).assign(answer=document_chain)

In [None]:
def prepare_context(context):
    """Convert retrieved documents into plain dictionaries for the results file.

    Args:
        context: Iterable of retrieved documents. Each item must be convertible
            with ``dict(item)`` into a mapping that has ``page_content`` and
            ``metadata`` entries and, normally, a ``state`` mapping holding
            ``query_similarity_score``.

    Returns:
        A list with one dictionary per document, containing ``page_content``,
        ``metadata`` (a copy with the ``text_chunk_id``, ``text`` and
        ``vector_field`` entries removed) and ``similarity_score`` (``None``
        when the document carries no score).

    The input documents are never modified.
    """
    dropped_metadata_keys = ("text_chunk_id", "text", "vector_field")
    clean_context = []
    for doc in context:
        fields = dict(doc)
        # Copy only the metadata: it is the one part that gets modified below.
        metadata = deepcopy(fields["metadata"])
        for key in dropped_metadata_keys:
            # pop() with a default tolerates documents that lack the field,
            # where `del` would abort the whole export with a KeyError.
            metadata.pop(key, None)
        state = fields.get("state") or {}
        clean_context.append(
            {
                "page_content": fields["page_content"],
                "metadata": metadata,
                "similarity_score": state.get("query_similarity_score"),
            }
        )
    return clean_context


In [None]:
# Answer every evaluation question and write all results to one JSON file.
with open("eval_questions.json", "r", encoding="utf-8") as f:
    questions = json.load(f)

answers = []
for q in tqdm(questions):
    # The model sometimes evades the question; ask up to 10 times and keep the
    # first reply without "I'm sorry" (or the last reply if all of them have it).
    for attempt in range(10):
        a = conversational_retrieval_chain.invoke(
            {
                "messages": [
                    HumanMessage(content=q["question"])
                ],
            }
        )
        if "I'm sorry" not in a["answer"]:
            break
    answers.append({
        "id": q["id"],
        "type": q["type"],
        "question": q["question"],
        "context": prepare_context(a["context"]),
        "answer": a["answer"],
    })

# A model_id of the form "org/name" puts a "/" into the file name, so the path
# points into a sub-directory. Create it first; otherwise open() raises
# FileNotFoundError and the results of the whole run above are lost.
answers_path = f"eval_answers_{model_id}.json"
os.makedirs(os.path.dirname(answers_path) or ".", exist_ok=True)
with open(answers_path, "w", encoding="utf-8") as f:
    json.dump(answers, f, indent="    ")

In [None]:
def get_single_answer(questions, id):
    """Answer ``questions[id]`` with the retrieval chain and return its result record.

    The chain is invoked up to 10 times; the first reply that does not contain
    "I'm sorry" is kept, otherwise the reply of the last attempt. The raw chain
    output and its context are printed before returning.

    Returns:
        A dict with the question's ``id``, ``type`` and ``question``, the
        cleaned ``context`` and the model's ``answer``.
    """
    entry = questions[id]
    for _attempt in range(10):  # avoid model evading answer
        result = conversational_retrieval_chain.invoke(
            {"messages": [HumanMessage(content=entry["question"])]}
        )
        if "I'm sorry" in result["answer"]:
            continue
        break
    print(result)
    print(result["context"])
    record = {
        "id": entry["id"],
        "type": entry["type"],
        "question": entry["question"],
        "context": prepare_context(result["context"]),
        "answer": result["answer"],
    }
    return record

In [None]:
# Re-answer the first 60 questions one by one, rewriting the results file after
# each question.
# - The bound is capped at len(questions) so a shorter question list does not
#   end in an IndexError; with 60 or more questions the range is unchanged.
# - The loop variable is not called `id`, which would leave the builtin id()
#   shadowed by an int for the rest of the kernel session.
for question_idx in tqdm(range(min(60, len(questions)))):
    with open(f"eval_answers_{model_id}.json", "r") as f:
        eval_answers = json.load(f)
    eval_answers[question_idx] = get_single_answer(questions, question_idx)
    with open(f"eval_answers_{model_id}.json", "w") as f:
        json.dump(eval_answers, f, indent="    ")

In [None]:
# Re-answer one question (index 58) and store the new result in the results file.
id = 58
answers_path = f"eval_answers_{model_id}.json"

with open(answers_path) as f:
    eval_answers = json.load(f)

eval_answers[id] = get_single_answer(questions, id)

with open(answers_path, "w") as f:
    json.dump(eval_answers, f, indent="    ")