In [None]:
from ragatouille import RAGPretrainedModel


GROUP_NUM = 1  # document group to load: 1, 2, or 3
indexes = []

# Build one LangChain retriever per ColBERT index of the selected group.
# The loop visits cluster numbers 0..GROUP_NUM inclusive, i.e. GROUP_NUM + 1 indexes.
for i in range(GROUP_NUM + 1):
    path_to_index = f".ragatouille/colbert/indexes/GROUP{GROUP_NUM}_cluster{i}"
    RAG = RAGPretrainedModel.from_index(path_to_index)
    cluster_retriever = RAG.as_langchain_retriever(k=3)
    indexes.append(cluster_retriever)

In [None]:
from langchain_core.retrievers import BaseRetriever, RetrieverLike, RetrieverOutputLike
from langchain_core.language_models import BaseLLM
from langchain_core.embeddings import Embeddings
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from typing import List



class CustomRetriever(BaseRetriever):
    """Retriever that sends one query to several child retrievers and concatenates their results.

    Results are returned in child order (all documents from the first child,
    then the second, ...). No de-duplication or re-ranking is performed.
    """

    # Child retrievers to query, in order. Despite the field name these are
    # retrievers, not vector stores.
    vectorstore : List[RetrieverLike]

    def flatten_extend(self, matrix):
        """Flatten one level of nesting.

        Args:
            matrix: An iterable of iterables (e.g. a list of document lists).

        Returns:
            A single list holding every element of every row, in order.
        """
        return [item for row in matrix for item in row]

    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        """Query every child retriever and return all of their documents as one list.

        Args:
            query: The search string passed unchanged to each child.
            run_manager: Callback manager for this retriever run; a child
                manager is handed to each nested call so that the nested runs
                are reported under this run instead of being detached.

        Returns:
            The concatenation of each child's documents, in child order.
        """
        # k=3 is forwarded as an extra keyword argument; whether a child honors
        # it at call time depends on that retriever's implementation.
        # NOTE(review): newer langchain-core releases deprecate
        # get_relevant_documents in favor of invoke — consider migrating.
        per_retriever_docs = [
            child.get_relevant_documents(query, k=3, callbacks=run_manager.get_child())
            for child in self.vectorstore
        ]
        return self.flatten_extend(per_retriever_docs)

In [None]:
# Combine the per-cluster retrievers collected in `indexes` into a single retriever
# that queries each of them and concatenates the results.
customRetriever = CustomRetriever(vectorstore=indexes)

In [None]:
!wget https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q6_K.gguf

In [None]:
from langchain_community.llms import LlamaCpp


# Number of model layers to place on the GPU.
# NOTE(review): -1 is presumably the "offload all layers" setting — confirm against the
# llama-cpp-python / LangChain LlamaCpp documentation.
n_gpu_layers = -1
# Prompt-processing batch size; should be between 1 and n_ctx. Size it to the RAM available
# on your machine (the original note mentions Apple Silicon).
n_batch = 512

# The model file is looked up relative to the working directory; adjust the path for your system.
llm = LlamaCpp(
    model_path="llama-2-7b-chat.Q6_K.gguf",
    n_ctx=2048,
    n_batch=n_batch,
    n_gpu_layers=n_gpu_layers,
    verbose=True,
    f16_kv=True,  # must stay True: the original author reports problems after a couple of calls otherwise
)


In [None]:
from langchain import hub
from langchain_core.runnables import RunnablePassthrough, RunnablePick
from langchain_core.prompts.chat import HumanMessagePromptTemplate, PromptTemplate

# Pull the hub's RAG prompt object, then replace its messages with a single human message
# that asks for a concise, one-sentence answer.
rag_prompt = hub.pull("rlm/rag-prompt")

concise_template = PromptTemplate(
    input_variables=['context', 'question'],
    # Adjacent string literals are concatenated into one template string.
    template=(
        "You are an assistant for question-answering tasks. "
        "Use the following pieces of retrieved context to answer the question. "
        "If you don't know the answer, just say that you don't know. "
        "Use one sentence maximum and keep the answer CONCISE. "
        "Keep the answer CONCISE."
        "\nQuestion: {question} "
        "\nContext: {context} "
        "\nAnswer:"
    ),
)
prompt = HumanMessagePromptTemplate(prompt=concise_template)
rag_prompt.messages = [prompt]

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate

# Chain
def format_docs(docs):
    """Join the text of the given documents into one string.

    Args:
        docs: Iterable of objects exposing a string `page_content` attribute.

    Returns:
        The `page_content` values separated by a blank line; an empty string
        when `docs` is empty.
    """
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)


retriever = customRetriever

# Inputs for the prompt: "context" is the retrieved documents rendered as text by
# format_docs, "question" is the chain input passed through unchanged.
chain_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# inputs -> prompt -> LLM -> plain string
qa_chain = chain_inputs | rag_prompt | llm | StrOutputParser()



In [None]:
# Load the questions, one per line, with surrounding whitespace removed.
# The context manager closes the file even if reading fails. Blank lines are kept
# (as empty strings) so that question i still corresponds to line i of the file.
with open("FINALQUESTIONS.txt", "r") as f:
    questions = [line.strip() for line in f]

In [None]:
from tqdm import tqdm


answer_file = "2cluster_nolimiformer.txt"

answers = []
# Open the output once in "w" mode (truncating any previous run) instead of reopening it
# for every question; the context manager guarantees the file is closed if a call fails.
with open(answer_file, "w") as f:
    for question in tqdm(questions):
        response = qa_chain.invoke(question)

        # One answer per line: newlines inside the response are removed so that
        # line i of the output corresponds to question i.
        f.write(response.replace("\n", "") + "\n")
        # Flush after each answer so finished answers are on disk if the run is interrupted.
        f.flush()
        answers.append(response)