In [None]:
!pip install -q langchain langchain_cohere langchain_google_genai chromadb langchainhub langchain_community huggingface_hub langchain_openai lancedb openai tiktoken rank_bm25 pypdf

In [None]:
import pandas as pd
from langchain import hub
from langchain_community.document_loaders import DataFrameLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_cohere import CohereEmbeddings
import getpass
import os
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain.vectorstores import LanceDB
import lancedb
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.schema import Document
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import PyPDFLoader

In [None]:
# Collect the provider credentials without storing them in the notebook source.
# A key that is already present in the environment is left untouched; otherwise
# the user is prompted for it with hidden input.
for _key_name in ("GOOGLE_API_KEY", "COHERE_API_KEY", "OPENAI_API_KEY"):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass.getpass(f"Enter {_key_name}: ")

In [None]:
# Load the source CSV and turn it into LangChain documents; the "data" column
# is used as each document's page content.
df = pd.read_csv("/content/context2.csv")
loader = DataFrameLoader(df, page_content_column="data")
docs = loader.load()

In [None]:
# Embedding model shared by the LanceDB seed rows and the LanceDB index below.
# Constructed with defaults; presumably picks up OPENAI_API_KEY from the
# environment — confirm against the installed OpenAIEmbeddings version.
embedding = OpenAIEmbeddings()

In [None]:
# Break the loaded documents into overlapping chunks (chunk_size=1000,
# chunk_overlap=200). add_start_index=True asks the splitter to record where
# each chunk starts within its source document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
# `all_splits` is the chunk list consumed by every retriever/index below.
all_splits = text_splitter.split_documents(docs)

In [None]:
# Keyword-based (BM25) retriever over the chunks; it is one of the two
# retrievers combined by the ensemble retriever defined later.
bm25_retriever = BM25Retriever.from_documents(all_splits)
bm25_retriever.k = 3  # Retrieve top 3 results

In [None]:
# Concatenation of every chunk. Kept only in case a later cell references it;
# it must NOT be handed to LanceDB.from_texts (see below).
all_text = " ".join([doc.page_content for doc in all_splits])

# Open (or create) the on-disk LanceDB database and seed a table with two rows.
db = lancedb.connect("/tmp/lancedb")
# NOTE(review): `docsearch` below is given the connection `db`, not `table`;
# whether these seed rows are ever searched depends on the installed LangChain
# LanceDB wrapper version — confirm.
table = db.create_table(
    "pandas_docs",
    data=[
        {
            "vector": embedding.embed_query("RUWireless Secure"),
            "text": "RUWireless Secure",
            "id": "1",
        },
        {
            "vector": embedding.embed_query("ScarletMail"),
            "text": "ScarletMail",
            "id": "2",
        }
    ],
    mode="overwrite",
)

# `from_texts` expects an iterable of texts. Passing the single joined string
# would make it iterate character by character and index one-character
# "documents", so pass one string per chunk instead.
chunk_texts = [doc.page_content for doc in all_splits]
docsearch = LanceDB.from_texts(chunk_texts, embedding, connection=db)

# Vector retriever (top 3 hits) used as the second half of the ensemble.
retriever_lancedb = docsearch.as_retriever(search_kwargs={"k": 3})

In [None]:
# Chroma vector store over the same chunks, embedded with a Cohere model.
# NOTE(review): no other cell visible here references `vectorstore` (the
# ensemble retriever combines BM25 and LanceDB) — confirm it is still needed,
# since building it presumably embeds every chunk through the Cohere API.
vectorstore = Chroma.from_documents(documents=all_splits, embedding=CohereEmbeddings(model="embed-english-light-v3.0"))

In [None]:
# Hybrid retriever: merges BM25 (keyword) and LanceDB (vector) results.
# The weights are positional — 0.2 for BM25, 0.8 for the LanceDB retriever.
retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, retriever_lancedb], weights=[0.2, 0.8]
)

In [None]:
# Chat model that generates the final answer in the RAG chain below.
# Presumably authenticates via GOOGLE_API_KEY from the environment — confirm.
llm = ChatGoogleGenerativeAI(model="gemini-pro")

In [None]:
# Import from langchain_core (already a dependency of this notebook) rather
# than the deprecated `langchain` root-module re-export.
from langchain_core.prompts import PromptTemplate

# Prompt for the answer-generation step. It has two placeholders:
#   {context} - the retrieved chunks, already joined into one string
#   {query}   - the user's question
template = """
  You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
  The answer could come from the retrieved context or could be answered by following a hyperlink. Use the description of the hyperlink
  to infer if the hyperlink could provide a possible answer.
  If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

  CONTEXT:
  {context}

  QUESTION:
  {query}

  ANSWER:
  """

prompt = PromptTemplate(input_variables=["query", "context"], template=template)

In [None]:
def format_docs(docs):
    """Join the page content of each document into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The page contents in their original order, separated by a blank line
        (``"\\n\\n"``). An empty iterable yields an empty string.
    """
    separator = "\n\n"
    contents = [item.page_content for item in docs]
    return separator.join(contents)

In [None]:
# Answer-generation chain. It expects a mapping with "context" (retrieved
# documents) and "query"; it replaces "context" with the formatted text from
# format_docs, fills the prompt, calls the LLM and parses the reply to a string.
rag_chain_from_docs = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

# Full pipeline, invoked with the question itself: the input is sent to the
# retriever (stored under "context") and passed through unchanged under
# "query"; the generated answer is then added under "answer", so the result
# carries the source documents alongside the answer.
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "query": RunnablePassthrough()}
).assign(answer=rag_chain_from_docs)

In [None]:
# Run every question from the questions file through the RAG chain, printing
# each question followed by the chain's full result.
with open("/content/questions.txt", "r", encoding="utf-8") as pFile:
    # Iterate the file object directly (no intermediate readlines() list),
    # strip leading/trailing whitespace, and drop blank lines so an empty
    # string is never sent to the retrievers or the LLM.
    pLines = [line.strip() for line in pFile if line.strip()]

for line in pLines:
    print(line)
    print(rag_chain_with_source.invoke(line))