In [None]:
#Multi Representation Indexing
"""
1. Input: Full Doc, Table, or Image
You start with some content — a document, table, or image.

This could be, for example, a PDF report, an Excel table, or an image of a chart.

2. GPT-4V Processing
GPT-4V (Vision model) processes this raw content.

It generates a summary — a textual abstraction of the content's key ideas.

This summary is easier to embed and index than the raw content.

3. Multi-representation Indexing Block
This has two parts:

a. Vectorstore
The summary is passed through an embedding model (converts it into a numerical vector).

These vectors are stored in the Vectorstore.

When a question comes in, it is also embedded and compared to the stored vectors to find the most similar content.

b. Docstore
This stores the original full content (document, table, or image).

When a match is found in the vectorstore, we retrieve the actual content from the docstore — not just the summary.

❓ Question Flow:
A user asks a question.

The system embeds the question and searches the Vectorstore for similar summaries.

Once it finds the best match, it uses that to retrieve the corresponding original full document/table/image from the Docstore.

The result returned is the full, relevant content, not just a snippet.

📦 Output:
➡️ Relevant full doc, table, or image based on the user's query.

✅ Key Advantages:
Fast similarity search using vector embeddings.

Rich and precise retrieval since it still keeps the full source content.

Uses multi-modal understanding (via GPT-4V) to work even with images or tables, not just text.
"""


In [None]:
import uuid

from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Example corpus: two blog posts, each loaded from its URL and collected
# into a single `docs` list.
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
docs = loader.load()
loader = WebBaseLoader("https://lilianweng.github.io/posts/2024-02-05-human-data-quality/")
docs += loader.load()

# Summarisation chain: pull the text out of a document, insert it into the
# prompt, send it to the chat model, and return the reply as a plain string.
chain = (
    {"doc": lambda document: document.page_content}
    | ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
    | ChatOpenAI(model="gpt-3.5-turbo", max_retries=0)
    | StrOutputParser()
)

# One summary per entry in `docs`, running at most 5 requests concurrently.
summaries = chain.batch(docs, config={"max_concurrency": 5})

In [None]:
from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings

# Metadata key that links each summary to the id of its parent document.
id_key = "doc_id"

# Vectorstore that indexes the embeddings of the summaries.
vectorstore = Chroma(collection_name="summaries", embedding_function=OpenAIEmbeddings())

# Storage layer for the original (parent) documents, keyed by document id.
store = InMemoryByteStore()

# Retriever that searches the summaries and hands back the matching
# original documents from the store.
retriever = MultiVectorRetriever(vectorstore=vectorstore, byte_store=store, id_key=id_key)

# One unique id per original document.
doc_ids = [str(uuid.uuid4()) for _ in docs]

# Wrap each summary in a Document whose metadata carries its parent's id.
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for doc_id, summary in zip(doc_ids, summaries)
]

# Index the summaries, then store the full documents under the same ids.
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset(list(zip(doc_ids, docs)))


In [None]:
# Working example: query the multi-vector retriever.
query = "Memory In agents"
# The number of hits is controlled through `search_kwargs`, which the retriever
# forwards to the vectorstore similarity search. An extra keyword such as
# `n_results` on the retrieval call is silently dropped by this retriever.
retriever.search_kwargs["k"] = 1
# `invoke` replaces the deprecated `get_relevant_documents`.
retrieved_docs = retriever.invoke(query)
# Preview the first 500 characters of the best-matching original document.
retrieved_docs[0].page_content[0:500]

In [None]:
#Now we move on to the next method: RAPTOR


In [None]:
"""
In ColBERT, documents and queries are tokenized and embedded token by token using a transformer such as BERT. Each query token is compared with all document tokens,
and for each query token, the document token with the highest similarity is selected (MaxSim). These maximum similarities are then aggregated (summed) to score and rank documents,
enabling fine-grained and efficient retrieval.
So the output is not the entire set of documents but the passages whose tokens best match the query tokens.
"""
from ragatouille import RAGPretrainedModel
# Load the pretrained "colbert-ir/colbertv2.0" checkpoint through RAGatouille.
RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
import requests # HTTP client; used below to fetch a page's plain-text extract from the Wikipedia API
def get_wikipedia_page(title: str, timeout: float = 10.0):
    """
    Retrieve the full text content of a Wikipedia page.

    :param title: str - Title of the Wikipedia page.
    :param timeout: float - Seconds to wait for the Wikipedia API before giving up.
    :return: str or None - Plain-text extract of the page, or None when the
        response contains no page or the page carries no extract.
    :raises requests.HTTPError: if the API answers with an HTTP error status.
    """
    # Wikipedia API endpoint
    URL = "https://en.wikipedia.org/w/api.php"

    # Parameters for the API request
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,
    }

    # Custom User-Agent header to comply with Wikipedia's best practices
    headers = {"User-Agent": "RAGatouille_tutorial/0.0.1 (ben@clavie.eu)"}

    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(URL, params=params, headers=headers, timeout=timeout)
    # Surface HTTP failures directly instead of failing later on a malformed body.
    response.raise_for_status()
    data = response.json()

    # Extracting page content. The response may lack "query"/"pages" (e.g. for
    # an empty title), so fall back to empty mappings instead of raising.
    pages = data.get("query", {}).get("pages", {})
    page = next(iter(pages.values()), None)
    if page is None:
        return None
    return page.get("extract")

# Fetch the text of the example page; get_wikipedia_page returns None when the page has no extract.
full_document = get_wikipedia_page("Hayao_Miyazaki")

