In [None]:
import os
from dotenv import load_dotenv
from langchain_google_genai import GoogleGenerativeAI

load_dotenv()

# Both lookups raise KeyError immediately when the variable is missing from the
# environment (populated from .env by load_dotenv() above).
# NOTE(review): `api_key` is not referenced by any cell visible here - confirm it is still needed.
api_key = os.environ["LANGCHAIN_API"]
gemini_api = os.environ["GEMINI_API"]

# Hand the Gemini key to the client explicitly: it is stored under the custom
# name GEMINI_API, which the client would not look up on its own.
llm = GoogleGenerativeAI(
    model="gemini-2.5-flash-lite",
    google_api_key=gemini_api,
)

In [18]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Pass the key read from GEMINI_API explicitly; the variable uses a custom name,
# so the embeddings client would not find it in the environment by itself.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/gemini-embedding-001",
    google_api_key=gemini_api,
)

In [4]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Browser-like User-Agent sent with every page request made by the loaders below.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Pages to ingest; documents end up in `docs` in this order.
urls = [
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2024-02-05-human-data-quality/",
]

# One loop instead of a copy-pasted loader per URL: adding a page is now a one-line change.
docs = []
for url in urls:
    loader = WebBaseLoader(url, header_template=headers)
    docs.extend(loader.load())

In [7]:
import uuid 

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Prompt that asks the model to summarise the text bound to {doc}.
summary_prompt = ChatPromptTemplate.from_template(
    "Summarize the following document: \n \n {doc}"
)

# Pipeline: take a document's page_content -> fill the prompt -> call the LLM
# -> pass the result through StrOutputParser.
chain = (
    {"doc": lambda document: document.page_content}
    | summary_prompt
    | llm
    | StrOutputParser()
)

# Run the chain once per entry in `docs`, with max_concurrency set to 5.
summaries = chain.batch(docs, config={"max_concurrency": 5})

In [20]:
from langchain_classic.storage import InMemoryByteStore
from langchain_community.vectorstores import Chroma
from langchain_classic.retrievers import MultiVectorRetriever

# Metadata field under which each indexed entry records the id of its source document.
id_key = "doc_id"

# Chroma collection "summaries", embedded with the `embeddings` model.
vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=embeddings,
)

# In-memory byte store handed to the retriever as its backing document storage.
store = InMemoryByteStore()

# Retriever wired to the vector store and byte store above, linked through `id_key`.
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=store,
    id_key=id_key,
)

In [14]:
from langchain_core.documents import Document
# Summaries are matched to documents purely by position, so a stale `summaries`
# list (e.g. from an earlier kernel state) would silently mis-link entries.
if len(summaries) != len(docs):
    raise ValueError(
        f"Expected one summary per document, but got {len(summaries)} summaries "
        f"for {len(docs)} documents; re-run the summarisation cell."
    )

# One fresh UUID string per loaded document.
doc_ids = [str(uuid.uuid4()) for _ in docs]

# Wrap each summary in a Document whose metadata carries the id of the
# document at the same position.
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for summary, doc_id in zip(summaries, doc_ids)
]

In [None]:
# Add the summary documents to the retriever's vector store.
retriever.vectorstore.add_documents(summary_docs)

# Store every full document in the retriever's docstore, keyed by its generated id.
id_doc_pairs = list(zip(doc_ids, docs))
retriever.docstore.mset(id_doc_pairs)

In [None]:
query = "Memory in agents"

# Query the vector store directly, asking for a single result (k=1).
sub_docs = vectorstore.similarity_search(query=query, k=1)

# Show the top hit as the cell output.
sub_docs[0]

In [None]:
# NOTE(review): `n_results` does not appear to be an option MultiVectorRetriever
# consumes; the number of vector-store hits is configured through `search_kwargs`.
# Existing search options are preserved and only `k` is set.
retriever.search_kwargs = {**retriever.search_kwargs, "k": 1}

# `invoke` is the supported entry point; `get_relevant_documents` is deprecated.
retrieved_docs = retriever.invoke(query)

# Preview the first 500 characters of the top retrieved document.
retrieved_docs[0].page_content[0:500]

In [22]:
# Part 13 (RAPTOR): see the video on the official LangChain YouTube channel.

# Part 14: ColBERT retrieval with RAGatouille.

In [None]:
from ragatouille import RAGPretrainedModel
# Identifier of the pretrained ColBERT checkpoint to load.
COLBERT_CHECKPOINT = "colbert-ir/colbertv2.0"
RAG = RAGPretrainedModel.from_pretrained(COLBERT_CHECKPOINT)

In [None]:
import requests

def get_wikipedia_page(title: str, timeout: float = 10.0) -> "str | None":
    """
    Retrieve the full text content of a Wikipedia page.

    :param title: str - Title of the Wikipedia page.
    :param timeout: float - Seconds to wait for the HTTP request before giving up.
    :return: str | None - Full text content of the page as raw string, or None
        when the response contains no page extract.
    :raises requests.HTTPError: If the API responds with an error status code.
    """
    # Wikipedia API endpoint
    URL = "https://en.wikipedia.org/w/api.php"

    # Parameters for the API request
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,
    }

    # Custom User-Agent header to comply with Wikipedia's best practices
    headers = {"User-Agent": "RAGatouille_tutorial/0.0.1 (ben@clavie.eu)"}

    # A timeout keeps the cell from hanging indefinitely on a stalled connection.
    response = requests.get(URL, params=params, headers=headers, timeout=timeout)
    # Surface HTTP failures here rather than as a confusing JSON/KeyError below.
    response.raise_for_status()
    data = response.json()

    # Extracting page content; a payload without "query"/"pages", or with no
    # page entries, yields None instead of raising KeyError/StopIteration.
    pages = data.get("query", {}).get("pages", {})
    page = next(iter(pages.values()), {})
    return page.get("extract")

page_title = "Hayao_Miyazaki"
full_document = get_wikipedia_page(page_title)

# get_wikipedia_page returns None when the response has no extract; stop here
# with a clear message instead of handing None to the indexing step.
if full_document is None:
    raise ValueError(f"No Wikipedia extract returned for page {page_title!r}")

In [None]:
# Index the single fetched page under the name "Miyazaki-123", with document
# splitting enabled and max_document_length set to 180.
# NOTE(review): the unit of max_document_length is not visible here - confirm in the RAGatouille docs.
RAG.index(
    index_name="Miyazaki-123",
    collection=[full_document],
    split_documents=True,
    max_document_length=180,
)

In [None]:
# Ask the index for the three best matches to the question.
question = "What animation studio did Miyazaki found?"
results = RAG.search(query=question, k=3)

# Display the hits as the cell output.
results

In [None]:
# NOTE: this rebinds `retriever`, which earlier in the notebook referred to the
# MultiVectorRetriever; from here on the name points at the RAG-backed retriever.
retriever = RAG.as_langchain_retriever(k=3)

# Run the same question through the LangChain retriever interface.
studio_question = "What animation studio did Miyazaki found?"
retriever.invoke(studio_question)