# Setup

Create a `.env` file containing:
```
OPENAI_API_KEY="<your key here>"
```

Install `dewy-langchain`, `langchain`, and `langchain-openai` as shown below:

In [None]:
%pip install dewy-langchain langchain langchain-openai

# Example LangChain without RAG
This example shows a simple LangChain application that attempts to answer questions without retrieval.

In [None]:
from langchain_openai import ChatOpenAI
# Chat model shared by the examples below. To try the alternative model,
# set MODEL to "gpt-4-0125-preview" instead.
MODEL = "gpt-3.5-turbo"
llm = ChatOpenAI(model_name=MODEL, temperature=0.9)

# Baseline: ask the model directly, without any retrieved context.
llm.invoke("What is RAG useful for?")

# Example LangChain with RAG (using Dewy)
This example shows what the previous chain looks like using Dewy to retrieve relevant chunks.

## Retrieving documents in a chain

In [None]:
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

from dewy_langchain import DewyRetriever

# Retriever over the Dewy collection named "main", reached at the given base URL.
retriever = DewyRetriever.for_collection("main", base_url="http://localhost:8000")
# Two-message chat prompt: a system message carrying the retrieved content in
# `{context}`, followed by the user's `{question}`.
# NOTE: the leading indentation inside the triple-quoted string is part of the
# prompt text.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            """
            You're a helpful AI assistant. Given a user question and some retrieved content, answer the user question.
            If none of the articles answer the question, just say you don't know.

            Here is the retrieved content:
            {context}
            """,
        ),
        ("human", "{question}"),
    ]
)

def format_chunks(chunks):
    """Join the text of the retrieved chunks into a single context string.

    Each chunk's ``page_content`` is taken in order, and consecutive pieces
    are separated by a blank line. An empty input yields an empty string.
    """
    texts = (chunk.page_content for chunk in chunks)
    return "\n\n".join(texts)

# RAG pipeline: retrieve and format chunks as `context` while passing the raw
# input through as `question`, then fill the prompt, call the model, and
# reduce the reply to a plain string.
chain = (
    RunnableParallel(
        context=retriever | format_chunks,
        question=RunnablePassthrough(),
    )
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Ask the retrieval-augmented chain the same question posed to the bare LLM.
chain.invoke("What is RAG useful for?")

## LangChain with Citations
Based on https://python.langchain.com/docs/use_cases/question_answering/citations#cite-documents.

In [None]:
from operator import itemgetter
from typing import List

from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import (
    RunnableLambda,
)

# Structured-output schema for an answer plus the IDs of the sources backing it.
# The lowercase class name matches the "cited_answer" string used for
# `tool_choice` and `key_name` where the tool is bound.
# NOTE(review): the docstring and Field descriptions are presumably forwarded
# to the model as part of the tool schema, so treat them as prompt text.
class cited_answer(BaseModel):
    """Answer the user question based only on the given sources, and cite the sources used."""

    # Required: the answer text.
    answer: str = Field(
        ...,
        description="The answer to the user question, which is based only on the given sources.",
    )
    # Required: integer source IDs supporting the answer.
    citations: List[int] = Field(
        ...,
        description="The integer IDs of the SPECIFIC sources which justify the answer.",
    )

def format_docs_with_id(docs: List[Document]) -> str:
    """Render documents as ID-labelled snippets for the prompt context.

    Each document contributes a "Source ID" line taken from its
    ``metadata['chunk_id']`` and an "Article Snippet" line holding its
    ``page_content``. Entries are separated by a blank line, and the result
    always begins with a blank line, even when ``docs`` is empty.
    """
    entries = []
    for doc in docs:
        source_id = doc.metadata['chunk_id']
        snippet = doc.page_content
        entries.append(f"Source ID: {source_id}\nArticle Snippet: {snippet}")
    separator = "\n\n"
    return separator + separator.join(entries)

# Runnable that takes the "docs" entry of the input mapping and renders it
# with format_docs_with_id.
# NOTE(review): this rebinds the builtin `format` for the rest of the session;
# the name is kept because the citation chain refers to it.
format = itemgetter("docs") | RunnableLambda(format_docs_with_id)

# Set up the "cited_answer" tool and the sub-chain that produces it.
from langchain.output_parsers.openai_tools import JsonOutputKeyToolsParser

# Bind the schema as a tool and select it by name via `tool_choice`.
llm_with_tool = llm.bind_tools([cited_answer], tool_choice="cited_answer")

# Parser keyed on the same tool name, configured to return a single result.
output_parser = JsonOutputKeyToolsParser(return_single=True, key_name="cited_answer")

# Prompt -> tool-calling model -> parsed tool output.
answer = prompt | llm_with_tool | output_parser

# Citation pipeline: fetch docs alongside the question, add the formatted
# context, add the model's cited answer, then keep only that answer.
citation_chain = (
    RunnableParallel({"docs": retriever, "question": RunnablePassthrough()})
    .assign(context=format)
    .assign(cited_answer=answer)
    # Can't include `docs` because they're not JSON serializable.
    .pick(["cited_answer"])
)

In [None]:
# Run the citation chain; the result contains only the "cited_answer" entry.
citation_chain.invoke("What is RAG useful for?")

## Bonus: Adding documents to the collection

In [None]:
from dewy_client import Client
from dewy_client.api.kb import add_document
from dewy_client.models import AddDocumentRequest
# Client for the Dewy service at the same base URL the retriever uses.
client = Client(base_url="http://localhost:8000")

# NOTE(review): `collection_id` is not defined in any cell shown in this
# notebook; it must be set before this cell runs, otherwise this raises
# NameError. Confirm the intended collection.
request = AddDocumentRequest(
    collection_id=collection_id,
    url="https://arxiv.org/pdf/2305.14283.pdf",
)
add_document.sync(client=client, body=request)