# Retrieval Augmented Generation

Table of Contents
1. Idea
2. Naive Implementation
3. Graph Implementation

In [None]:
import pandas as pd
from language_models.proxy_client import BTPProxyClient
from language_models.agents.react import ReActAgent
from language_models.agents.chain import AgentChain
from language_models.tools.tool import Tool
from language_models.models.llm import OpenAILanguageModel
from language_models.models.embedding import SentenceTransformerEmbeddingModel
from language_models.retrievers import BasicRetriever, ContextualCompressionRetriever
from language_models.retrievers.utils import split_documents
from language_models.vector_stores import FAISSVectorStore, DistanceMetric
from language_models.settings import settings
from langchain_core.documents import Document
from pydantic import BaseModel, Field, field_validator
from numpy import dot
from numpy.linalg import norm
from pathlib import Path
from langchain_core.documents import Document
from utils import load_docs_from_json, save_docs_to_json
from pprint import pprint

In [None]:
# Client passed to the language models below as `proxy_client`.
# All connection parameters (credentials and URLs) are read from `settings`.
proxy_client = BTPProxyClient(
    client_id=settings.CLIENT_ID,
    client_secret=settings.CLIENT_SECRET,
    auth_url=settings.AUTH_URL,
    api_base=settings.API_BASE,
)

In [None]:
# Read every regular file in ./data/jobs into one Document per file.
path = Path("./data/jobs")
# Path.iterdir() yields entries in arbitrary order; sort the names so the
# document order (and the `documents[:30]` subset taken further down) is
# reproducible between runs and machines.
filenames = sorted(file.name for file in path.iterdir() if file.is_file())

documents = []
for filename in filenames:
    file_path = path / filename
    # errors="replace" substitutes undecodable bytes instead of raising.
    with open(file_path, "r", encoding="utf-8", errors="replace") as file:
        content = file.read()
    # Keep the source as a plain string: it stays JSON-serializable and has the
    # same type as metadata that has been round-tripped through a JSON file.
    documents.append(Document(page_content=content, metadata={"source": str(file_path)}))

In [None]:
# System prompt for the job-title extraction agent defined in this cell.
# The single requested field (`job_title`) mirrors the `Job` output model below.
system_prompt = """Take the following job and extract data about the job.

Respond with the following extracted data:
- job_title: The job title."""

# Model used for job-title extraction. The answer is a single short field, so a
# small token budget is enough. Sampling temperature is set to 0.0, matching the
# model configured for the question-answering agents later in the notebook
# (the previous `float=0.0` keyword was a typo for `temperature`).
llm = OpenAILanguageModel(
    proxy_client=proxy_client,
    model='gpt-4',
    max_tokens=128,
    temperature=0.0,
)

# Output schema for the extraction agent: a single `job_title` string.
class Job(BaseModel):
    job_title: str = Field(description="The job title.")

# Agent that receives the raw job posting text via the `{job}` placeholder and
# answers in the shape of `Job`. It is given no tools.
# NOTE(review): `iterations=5` presumably caps the agent's reasoning loop — confirm
# against ReActAgent.create.
job_data_agent = ReActAgent.create(
    llm=llm,
    system_prompt=system_prompt,
    task_prompt="Job description:\n{job}",
    task_prompt_variables=["job"],
    tools=None,
    output_format=Job,
    iterations=5,
)

In [None]:
def extract_job_titles(documents: list[Document]) -> list[Document]:
    """Add an extracted ``job_title`` to the metadata of every document.

    Each document's text is sent to ``job_data_agent`` and the ``job_title``
    field of the agent's final answer is stored under
    ``document.metadata["job_title"]``.

    Args:
        documents: Documents to annotate. They are modified in place.

    Returns:
        The same list that was passed in, with the updated documents.
    """
    for document in documents:
        response = job_data_agent.invoke({"job": document.page_content})
        document.metadata["job_title"] = response.final_answer["job_title"]
    return documents

In [None]:
# Reuse previously extracted job titles when a cache file exists; otherwise run
# the extraction on the first 30 documents and write the cache for next time.
try:
    documents = load_docs_from_json('./data/jobs.json')
except Exception:
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit are not swallowed and do not kick off the LLM extraction.
    documents = extract_job_titles(documents[:30])
    save_docs_to_json(documents, './data/jobs.json')

In [None]:
# Replace the full documents with smaller chunks. Separators are tried from
# paragraph breaks down to the empty string.
# NOTE(review): chunk_size/chunk_overlap are presumably measured in characters — confirm.
documents = split_documents(documents, separators=["\n\n", "\n", " ", ""], chunk_size=1000, chunk_overlap=100)

## Idea

In [None]:
# Rebind `llm` for the question-answering agents in the rest of the notebook:
# same model as before, with a larger token budget and temperature 0.0.
llm = OpenAILanguageModel(
    proxy_client=proxy_client,
    model="gpt-4",
    max_tokens=256,
    temperature=0.0,
)

In [None]:
# Baseline agent without any retrieval tool: it answers from the prompt alone.
system_prompt = "You are an expert in job postings. Respond with the most accurate information about the job."

# Output schema: the answer text is returned in a single `content` field.
class Output(BaseModel):
    content: str = Field(description="The final answer.")

# The user question is passed through unchanged via the `{question}` placeholder.
agent = ReActAgent.create(
    llm=llm,
    system_prompt=system_prompt,
    task_prompt="{question}",
    task_prompt_variables=["question"],
    output_format=Output,
    iterations=5,
)

In [None]:
# Ask the tool-less agent a question without supplying any context.
response = agent.invoke({"question": "What is the salary range of an airport engineer."})

In [None]:
# Show the `content` field of the agent's final answer.
pprint(response.final_answer["content"])

In [None]:
# Same question as before, but with the relevant excerpt of the job posting
# pasted into the prompt as context.
question = """What is the salary range of an airport engineer.

Use this context to answer the question:
AIRPORT ENGINEER
Class Code:       7256
Open Date:  07-06-18
(Exam Open to All, including Current City Employees)

ANNUAL SALARY

$105,005 to $153,509 and $111,854 to $163,532."""

response = agent.invoke({"question": question})

In [None]:
# Show the answer produced when the context was included in the prompt.
pprint(response.final_answer["content"])

In [None]:
# Embedding model shared by the similarity demo, the vector store and the
# dataframe-based search further down.
embedding_model = SentenceTransformerEmbeddingModel(model="all-MiniLM-L6-v2")

In [None]:
# Embed the user question.
query1 = "What is the salary range of an airport engineer."
embedding1 = embedding_model.embed_query(query1)

In [None]:
# Inspect the raw embedding of the question.
print(embedding1)

In [None]:
# Embed the job-posting excerpt that contains the answer to the question.
query2 = """AIRPORT ENGINEER
Class Code:       7256
Open Date:  07-06-18
(Exam Open to All, including Current City Employees)

ANNUAL SALARY

$105,005 to $153,509 and $111,854 to $163,532."""

embedding2 = embedding_model.embed_query(query2)

In [None]:
# Cosine similarity between the question and the excerpt:
# dot product divided by the product of the two vector norms.
cosine_similarity = dot(embedding1, embedding2) / (norm(embedding1) * norm(embedding2))
print(f"Cosine similarity: {cosine_similarity}")

## Naive Implementation

In [None]:
# Load a previously saved vector store if one exists; otherwise embed the
# document chunks, build the store and save it for later runs.
try:
    vector_store = FAISSVectorStore.load_local("./data", "job_embeddings")
except Exception:
    # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
    # SystemExit are not swallowed and do not trigger a full re-embedding.
    vector_store = FAISSVectorStore.from_documents(
        documents=documents,
        embedding_model=embedding_model,
        distance_metric=DistanceMetric.COSINE_SIMILARITY,
    )
    vector_store.save_local("./data", "job_embeddings")

In [None]:
# Name and description shared by every search tool defined in this notebook.
tool_name = "Search"
tool_description = "Use this tool to search job postings."

# Argument schema for the retriever-backed search tools:
# the text to search for and how many documents to return (default 5).
class Search(BaseModel):
    user_text: str = Field(description="The user question/prompt/text.")
    fetch_k: int = Field(5, description="The number of documents to return.")

In [None]:
# Retriever over the vector store built above.
# NOTE(review): score_threshold=0.0 presumably disables score-based filtering — confirm.
basic_retriever = BasicRetriever(
    vector_store=vector_store,
    score_threshold=0.0
)

# Expose the retriever's document lookup to agents as the "Search" tool.
basic_retriever_tool = Tool(
    func=basic_retriever.get_relevant_documents,
    name=tool_name,
    description=tool_description,
    args_schema=Search,
)

In [None]:
# Same expert prompt as the baseline agent, plus an instruction to use the search tool.
system_prompt = """You are an expert in job postings. Respond with the most accurate information about the job.

Use the search tool to answer the user's question."""

# Output schema: the answer text is returned in a single `content` field.
class Output(BaseModel):
    content: str = Field(description="The final answer.")

# Question-answering agent whose only tool is the basic retriever search.
basic_retriever_agent = ReActAgent.create(
    llm=llm,
    system_prompt=system_prompt,
    task_prompt="{question}",
    task_prompt_variables=["question"],
    output_format=Output,
    tools=[basic_retriever_tool],
    iterations=5,
)

In [None]:
# Run the basic-retriever agent on a sample question.
response = basic_retriever_agent.invoke({"question": "Give me the job description of an airport engineer."})

In [None]:
# Show the basic-retriever agent's answer.
pprint(response.final_answer["content"])

In [None]:
# Second retriever over the same vector store; unlike BasicRetriever it is also
# given the language model.
# NOTE(review): presumably the LLM is used to condense retrieved documents — confirm
# against ContextualCompressionRetriever.
contextual_compression_retriever = ContextualCompressionRetriever(
    llm=llm,
    vector_store=vector_store,
    score_threshold=0.0
)

# Expose it under the same tool name, description and argument schema as the basic retriever.
contextual_compression_retriever_tool = Tool(
    func=contextual_compression_retriever.get_relevant_documents,
    name=tool_name,
    description=tool_description,
    args_schema=Search,
)

In [None]:
# Expert prompt with the instruction to use the search tool.
system_prompt = """You are an expert in job postings. Respond with the most accurate information about the job.

Use the search tool to answer the user's question."""

# Output schema: the answer text is returned in a single `content` field.
class Output(BaseModel):
    content: str = Field(description="The final answer.")

# Question-answering agent whose only tool is the contextual-compression retriever search.
contextual_compression_retriever_agent = ReActAgent.create(
    llm=llm,
    system_prompt=system_prompt,
    task_prompt="{question}",
    task_prompt_variables=["question"],
    output_format=Output,
    tools=[contextual_compression_retriever_tool],
    iterations=5,
)

In [None]:
# Run the contextual-compression agent on the same sample question used for the
# basic retriever, so the two answers can be compared.
response = contextual_compression_retriever_agent.invoke({"question": "Give me the job description of an airport engineer."})

In [None]:
# Show the answer of the most recent agent run.
pprint(response.final_answer["content"])

## Graph Implementation

In [None]:
def create_dataset(documents: list[Document]) -> pd.DataFrame:
    """Build a table with one row per document, including its embedding.

    Columns are ``job_title``, ``text``, ``embedding`` and ``source``.
    A missing or empty ``job_title`` / ``source`` in the document metadata is
    stored as an empty string.

    Args:
        documents: Documents to embed with ``embedding_model``.

    Returns:
        A DataFrame with one row per input document, in input order.
    """

    def to_row(doc):
        # Embed first, then assemble the row from the text and metadata.
        vector = embedding_model.embed_query(doc.page_content)
        meta = doc.metadata
        return {
            "job_title": meta.get("job_title") or "",
            "text": doc.page_content,
            "embedding": vector,
            "source": meta.get("source") or "",
        }

    return pd.DataFrame([to_row(doc) for doc in documents])

In [None]:
# Embed all document chunks once and keep the resulting table under the "jobs"
# key; `get_jobs` and `search` below read it from `data`.
df = create_dataset(documents)
data = {"jobs": df}

In [None]:
def get_jobs() -> list[str]:
    """Return the distinct job titles in the jobs table, in order of first appearance."""
    titles = data["jobs"]["job_title"]
    return titles.unique().tolist()

# Tool wrapper around `get_jobs`; it takes no arguments, hence no argument schema.
get_jobs_tool = Tool(
    func=get_jobs,
    name="Get Jobs",
    description="Use this tool to get all available jobs.",
    args_schema=None,
)

In [None]:
# First step of the chain: turn the user's question into a job title.
# The prompt asks for an all-caps title, which the job-filtered search schema
# defined later in the notebook also enforces.
system_prompt = """Respond with the job title.

- job_title: The job title. Make sure the Job Title is in all caps."""

# Output schema: a single `job_title` string.
class Job(BaseModel):
    job_title: str = Field(description="The job title.")

# The agent can call "Get Jobs" to see which titles exist in the dataset.
job_agent = ReActAgent.create(
    llm=llm,
    system_prompt=system_prompt,
    task_prompt="{question}",
    task_prompt_variables=["question"],
    output_format=Job,
    tools=[get_jobs_tool],
    iterations=10,
)

In [None]:
# Argument schema for the job-filtered search tool: the search text, the number
# of results (default 5) and the job title to restrict the search to.
# Documented with comments rather than a class docstring so the schema generated
# from the model is left unchanged.
class Search(BaseModel):
    user_text: str = Field(description="The user question/prompt/text.")
    fetch_k: int = Field(5, description="The number of documents to return.")
    job_title: str = Field(description="The job title to filter for. Must be all caps.")

    @field_validator('job_title')
    @classmethod
    def check_all_caps(cls, value: str) -> str:
        """Reject job titles that are not entirely upper case.

        Raises:
            ValueError: If ``value.isupper()`` is false (this includes the
                empty string).
        """
        if not value.isupper():
            raise ValueError('must be all uppercase')
        return value

def search(user_text: str, fetch_k: int, job_title: str) -> str:
    """Return the chunks of one job that are most similar to ``user_text``.

    Rows of the jobs table whose ``job_title`` equals ``job_title`` exactly are
    ranked by cosine similarity between their stored embedding and the
    embedding of ``user_text``.

    Args:
        user_text: Text to search for.
        fetch_k: Maximum number of chunks to include.
        job_title: Job title the rows must match exactly.

    Returns:
        ``"Context:\\n\\n"`` followed by the selected chunk texts, best match
        first, separated by blank lines.
    """
    query_vector = embedding_model.embed_query(user_text)

    def similarity_to_query(candidate):
        # Cosine similarity: dot product over the product of the norms.
        return dot(query_vector, candidate) / (norm(query_vector) * norm(candidate))

    jobs = data["jobs"]
    # Work on a copy so the score column is not written into the shared table.
    matches = jobs.loc[jobs["job_title"] == job_title].copy()
    matches["cosine_similarity"] = matches["embedding"].apply(similarity_to_query)
    ranked = matches.sort_values(by="cosine_similarity", ascending=False)
    top_texts = ranked.iloc[:fetch_k]["text"].tolist()
    return "Context:\n\n" + "\n\n".join(top_texts)


# Expose the dataframe-backed `search` function as a tool. It reuses the shared
# tool name and description, with the `Search` schema that requires a job title.
search_tool = Tool(
    func=search,
    name=tool_name,
    description=tool_description,
    args_schema=Search,
)

In [None]:
# Second step of the chain: answer using the job-filtered search tool.
system_prompt = """You are an expert in job postings. Respond with the most accurate information about the job.

Use the search tool to answer the user's question."""

# Output schema: the answer text is returned in a single `content` field.
class Output(BaseModel):
    content: str = Field(description="The final answer.")

# Rebinds `agent`. The task prompt here consists only of the `{job_title}` placeholder.
agent = ReActAgent.create(
    llm=llm,
    system_prompt=system_prompt,
    task_prompt="{job_title}",
    task_prompt_variables=["job_title"],
    output_format=Output,
    tools=[search_tool],
    iterations=10,
)

In [None]:
# Run `job_agent` first and `agent` second; the chain is invoked with a `question`.
# NOTE(review): presumably job_agent's `job_title` output is fed into the second
# agent's `{job_title}` placeholder — confirm against AgentChain.
chain = AgentChain(
    chain=[job_agent, agent],
    chain_variables=["question"],
)

In [None]:
# Run the two-step chain on the sample question.
response = chain.invoke({"question": "Give me the job description of an airport engineer."})

In [None]:
# Show the chain's final answer.
pprint(response.final_answer["content"])