<a href="https://colab.research.google.com/github/Alberto-Codes/langchain-experiments/blob/main/financial_rag_langsmith.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import getpass
import os

In [None]:
# Set your Cohere API key.
# An explicit prompt replaces getpass's generic "Password: " so it is obvious
# which of the notebook's secrets is being requested.
os.environ["COHERE_API_KEY"] = getpass.getpass("Cohere API key: ")

In [None]:
# Set your OpenAI API key.
# An explicit prompt replaces getpass's generic "Password: " so it is obvious
# which of the notebook's secrets is being requested.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

In [None]:
# Set your LangSmith API key (stored in the LANGCHAIN_API_KEY variable).
# An explicit prompt replaces getpass's generic "Password: " so it is obvious
# which of the notebook's secrets is being requested.
os.environ["LANGCHAIN_API_KEY"] = getpass.getpass("LangSmith API key: ")

In [None]:
# Install the packages this notebook imports (IPython shell escape).
# Only unstructured is pinned (0.12.5); everything else is upgraded to latest.
!pip install -U -q langchain cohere tiktoken unstructured==0.12.5 openai pandas langchain-community chromadb langchain-openai

# Download SEC filing

In [None]:
from langchain_community.document_loaders import UnstructuredURLLoader

# Airbnb's filing for the period ending 2023-12-31, served from SEC EDGAR.
url = (
    "https://www.sec.gov/Archives/edgar/data/1559720/"
    "000155972024000006/abnb-20231231.htm"
)

# The request carries an explicit User-Agent header identifying the caller.
loader = UnstructuredURLLoader(
    urls=[url],
    headers={"User-Agent": "virat virat@virat.com"},
)
documents = loader.load()

# Index SEC filing

In [None]:
### INDEX

from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import TokenTextSplitter
from langchain_community.document_loaders import UnstructuredURLLoader

# Load
# Reuse the filing that the "Download SEC filing" cell already loaded into
# `documents` instead of building an identical loader and downloading the
# same page from EDGAR a second time.
docs = documents

# Split
# Token-based chunks: 256 tokens each, 20 tokens shared between neighbours.
text_splitter = TokenTextSplitter(chunk_size=256, chunk_overlap=20)
splits = text_splitter.split_documents(docs)

# Embed
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

# Index
retriever = vectorstore.as_retriever()

# Build RAG chain

In [None]:
# Instruction text that answer_question() places in front of every question
# before sending it to the chat model.
# NOTE(review): "and providing" below reads like a typo for "and provide"; it
# is left untouched here because the text is sent to the model verbatim.
prompt = """
You are an expert language model designed to
answer questions about financial documents like
SEC filings.

Given financial documents, your primary role is to extract key information
and providing accurate answers to questions
related to these filings.

In your response, optimize for conciseness, accuracy, and correctness.
"""

In [None]:
from typing import List

import cohere

# Cohere client used by rerank_documents() and answer_question() below; the
# API key is read from the COHERE_API_KEY environment variable.
co = cohere.Client(os.environ["COHERE_API_KEY"])


def rerank_documents(query: str, documents: list, top_k: int) -> List[dict]:
  """Rerank ``documents`` against ``query`` with Cohere and keep the top ones.

  Args:
    query: The question the candidate documents are scored against.
    documents: Candidate documents, forwarded to ``co.rerank`` unchanged.
    top_k: Maximum number of documents to keep (sent as ``top_n``).

  Returns:
    A list of ``{"text": ...}`` dicts, one per result and in the order the
    reranker returned them. Empty when ``documents`` is empty.
  """
  # Nothing to rank: return early rather than spend an API round-trip on an
  # empty candidate list.
  if not documents:
    return []

  response = co.rerank(
      query=query,
      documents=documents,
      top_n=top_k,
      model="rerank-english-v3.0",
      # Ask for the document text back so it can be read from each result.
      return_documents=True,
  )
  return [{"text": result.document.text} for result in response.results]

def answer_question(query: str, documents: list, prompt: str) -> str:
  """Ask the Cohere chat model to answer ``query`` using ``documents``.

  The outgoing message is ``prompt`` followed by the question wrapped in
  triple backticks. ``documents`` is handed to ``co.chat`` as-is.

  Returns:
    The ``text`` attribute of the chat response.
  """
  chat_response = co.chat(
      message=f"{prompt}. Please answer the question: ```{query}```.",
      documents=documents,
      model="command-r-plus",
      temperature=0,
  )
  return chat_response.text

In [None]:
### RAG

import openai
from langsmith import traceable
from langsmith.wrappers import wrap_openai

# Number of chunks fetched from the vector store and kept after reranking.
k = 3


class RagBot:
    """Retrieve, rerank and answer pipeline over the indexed SEC filing."""

    @traceable
    def get_answer(self, question: str) -> dict:
        """Answer ``question`` from the indexed filing.

        Args:
            question: The natural-language question to answer.

        Returns:
            A dict with two keys, as expected by the evaluators:
            ``"answer"`` (the text returned by ``answer_question``) and
            ``"contexts"`` (the text of each reranked chunk, as plain strings).
        """
        print(f"Question: {question}")
        print()

        # NOTE(review): the same ``k`` is used for retrieval and for the
        # reranker's top_n, so reranking can only reorder these chunks, never
        # pick better ones. Consider retrieving a larger candidate pool.
        top_k_docs = vectorstore.similarity_search(question, k)

        # Extract the text content from documents
        documents = [{"text": doc.page_content} for doc in top_k_docs]

        # Rerank the documents
        documents = rerank_documents(question, documents, k)

        # Ask the LLM
        answer = answer_question(question, documents, prompt)

        return {
            "answer": answer,
            # Expose the raw chunk text. str() on the {"text": ...} dicts
            # would give evaluators a dict repr with escaped newlines instead.
            "contexts": [doc["text"] for doc in documents],
        }


rag_bot = RagBot()

In [None]:
# Smoke-test the pipeline with one question and show only the first 150
# characters of the answer.
response = rag_bot.get_answer("What is Airbnb's revenue in 2023?")
response["answer"][:150]

# Load Q&A Dataset

In [None]:
# LangSmith settings, supplied through environment variables: the tracing
# flag and the API endpoint.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    }
)

In [None]:
import requests
import pandas as pd

# URL of the JSON file
url = 'https://raw.githubusercontent.com/virattt/datasets/main/abnb-2023-10k.json'

# Fetch the JSON content from the URL.
# The timeout stops the cell from hanging forever on a stalled connection.
# raise_for_status() reports HTTP errors directly, instead of as a confusing
# JSON decoding failure on an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.json()

In [None]:
# Split the downloaded rows into parallel question and answer lists. Each row
# is read through its 'question' and 'answer' keys. Comprehensions avoid
# leaking loop variables into the notebook namespace.
inputs = [row['question'] for row in data]
outputs = [row['answer'] for row in data]

# The same data as one list of {"question", "answer"} records.
qa_pairs = [{"question": q, "answer": a} for q, a in zip(inputs, outputs)]

In [None]:
from langsmith import Client

# Create dataset
client = Client()
dataset_name = "financial-rag-test-1.3"
dataset = client.create_dataset(
    dataset_name=dataset_name,
    # Describe what the dataset really holds. The earlier "QA pairs about
    # LCEL." text did not match the Airbnb 10-K questions loaded above.
    description="QA pairs about Airbnb's 2023 10-K filing.",
)
# Upload one example per QA pair: the question is the input, the reference
# answer is the output.
client.create_examples(
    inputs=[{"question": q} for q in inputs],
    outputs=[{"answer": a} for a in outputs],
    dataset_id=dataset.id,
)

# Evaluate

In [None]:
# RAG chain
def predict_rag_answer(example: dict):
    """Evaluation target that reports only the generated answer.

    Reads the question from ``example["question"]``, runs it through
    ``rag_bot`` and returns ``{"answer": ...}``.
    """
    rag_output = rag_bot.get_answer(example["question"])
    answer_text = rag_output["answer"]
    return {"answer": answer_text}

def predict_rag_answer_with_context(example: dict):
    """Evaluation target that reports the answer and its contexts.

    Intended for evaluating retrieved documents and hallucinations. Reads the
    question from ``example["question"]``, runs it through ``rag_bot`` and
    returns the ``"answer"`` and ``"contexts"`` entries of the result.
    """
    rag_output = rag_bot.get_answer(example["question"])
    return {field: rag_output[field] for field in ("answer", "contexts")}

In [None]:
from langsmith.evaluation import LangChainStringEvaluator, evaluate

# Evaluator
# A single "qa" string evaluator; prepare_data maps the run output and the
# dataset example onto the prediction / reference / input fields it receives.
# NOTE: the misspelled name `qa_evalulator` is kept because other cells may
# refer to it.
qa_evalulator = [
    LangChainStringEvaluator(
        "qa",
        prepare_data=lambda run, example: {
            "prediction": run.outputs["answer"],
            "reference": example.outputs["answer"],
            "input": example.inputs["question"],
        },
    ),
]
experiment_results = evaluate(
    predict_rag_answer,
    data=dataset_name,
    evaluators=qa_evalulator,
    experiment_prefix="financial-rag-qa",
    # Label the experiment with the pipeline this notebook actually runs:
    # Cohere reranking and the command-r-plus chat model.
    metadata={"variant": "Cohere rerank-english-v3.0 context, command-r-plus"},
)