# Learning Objectives

Build LLM applications for retrieval-augmented generation tasks.


# Setup

In [None]:
# Install pinned versions of the libraries this notebook uses (-q keeps pip quiet).
!pip install -q openai==1.23.2 \
                tiktoken==0.6.0 \
                pypdf==4.0.1 \
                langchain==0.1.1 \
                langchain-community==0.0.13 \
                chromadb==0.4.22 \
                sentence-transformers==2.3.1

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.0/284.0 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.4/802.4 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.0/509.0 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.8/132.8 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━

In [None]:
# NOTE(review): both installs are unpinned. tiktoken is already pinned to 0.6.0 in the
# install cell above, so the second line looks redundant, and azure-ai-textanalytics is
# not imported anywhere in the visible part of this notebook - confirm whether it is needed.
!pip install azure-ai-textanalytics
!pip install tiktoken



In [None]:
import json
import tiktoken
import pandas as pd
from openai import AzureOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings
)
from langchain_community.vectorstores import Chroma
from google.colab import userdata

In [None]:
# Read the Azure OpenAI API key from Colab's secret store; 'secretName' is the
# name of the secret that is looked up.
azure_openai_api_key = userdata.get('secretName') # azure_api_key

In [None]:
# Azure OpenAI client used by the chat-completion calls below.
# The API version and endpoint are hard-coded here; the key comes from
# `azure_openai_api_key`.
client = AzureOpenAI(
    api_key=azure_openai_api_key,
    api_version="2023-05-15",
    azure_endpoint="https://demo.openai.azure.com/"
)

In [None]:
# Value passed as `model` to client.chat.completions.create in the Q&A cells below.
# NOTE(review): presumably the Azure deployment name of a gpt-35-turbo model,
# going by the trailing comment - confirm.
model_name = "gpt35oaiD" #"gpt-35-turbo"

Note: This portion needs a T4 GPU.

## Raw Data

In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Change the working directory into Drive; the relative paths used below
# ('tesla-annual-reports', './tesla_db') resolve from here.
# NOTE(review): the saved output below shows a deeper directory than this command
# targets, so that output comes from an earlier version of this cell.
%cd '/content/drive/MyDrive/'

/content/drive/MyDrive/teaching/courses/B2B/Deloitte


In [None]:
# List the contents of the annual-reports folder.
!ls 'tesla-annual-reports'

tsla-10k_20191231-gen_0.pdf  tsla-10ka_20211231-gen.pdf  tsla-20231231-gen.pdf
tsla-10k_20201231-gen.pdf    tsla-20221231-gen.pdf


In [None]:
#!unzip tesla-annual-reports.zip

## Chunk Data

In [None]:
# Folder (relative to the working directory) holding the annual-report PDFs.
pdf_folder_location = "tesla-annual-reports"

In [None]:
# Directory loader pointed at `pdf_folder_location`.
pdf_loader = PyPDFDirectoryLoader(pdf_folder_location)

In [None]:
# Recursive splitter built through from_tiktoken_encoder with the cl100k_base
# encoding, chunk_size=512 and chunk_overlap=16.
# NOTE(review): with this constructor the sizes are presumably counted in tokens
# rather than characters - confirm against the LangChain documentation.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name='cl100k_base',
    chunk_size=512,
    chunk_overlap=16
)

In [None]:
# Load the PDFs and split them with `text_splitter`.
# NOTE(review): the `_ada` suffix looks like a leftover - the embeddings created
# below use thenlper/gte-large. The name is kept because a later cell references it.
tesla_10k_chunks_ada = pdf_loader.load_and_split(text_splitter)

In [None]:
# Number of chunks produced by the split.
len(tesla_10k_chunks_ada)

3342

## Create Database

In [None]:
# Collection name passed to Chroma both when creating and when reopening the store.
tesla_10k_collection = 'tesla-10k-2019-to-2023'

In [None]:
# Sentence-transformers embedding model (thenlper/gte-large) that is handed to
# Chroma as its embedding function.
embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
# Build the Chroma collection from the chunks using `embedding_model`, with
# ./tesla_db as the persist directory.
# NOTE(review): the saved output below is a NameError for `tesla_10k_chunks_ada`,
# i.e. this cell was last run in a session where the chunking cell had not been
# executed. Run the "Chunk Data" cells first.
vectorstore = Chroma.from_documents(
    tesla_10k_chunks_ada,
    embedding_model,
    collection_name=tesla_10k_collection,
    persist_directory='./tesla_db'
)

NameError: name 'tesla_10k_chunks_ada' is not defined

In [None]:
# Explicit persist call.
# NOTE(review): with the pinned chromadb 0.4.22, writes are presumably persisted
# automatically, which would make this call redundant - confirm.
vectorstore.persist()

In [None]:
# Reopen the collection from ./tesla_db with the same embedding model.
vectorstore_persisted = Chroma(
    collection_name=tesla_10k_collection,
    persist_directory='./tesla_db',
    embedding_function=embedding_model
)

In [None]:
# Similarity-search retriever over the reopened store; k=5 asks for five chunks per query.
retriever = vectorstore_persisted.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 5}
)

Zip and download current DB state

In [None]:
# Archive the DB directory into tesla_db.zip.
# NOTE(review): because an absolute path is passed, the archive stores the full
# directory hierarchy (see the "adding: content/drive/..." lines in the saved
# output), so unzipping recreates that nested path rather than ./tesla_db.
!zip -r tesla_db.zip /content/drive/MyDrive/tesla_db

  adding: content/drive/MyDrive/teaching/courses/B2B/Deloitte/tesla_db/ (stored 0%)
  adding: content/drive/MyDrive/teaching/courses/B2B/Deloitte/tesla_db/chroma.sqlite3 (deflated 43%)
  adding: content/drive/MyDrive/teaching/courses/B2B/Deloitte/tesla_db/05867e28-7986-4e2f-8896-332f2d07ec3b/ (stored 0%)
  adding: content/drive/MyDrive/teaching/courses/B2B/Deloitte/tesla_db/05867e28-7986-4e2f-8896-332f2d07ec3b/header.bin (deflated 56%)
  adding: content/drive/MyDrive/teaching/courses/B2B/Deloitte/tesla_db/05867e28-7986-4e2f-8896-332f2d07ec3b/data_level0.bin (deflated 9%)
  adding: content/drive/MyDrive/teaching/courses/B2B/Deloitte/tesla_db/05867e28-7986-4e2f-8896-332f2d07ec3b/length.bin (deflated 94%)
  adding: content/drive/MyDrive/teaching/courses/B2B/Deloitte/tesla_db/05867e28-7986-4e2f-8896-332f2d07ec3b/link_lists.bin (deflated 84%)
  adding: content/drive/MyDrive/teaching/courses/B2B/Deloitte/tesla_db/05867e28-7986-4e2f-8896-332f2d07ec3b/index_metadata.pickle (deflated 76%)


In [None]:
#!zip -r tesla_db.zip /content/tesla_db


zip error: Nothing to do! (try: zip -r tesla_db.zip . -i /content/tesla_db)


# Load Vector DB

In [None]:
# Same embedding model as in "Create Database".
# NOTE(review): presumably repeated so the "Load Vector DB" section can be run on
# its own in a fresh session - confirm.
embedding_model = SentenceTransformerEmbeddings(model_name='thenlper/gte-large')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.9k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/191 [00:00<?, ?B/s]

In [None]:
# Download a file from Google Drive by its ID; per the saved output it is written
# as tesla_db.zip in the working directory.
!gdown 1hWbAWhJr5xsl0sAvvEq9Wpo8ItCdZpdq

Downloading...
From (original): https://drive.google.com/uc?id=1hWbAWhJr5xsl0sAvvEq9Wpo8ItCdZpdq
From (redirected): https://drive.google.com/uc?id=1hWbAWhJr5xsl0sAvvEq9Wpo8ItCdZpdq&confirm=t&uuid=24604d38-b753-49b0-9de3-3f354fb0b83e
To: /content/drive/MyDrive/teaching/courses/B2B/Deloitte/tesla_db.zip
100% 36.5M/36.5M [00:00<00:00, 92.0MB/s]


In [None]:
#!unzip tesla_db.zip

Archive:  tesla_db.zip
replace content/tesla_db/8ac3fef6-b222-4e92-a75c-b38882ef1f45/header.bin? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Collection name to open; same value as the one used in "Create Database".
tesla_10k_collection = 'tesla-10k-2019-to-2023'

In [None]:
# Open the collection from ./tesla_db.
# NOTE(review): the next cell immediately rebinds `vectorstore_persisted` to a
# store at ./content/tesla_db, so only one of the two cells is needed - run the
# one that matches where the DB actually lives.
vectorstore_persisted = Chroma(
    collection_name=tesla_10k_collection,
    persist_directory='./tesla_db',
    embedding_function=embedding_model
)

In [None]:
# Open the collection from ./content/tesla_db.
# NOTE(review): this rebinds `vectorstore_persisted` from the previous cell, which
# pointed at ./tesla_db. The saved unzip output shows the downloaded archive holding
# paths under content/tesla_db/, which is presumably why this path is used - confirm.
vectorstore_persisted = Chroma(
    collection_name=tesla_10k_collection,
    persist_directory='./content/tesla_db',
    embedding_function=embedding_model
)

In [None]:
# Similarity-search retriever over the loaded store; k=5 asks for five chunks per query.
retriever = vectorstore_persisted.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 5}
)

# RAG Q&A

## Prompt Design

In [None]:
# System prompt for the Q&A model: it announces that the user turn carries a
# ###Context section and a ###Question section, restricts the answer to that
# context, and asks for "I don't know" when the context has no answer.
qna_system_message = """
You are an assistant to a financial services firm who answers user queries on annual reports.
User input will have the context required by you to answer user questions.
This context will begin with the token: ###Context.
The context contains references to specific portions of a document relevant to the user query.

User questions will begin with the token: ###Question.

Please answer only using the context provided in the input. Do not mention anything about the context in your final answer.

If the answer is not found in the context, respond "I don't know".
"""

In [None]:
# User-turn template with two placeholders: {context} (the retrieved text) and
# {question} (the user's query). It is assembled line by line; the empty first and
# last entries produce the leading and trailing newlines.
qna_user_message_template = "\n".join([
    "",
    "###Context",
    "Here are some documents that are relevant to the question mentioned below.",
    "{context}",
    "",
    "###Question",
    "{question}",
    "",
])

## Retrieving relevant documents

In [None]:
# Sample query used to inspect what the retriever returns.
user_input = "What was the annual revenue of the company in 2021?"

In [None]:
# Retriever re-created with the same settings as in the loading section
# (similarity search, k=5).
retriever = vectorstore_persisted.as_retriever(
    search_type='similarity',
    search_kwargs={'k': 5}
)

In [None]:
# Fetch the chunks the retriever returns for `user_input`.
relevant_document_chunks = retriever.get_relevant_documents(user_input)

In [None]:
# Number of chunks returned (the retriever was configured with k=5).
len(relevant_document_chunks)

5

In [None]:
# Preview only the first retrieved chunk, with tab characters shown as spaces.
for document in relevant_document_chunks[:1]:
    preview_text = document.page_content.replace("\t", " ")
    print(preview_text)

systems.
In 2020, we recognized total revenues of $31.54 billion, representing an increase of $6.96 billion compared to the prior year. We continue to ramp
production, build new manufacturing capacity and expand our operations to enable increased deliveries and deployments of our products and further revenue
growth.
In 2020, our net income attributable to common stockholders was $721 million, representing a favorable change of $1.58 billion compared to the prior
year. In 2020, our operating margin was 6.3%, representing a favorable change of 6.6% compared to the prior year. We continue to focus on operational
efficiencies, while we have seen an acceleration of non-cash stock-based compensation expense due to a rapid increase in our market capitalization and updates
to our business outlook.
We ended 2020 with $19.38 billion in cash and cash equivalents, representing an increase of $13.12 billion from the end of 2019. Our cash flows from
operating activities during 2020 was $5.94 billion

## Composing the response

In [None]:
# Query answered end-to-end in the next cell.
user_input = "What was the total revenue of the company in 2021?"

In [None]:
# Step 1: retrieve the supporting chunks and flatten their text into one string.
relevant_document_chunks = retriever.get_relevant_documents(user_input)
context_list = list(map(lambda chunk: chunk.page_content, relevant_document_chunks))
context_for_query = ". ".join(context_list)

# Step 2: fill the user template and assemble the two-message chat prompt.
user_message = qna_user_message_template.format(
    context=context_for_query,
    question=user_input
)
prompt = [
    dict(role='system', content=qna_system_message),
    dict(role='user', content=user_message),
]

# Step 3: call the model. Any failure (request error or unusable response) is
# turned into a readable message instead of stopping the notebook.
try:
    response = client.chat.completions.create(
        messages=prompt,
        model=model_name,
        temperature=0
    )
    prediction = response.choices[0].message.content.strip()
except Exception as e:
    prediction = f'Sorry, I encountered the following error: \n {e}'

print(prediction)

The total revenue of the company in 2021 was $53.82 billion.


In [None]:
# Display the model/deployment name used for the answer above.
model_name

'gpt35oaiD'

What was the annual revenue of the company in 2021?

What was the company's debt level in 2021?

Identify 5 key risks identified in the 2021 report?

Summarize the Management Discussion and Analysis section of the 2021 report?

key investments in 2022 report?

In [None]:
# @title Enter your query
# System prompt for the interactive form. This rebinds `qna_system_message`, so
# cells executed after this one use this version.
qna_system_message = """
You are an assistant to a financial services firm who answers user queries on annual reports.
Users will ask questions delimited by triple backticks, that is, ```.
User input will have the context required by you to answer user questions.
This context will begin with the token: ###Context.
The context contains references to specific portions of a document relevant to the user query.

User questions will begin with the token: ###Question.

Please answer only using the context provided in the input.

If the answer is not found in the context, respond "I don't know".

Do not mention details of the context in your answer.
"""

# User-turn template; the ###Question section is wrapped in triple backticks.
# This rebinds `qna_user_message_template` as well.
qna_user_message_template = """
###Context
Here are some documents that are relevant to the question mentioned below.
{context}
```
###Question
{question}
```
"""

user_input = "key investments in 2022 report?" # @param {type:"string"}

# Retrieve the supporting chunks and merge their text into one context string.
relevant_document_chunks = retriever.get_relevant_documents(user_input)
context_list = []
for chunk in relevant_document_chunks:
    context_list.append(chunk.page_content)
context_for_query = ". ".join(context_list)

# Assemble the chat messages: system instructions first, then the filled template.
system_turn = {'role': 'system', 'content': qna_system_message}
user_turn = {
    'role': 'user',
    'content': qna_user_message_template.format(
        context=context_for_query,
        question=user_input
    )
}
prompt = [system_turn, user_turn]

# Ask the model; on any failure show the error text as the answer instead of raising.
try:
    response = client.chat.completions.create(
        messages=prompt,
        model=model_name,
        temperature=0
    )
    prediction = response.choices[0].message.content.strip()
except Exception as e:
    prediction = f'Sorry, I encountered the following error: \n {e}'

print(prediction)

The 2022 report mentions that the company invested an aggregate of $1.50 billion in bitcoin in the first quarter of 2021. However, there is no mention of any other key investments in the 2022 report.


# Evaluation

RAG systems are usually judged on [two important parameters](https://docs.ragas.io/en/stable/concepts/metrics/index.html#ragas-metrics):
- Groundedness/Faithfulness: How factually accurate is the answer, given the context?
- Relevance: How relevant is the context retrieved given the query?

These two metrics check the quality of the two components of the RAG system - retrieval and generation. We will use the LLM-as-a-judge method to check the quality of the RAG system on these two parameters.

Let us illustrate this evaluation based on the answers generated to the question from the previous section.

In [None]:
# Model used as the LLM judge in the rating calls below; currently the same value
# as `model_name`.
rater_model = 'gpt35oaiD' #"gpt-35-turbo"

In [None]:
# System prompt for the groundedness judge: the input carries ###Question,
# ###Context and ###Answer sections, and the judge is asked to reason step by step
# and score (1-5) how far the answer is derived only from the context.
groundedness_rater_system_message = """
You are tasked with rating AI generated answers to questions posed by users.
You will be presented a question, context used by the AI system to generate the answer and an AI generated answer to the question.
In the input, the question will begin with ###Question, the context will begin with ###Context while the AI generated answer will begin with ###Answer.

Evaluation criteria:
The task is to judge the extent to which the metric is followed by the answer.
1 - The metric is not followed at all
2 - The metric is followed only to a limited extent
3 - The metric is followed to a good extent
4 - The metric is followed mostly
5 - The metric is followed completely

Metric:
The answer should be derived only from the information presented in the context

Instructions:
1. First write down the steps that are needed to evaluate the answer as per the metric.
2. Give a step-by-step explanation if the answer adheres to the metric considering the question and context as the input.
3. Next, evaluate the extent to which the metric is followed.
4. Use the previous information to rate the answer using the evaluaton criteria and assign a score.
"""

In [None]:
# System prompt for the relevance judge: the input carries only ###Question and
# ###Context sections, and the judge is asked to reason step by step and score
# (1-5) how relevant the retrieved context is to the question.
# The evaluation-criteria sentence refers to the context (not an answer) because
# no answer is supplied to this judge.
relevance_rater_system_message = """
You are tasked with rating relevance of context retrieved with an intent to answer questions posed by users.
You will be presented a question, and context retrieved to answer the question.
In the input, the question will begin with ###Question, and the context will begin with ###Context.

Evaluation criteria:
The task is to judge the extent to which the metric is followed by the context.
1 - The metric is not followed at all
2 - The metric is followed only to a limited extent
3 - The metric is followed to a good extent
4 - The metric is followed mostly
5 - The metric is followed completely

Metric:
All the information in the context should be relevant to the question.

Instructions:
1. First write down the steps that are needed to evaluate the context as per the metric.
2. Give a step-by-step explanation if the context adheres to the metric considering the question as the input.
3. Next, evaluate the extent to which the metric is followed.
4. Use the previous information to rate the context using the evaluaton criteria and assign a score.
"""

In [None]:
# User-turn template for the groundedness judge. Placeholders: {question},
# {context}, {answer}. Built from adjacent string literals, one per output line.
groundedness_user_message_template = (
    "\n"
    "###Question\n"
    "{question}\n"
    "\n"
    "###Context\n"
    "{context}\n"
    "\n"
    "###Answer\n"
    "{answer}\n"
)

In [None]:
# User-turn template for the relevance judge. Placeholders: {question} and
# {context}. The empty first and last entries give the leading/trailing newline.
relevance_user_message_template = "\n".join([
    "",
    "###Question",
    "{question}",
    "",
    "###Context",
    "{context}",
    "",
])

In [None]:
# Query whose retrieved context and generated answer are rated below.
user_input = "What was the annual revenue of the company in 2021?"

In [None]:
# Retrieve the chunks for the evaluation query and merge their text into the
# single context string shared by the answer and judge prompts.
relevant_document_chunks = retriever.get_relevant_documents(user_input)
context_list = list(map(lambda chunk: chunk.page_content, relevant_document_chunks))
context_for_query = ". ".join(context_list)

In [None]:
# Generate the answer that the judge prompts below will rate.
# The generator is called with `model_name` (the Q&A model), as in the Q&A cells
# above; `rater_model` is kept for the judging calls only, so the generator and
# the judge can be changed independently.
prompt = [
    {'role':'system', 'content': qna_system_message},
    {'role': 'user', 'content': qna_user_message_template.format(
         context=context_for_query,
         question=user_input
        )
    }
]

response = client.chat.completions.create(
    model=model_name,
    messages=prompt,
    temperature=0
)

# Text of the first choice, with surrounding whitespace removed.
answer = response.choices[0].message.content.strip()

In [None]:
# Show the generated answer that will be rated.
print(answer)

The annual revenue of the company in 2021 was $53.82 billion.


In [None]:
# Judge input for groundedness: question, retrieved context and generated answer
# are slotted into the user template, preceded by the judge's system instructions.
groundedness_user_message = groundedness_user_message_template.format(
    question=user_input,
    context=context_for_query,
    answer=answer
)
groundedness_prompt = [
    dict(role='system', content=groundedness_rater_system_message),
    dict(role='user', content=groundedness_user_message),
]

In [None]:
# Ask the judge model for its groundedness rating and show the full explanation.
response = client.chat.completions.create(
    messages=groundedness_prompt,
    model=rater_model,
    temperature=0
)

groundedness_rating_text = response.choices[0].message.content
print(groundedness_rating_text)

To evaluate the answer as per the metric, we need to check if the answer is derived only from the information presented in the context. 

Step-by-step explanation:
- The context provides information about the company's revenues and net income for the years 2020 and 2021.
- The context mentions that in 2021, the company recognized total revenues of $53.82 billion, representing an increase of $22.28 billion, or 70.64% compared to the prior year.
- The answer provided is consistent with the information presented in the context.
- Therefore, the answer adheres to the metric.

The extent to which the metric is followed:
The answer follows the metric completely as it is derived only from the information presented in the context.

Evaluation: 
Based on the evaluation criteria, the answer follows the metric completely, and hence, it deserves a score of 5.


In [None]:
# Judge input for relevance: only the question and the retrieved context are
# supplied, preceded by the judge's system instructions.
relevance_user_message = relevance_user_message_template.format(
    question=user_input,
    context=context_for_query
)
relevance_prompt = [
    dict(role='system', content=relevance_rater_system_message),
    dict(role='user', content=relevance_user_message),
]

In [None]:
# Ask the judge model for its relevance rating and show the full explanation.
response = client.chat.completions.create(
    messages=relevance_prompt,
    model=rater_model,
    temperature=0
)

relevance_rating_text = response.choices[0].message.content
print(relevance_rating_text)

To evaluate the context as per the metric, we need to check if the context provides information about the annual revenue of the company in 2021. We need to ensure that all the information in the context is relevant to the question.

The context mentions the total revenues of the company in 2020, which was $31.54 billion, and the increase in revenue compared to the prior year. It also mentions the net income attributable to common stockholders, operating margin, cash and cash equivalents, and capital expenditures during 2020. The context then talks about the impact of the COVID-19 pandemic and the change in net income attributable to noncontrolling interests and redeemable noncontrolling interests in subsidiaries in 2021 compared to 2020. It then provides the objectives of the company in 2021, which includes the total revenues of $53.82 billion, net income attributable to common stockholders of $5.52 billion, and annual vehicle delivery and production records. The context also mentions 