In [None]:
!apt install subversion
!mkdir docs
!svn checkout https://github.com/ksm26/LangChain-Chat-with-Your-Data/trunk/docs/cs229_lectures docs
!mkdir data
!wget https://raw.githubusercontent.com/run-llama/llama_index/main/examples/paul_graham_essay/data/paul_graham_essay.txt -P ./data/
!wget https://raw.githubusercontent.com/langchain-ai/langchain/master/docs/docs/modules/state_of_the_union.txt -P ./data

In [None]:
!pip install cohere umap-learn altair datasets weaviate-client Annoy
!pip install sentence_transformers openai langchain chromadb pypdf rank_bm25
!pip install -U duckduckgo-search

In [None]:
import os
import re
import json
import requests
import numpy as np
import pandas as pd
import openai

# Never hard-code the key in the notebook. Reuse a key that is already exported in the
# environment; otherwise prompt for it without echoing it to the cell output.
from getpass import getpass

if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OPENAI_API_KEY: ')
openai.api_key = os.environ['OPENAI_API_KEY']

# Alternative: load the key from a local .env file.
# from dotenv import load_dotenv, find_dotenv
# _ = load_dotenv(find_dotenv())

# 1) RAG System

[RAG Tuning Strategy]( https://towardsdatascience.com/a-guide-on-12-tuning-strategies-for-production-ready-rag-applications-7ca646833439)

In [None]:
import chromadb
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.storage import InMemoryByteStore
from langchain.load import dumps, loads
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

## 1.1 Data Loading and Vector Store

In [None]:
### embedding loader
# Use the GPU when one is available and fall back to the CPU otherwise, so the notebook
# also runs on machines without CUDA.
import torch

embedding_device = "cuda" if torch.cuda.is_available() else "cpu"
embedding_model = HuggingFaceEmbeddings(
    model_name='BAAI/bge-base-en-v1.5',
    model_kwargs={"device": embedding_device},
    encode_kwargs={"device": embedding_device, "batch_size": 100}
)
# Example of embedding a batch by hand:
# embedding_model.embed_documents(batch["text"])
# result = {"text": batch["text"], "source": batch["source"], "embeddings": embeddings}

In [None]:
loaders = [
    # Duplicate documents on purpose - messy data
    PyPDFLoader("docs/MachineLearning-Lecture01.pdf"),
    PyPDFLoader("docs/MachineLearning-Lecture01.pdf"),
    PyPDFLoader("docs/MachineLearning-Lecture02.pdf"),
    PyPDFLoader("docs/MachineLearning-Lecture03.pdf")
]

# Collect every Document returned by each loader into one flat list.
pdf_docs = []
for loader in loaders:
    pdf_docs.extend(loader.load())

# Separators are tried in order: paragraph break, line break, end of sentence, space,
# and finally individual characters. The end-of-sentence separator is a regular
# expression (lookbehind for ". "), so it is written as a raw string and the splitter is
# told to interpret separators as regexes; without `is_separator_regex=True` the pattern
# is treated as literal text and never matches.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", r"(?<=\. )", " ", ""],
    is_separator_regex=True,
)
splits = text_splitter.split_documents(pdf_docs)
print('Total Number of Splitted Doc: ', len(splits))
print(f"\nSplit 10: {splits[10].page_content}\n\nSplit 11: {splits[11].page_content}\n")

Total Number of Splitted Doc:  287

Split 10: Similarly, every time you write a check, I ac tually don't know the number for this, but a 
significant fraction of checks that you write are processed by a learning algorithm that's 
learned to read the digits, so the dolla r amount that you wrote down on your check. So 
every time you write a check, there's anot her learning algorithm that you're probably 
using without even being aware of it.  
If you use a credit card, or I know at least one phone compan y was doing this, and lots of 
companies like eBay as well that do electr onic transactions, there's a good chance that 
there's a learning algorithm in the backgr ound trying to figure out if, say, your credit 
card's been stolen or if someone's engaging in a fraudulent transaction.  
If you use a website like Amazon or Netflix that will often recommend books for you to 
buy or movies for you to rent or whatever , these are other examples of learning

Split 11: buy or movies for you to

In [None]:
# Embed the PDF chunks and store them in a Chroma collection persisted under ./chroma/.
persist_directory = './chroma/'
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding_model,
    collection_name="pdf_documents",
    persist_directory=persist_directory,
)
pdf_chunk_count = vectordb._collection.count()
print('vectordb', pdf_chunk_count)

In [None]:
# Example of restricting retrieval with Chroma filters. 'KEYWORD' and the `category`
# value are placeholders to be replaced with real content / metadata values.
filtered_search_kwargs = {
    # number of documents to return
    "k": 4,

    # full-text condition on the chunk content
    "where_document": {'$contains': 'KEYWORD'},
    # "where_document": {
    #     "$or": [{"$contains": "search_string_1"}, {"$contains": "search_string_2"}]
    # },

    # metadata condition
    # "filter": {
    #     '$or': [
    #         {'source': {'$eq': './SampleDoc/Bikes.pdf'}},
    #         {'source': {'$eq': './SampleDoc/IceCreams.pdf'}}
    #     ]
    # }
    "filter": {"category": "commercial"},
}
retriever = vectordb.as_retriever(search_kwargs=filtered_search_kwargs)

## 1.2 Retrieval Method

[Retrieval Method Overview](https://pub.towardsai.net/advanced-rag-techniques-an-illustrated-overview-04d193d8fec6)

### 1.2.1 Baseline

In [None]:
from langchain.utilities import DuckDuckGoSearchAPIWrapper
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

In [None]:
# Prompt for the baseline chain. Each backslash joins the next line onto the current
# one, so a space is kept before it; otherwise the words run together
# ("```using", ".If") in the text sent to the model.
template = """\
Give the answer to the user query delimited by triple backticks ```{question}``` \
using the information given in context delimited by triple backticks ```{context}```. \
If there is no relevant information in the provided context, try to answer yourself,
but tell user that you did not have any relevant context to base your answer on.
Be concise and output the answer of size less than 100 tokens.
"""

prompt = ChatPromptTemplate.from_template(template)
model = ChatOpenAI(max_retries=0, model="gpt-3.5-turbo", temperature=0)
search = DuckDuckGoSearchAPIWrapper()
def retriever(query):
    """Return the DuckDuckGo search result text for `query`."""
    return search.run(query)

# RunnablePassthrough for unchanged input
# use duckduckgo to search for answer and fit into prompt template
baseline_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

simple_query = "What is Meta's revenue on 2023 Q3"
print(baseline_chain.invoke(simple_query))
print('\n')

Meta's revenue for 2023 Q3 was $34.15 billion.




### 1.2.2 Hybrid Search

Most commonly refers to the combination of traditional **keyword-based** search and modern vector search.

For example, consider the search query “How to merge two Pandas DataFrames with .concat()?”. The keyword search would help find relevant results for the method .concat(). However, since the word “merge” has synonyms such as “combine”, “join”, and “concatenate”, it would be helpful if we could leverage the context awareness of semantic search.

The only trick here is to properly combine the retrieved results with different similarity scores — this problem is usually solved with the help of the Reciprocal Rank Fusion algorithm (see 1.2.5), reranking the retrieved results for the final output.

**hybrid_score** = (1 - alpha) * sparse_score + alpha * dense_score

![](https://miro.medium.com/v2/resize:fit:1100/format:webp/0*0pQbhBEez7U-2knd.png)


**Best Match 25**

The most commonly used algorithm for sparse embeddings (keyword search) is BM25.
[Understanding TFIDF and BM25](https://kmwllc.com/index.php/2020/03/20/understanding-tf-idf-and-bm-25/)

1. The TF/(TF + k) trick is the backbone of BM25: it controls the contribution of TF to the score in a tunable way. Assuming the IDF of two terms is the same, it's always better to have one instance of each term than to have two instances of one of them.

2. Reward matches in short documents, while penalizing matches in long documents. Achieve this by multiplying k by the ratio dl/adl. Here, dl is the document's length, and adl is the average document length across the corpus. Adjust k up if the document is longer than average, thus reducing TF/(TF + k). A new parameter b is added into the mix (between 0 and 1).

3. BM25 handles document frequency with log(1 + (N - DF + .5)/(DF + .5)).

In [None]:
# Sparse (keyword) retriever: BM25 over the current `splits`, returning the top 2 hits.
bm25_retriever = BM25Retriever.from_documents(splits)
bm25_retriever.k = 2

# Dense retriever: MMR search over the Chroma store.
mmr_retriever = vectordb.as_retriever(search_type="mmr")

# alternative to chroma: FAISS
# faiss_vectorstore = FAISS.from_texts(doc_list, embedding)
# faiss_retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 2})

# Hybrid search: combine both retrievers with equal weights.
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, mmr_retriever],
    weights=[0.5, 0.5],
)

In [None]:
# Run one question through the hybrid retriever and print each hit, blank-line separated.
question = "What is central limit theorem"
docs = ensemble_retriever.get_relevant_documents(question)
for doc in docs:
    print(doc.page_content, '\n')

density of our epsilon I will be  this bell-shaped curve with one standard deviation being 
a, sort of, sigma. Okay? This is form for that  bell-shaped curve. So, let’s see. I can erase 
that. Can I erase the board? So this implies that  the probability distri bution of a price of a 
house given in si and the parameters theta, th at this is going to be  Gaussian with that 
density. Okay? In other words, saying goes as that the price of a hous e given the features 
of the house and my parameters theta, this  is going to be a random variable that’s 
distributed Gaussian with mean theta tran spose XI and variance sigma squared. Right? 
Because we imagine that the way the housing pr ices are generated is that the price of a 
house is equal to theta transpose XI and th en plus some random Gaussian noise with 
variance sigma squared. So the price of a house is going to have mean theta transpose XI, 
again, and sigma squared, right? Does this make sense? Raise your hand if this makes 
sense. 

### 1.2.3 Multi-Vector Retriever

Ways to create multiple vectors per document include:

* Smaller chunks: split a document into smaller chunks, and embed those (this is ParentDocumentRetriever).
* Summary: create a summary for each document, embed that along with (or instead of) the document.
* Hypothetical questions: create hypothetical questions that each document would be appropriate to answer, embed those along with (or instead of) the document.

#### **Parent Document Retrieval**

Fetch smaller chunks during retrieval first; then, if more than n of the top-k retrieved chunks are linked to the same parent node (larger chunk), replace them with that parent node as the context passed downstream. This allows for embeddings to capture the semantic meaning as closely as possible, but for as much context as possible to be passed downstream.

![](https://miro.medium.com/v2/resize:fit:1100/format:webp/0*x4rMd50GP99OSDuo.png)

In [None]:
import uuid
from langchain.retrievers.multi_vector import MultiVectorRetriever

In [None]:
### short and long chunk
# Load the two plain-text corpora into one flat list of documents.
loaders = [
    TextLoader(path)
    for path in ("./data/paul_graham_essay.txt", "./data/state_of_the_union.txt")
]
text_docs = [document for loader in loaders for document in loader.load()]

# Parent (longer) chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
splits = text_splitter.split_documents(text_docs)

# Child (shorter) chunks: every child carries the id of the parent chunk it was cut from.
id_key = "doc_id"
doc_ids = [str(uuid.uuid4()) for _ in splits]
child_text_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

sub_docs = []
for parent_id, parent_doc in zip(doc_ids, splits):
    children = child_text_splitter.split_documents([parent_doc])
    for child in children:
        child.metadata[id_key] = parent_id
    sub_docs.extend(children)

# Index the child chunks in the existing Chroma store.
vectordb.add_documents(sub_docs)

In [None]:
# Retriever that searches the child chunks in `vectordb` and returns the parent chunk
# stored under the matching `doc_id` in the in-memory byte store.
store = InMemoryByteStore()
retriever = MultiVectorRetriever(
    vectorstore=vectordb,
    byte_store=store,
    id_key=id_key,
)
# `doc_ids` holds one id per parent chunk in `splits`, so the parents must be stored from
# `splits`. Zipping with `text_docs` (the whole files) would truncate to the first ids and
# map them to the wrong, un-chunked documents.
retriever.docstore.mset(list(zip(doc_ids, splits)))

In [None]:
### search embedding of small chunk and return the id of big chunk
parent_demo_query = "justice breyer"

# Direct vector search returns the small child chunk ...
sub_docs = vectordb.similarity_search(parent_demo_query)
print(len(sub_docs[0].page_content))

# ... while the multi-vector retriever returns the document stored for its parent id.
retrieved_docs = retriever.get_relevant_documents(parent_demo_query)
print(len(retrieved_docs[0].page_content))

390
9874


#### **Summary**

Summary may be able to distill more accurately what a chunk is about, leading to better retrieval. Here we show how to create summaries, and then embed those.

In [None]:
# Chain that turns a Document into an LLM-written summary string.
summary_prompt = ChatPromptTemplate.from_template("Summarize the following document:\n\n{doc}")
summary_llm = ChatOpenAI(max_retries=0, model="gpt-3.5-turbo")
chain = (
    {"doc": lambda x: x.page_content}
    | summary_prompt
    | summary_llm
    | StrOutputParser()
)

# Summarize every document, running at most two requests concurrently.
summaries = chain.batch(text_docs, {"max_concurrency": 2})
print(summaries[0])

The author discusses their early interests in writing and programming before college. They recall working on short stories and programming on an IBM 1401 computer in 9th grade. They describe their fascination with microcomputers and their experience with programming on a TRS-80. The author initially planned to study philosophy in college but switched to AI after finding philosophy courses boring. They detail their interest in AI, their self-teaching of Lisp, and their undergraduate thesis on reverse-engineering SHRDLU. They discuss their realization during graduate school that AI was a hoax and their decision to focus on Lisp. The author also reflects on the uneasy alliance between theory and systems in computer science and their desire to build things that would last. They mention a visit to the Carnegie Institute where they realized the permanence of paintings and the possibility of making a living as an artist.


In [None]:
# Index the summaries for search, but return the full documents they were written from.
id_key = "doc_id"
doc_ids = [str(uuid.uuid4()) for _ in text_docs]

vector_db1 = Chroma(collection_name="summaries", embedding_function=embedding_model)
store = InMemoryByteStore()
retriever = MultiVectorRetriever(
    vectorstore=vector_db1,
    byte_store=store,
    id_key=id_key,
)

# One summary Document per source document, tagged with that document's id.
summary_docs = [
    Document(page_content=summary, metadata={id_key: doc_id})
    for doc_id, summary in zip(doc_ids, summaries)
]
retriever.vectorstore.add_documents(summary_docs)
retriever.docstore.mset(list(zip(doc_ids, text_docs)))

# can also add the original chunks or smaller chunk to the vector_db1
# for i, doc in enumerate(text_docs):
#     doc.metadata[id_key] = doc_ids[i]
# retriever.vectorstore.add_documents(text_docs)

summary_demo_query = "justice breyer"

# can skip this in production: shows the summary that matched the query
sub_docs = vector_db1.similarity_search(summary_demo_query)
print(sub_docs[0])

# the retriever searches the summary embeddings, then follows the id to the full document
retrieved_docs = retriever.get_relevant_documents(summary_demo_query)
print(len(retrieved_docs[0].page_content))

page_content='The document discusses various actions and plans of the administration. It mentions the appointment of a chief prosecutor for pandemic fraud, the reduction of the deficit, the crackdown on companies overcharging American businesses and consumers, the improvement of nursing home quality, and the implementation of various measures to support workers and families. It also addresses the progress made in the fight against COVID-19, including the new mask guidelines and the availability of vaccines, treatments, tests, and masks. The document emphasizes the need to prepare for new variants, reopen schools and businesses, and continue vaccinating the world. Additionally, it touches on issues related to crime prevention, law enforcement reform, gun violence, voting rights, and the retirement of Justice Stephen Breyer from the Supreme Court.' metadata={'doc_id': '03080b02-9337-4840-8e92-5d9c403e6052'}
9874


#### **Hypothetical Queries**

Use LLM to generate a list of hypothetical questions that could be asked of a particular document. These questions can then be embedded.

In [None]:
# Function schema the model is forced to call: a single required field `questions`
# holding an array of strings.
functions = [
    {
        "name": "hypothetical_questions",
        "parameters": {
            "type": "object",
            "properties": {
                "questions": {"type": "array", "items": {"type": "string"}},
            },
            "required": ["questions"],
        },
    }
]

question_prompt = ChatPromptTemplate.from_template(
    "Generate a list of exactly 3 hypothetical questions that the below document could be used to answer:\n\n{doc}"
)
question_llm = ChatOpenAI(max_retries=0, model="gpt-3.5-turbo").bind(
    functions=functions, function_call={"name": "hypothetical_questions"}
)

# Document -> prompt -> forced function call -> value of the "questions" key.
chain = (
    {"doc": lambda x: x.page_content}
    | question_prompt
    | question_llm
    | JsonKeyOutputFunctionsParser(key_name="questions")
)

example_questions = chain.invoke(text_docs[0])
print(f"Examples: {example_questions}")
hypothetical_questions = chain.batch(text_docs, {"max_concurrency": 2})
print(hypothetical_questions[0])

Examples: ['What were the main things the author worked on before college?', 'What were the programming languages and machines the author used in their early programming experiences?', 'Why did the author decide to focus on Lisp and write a book about Lisp hacking?']
["How did the author's experience with programming on the IBM 1401 shape their perspective on computers?", 'What motivated the author to switch from studying philosophy to AI?', "How did the author's perception of AI change during their first year of grad school?"]


In [None]:
### adding embedding and question list into chroma
vector_db2 = Chroma(
    embedding_function=embedding_model,
    collection_name="hypo-questions",
)
doc_ids = [str(uuid.uuid4()) for _ in text_docs]

# Flatten the per-document question lists into Documents tagged with the source doc id.
question_docs = [
    Document(page_content=generated_question, metadata={id_key: doc_ids[doc_index]})
    for doc_index, generated_questions in enumerate(hypothetical_questions)
    for generated_question in generated_questions
]

vector_db2.add_documents(question_docs)

In [None]:
# Build a retriever on top of the hypothetical-question store. Reusing the previous
# `retriever` would keep searching the vector store it was created with, so the
# questions embedded in `vector_db2` would never be consulted.
store = InMemoryByteStore()
retriever = MultiVectorRetriever(
    vectorstore=vector_db2,
    byte_store=store,
    id_key=id_key,
)
retriever.docstore.mset(list(zip(doc_ids, text_docs)))

# Questions that matched the query ...
sub_docs = vector_db2.similarity_search("What's your stand on sexual minority")
print([d.page_content for d in sub_docs])

# ... and the documents those questions were generated from.
retrieved_docs = retriever.get_relevant_documents("What's your stand on sexual minority")
print(retrieved_docs)

['How does the President plan to protect the rights of women and the LGBTQ+ community?', 'What advantages do independent-minded individuals have in fields affected by rapid change?', 'What led the author to consider quitting Y Combinator?', "What is the President's plan to beat the opioid epidemic and address mental health issues?"]
[Document(page_content='One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. \n\nA former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Demo

### 1.2.3 Auto Merging Retrieval

`MergerRetriever` takes a list of retrievers as input and merges the results of their get_relevant_documents() methods into a single list. The merged results are a list of documents that are relevant to the query and that have been ranked by the different retrievers.

In [None]:
from langchain.retrievers.merger_retriever import MergerRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.document_transformers import LongContextReorder
from langchain.document_transformers import (
    EmbeddingsClusteringFilter,
    EmbeddingsRedundantFilter,
)

In [None]:
# Build two Chroma stores over the same text corpus, each with its own embedding model,
# chunk size and persist directory.
all_mini = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
multi_qa_mini = HuggingFaceEmbeddings(model_name="multi-qa-MiniLM-L6-dot-v1")

# db1: shorter chunks, plain similarity search
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
splits = text_splitter.split_documents(text_docs)
db_1 = Chroma(persist_directory='./chroma1', embedding_function=all_mini)
db_1.add_documents(splits)
print('DB1 count: ', db_1._collection.count())

retriever_1 = db_1.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2, "include_metadata": True},
)

# db2: longer chunks, MMR search
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000)
splits = text_splitter.split_documents(text_docs)
db_2 = Chroma(persist_directory='./chroma2', embedding_function=multi_qa_mini)
db_2.add_documents(splits)
print('DB2 count: ', db_2._collection.count())

retriever_2 = db_2.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 2, "include_metadata": True},
)

DB1 count:  154
DB2 count:  12


Reorder the documents:
Less relevant document will be at the middle of the list and more relevant elements at beginning / end.

In [None]:
# combine retriever
lotr = MergerRetriever(retrievers=[retriever_1, retriever_2])

# remove redundant results from both retrievers using yet another embedding
# additional document transformer to reorder documents after removing redundancy
filter_embed = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
# Named `redundant_filter` rather than `filter` so the Python builtin is not shadowed
# for the rest of the notebook.
redundant_filter = EmbeddingsRedundantFilter(embeddings=filter_embed, similarity_threshold=0.9)
reordering = LongContextReorder()
pipeline = DocumentCompressorPipeline(transformers=[redundant_filter, reordering])
compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline, base_retriever=lotr
)

res = compression_retriever.get_relevant_documents("How to deal with bad economy?")
print(f"Total number of Retrieved Documents: {len(res)}")
print(f'\n### Result ### \n: {res[0].page_content}')



Total number of Retrieved Documents: 4

### Result ### 
: But in my administration, the watchdogs have been welcomed back. 

We’re going after the criminals who stole billions in relief money meant for small businesses and millions of Americans.  

And tonight, I’m announcing that the Justice Department will name a chief prosecutor for pandemic fraud. 

By the end of this year, the deficit will be down to less than half what it was before I took office.  

The only president ever to cut the deficit by more than one trillion dollars in a single year. 

Lowering your costs also means demanding more competition. 

I’m a capitalist, but capitalism without competition isn’t capitalism. 

It’s exploitation—and it drives up prices. 

When corporations don’t have to compete, their profits go up, your prices go up, and small businesses and family farmers and ranchers go under. 

We see it happening with ocean carriers moving goods in and out of America. 

During the pandemic, these foreign-owne

In [None]:
# reranking using cohere
# from langchain.retrievers.document_compressors import CohereRerank
# llm = OpenAI(temperature=0)
# compressor = CohereRerank()
# compression_retriever = ContextualCompressionRetriever(
#     base_compressor=compressor, base_retriever=retriever
# )
# compressed_docs = compression_retriever.get_relevant_documents(
#     "What did the president say about Ketanji Jackson Brown"
# )

### 1.2.4 Query Rewriting

![](https://pbs.twimg.com/media/F8-iFxfaEAA77Nc?format=jpg&name=900x900)

In [None]:
# unable to answer due to bad query
distracted_query = "man that sam bankman fried trial was crazy! What is Meta's revenue on 2023 Q3"
# Call the web-search wrapper directly. When the notebook is run top to bottom,
# `retriever` has been rebound to a MultiVectorRetriever object by the multi-vector
# section, and calling it like a function would raise a TypeError.
print(f'Context: {search.run(distracted_query)}')
print(baseline_chain.invoke(distracted_query))

Context: Indicted FTX founder Sam Bankman-Fried leaves the United States Courthouse in New York City, U.S., July 26, 2023. Amr Alfiky/Reuters New York CNN — Sam Bankman-Fried was found guilty... Blake Montgomery The trial of the FTX founder has ended after a month with a guilty verdict. What did we learn? Thu 2 Nov 2023 20.29 EDT Former crypto mogul Sam Bankman-Fried was... Sam Bankman-Fried in a July 2023 picture. Sam Bankman-Fried was found guilty Thursday following a monthlong trial that was almost as wild as the rapid rise and fall of cryptocurrency exchange FTX ... Nov. 2, 2023 Sam Bankman-Fried, the tousle-haired mogul who founded the FTX cryptocurrency exchange, was convicted on Thursday of seven charges of fraud and conspiracy after a monthlong... Indicted FTX founder Sam Bankman-Fried leaves the United States Courthouse in New York City, U.S., July 26, 2023. New York (CNN) — Sam Bankman-Fried, once known as a cryptocurrency whiz kid, was ...
I'm sorry, but I cannot provide an 

In [None]:
# Prompt asking the model for a cleaner web-search query, terminated by a `**` marker.
template = (
    "Provide a better search query for "
    "web search engine to answer the given question, end "
    "the queries with ’**’. Question: "
    "{x} Answer:"
)
rewrite_prompt = ChatPromptTemplate.from_template(template)
print(rewrite_prompt)

input_variables=['x'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['x'], template='Provide a better search query for web search engine to answer the given question, end the queries with ’**’. Question: {x} Answer:'))]
content="Meta's revenue 2023 Q3**"


In [None]:
def _parse(text):
    '''Parser to remove the `**`'''
    return text.strip("**")
model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo", max_retries=0)

# prompt -> LLM -> plain string -> strip the `**` end marker
rewriter = rewrite_prompt | model | StrOutputParser() | _parse

rewritten_query = rewriter.invoke({"x": distracted_query})
print(rewritten_query)

Meta's revenue 2023 Q3


In [None]:
# Answering prompt: the model may only use the retrieved context.
template = """Answer the users question based only on the following context:
<context>
{context}
</context>
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = ChatOpenAI(temperature=0, model="gpt-3.5-turbo", max_retries=0)
search = DuckDuckGoSearchAPIWrapper()


def retriever(query):
    """Return the DuckDuckGo search result text for `query`."""
    return search.run(query)


# Context branch: rewrite the raw question first, then search the web with the rewrite.
rewritten_context = {"x": RunnablePassthrough()} | rewriter | retriever

rewrite_chain = (
    {"context": rewritten_context, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)
rewrite_chain.invoke(distracted_query)

'Based on the given context, the revenue for Meta in the third quarter of 2023 is reported to be up 23% compared to the previous year. However, the specific revenue amount is not mentioned.'

### 1.2.5 Multi Query Fusion

1. Query Generation: The system starts by generating multiple queries from a user's initial query using OpenAI's GPT model.

2. Vector Search: Conducts vector-based searches on each of the generated queries to retrieve relevant documents from a predefined set.

3. Reciprocal Rank Fusion: Applies the Reciprocal Rank Fusion algorithm to re-rank the documents based on their relevance across multiple queries.

4. Output Generation: Produces the final output from the re-ranked documents, i.e. the results retrieved for the different queries, combined and re-ranked with the Reciprocal Rank Fusion algorithm.

In [None]:
# chain to create multiple query from original
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant that generates multiple search queries based on a single input query."),
    ("user", "Generate multiple search queries related to: {original_query}"),
    ("user", "OUTPUT (4 queries):")
])

original_query = "What's your stand on sexual minority"
# The model answers with one query per line. Blank lines are dropped so that empty
# strings are never passed on as search queries.
generate_queries = (
    prompt
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | (lambda x: [line.strip() for line in x.split("\n") if line.strip()])
)

print(generate_queries.invoke({"original_query": original_query}))

['1. What are the different perspectives on sexual minority rights?', '2. How do different cultures and societies view sexual minority individuals?', '3. What are the current laws and policies regarding sexual minority rights?', '4. What are the common misconceptions and stereotypes about sexual minority individuals?']


In [None]:
# Chunk the text corpus (1000 characters per chunk) and index it in a dedicated
# collection for the fusion example.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)
splits = text_splitter.split_documents(text_docs)

vector_db3 = Chroma(
    embedding_function=embedding_model,
    collection_name="rag-fusion",
)
vector_db3.add_documents(splits)

retriever = vector_db3.as_retriever()

In [None]:
def reciprocal_rank_fusion(results: list[list], k=60):
    """Fuse several ranked result lists into one ranking (Reciprocal Rank Fusion).

    Args:
        results: one list of documents per query; each list is assumed to be sorted
            from most to least relevant.
        k: constant added to the zero-based rank before taking the reciprocal; a
            larger k decreases the effect of the rank on the fused score.

    Returns:
        A list of ``(document, fused_score)`` tuples sorted by descending score.
        Documents are identified by their serialized form, so the same document
        returned for several queries accumulates one score.
    """
    fused_scores = {}

    for docs in results:
        for rank, doc in enumerate(docs):
            # Serialize the document so it can be used as a dictionary key.
            doc_str = dumps(doc)
            fused_scores[doc_str] = fused_scores.get(doc_str, 0) + 1 / (rank + k)

    ranked = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(loads(doc_str), score) for doc_str, score in ranked]

In [None]:
# Generate the queries, run the retriever once per generated query, then fuse the
# per-query result lists into a single ranking.
retrieve_per_query = retriever.map()
chain = generate_queries | retrieve_per_query | reciprocal_rank_fusion
chain.invoke({"original_query": original_query})

[(Document(page_content='That’s why immigration reform is supported by everyone from labor unions to religious leaders to the U.S. Chamber of Commerce. \n\nLet’s get it done once and for all. \n\nAdvancing liberty and justice also requires protecting the rights of women. \n\nThe constitutional right affirmed in Roe v. Wade—standing precedent for half a century—is under attack as never before. \n\nIf we want to go forward—not backward—we must protect access to health care. Preserve a woman’s right to choose. And let’s continue to advance maternal health care in America. \n\nAnd for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \n\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential.', metadata={'source': './data/state_of_the_union.txt'}),
  0.0

### 1.2.6 StepBack Prompting

"Step-Back" prompting can improve performance on complex questions by first asking a "step back" question. This can be combined with regular question-answering applications by then doing retrieval on both the original and step-back question.



In [None]:
from langchain.prompts import ChatPromptTemplate, FewShotChatMessagePromptTemplate
from langchain_core.runnables import RunnableLambda

In [None]:
# Few-shot pairs: a specific question and the more generic "step-back" version of it.
examples = [
    {"input": specific_question, "output": generic_question}
    for specific_question, generic_question in (
        (
            "Could the members of The Police perform lawful arrests?",
            "what can the members of The Police do?",
        ),
        (
            "Jan Sindel’s was born in what country?",
            "what is Jan Sindel’s personal history?",
        ),
    )
]

# Each example is rendered as a human message followed by the AI reply.
example_prompt = ChatPromptTemplate.from_messages([("human", "{input}"), ("ai", "{output}")])

few_shot_prompt = FewShotChatMessagePromptTemplate(
    examples=examples,
    example_prompt=example_prompt,
)

In [None]:
# Chain that rewrites a question into a more generic step-back question.
# The continuation lines keep their original indentation because it is part of the
# string sent to the model.
step_back_system_message = """You are an expert at world knowledge. \
            Your task is to step back and paraphrase a question to a \
            more generic step-back question, which is easier to answer. \
            Here are a few examples:
            """

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", step_back_system_message),
        few_shot_prompt,
        ("user", "{question}"),
    ]
)

model = ChatOpenAI(temperature=0, model="gpt-4", max_retries=0)
question_gen = prompt | model | StrOutputParser()

question = "was chatgpt around while trump was president?"
print(question_gen.invoke({"question": question}))

When was ChatGPT developed?


In [None]:
# Web search wrapper configured with max_results=4.
search = DuckDuckGoSearchAPIWrapper(max_results=4)


def retriever(query):
    """Return the DuckDuckGo search result text for `query`."""
    return search.run(query)


# Compare what the search returns for the original question and for its step-back form.
content_without_step_back = retriever(question)
print(f"Content Without Step Back: {content_without_step_back}\n")

stepback_question = question_gen.invoke({"question": question})
content_with_step_back = retriever(stepback_question)
print(f"Content With Step Back: {content_with_step_back}")

Content Without Step Back: 10 min read The release of OpenAI's ChatGPT in late 2022 made a splash in the tech world and beyond. A December 2022 Harvard Business Review article termed it a "tipping point for AI," calling it... True About this rating On Jan. 30, 2023, Twitter user @echo_chamberz tweeted that the OpenAI tool ChatGPT (Chat Generative Pre-Trained Transformer) declined a request to write an AI-generated... On Wednesday, a Twitter user posted screenshots of him asking OpenAI's chatbot, ChatGPT, to write a positive poem about former President Donald Trump, to which the chatbot declined, citing... After ChatGPT wrote a poem praising President Biden, but refused to write one praising former president Donald Trump, the creative director for Sen. Ted Cruz (R-Tex.), Leigh Wolf, lashed out.

Content With Step Back: ChatGPT is reshaping the world's interactions with technology and influencing a wide range of industries. Learn more about its inception, key milestones, and wide-ranging

In [None]:
### Prompt chain that answers the original question from both retrieved contexts

# The template has three slots: the context retrieved for the original question,
# the context retrieved for the step-back question, and the original question.
response_prompt_template = """\
You are an expert of world knowledge. \
I am going to ask you a question. Your response should be comprehensive \
and not contradicted with the following context if they are relevant. \
Otherwise, ignore them if they are not relevant.

{normal_context}
{step_back_context}

Original Question: {question}
Answer:"""

response_prompt = ChatPromptTemplate.from_template(response_prompt_template)

chain = (
    {
        # Retrieve with the question exactly as asked.
        "normal_context": RunnableLambda(lambda x: x["question"]) | retriever,
        # Generate the step-back question first, then retrieve with it.
        "step_back_context": question_gen | retriever,
        # Pass the original question through unchanged.
        "question": lambda x: x["question"],
    }
    | response_prompt | model | StrOutputParser()
)
chain.invoke({"question": question})

"No, ChatGPT was not around while Trump was president. According to the context, OpenAI introduced ChatGPT at the end of 2022. Donald Trump's presidency ended in January 2021, so ChatGPT was released after his term."

### 1.2.7 Sentence Window Retrieval

## 1.3 Reranker

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

### 1.3.1 HuggingFace

In [None]:
# Load the BAAI/bge-reranker-large tokenizer and sequence-classification model.
# NOTE(review): this rebinds `model`, which the step-back cell bound to a
# ChatOpenAI instance — confirm nothing run afterwards expects the chat model.
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-reranker-large')
model = AutoModelForSequenceClassification.from_pretrained('BAAI/bge-reranker-large')
model.eval()

# Each pair is [query, candidate passage]; one score is printed per pair.
pairs = [['what is panda?', 'hi'], ['what is panda?', 'The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China.']]
with torch.no_grad():
    # Pad to a common length and truncate anything beyond max_length=512.
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors='pt', max_length=512)
    # Flatten the logits to a 1-D float tensor.
    scores = model(**inputs, return_dict=True).logits.view(-1, ).float()
    print(scores)

### 1.3.2 LangChain Reorder

*Lost in the middle: The problem with long contexts.*
No matter the architecture of your model, there is a substantial performance degradation when you include 10+ retrieved documents. In brief: When models must access relevant information in the middle of long contexts, they tend to ignore the provided documents.

After reordering, less relevant documents sit in the middle of the list and more relevant ones at the beginning and end.

In [None]:
from langchain.document_transformers import LongContextReorder

In [None]:
# Reorder the retrieved documents with LongContextReorder.
reordering = LongContextReorder()
# NOTE(review): in the step-back cell `retriever` is defined as a plain function
# (no `.get_relevant_documents`), and `query` is not defined in the nearby cells —
# confirm which retriever object and query are intended when running top to bottom.
docs = retriever.get_relevant_documents(query)
reordered_docs = reordering.transform_documents(docs)

### 1.3.3 SentenceTransformer

In [None]:
from sentence_transformers import CrossEncoder
# Cross-encoder used to score every (query, document) pair.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

pairs = [[query, passage] for passage in retrieved_documents]
scores = cross_encoder.predict(pairs)

print("Scores:")
for pair_score in scores:
    print(pair_score)

# Indices sorted from highest to lowest score, shown 1-based.
descending_order = np.argsort(scores)[::-1]
print("New Ordering:")
for position in descending_order:
    print(position + 1)

## 1.4 Fine Tuning Embedding Model

[Guide to Train Sentence Transformer](https://huggingface.co/blog/how-to-train-sentence-transformers)

Most dataset configurations will take one of four forms (below you will see examples of each case):

1. The example is a pair of sentences and a label indicating how similar they are. The label can be either an integer or a float. E.g. [SNLI Dataset](https://huggingface.co/datasets/snli). The loss function depends on the format of the label. If it's an integer, use ContrastiveLoss or SoftmaxLoss; if it's a float, you can use CosineSimilarityLoss. The loss function optimizes such that
    * sentences with the closest labels are near in the vector space
    * sentences with the farthest labels are as far as possible.

2. The example is a pair of positive (similar) sentences without a label. E.g. [Sentence Compression](https://huggingface.co/datasets/embedding-data/sentence-compression?row=0). Having your data in this format can be great since you can use the MultipleNegativesRankingLoss, one of the most used loss functions for Sentence Transformers models.

3. The example is a sentence with an integer label. Each sentence has an integer label indicating the class to which it belongs. E.g. [Trec](https://huggingface.co/datasets/trec). This data format is easily converted by loss functions into three sentences (triplets) where the first is an "anchor", the second a "positive" of the same class as the anchor, and the third a "negative" of a different class. You can use BatchHardTripletLoss, which requires the data to be labeled with integers (e.g., labels 1, 2, 3) assuming that samples with the same label are similar. Therefore, anchors and positives must have the same label, while negatives must have a different one.

4.  The example is a triplet (anchor, positive, negative) without classes or labels for the sentences. E.g. [QQP_triplets](https://huggingface.co/datasets/embedding-data/QQP_triplets). If you don't have a label for each sentence in the triplets, you should use TripletLoss. This loss minimizes the distance between the anchor and the positive sentences while maximizing the distance between the anchor and the negative sentences.

### 1.4.1 Sentence Transformer

In [None]:
!pip install sentence-transformers datasets

In [None]:
from sentence_transformers import InputExample, SentenceTransformer, models
from sentence_transformers import losses
from torch.utils.data import DataLoader
from datasets import load_dataset

In [None]:
# Option 1: assemble a SentenceTransformer from a plain transformer plus a pooling layer.
word_embedding_model = models.Transformer('distilroberta-base')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Option 2: load a ready-made sentence-transformers checkpoint.
# NOTE: this assignment replaces the `model` built just above, so only the
# all-MiniLM-L6-v2 model is used by the following cells.
model_id = "sentence-transformers/all-MiniLM-L6-v2"
model = SentenceTransformer(model_id)

In [None]:
# Dataset used for fine-tuning; the commented-out id is the alternative pair dataset.
dataset_id = "embedding-data/QQP_triplets"
# dataset_id = "embedding-data/sentence-compression"

dataset = load_dataset(dataset_id)
# Inspect the size and the structure of one example (the payload sits under the 'set' key).
print(f"- The {dataset_id} dataset has {dataset['train'].num_rows} examples.")
print(f"- Each example is a {type(dataset['train'][0])} with a {type(dataset['train'][0]['set'])} as value.")
print(f"- Examples look like this: {dataset['train'][0]}")

- The embedding-data/QQP_triplets dataset has 101762 examples.
- Each example is a <class 'dict'> with a <class 'dict'> as value.
- Examples look like this: {'set': {'query': 'Why in India do we not have one on one political debate as in USA?', 'pos': ['Why cant we have a public debate between politicians in India like the one in US?'], 'neg': ['Can people on Quora stop India Pakistan debate? We are sick and tired seeing this everyday in bulk?', 'Why do politicians, instead of having a decent debate on issues going in and around the world, end up fighting always?', 'Can educated politicians make a difference in India?', 'What are some unusual aspects about politics and government in India?', 'What is debate?', 'Why does civic public communication and discourse seem so hollow in modern India?', 'What is a Parliamentary debate?', "Why do we always have two candidates at the U.S. presidential debate. yet the ballot has about 7 candidates? Isn't that a misrepresentation of democracy?", 'Wh

In [None]:
train_data = dataset['train']['set']
# Only the first half of the training rows is used.
n_examples = dataset['train'].num_rows // 2

# One (anchor, positive, negative) triplet per row, built from the query and the
# first entry of its 'pos' and 'neg' lists.
train_examples = [
    InputExample(texts=[row['query'], row['pos'][0], row['neg'][0]])
    for row in (train_data[idx] for idx in range(n_examples))
]

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

In [None]:
# Triplet loss over the (anchor, positive, negative) examples built above.
train_loss = losses.TripletLoss(model=model)
num_epochs = 10
# Warm-up steps = 10% of the total number of batches across all epochs
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps
)

In [None]:
### another model with different dataset ###
modelB = SentenceTransformer('embedding-data/distilroberta-base-sentence-transformer')
datasetB = load_dataset("embedding-data/sentence-compression")
print(f"Examples look like this: {datasetB['train']['set'][0]}")

# Read rows and the row count from datasetB (sentence pairs). The triplet
# `dataset` loaded earlier stores dicts under 'set', so indexing its rows with
# [0] / [1] would fail.
train_dataB = datasetB['train']['set']
n_examplesB = datasetB['train'].num_rows

# preparing dataset: each row is a pair of similar sentences
train_examplesB = []
for i in range(n_examplesB):
    example = train_dataB[i]
    train_examplesB.append(InputExample(texts=[example[0], example[1]]))
train_dataloaderB = DataLoader(train_examplesB, shuffle=True, batch_size=64)

# defining loss function
train_lossB = losses.MultipleNegativesRankingLoss(model=modelB)
num_epochsB = 10
# Warm-up steps = 10% of the total number of batches across all epochs
warmup_stepsB = int(len(train_dataloaderB) * num_epochsB * 0.1)

modelB.fit(
    train_objectives=[(train_dataloaderB, train_lossB)],
    epochs=num_epochsB,
    warmup_steps=warmup_stepsB
)

### 1.4.2 Embedding Adaptor

In [None]:
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

In [None]:
### generate synthetic data ###
def generate_queries(model="gpt-3.5-turbo"):
    """Ask a chat model for short questions to ask when analyzing an annual report.

    Args:
        model: Name of the chat-completion model to request.

    Returns:
        list[str]: One question per entry, whitespace-stripped, with blank
        lines from the model's reply dropped.
    """
    messages = [
        {
            "role": "system",
            # Adjacent string literals are concatenated, so each sentence must
            # end with a space to avoid running into the next one.
            "content": "You are a helpful expert financial research assistant. You help users analyze financial statements to better understand companies. "
            "Suggest 10 to 15 short questions that are important to ask when analyzing an annual report. "
            "Do not output any compound questions (questions with multiple sentences or conjunctions). "
            "Output each question on a separate line divided by a newline."
        },
    ]

    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
    )
    content = response.choices[0].message.content or ""
    # Drop blank lines so that empty strings are never embedded or queried downstream.
    return [line.strip() for line in content.split("\n") if line.strip()]

def evaluate_results(query, statement, model="gpt-3.5-turbo"):
    """Ask a chat model whether `statement` is relevant to `query`.

    Args:
        query: The question being evaluated.
        statement: The retrieved text to judge against the query.
        model: Name of the chat-completion model to request.

    Returns:
        int: 1 when the model's reply is "yes" (ignoring case and surrounding
        whitespace), otherwise -1.
    """
    messages = [
        {
            "role": "system",
            # Adjacent string literals are concatenated, so each sentence must
            # end with a space to avoid running into the next one.
            "content": "You are a helpful expert financial research assistant. You help users analyze financial statements to better understand companies. "
            "For the given query, evaluate whether the following satement is relevant. "
            "Output only 'yes' or 'no'."
        },
        {
            "role": "user",
            "content": f"Query: {query}, Statement: {statement}"
        },
    ]

    response = openai_client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=1
    )
    content = response.choices[0].message.content or ""
    # Normalise the reply: "Yes" or " yes" must not be counted as a negative label.
    if content.strip().lower() == "yes":
        return 1
    return -1

In [None]:
### create database with embedding function ###
# NOTE(review): `load_chroma` is not defined in the nearby cells — presumably a
# helper defined or imported elsewhere; confirm.
embedding_function = SentenceTransformerEmbeddingFunction()
chroma_collection = load_chroma(
    filename='microsoft_annual_report_2022.pdf',
    collection_name='microsoft_annual_report_2022',
    embedding_function=embedding_function
)
print(chroma_collection.count())

### generate questions with gpt and retrieve 10 results per question ###
generated_queries = generate_queries()
results = chroma_collection.query(
    query_texts=generated_queries, n_results=10, include=['documents', 'embeddings']
)

# get the embeddings for both retrieved documents and queries
retrieved_documents = results['documents']
retrieved_embeddings = results['embeddings']
query_embeddings = embedding_function(generated_queries)

In [None]:
### create dataset ###
# Parallel lists: entry k holds the query embedding, the document embedding and
# the relevance label (return value of evaluate_results) of one query/document pair.
adapter_query_embeddings = []
adapter_doc_embeddings = []
adapter_labels = []

# NOTE(review): `tqdm` is not imported in the nearby cells — confirm it is imported earlier.
for q, query in enumerate(tqdm(generated_queries)):
    for d, document in enumerate(retrieved_documents[q]):
        adapter_query_embeddings.append(query_embeddings[q])
        adapter_doc_embeddings.append(retrieved_embeddings[q][d])
        # One evaluate_results call per (query, document) pair.
        adapter_labels.append(evaluate_results(query, document))

# (m, embedding_size), where m is the number of query/document pairs
adapter_query_embeddings = torch.Tensor(np.array(adapter_query_embeddings))
adapter_doc_embeddings = torch.Tensor(np.array(adapter_doc_embeddings))
# (m, 1)
adapter_labels = torch.Tensor(np.expand_dims(np.array(adapter_labels), 1))

# query, answer, label
# NOTE: this rebinds `dataset`, which earlier held the Hugging Face dataset.
dataset = torch.utils.data.TensorDataset(
    adapter_query_embeddings, adapter_doc_embeddings, adapter_labels
)

In [None]:
def model(query_embedding, document_embedding, adaptor_matrix):
    """Score a query/document pair after projecting the query through the adaptor.

    Args:
        query_embedding: Query embedding vector.
        document_embedding: Document embedding vector.
        adaptor_matrix: Matrix applied to the query embedding from the left.

    Returns:
        Cosine similarity (computed along dim 0) between the projected query
        and the unchanged document embedding.
    """
    projected_query = adaptor_matrix @ query_embedding
    return torch.nn.functional.cosine_similarity(
        projected_query, document_embedding, dim=0
    )

def mse_loss(query_embedding, document_embedding, adaptor_matrix, label):
    """Mean-squared error between the adapted similarity and the target label.

    The similarity comes from `model(...)`; `label` is the target it is
    compared against.
    """
    predicted_similarity = model(query_embedding, document_embedding, adaptor_matrix)
    criterion = torch.nn.MSELoss()
    return criterion(predicted_similarity, label)

# Initialize a square adaptor matrix (embedding_size x embedding_size) with random values
mat_size = len(adapter_query_embeddings[0])
adapter_matrix = torch.randn(mat_size, mat_size, requires_grad=True)

# Update the adaptor matrix with plain gradient descent, one sample at a time.
min_loss = float('inf')
best_matrix = None
for epoch in tqdm(range(100)):
    for query_embedding, document_embedding, label in dataset:
        loss = mse_loss(query_embedding, document_embedding, adapter_matrix, label)

        # Snapshot the matrix that produced the lowest single-sample loss so far;
        # the copy is taken before this step's update is applied.
        if loss < min_loss:
            min_loss = loss
            best_matrix = adapter_matrix.clone().detach().numpy()

        loss.backward()
        # Manual update with step size 0.01, done outside autograd tracking,
        # then the gradient is reset for the next sample.
        with torch.no_grad():
            adapter_matrix -= 0.01 * adapter_matrix.grad
            adapter_matrix.grad.zero_()

### inference ###
# Applies the best matrix to `query_embedding`, which at this point is the last
# sample left over from the training loop.
scaled_vector = np.matmul(best_matrix, query_embedding).numpy()

## 1.5 Evaluation Metric

In [None]:
!pip install ragas

In [None]:
from datasets import Dataset
from langchain.document_loaders import TextLoader
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.prompts import HumanMessagePromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.evaluation import load_evaluator, EmbeddingDistance

In [None]:
# Load the State of the Union text and split it into smaller chunks
loader = TextLoader("./data/state_of_the_union.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

# Embed the chunks and load them into the vector store
# NOTE: the device is hard-coded to "cuda" in both kwargs.
embedding_model = HuggingFaceEmbeddings(
    model_name='BAAI/bge-base-en-v1.5',
    model_kwargs={"device": "cuda"},
    encode_kwargs={"device": "cuda", "batch_size": 100}
)

vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
)
print('Vectordb Size:', vector_db._collection.count())
# NOTE: this rebinds `retriever` to the vector-store retriever.
retriever = vector_db.as_retriever()

Vectordb Size: 90


In [None]:
# Define the LLM used both for answering and for the from-scratch metrics below
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Define the prompt template; it has two slots, {question} and {context}
template = """\
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use two sentences maximum and keep the answer concise.

Question: {question}
Context: {context}
Answer:
"""

prompt = ChatPromptTemplate.from_template(template)

# Set up the RAG pipeline: the input is sent to the retriever for "context" and
# passed through unchanged as "question", then prompt -> llm -> string.
rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Evaluation questions and their reference answers (one list of strings per question)
questions = [
    "What did the president say about Justice Breyer?",
    "What did the president say about Intel's CEO?",
    "What did the president say about gun violence?",
]
ground_truths = [
    ["The president said that Justice Breyer has dedicated his life to serve the country and thanked him for his service."],
    ["The president said that Pat Gelsinger is ready to increase Intel's investment to $100 billion."],
    ["The president asked Congress to pass proven measures to reduce gun violence."],
]

# Collect the generated answer and the retrieved chunk texts for each question
answers = []
contexts = []
for query in questions:
    generated_answer = rag_chain.invoke(query)
    answers.append(generated_answer)
    retrieved_chunks = retriever.get_relevant_documents(query)
    contexts.append([chunk.page_content for chunk in retrieved_chunks])

# ground truth data only needed for context_recall and context_precision
data = dict(
    question=questions,
    answer=answers,
    contexts=contexts,
    ground_truths=ground_truths,
)

# Build the dataset and show the first row field by field
dataset = Dataset.from_dict(data)
first_row = dataset[0]
for k, v in first_row.items():
    print(f'{k} (len={len(v)}): {v}')

question (len=48): What did the president say about Justice Breyer?
answer (len=132): The president thanked Justice Breyer for his service and mentioned that he nominated Judge Ketanji Brown Jackson as his replacement.
contexts (len=4): ['Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.', 'And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', 'A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since

### 1.5.1 Retriever Components

In [None]:
from ragas import evaluate
from ragas.metrics import (
    context_recall,
    context_precision,
)

![](https://docs.ragas.io/en/latest/_static/imgs/component-wise-metrics.png)

In [None]:
# Score the dataset built above with the two ragas retriever metrics.
result = evaluate(
    dataset=dataset,
    metrics=[context_precision, context_recall],
)

# Show the per-question scores as a DataFrame.
df = result.to_pandas()
df

evaluating with [context_precision]


100%|██████████| 1/1 [00:05<00:00,  5.19s/it]


evaluating with [context_recall]


100%|██████████| 1/1 [00:04<00:00,  4.68s/it]


Unnamed: 0,question,answer,contexts,ground_truths,context_precision,context_recall
0,What did the president say about Justice Breyer?,The president thanked Justice Breyer for his s...,"[Tonight, I’d like to honor someone who has de...",[The president said that Justice Breyer has de...,0.0,1.0
1,What did the president say about Intel's CEO?,I don't know what the president said about Int...,"[To all Americans, I will be honest with you, ...",[The president said that Pat Gelsinger is read...,0.0,0.0
2,What did the president say about gun violence?,The president called for Congress to pass meas...,[And I ask Congress to pass proven measures to...,[The president asked Congress to pass proven m...,1.0,1.0


#### **Context Recall**

Context Recall measures the extent to which the retrieved context aligns with the ground truth.
To estimate context recall from the ground truth answer, each sentence in the ground truth answer is analyzed to determine whether it can be attributed to the retrieved context or not. In an ideal scenario, all sentences in the ground truth answer should be attributable to the retrieved context.

In [None]:
### context recall from scratch ###
# Few-shot prompt with two worked examples. It takes {question}, {context} and
# {answer} and asks for a JSON list with one entry per answer sentence, each
# carrying a "reason" and an "Attributed" flag ("1" or "0").
# Doubled braces ({{ }}) are literal braces in the rendered prompt.
CONTEXT_RECALL_RA = HumanMessagePromptTemplate.from_template(
"""\
Given a context, and an answer, analyze each sentence in the answer and classify \
if the sentence can be attributed to the given context or not. \
Use only "Yes" (1) or "No" (0) as a binary classification. Output json with reason.

question: What can you tell me about albert Albert Einstein?
context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.
answer: Albert Einstein born in 14 March 1879 was  German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time. He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. He published 4 papers in 1905.  Einstein moved to Switzerland in 1895
classification:
[
    {{  "statement_1":"Albert Einstein, born on 14 March 1879, was a German-born theoretical physicist, widely held to be one of the greatest and most influential scientists of all time.",
        "reason": "The date of birth of Einstein is mentioned clearly in the context.",
        "Attributed": "1"
    }},
    {{
        "statement_2":"He received the 1921 Nobel Prize in Physics 'for his services to theoretical physics.",
        "reason": "The exact sentence is present in the given context.",
        "Attributed": "1"
    }},
    {{
        "statement_3": "He published 4 papers in 1905.",
        "reason": "There is no mention about papers he wrote in the given context.",
        "Attributed": "0"
    }},
    {{
        "statement_4":"Einstein moved to Switzerland in 1895.",
        "reason": "There is no supporting evidence for this in the given context.",
        "Attributed": "0"
    }}
]

question: who won 2020 icc world cup?
context: Who won the 2022 ICC Men's T20 World Cup?
The 2022 ICC Men's T20 World Cup, held from October 16 to November 13, 2022, in Australia, was the eighth edition of the tournament. Originally scheduled for 2020, it was postponed due to the COVID-19 pandemic. England emerged victorious, defeating Pakistan by five wickets in the final to clinch their second ICC Men's T20 World Cup title.
answer: England
classification:
[
    {{
        "statement_1":"England won the 2022 ICC Men's T20 World Cup.",
        "reason": "From context it is clear that England defeated Pakistan to win the World Cup.",
         "Attributed": "1"
    }}
]

question:{question}
context:{context}
answer:{answer}
classification:
"""
)

In [None]:
# Pull the evaluation columns out of the dataset.
# NOTE: this rebinds `question`, `ground_truths` and `contexts` to the dataset columns.
question, ground_truths, contexts = dataset["question"], dataset["ground_truths"], dataset["contexts"]

# Build one prompt per question; list-valued fields are joined into one
# newline-separated string before being placed in the template.
prompts = []
for qstn, gt, ctx in zip(question, ground_truths, contexts):
    gt = "\n".join(gt) if isinstance(gt, list) else gt
    ctx = "\n".join(ctx) if isinstance(ctx, list) else ctx
    human_prompt = CONTEXT_RECALL_RA.format(
        question=qstn, context=ctx, answer=gt
    )
    prompts.append(ChatPromptTemplate.from_messages([human_prompt]))

print(len(prompts))

# Query the LLM with `.invoke`, as the rest of the notebook does. The
# comprehension variable keeps the global `prompt` (the RAG prompt) intact.
results = [llm.invoke(recall_prompt.format_messages()) for recall_prompt in prompts]

# The model is expected to reply with a JSON list of per-statement verdicts.
responses = [json.loads(r.content) for r in results]

for ii in responses:
    for i in ii:
        print(i)
    print('\n')
    print('\n')

{'statement_1': 'The president said that Justice Breyer has dedicated his life to serve the country.', 'reason': 'The statement is directly mentioned in the context.', 'Attributed': '1'}
{'statement_2': 'The president thanked Justice Breyer for his service.', 'reason': 'The statement is directly mentioned in the context.', 'Attributed': '1'}


{'statement_1': "The president said that Pat Gelsinger is ready to increase Intel's investment to $100 billion.", 'reason': "There is no mention of Pat Gelsinger or Intel's investment in the given context.", 'Attributed': '0'}


{'statement_1': 'The president asked Congress to pass proven measures to reduce gun violence.', 'reason': 'The exact sentence is present in the given context.', 'Attributed': '1'}




In [None]:
# Context recall per question = attributed statements / all statements.
scores = []

for response in responses:
    res_lis = []
    for resp in response:
        flag = resp.get("Attributed")
        if flag is None or flag == "":
            # Missing verdict: keep NaN so the gap shows up in the score.
            res_lis.append(np.nan)
        else:
            # Accept both "1" and 1: the model may emit the flag as a JSON
            # string or as a JSON number.
            res_lis.append(int(str(flag).strip() == "1"))

    print(res_lis)
    denominator = len(res_lis)
    numerator = sum(res_lis)
    # An empty statement list has no defined recall; avoid dividing by zero.
    scores.append(numerator / denominator if denominator else np.nan)

print(scores)

[1, 1]
[0]
[1]
[1.0, 0.0, 1.0]


#### **Context Precision**

Context Precision (signal-to-noise ratio) is a metric that evaluates whether the ground-truth relevant items present in the retrieved contexts are ranked at the top. Ideally, all the relevant chunks appear at the top ranks. This metric is computed using the question and the contexts.

In [None]:
### context precision from scratch ###
# Few-shot prompt with two worked examples. It takes {question}, {context} and
# {answer} and asks for a JSON object with a "reason" and a "verdict" ("1"/"0").
# Both examples use the lowercase "verdict" key, because the scoring cell reads
# exactly that key. Doubled braces ({{ }}) are literal braces in the rendered prompt.
CONTEXT_PRECISION = HumanMessagePromptTemplate.from_template(
"""
Given question, answer and context verify if the context was useful in arriving \
at the given answer. Give verdict as "1" if useful and "0" if not.

question: What can you tell me about albert Albert Einstein?
context: Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist,widely held to be one of the greatest and most influential scientists of all time. Best known for developing the theory of relativity, he also made important contributions to quantum mechanics, and was thus a central figure in the revolutionary reshaping of the scientific understanding of nature that modern physics accomplished in the first decades of the twentieth century. His mass–energy equivalence formula E = mc2, which arises from relativity theory, has been called "the world's most famous equation". He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the development of quantum theory. His work is also known for its influence on the philosophy of science. In a 1999 poll of 130 leading physicists worldwide by the British journal Physics World, Einstein was ranked the greatest physicist of all time. His intellectual achievements and originality have made Einstein synonymous with genius.
answer: Albert Einstein born in 14 March 1879 was  German-born theoretical physicist, \
widely held to be one of the greatest and most influential scientists of all time. \
He received the 1921 Nobel Prize in Physics "for his services to theoretical physics. \
He published 4 papers in 1905.  Einstein moved to Switzerland in 1895
verification:
{{\
"reason": "The provided context was indeed useful in arriving at the given answer. \
The context includes key information about Albert Einstein's life and contributions, \
which are reflected in the answer.", \
"verdict": "1"
}}

question: What is the tallest mountain in the world?
context: The Andes is the longest continental mountain range in the world, located in South America. It stretches across seven countries and features many of the highest peaks in the Western Hemisphere. The range is known for its diverse ecosystems, including the high-altitude Andean Plateau and the Amazon rainforest.
answer: Mount Everest.
verification:
{{\
"reason":"the provided context discusses the Andes mountain range, which, while impressive, \
does not include Mount Everest or directly relate to the question \
about the world's tallest mountain.", \
"verdict":"0"
}}

question:{question}
context:{context}
answer:{answer}
verification:
"""
)

# Ground truths are the preferred "answer" for this metric; use the generated
# answers for evaluation only if no ground truths are provided.
questions, contexts, answers = dataset["question"], dataset["contexts"], dataset['ground_truths']

# One prompt per (question, retrieved chunk) pair, flattened across questions.
prompts = []
for qstn, ctx, answer in zip(questions, contexts, answers):
    # Each ground truth is stored as a list of strings; join it so the prompt
    # contains plain text rather than a Python list repr (same handling as in
    # the context-recall cell).
    answer = "\n".join(answer) if isinstance(answer, list) else answer
    human_prompts = [
        ChatPromptTemplate.from_messages(
            [CONTEXT_PRECISION.format(question=qstn, context=c, answer=answer)]
        ) for c in ctx
    ]
    prompts.extend(human_prompts)

print(len(prompts))

# Query the LLM with `.invoke`, as the rest of the notebook does. The
# comprehension variable keeps the global `prompt` (the RAG prompt) intact.
results = [llm.invoke(precision_prompt.format_messages()) for precision_prompt in prompts]

# The model is expected to reply with one JSON object per prompt.
responses = [json.loads(r.content) for r in results]

12


In [None]:
# Regroup the flat list of responses into one nested list per question, using
# cumulative chunk counts as slice boundaries.
context_lens = np.cumsum([0] + [len(ctx) for ctx in contexts])
print(context_lens)

grouped_responses = []
for lower, upper in zip(context_lens, context_lens[1:]):
    grouped_responses.append(responses[lower:upper])
print(grouped_responses[0])

[ 0  4  8 12]
[{'reason': "The provided context directly quotes the president and provides information about Justice Breyer's background and retirement. This information is reflected in the answer.", 'verdict': '1'}, {'reason': 'The provided context mentions Justice Breyer and acknowledges his legacy of excellence, but it does not provide any specific information about what the president said about him.', 'verdict': '0'}, {'reason': 'The provided context does not mention anything about what the president said about Justice Breyer. Therefore, it is not useful in arriving at the given answer.', 'verdict': '0'}, {'reason': "the provided context does not mention anything about the president's statement regarding Justice Breyer, so it is not useful in arriving at the given answer.", 'verdict': '0'}]


In [None]:
# Context precision per question: average of precision@k over the ranks k that
# hold a useful chunk.
scores = []

for response in grouped_responses:
    res_lis = []
    for resp in response:
        # Look the key up case-insensitively: the model may answer with
        # "Verdict" instead of "verdict".
        verdict = next(
            (value for key, value in resp.items() if str(key).lower() == "verdict"),
            None,
        )
        if verdict is None or verdict == "":
            # Missing verdict: keep NaN so the gap shows up in the score.
            res_lis.append(np.nan)
        else:
            # Accept both "1" and 1 (JSON string or JSON number).
            res_lis.append(int(str(verdict).strip() == "1"))

    print(res_lis)
    # The small epsilon avoids dividing by zero when no chunk was useful.
    denominator = sum(res_lis) + 1e-10
    numerator = sum(
        [
            (sum(res_lis[: i + 1]) / (i + 1)) * res_lis[i]
            for i in range(len(res_lis))
        ]
    )
    scores.append(numerator / denominator)

print(scores)

[1, 0, 0, 0]
[0, 0, 0, 0]
[1, 0, 0, 1]
[0.9999999999, 0.0, 0.7499999999625]


#### **NDCG**

[NDCG Explained](https://towardsdatascience.com/demystifying-ndcg-bee3be58cfe0)

![](https://miro.medium.com/v2/resize:fit:720/format:webp/1*ObqftD7LHdjS9kdaZVQH0Q.png)

In [None]:
import math
def calculate_ndcg(predictions, ground_truth, top_k=2):
    """
    Calculate the mean NDCG (Normalized Discounted Cumulative Gain) over queries.

    Relevance is binary: a recommended item is relevant (1.0) when it appears in
    the query's ground-truth items, otherwise it is not (0.0). The ideal DCG of a
    query is the DCG of its own top-k relevances sorted best-first, i.e. the
    score measures how well the retrieved top-k items are ordered.

    Parameters:
    - predictions: A list of recommended items for each query.
    - ground_truth: A list of actual (ground truth) items for each query.
    - top_k: The number of top recommendations to consider.

    Returns:
    - The NDCG score, a float between 0 and 1 (0.0 when there are no queries).

    Raises:
    - ValueError: if predictions and ground_truth have different lengths.
    """
    if len(predictions) != len(ground_truth):
        raise ValueError("The length of predictions and ground_truth must be the same.")

    num_query = len(predictions)
    if num_query == 0:
        # Nothing to average; avoid dividing by zero below.
        return 0.0

    def discounted_gain(relevances):
        # DCG of a relevance list given in rank order (rank 1 first).
        return sum(
            (2 ** relevance - 1) / math.log2(rank + 2)
            for rank, relevance in enumerate(relevances)
        )

    ndcg_sum = 0.0
    for recommended, actual_items in zip(predictions, ground_truth):
        # Slicing tolerates queries that have fewer than top_k recommendations.
        # The relevance list is rebuilt for every query so that one query's hits
        # never inflate the ideal DCG of another query.
        relevances = [
            1.0 if item in actual_items else 0.0
            for item in recommended[:top_k]
        ]

        dcg = discounted_gain(relevances)
        ideal_dcg = discounted_gain(sorted(relevances, reverse=True))

        # A query with no relevant item in its top-k contributes 0.
        ndcg_sum += dcg / ideal_dcg if ideal_dcg != 0 else 0.0

    return ndcg_sum / num_query

# test: the same ranking [1, 2, 3, 4, 5] scored at top_k=3 against three ground truths
# (both relevant items first / relevant items at ranks 2-3 / no relevant item in the top 3)
print('Perfect Rank', calculate_ndcg([[1, 2, 3, 4, 5]], [[2, 1]], top_k=3))
print('Pretty Good Rank', calculate_ndcg([[1, 2, 3, 4, 5]], [[2, 3]], top_k=3))
print('Worst Rank', calculate_ndcg([[1, 2, 3, 4, 5]], [[5, 4]], top_k=3))

Perfect Rank 1.0
Bad Rank 0.6934264036172708
Worst Rank 0.0


#### **Hit Rate Ratio**

In [None]:
# hit rate ratio
def calculate_hit_rate_ratio(predictions, ground_truth, top_k=2):
    """
    Calculate the Hit Rate Ratio for a recommendation system.

    A query counts as a hit when at least one of its top_k recommended items
    appears in that query's ground-truth items.

    Parameters:
    - predictions: A list of recommended items for each query.
    - ground_truth: A list of actual (ground truth) items for each query.
    - top_k: The number of top recommendations to consider.

    Returns:
    - The Hit Rate Ratio, a float between 0 and 1 (0.0 when there are no queries).
    """
    num_query = len(predictions)
    if num_query == 0:
        # No queries means no hits; avoid dividing by zero.
        return 0.0

    # any() stops at the first matching item, so each query adds at most one hit.
    hits = sum(
        any(item in ground_truth[i] for item in predictions[i][:top_k])
        for i in range(num_query)
    )

    return hits / num_query

#### **Mean Reciprocal Rank (MRR)**

[MRR](https://en.wikipedia.org/wiki/Mean_reciprocal_rank)

In [None]:
def mean_reciprocal_rank(rs):
    """Average, over all queries, of 1 / rank of the first relevant item.

    Ranks are 1-based (the first element is rank 1) and relevance is binary:
    any nonzero score counts as relevant. A query with no relevant item
    contributes 0 to the mean.
    See http://en.wikipedia.org/wiki/Mean_reciprocal_rank
    Args:
        rs: Iterable of relevance-score sequences (list or numpy), each given
            in rank order (first element is the first item)
    Returns:
        Mean reciprocal rank
    """
    reciprocal_ranks = []
    for relevance in rs:
        # positions (0-based) of the relevant items for this query
        hit_positions = np.asarray(relevance).nonzero()[0]
        if hit_positions.size:
            reciprocal_ranks.append(1. / (hit_positions[0] + 1))
        else:
            reciprocal_ranks.append(0.)
    return np.mean(reciprocal_ranks)

1.0

### 1.5.2 Generator component

In [None]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy
)

In [None]:
# Score `dataset` with the two generator-side ragas metrics imported above.
result = evaluate(
    dataset=dataset,
    metrics=[faithfulness, answer_relevancy],
)

# Convert the result to a DataFrame and display it as the cell output.
df = result.to_pandas()
df

evaluating with [faithfulness]


100%|██████████| 1/1 [00:17<00:00, 17.09s/it]


evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:09<00:00,  9.06s/it]


Unnamed: 0,question,answer,contexts,ground_truths,faithfulness,answer_relevancy
0,What did the president say about Justice Breyer?,The president thanked Justice Breyer for his s...,"[Tonight, I’d like to honor someone who has de...",[The president said that Justice Breyer has de...,1.0,0.894318
1,What did the president say about Intel's CEO?,The president did not mention Intel's CEO in t...,[But that’s just the beginning. \n\nIntel’s CE...,[The president said that Pat Gelsinger is read...,0.0,0.959788
2,What did the president say about gun violence?,The president called for Congress to pass meas...,[And I ask Congress to pass proven measures to...,[The president asked Congress to pass proven m...,1.0,0.909126


#### **Faithfulness**

This measures the factual consistency of the generated answer against the given context. It is calculated from answer and retrieved context.

The generated answer is regarded as faithful if all the claims made in the answer can be inferred from the given context. To calculate this, a set of claims is first identified from the generated answer. Each of these claims is then cross-checked against the given context to determine whether it can be inferred from it. The score is the fraction of claims that the context supports.

In [None]:
# Few-shot prompt asking the LLM to break an answer into standalone statements,
# returned as JSON of the form {"statements": [...]}.
# {question} and {answer} are the template variables; the doubled braces in the
# examples are escaped literal braces.
LONG_FORM_ANSWER_PROMPT = HumanMessagePromptTemplate.from_template(
"""\
Create one or more statements from each sentence in the given answer.

question: Who was  Albert Einstein and what is he best known for?
answer: He was a German-born theoretical physicist, widely acknowledged to be one of the greatest and most influential physicists of all time. He was best known for developing the theory of relativity, he also made important contributions to the development of the theory of quantum mechanics.
statements in json:
{{
    "statements": [
        "Albert Einstein was born in Germany.",
        "Albert Einstein was best known for his theory of relativity."
    ]
}}

question: Cadmium Chloride is slightly soluble in this chemical, it is also called what?
answer: alcohol
statements in json:
{{
    "statements": [
        "Cadmium Chloride is slightly soluble in alcohol."
    ]
}}

question: Were Hitler and Benito Mussolini of the same nationality?
answer: Sorry, I can't provide answer to that question.
statements in json:
{{
    "statements": []
}}

question:{question}
answer: {answer}
statements in json:"""
)

In [None]:
# Few-shot natural-language-inference prompt: for every numbered statement the
# LLM must return a JSON list entry with the statement, a reason and a verdict
# ("1" = supported by the context, "0" = not supported, "-1" = invalid statement).
# {context} and {statements} are the template variables; doubled braces are
# escaped literal braces.
# The few-shot answers must themselves be valid JSON, because the model's reply
# is parsed with json.loads and the model imitates the examples.
NLI_STATEMENTS_MESSAGE = HumanMessagePromptTemplate.from_template(
"""
Natural language inference. Use only "Yes" (1), "No" (0) and "Null" (-1) as verdict.

Context:
John is a student at XYZ University. He is pursuing a degree in Computer Science. \
He is enrolled in several courses this semester, including Data Structures, Algorithms, and Database Management. \
John is a diligent student and spends a significant amount of time studying and completing assignments. \
He often stays late in the library to work on his projects.
statement_1: John is majoring in Biology.
statement_2: John is taking a course on Artificial Intelligence.
statement_3: John is a dedicated student.
statement_4: John has a part-time job.
Answer:
[
    {{
        "statement_1": "John is majoring in Biology.",
        "reason": "John's major is explicitly mentioned as Computer Science. There is no information suggesting he is majoring in Biology.",
        "verdict": "0"
    }},
    {{
        "statement_2": "John is taking a course on Artificial Intelligence.",
        "reason": "The context mentions the courses John is currently enrolled in, and Artificial Intelligence is not mentioned. Therefore, it cannot be deduced that John is taking a course on AI.",
        "verdict": "0"
    }},
    {{
        "statement_3": "John is a dedicated student.",
        "reason": "The context states that he spends a significant amount of time studying and completing assignments. Additionally, it mentions that he often stays late in the library to work on his projects, which implies dedication.",
        "verdict": "1"
    }},
    {{
        "statement_4": "John has a part-time job.",
        "reason": "There is no information given in the context about John having a part-time job.",
        "verdict": "0"
    }}
]

Context:
Photosynthesis is a process used by plants, algae, \
and certain bacteria to convert light energy into chemical energy.
statement_1: Albert Einstein was a genius.
Answer:
[
     {{
        "statement_1": "Albert Einstein was a genius.",
        "reason": "The context and statement are unrelated",
        "verdict": "0"
    }}
]

Context:
Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time.
statement_1: Nil
Answer:
[
     {{
        "statement_1": "Nil",
        "reason": "The statement is invalid",
        "verdict": "-1"
    }}
]


context:
{context}
statements:
{statements}
Answer:
"""
)

In [None]:
question, answer, contexts = dataset["question"], dataset["answer"], dataset["contexts"]
prompts = []

### step 1: build one statement-extraction prompt per (question, answer) pair
for q, a in zip(question, answer):
    human_prompt = LONG_FORM_ANSWER_PROMPT.format(question=q, answer=a)
    prompts.append(ChatPromptTemplate.from_messages([human_prompt]))

print(len(prompts), '\n')

# one LLM call per prompt, executed sequentially
results = []
for prompt in prompts:
    results.append(llm(prompt.format_messages()))
# parse each reply as JSON; json.loads raises if the model returns malformed JSON
responses = [json.loads(r.content) for r in results]

for msg in responses:
    print(msg)
print('\n')

3 

{'statements': ['The president thanked Justice Breyer for his service.', 'The president mentioned that he nominated Judge Ketanji Brown Jackson as his replacement.']}
{'statements': []}
{'statements': ['The president called for Congress to pass measures to reduce gun violence.', 'The president called for universal background checks.', 'The president called for a ban on assault weapons and high-capacity magazines.']}




In [None]:
### generate verdicts for each context against claims
# NOTE: `responses` is read here as the statement dicts produced by the previous
# cell and is overwritten below with the verdict lists, so re-running this cell
# on its own would fail at `output.get` (lists have no `.get`); re-run the
# previous cell first.
prompts = []
for context, output in zip(contexts, responses):
    statements = output.get("statements", [])
    # an answer with no statements is sent as the single placeholder "Nil",
    # matching the "Nil" few-shot example in the prompt
    statements = statements if statements != [] else ["Nil"]
    # number the statements as statement_1, statement_2, ... one per line
    statements_str: str = "\n".join(
        [f"statement_{i+1}: {st}" for i, st in enumerate(statements)]
    )
    # all retrieved chunks of a question are joined into one context string
    contexts_str: str = "\n".join(context)
    human_prompt = NLI_STATEMENTS_MESSAGE.format(
        context=contexts_str, statements=statements_str
    )
    prompts.append(ChatPromptTemplate.from_messages([human_prompt]))

print(len(prompts), '\n')

# one LLM call per prompt, executed sequentially
results = []
for prompt in prompts:
    results.append(llm(prompt.format_messages()))
# parse each reply as JSON; json.loads raises if the model returns malformed JSON
responses = [json.loads(r.content) for r in results]

for msg in responses:
    print(msg)
print('\n')

3 

[{'statement_1': 'The president thanked Justice Breyer for his service.', 'reason': 'The context explicitly states that the president wants to honor Justice Breyer for his service.', 'verdict': '1'}, {'statement_2': 'The president mentioned that he nominated Judge Ketanji Brown Jackson as his replacement.', 'reason': 'The context mentions that the president nominated Judge Ketanji Brown Jackson as a replacement for Justice Breyer.', 'verdict': '1'}]
[{'statement_1': 'Nil', 'reason': 'The statement is invalid', 'verdict': '-1'}]
[{'statement_1': 'The president called for Congress to pass measures to reduce gun violence.', 'reason': 'The context explicitly mentions that the president asked Congress to pass proven measures to reduce gun violence.', 'verdict': '1'}, {'statement_2': 'The president called for universal background checks.', 'reason': 'The context mentions that the president asked Congress to pass universal background checks.', 'verdict': '1'}, {'statement_3': 'The preside

In [None]:
# Faithfulness per answer = supported statements / total statements.
# Verdict "-1" (invalid statement) or an unknown verdict maps to NaN, which
# makes the whole score NaN for that answer.
verdict_score_map = {"1": 1, "0": 0, "-1": np.nan}
scores = []

for output in responses:
    # anything that is not a list of verdict entries is treated as "no statements"
    output = output if isinstance(output, list) else []
    faithful_statements = sum(
        verdict_score_map.get(verdict_item.get("verdict", "").lower(), np.nan)
        for verdict_item in output
    )
    print(faithful_statements)

    num_statements = len(output)
    score = faithful_statements / num_statements if num_statements else np.nan
    scores.append(score)

print(scores)

2
nan
3
[1.0, nan, 1.0]


#### **Answer Relevance**

The evaluation metric, Answer Relevancy, focuses on assessing how pertinent the generated answer is to the given prompt. A lower score is assigned to answers that are incomplete or contain redundant information. An answer is deemed relevant when it directly and appropriately addresses the original question. Importantly, our assessment of answer relevance does not consider factuality but instead penalizes cases where the answer lacks completeness or contains redundant details.

The LLM is prompted to generate an appropriate question for the generated answer multiple times, and the mean cosine similarity between these generated questions and the original question is measured.

In [None]:
# Few-shot prompt asking the LLM to write the question that a given answer
# responds to, and to flag whether the answer is noncommittal. The expected
# reply is a single JSON object: {"question": ..., "noncommittal": true/false}.
# {answer} and {context} are the template variables; doubled braces are escaped
# literal braces. The trailing backslash makes the prompt end right after
# "Output:" with no final newline.
QUESTION_GEN = HumanMessagePromptTemplate.from_template(
"""\
Generate a question for the given answer and Identify if answer is noncommittal

Answer:
Albert Einstein was born in Germany.
Context:
Albert Einstein was a German-born theoretical physicist who is widely held to be one of the greatest and most influential scientists of all time
Output:
{{"question":"Where was Albert Einstein born?","noncommittal":false}}

Answer:
It can change its skin color based on the temperature of its environment.
Context:
A recent scientific study has discovered a new species of frog in the Amazon rainforest that has the unique ability to change its skin color based on the temperature of its environment.
Output:
{{"question":"What unique ability does the newly discovered species of frog have?","noncommittal":false}}

Answer:
Everest
Context:
The tallest mountain on Earth, measured from sea level, is a renowned peak located in the Himalayas.
Output:
{{"question":"What is the tallest mountain on Earth?","noncommittal":false}}

Answer:
I don't know about the  groundbreaking feature of the smartphone invented in 2023 as am unware of information beyong 2022.
Context:
In 2023, a groundbreaking invention was announced: a smartphone with a battery life of one month, revolutionizing the way people use mobile technology.
Output:
{{"question":"What was the groundbreaking feature of the smartphone invented in 2023?", "noncommittal":true}}

Answer:
{answer}
Context:
{context}
Output:\
"""
)

In [None]:
questions, answers, contexts = dataset["question"], dataset["answer"], dataset["contexts"]
prompts = []

# build one question-generation prompt per answer; the retrieved chunks of a
# question are joined into a single context string
for ans, ctx in zip(answers, contexts):
    human_prompt = QUESTION_GEN.format(answer=ans, context="\n".join(ctx))
    prompts.append(ChatPromptTemplate.from_messages([human_prompt]))
print(len(prompts), '\n')

# one LLM call per prompt, i.e. a single generated question per answer
results = []
for prompt in prompts:
    results.append(llm(prompt.format_messages()))
# parse each reply as JSON; json.loads raises if the model returns malformed JSON
responses = [json.loads(r.content) for r in results]

for msg in responses:
    print(msg)
print('\n')

3 

{'question': 'Who did the president nominate as a replacement for Justice Breyer?', 'noncommittal': False}
{'question': "What did the president say about Intel's CEO?", 'noncommittal': True}
{'question': 'What measures did the president call for Congress to pass to reduce gun violence?', 'noncommittal': False}




In [None]:
# Answer relevance = cosine similarity between the original question and the
# question the LLM generated from the answer, forced to 0 for noncommittal answers.
# NOTE: this rebinds the module-level `embedding_model` to all-MiniLM-L6-v2.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
evaluator = load_evaluator(
    "pairwise_embedding_distance",
    distance_metric=EmbeddingDistance.COSINE,
    embeddings=embedding_model
)
scores = []

# `answers` comes from the same cell as `questions`/`responses`, so this loop
# does not depend on variables left over from the faithfulness section.
# Each entry of `responses` is expected to be a single JSON object with the
# keys "question" and "noncommittal".
for question, result, ans in zip(questions, responses, answers):
    gen_questions = result.get("question", "")
    noncommittal = result.get("noncommittal", False)
    pair_eval = evaluator.evaluate_string_pairs(
        prediction=question,
        prediction_b=gen_questions
    )
    # The evaluator reports a cosine *distance* (0 = identical); the metric is
    # defined on cosine *similarity*, so convert before scoring.
    cosine_sim = 1 - pair_eval['score']
    print(f'Q: {question}. GQ: {gen_questions}. Answer: {ans}')
    scores.append(cosine_sim * int(not noncommittal))

print(scores)

Q: What did the president say about Justice Breyer?. GQ: Who did the president nominate as a replacement for Justice Breyer?. Answer: The president thanked Justice Breyer for his service and mentioned that he nominated Judge Ketanji Brown Jackson as his replacement.
Q: What did the president say about Intel's CEO?. GQ: What did the president say about Intel's CEO?. Answer: I don't know what the president said about Intel's CEO.
Q: What did the president say about gun violence?. GQ: What measures did the president call for Congress to pass to reduce gun violence?. Answer: The president called for Congress to pass measures to reduce gun violence, including universal background checks and a ban on assault weapons and high-capacity magazines.
[0.19011186444049444, 0.0, 0.24031257506966974]


#### **RAG triad — retrieved context relevance**

# 2) Cohere & Weaviate

In [None]:
import weaviate
from annoy import AnnoyIndex
import cohere
# Read the Cohere API key from the environment so it is never stored in the
# notebook; falls back to an empty key when COHERE_API_KEY is not set.
co = cohere.Client(os.environ.get("COHERE_API_KEY", ""))

## 2.1 Keyword Search

Match documents by the words they have in common with the query (BM25).

In [None]:
# connect to a demo database in weaviate
# Credentials are read from the environment (WEAVIATE_API_KEY, COHERE_API_KEY)
# so they are never stored in the notebook; each falls back to an empty string
# when the variable is not set.
client = weaviate.Client(
    url='https://cohere-demo.weaviate.network/',
    auth_client_secret=weaviate.auth.AuthApiKey(os.environ.get("WEAVIATE_API_KEY", "")),
    additional_headers={"X-Cohere-Api-Key": os.environ.get("COHERE_API_KEY", "")}
)
client.is_ready()

True

In [None]:
def keyword_search(
    query,
    results_lang='en',
    properties = ["title", "url", "text"],
    num_results=3):
    """Run a BM25 keyword query against the "Articles" class of the module-level `client`.

    Args:
        query: Search text passed to BM25.
        results_lang: Only articles whose "lang" equals this value are returned.
        properties: Article fields to fetch for every hit.
        num_results: Maximum number of hits.
    Returns:
        The list found under response['data']['Get']['Articles'].
    """
    # Restrict hits to articles in the requested language
    lang_filter = {
        "path": ["lang"],
        "operator": "Equal",
        "valueString": results_lang
    }

    articles_query = client.query.get("Articles", properties)
    articles_query = articles_query.with_bm25(query=query)
    articles_query = articles_query.with_where(lang_filter)
    articles_query = articles_query.with_limit(num_results)
    response = articles_query.do()

    return response['data']['Get']['Articles']

def print_result(result):
    """Print every item of `result` as "key:value" lines under an "item N" header.

    Each key/value line is followed by a blank line, and each item ends with
    one more blank line.
    """
    for position, item in enumerate(result):
        print(f'item {position}')
        for key in item.keys():
            # the extra "\n" produces the blank line after each field
            print(f"{key}:{item.get(key)}\n")
        print()

# examples
query = "What is the most viewed televised event?"
# request "views" and "lang" in addition to the function's default fields
properties = ["text", "title", "url", "views", "lang"]
keyword_search_results = keyword_search(query, properties=properties)
print_result(keyword_search_results)

item 0
lang:en

text:The most active Gamergate supporters or "Gamergaters" said that Gamergate was a movement for ethics in games journalism, for protecting the "gamer" identity, and for opposing "political correctness" in video games and that any harassment of women was done by others not affiliated with Gamergate. They argued that the close relationships between journalists and developers demonstrated a conspiracy among reviewers to focus on progressive social issues. Some supporters pointed to what they considered disproportionate praise for games such as "Depression Quest" and "Gone Home", which feature unconventional gameplay and stories with social implications, while they viewed traditional AAA games as downplayed. False claims of the "ethics in game journalism" had started as early as 2012, when Geoff Keighley was accused of such unethical behavior when he was presenting information about "Halo 4" among advertisements for Mountain Dew and Doritos, an event called "Doritosgate" 

## 2.2 Dense Retrieval

Semantic search based on the distance between the query embedding and the document embeddings.

### 2.2.1 Querying

In [None]:
def dense_retrieval(
    query,
    results_lang='en',
    properties = ["text", "title", "url", "views", "lang", "_additional {distance}"],
    num_results=5):
    """Run a nearText (embedding-based) query against the "Articles" class of the module-level `client`.

    Args:
        query: Text used as the single nearText concept.
        results_lang: Only articles whose "lang" equals this value are returned.
        properties: Article fields to fetch; the default also requests the distance.
        num_results: Maximum number of hits.
    Returns:
        The list found under response['data']['Get']['Articles'].
    """
    # To filter by language
    lang_filter = {
        "path": ["lang"],
        "operator": "Equal",
        "valueString": results_lang
    }
    near_text = {"concepts": [query]}

    articles_query = client.query.get("Articles", properties)
    articles_query = articles_query.with_near_text(near_text)
    articles_query = articles_query.with_where(lang_filter)
    articles_query = articles_query.with_limit(num_results)
    response = articles_query.do()

    return response['data']['Get']['Articles']

# example
query = "Tallest person in history"
dense_retrieval_results = dense_retrieval(query)
print_result(dense_retrieval_results)

# same query but in a different language (Arabic); results_lang keeps its
# default 'en', so the hits are still restricted to English articles
query = "أطول رجل في التاريخ"
dense_retrieval_results = dense_retrieval(query)
print_result(dense_retrieval_results)

item 0
_additional:{'distance': -148.99521}

lang:en

text:Robert Pershing Wadlow (February 22, 1918 July 15, 1940), also known as the Alton Giant and the Giant of Illinois, was a man who was the tallest person in recorded history for whom there is irrefutable evidence. He was born and raised in Alton, Illinois, a small city near St. Louis, Missouri.

title:Robert Wadlow

url:https://en.wikipedia.org/wiki?curid=359117

views:3000


item 1
_additional:{'distance': -148.10501}

lang:en

text:Bol came from a family of extraordinarily tall men and women. He said: "My mother was , my father , and my sister is . And my great-grandfather was even taller—." His ethnic group, the Dinka, and the Nilotic people of which they are a part, are among the tallest populations in the world. Bol's hometown, Turalei, is the origin of other exceptionally tall people, including basketball player Ring Ayuel. "I was born in a village, where you cannot measure yourself," Bol reflected. "I learned I was 7 foot 

### 2.2.2 Building Vector Database

In [None]:
# Sample passage about the film Interstellar; it is split into chunks below.
text = """
Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan.
It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine.
Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind.

Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007.
Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar.
Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm.
Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles.
Interstellar uses extensive practical and miniature effects and the company Double Negative created additional digital effects.

Interstellar premiered on October 26, 2014, in Los Angeles.
In the United States, it was first released on film stock, expanding to venues using digital projectors.
The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014.
It received acclaim for its performances, direction, screenplay, musical score, visual effects, ambition, themes, and emotional weight.
It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics. Since its premiere, Interstellar gained a cult following,[5] and now is regarded by many sci-fi experts as one of the best science-fiction films of all time.
Interstellar was nominated for five awards at the 87th Academy Awards, winning Best Visual Effects, and received numerous other accolades
"""

# chunking: one chunk per sentence, each prefixed with the article title
# (splitting on paragraphs, text.split('\n\n'), is the coarser alternative)

# Split into a list of sentences and remove surrounding spaces and new lines
sentences = [t.strip(' \n') for t in text.split('.')]

# Prefix every chunk with the title so each one carries its context;
# empty pieces (e.g. after a trailing '.') are dropped.
title = 'Interstellar (film)'
texts = np.array([f"{title} {t}" for t in sentences if t])
texts

array(['Interstellar (film) Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan',
       'Interstellar (film) It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine',
       'Interstellar (film) Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind',
       'Interstellar (film) Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007',
       'Interstellar (film) Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar',
       'Interstellar (film) Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format

In [None]:
# building index table
pd.set_option('display.max_colwidth', None)  # do not truncate long text columns in DataFrame output

# Embed every chunk with Cohere and stack the embeddings into a 2-D array
response = co.embed(texts=texts.tolist()).embeddings
embeds = np.array(response)
print(embeds.shape)
# Annoy index sized to the embedding dimension, using the 'angular' metric
search_index = AnnoyIndex(embeds.shape[1], 'angular')

# Add all the vectors to the search index; item id i is the position of the chunk in `texts`
for i in range(len(embeds)):
    search_index.add_item(i, embeds[i])

search_index.build(10) # 10 trees
search_index.save('test.ann')

(15, 4096)


True

In [None]:
def search(query):
    """Return the 3 chunks nearest to `query` in the Annoy index, with distances.

    Relies on the module-level `co` (Cohere client), `search_index` (Annoy
    index) and `texts`, which must support indexing with a list of ids
    (e.g. a NumPy array). The matched chunks are also printed.

    Returns:
        A DataFrame with the columns 'texts' and 'distance'.
    """
    # Get the query's embedding
    query_embed = co.embed(texts=[query]).embeddings
    neighbor_ids, neighbor_distances = search_index.get_nns_by_vector(
        query_embed[0], 3, include_distances=True)

    # Format the results
    matched_texts = texts[neighbor_ids]
    results = pd.DataFrame(
        data={
            'texts': matched_texts,
            'distance': neighbor_distances
            }
        )

    print(matched_texts)
    return results

# Example: look up the chunks closest to a question about the film's earnings
query = "How much did the film make?"
search(query)

['Interstellar (film) The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014'
 'Interstellar (film) Interstellar premiered on October 26, 2014, in Los Angeles'
 'Interstellar (film) In the United States, it was first released on film stock, expanding to venues using digital projectors']


Unnamed: 0,texts,distance
0,"Interstellar (film) The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014",1.019055
1,"Interstellar (film) Interstellar premiered on October 26, 2014, in Los Angeles",1.144951
2,"Interstellar (film) In the United States, it was first released on film stock, expanding to venues using digital projectors",1.167268


## 2.3 Rerank
Dense Retrieval is not perfect as the most similar chunk might not be the answer

E.g. **What is the capital of France** has high similarity with **What is the capital of Canada**

Reranking scores each retrieved chunk by its relevance to the question, so the chunk that actually answers it can be moved to the top.

In [None]:
# Retrieve candidates with dense retrieval first; they are the input to reranking
query = "Who is the tallest person in history?"
results = dense_retrieval(query)
# show the candidates in their original (embedding-distance) order
for i, result in enumerate(results):
    print(f"i:{i}")
    print(result.get('title'))
    print(result.get('text'))
    print()

i:0
Robert Wadlow
Robert Pershing Wadlow (February 22, 1918 July 15, 1940), also known as the Alton Giant and the Giant of Illinois, was a man who was the tallest person in recorded history for whom there is irrefutable evidence. He was born and raised in Alton, Illinois, a small city near St. Louis, Missouri.

i:1
Manute Bol
Bol came from a family of extraordinarily tall men and women. He said: "My mother was , my father , and my sister is . And my great-grandfather was even taller—." His ethnic group, the Dinka, and the Nilotic people of which they are a part, are among the tallest populations in the world. Bol's hometown, Turalei, is the origin of other exceptionally tall people, including basketball player Ring Ayuel. "I was born in a village, where you cannot measure yourself," Bol reflected. "I learned I was 7 foot 7 in 1979, when I was grown. I was about 18 or 19."

i:2
Sultan Kösen
Sultan Kösen (born 10 December 1982) is a Turkish farmer who holds the Guinness World Record for 

In [None]:
def rerank_responses(query, responses, num_responses=10):
    """Rerank candidate passages against a query via ``co.rerank``.

    ``co`` is a module-level client created outside this cell
    (presumably the Cohere client -- confirm against the setup cell).

    Args:
        query: The question the passages are scored against.
        responses: The candidate documents, forwarded unchanged.
        num_responses: Forwarded as ``top_n``; defaults to 10.

    Returns:
        Whatever ``co.rerank`` returns, unmodified.
    """
    rerank_kwargs = {
        'model': 'rerank-english-v2.0',
        'query': query,
        'documents': responses,
        'top_n': num_responses,
    }
    return co.rerank(**rerank_kwargs)

# Rerank only the passage bodies of the dense-retrieval hits, then print
# each reranked entry in its new order.
texts = [hit.get('text') for hit in results]
reranked_text = rerank_responses(query, texts)

for i, rerank_result in enumerate(reranked_text):
    print(f"i:{i}\n{rerank_result}\n")

i:0
RerankResult<document['text']: Robert Pershing Wadlow (February 22, 1918 July 15, 1940), also known as the Alton Giant and the Giant of Illinois, was a man who was the tallest person in recorded history for whom there is irrefutable evidence. He was born and raised in Alton, Illinois, a small city near St. Louis, Missouri., index: 0, relevance_score: 0.9734939>

i:1
RerankResult<document['text']: Sultan Kösen (born 10 December 1982) is a Turkish farmer who holds the Guinness World Record for tallest living male at . Of Kurdish ethnicity, he is the seventh tallest man in history., index: 2, relevance_score: 0.8664718>

i:2
RerankResult<document['text']: The Dutch are the tallest people in the world, by nationality, with an average height of for adult males and for adult females in 2009. The average height of young males in the Netherlands increased from 5 feet, 4 inches to approximately 6 feet between the 1850s until the early 2000s. People in the south are on average about shorter 

## 2.4 Text Generation

Feed the retrieved chunk into the LLM as context for answering the question.

In [None]:
text = """
The rapid rise of AI has led to a rapid rise in AI jobs, and many people are building exciting careers in this field. A career is a decades-long journey, and the path is not always straightforward. Over many years, I’ve been privileged to see thousands of students as well as engineers in companies large and small navigate careers in AI. In this and the next few letters, I’d like to share a few thoughts that might be useful in charting your own course.

Three key steps of career growth are learning (to gain technical and other skills), working on projects (to deepen skills, build a portfolio, and create impact) and searching for a job. These steps stack on top of each other:

Initially, you focus on gaining foundational technical skills.
After having gained foundational skills, you lean into project work. During this period, you’ll probably keep learning.
Later, you might occasionally carry out a job search. Throughout this process, you’ll probably continue to learn and work on meaningful projects.
These phases apply in a wide range of professions, but AI involves unique elements. For example:

AI is nascent, and many technologies are still evolving. While the foundations of machine learning and deep learning are maturing — and coursework is an efficient way to master them — beyond these foundations, keeping up-to-date with changing technology is more important in AI than fields that are more mature.
Project work often means working with stakeholders who lack expertise in AI. This can make it challenging to find a suitable project, estimate the project’s timeline and return on investment, and set expectations. In addition, the highly iterative nature of AI projects leads to special challenges in project management: How can you come up with a plan for building a system when you don’t know in advance how long it will take to achieve the target accuracy? Even after the system has hit the target, further iteration may be necessary to address post-deployment drift.
While searching for a job in AI can be similar to searching for a job in other sectors, there are some differences. Many companies are still trying to figure out which AI skills they need and how to hire people who have them. Things you’ve worked on may be significantly different than anything your interviewer has seen, and you’re more likely to have to educate potential employers about some elements of your work.
Throughout these steps, a supportive community is a big help. Having a group of friends and allies who can help you — and whom you strive to help — makes the path easier. This is true whether you’re taking your first steps or you’ve been on the journey for years.

I’m excited to work with all of you to grow the global AI community, and that includes helping everyone in our community develop their careers. I’ll dive more deeply into these topics in the next few weeks.

Last week, I wrote about key steps for building a career in AI: learning technical skills, doing project work, and searching for a job, all of which is supported by being part of a community. In this letter, I’d like to dive more deeply into the first step.

More papers have been published on AI than any person can read in a lifetime. So, in your efforts to learn, it’s critical to prioritize topic selection. I believe the most important topics for a technical career in machine learning are:

Foundational machine learning skills. For example, it’s important to understand models such as linear regression, logistic regression, neural networks, decision trees, clustering, and anomaly detection. Beyond specific models, it’s even more important to understand the core concepts behind how and why machine learning works, such as bias/variance, cost functions, regularization, optimization algorithms, and error analysis.
Deep learning. This has become such a large fraction of machine learning that it’s hard to excel in the field without some understanding of it! It’s valuable to know the basics of neural networks, practical skills for making them work (such as hyperparameter tuning), convolutional networks, sequence models, and transformers.
Math relevant to machine learning. Key areas include linear algebra (vectors, matrices, and various manipulations of them) as well as probability and statistics (including discrete and continuous probability, standard probability distributions, basic rules such as independence and Bayes rule, and hypothesis testing). In addition, exploratory data analysis (EDA) — using visualizations and other methods to systematically explore a dataset — is an underrated skill. I’ve found EDA particularly useful in data-centric AI development, where analyzing errors and gaining insights can really help drive progress! Finally, a basic intuitive understanding of calculus will also help. In a previous letter, I described how the math needed to do machine learning well has been changing. For instance, although some tasks require calculus, improved automatic differentiation software makes it possible to invent and implement new neural network architectures without doing any calculus. This was almost impossible a decade ago.
Software development. While you can get a job and make huge contributions with only machine learning modeling skills, your job opportunities will increase if you can also write good software to implement complex AI systems. These skills include programming fundamentals, data structures (especially those that relate to machine learning, such as data frames), algorithms (including those related to databases and data manipulation), software design, familiarity with Python, and familiarity with key libraries such as TensorFlow or PyTorch, and scikit-learn.
This is a lot to learn! Even after you master everything in this list, I hope you’ll keep learning and continue to deepen your technical knowledge. I’ve known many machine learning engineers who benefitted from deeper skills in an application area such as natural language processing or computer vision, or in a technology area such as probabilistic graphical models or building scalable software systems.

How do you gain these skills? There’s a lot of good content on the internet, and in theory reading dozens of web pages could work. But when the goal is deep understanding, reading disjointed web pages is inefficient because they tend to repeat each other, use inconsistent terminology (which slows you down), vary in quality, and leave gaps. That’s why a good course — in which a body of material has been organized into a coherent and logical form — is often the most time-efficient way to master a meaningful body of knowledge. When you’ve absorbed the knowledge available in courses, you can switch over to research papers and other resources.

Finally, keep in mind that no one can cram everything they need to know over a weekend or even a month. Everyone I know who’s great at machine learning is a lifelong learner. In fact, given how quickly our field is changing, there’s little choice but to keep learning if you want to keep up. How can you maintain a steady pace of learning for years? I’ve written about the value of habits. If you cultivate the habit of learning a little bit every week, you can make significant progress with what feels like less effort.

In the last two letters, I wrote about developing a career in AI and shared tips for gaining technical skills. This time, I’d like to discuss an important step in building a career: project work.

It goes without saying that we should only work on projects that are responsible and ethical, and that benefit people. But those limits leave a large variety to choose from. I wrote previously about how to identify and scope AI projects. This and next week’s letter have a different emphasis: picking and executing projects with an eye toward career development.

A fruitful career will include many projects, hopefully growing in scope, complexity, and impact over time. Thus, it is fine to start small. Use early projects to learn and gradually step up to bigger projects as your skills grow.

When you’re starting out, don’t expect others to hand great ideas or resources to you on a platter. Many people start by working on small projects in their spare time. With initial successes — even small ones — under your belt, your growing skills increase your ability to come up with better ideas, and it becomes easier to persuade others to help you step up to bigger projects.

What if you don’t have any project ideas? Here are a few ways to generate them:

Join existing projects. If you find someone else with an idea, ask to join their project.
Keep reading and talking to people. I come up with new ideas whenever I spend a lot of time reading, taking courses, or talking with domain experts. I’m confident that you will, too.
Focus on an application area. Many researchers are trying to advance basic AI technology — say, by inventing the next generation of transformers or further scaling up language models — so, while this is an exciting direction, it is hard. But the variety of applications to which machine learning has not yet been applied is vast! I’m fortunate to have been able to apply neural networks to everything from autonomous helicopter flight to online advertising, partly because I jumped in when relatively few people were working on those applications. If your company or school cares about a particular application, explore the possibilities for machine learning. That can give you a first look at a potentially creative application — one where you can do unique work — that no one else has done yet.
Develop a side hustle. Even if you have a full-time job, a fun project that may or may not develop into something bigger can stir the creative juices and strengthen bonds with collaborators. When I was a full-time professor, working on online education wasn’t part of my “job” (which was doing research and teaching classes). It was a fun hobby that I often worked on out of passion for education. My early experiences recording videos at home helped me later in working on online education in a more substantive way. Silicon Valley abounds with stories of startups that started as side projects. So long as it doesn’t create a conflict with your employer, these projects can be a stepping stone to something significant.
Given a few project ideas, which one should you jump into? Here’s a quick checklist of factors to consider:

Will the project help you grow technically? Ideally, it should be challenging enough to stretch your skills but not so hard that you have little chance of success. This will put you on a path toward mastering ever-greater technical complexity.
Do you have good teammates to work with? If not, are there people you can discuss things with? We learn a lot from the people around us, and good collaborators will have a huge impact on your growth.
Can it be a stepping stone? If the project is successful, will its technical complexity and/or business impact make it a meaningful stepping stone to larger projects? (If the project is bigger than those you’ve worked on before, there’s a good chance it could be such a stepping stone.)
Finally, avoid analysis paralysis. It doesn’t make sense to spend a month deciding whether to work on a project that would take a week to complete. You'll work on multiple projects over the course of your career, so you’ll have ample opportunity to refine your thinking on what’s worthwhile. Given the huge number of possible AI projects, rather than the conventional “ready, aim, fire” approach, you can accelerate your progress with “ready, fire, aim.”
"""

# Split the article into paragraphs (separated by blank lines), trim the
# surrounding spaces/newlines from each one, and drop anything left empty.
# Stripping happens BEFORE the emptiness check so that whitespace-only
# chunks (e.g. from a trailing blank line) do not survive as empty strings
# and get sent to the embedder.
texts = [chunk.strip(' \n') for chunk in text.split('\n\n')]
texts = np.array([chunk for chunk in texts if chunk])

question = "Are side projects important when you are starting to learn about AI?"

In [None]:
# Build the search index: embed every text chunk, then add each embedding
# to an Annoy index (angular metric) keyed by the chunk's position.
response = co.embed(texts=texts.tolist()).embeddings
embeds = np.array(response)
search_index = AnnoyIndex(embeds.shape[1], 'angular')
for i in range(embeds.shape[0]):
    search_index.add_item(i, embeds[i])
search_index.build(10)  # 10 trees
# Kept as the last expression so its return value is shown as the cell output.
search_index.save('test.ann')

True

In [None]:
def search_article(query, num_results=10):
    """Return the article chunks nearest to ``query`` in the search index.

    The query is embedded with the module-level ``co`` client, its nearest
    neighbours are looked up in the module-level ``search_index``, and the
    returned ids are used to index the module-level ``texts`` array.

    Args:
        query: The question or search string to embed.
        num_results: How many neighbours to request from the index.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        The entries of ``texts`` selected by the neighbour ids, in the
        order the index returned them.
    """
    query_embed = co.embed(texts=[query]).embeddings
    # With include_distances=True the lookup returns (ids, distances);
    # only the ids (element 0) are needed to select the chunks.
    neighbour_ids = search_index.get_nns_by_vector(
        query_embed[0], num_results, include_distances=True)[0]
    return texts[neighbour_ids]

# Try the search with a sample question and show the first chunk returned.
results = search_article("Are side projects a good idea when trying to build a career in AI?")
print(results[0])

Join existing projects. If you find someone else with an idea, ask to join their project.
Keep reading and talking to people. I come up with new ideas whenever I spend a lot of time reading, taking courses, or talking with domain experts. I’m confident that you will, too.
Focus on an application area. Many researchers are trying to advance basic AI technology — say, by inventing the next generation of transformers or further scaling up language models — so, while this is an exciting direction, it is hard. But the variety of applications to which machine learning has not yet been applied is vast! I’m fortunate to have been able to apply neural networks to everything from autonomous helicopter flight to online advertising, partly because I jumped in when relatively few people were working on those applications. If your company or school cares about a particular application, explore the possibilities for machine learning. That can give you a first look at a potentially creative applicatio

In [None]:
def ask_article(question, num_generations=1):
    """Answer a question about the article using the first search hit as context.

    The question is passed to ``search_article``; the first returned chunk
    is placed in a prompt together with the question, and the prompt is
    sent to ``co.generate`` (``co`` is a module-level client created
    outside this cell).

    Args:
        question: The question to answer.
        num_generations: Forwarded to ``co.generate``; defaults to 1.

    Returns:
        The ``generations`` attribute of the ``co.generate`` result.
    """
    # Only the first search hit is used as context for the prompt.
    context = search_article(question)[0]

    prompt = f"""
    Excerpt from the article titled "How to Build a Career in AI"
    by Andrew Ng:
    {context}
    Question: {question}

    Extract the answer of the question from the text provided.
    If the text doesn't contain the answer,
    reply that the answer is not available."""

    generation_kwargs = dict(
        prompt=prompt,
        max_tokens=70,
        model="command-nightly",
        temperature=0.5,
        num_generations=num_generations,
    )
    return co.generate(**generation_kwargs).generations

# Request three generations for the sample question and print each one,
# followed by a '--' separator line.
results = ask_article(
    "Are side projects a good idea when trying to build a career in AI?",
    num_generations=3,
)

for gen in results:
    print(gen, '--', sep='\n')

 The answer is not available.
--
 The answer is not available.
--
 The answer is yes, side projects are a good idea when trying to build a career in AI.
--


# 3) LlamaIndex