In [1]:
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

True

In [12]:
import os 
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_PROJECT'] = 'advanced-rag'
os.environ['LANGCHAIN_API_KEY'] = os.getenv("LANGCHAIN_API_KEY")
os.environ['GROQ_API_KEY'] = os.getenv("GROQQ_API_KEY")
os.environ["USER_AGENT"] = "my-rag-app/0.1"


In [23]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceBgeEmbeddings



In [14]:
loader = WebBaseLoader(
    web_paths= ("https://lilianweng.github.io/posts/2024-02-05-human-data-quality/",),

    bs_kwargs = dict(
        parse_only = bs4.SoupStrainer(
            class_ = ("post-content", "post-title", "post-header")
        )
    ),
)

docs = loader.load()

In [15]:
print(docs[0].page_content[:500])



      Thinking about High-Quality Human Data
    
Date: February 5, 2024  |  Estimated Reading Time: 20 min  |  Author: Lilian Weng


[Special thank you to Ian Kivlichan for many useful pointers (E.g. the 100+ year old Nature paper “Vox populi”) and nice feedback. 🙏 ]
High-quality data is the fuel for modern data deep learning model training. Most of the task-specific labeled data comes from human annotation, such as classification task or RLHF labeling (which can be constructed as classificat


In [16]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000,
    chunk_overlap = 200,
)
splits = text_splitter.split_documents(docs)


In [24]:
model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

vectorstore = FAISS.from_documents(
    documents = splits,
    embedding = hf_embeddings,
)

retriever = vectorstore.as_retriever()


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [25]:
prompt = hub.pull("rlm/rag-prompt")
llm = ChatGroq(model = "llama3-8b-8192", temperature = 0)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context" : retriever | format_docs,"question":
    RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

print(rag_chain.invoke("What is the difference between data quality and human data quality?"))

Based on the provided context, data quality refers to the quality of the data collected through machine learning techniques, whereas human data quality refers to the quality of data collected through human annotation, which involves attention to details and careful execution.


INDEXING - IN RAG

In [35]:
question = "What food i like the most in mumbai?"
document = "My favourite food in mumbai is vada pav"

In [26]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
loader = WebBaseLoader(
    web_paths = ("https://lilianweng.github.io/posts/2024-02-05-human-data-quality/",),
    bs_kwargs = dict(
        parse_only = bs4.SoupStrainer(
            class_ = ("post-content", "post-title", "post-header")
        )
    ),
)

blog_docs = loader.load()

In [28]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size = 300,
    chunk_overlap = 50
)

splits = text_splitter.split_documents(blog_docs)

In [38]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}  
hf_embeddings = HuggingFaceBgeEmbeddings(
    model_name = model_name,
    model_kwargs = model_kwargs,
    encode_kwargs = encode_kwargs
)

query_result = hf_embeddings.embed_query(question)
document_result = hf_embeddings.embed_documents([document])
len(query_result)

384

In [41]:
import numpy as np

def cosine_similarity(vec1, vec2):
    dot_product = np.dot(vec1, vec2)
    norm_vec1 = np.linalg.norm(vec1)
    norm_vec2 = np.linalg.norm(vec2)
    return dot_product / (norm_vec1 * norm_vec2)

similarity = cosine_similarity(query_result, document_result[0])
print("Cosine similarity:", similarity)

Cosine similarity: 0.8929203021307338


VECTORSTORES

In [42]:
from langchain_community.vectorstores import FAISS
vectorstore = FAISS.from_documents(
    documents = splits,
    embedding = hf_embeddings
)

retriever = vectorstore.as_retriever()

In [43]:
retriever

VectorStoreRetriever(tags=['FAISS', 'HuggingFaceBgeEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x0000025E06416850>, search_kwargs={})

RETRIEVAL

In [44]:
docs = retriever.get_relevant_documents("How do descriptive and prescriptive paradigms impact annotation quality and disagreement in NLP tasks?")
print("Retrieved documents:")
for doc in docs:
    print(doc.page_content[:200])  

  docs = retriever.get_relevant_documents("How do descriptive and prescriptive paradigms impact annotation quality and disagreement in NLP tasks?")


Retrieved documents:
Often there is more than one correct interpretation for some samples. We need diverse perspectives via e.g. having multiple people to review annotation quality.
Disagreement is not always bad. We shou
The descriptive paradigm allows us to understand a number of important effects as well as to account for different perspectives. For example, annotator identity (e.g. African American, LGBTQ) is found
Pros
- Can help to identify which entries are more subjective;- Embrace diversity
- More aligned with standard NLP setup. - Easier to do QC by measuring disagreement or doing label aggregation.


Cons
Correlations between non-expert and expert annotations vary a lot across topics. (Image source: Wang et al. 2023)

Zhang et al. (2023) proposed a taxonomy of rater disagreement to analyze the root cau


In [45]:
len(docs)

4

In [46]:
docs

[Document(id='d89a5588-a0ae-4753-9381-71bd15f4c4d1', metadata={'source': 'https://lilianweng.github.io/posts/2024-02-05-human-data-quality/'}, page_content='Often there is more than one correct interpretation for some samples. We need diverse perspectives via e.g. having multiple people to review annotation quality.\nDisagreement is not always bad. We should reduce disagreements caused by errors or poorly designed process but other disagreements can give us rich information.\n\nIf it is caused by a task not well defined, we should enhance the instruction. However, a more detailed guideline does not resolve innate diversity among opinions.\n\n\nExperts may not always be better than lay people, but they would have a big gap in terms of considering what’s important.\nGround truth annotations can change in time, especially those related to timely events or news.\n\nLater, Rottger et al. (2021) formulated the difference into two contrasting paradigms for data annotation for subjective NLP t