In [1]:
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())

True

In [2]:
import os
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_PROJECT'] = 'advanced-rag'
os.environ['LANGCHAIN_API_KEY'] = os.getenv("LANGCHAIN_API_KEY")
os.environ['GROQ_API_KEY'] = os.getenv("GROQQ_API_KEY")
os.environ["USER_AGENT"] = "my-rag-app/0.1"

In [3]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

In [10]:
loader = WebBaseLoader(
    web_paths= ("https://lilianweng.github.io/posts/2024-02-05-human-data-quality/",),

    bs_kwargs = dict(
        parse_only = bs4.SoupStrainer(
            class_ = ("post-content", "post-title", "post-header")
        )
    ),
)

blog_docs = loader.load()

In [11]:
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size = 300,
    chunk_overlap = 50,
)
splits = text_splitter.split_documents(blog_docs)

In [12]:
model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)
vectorstore = FAISS.from_documents(
    documents = splits,
    embedding = hf_embeddings
)
retriever = vectorstore.as_retriever()

  hf_embeddings = HuggingFaceBgeEmbeddings(


  from .autonotebook import tqdm as notebook_tqdm


In [17]:
from langchain.prompts import ChatPromptTemplate

template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""

prompt_perspectives = ChatPromptTemplate.from_template(template)

generate_queries = (
    prompt_perspectives
    | ChatGroq(model = "llama3-70b-8192",temperature = 0)
    | StrOutputParser()
    | (lambda x: x.split("\n"))
)

In [18]:
generate_queries.invoke("How do descriptive and prescriptive paradigms impact annotation quality and disagreement in NLP tasks?")

['Here are five alternative versions of the user question to retrieve relevant documents from a vector database:',
 '',
 'What are the effects of descriptive and prescriptive approaches on annotation consistency and quality in natural language processing tasks?',
 '',
 'How do different annotation paradigms, such as descriptive and prescriptive, influence the level of agreement among annotators in NLP tasks?',
 '',
 'What is the relationship between annotation paradigm and annotation quality, and how do descriptive and prescriptive approaches compare in NLP tasks?',
 '',
 'In what ways do descriptive and prescriptive annotation methods impact the reliability and validity of annotated data in natural language processing applications?',
 '',
 'Can the choice of annotation paradigm, whether descriptive or prescriptive, affect the degree of inter-annotator agreement and overall quality of annotated data in NLP tasks?',
 '',
 'These alternative questions offer different perspectives on the 

In [19]:
from langchain.load import dumps, loads

def get_unique_union(documents: list[list]):
    """ UNIQUE UNION OF RETRIEVAL DOCS """

    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]

    unique_docs = list(set(flattened_docs))
    return [loads(doc) for doc in unique_docs]





In [20]:
question = "How do descriptive and prescriptive paradigms impact annotation quality and disagreement in NLP tasks?"

retrieval_chain = generate_queries | retriever.map() | get_unique_union 
docs = retrieval_chain.invoke({"question" : question})
len(docs)

  return [loads(doc) for doc in unique_docs]


8

In [21]:
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough

template = """Answer the following question based on this context:

{context}

Question: {question}"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatGroq(model = "llama3-70b-8192", temperature = 0)

final_rag_chain = (
    {"context": retrieval_chain,
     "question": itemgetter("question")}
     | prompt
     | llm
     | StrOutputParser()
)

final_rag_chain.invoke({"question": question})

'According to the text, the descriptive and prescriptive paradigms have different approaches to annotation quality and disagreement in NLP tasks.\n\nThe descriptive paradigm:\n\n* Encourages annotator subjectivity, trying to model many beliefs\n* Embraces diversity and can help identify which entries are more subjective\n* Is more aligned with standard NLP setup and easier to do QC by measuring disagreement or doing label aggregation\n* Pros: can help to identify which entries are more subjective, embrace diversity, more aligned with standard NLP setup, and easier to do QC\n* Cons: metrics like rater disagreement cannot be used to measure data quality or annotator performance, cannot be used for training models that are optimized for outputting one preset behavior, expensive and challenging to create high-quality annotation guidelines, and cannot capture an interpretable diversity of beliefs or consistently encode one specific belief\n\nThe prescriptive paradigm:\n\n* Discourages annot

RAG FUSION

In [11]:
from langchain.prompts import ChatPromptTemplate

template = """You are a helpful assistant that generates multiple search queries based on a single input query. \n
Generate multiple search queries related to: {question} \n
Output (4 queries):"""

prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [12]:
from langchain_core.output_parsers import StrOutputParser
from langchain_groq import ChatGroq

generate_queries = (
    prompt_rag_fusion
    | ChatGroq(model = "llama3-70b-8192", temperature=0)
    | StrOutputParser()
    | (lambda x: x.split("\n"))
)

In [13]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    fused_scores = {}

    for doc in results:
        for rank,doc in enumerate(docs):
            doc_str = dumps(doc)
            if doc_str not in fused_scores:
                fused_scores[doc_str] = 0
            previous_score = fused_scores[doc_str]
            fused_scores[doc_str] += 1/(rank + k)

    reranked_results = [
        (loads(doc), score)
        for doc, score in sorted(fused_scores.items(), key = lambda x:x[1], reverse=True)
    ]
    return reranked_results

retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
docs = retrieval_chain_rag_fusion.invoke({"question": question})
len(docs)

NameError: name 'retriever' is not defined

In [29]:
from langchain_core.runnables import RunnablePassthrough

template = """Answer the following question based on this context:

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    {"context" : retrieval_chain_rag_fusion,
     "question" : itemgetter("question")
    }
    | prompt
    |llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question" : question})

'According to the text, the descriptive and prescriptive paradigms have different approaches to annotation quality and disagreement in NLP tasks.\n\nThe descriptive paradigm:\n\n* Encourages annotator subjectivity, trying to model many beliefs\n* Embraces diversity and can help identify which entries are more subjective\n* Is more aligned with standard NLP setup and easier to do quality control by measuring disagreement or doing label aggregation\n* Pros: can help identify subjective entries, embraces diversity, and is easier to do QC\n* Cons: metrics like rater disagreement cannot be used to measure data quality or annotator performance, and cannot be used for training models that are optimized for outputting one preset behavior\n\nThe prescriptive paradigm:\n\n* Discourages annotator subjectivity, trying to consistently apply one belief\n* Is more aligned with standard NLP setup and easier to do quality control by measuring disagreement or doing label aggregation\n* Pros: more aligne

DECOMPOSITION

In [2]:
from langchain.prompts import ChatPromptTemplate

template = """You are a helpful assistant that generates multiple sub-questions related to an input question. \n
The goal is to break down the input into a set of sub-problems / sub-questions that can be answers in isolation. \n
Generate multiple search queries related to: {question} \n
Output (3 queries):"""

prompt_decomposition = ChatPromptTemplate.from_template(template)

In [None]:
from langchain_groq import ChatGroq
from langchain_core.output_parsers import StrOutputParser

llm = ChatGroq(model = "llama3-70b-8192", temperature = 0)

generate_queries_decomposition = (
    prompt_decomposition
    | llm
    | StrOutputParser()
    | (lambda x: x.split("\n"))
)

question = "How do descriptive and prescriptive paradigms impact annotation quality and disagreement in NLP tasks?"

questions = generate_queries_decomposition.invoke({"question": question})

In [8]:
questions

['Here are three sub-questions related to the input question:',
 '',
 '**Query 1:** ',
 'What are the fundamental differences between descriptive and prescriptive annotation paradigms in NLP, and how do these differences influence annotator behavior and decision-making?',
 '',
 '**Query 2:** ',
 'How do descriptive and prescriptive annotation approaches affect the consistency and reliability of annotations, particularly in tasks that involve subjective judgments, such as sentiment analysis or entity recognition?',
 '',
 '**Query 3:** ',
 'What are the implications of descriptive and prescriptive paradigms on inter-annotator agreement and disagreement in NLP tasks, and how can these effects be mitigated through annotation guidelines, training, or other strategies?',
 '',
 'These sub-questions can help to break down the original question into more manageable and specific topics, allowing for a more focused exploration of the impact of descriptive and prescriptive paradigms on annotation 

In [9]:
template = """Here is the question you need to answer:

\n --- \n {question} \n --- \n

Here is any available background question + answer pairs:

\n --- \n {q_a_pairs} \n --- \n

Here is additional context relevant to the question: 

\n --- \n {context} \n --- \n

Use the above context and any background question + answer pairs to answer the question: \n {question}
"""

decomposition_prompt = ChatPromptTemplate.from_template(template)

In [17]:
from operator import itemgetter
from langchain_core.output_parsers import StrOutputParser

def format_qa_pair(question,answer):

    formatted_string = ""
    formatted_string += f"Question: {question}\n Anaswer: {answer}\n\n"
    return formatted_string.strip()

llm = ChatGroq(model = "llama3-70b-8192", temperature = 0)

q_a_pairs = ""

for q in questions:

    rag_chain = (
        {"context": itemgetter("question") | retriever,
         "question": itemgetter("question"),
         "q_a_pairs": itemgetter("q_a_pairs")
        }
        | decomposition_prompt
        | llm
        | StrOutputParser()
    )

    answer = rag_chain.invoke({"question": q, "q_a_pairs":q_a_pairs})
    q_a_pair = format_qa_pair(q,answer)
    q_a_pairs = q_a_pairs + "\n---\n" + q_a_pair

In [18]:
answer

"I'm happy to help! Based on the provided context and background information, I understand that the original question is related to the impact of descriptive and prescriptive paradigms on annotation quality and disagreement in NLP tasks.\n\nThe sub-questions mentioned are not explicitly provided, but based on the context, I can infer that they might be related to the following topics:\n\n1. What are the fundamental differences between descriptive and prescriptive annotation paradigms in NLP, and how do these differences influence annotator behavior and decision-making?\n2. How do descriptive and prescriptive annotation approaches affect the consistency and reliability of annotations, particularly in tasks that involve subjective judgments, such as sentiment analysis or entity recognition?\n3. What are the implications of descriptive and prescriptive paradigms on inter-annotator agreement and disagreement in NLP tasks, and how can these effects be mitigated through annotation guidelines

In [19]:
print("Running for:", q)
print("Context:", rag_chain.invoke({"question": q, "q_a_pairs": q_a_pairs}))


Running for: These sub-questions can help to break down the original question into more manageable and specific topics, allowing for a more focused exploration of the impact of descriptive and prescriptive paradigms on annotation quality and disagreement in NLP tasks.
Context: I'm happy to help! Based on the provided context and background information, I understand that the original question is related to the impact of descriptive and prescriptive paradigms on annotation quality and disagreement in NLP tasks.

The sub-questions mentioned are not explicitly provided, but based on the context, I can infer that they might be related to the following topics:

1. What are the fundamental differences between descriptive and prescriptive annotation paradigms in NLP, and how do these differences influence annotator behavior and decision-making?
2. How do descriptive and prescriptive annotation approaches affect the consistency and reliability of annotations, particularly in tasks that involve 

ANSWER INDIVIDUALLY

In [20]:
from langchain import hub
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough,RunnableLambda
from langchain_core.output_parsers import StrOutputParser
from langchain_groq import ChatGroq

In [21]:
prompt_rag = hub.pull("rlm/rag-prompt")

def retrieve_and_rag(question,prompt_rag,sub_question_generator_chain):

    sub_questions = sub_question_generator_chain.invoke({"question":question})

    rag_results = []

    for sub_question in sub_questions:

        retrieved_docs = retriever.get_relevant_documents
        (sub_question)

        answer = (
            prompt_rag
            | llm
            | StrOutputParser()
        ).invoke

        ({"context": retrieved_docs,
          "question": sub_question
        })
        rag_results.append(answer)
    return rag_results,sub_questions

answers, questions = retrieve_and_rag(question,prompt_rag,generate_queries_decomposition)

In [22]:
def format_qa_pairs(questions,answers):

    formatted_string = ""
    for i, (question,answer) in enumerate(zip(questions,answers),start = 1):
        formatted_string += f"Question {i}: {question}\n Answer {i}: {answer}\n\n"
    return formatted_string.strip()

context = format_qa_pairs(questions,answers)

template =  """Here is a set of Q+A pairs:

{context}

Use these to synthesize an answer to the question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

final_rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"context":context,"question":question})

'Based on the provided Q&A pairs, it appears that the descriptive and prescriptive paradigms have significant implications for annotation quality and disagreement in NLP tasks.\n\nThe descriptive paradigm, which focuses on describing the data as it is, tends to lead to more subjective and variable annotations. This is because annotators are not provided with clear guidelines or expectations, resulting in a wider range of possible interpretations and annotations. On the other hand, the prescriptive paradigm, which provides clear guidelines and expectations, tends to lead to more consistent and objective annotations.\n\nThe impact of these paradigms on annotation quality is significant. Descriptive paradigms can lead to lower-quality annotations due to the variability and subjectivity of the annotations. In contrast, prescriptive paradigms can lead to higher-quality annotations due to the consistency and objectivity of the annotations.\n\nIn terms of disagreement, the descriptive paradig

STEP - BACK IN RAG

In [16]:
from langchain_core.prompts import ChatPromptTemplate,FewShotChatMessagePromptTemplate



examples = [
    {
        "input": "Could the members of The Police perform lawful arrests?",
        "output": "what can the members of The Police do?",
    },
    {
        "input": "Jan Sindel’s was born in what country?",
        "output": "what is Jan Sindel’s personal history?",
    },
]

example_prompt = ChatPromptTemplate.from_messages(
    [
        ("human", "{input}"),
        ("ai", "{output}"),
    ]
)

few_shot_prompt = FewShotChatMessagePromptTemplate(
    example_prompt=example_prompt,
    examples = examples,
)

prompt = ChatPromptTemplate.from_messages(
    [
        (
          "system",
            """You are an expert at world knowledge. Your task is to step back and paraphrase a question to a more generic step-back question, which is easier to answer. Here are a few examples:""",  
        ),
        few_shot_prompt,
        ("user", "{question}"),
    ]
)

In [17]:
generate_queries_step_back = prompt | ChatGroq(model = "llama3-70b-8192", temperature = 0) | StrOutputParser()

question = "How do descriptive and prescriptive paradigms impact annotation quality and disagreement in NLP tasks?"

generate_queries_step_back.invoke({"question": question})


'how do different approaches to language analysis affect the results of NLP tasks?'

In [19]:
from langchain_core.runnables import RunnableLambda


response_prompt_template = """You are an expert of world knowledge. I am going to ask you a question. Your response should be comprehensive and not contradicted with the following context if they are relevant. Otherwise, ignore them if they are not relevant.

# {normal_context}
# {step_back_context}

# Original Question: {question}
# Answer:"""

response_prompt = ChatPromptTemplate.from_template(response_prompt_template)

chain = (
    {
        "normal_context": RunnableLambda(lambda x: x["question"]) | retriever,
        "step_back_context": generate_queries_step_back | retriever,
        "question": RunnableLambda(lambda x: x["question"]),
    }
    | response_prompt
    | ChatGroq(model = "llama3-70b-8192", temperature = 0)
    | StrOutputParser()
)

chain.invoke({"question": question})

'The descriptive and prescriptive paradigms are two contrasting approaches to data annotation for subjective NLP tasks. The choice of paradigm significantly impacts annotation quality and disagreement in NLP tasks.\n\n**Descriptive Paradigm:**\nIn the descriptive paradigm, annotators are encouraged to express their subjective opinions, and the goal is to model many beliefs. This approach:\n\n1. **Embraces diversity**: By allowing annotators to express their individual perspectives, the descriptive paradigm captures a diverse range of opinions.\n2. **Identifies subjective entries**: This approach helps identify which entries are more subjective, which is essential in tasks where opinions vary widely.\n3. **Aligns with standard NLP setup**: The descriptive paradigm is more aligned with standard NLP setup, making it easier to measure disagreement and perform label aggregation.\n4. **Easier quality control**: Measuring disagreement and performing label aggregation are more straightforward 

HYDE

In [20]:
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

template = """Please write a scientific paper passage to answer the question
Question: {question}
Passage:"""

prompt_hyde = ChatPromptTemplate.from_template(template)

generate_docs_for_retrieval = (
    prompt_hyde
    | ChatGroq(model = "llama3-70b-8192", temperature = 0)
    | StrOutputParser()
)

question = "How do descriptive and prescriptive paradigms impact annotation quality and disagreement in NLP tasks?"

generate_docs_for_retrieval.invoke({"question": question}) 


'Here is a scientific paper passage that answers the question:\n\n**Title:** The Impact of Descriptive and Prescriptive Paradigms on Annotation Quality and Disagreement in NLP Tasks\n\n**Abstract:** Annotation is a crucial step in Natural Language Processing (NLP) tasks, as high-quality annotations are essential for training accurate models. However, annotation quality can be influenced by the paradigm adopted by annotators, which can be either descriptive or prescriptive. In this study, we investigate the impact of these two paradigms on annotation quality and disagreement in NLP tasks. Our results show that descriptive annotations, which focus on describing the natural language data, tend to exhibit higher quality and lower disagreement rates compared to prescriptive annotations, which aim to enforce specific guidelines or rules. We analyze the underlying causes of these differences and discuss their implications for NLP research and practice.\n\n**Introduction:** Annotation is a tim

In [21]:
retrieval_chain = generate_docs_for_retrieval | retriever
retrieved_docs = retrieval_chain.invoke({"question": question})
retrieved_docs

[Document(id='af6986a9-0fa3-4a9c-b148-31eb7cc1f0f3', metadata={'source': 'https://lilianweng.github.io/posts/2024-02-05-human-data-quality/'}, page_content='Often there is more than one correct interpretation for some samples. We need diverse perspectives via e.g. having multiple people to review annotation quality.\nDisagreement is not always bad. We should reduce disagreements caused by errors or poorly designed process but other disagreements can give us rich information.\n\nIf it is caused by a task not well defined, we should enhance the instruction. However, a more detailed guideline does not resolve innate diversity among opinions.\n\n\nExperts may not always be better than lay people, but they would have a big gap in terms of considering what’s important.\nGround truth annotations can change in time, especially those related to timely events or news.\n\nLater, Rottger et al. (2021) formulated the difference into two contrasting paradigms for data annotation for subjective NLP t

In [23]:
llm = ChatGroq(model = "llama3-70b-8192", temperature = 0)

template =  """Answer the following question based on this context:

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

final_hyde_rag_chain = (
    prompt
    | llm
    | StrOutputParser()
)

final_hyde_rag_chain.invoke({"context": retrieved_docs, "question": question})

'According to the provided context, the descriptive and prescriptive paradigms have different impacts on annotation quality and disagreement in NLP tasks.\n\nThe **Descriptive Paradigm**:\n\n* Encourages annotator subjectivity, trying to model many beliefs.\n* Helps to identify which entries are more subjective.\n* Embraces diversity.\n* Is more aligned with standard NLP setup.\n* Makes it easier to do quality control (QC) by measuring disagreement or doing label aggregation.\n\nThe **Prescriptive Paradigm**:\n\n* Discourages annotator subjectivity, trying to consistently apply one belief.\n* Is more aligned with standard NLP setup.\n* Makes it easier to do QC by measuring disagreement or doing label aggregation.\n\nIn terms of disagreement, the descriptive paradigm acknowledges that disagreement is not always bad and can provide rich information, whereas the prescriptive paradigm aims to reduce disagreements caused by errors or poorly designed processes.\n\nAdditionally, the context h