In [1]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
import json

In [3]:
from langchain.vectorstores import FAISS
from langchain_groq import ChatGroq
from langchain_community.embeddings import OllamaEmbeddings

In [4]:
path = "../data/Understanding_Climate_Change.pdf"

In [5]:
from PyPDF2 import PdfReader

def read_pdf_to_string(path):
    """
    Read a PDF document from the specified path and return its content as a string.

    Args:
        path (str): The file path to the PDF document.

    Returns:
        str: The concatenated text content of all pages in the PDF document.
    
    The function uses PyPDF2 to open the PDF document, iterate over each page,
    extract the text content from each page, and append it to a single string.
    """
    # Create a PDF reader object
    reader = PdfReader(path)
    content = ""
    
    # Iterate through all pages
    for page in reader.pages:
        # Extract text from the page and append to content
        content += page.extract_text() + "\n"
    
    return content.strip()

In [6]:
def encode_from_string(content, chunk_size=1000, chunk_overlap=200):
    """
    Encodes a string into a vector store using OpenAI embeddings.

    Args:
        content (str): The text content to be encoded.
        chunk_size (int): The size of each chunk of text.
        chunk_overlap (int): The overlap between chunks.

    Returns:
        FAISS: A vector store containing the encoded content.

    Raises:
        ValueError: If the input content is not valid.
        RuntimeError: If there is an error during the encoding process.
    """

    if not isinstance(content, str) or not content.strip():
        raise ValueError("Content must be a non-empty string.")

    if not isinstance(chunk_size, int) or chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")

    if not isinstance(chunk_overlap, int) or chunk_overlap < 0:
        raise ValueError("chunk_overlap must be a non-negative integer.")

    try:
        # Split the content into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
            is_separator_regex=False,
        )
        chunks = text_splitter.create_documents([content])

        # Assign metadata to each chunk
        for chunk in chunks:
            chunk.metadata['relevance_score'] = 1.0

        # Generate embeddings and create the vector store
        embeddings = OllamaEmbeddings(model='nomic-embed-text', show_progress=True)
        vectorstore = FAISS.from_documents(chunks, embeddings)

    except Exception as e:
        raise RuntimeError(f"An error occurred during the encoding process: {str(e)}")

    return vectorstore

In [7]:
content = read_pdf_to_string(path)
vectorstore = encode_from_string(content)
retriever = vectorstore.as_retriever()

  embeddings = OllamaEmbeddings(model='nomic-embed-text', show_progress=True)
OllamaEmbeddings: 100%|██████████| 93/93 [01:37<00:00,  1.05s/it]


In [None]:
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0, max_tokens=4000,api_key='')

In [9]:
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

In [10]:
def get_user_feedback(query, response, relevance, quality, comments=""):
    return {
        "query": query,
        "response": response,
        "relevance": int(relevance),
        "quality": int(quality),
        "comments": comments
    }

In [11]:
def store_feedback(feedback):
    with open("../data/feedback_data.json", "a") as f:
        json.dump(feedback, f)
        f.write("\n")

In [12]:
def load_feedback_data():
    feedback_data = []
    try:
        with open("../data/feedback_data.json", "r") as f:
            for line in f:
                feedback_data.append(json.loads(line.strip()))
    except FileNotFoundError:
        print("No feedback data file found. Starting with empty feedback.")
    return feedback_data

In [13]:
from langchain.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Any, Dict, List


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
class Response(BaseModel):
    answer: str = Field(..., title="The answer to the question. The options can be only 'Yes' or 'No'")

def adjust_relevance_scores(query: str, docs: List[Any], feedback_data: List[Dict[str, Any]]) -> List[Any]:
    # Create a prompt template for relevance checking
    relevance_prompt = PromptTemplate(
        input_variables=["query", "feedback_query", "doc_content", "feedback_response"],
        template="""
        Determine if the following feedback response is relevant to the current query and document content.
        You are also provided with the Feedback original query that was used to generate the feedback response.
        Current query: {query}
        Feedback query: {feedback_query}
        Document content: {doc_content}
        Feedback response: {feedback_response}
        
        Is this feedback relevant? Respond with only 'Yes' or 'No'.
        """
    )
    llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0, max_tokens=4000,api_key='')

    # Create an LLMChain for relevance checking
    relevance_chain = relevance_prompt | llm.with_structured_output(Response)

    for doc in docs:
        relevant_feedback = []
        
        for feedback in feedback_data:
            # Use LLM to check relevance
            input_data = {
                "query": query,
                "feedback_query": feedback['query'],
                "doc_content": doc.page_content[:1000],
                "feedback_response": feedback['response']
            }
            result = relevance_chain.invoke(input_data).answer
            
            if result == 'yes':
                relevant_feedback.append(feedback)

        print(f"Relevant feedback for document '{doc.title}': {relevant_feedback}")
        
        # Adjust the relevance score based on feedback
        if relevant_feedback:
            avg_relevance = sum(f['relevance'] for f in relevant_feedback) / len(relevant_feedback)
            doc.metadata['relevance_score'] *= (avg_relevance / 3)  # Assuming a 1-5 scale, 3 is neutral
    
    # Re-rank documents based on adjusted scores
    return sorted(docs, key=lambda x: x.metadata['relevance_score'], reverse=True)

In [15]:
def fine_tune_index(feedback_data: List[Dict[str, Any]], texts: List[str]) -> Any:
    # Filter high-quality responses
    good_responses = [f for f in feedback_data if f['relevance'] >= 4 and f['quality'] >= 4]
    
    # Extract queries and responses, and create new documents
    additional_texts = []
    for f in good_responses:
        combined_text = f['query'] + " " + f['response']
        additional_texts.append(combined_text)

    # make the list a string
    additional_texts = " ".join(additional_texts)
    
    # Create a new index with original and high-quality texts
    all_texts = texts + additional_texts
    new_vectorstore = encode_from_string(all_texts)
    
    return new_vectorstore

In [16]:
query = "What is the greenhouse effect?"

# Get response from RAG system
response = qa_chain(query)["result"]

  response = qa_chain(query)["result"]
OllamaEmbeddings: 100%|██████████| 1/1 [00:03<00:00,  3.17s/it]


In [17]:
response

'The greenhouse effect is a natural process where greenhouse gases, such as carbon dioxide (CO2), methane (CH4), and nitrous oxide (N2O), trap heat from the sun, keeping the planet warm enough to support life. This effect is essential for life on Earth. However, human activities have intensified this natural process, leading to a warmer climate.'

In [18]:
relevance = 5
quality = 5

# Collect feedback
feedback = get_user_feedback(query, response, relevance, quality)

In [19]:
feedback

{'query': 'What is the greenhouse effect?',
 'response': 'The greenhouse effect is a natural process where greenhouse gases, such as carbon dioxide (CO2), methane (CH4), and nitrous oxide (N2O), trap heat from the sun, keeping the planet warm enough to support life. This effect is essential for life on Earth. However, human activities have intensified this natural process, leading to a warmer climate.',
 'relevance': 5,
 'quality': 5,
 'comments': ''}

In [20]:
store_feedback(feedback)

In [21]:
docs = retriever.get_relevant_documents(query)

  docs = retriever.get_relevant_documents(query)
OllamaEmbeddings: 100%|██████████| 1/1 [00:01<00:00,  1.23s/it]


In [22]:
docs

[Document(metadata={'relevance_score': 1.0}, page_content='The use of synthetic fertilizers in agriculture releases nitrous oxide, a potent greenhouse gas. \nPractices such as precision farming and organic fertilizers can mitigate these emissions. The \ndevelopment of eco -friendly fertilizers and farming techniques is  essential for reducing the \nagricultural sector\'s carbon footprint.  \nChapter 3: Effects of Climate Change  \nThe effects of climate change are already being felt around the world and are projected to \nintensify in the coming decades. These effects include:  \nRising Temperatures  \nGlobal temperatures have risen by about 1.2 degrees Celsius (2.2 degrees Fahrenheit) since \nthe late 19th century. This warming is not uniform, with some regions experiencing more \nsignificant increases than others.  \nHeatwaves  \nHeatwaves are becoming more frequent and severe, posing risks to human health, agriculture, \nand infrastructure. Cities are particularly vulnerable due to 

In [24]:
adjusted_docs = adjust_relevance_scores(query, docs, load_feedback_data())

In [25]:
adjusted_docs

[Document(metadata={'relevance_score': 1.0}, page_content='The use of synthetic fertilizers in agriculture releases nitrous oxide, a potent greenhouse gas. \nPractices such as precision farming and organic fertilizers can mitigate these emissions. The \ndevelopment of eco -friendly fertilizers and farming techniques is  essential for reducing the \nagricultural sector\'s carbon footprint.  \nChapter 3: Effects of Climate Change  \nThe effects of climate change are already being felt around the world and are projected to \nintensify in the coming decades. These effects include:  \nRising Temperatures  \nGlobal temperatures have risen by about 1.2 degrees Celsius (2.2 degrees Fahrenheit) since \nthe late 19th century. This warming is not uniform, with some regions experiencing more \nsignificant increases than others.  \nHeatwaves  \nHeatwaves are becoming more frequent and severe, posing risks to human health, agriculture, \nand infrastructure. Cities are particularly vulnerable due to 

In [26]:
new_vectorstore = fine_tune_index(load_feedback_data(), content)
new_vectorstore

OllamaEmbeddings: 100%|██████████| 94/94 [01:36<00:00,  1.03s/it]


<langchain_community.vectorstores.faiss.FAISS at 0x119ffc550>

In [27]:
retriever = new_vectorstore.as_retriever()

In [None]:
llm = ChatGroq(model="llama-3.3-70b-versatile", temperature=0, max_tokens=4000,api_key='')

In [29]:
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever)

In [30]:
query = "What are reasons of greenhouse effects?"

# Get response from RAG system
response = qa_chain(query)["result"]

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  3.97it/s]


In [31]:
response

'The reasons for the intensified greenhouse effect are primarily driven by human activities, particularly the emission of greenhouse gases. Some of the main reasons include:\n\n1. Burning fossil fuels (such as coal, oil, and natural gas) for energy, which releases large amounts of carbon dioxide (CO2).\n2. The use of synthetic fertilizers in agriculture, which releases nitrous oxide (N2O), a potent greenhouse gas.\n3. Industrial activities and transportation, which also release greenhouse gases.\n\nThese human activities have intensified the natural greenhouse effect, leading to a warmer climate.'