In [1]:
import os
import pandas as pd
from langchain_groq import ChatGroq
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import create_retrieval_chain
from langchain_community.vectorstores import FAISS
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.schema import Document
from dotenv import load_dotenv
import json

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Load environment variables (load_dotenv pulls them from a local .env file, if present)
load_dotenv()

groq_api_key = os.getenv('GROQ_API_KEY')
google_api_key = os.getenv("GOOGLE_API_KEY")

# Fail fast with a readable message: os.environ only accepts str values, so
# assigning the None returned by os.getenv() for an unset variable would raise
# an opaque TypeError instead of telling the user which key is missing.
_missing_keys = [
    name
    for name, value in (("GROQ_API_KEY", groq_api_key), ("GOOGLE_API_KEY", google_api_key))
    if not value
]
if _missing_keys:
    raise EnvironmentError(
        f"Missing required environment variable(s): {', '.join(_missing_keys)}"
    )

# Re-export the Google key through the process environment.
os.environ["GOOGLE_API_KEY"] = google_api_key

# Initialize the LLM (the prompt is defined in the next cell)
llm = ChatGroq(
    groq_api_key=groq_api_key,
    model_name="Llama3-8b-8192"
)

# The accuracy evaluation is run on the first 50 questions of the dataset to check the accuracy of the retrieval.

In [None]:
# Prompt used by the retrieval chain.
#   {context} - placeholder for the retrieved documents (filled in by the
#               stuff-documents chain built in evaluate_rag)
#   {input}   - the question passed to retrieval_chain.invoke({'input': ...})
# Doubled braces ({{ }}) are template escapes that render as literal JSON braces.
# NOTE(review): the "cannot find" fallback below is plain text, not JSON, so the
# caller's json.loads() will report it as a parse failure - confirm this is intended.
prompt = ChatPromptTemplate.from_template(
    """
    You are a CSV assistant. Your task is to retrieve the correct translation for a given question based on the context.
    Respond in JSON format with the following structure:
    {{
        "translation": "<retrieved_translation>"
    }}
    If you cannot find the translation, respond with "I cannot find the translation."
    <context>
    {context}
    </context>
    Question: {input}
    """
)

# Function to process the CSV and embed question and translation columns
def process_csv(csv_file):
    """Load a question/translation CSV and index its rows in a FAISS store.

    Each row becomes one Document whose text holds both the question and its
    translation; the row's DataFrame index is stored in the document metadata
    under "row_index".

    Args:
        csv_file: Path (or any input accepted by ``pd.read_csv``) of the CSV.

    Returns:
        A ``(data, vectors)`` tuple: the DataFrame read from the CSV and the
        FAISS vector store built from its rows.

    Raises:
        ValueError: If the CSV lacks a 'question' or a 'translation' column.
    """
    data = pd.read_csv(csv_file)

    # Both columns are mandatory: they are combined into the embedded text.
    required_columns = ("question", "translation")
    if not all(column in data.columns for column in required_columns):
        raise ValueError("CSV must contain 'question' and 'translation' columns.")

    # One Document per CSV row, pairing the question with its translation.
    documents = []
    for i, row in data.iterrows():
        text = f"Question: {row['question']}\nTranslation: {row['translation']}"
        documents.append(Document(page_content=text, metadata={"row_index": i}))

    # Embedding model used to vectorise the documents.
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    # Break long documents into overlapping chunks before indexing.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    final_documents = splitter.split_documents(documents)

    # Embed the chunks and build the similarity index.
    vectors = FAISS.from_documents(final_documents, embeddings)

    return data, vectors

# Function to evaluate RAG using the translation column
def evaluate_rag(data, vectors, max_questions=50):
    """Measure how often the RAG pipeline returns the exact expected translation.

    The first ``max_questions`` rows of ``data`` are sent through a retrieval
    chain built from ``vectors`` and the module-level ``llm`` and ``prompt``.
    The model's answer is parsed as JSON and its "translation" value is
    compared (case-insensitively, ignoring surrounding whitespace) with the
    row's 'translation' column. Unparseable answers count as incorrect.

    Args:
        data: DataFrame with 'question' and 'translation' columns.
        vectors: Vector store exposing ``as_retriever()``.
        max_questions: Upper bound on the number of rows evaluated; ``None``
            evaluates every row. Defaults to 50.

    Returns:
        A ``(accuracy, correct_answers, total_questions)`` tuple where
        ``accuracy`` is a percentage and ``total_questions`` is the number of
        rows actually evaluated (0.0 accuracy when no rows were evaluated).
    """
    # Create chains
    document_chain = create_stuff_documents_chain(llm, prompt)
    retriever = vectors.as_retriever()
    retrieval_chain = create_retrieval_chain(retriever, document_chain)

    # Evaluate exactly the rows that are counted in the denominator, so the
    # accuracy can never exceed 100% and short datasets are scored fairly.
    sample = data if max_questions is None else data.head(max(max_questions, 0))
    total_questions = len(sample)
    correct_answers = 0

    # Number questions by position so a non-integer index cannot break the log.
    for position, (_, row) in enumerate(sample.iterrows(), start=1):
        question = row['question']
        # str() guards against non-string cells (e.g. NaN for an empty field).
        expected_translation = str(row['translation'])

        # Get response from the RAG pipeline
        response = retrieval_chain.invoke({'input': question})
        answer = response['answer']

        # Parse JSON response to extract the retrieved translation
        try:
            response_data = json.loads(answer)
            if not isinstance(response_data, dict):
                # Valid JSON but not an object, so there is no 'translation' key.
                raise KeyError("translation")
            raw_translation = response_data.get("translation", "")
        except (json.JSONDecodeError, TypeError, KeyError):
            print(f"Failed to parse response or missing 'translation' key: {answer}")
            print("--------------------------------------------")
            continue

        # A JSON null or non-string value must not crash the comparison below.
        retrieved_translation = "" if raw_translation is None else str(raw_translation)

        print(f"Question {position}: {question}")
        print(f"Expected Translation: {expected_translation}")
        print(f"Retrieved Translation: {retrieved_translation}")
        print("--------------------------------------------")

        # Compare expected and retrieved translation
        if expected_translation.strip().lower() == retrieved_translation.strip().lower():
            correct_answers += 1

    # Calculate accuracy (guard against an empty evaluation set)
    accuracy = (correct_answers / total_questions) * 100 if total_questions else 0.0
    return accuracy, correct_answers, total_questions

if __name__ == "__main__":
    # The dataset path is fixed here; swap in the prompt below to ask for it
    # interactively instead.
    # csv_file = input("Enter the path to your CSV file: ")
    csv_file = 'bhagwatgita_queeng.csv'

    if os.path.exists(csv_file):
        try:
            # Build the vector index from the CSV rows.
            data, vectors = process_csv(csv_file)

            # Score the RAG pipeline against the expected translations.
            accuracy, correct_answers, total_questions = evaluate_rag(data, vectors)

            # Summary report.
            print("\nEvaluation Complete!")
            print(f"Accuracy: {accuracy:.2f}%")
            print(f"Correct Answers: {correct_answers}")
            print(f"Total Questions: {total_questions}")
        except Exception as e:
            # Any failure is reported as a single message rather than a traceback.
            print(f"Error: {e}")
    else:
        # Nothing to evaluate without the dataset.
        print("File not found. Please provide a valid file path.")


Question 1: How does the Gita start?
Expected Translation: Dhritarashtra said, "What did my people and the sons of Pandu do when they had assembled together, eager for battle, on the holy plain of Kurukshetra, O Sanjaya?"
Retrieved Translation: Dhritarashtra said, "What did my people and the sons of Pandu do when they had assembled together, eager for battle, on the holy plain of Kurukshetra, O Sanjaya?"
--------------------------------------------
Question 2: In the Mahabharata war, whom did Duryodhana first talk to?
Expected Translation: Sanjaya said: Having seen the army of the Pandavas drawn up in battle array, King Duryodhana approached his teacher, Drona, and spoke these words.
Retrieved Translation: Sanjaya said: Having seen the army of the Pandavas drawn up in battle array, King Duryodhana approached his teacher, Drona, and spoke these words.
--------------------------------------------
Question 3: What did Duryodhana say to his teacher, Drona?
Expected Translation: Behold, O T