### Retrieval Function

In [None]:
!pip install openai
!pip install scikit-learn
!pip install groq

In [None]:
import subprocess
import urllib.parse
import json

def _retrieve_contexts(base_url, query, k):
    """Fetch passages for ``query`` from a retrieval service and pack them.

    Issues ``GET {base_url}/v1/retrieve?query=...&k=...`` through ``curl`` and
    parses the response as a JSON list of objects, each expected to carry a
    ``"text"`` field.

    Args:
        base_url (str): Scheme, host and port of the service, no trailing slash.
        query (str): Free-text query; URL-encoded before being sent.
        k (int): Number of passages requested from the service.

    Returns:
        str: One ``"Text:\\n<passage>done"`` segment per retrieved passage,
        concatenated. Callers split on ``"done"`` to recover the passages, so
        a passage that itself contains the word "done" will be split too.
        An empty string is returned when the request fails, the response is
        not a JSON list, or no passage has text.
    """
    encoded_query = urllib.parse.quote(query)
    url = f"{base_url}/v1/retrieve?query={encoded_query}&k={k}"
    command = ["curl", "-X", "GET", url, "-H", "accept: */*"]

    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        data = json.loads(result.stdout)
    except subprocess.CalledProcessError as e:
        # curl exited non-zero (e.g. connection refused).
        print("Error:", e.stderr)
        return ""
    except OSError as e:
        # The curl executable itself could not be started.
        print("Error:", e)
        return ""
    except json.JSONDecodeError:
        print("Failed to parse JSON response.")
        return ""

    if not isinstance(data, list):
        print("Unexpected response format: expected a JSON list.")
        return ""

    # Keep every passage, not just the last one, so that ``k`` is honoured.
    texts = (item.get("text") for item in data if isinstance(item, dict))
    return "".join(f"Text:\n{text}done" for text in texts if text)


def execute_curl_request(query, k=3):
    """Retrieve passages from the financial retrieval service on port 8000.

    Args:
        query (str): Free-text query.
        k (int): Number of passages to request.

    Returns:
        str: ``"Text:\\n<passage>done"`` segments concatenated, one per
        passage; an empty string when retrieval fails or yields nothing.
    """
    return _retrieve_contexts("http://localhost:8000", query, k)


def execute_curl_request_legal(query, k=3):
    """Retrieve passages from the retrieval service on port 8001.

    This is the service the notebook queries for the legal (CUAD) dataset.

    Args:
        query (str): Free-text query.
        k (int): Number of passages to request.

    Returns:
        str: ``"Text:\\n<passage>done"`` segments concatenated, one per
        passage; an empty string when retrieval fails or yields nothing.
    """
    return _retrieve_contexts("http://localhost:8001", query, k)



In [None]:
### SET KEYS ###
# Replace the placeholder strings below with real credentials before running.
# NOTE: these assignments overwrite any OPENAI_API_KEY / GROQ_API_KEY value
# already present in the process environment, and the placeholders are
# non-empty, so later "is the key set?" checks will pass even when no real
# key was supplied. Do not commit real keys to version control.
import os
os.environ['OPENAI_API_KEY']='Your OpenAI API Key'
os.environ['GROQ_API_KEY'] = "Your GROQ API Key"

### INFERENCE FUNCTION

In [None]:
import json
from openai import OpenAI

def process_questions_with_context_and_gpt(
    input_json_path,
    output_json_path,
    model="gpt-4o-mini",
    k=3,
    retriever=None,
):
    """
    Processes a JSON file containing questions, retrieves contexts,
    and generates answers using GPT. Outputs the final JSON with question, context,
    and answer pairs.

    Questions whose retrieval or generation raises are reported on stdout and
    left out of the output file; empty questions are skipped silently.

    Args:
        input_json_path (str): Path to the input JSON file with questions and ground truth.
            The file is read as a list of objects with a "question" key.
        output_json_path (str): Path to save the output JSON with questions, contexts, and answers.
        model (str): The GPT model to use for generating responses.
        k (int): Number of contexts to retrieve per question.
        retriever (callable | None): Function called as ``retriever(query=..., k=...)``
            that returns the contexts as one string in which every context is
            terminated by the marker "done". Defaults to ``execute_curl_request``.
    """
    import os

    if retriever is None:
        retriever = execute_curl_request

    # Initialize the OpenAI client
    client = OpenAI()

    # Load the input JSON data
    with open(input_json_path, 'r', encoding='utf-8') as file:
        questions_data = json.load(file)

    output_data = []

    for question_entry in questions_data:
        # `or ""` also covers an explicit null question in the input file.
        question = (question_entry.get("question") or "").strip()
        if not question:
            continue  # Skip if the question is empty

        try:
            # Everything after the final "done" marker is not a context, hence [:-1].
            contexts_raw = retriever(query=question, k=k)
            context_list = contexts_raw.split("done")[:-1]

            # Prepare the prompt for GPT
            context_text = "\n".join(context_list) if context_list else "No context provided."
            prompt = f"I am giving you a query and the necessary context to answer it.\n\nQuery: {question}\n\nContext:\n{context_text} \n be concise, if the answer is not in the context provided print that"

            # Stream the completion and collect the text fragments
            stream = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                stream=True,
            )
            fragments = []
            for chunk in stream:
                # A chunk may arrive without any choices; indexing it would
                # raise and discard the whole answer.
                if not chunk.choices:
                    continue
                delta_content = chunk.choices[0].delta.content
                if delta_content:
                    fragments.append(delta_content)
            response_text = "".join(fragments)

            output_data.append({
                "question": question,
                "contexts": context_list,
                "answer": response_text.strip()
            })

        except Exception as e:
            # Per-question boundary: report and continue with the next question.
            print(f"Error processing question: {question}\n{e}")

    # Make sure the destination directory exists so the results of a long run
    # are not lost at the final write.
    output_dir = os.path.dirname(output_json_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the output data to a JSON file
    with open(output_json_path, 'w', encoding='utf-8') as file:
        json.dump(output_data, file, ensure_ascii=False, indent=4)

    print(f"Processed {len(output_data)} questions. Output saved to {output_json_path}.")



### EVALUATION FUNCTIONS

In [None]:
import os
import re
import json
import string
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Groq client used for the LLM-based similarity scoring.
# Stop immediately if the environment provides no (or an empty) API key.
Groq_API_Key = os.environ.get('GROQ_API_KEY')

if Groq_API_Key is None or Groq_API_Key == "":
    raise ValueError("GROQ_API_KEY is not set.")

from groq import Groq

llm = Groq()

# Cosine similarity function
def calculate_cosine_similarity(text1, text2):
    """Return the cosine similarity of the bag-of-words counts of two texts.

    Args:
        text1 (str): First text.
        text2 (str): Second text.

    Returns:
        float: Similarity between the two count vectors, or 0.0 when either
        text is empty or when no vocabulary can be built from them.
    """
    if not text1 or not text2:  # Handle empty input
        return 0.0
    try:
        counts = CountVectorizer().fit_transform([text1, text2])
    except ValueError:
        # CountVectorizer's default tokenizer ignores one-character tokens, so
        # inputs such as "7" vs "7" produce an empty vocabulary and raise.
        # Treat that as "no measurable similarity" instead of aborting the run.
        return 0.0
    vectors = counts.toarray()
    return float(cosine_similarity(vectors)[0, 1])

# Text normalization function
def normalize_answer(s):
    """Normalize an answer string for token-level comparison.

    Steps, in order: lowercase the text, delete ASCII punctuation, replace the
    standalone articles "a", "an" and "the" with a space, then collapse all
    runs of whitespace into single spaces and trim the ends.
    """
    lowered = s.lower()
    # One pass that deletes every character listed in string.punctuation.
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punctuation)
    return ' '.join(without_articles.split())

# F1 Score calculation function
def f1_score(prediction, ground_truth):
    """Score a predicted answer against the reference answer.

    Both strings are normalized with ``normalize_answer`` and compared as bags
    of whitespace-separated tokens. Cosine similarity is computed on the raw,
    un-normalized strings.

    Args:
        prediction (str): Answer produced by the model.
        ground_truth (str): Reference answer.

    Returns:
        tuple: ``(f1, precision, recall, cosine_similarity)``. The all-zero
        tuple ``(0, 0, 0, 0.0)`` is returned when the answers share no token,
        or when either one normalizes to "yes", "no" or "noanswer" and the two
        normalized answers differ.
    """
    no_match = (0, 0, 0, 0.0)

    pred_norm = normalize_answer(prediction)
    truth_norm = normalize_answer(ground_truth)

    # Yes/no style answers must match exactly; partial overlap earns nothing.
    special_answers = ('yes', 'no', 'noanswer')
    either_is_special = pred_norm in special_answers or truth_norm in special_answers
    if either_is_special and pred_norm != truth_norm:
        return no_match

    pred_tokens = pred_norm.split()
    truth_tokens = truth_norm.split()

    # Multiset intersection counts each shared token at most as often as it
    # appears in both answers.
    shared = Counter(pred_tokens) & Counter(truth_tokens)
    overlap = sum(shared.values())
    if not overlap:
        return no_match

    precision = overlap / len(pred_tokens)
    recall = overlap / len(truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)

    return f1, precision, recall, calculate_cosine_similarity(prediction, ground_truth)


# Load JSON data
def load_json(filepath):
    """Read the UTF-8 encoded JSON file at ``filepath`` and return the parsed data."""
    with open(filepath, 'r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed


In [None]:
##### MULTIHOP FINANCE INFERENCE  #####
import os

# Build the paths with os.path.join: a hard-coded backslash is only a path
# separator on Windows; elsewhere it would be part of the file name.
input_path = os.path.join("datasets", "multihop_finance.json")  # Input JSON file containing questions
output_path = os.path.join("notebooks", "baseline_on_multihop_finance.json")  # Output JSON file

# Process the questions, retrieve contexts, and generate answers
process_questions_with_context_and_gpt(input_path, output_path)


In [None]:
#### MULTIHOP FINANCE EVAL ####

# Paths to the input files
json_file_path = output_path
ground_truth_path = input_path

# Load data
question_chunks_data = load_json(json_file_path)
ground_truth_data = load_json(ground_truth_path)

# Map questions to ground truth answers. Keys are stripped because the
# inference step writes stripped questions to the output file; without this a
# question with surrounding whitespace would never find its ground truth.
ground_truth_map = {entry['question'].strip(): entry['answer'] for entry in ground_truth_data}

# Initialize metrics
total_f1, total_precision, total_recall, total_cos, count = 0, 0, 0, 0, 0

# Evaluate F1, precision, recall, cosine similarity
for question_entry in question_chunks_data:
    question = question_entry.get('question', "")
    ground_truth = ground_truth_map.get(question.strip(), "")
    chunk1_text = question_entry.get('answer', "")

    if chunk1_text:  # Process only if answer exists
        f1, precision, recall, cos = f1_score(chunk1_text, ground_truth)
        total_f1 += f1
        total_precision += precision
        total_recall += recall
        total_cos += cos
        count += 1

# Average metrics
avg_f1 = total_f1 / count if count else 0
avg_precision = total_precision / count if count else 0
avg_recall = total_recall / count if count else 0
avg_cosine_sim = total_cos / count if count else 0

# Print results
print(f"Average F1 Score: {avg_f1:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average Cosine Similarity: {avg_cosine_sim:.4f}")

# Evaluate with LLM
LLM_Score = 0
llm_count = 0  # Number of replies the LLM judge actually scored
for question_entry in question_chunks_data:
    query = question_entry.get('question', "")
    reply = question_entry.get('answer', "")
    answer = ground_truth_map.get(query.strip(), "")

    if query and reply and answer:
        chat_completion = llm.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"""Evaluate the semantic similarity between the following two answers with respect to some question. Output only a single floating-point number between 0 and 1, where 0 indicates no similarity and 1 indicates identical meaning. Respond with only the number:
Question: {query}
Text A: {answer}
Text B: {reply}"""
                }
            ],
            model="llama-3.1-70b-versatile",
            stream=False,
        )

        try:
            score = float(chat_completion.choices[0].message.content.strip())
            LLM_Score += score
            llm_count += 1
        except ValueError:
            print("Invalid LLM response for query:", query)

# Calculate and print average LLM similarity score. Divide by the number of
# scored replies, not by the F1 loop's `count`: entries skipped above or with
# an unparsable judge reply would otherwise drag the average down.
average_llm_score = LLM_Score / llm_count if llm_count else 0
print(f"Average LLM Similarity Score: {average_llm_score:.4f}")


In [None]:
##### NVIDIA  #####
import os

# Build the paths with os.path.join: a hard-coded backslash is only a path
# separator on Windows; elsewhere it would be part of the file name.
input_path = os.path.join("datasets", "singlehop_finance.json")  # Input JSON file containing questions
output_path = os.path.join("notebooks", "baseline_on_singlehop_finance.json")  # Output JSON file

# Process the questions, retrieve contexts, and generate answers
process_questions_with_context_and_gpt(input_path, output_path)

In [None]:
#### NVIDIA EVAL ####

# Paths to the input files
json_file_path = output_path
ground_truth_path = input_path

# Load data
question_chunks_data = load_json(json_file_path)
ground_truth_data = load_json(ground_truth_path)

# Map questions to ground truth answers. Keys are stripped because the
# inference step writes stripped questions to the output file; without this a
# question with surrounding whitespace would never find its ground truth.
ground_truth_map = {entry['question'].strip(): entry['answer'] for entry in ground_truth_data}

# Initialize metrics
total_f1, total_precision, total_recall, total_cos, count = 0, 0, 0, 0, 0

# Evaluate F1, precision, recall, cosine similarity
for question_entry in question_chunks_data:
    question = question_entry.get('question', "")
    ground_truth = ground_truth_map.get(question.strip(), "")
    chunk1_text = question_entry.get('answer', "")

    if chunk1_text:  # Process only if answer exists
        f1, precision, recall, cos = f1_score(chunk1_text, ground_truth)
        total_f1 += f1
        total_precision += precision
        total_recall += recall
        total_cos += cos
        count += 1

# Average metrics
avg_f1 = total_f1 / count if count else 0
avg_precision = total_precision / count if count else 0
avg_recall = total_recall / count if count else 0
avg_cosine_sim = total_cos / count if count else 0

# Print results
print(f"Average F1 Score: {avg_f1:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average Cosine Similarity: {avg_cosine_sim:.4f}")

# Evaluate with LLM
LLM_Score = 0
llm_count = 0  # Number of replies the LLM judge actually scored
for question_entry in question_chunks_data:
    query = question_entry.get('question', "")
    reply = question_entry.get('answer', "")
    answer = ground_truth_map.get(query.strip(), "")

    if query and reply and answer:
        chat_completion = llm.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"""Evaluate the semantic similarity between the following two answers with respect to some question. Output only a single floating-point number between 0 and 1, where 0 indicates no similarity and 1 indicates identical meaning. Respond with only the number:
Question: {query}
Text A: {answer}
Text B: {reply}"""
                }
            ],
            model="llama-3.1-70b-versatile",
            stream=False,
        )

        try:
            score = float(chat_completion.choices[0].message.content.strip())
            LLM_Score += score
            llm_count += 1
        except ValueError:
            print("Invalid LLM response for query:", query)

# Calculate and print average LLM similarity score. Divide by the number of
# scored replies, not by the F1 loop's `count`: entries skipped above or with
# an unparsable judge reply would otherwise drag the average down.
average_llm_score = LLM_Score / llm_count if llm_count else 0
print(f"Average LLM Similarity Score: {average_llm_score:.4f}")


### LEGAL INFERENCE AND EVAL

In [None]:
import json
from openai import OpenAI

def process_questions_with_context_and_gpt_legal(
    input_json_path,
    output_json_path,
    model="gpt-4o-mini",
    k=3,
    retriever=None,
):
    """
    Processes a JSON file containing questions, retrieves contexts from the
    legal retrieval service, and generates answers using GPT. Outputs the final
    JSON with question, context, and answer pairs.

    Questions whose retrieval or generation raises are reported on stdout and
    left out of the output file; empty questions are skipped silently.

    Args:
        input_json_path (str): Path to the input JSON file with questions and ground truth.
            The file is read as a list of objects with a "question" key.
        output_json_path (str): Path to save the output JSON with questions, contexts, and answers.
        model (str): The GPT model to use for generating responses.
        k (int): Number of contexts to retrieve per question.
        retriever (callable | None): Function called as ``retriever(query=..., k=...)``
            that returns the contexts as one string in which every context is
            terminated by the marker "done". Defaults to ``execute_curl_request_legal``.
    """
    import os

    if retriever is None:
        retriever = execute_curl_request_legal

    # Initialize the OpenAI client
    client = OpenAI()

    # Load the input JSON data
    with open(input_json_path, 'r', encoding='utf-8') as file:
        questions_data = json.load(file)

    output_data = []

    for question_entry in questions_data:
        # `or ""` also covers an explicit null question in the input file.
        question = (question_entry.get("question") or "").strip()
        if not question:
            continue  # Skip if the question is empty

        try:
            # Everything after the final "done" marker is not a context, hence [:-1].
            contexts_raw = retriever(query=question, k=k)
            context_list = contexts_raw.split("done")[:-1]

            # Prepare the prompt for GPT
            context_text = "\n".join(context_list) if context_list else "No context provided."
            prompt = f"I am giving you a query and the necessary context to answer it.\n\nQuery: {question}\n\nContext:\n{context_text} \n be concise, if the answer is not in the context provided print that"

            # Stream the completion and collect the text fragments
            stream = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                stream=True,
            )
            fragments = []
            for chunk in stream:
                # A chunk may arrive without any choices; indexing it would
                # raise and discard the whole answer.
                if not chunk.choices:
                    continue
                delta_content = chunk.choices[0].delta.content
                if delta_content:
                    fragments.append(delta_content)
            response_text = "".join(fragments)

            output_data.append({
                "question": question,
                "contexts": context_list,
                "answer": response_text.strip()
            })

        except Exception as e:
            # Per-question boundary: report and continue with the next question.
            print(f"Error processing question: {question}\n{e}")

    # Make sure the destination directory exists so the results of a long run
    # are not lost at the final write.
    output_dir = os.path.dirname(output_json_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save the output data to a JSON file
    with open(output_json_path, 'w', encoding='utf-8') as file:
        json.dump(output_data, file, ensure_ascii=False, indent=4)

    print(f"Processed {len(output_data)} questions. Output saved to {output_json_path}.")



In [None]:
##### CUAD INFERENCE  #####
import os

# Build the paths with os.path.join: a hard-coded backslash is only a path
# separator on Windows; elsewhere it would be part of the file name.
input_path = os.path.join("datasets", "CUAD_LBRAG.json")  # Input JSON file containing questions
output_path = os.path.join("notebooks", "baseline_on_CUAD_LBRAG.json")  # Output JSON file

# Process the questions, retrieve contexts, and generate answers
process_questions_with_context_and_gpt_legal(input_path, output_path)


In [None]:
#### CUAD EVAL ####

# Paths to the input files
json_file_path = output_path
ground_truth_path = input_path

# Load data
question_chunks_data = load_json(json_file_path)
ground_truth_data = load_json(ground_truth_path)

# Map questions to ground truth answers. Keys are stripped because the
# inference step writes stripped questions to the output file; without this a
# question with surrounding whitespace would never find its ground truth.
ground_truth_map = {entry['question'].strip(): entry['answer'] for entry in ground_truth_data}

# Initialize metrics
total_f1, total_precision, total_recall, total_cos, count = 0, 0, 0, 0, 0

# Evaluate F1, precision, recall, cosine similarity
for question_entry in question_chunks_data:
    question = question_entry.get('question', "")
    ground_truth = ground_truth_map.get(question.strip(), "")
    chunk1_text = question_entry.get('answer', "")

    if chunk1_text:  # Process only if answer exists
        f1, precision, recall, cos = f1_score(chunk1_text, ground_truth)
        total_f1 += f1
        total_precision += precision
        total_recall += recall
        total_cos += cos
        count += 1

# Average metrics
avg_f1 = total_f1 / count if count else 0
avg_precision = total_precision / count if count else 0
avg_recall = total_recall / count if count else 0
avg_cosine_sim = total_cos / count if count else 0

# Print results
print(f"Average F1 Score: {avg_f1:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average Cosine Similarity: {avg_cosine_sim:.4f}")

# Evaluate with LLM
LLM_Score = 0
llm_count = 0  # Number of replies the LLM judge actually scored
for question_entry in question_chunks_data:
    query = question_entry.get('question', "")
    reply = question_entry.get('answer', "")
    answer = ground_truth_map.get(query.strip(), "")

    if query and reply and answer:
        chat_completion = llm.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"""Evaluate the semantic similarity between the following two answers with respect to some question. Output only a single floating-point number between 0 and 1, where 0 indicates no similarity and 1 indicates identical meaning. Respond with only the number:
Question: {query}
Text A: {answer}
Text B: {reply}"""
                }
            ],
            model="llama-3.1-70b-versatile",
            stream=False,
        )

        try:
            score = float(chat_completion.choices[0].message.content.strip())
            LLM_Score += score
            llm_count += 1
        except ValueError:
            print("Invalid LLM response for query:", query)

# Calculate and print average LLM similarity score. Divide by the number of
# scored replies, not by the F1 loop's `count`: entries skipped above or with
# an unparsable judge reply would otherwise drag the average down.
average_llm_score = LLM_Score / llm_count if llm_count else 0
print(f"Average LLM Similarity Score: {average_llm_score:.4f}")
