In [None]:
!pip install -r requirements.txt
!pip install -U spacy
!python -m spacy download en_core_web_sm

In [None]:
import subprocess
import urllib
from dragin import AttnWeightRAG, model
import nltk
import os
import re
import json
import string
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from groq import Groq

In [None]:
# Do not clobber a key that is already exported in the environment. The
# placeholder is only a fallback: replace it locally and never commit a real key.
# Written as a statement (not an expression) so the key is never echoed as
# cell output.
if not os.environ.get('GROQ_API_KEY'):
    os.environ['GROQ_API_KEY'] = "YOUR-GROQ-API-KEY"

In [None]:
# Download two NLTK resource packages by id. None of the visible cells call
# NLTK directly, so these are presumably required by the imported `dragin`
# code — TODO confirm.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
def execute_curl_request(query, k=2):
    """Query the local retrieval service through ``curl`` and return its raw reply.

    Args:
        query: Free-text search query; it is percent-encoded before being
            placed in the URL.
        k: Number of results requested (sent as the ``k`` URL parameter).

    Returns:
        ``curl``'s captured stdout as a string on success. If ``curl`` exits
        with a non-zero status, the captured stderr is printed and returned
        instead, so callers may receive a string that is not a service reply.
    """
    # `import urllib` alone does not guarantee that the `urllib.parse`
    # submodule is loaded; import the helper explicitly instead of relying on
    # some other package having imported it first.
    from urllib.parse import quote

    url = f"http://localhost:8000/v1/retrieve?query={quote(query)}&k={k}"

    # Passing the command as a list avoids any shell interpretation of the URL.
    command = ["curl", "-X", "GET", url, "-H", "accept: */*"]

    try:
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        return result.stdout
    except subprocess.CalledProcessError as e:
        print("Error:", e.stderr)
        return e.stderr

In [None]:
def remove_non_ascii(text):
    """Return ``text`` with every character outside the 7-bit ASCII range dropped."""
    kept = []
    for symbol in text:
        # Code points 0-127 are ASCII; anything higher is discarded.
        if ord(symbol) <= 127:
            kept.append(symbol)
    return ''.join(kept)

def retrieve(self, query, topk=1, max_query_length=64):
    """Fetch ``topk`` passages for ``query`` from the retrieval service.

    Written as a method: it takes ``self`` and increments
    ``self.counter.retrieve`` on every call.

    Args:
        query: Query string forwarded to ``execute_curl_request``.
        topk: Number of documents to request.
        max_query_length: Accepted for signature compatibility; not used here.

    Returns:
        A list with the ``text`` field of each returned document, with
        non-ASCII characters removed.

    Raises:
        ValueError: If the service reply is not valid JSON (for example when
            ``execute_curl_request`` returned curl's error text).
    """
    self.counter.retrieve += 1
    raw = execute_curl_request(query=query, k=topk)
    # Parse the reply as JSON instead of eval()-ing it: eval would execute
    # arbitrary code coming from the network, breaks on JSON `true`/`false`,
    # and the old "null" -> "None" text replacement also rewrote document text
    # that merely contained the substring "null".
    try:
        docs = json.loads(raw)
    except json.JSONDecodeError as exc:
        raise ValueError(
            f"Retrieval service returned a non-JSON response: {raw[:200]!r}"
        ) from exc
    return [remove_non_ascii(doc['text']) for doc in docs]
# Replace the `retrieve` attribute on the imported `model` object with the
# notebook-level `retrieve` function, bound to `model` through the descriptor
# protocol so that `model.retrieve(...)` calls it with `self` set to `model`.
model.retrieve = retrieve.__get__(model, AttnWeightRAG)
# Example call (left disabled): model.retrieve("What is NVIDIA", topk=5)

def generate_dragin_output(question):
    """Run ``model.inference`` on one question and return its two results.

    The model is called with the question, an empty dict as the second
    argument, and a ``Question: ...\\nAnswer:`` prompt built from the question.

    Returns:
        The ``(output, contexts)`` pair produced by ``model.inference``.
    """
    prompt = f"Question: {question}\nAnswer:"
    answer_text, retrieved_contexts = model.inference(question, {}, prompt)
    return answer_text, retrieved_contexts

In [None]:
# Cosine similarity function
def calculate_cosine_similarity(text1, text2):
    """Bag-of-words cosine similarity between two texts.

    Both texts are vectorized together with a default ``CountVectorizer`` and
    the cosine between the two resulting count vectors is returned.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The similarity as a float, or ``0.0`` when either text is empty/None
        or when the vectorizer cannot extract any token from the pair.
    """
    if not text1 or not text2:  # Handle empty input
        return 0.0
    try:
        counts = CountVectorizer().fit_transform([text1, text2])
    except ValueError:
        # CountVectorizer raises ValueError (empty vocabulary) when neither
        # text yields a token under its default tokenization, e.g. the
        # single-character answers "5" vs "5". Treat that as no measurable
        # similarity instead of aborting the whole evaluation.
        return 0.0
    return float(cosine_similarity(counts.toarray())[0, 1])

# Text normalization function
def normalize_answer(s):
    """Canonicalize an answer string for token-level comparison.

    Steps, in order: lowercase, delete punctuation, replace the articles
    a/an/the with a space, then collapse whitespace.
    """
    lowered = s.lower()
    # Delete every character listed in string.punctuation.
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    # Whole-word articles become a space so neighbouring words stay separated.
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', without_punct)
    # split()/join collapses whitespace runs and trims both ends.
    return ' '.join(without_articles.split())

# F1 Score calculation function
def f1_score(prediction, ground_truth):
    """Score a predicted answer against a reference answer.

    Token overlap is measured on the normalized texts; cosine similarity is
    computed on the raw, un-normalized texts.

    Returns:
        A tuple ``(f1, precision, recall, cosine_sim)``. ``(0, 0, 0, 0.0)`` is
        returned when a yes/no/noanswer response does not match exactly, or
        when the two answers share no token.
    """
    no_match = (0, 0, 0, 0.0)  # last slot is the cosine-similarity default
    binary_answers = ('yes', 'no', 'noanswer')

    pred_norm = normalize_answer(prediction)
    gold_norm = normalize_answer(ground_truth)

    # Binary-style answers only count when both sides agree exactly.
    either_is_binary = pred_norm in binary_answers or gold_norm in binary_answers
    if either_is_binary and pred_norm != gold_norm:
        return no_match

    pred_tokens = pred_norm.split()
    gold_tokens = gold_norm.split()

    # Multiset intersection: each shared token counts as often as it occurs
    # on both sides.
    overlap = Counter(pred_tokens) & Counter(gold_tokens)
    shared = sum(overlap.values())
    if not shared:
        return no_match

    precision = shared / len(pred_tokens)
    recall = shared / len(gold_tokens)
    f1 = (2 * precision * recall) / (precision + recall)

    return f1, precision, recall, calculate_cosine_similarity(prediction, ground_truth)


# Load JSON data
def load_json(filepath):
    """Read the UTF-8 encoded JSON file at ``filepath`` and return the parsed value."""
    with open(filepath, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

In [None]:
import json
import tqdm

# Generate an answer for every question in the dataset and save the answers
# and the contexts returned alongside them as two JSON files under results/.

ground_truth_path = '<your dataset path>'

with open("config.json", 'r', encoding='utf-8') as f:
    config = json.load(f)

# Flatten the dataset path and the model name into tags usable in a file name.
gtp = ground_truth_path.replace('/', '_').replace('-', '_')
mnp = config["model_name_or_path"].replace('/', '_').replace('-', '_')
answers_save_path = f'results/{mnp}_{gtp}_answers.json'
all_contexts_save_path = f'results/{mnp}_{gtp}_contexts.json'

# Create the output directory before the generation loop, so a missing
# results/ folder cannot throw away the whole run when the files are written.
os.makedirs(os.path.dirname(answers_save_path), exist_ok=True)

# Read with an explicit encoding, matching load_json(), instead of the
# platform default.
with open(ground_truth_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

answers = []
all_contexts = []
for d in tqdm.tqdm(data):
    response, contexts = generate_dragin_output(d['question'])
    answers.append({
        'question': d['question'],
        'answer': response
    })
    all_contexts.append({
        'question': d['question'],
        # Stored as its string form so that it is always JSON-serializable.
        'contexts': str(contexts)
    })

with open(answers_save_path, 'w', encoding='utf-8') as f:
    json.dump(answers, f)
with open(all_contexts_save_path, 'w', encoding='utf-8') as f:
    json.dump(all_contexts, f)
print("Done!")

In [None]:
# Score the saved answers against the ground truth: token-overlap metrics and
# cosine similarity first, then an LLM-judged semantic similarity.
json_file_path = answers_save_path
# Instantiate the client. Binding the bare `Groq` class here would make the
# `llm.chat.completions.create(...)` call below fail.
llm = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Load data
question_chunks_data = load_json(json_file_path)
ground_truth_data = load_json(ground_truth_path)

# Map questions to ground truth answers
ground_truth_map = {entry['question']: entry['answer'] for entry in ground_truth_data}

# Initialize metrics
total_f1, total_precision, total_recall, total_cos, count = 0, 0, 0, 0, 0

# Evaluate F1, precision, recall, cosine similarity
for question_entry in question_chunks_data:
    question = question_entry.get('question', "")
    ground_truth = ground_truth_map.get(question, "")
    chunk1_text = question_entry.get('answer', "")

    # Entries without a generated answer are left out of the averages.
    if chunk1_text:
        f1, precision, recall, cos = f1_score(chunk1_text, ground_truth)
        total_f1 += f1
        total_precision += precision
        total_recall += recall
        total_cos += cos
        count += 1

avg_f1 = total_f1 / count if count else 0
avg_precision = total_precision / count if count else 0
avg_recall = total_recall / count if count else 0
avg_cosine_sim = total_cos / count if count else 0

# Print results
print(f"Average F1 Score: {avg_f1:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average Cosine Similarity: {avg_cosine_sim:.4f}")

# Evaluate with LLM
LLM_Score = 0
# Number of replies that produced a usable score. The F1 loop's `count` is the
# wrong denominator here: it also covers entries skipped below and replies
# that could not be parsed as a number.
llm_count = 0
for question_entry in question_chunks_data:
    query = question_entry.get('question', "")
    reply = question_entry.get('answer', "")
    answer = ground_truth_map.get(query, "")

    if query and reply and answer:
        chat_completion = llm.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": f"""Evaluate the semantic similarity between the following two answers with respect to some question. Output only a single floating-point number between 0 and 1, where 0 indicates no similarity and 1 indicates identical meaning. Respond with only the number:
Question: {query}
Text A: {answer}
Text B: {reply}"""
                }
            ],
            # NOTE(review): confirm this model id is still served by Groq.
            model="llama-3.1-70b-versatile",
            stream=False,
        )

        try:
            score = float(chat_completion.choices[0].message.content.strip())
            LLM_Score += score
            llm_count += 1
        except ValueError:
            print("Invalid LLM response for query:", query)

# Calculate and print average LLM similarity score
average_llm_score = LLM_Score / llm_count if llm_count else 0
print(f"Average LLM Similarity Score: {average_llm_score:.4f}")