# Answer Generation

In [None]:
import weave
from tqdm import tqdm
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import FactualCorrectness, SemanticSimilarity, StringPresence
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas.llms import LangchainLLMWrapper 
from ragas.embeddings import LangchainEmbeddingsWrapper
import os
import json

# Initialize Weave under the name 'benchmark'
weave.init('benchmark')

# Build the ragas-wrapped judge LLM and embedding model from the OpenAI key
chat_model = ChatOpenAI(model="gpt-4o-mini", api_key=os.environ['OPENAI_API_KEY'])
evaluator_llm = LangchainLLMWrapper(chat_model)
embedding_model = OpenAIEmbeddings(api_key=os.environ['OPENAI_API_KEY'])
evaluator_embeddings = LangchainEmbeddingsWrapper(embedding_model)

def load_test_data(benchmark_name):
    """Fetch the object stored in Weave under the reference `benchmark_name`."""
    reference = weave.ref(benchmark_name)
    return reference.get()

async def evaluate_sample(chat_response, expected, response):
    """Score one model answer against the benchmark reference.

    Args:
        chat_response: The model's answer text. A falsy value (empty string,
            ``None``) short-circuits to all-zero scores.
        expected: Reference text used for the factual-correctness and
            semantic-similarity metrics.
        response: Reference string whose presence in ``chat_response`` is
            checked by the string-presence metric.

    Returns:
        A tuple ``(general_score, factual, similar, present)`` where
        ``general_score = 0.5 * factual + 0.2 * similar + 0.3 * present``.
        All four values are ``0`` when the answer is empty or when any metric
        raises (the error is printed, not propagated).
    """
    # Guard first so an empty or missing answer never reaches ragas at all.
    if not chat_response:
        return 0, 0, 0, 0

    answer_sample = SingleTurnSample(response=chat_response, reference=expected)
    exact_sample = SingleTurnSample(response=chat_response, reference=response)

    try:
        # The LLM- and embedding-based metrics need their model attached
        # explicitly; without it they fail and every sample would score zero.
        factual_metric = FactualCorrectness(llm=evaluator_llm)
        similarity_metric = SemanticSimilarity(embeddings=evaluator_embeddings)
        presence_metric = StringPresence()

        factual = float(await factual_metric.single_turn_ascore(answer_sample))
        similar = float(await similarity_metric.single_turn_ascore(answer_sample))
        present = float(await presence_metric.single_turn_ascore(exact_sample))

        general_score = (0.5 * factual + 0.2 * similar + 0.3 * present)
    except Exception as e:
        # Best-effort scoring: one failing sample must not abort the whole run.
        print(f"Error evaluating sample: {e}")
        return 0, 0, 0, 0

    return general_score, factual, similar, present

async def evaluate():
    """Score stored model answers for the benchmark and write JSON reports.

    For each selected batch, loads the benchmark rows and the matching batch
    of model answers via ``load_test_data``, scores every answer with
    ``evaluate_sample``, and writes the per-sample scores plus their averages
    to ``results_{j}.json``. All per-batch reports are also written together
    to ``results.json``.

    A batch whose row range lies outside the benchmark yields no samples; it
    is skipped with a message instead of failing with ``ZeroDivisionError``.
    """
    total_results = []
    benchmark_name = 'gsm8k'
    df = load_test_data(benchmark_name)
    dataset_name = f'{benchmark_name}-llama_cpp-Llama_3-custom-raspberry_5'

    # Maps each key of the "average" section to the per-sample score it averages.
    average_sources = {
        "general_results": "general",
        "factual_correctness": "factual_correctness",
        "semantic_similarity": "semantic_similarity",
        "string_presence": "string_presence",
    }

    # NOTE(review): with a step of 1000 this range yields only j == 3, so a
    # single batch is evaluated - confirm whether all batches were intended.
    for j in range(3, 123, 1000):
        bot_answer = load_test_data(f'{dataset_name}-batch{j-1}-10')
        start_index = j * 10 - 10
        end_index = min(start_index + 10, len(df.rows))
        results = []

        for i in tqdm(range(start_index, end_index)):
            # 'expecting' holds the reference text and the final answer,
            # separated by '####'.
            expected, response = df.rows[i]['expecting'].split('####')

            # Each answer batch holds 10 rows, hence the index modulo 10.
            chat_response = bot_answer.rows[i % 10]['response'][0]
            general_score, factual, similar, present = await evaluate_sample(chat_response, expected, response)

            results.append({
                "prompt": {
                    "general": general_score,
                    "factual_correctness": factual,
                    "semantic_similarity": similar,
                    "string_presence": present
                }
            })

        if not results:
            # Nothing to average; avoid dividing by zero below.
            print(f"No samples evaluated for batch {j}; skipping.")
            continue

        result = {
            "all_results": results,
            "average": {
                name: sum(x["prompt"][source] for x in results) / len(results)
                for name, source in average_sources.items()
            }
        }

        total_results.append(result)
        with open(f'results_{j}.json', 'w') as f:
            json.dump(result, f, indent=4)

    with open('results.json', 'w') as f:
        json.dump(total_results, f, indent=4)

# Entry point for this cell: run the async evaluation to completion.
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already
# running (as in Jupyter/IPython kernels); there, use `await evaluate()`
# instead - confirm how this cell is executed.
import asyncio
asyncio.run(evaluate())


# Retrieval Phase

In [None]:
import weave
from tqdm import tqdm
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import LLMContextPrecisionWithoutReference, LLMContextRecall
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper 
from langchain_openai import OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
import os
import json

# Initialize Weave
weave.init('benchmark')

# Initialize LLM and Embeddings
# Prefer OPENAI_API_KEY (the variable OpenAI tooling conventionally reads);
# fall back to the OPEN_AI_API_KEY spelling previously required here so
# existing environments keep working.
openai_api_key = os.environ.get('OPENAI_API_KEY') or os.environ['OPEN_AI_API_KEY']
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini", api_key=openai_api_key))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(api_key=openai_api_key))

def load_test_data(benchmark_name):
    """Resolve `benchmark_name` as a Weave reference and return the stored object."""
    weave_ref = weave.ref(benchmark_name)
    stored_object = weave_ref.get()
    return stored_object

async def evaluate_sample(chat_response, prompt, expected, context):
    """Score the retrieved context of one sample.

    Runs ``LLMContextPrecisionWithoutReference`` and ``LLMContextRecall`` with
    ``evaluator_llm`` as judge and returns ``(general_score, precision,
    recall)``, where ``general_score`` is the equally weighted sum of the two.
    """
    # Fields common to both samples; only the recall sample carries a reference.
    shared_fields = {
        "response": chat_response,
        "retrieved_contexts": context,
        "user_input": prompt,
    }
    precision_input = SingleTurnSample(**shared_fields)
    recall_input = SingleTurnSample(reference=expected, **shared_fields)

    precision_metric = LLMContextPrecisionWithoutReference()
    precision_metric.llm = evaluator_llm
    precision_raw = await precision_metric.single_turn_ascore(precision_input)
    precision = float(precision_raw)

    recall_metric = LLMContextRecall()
    recall_metric.llm = evaluator_llm
    recall_raw = await recall_metric.single_turn_ascore(recall_input)
    recall = float(recall_raw)

    return (0.5 * precision + 0.5 * recall), precision, recall

async def evaluate():
    """Score retrieval quality for batches 1-3 and write JSON reports.

    For each batch, loads the stored model output via ``load_test_data``,
    scores every row with ``evaluate_sample`` (context precision and recall),
    and writes the per-sample scores plus their averages to
    ``retrieval_{j}.json``.

    A batch that yields no samples is skipped with a message instead of
    failing with ``ZeroDivisionError``.
    """
    benchmark_name = 'gsm8k'
    df = load_test_data(benchmark_name)
    dataset_name = "gsm8k-llama_cpp-llama_3_8b_Q4_K_M-rag_2_sierra-raspberry_5-batch"

    # Maps each key of the "average" section to the per-sample score it averages.
    average_sources = {
        "general_retrieval": "general",
        "context_precision": "context_precision",
        "context_recall": "context_recall",
    }

    for j in range(1, 4):
        bot_answer = load_test_data(f'{dataset_name}{j}-10')
        # NOTE(review): every batch is compared against benchmark rows 0-9;
        # confirm that each batch really holds answers to those same rows.
        start_index = 0
        end_index = min(10, len(df.rows))
        results = []

        for i in tqdm(range(start_index, end_index)):
            prompt = df.rows[i]['prompt']
            context = list(bot_answer.rows[i]['context_q'])
            # Only the text before '####' is used as the reference here; the
            # part after it is not needed. partition() also tolerates rows
            # that lack the separator.
            expected, _, _ = df.rows[i]['expecting'].partition('####')
            chat_response = bot_answer.rows[i]['response']

            general_score, precision, recall = await evaluate_sample(chat_response, prompt, expected, context)

            results.append({
                "prompt": {
                    "general": general_score,
                    "context_precision": precision,
                    "context_recall": recall,
                }
            })

        if not results:
            # Nothing to average; avoid dividing by zero below.
            print(f"No samples evaluated for batch {j}; skipping.")
            continue

        result = {
            "all_results": results,
            "average": {
                name: sum(x["prompt"][source] for x in results) / len(results)
                for name, source in average_sources.items()
            }
        }

        with open(f'retrieval_{j}.json', 'w') as f:
            json.dump(result, f, indent=4)

# Entry point for this cell: run the async retrieval evaluation to completion.
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already
# running (as in Jupyter/IPython kernels); there, use `await evaluate()`
# instead - confirm how this cell is executed.
import asyncio
asyncio.run(evaluate())

# Context Usage

In [None]:
import weave
from tqdm import tqdm
from ragas.dataset_schema import SingleTurnSample
from ragas.metrics import Faithfulness
from langchain_openai import ChatOpenAI
from ragas.llms import LangchainLLMWrapper 
from langchain_openai import OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
import os
import json

# Initialize Weave
weave.init('benchmark')

# Initialize LLM and Embeddings
# Prefer OPENAI_API_KEY (the variable OpenAI tooling conventionally reads);
# fall back to the OPEN_AI_API_KEY spelling previously required here so
# existing environments keep working.
openai_api_key = os.environ.get('OPENAI_API_KEY') or os.environ['OPEN_AI_API_KEY']
evaluator_llm = LangchainLLMWrapper(ChatOpenAI(model="gpt-4o-mini", api_key=openai_api_key))
evaluator_embeddings = LangchainEmbeddingsWrapper(OpenAIEmbeddings(api_key=openai_api_key))

def load_test_data(benchmark_name):
    """Return the object that the Weave reference `benchmark_name` points to."""
    ref = weave.ref(benchmark_name)
    return ref.get()

async def evaluate_sample(chat_response, prompt, context):
    """Return the faithfulness score of one answer given its retrieved context.

    A falsy ``chat_response`` scores ``0.0`` without calling the metric;
    otherwise ``Faithfulness`` is run with ``evaluator_llm`` as judge.
    """
    sample = SingleTurnSample(
        response=chat_response,
        user_input=prompt,
        retrieved_contexts=context
    )

    # Nothing to judge when the model produced no answer.
    if not chat_response:
        return 0.0

    metric = Faithfulness()
    metric.llm = evaluator_llm
    raw_score = await metric.single_turn_ascore(sample)
    return float(raw_score)

async def evaluate():
    """Score answer faithfulness for batches 1-3 and write JSON reports.

    For each batch, loads the stored model output via ``load_test_data``,
    scores every row with ``evaluate_sample``, and writes the per-sample
    scores plus their averages to ``usage_{j}.json``. The faithfulness score
    is reported under both the "general" and "faithfulness" keys.

    A batch that yields no samples is skipped with a message instead of
    failing with ``ZeroDivisionError``.
    """
    benchmark_name = 'gsm8k'
    df = load_test_data(benchmark_name)
    dataset_name = "gsm8k-llama_cpp-llama_3_8b_Q4_K_M-rag_2_sierra-raspberry_5-batch"

    for j in range(1, 4):
        bot_answer = load_test_data(f'{dataset_name}{j}-10')
        # NOTE(review): every batch is compared against benchmark rows 0-9;
        # confirm that each batch really holds answers to those same rows.
        start_index = 0
        end_index = min(10, len(df.rows))
        results = []

        for i in tqdm(range(start_index, end_index)):
            # Faithfulness needs no reference answer, so the benchmark's
            # 'expecting' field is not read here.
            prompt = df.rows[i]['prompt']
            context = list(bot_answer.rows[i]['context_q'])
            chat_response = bot_answer.rows[i]['response']

            score = await evaluate_sample(chat_response, prompt, context)

            results.append({
                "prompt": {
                    "general": score,
                    "faithfulness": score
                }
            })

        if not results:
            # Nothing to average; avoid dividing by zero below.
            print(f"No samples evaluated for batch {j}; skipping.")
            continue

        sample_count = len(results)
        result = {
            "all_results": results,
            "average": {
                "general_usage": sum(x["prompt"]["general"] for x in results) / sample_count,
                "faithfulness": sum(x["prompt"]["faithfulness"] for x in results) / sample_count,
            }
        }

        with open(f'usage_{j}.json', 'w') as f:
            json.dump(result, f, indent=4)

# Entry point for this cell: run the async faithfulness evaluation to completion.
# NOTE(review): asyncio.run() raises RuntimeError when an event loop is already
# running (as in Jupyter/IPython kernels); there, use `await evaluate()`
# instead - confirm how this cell is executed.
import asyncio
asyncio.run(evaluate())