In [None]:
from openai import AsyncOpenAI
import weave
from configparser import ConfigParser
import json
from pathlib import Path
import random

In [None]:
# ConfigParser.read() silently skips files it cannot open, which would only
# surface below as a confusing NoOptionError. Opening the file explicitly and
# using read_file() fails right away with FileNotFoundError if config.cfg is
# missing from the working directory.
config_parser = ConfigParser()
with open("config.cfg") as config_file:
    config_parser.read_file(config_file)

# API key used by the Llama model wrapper; kept out of the notebook source.
LLAMA_KEY = config_parser.get("DEFAULT", "LLAMA_KEY")

In [None]:
# Load the raw records of the CE1 dataset from disk.
with Path("data/centerEmbed/ce1.json").open(encoding="UTF-8") as source:
    objects = json.load(source)

# Re-key every raw record into the row layout used downstream and attach a
# running integer id (its position in the file).
examples = [
    {
        "id": position,
        "context": record["Context"],
        "question": record["Q"],
        "target": record["A"],
        "level": record["level"],
    }
    for position, record in enumerate(objects)
]


In [None]:
class Llama(weave.Model):
    """Weave model that sends a templated prompt to a chat-completions endpoint."""

    # Identifier passed as ``model`` to the chat-completions call.
    model_name: str
    # Template rendered with ``context`` and ``question`` through ``str.format``.
    prompt_template: str

    @property
    def api_key(self):
        """Return the API key held in the module-level ``LLAMA_KEY``."""
        return LLAMA_KEY

    @property
    def api(self):
        """Return an ``AsyncOpenAI`` client bound to the Llama API base URL.

        A new client object is constructed on every access.
        """
        client = AsyncOpenAI(
            base_url="https://api.llama-api.com",
            api_key=self.api_key,
        )
        return client

    def format(self, context: str, question: str, params: dict, **kwargs) -> dict:
        """Build the keyword arguments for a chat-completions request.

        Args:
            context: Text substituted for ``{context}`` in the prompt template.
            question: Text substituted for ``{question}`` in the prompt template.
            params: Extra request options merged after ``messages``.
            **kwargs: Further request options; these override ``params`` on
                duplicate keys.

        Returns:
            A dict with a single-user-message ``messages`` list plus the merged
            options.
        """
        user_message = {
            "role": "user",
            "content": self.prompt_template.format(context=context, question=question),
        }
        payload = {"messages": [user_message]}
        # Merge order matters: params first, then kwargs, so kwargs win.
        payload.update(params)
        payload.update(kwargs)
        return payload

    @weave.op()
    async def predict(
        self,
        context: str,
        question: str,
        params: dict = {},
        **kwargs
    ):
        """Query the model for one example and return the reply text.

        Args:
            context: Context sentence of the example.
            question: Question asked about the context.
            params: Extra options forwarded to the chat-completions call.
            **kwargs: Passed to ``weave.attributes`` for the duration of the
                call (presumably recorded as trace attributes; confirm against
                the weave docs). They are not sent to the API.

        Returns:
            The ``content`` of the first choice's message.

        Raises:
            ValueError: If the API call returns ``None``.
        """
        with weave.attributes(kwargs):
            request = self.format(context, question, params)
            response = await self.api.chat.completions.create(
                model=self.model_name,
                **request
            )
            if response is None:
                raise ValueError("No response from model")

            return response.choices[0].message.content

In [None]:
# Prompt sent as the single user message. It states the required answer shape
# ("The N V the N"), shows two worked samples, and ends with the actual example.
# The {context} and {question} placeholders are filled in with str.format, so
# any literal braces added to this text would have to be doubled.
PROMPT_TEMPLATE = """You will be given an example consisting of a context and a question to answer. The answer should always be of this form "The N V the N", where N stands for a single word that is a noun, and V stands for a single word that is a verb. 
Here are two samples:

        "Context": "The student the man noticed seemed happy",
        "Question": "Who saw who?",
        "Answer": "The man saw the student.",


        "Context": "The teacher the student saw hit is dead",
        "Question": "Who saw who?",
        "Answer": "The student saw the teacher.",

Context: {context}
Question: {question}

Now answer the question:
"""

In [None]:
# Instantiate the model under evaluation: the two declared fields first
# (model identifier and prompt template), then the descriptive metadata.
model = Llama(
    model_name="llama-7b-chat",
    prompt_template=PROMPT_TEMPLATE,
    name="llama-7b-chat",
    description="Weave model for Llama",
)

In [None]:
import weave
import editdistance

from utils.loggers import logger


def postprocess(string):
    """Normalise a raw model reply for comparison with the target.

    The reply is stripped of surrounding whitespace and lower-cased; a leading
    ``answer:`` label, if present, is removed along with the whitespace that
    follows it.

    Args:
        string: Raw text returned by the model.

    Returns:
        The cleaned, lower-cased reply.
    """
    cleaned = string.strip().lower()
    prefix = "answer:"
    # str.strip("answer:") would treat the argument as a *set of characters*
    # and eat any of a/n/s/w/e/r/: from both ends (e.g. "...nurse" -> "...nu"),
    # so the label is removed as an explicit prefix instead.
    if cleaned.startswith(prefix):
        cleaned = cleaned[len(prefix):]
    return cleaned.strip()

@weave.op()
def evaluator(target: str, model_output: str) -> dict:
    """Score a model reply against the expected answer.

    Both strings are compared case-insensitively and without surrounding
    whitespace. The reply counts as correct when it matches the target exactly
    or when the edit distance between the two is less than 2.

    Args:
        target: Expected answer for the example.
        model_output: Raw reply produced by the model.

    Returns:
        A dict with ``correct`` (bool), ``edit_distance`` (int) and
        ``exact_match`` (bool).
    """
    model_output = postprocess(model_output)
    # The reply is lower-cased by postprocess(); normalise the target the same
    # way, otherwise a capitalised target can never match exactly and its first
    # letter alone uses up the single edit the fuzzy threshold allows.
    normalized_target = target.strip().lower()
    edit_distance = editdistance.eval(normalized_target, model_output)

    exact_match = normalized_target == model_output
    fuzzy_match = edit_distance < 2
    correct = exact_match or fuzzy_match

    # The log keeps showing the target as given, next to the processed reply.
    logger.info(f"{target=} | {model_output=} | {correct=}")
    return {
        'correct': correct,
        'edit_distance': edit_distance,
        'exact_match': exact_match,
    }

In [None]:
################
# SAMPLE EXAMPLES
################

sample: list = random.sample(examples, 10)

################
# RUN EVALUATION
################

weave.init(f"llama-7b-chat-CE1-n{len(sample)}-test3")



evaluation = weave.Evaluation(
    dataset=sample,
    scorers=[evaluator],
    trials=1,

)

output = await evaluation.evaluate(model)