# Setup

In [None]:
import os
import sys

# Restrict the process to GPU index 3. This is set before the project import
# below, which presumably pulls in CUDA-aware libraries — TODO confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# Make the project sources importable whether the notebook is started from
# the repository root or from a sub-directory.
# NOTE(review): these entries are appended to the END of sys.path, so an
# installed package with the same name as a project module (e.g. `evaluate`)
# would win the import — verify the intended module is the one loaded.
sys.path.extend(["../src/", "src/"])

from utils import CloudPLM, CloudModelArguments, LocalPLM, LocalModelArguments

In [None]:
import evaluate as ev

In [None]:
import preprocess as pr

# Load DBpedia 14 Dataset

In [None]:
# Load a balanced 30-example sample of the DBpedia-14 test split, using the
# "content" column as text and the "label" column as the target.
test_data, label_names = pr.load_dataset(
    "fancyzhx/dbpedia_14",
    split="test",
    text_columns="content",
    label_columns="label",
    size=30,
    balanced=True,
)

In [None]:
import pandas as pd

# Count how often each final-message content occurs across the sample
# (presumably the label text, so this checks the class balance — TODO confirm).
pd.Series(
    [example["messages"][-1]["content"] for example in test_data]
).value_counts()

In [None]:
# Build the classification prompt for "article" inputs over the dataset's
# label names, then show it for inspection.
prompt = ev.create_prompt("article", label_names)
print(prompt)

# Load Qwen 2.5 7B Instruct

In [None]:
# Arguments for running Qwen 2.5 7B Instruct locally on GPU "3" with 4-bit
# quantization enabled (option names suggest bitsandbytes NF4 with nested
# quantization — confirm against LocalModelArguments).
args = LocalModelArguments(
    model_name_or_path="Qwen/Qwen2.5-7B-Instruct",
    cuda_devices="3",
    use_4bit_quantization=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype="float16",
    use_nested_quant=True,
    use_reentrant=True,
)

# NOTE: the Claude cells further down rebind both `args` and `model`; run only
# the section for the model you intend to evaluate.
model = LocalPLM(args)

# Load Claude 3 Haiku

In [None]:
# Arguments for the hosted Claude 3 Haiku model.
# The API key is read from the ANTHROPIC_API_KEY environment variable so the
# secret never has to be pasted into (and committed with) the notebook. The
# original placeholder remains the fallback when the variable is unset.
# NOTE: this rebinds `args`, replacing the local-model arguments created
# earlier in the notebook.
args = CloudModelArguments(
    cloud_model_name="claude-3-haiku-20240307",
    anthropic_api_key=os.environ.get("ANTHROPIC_API_KEY", "<YOUR_API_KEY>"),
)

In [None]:
# Build the cloud-backed model wrapper from the current `args`. This rebinds
# `model`, so the evaluation cells below use whichever model cell ran last.
model = CloudPLM(args)

In [None]:
# Smoke-test the model with a plain-string prompt.
response = model.generate("Hi there")

# Bare expression so the notebook displays the reply.
response

In [None]:
# Smoke-test the model with a chat-style message list: a system instruction
# followed by a user turn.
response = model.generate(
    [
        {"role": "system", "content": "Respond with only the word 'pizza'."},
        {"role": "user", "content": "Hello, how are you?"},
    ]
)

# Bare expression so the notebook displays the reply.
response

# Evaluate the chosen LLM on the dataset

In [None]:
from evaluate import EvaluationConfig

In [None]:
# Zero-shot evaluation settings: reuse the prompt built above, cap generation
# at 32 tokens, and set the output path for results.
config = EvaluationConfig(
    technique_name="zero_shot",
    max_tokens=32,
    prompt=prompt,
    out_path="../test/results",
)
print(config)

In [None]:
# Run the currently bound `model` over the sampled test set using the
# configuration above.
result = ev.evaluate(
    model, label_names=label_names, eval_dataset=test_data, eval_config=config
)

In [None]:
# Plot the confusion matrix for the evaluation result computed above.
result.plot_confusion_matrix()

In [None]:
# Persist the evaluation result; presumably written under the `out_path` set
# in the evaluation config — TODO confirm.
result.save()