In [1]:
from datasets import load_dataset
# load the QA pairs
semeval_dev_qa = load_dataset("cardiffnlp/databench", name="semeval", split="dev")

  from .autonotebook import tqdm as notebook_tqdm
Downloading data: 100%|██████████| 49/49 [00:00<00:00, 91.44files/s]
Downloading data: 100%|██████████| 16/16 [00:00<00:00, 165.23files/s]
Generating train split: 100%|██████████| 988/988 [00:00<00:00, 19268.20 examples/s]
Generating dev split: 100%|██████████| 320/320 [00:00<00:00, 21375.31 examples/s]


In [4]:
from llama_cpp import Llama

llm = Llama.from_pretrained(
	repo_id="TheBloke/stable-code-3b-GGUF",
	filename="stable-code-3b.Q2_K.gguf",
)


ModuleNotFoundError: No module named 'llama_cpp'

In [None]:
import pandas as pd
import subprocess
import shlex
import zipfile
import os

from openai import OpenAI
from databench_eval import Runner, Evaluator, utils


# this makes use of https://huggingface.co/TheBloke/stable-code-3b-GGUF
# and https://github.com/ggerganov/llama.cpp
def call_gguf_model(prompts):
    results = []
    for p in prompts:
        escaped = p.replace('"', '\\"')
        cmd = f'llama-cli -m ./models/stable-code-3b.Q4_K_M.gguf -p "{escaped}" -c 1024 -n 128'
        args = shlex.split(cmd)
        try:
            result = subprocess.run(args, capture_output=True, text=True, check=True)
            results.append(result.stdout)
        except Exception as e:
            results.append(f"__CODE_GEN_ERROR__: {e.stderr}")

    return results


def chat_completions(prompts, model="gpt-4o-mini", max_tokens=150):
  """
  This function takes a list of prompts and returns a list of corresponding generations using OpenAI's Chat Completions API.

  Args:
      prompts (list): A list of user prompts for the chat conversation (strings).
      api_key (str, optional): Your OpenAI API key. Defaults to "YOUR_API_KEY".
      model (str, optional): The OpenAI model to use. Defaults to "gpt-4o-mini".
      max_tokens (int, optional): The maximum number of tokens for the generated response. Defaults to 150.

  Returns:
      list: A list of generated responses (strings), corresponding to the input prompts.

  Raises:
      ValueError: If any prompt exceeds the maximum character limit.
  """

  # Error handling for prompt length
  max_char_limit = max_tokens * 4  # Assuming 4 characters per token (average)
  for prompt in prompts:
    if len(prompt) > max_char_limit:
      raise ValueError(f"Prompt '{prompt}' exceeds the maximum character limit of {max_char_limit}")

  api_key = os.getenv("OPENAI_API_KEY", api_key)
  client = OpenAI(api_key)
  generations = []

  for prompt in prompts:
    response = client.chat.completions.create(
      model=model,
      messages=[{"role": "user", "content": prompt}],
      max_tokens=max_tokens
    )
    generations.append(response.choices[0].message.content.strip())  # Remove leading/trailing whitespace

  return generations

# # Example usage
# prompts = ["What is the capital of France?", "Write a haiku about nature."
# generations = chat_completions(prompts, api_key)

# print(generations)


def example_generator(row: dict) -> str:
    """IMPORTANT:
    **Only the question and dataset keys will be available during the actual competition**.
    You can, however, try to predict the answer type or columns used
    with another modeling task if that helps, then use them here.
    """
    dataset = row["dataset"]
    question = row["question"]
    df = utils.load_table(dataset)
    return f"""
# TODO: complete the following function in one line. It should give the answer to: How many rows are there in this dataframe? 
def example(df: pd.DataFrame) -> int:
    df.columns=["A"]
    return df.shape[0]

# TODO: complete the following function in one line. It should give the answer to: {question}
def answer(df: pd.DataFrame) -> {row["type"]}:
    df.columns = {list(df.columns)}
    return"""


def example_postprocess(response: str, dataset: str, loader):
    try:
        df = loader(dataset)
        lead = """
def answer(df):
    return """
        exec(
            "global ans\n"
            + lead
            + response.split("return")[2]
            .split("\n")[0]
            .strip()
            .replace("[end of text]", "")
            + f"\nans = answer(df)"
        )
        # no true result is > 1 line atm, needs 1 line for txt format
        return ans.split("\n")[0] if "\n" in str(ans) else ans
    except Exception as e:
        return f"__CODE_ERROR__: {e}"


qa = utils.load_qa(name="semeval", split="dev")
evaluator = Evaluator(qa=qa)

runner = Runner(
    # model_call=call_gguf_model,
    model_call=chat_completions,
    prompt_generator=example_generator,
    postprocess=lambda response, dataset: example_postprocess(
        response, dataset, utils.load_table
    ),
    qa=qa,
    batch_size=10,
)

runner_lite = Runner(
    # model_call=call_gguf_model,
    model_call=chat_completions,
    prompt_generator=example_generator,
    postprocess=lambda response, dataset: example_postprocess(
        response, dataset, utils.load_sample
    ),
    qa=qa,
    batch_size=10,
)

responses = runner.run(save="predictions.txt")
responses_lite = runner_lite.run(save="predictions_lite.txt")
print(f"DataBench accuracy is {evaluator.eval(responses)}")  # ~0.15
print(
    f"DataBench_lite accuracy is {evaluator.eval(responses_lite, lite=True)}"
)  # ~0.07


with zipfile.ZipFile("Archive.zip", "w") as zipf:
    zipf.write("predictions.txt")
    zipf.write("predictions_lite.txt")

print("Created Archive.zip containing predictions.txt and predictions_lite.txt")


  0%|          | 0/32 [00:03<?, ?it/s]


AttributeError: 'FileNotFoundError' object has no attribute 'stderr'