# TACO Experiment: Cherry-picked context to one specific task

The goal here is to evaluate how the model behaves when it is given solutions from analogous problems that were manually analyzed and selected.  
In this specific scenario we test it on a very specific group of tasks from the TACO benchmark.

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import polars as pl
import torch
import numpy as np
import json
import re
## LLM
from dmcr.models import Llama3_1_Instruct
from dmcr.taco_evaluator import compute, compute_1_pass_by_test
from datasets import load_from_disk
import datetime

# One seed shared by every random number generator used in this notebook.
seed = 42

# CPU-side generators: NumPy's global RNG, then PyTorch's default generator.
np.random.seed(seed)
torch.manual_seed(seed)

if torch.cuda.is_available():
    # Seed the current CUDA device and then every visible one.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Request deterministic cuDNN kernels and disable its benchmark mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Report the GPUs visible to this process.
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

Number of GPUs available: 1
GPU 0: NVIDIA L40S


## Load Datasets

In [15]:
# Directory holding the processed TACO tables, relative to this notebook.
PATH = "../../../data/TACO/processed"

# Problem table and solution table, both stored as Feather/IPC files.
train, train_solutions = (
    pl.read_ipc(f"{PATH}/{table_name}.feather")
    for table_name in ("train", "train_solutions")
)

# Hugging Face dataset saved on disk; its rows are handed to the evaluator later on.
train_dict = load_from_disk("../../../data/TACO/train.hf")

In [16]:
def run_inference(prompt: str, path: str, num_returns = 10, max_length=2048):
    """Sample completions for ``prompt`` and dump them to ``path`` as JSON.

    A fresh ``Llama3_1_Instruct`` is instantiated on every call and queried in
    batches of at most 10 sequences. Everything returned by ``llm.run`` is
    collected into one list and written to ``path`` in a single ``json.dump``.

    Args:
        prompt: Full prompt text forwarded to ``llm.run``.
        path: Destination JSON file; overwritten if it already exists.
        num_returns: Total number of sequences to sample. Values that are not
            a multiple of 10 get one final, smaller batch (previously the
            remainder was silently dropped, so e.g. ``num_returns=5`` produced
            nothing). Values <= 0 produce an empty list.
        max_length: Forwarded as ``max_length`` in the generation config.

    Returns:
        None. The result is only written to ``path``.
    """
    batch_size = 10
    full_batches, remainder = divmod(max(num_returns, 0), batch_size)
    # One entry per llm.run call: the number of sequences requested from it.
    batch_sizes = [batch_size] * full_batches
    if remainder:
        batch_sizes.append(remainder)

    outputs = []
    llm = Llama3_1_Instruct()
    for i, sequences_in_batch in enumerate(batch_sizes):
        print(f"Lopp {i}, {datetime.datetime.now()}")
        config = {
            "temperature": 0.7,
            "max_length": max_length,
            "top_p": 0.95,
            "num_return_sequences": sequences_in_batch,
        }

        output = llm.run(prompt=prompt, input="", config_params=config)
        outputs.extend(output)

    # Context manager guarantees the file is flushed and closed.
    with open(path, "w") as out_file:
        json.dump(outputs, out_file)

In [17]:
def parse_generation(generations: list, id: int, path: str):
    """Extract fenced Python code from each generation and write it to ``path``.

    For every item in ``generations`` the ``"generated_text"`` field is scanned
    for fenced blocks opened with ```` ```python ````; each block is stripped
    and the blocks of one generation are joined with a newline. A generation
    with no such block contributes an empty string.

    Args:
        generations: Sequence of mappings that each have a ``"generated_text"`` key.
        id: Task identifier; stored as ``int(id)`` under ``"task_id"``.
        path: Destination JSON file; overwritten if it already exists.

    Returns:
        None. Writes ``[{"task_id": ..., "output": [...]}]`` to ``path``.
    """
    fenced_python = re.compile(r'```python(.*?)```', re.DOTALL)

    gens = [
        "\n".join(block.strip() for block in fenced_python.findall(generation["generated_text"]))
        for generation in generations
    ]

    results = [{
        "task_id": int(id),
        "output": gens
    }]

    # Context manager guarantees the file is flushed and closed.
    with open(path, "w") as out_file:
        json.dump(results, out_file)
        

## Problem Selection

The category chosen was "Geometry" at EASY difficulty.  
This category was chosen because it has relatively few examples, which makes it easier to find samples specific to that scope.  
The EASY difficulty is used for validation purposes.

In [18]:
# train.filter(pl.col("tags") == "Geometry").filter(pl.col("difficulty") == "EASY").count()

In [19]:
# The task was originally drawn at random from the EASY Geometry pool with:
#   selected_problem = train.filter(pl.col("tags") == "Geometry").filter(pl.col("difficulty") == "EASY").sample(1)
#   print(selected_problem)
# NOTE(review): an earlier comment in this cell recorded "ID = 14186", but the
# filter below pins id 10237 (the id used by every evaluation cell) - confirm
# which one is intended.

selected_problem = train.filter(
    (pl.col("tags") == "Geometry")
    & (pl.col("difficulty") == "EASY")
    & (pl.col("id") == 10237)
)
# print(selected_problem)

In [20]:
# train_solutions.filter(pl.col("id") == 10237).to_struct()

In [21]:
# print(selected_problem.select("input").to_dict()["input"][0])

## Run Baseline - No Context

In [22]:
# NOTE(review): the call below passes `prompt_input` (the bare problem statement)
# instead of the formatted `prompt` built on the line before it, so the baseline
# would be sampled without the "Please write a Python program ... QUESTION / ANSWER"
# wrapper that the context runs use. Confirm which text produced no_context.json
# before comparing the runs.
# prompt_input = selected_problem.select("input").to_struct().to_pandas().iloc[0]["input"]
# prompt = f"Please write a Python program \nQUESTION: \n{prompt_input} \n ANSWER: \n."
# run_inference(prompt_input, "no_context.json", num_returns=200)

In [23]:
# Parse the cached baseline generations; the context manager closes the input
# file, which the bare open() call leaked.
with open("no_context.json") as generations_file:
    parse_generation(json.load(generations_file), 10237 , "no_context_parsed.json")

In [24]:
# Evaluate the parsed baseline generations with k = 1, 10 and 100.
# NOTE(review): train_dict[10237] indexes the dataset by position; this assumes
# the row position equals the `id` column used in the polars filters - confirm.
compute(
    "no_context_parsed.json",
    [train_dict[10237]],
    [1, 10, 100],
)

Traceback (most recent call last):
  File "/tmp/tmpmkya6ewr", line 19, in <module>
    (x1 + x2) / 2, (y1 + y2) / 2
     ^^
NameError: name 'x1' is not defined
Traceback (most recent call last):
  File "/tmp/tmpmkya6ewr", line 19, in <module>
    (x1 + x2) / 2, (y1 + y2) / 2
     ^^
NameError: name 'x1' is not defined
Traceback (most recent call last):
  File "/tmp/tmp7mhpzgoc", line 41, in <module>
    code()
  File "/tmp/tmp7mhpzgoc", line 20, in code
    (x1 + x2) / 2, (y1 + y2) / 2
     ^^
NameError: name 'x1' is not defined
Traceback (most recent call last):
  File "/tmp/tmp7mhpzgoc", line 41, in <module>
    code()
  File "/tmp/tmp7mhpzgoc", line 20, in code
    (x1 + x2) / 2, (y1 + y2) / 2
     ^^
NameError: name 'x1' is not defined
Traceback (most recent call last):
  File "/tmp/tmp9eyc4hft", line 33, in <module>
    print(solution.findCornerPoints(2, [[1,0],[1,2]]))  # Output: [(0, 0), (0, 2), (2, 0), (2, 2)]
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/t

In [25]:
# Show the baseline metrics; the context manager closes the file, which the
# bare open() call leaked.
# NOTE(review): presumably "taco_metrics.json" is what compute() wrote in the
# previous cell - confirm, since the output path is not visible in this notebook.
with open("taco_metrics.json") as metrics_file:
    baseline_metrics = json.load(metrics_file)
baseline_metrics

{'pass@1': 0.0,
 'pass@10': 0.0,
 'pass@100': 0.0,
 'detail': {'pass@1': {'10237': 0.0},
  'pass@10': {'10237': 0.0},
  'pass@100': {'10237': 0.0}}}

In [26]:
# Per-test results for the parsed baseline generations.
# NOTE(review): train_dict[10237] is positional indexing; assumes the row
# position equals the `id` column - confirm.
compute_1_pass_by_test(
    "no_context_parsed.json",
    [train_dict[10237]],
)

Traceback (most recent call last):
  File "/tmp/tmpnah91e21", line 19, in <module>
    (x1 + x2) / 2, (y1 + y2) / 2
     ^^
NameError: name 'x1' is not defined
Traceback (most recent call last):
  File "/tmp/tmpnah91e21", line 19, in <module>
    (x1 + x2) / 2, (y1 + y2) / 2
     ^^
NameError: name 'x1' is not defined
Traceback (most recent call last):
  File "/tmp/tmps9tq4tol", line 41, in <module>
    code()
  File "/tmp/tmps9tq4tol", line 20, in code
    (x1 + x2) / 2, (y1 + y2) / 2
     ^^
NameError: name 'x1' is not defined
Traceback (most recent call last):
  File "/tmp/tmps9tq4tol", line 41, in <module>
    code()
  File "/tmp/tmps9tq4tol", line 20, in code
    (x1 + x2) / 2, (y1 + y2) / 2
     ^^
NameError: name 'x1' is not defined
Traceback (most recent call last):
  File "/tmp/tmptv9m6xo5", line 33, in <module>
    print(solution.findCornerPoints(2, [[1,0],[1,2]]))  # Output: [(0, 0), (0, 2), (2, 0), (2, 2)]
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/t

tensor([[False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [ True, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [ True, False],
        [False, False],
        [False, False],
        [False, False],
        [False, 

{10237: [10, 3]}

## Select Context

Here we will try to get 4 solutions that are related to the problem above


In [27]:
# df = train.filter(pl.col("tags") == "Geometry").filter(pl.col("difficulty") == "EASY").filter(pl.col("input").str.contains("class Solution")).select(["id", "input"])
# print(df.count())
# df.write_csv("pool.csv")

In [28]:
# Ids of the problems whose statements and solutions are used as in-context examples.
selected_ids = [
    21825,
    10745,
    1643,
    4661,
]
# train.filter(pl.col("id") == 1643)

In [29]:
# Build the context one problem at a time, in the order of `selected_ids`, so
# all_inputs[i] and all_solutions[i] always belong to the same problem.
# The previous is_in() + unique() / group_by().head(1) pipelines give no row
# ordering guarantee in polars, so the two columns could come back in different
# orders and pair a question with another problem's solution. unique() could
# also collapse identical values and yield fewer than len(selected_ids) entries.
# NOTE(review): cached generations may have been produced with the old,
# possibly misaligned pairing - regenerate before drawing conclusions.
all_inputs = [
    train.filter(pl.col("id") == problem_id)["input"][0]
    for problem_id in selected_ids
]
# First stored solution of each selected problem.
all_solutions = [
    train_solutions.filter(pl.col("id") == problem_id)["solution"][0]
    for problem_id in selected_ids
]
question_input = selected_problem.select("input").to_dict()["input"][0]

# all_inputs

## Full Prompt Run

In [30]:
# Few-shot context: an instruction line followed by four question/solution pairs.
# The prompt text (including its spelling) is kept exactly as-is so it stays
# identical to the prompt this notebook has always built.
context_prompt = (
    "You will have to answer a programming quesiton in geometry, we will pass before some examples of questions and solutions\n"
    + "".join(
        f"EXAMPLE QUESTION {i}:\n {all_inputs[i]}\n EXAMPLE SOLUTION {i}:\n {all_solutions[i]}\n"
        for i in range(4)
    )
)

# Final prompt: the context block followed by the target question.
full_prompt = f"Please write a Python program {context_prompt} \nQUESTION: \n{question_input} \n ANSWER: \n."

In [31]:
# run_inference(
#     prompt=full_prompt,
#     path = "full_prompt.json",
#     num_returns=200,
#     max_length=4096
# )

In [32]:
# Parse the cached full-prompt generations; the context manager closes the
# input file, which the bare open() call leaked.
with open("full_prompt.json") as generations_file:
    parse_generation(json.load(generations_file), 10237 , "full_prompt_parsed.json")

In [33]:
# Evaluate the parsed full-prompt generations with k = 1, 10 and 100.
# NOTE(review): train_dict[10237] is positional indexing; assumes the row
# position equals the `id` column - confirm.
compute(
    "full_prompt_parsed.json",
    [train_dict[10237]],
    [1, 10, 100],
)

Traceback (most recent call last):
  File "/tmp/tmpcks9dut3", line 32, in <module>
    print(solution.findCornerPoints(2, [[1,0],[1,2]]))  # Output: [[0, 0], [0, 2], [2, 0], [2, 2]]
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/tmpcks9dut3", line 30, in findCornerPoints
    return [int(i) for i in [A, B, C, D]]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/tmpcks9dut3", line 30, in <listcomp>
    return [int(i) for i in [A, B, C, D]]
            ^^^^^^
TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'
Traceback (most recent call last):
  File "/tmp/tmpcks9dut3", line 32, in <module>
    print(solution.findCornerPoints(2, [[1,0],[1,2]]))  # Output: [[0, 0], [0, 2], [2, 0], [2, 2]]
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/tmpcks9dut3", line 30, in findCornerPoints
    return [int(i) for i in [A, B, C, D]]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/tmpcks9dut3", line 30, in <list

In [34]:
# Per-test results for the parsed full-prompt generations.
# NOTE(review): train_dict[10237] is positional indexing; assumes the row
# position equals the `id` column - confirm.
compute_1_pass_by_test(
    "full_prompt_parsed.json",
    [train_dict[10237]],
)

Traceback (most recent call last):
  File "/tmp/tmpmyka4uq3", line 32, in <module>
    print(solution.findCornerPoints(2, [[1,0],[1,2]]))  # Output: [[0, 0], [0, 2], [2, 0], [2, 2]]
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/tmpmyka4uq3", line 30, in findCornerPoints
    return [int(i) for i in [A, B, C, D]]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/tmpmyka4uq3", line 30, in <listcomp>
    return [int(i) for i in [A, B, C, D]]
            ^^^^^^
TypeError: int() argument must be a string, a bytes-like object or a real number, not 'list'
Traceback (most recent call last):
  File "/tmp/tmpmyka4uq3", line 32, in <module>
    print(solution.findCornerPoints(2, [[1,0],[1,2]]))  # Output: [[0, 0], [0, 2], [2, 0], [2, 2]]
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/tmpmyka4uq3", line 30, in findCornerPoints
    return [int(i) for i in [A, B, C, D]]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/tmpmyka4uq3", line 30, in <list

tensor([[False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [False, False],
        [ True, False],
        [False, False],
        [False, False],
        [False, 

{10237: [2, 1]}

In [35]:
# Show the full-prompt metrics; the context manager closes the file, which the
# bare open() call leaked.
# NOTE(review): the recorded output of this cell has no pass@100 entry although
# compute() was called with [1, 10, 100] for this run, and no visible cell writes
# "full_prompt_metrics.json" (the baseline reads "taco_metrics.json"). The file
# may be stale or renamed by hand - confirm it matches the current run.
with open("full_prompt_metrics.json") as metrics_file:
    full_prompt_metrics = json.load(metrics_file)
full_prompt_metrics

{'pass@1': 0.0,
 'pass@10': 0.0,
 'detail': {'pass@1': {'10237': 0.0}, 'pass@10': {'10237': 0.0}}}

## Only Solutions Run

In [36]:
# Solutions-only context: an instruction line followed by four example solutions
# (no question text). The prompt text (including its spelling) is kept exactly
# as-is so it stays identical to the prompt this notebook has always built.
context_prompt = (
    "You will have to answer a programming quesiton in geometry, we will pass before some examples of solutions for similar problems\n"
    + "".join(
        f" EXAMPLE SOLUTION {i}:\n {all_solutions[i]}\n"
        for i in range(4)
    )
)

# Final prompt: the context block followed by the target question.
solutions_prompt = f"Please write a Python program {context_prompt} \nQUESTION: \n{question_input} \n ANSWER: \n."

In [37]:
# run_inference(
#     prompt=solutions_prompt,
#     path = "solutions_prompt.json",
#     num_returns=20,
#     max_length=4096
# )

In [38]:
# Parse the cached solutions-only generations; the context manager closes the
# input file, which the bare open() call leaked.
with open("solutions_prompt.json") as generations_file:
    parse_generation(json.load(generations_file), 10237 , "solutions_parsed.json")

In [39]:
# Evaluate the parsed solutions-only generations with k = 1 and 10.
# NOTE(review): train_dict[10237] is positional indexing; assumes the row
# position equals the `id` column - confirm.
compute(
    "solutions_parsed.json",
    [train_dict[10237]],
    [1, 10],
)

  File "/tmp/tmpzjuc1mv2", line 36
    return sorted([round(A[0]), round(A[1])], [round(B[0]), round(B[1])], [round(C[0]), round(C[1])], [round(D[0]), round(D[1])])
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
SyntaxError: 'return' outside function
  File "/tmp/tmpzjuc1mv2", line 36
    return sorted([round(A[0]), round(A[1])], [round(B[0]), round(B[1])], [round(C[0]), round(C[1])], [round(D[0]), round(D[1])])
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
SyntaxError: 'return' outside function
Traceback (most recent call last):
  File "/tmp/tmphvt__yhn", line 40, in <module>
    code()
  File "/tmp/tmphvt__yhn", line 37, in code
    return sorted([round(A[0]), round(A[1])], [round(B[0]), round(B[1])], [round(C[0]), round(C[1])], [round(D[0]), round(D[1])])
                         ^
NameError: name 'A' is not defined
Tr

In [40]:
# Per-test results for the parsed solutions-only generations.
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so
# no result was produced here - re-run to completion before relying on it.
# NOTE(review): train_dict[10237] is positional indexing; assumes the row
# position equals the `id` column - confirm.
compute_1_pass_by_test(
    "solutions_parsed.json",
    [train_dict[10237]],
)

Process Process-2408:
Traceback (most recent call last):
  File "/home/caio.rhoden/miniconda3/envs/datamodels/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/home/caio.rhoden/miniconda3/envs/datamodels/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/work/caio.rhoden/datamodels-context-reduction/src/taco_evaluator/compute_1_pass_by_test.py", line 17, in _temp_run
    result.append(run_test(sample, test=generation, debug=debug))
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/work/caio.rhoden/datamodels-context-reduction/src/taco_evaluator/metrics/testing_util.py", line 87, in run_test
    detail_results = execute_std_code(exec_code, inputs_list, outputs_list, timeout=TIMEOUT, early_stop=False, debug=debug)
                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/work/caio.rhod

KeyboardInterrupt: 

In [None]:
# Show the solutions-only metrics; the context manager closes the file, which
# the bare open() call leaked.
# NOTE(review): no visible cell writes "solutions_prompt_metrics.json";
# presumably it was copied from the evaluator's output - confirm it is current.
with open("solutions_prompt_metrics.json") as metrics_file:
    solutions_prompt_metrics = json.load(metrics_file)
solutions_prompt_metrics

{'pass@1': 0.0,
 'pass@10': 0.0,
 'detail': {'pass@1': {'10237': 0.0}, 'pass@10': {'10237': 0.0}}}