# TACO Experiment: Cherry-picked context 2

The goal here is to evaluate how the model will behave when passing the Solutions from an analog problem that were manually analyzed and selected  
In this specific scenario we want to test it with a very specific group of tasks from the TACO benchmark  
The difference here is try to evaluate the idea in a different task, with more tests and other metrics to explore

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import polars as pl
import torch
import numpy as np
import json
import re
## LLM
from src.llms import Llama3_1_Instruct
from src.taco_evaluator import compute
from datasets import load_from_disk
import datetime
from typing import List, Dict, Any, Tuple, Mapping
seed = 42
# NumPy
np.random.seed(seed)

# PyTorch
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


if torch.cuda.is_available():
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


Number of GPUs available: 1
GPU 0: NVIDIA RTX A5000


## Load Datasets

In [3]:
PATH  = "../../../data/TACO/processed"
train = pl.read_ipc(f"{PATH}/train.feather")
train_solutions = pl.read_ipc(f"{PATH}/train_solutions.feather")
train_tests = pl.read_ipc(f"{PATH}/train_evaluation_tests.feather")
train_dict = load_from_disk("../../../data/TACO/train.hf")

In [4]:
def run_inference(prompt: str, path: str, num_returns = 200, max_length=2048):
    outputs = []
    llm = Llama3_1_Instruct()
    for i in range(num_returns//10):
        print(f"Lopp {i}, {datetime.datetime.now()}")
        config = {
                    "temperature": 0.7,
                    "max_length": max_length,
                    "top_p": 0.95,
                    "num_return_sequences": 10
        }
        

        output = llm.run(prompt=prompt, input="", config_params=config)

        for res in output:
            outputs.append(res)
        
    json.dump(outputs, open(path, "w"))

In [5]:
def parse_generation(generations: list, id: int, path: str):
    
    gens = []
    for i in range(len(generations)):

        code_blocks = re.findall(r'```python(.*?)```', generations[i]["generated_text"], re.DOTALL)
        extracted_code = "\n".join([block.strip() for block in code_blocks])
        gens.append(extracted_code)
    
    results = [{
        "task_id": int(id),
        "output": gens
    }]

    json.dump(results, open(path, "w"))
        

In [14]:
def filter_test(tests: pl.DataFrame, task_id: int) -> list[str]:

    temp = tests.filter(pl.col("id") == task_id).select(["input", "output"]).to_dict()
    splited_tests = []
    for t in zip(temp["input"], temp["output"]):
        temp_dict = {
            "inputs": [t[0]],
            "outputs": [t[1]]
        }

        splited_tests.append(json.dumps(temp_dict))
    
    return splited_tests
    



def compute_pass_1_by_test(generations_file: str, taco_dataset, tests: pl.DataFrame) -> dict[str, list[float]]:

    
    generations = json.load(open(generations_file))
    metric_results = {}

    for idx in range(len(generations)):

        

        id =  generations[idx]["task_id"]
        metric_results[f"task_{id}"] = []

        splited_tests = filter_test(tests,id)
        for idx_text in range(len(splited_tests)):
            print(f"Test {idx_text}")
            print(datetime.datetime.now())
            temp_taco = taco_dataset[id].copy()
            temp_taco["input_output"] = splited_tests[idx_text]
            result = compute("no_context_parsed.json", [temp_taco], [1], return_dict=True)
            metric_results[f"task_{id}"].append(result["pass@1"])
        
        return metric_results

        



## Problem Selection

The category choosen it was "Probability" in EASY difficulty  
The criteria behind the choice is because there isn't a lot of examples of geometry, which facilitates to find samples specific from that scope.  
The EASY difficulty is for validation purposes

In [7]:
_tests = train_tests.group_by("id").agg(pl.count("test_id").alias("num_tests"))
(
    train
    .join(_tests, on="id", how="left")
    .filter(pl.col("num_tests") > 5)
    .filter(pl.col("tags") == "Probability")
    .group_by(pl.col("difficulty"))
    .agg(pl.count("id"))
)

difficulty,id
str,u32
"""HARD""",69
"""UNKNOWN_DIFFICULTY""",2
"""EASY""",6
"""MEDIUM""",5
"""VERY_HARD""",90
"""MEDIUM_HARD""",23


In [8]:
# (
#     train
#     .join(_tests, on="id", how="left")
#     .filter(pl.col("num_tests") > 5)
#     .filter(pl.col("tags") == "Probability")
#     .filter(pl.col("difficulty") == "MEDIUM")
#     .sample(1)
# )
## ID = 2545

selected_problem = train.filter(pl.col("id") == 2545)
print(selected_problem.select(["input"]).unique().to_dict()["input"][0])

It's the rainy season again, and the city experiences frequent showers throughout the day.

The weather report says that there is a P probability of rainfalls today. Raj has to step out for a meeting at the office, and would like to know the probability that it rains during the time he is on the way.

Input:

The first line of input contains the number of test cases, T. Each of the following T lines contain two numbers, P and time. P denotes the probability that it will rain today and time is the time (in minutes), it will take for Raj to reach his office.

Output:

Output should have T lines each containing answer to corresponding test case. Please round the answer to 4 decimal places.

Constraints:

1 ≤ T ≤ 100
0 ≤ P ≤ 0.5
10 ≤ time ≤ 720
time is a perfect divisor of 1440.

SAMPLE INPUT
2
0 10
.5 720

SAMPLE OUTPUT
0.0000
0.2929



In [9]:
train_tests.filter(pl.col("id") == 2545).select("input").count()

input
u32
10


## Run Baseline - No Context

In [10]:
prompt_input = selected_problem.select("input").to_struct().to_pandas().iloc[0]["input"]
prompt = f"Please write a Python program \nQUESTION: \n{prompt_input} \n ANSWER: \n."
# run_inference(prompt_input, "no_context.json")

In [11]:
# parse_generation(json.load(open("no_context.json")), 2545 , "no_context_parsed.json")

In [12]:
train_dict[2545]

{'question': "It's the rainy season again, and the city experiences frequent showers throughout the day.\n\nThe weather report says that there is a P probability of rainfalls today. Raj has to step out for a meeting at the office, and would like to know the probability that it rains during the time he is on the way.\n\nInput:\n\nThe first line of input contains the number of test cases, T. Each of the following T lines contain two numbers, P and time. P denotes the probability that it will rain today and time is the time (in minutes), it will take for Raj to reach his office.\n\nOutput:\n\nOutput should have T lines each containing answer to corresponding test case. Please round the answer to 4 decimal places.\n\nConstraints:\n\n1 ≤ T ≤ 100\n0 ≤ P ≤ 0.5\n10 ≤ time ≤ 720\ntime is a perfect divisor of 1440.\n\nSAMPLE INPUT\n2\n0 10\n.5 720\n\nSAMPLE OUTPUT\n0.0000\n0.2929",
 'solutions': '["test_case = int(input())\\n\\nwhile test_case:\\n\\tin_1 = input()\\n\\tin_1 = in_1.split()\\n\\t\

In [None]:
compute_pass_1_by_test("no_context_parsed.json", train_dict, train_tests)

Test 0
2025-02-28 16:01:31.899467


In [None]:
a = {
    "inputs": ["0"],
    "outputs": ["1"]
}

str(a, )

"{'inputs': ['0'], 'outputs': ['1']}"

In [17]:
compute("no_context_parsed.json", [train_dict[2545]], [1, 10, 100])

In [14]:
json.load(open("taco_metrics.json"))

{'pass@1': 0.0,
 'pass@10': 0.0,
 'pass@100': 0.0,
 'detail': {'pass@1': {'2545': 0.0},
  'pass@10': {'2545': 0.0},
  'pass@100': {'2545': 0.0}}}

## Select Context

Here we will try to get 4 solutions that are related to the problem above


In [8]:
df = train.filter(pl.col("tags") == "Geometry").filter(pl.col("difficulty") == "EASY").filter(pl.col("input").str.contains("class Solution")).select(["id", "input"])
print(df.count())
df.write_csv("pool.csv")

shape: (1, 2)
┌─────┬───────┐
│ id  ┆ input │
│ --- ┆ ---   │
│ u32 ┆ u32   │
╞═════╪═══════╡
│ 14  ┆ 14    │
└─────┴───────┘


In [9]:
selected_ids = [21825, 10745, 1643, 4661]
train.filter(pl.col("id") == 1643)

id,difficulty,tags,input
u32,str,str,str
1643,"""EASY""","""Geometry""","""Given two rectangles, find if …"
1643,"""EASY""","""Mathematics""","""Given two rectangles, find if …"


In [10]:
all_inputs  = train.filter(pl.col("id").is_in(selected_ids)).select("input").unique().to_dict()["input"]
all_solutions = train_solutions.filter(pl.col("id").is_in(selected_ids)).group_by(pl.col("id")).head(1).select("solution").unique().to_dict()["solution"]
question_input = selected_problem.select("input").to_dict()["input"][0]

all_inputs

input
str
"""Given the coordinates of the e…"
"""Given a circular sheet of radi…"
"""Given two rectangles, find if …"
"""An axis-aligned rectangle is r…"


In [11]:
all_solutions

solution
str
"""class Solution: 	def isRectan…"
"""class Solution: 	def rectangl…"
"""class Solution: 	def doInters…"
"""class Solution: 	def doOverla…"


## Full Prompt Run

In [12]:
context_prompt = "You will have to answer a programming quesiton in geometry, we will pass before some examples of questions and solutions\n"
for i in range(4):
    context_prompt += f"EXAMPLE QUESTION {i}:\n {all_inputs[i]}\n EXAMPLE SOLUTION {i}:\n {all_solutions[i]}\n"

full_prompt = f"Please write a Python program {context_prompt} \nQUESTION: \n{question_input} \n ANSWER: \n."

In [None]:
# run_inference(
#     prompt=full_prompt,
#     path = "full_prompt.json",
#     num_returns=20,
#     max_length=4096
# )

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.49s/it]


Lopp 0, 2025-02-27 08:19:55.087679




Lopp 1, 2025-02-27 08:43:58.668251


In [8]:
parse_generation(json.load(open("full_prompt.json")), 10237 , "full_prompt_parsed.json")

In [9]:
compute("full_prompt_parsed.json", [train_dict[10237]], [1, 10, 100])

In [10]:
json.load(open("full_prompt_metrics.json"))

{'pass@1': 0.0,
 'pass@10': 0.0,
 'detail': {'pass@1': {'10237': 0.0}, 'pass@10': {'10237': 0.0}}}

## Only Solutions Run

In [None]:
context_prompt = "You will have to answer a programming quesiton in geometry, we will pass before some examples of solutions for similar problems\n"
for i in range(4):
    context_prompt += f" EXAMPLE SOLUTION {i}:\n {all_solutions[i]}\n"

solutions_prompt = f"Please write a Python program {context_prompt} \nQUESTION: \n{question_input} \n ANSWER: \n."

In [None]:
# run_inference(
#     prompt=solutions_prompt,
#     path = "solutions_prompt.json",
#     num_returns=20,
#     max_length=4096
# )

In [11]:
parse_generation(json.load(open("solutions_prompt.json")), 10237 , "solutions_parsed.json")

In [12]:
compute("solutions_parsed.json", [train_dict[10237]], [1, 10])

In [13]:
json.load(open("solutions_prompt_metrics.json"))

{'pass@1': 0.0,
 'pass@10': 0.0,
 'detail': {'pass@1': {'10237': 0.0}, 'pass@10': {'10237': 0.0}}}