In [1]:
import numpy as np
import pandas as pd

import os
import sys
import json
from pathlib import Path
import gc
import inspect
import random
import time

import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np
from pathlib import Path

from copy import deepcopy
from copy import copy

from tqdm.notebook import tqdm

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer, SFTConfig
from datasets import Dataset
import torch

In [3]:
# Locations of the ARC-AGI challenge / solution JSON files, relative to this notebook.
data_path = Path('../data')
train_path = data_path / 'arc-agi_training_challenges.json'
train_sols_path = data_path / 'arc-agi_training_solutions.json'
eval_path = data_path / 'arc-agi_evaluation_challenges.json'
eval_sols_path = data_path / 'arc-agi_evaluation_solutions.json'

In [4]:
sys.path.insert(0, '..')
sys.path.insert(0, '../DSL')

from visualization.visualization_utils import *
import solvers
from solver_class import Solver
from dsl import *
from constants import *
from fitness_scoring import *

In [5]:
def _load_json(path):
    """Open `path` for reading and return its parsed JSON content."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Challenges and their solutions for the training and evaluation splits.
train_tasks = _load_json(train_path)
train_sols = _load_json(train_sols_path)
eval_tasks = _load_json(eval_path)
eval_sols = _load_json(eval_sols_path)

# Task labels (the top-level JSON keys) in sorted order.
train_task_labels = sorted(train_tasks)
eval_task_labels = sorted(eval_tasks)

In [6]:
def ast(g):
    """Convert a grid (a sequence of rows) to the tuple-of-tuples format used with the DSL."""
    return tuple(map(tuple, g))


def _tuplify_in_place(tasks, sols, labels):
    """
    Replace, in place, every grid in `tasks` and `sols` by its tuple form.

    For each label this converts the input/output of every 'train' pair,
    the input of every 'test' example, and the solution stored at the same
    index as that test example.
    """
    for label in labels:
        task = tasks[label]
        for pair in task['train']:
            pair['input'] = ast(pair['input'])
            pair['output'] = ast(pair['output'])
        for idx, example in enumerate(task['test']):
            example['input'] = ast(example['input'])
            sols[label][idx] = ast(sols[label][idx])


# Convert all train and eval examples to tuples for DSL
_tuplify_in_place(train_tasks, train_sols, train_task_labels)
_tuplify_in_place(eval_tasks, eval_sols, eval_task_labels)

In [7]:
# Use the GPU when torch reports one is available, otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using {DEVICE}")

Using cuda


# Load pretrained model

In [8]:
# The tokenizer comes from the base Salesforce checkpoint, while the model weights
# are read from a local fine-tuning output directory (v7).
# NOTE(review): presumably v7 was fine-tuned from codegen-350M-mono so the two match — confirm.
codegen_tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono",
                                                 padding_side='left', # For padding batches of input in decoder-only context
                                                 clean_up_tokenization_spaces = True,
                                                 ) 
codegen = AutoModelForCausalLM.from_pretrained("../CodeGen fine-tuning/outputs/v7/",
                                               #load_in_8bit = True, # Don't bother with this, actually slows runtime significantly in practice.
                                              )
codegen = codegen.to(DEVICE) # Comment this line out if quantizing via load_in_8bit

In [9]:
# Special-token strings; clean_candidate strips these from decoded generations.
EOS_TOKEN = codegen_tokenizer.eos_token
BOS_TOKEN = codegen_tokenizer.bos_token
# Register a dedicated padding token instead of reusing EOS (see the disabled line below).
# NOTE(review): the model's embeddings are not resized in this notebook — confirm the
# v7 checkpoint already has room for the '[PAD]' id.
codegen_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
PAD_TOKEN = '[PAD]'
#codegen_tokenizer.pad_token = codegen_tokenizer.eos_token

# Helper functions

In [10]:
def grid_to_string(grid):
    """
    Render the grid with str() and drop every space character.
    A 30-by-30 grid will use ~1800 tokens.
    """
    pieces = str(grid).split(" ")
    return "".join(pieces)

def grid_to_string_compact(grid):
    """
    One line of text per row, each cell rendered with str() and no separators.
    Uses approximately 4-times fewer tokens than grid_to_string.
    """
    lines = []
    for row in grid:
        lines.append("".join(map(str, row)))
    return "\n".join(lines)

In [11]:
def create_generation_prompt(in_grid, out_grid, name = 'solve'):
    """
    Build the text prompt that starts a solver function for one input/output pair.

    The prompt is the `def {name}(I):` header followed by a docstring that lists
    the compact text form of `in_grid` under INPUT: and of `out_grid` under
    OUTPUT:, and ends with a 4-space indent ready for the first body line.

    Args:
        in_grid: Input grid, passed to grid_to_string_compact.
        out_grid: Output grid, passed to grid_to_string_compact.
        name: Name given to the generated function.

    Returns:
        The prompt string.
    """
    # Indent every continuation row so the grid lines up inside the docstring.
    # This is done outside the f-string because a backslash inside an f-string
    # replacement field is a SyntaxError on Python versions before 3.12.
    indented_input = grid_to_string_compact(in_grid).replace("\n", "\n    ")
    indented_output = grid_to_string_compact(out_grid).replace("\n", "\n    ")
    result = f'''def {name}(I):
    """
    INPUT:
    {indented_input}
    OUTPUT:
    {indented_output}
    """
    '''
    return result

In [12]:
def random_label() -> str:
    """
    Return a random task label: 8 lowercase hexadecimal digits.
    """
    hex_digits = "0123456789abcdef"
    label = ""
    for _ in range(8):
        label += random.choice(hex_digits)
    return label

In [13]:
def random_grid(rows = None, cols = None, palette = None):
    """
    Build a grid of cells drawn uniformly at random from `palette`.

    Any falsy argument is replaced by a default: a random size between 1 and 30
    for `rows` / `cols`, and the ten values 0-9 for `palette`.
    Returns a tuple of `rows` tuples, each holding `cols` entries.
    """
    rows = rows or random.randint(1,30)
    cols = cols or random.randint(1,30)
    palette = palette or [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

    grid = []
    for _ in range(rows):
        grid.append(tuple(random.choice(palette) for _ in range(cols)))
    return tuple(grid)

In [14]:
def memory_safe_generate(model,
                         tokenizer,
                         inputs,
                         generate_args = {
                           'max_new_tokens': 512, 
                           'do_sample': True,
                           'temperature': 2.0,
                           'top_k': 50,
                         },
                         batchsize = 32,
                        ):
    """
    Run `model.generate` over `inputs` in batches, halving the batch size
    whenever CUDA runs out of memory.

    Args:
        model: Object exposing `generate(**tokens, pad_token_id=..., **generate_args)`.
        tokenizer: Callable tokenizer that also provides `pad_token_id` and
            `batch_decode`.
        inputs: List of prompt strings.
        generate_args: Extra keyword arguments forwarded to `model.generate`.
            The default dict is only read, never modified.
        batchsize: Starting number of prompts per batch. Once reduced after an
            out-of-memory error it stays reduced for the remaining batches.

    Returns:
        List with one decoded string per input, in input order.

    Raises:
        ValueError: If `batchsize` is smaller than 1.
        MemoryError: If generation still runs out of memory with a single prompt.

    Side effects:
        If a global list named `batchsize_log` exists, the final batch size is
        appended to it; otherwise nothing is logged.
    """
    if batchsize < 1:
        raise ValueError('batchsize must be at least 1')

    current_idx = 0
    results = []
    while current_idx < len(inputs):
        outputs = None
        # Identity check: `== None` on a tensor goes through tensor comparison.
        while outputs is None:
            tokens = tokenizer(inputs[current_idx : current_idx + batchsize],
                               padding = True,
                               return_tensors = 'pt',
                              ).to(DEVICE)
            ran_out_of_memory = False
            try:
                outputs = model.generate(**tokens,
                                 pad_token_id = tokenizer.pad_token_id,
                                **generate_args,
                                )
            except torch.cuda.OutOfMemoryError:
                ran_out_of_memory = True

            # Recover outside the except block so the exception (and the frames
            # it references) is gone before memory is released.
            if ran_out_of_memory:
                del tokens
                torch.cuda.empty_cache()
                batchsize = batchsize // 2
                if batchsize == 0:
                    raise MemoryError('batchsize was reduced to 1, but still ran out of memory!')
        
        results.extend(tokenizer.batch_decode(outputs))
        current_idx += batchsize

    # Logging is optional: only record when the notebook has created the log list.
    log = globals().get('batchsize_log')
    if log is not None:
        log.append(batchsize)
            
    return results

In [15]:
def generate_prompts(in_out_pairs, num_prompts):
    """
    Build exactly `num_prompts` generation prompts by cycling through the pairs.

    Args:
        in_out_pairs: Sequence of dicts with 'input' and 'output' grids.
        num_prompts: Number of prompts to return.

    Returns:
        List of prompt strings: one prompt per pair, repeated in order and
        truncated to `num_prompts`. Empty when `num_prompts` is not positive.

    Raises:
        ValueError: If prompts are requested but `in_out_pairs` is empty
            (the loop below could never fill the list).
    """
    if num_prompts <= 0:
        return []
    if not in_out_pairs:
        raise ValueError('in_out_pairs is empty; cannot build any prompts')

    # The prompts for one pass over the pairs never change, so build them once.
    one_pass = [create_generation_prompt(pair['input'], pair['output']) for pair in in_out_pairs]
    results = []
    while len(results) < num_prompts:
        results.extend(one_pass)
    return results[:num_prompts]

In [16]:
def generate_candidate_solvers(model,
                               tokenizer,
                               in_out_pairs,
                               generate_args = {
                                   'max_new_tokens': 512, 
                                   'do_sample': True,
                                   'temperature': 2.0,
                                   'top_k': 50,
                               },
                               num_candidates = 512,
                               min_num_solvers = None):
    """
    Generate raw candidate solver texts for a task.

    Args:
        model: Passed through to memory_safe_generate.
        tokenizer: Passed through to memory_safe_generate.
        in_out_pairs: Input/output pairs the prompts are built from.
        generate_args: Keyword arguments forwarded to memory_safe_generate.
        num_candidates: Number of prompts (and so of candidates) to generate.
        min_num_solvers: Legacy alias for `num_candidates`, still used as a
            keyword by the experiment cells of this notebook. When given, it
            takes precedence over `num_candidates`.

    Returns:
        The list of decoded generations returned by memory_safe_generate.
    """
    import math

    if min_num_solvers is not None:
        num_candidates = min_num_solvers
    # Callers sometimes pass a fractional count (e.g. `needed * 1.4`); round up
    # so the slice inside generate_prompts receives an integer.
    num_candidates = math.ceil(num_candidates)

    prompts = generate_prompts(in_out_pairs, num_candidates)
    candidates = memory_safe_generate(model, tokenizer, prompts, generate_args = generate_args)
    return candidates
        

In [17]:
def clean_candidate(candidate: str) -> str:
    """
    Strip the special tokens from a generation and cut it after the first
    "return O".

    Returns None when the text contains no "return O".
    """
    for special in (EOS_TOKEN, BOS_TOKEN, PAD_TOKEN):
        candidate = candidate.replace(special, "")
    head, marker, _ = candidate.partition("return O")
    if not marker:
        return None
    return head + marker

# Generate solvers

In [18]:
%env TOKENIZERS_PARALLELISM true

env: TOKENIZERS_PARALLELISM=true


In [19]:
# Maps each eval task label to its list of (score, solver_text) pairs, sorted by score.
full_scoring_results = {}
# memory_safe_generate appends the batch size it ended with after every call.
batchsize_log = []

# Sampling settings; identical for every task, so built once outside the loop.
generate_args = {
    'max_new_tokens': 512,
    'do_sample': True,
    'temperature': 2.0,
    'top_k': 50,
    #'num_beams': 2, # Using multiple beams creates too much memory pressure
}

for i, label in enumerate(tqdm(eval_task_labels)):

    pairs = eval_tasks[label]['train']
    results_this_round = []

    candidates = generate_candidate_solvers(codegen, codegen_tokenizer, pairs, generate_args = generate_args, num_candidates = 200)

    # Clean up candidates; texts without a "return O" are recorded with score 1.1
    cleaned_candidates = []
    for c in candidates:
        cleaned = clean_candidate(c)
        if not cleaned:
            results_this_round.append((1.1, c.replace(EOS_TOKEN, "").replace(BOS_TOKEN, "").replace(PAD_TOKEN, "")))
        else:
            cleaned_candidates.append(cleaned)

    # Generate programs; texts that Solver rejects are also recorded with score 1.1
    programs = []
    for c in cleaned_candidates:
        try:
            programs.append(Solver(c))
        except Exception:
            # `Exception` rather than a bare except, so that interrupting the
            # kernel still stops this long-running loop.
            results_this_round.append((1.1, c))

    # Compute scores
    scoring_results = score_solvers_vs_tasks(programs, pairs, scoring_functions, solver_timeout = 1.0)

    # Record results
    results_this_round += [(score, solver.function_text) for score, solver in scoring_results]
    full_scoring_results[label] = sorted(results_this_round, key = lambda x: x[0])

    # Save every 10 tasks (mode 'w' already truncates the file, no seek needed)
    if i%10 == 0:
        with open('v7.json', 'w') as f:
            json.dump(full_scoring_results, f, indent = 4)

# Final save with the results for all tasks
with open('v7.json', 'w') as f:
    json.dump(full_scoring_results, f, indent = 4)

  0%|          | 0/400 [00:00<?, ?it/s]

In [20]:
# Tally how often each batch size was recorded in batchsize_log.
pd.Series(batchsize_log).value_counts()

32    390
16     10
Name: count, dtype: int64

# Experimentation code

In [18]:
# Task labels used by the experiments below; they are looked up in eval_tasks.
experiment_labels = ['d56f2372', '1e97544e', '3490cc26', 'bf699163', '2037f2c7']
# NOTE(review): MAX_BATCH_SIZE is not referenced by any visible cell — confirm it is still needed.
MAX_BATCH_SIZE = 32

In [17]:
def conduct_and_log_experiment(labels, 
                               model,
                               tokenizer,
                               solvers_per_task = 64,
                               generate_args = {
                                    'max_new_tokens': 512,
                                    'do_sample': True,
                                    'top_k': 50,
                                    'temperature': 1.0,
                               },
                               log_file = 'log.json'
                              ):
    """
    Generate a bunch of solvers for each task according to parameters
    given, then score them, then record the scores to log_file.

    The log is a dict with labels as keys, and each entry is a list of
    pairs (score, solver_text) sorted by score. A score of 1.1 indicates
    that solver_text failed to compile to a program. A score of 1.0
    almost surely indicates that solver_text compiled but failed to 
    run on one of the pairs. Any lower scores are computed as usual. The
    solver texts should not include docstrings.

    Args:
        labels: Task labels; each is looked up in the global eval_tasks.
        model: Model passed to generate_candidate_solvers.
        tokenizer: Tokenizer passed to generate_candidate_solvers.
        solvers_per_task: Number of candidates generated and kept per task.
        generate_args: Sampling arguments forwarded to generation. The
            default dict is only read, never modified.
        log_file: Path of the JSON file the results are written to. Its
            parent directory is created if missing.

    Returns:
        The same dict that is written to log_file.
    """
    # Create the log directory up front so a missing folder cannot cause a
    # failure after all the generation and scoring work is done.
    Path(log_file).parent.mkdir(parents = True, exist_ok = True)

    results = {label: [] for label in labels}
    
    for label in tqdm(labels):
        pairs = eval_tasks[label]['train']
        results_this_round = []
        
        # Generate candidate solvers (generate_candidate_solvers takes the
        # count as `num_candidates`)
        candidates = generate_candidate_solvers(model, tokenizer, pairs, generate_args = generate_args, num_candidates = solvers_per_task)
        candidates = candidates[:solvers_per_task]

        # Clean up candidates
        cleaned_candidates = []
        for c in candidates:
            cleaned = clean_candidate(c)
            if not cleaned:
                results_this_round.append((1.1, c.replace(EOS_TOKEN, "").replace(BOS_TOKEN, "").replace(PAD_TOKEN, "")))
            else:
                cleaned_candidates.append(cleaned)

        # Parse the candidates into programs
        solvers = []
        for c in cleaned_candidates:
            try:
                solvers.append(Solver(c))
            except Exception:
                # Not a bare except: a kernel interrupt must still propagate.
                results_this_round.append((1.1, c))
    
        # Score the solvers
        scoring_results = score_solvers_vs_tasks(solvers, pairs, scoring_functions, solver_timeout = 1.0)
        scoring_results = [(score, solver.function_text) for score, solver in scoring_results]
        results_this_round += scoring_results
        
        # Add results
        results[label] = sorted(results_this_round, key = lambda x: x[0])
    
    # Log the results
    with open(log_file, 'w') as f:
        json.dump(results, f)

    return results

### Experiment 5, 10/27/2024: First shot at genetic algorithm

In [58]:
generations = {} # Records results of each generation

label = experiment_labels[4]
pairs = eval_tasks[label]['train']
solvers_per_generation = 64
generate_args = {
    'max_new_tokens': 512,
    'do_sample': True,
    'top_k': 50,
    'temperature': 2.0,
}

generation_results = [] # (score, solver_text) pairs; 1.1 marks texts that never became a Solver
generation = []         # Solver objects built from the candidates

def create_new_generation(old_generation):
    """
    Placeholder for the next-generation step; not implemented yet.

    Planned: first mutate the existing quartiles, then fill in the
    remaining space with new functions.
    """
    raise NotImplementedError('create_new_generation is not implemented yet')

def cull_generation(generation):
    """Placeholder for the culling step; not implemented yet (returns None)."""
    pass

new_solvers_needed = solvers_per_generation - len(generation)
# Generate more to account for dups. The count is rounded to an integer here
# because generate_candidate_solvers slices its prompt list with it.
candidates = generate_candidate_solvers(codegen, 
                                        codegen_tokenizer,
                                        pairs,
                                        generate_args = generate_args, 
                                        num_candidates = int(new_solvers_needed * 1.4) + 1,
                                       )
# Drop duplicates while keeping generation order (a set would reorder them
# differently on every kernel run).
candidates = list(dict.fromkeys(candidates))
candidates = candidates[:new_solvers_needed]

# Clean up candidates
cleaned_candidates = []
for c in candidates:
    cleaned = clean_candidate(c)
    if not cleaned:
        generation_results.append((1.1, c.replace(EOS_TOKEN, "").replace(BOS_TOKEN, "").replace(PAD_TOKEN, "")))
    else:
        cleaned_candidates.append(cleaned)

# Parse the candidates into programs
for c in cleaned_candidates:
    try:
        generation.append(Solver(c))
    except Exception:
        # Not a bare except: a kernel interrupt must still propagate.
        generation_results.append((1.1, c))

96
78
64


In [59]:
generation_scored = score_solvers_vs_tasks(generation, pairs, scoring_functions, solver_timeout = 1.0)

# NOTE: this extends generation_results in place, so running the cell twice
# records the scored solvers twice.
generation_results += [(score, solver.function_text) for score, solver in generation_scored]
generations['generation 0'] = sorted(generation_results, key = lambda x: x[0])

# Log the results
log_file = f"fitness data/genetic alg experiments/{label}.json"
# Make sure the target folder exists; open() does not create directories.
Path(log_file).parent.mkdir(parents = True, exist_ok = True)
with open(log_file, 'w') as f:
    json.dump(generations, f)

In [48]:
# NOTE(review): `S` is not defined in any visible cell; presumably a Solver left over
# from an earlier kernel session — confirm, or define it before this cell.
print(S.function_text)

def solve_955a34d9(I):
    x1 = partition(I)
    x2 = objects(I, T, F, F)
    x3 = argmax(x2, size)
    x4 = color(x3)
    x5 = remove(ZERO, x1)
    x6 = other(x5, x4)
    O = canvas(x6, UNITY)
    return O


In [55]:
# Show the text returned by without_last_k_lines(4) with the docstring excluded.
# NOTE(review): relies on `S`, which no visible cell defines.
print(S.without_last_k_lines(4, include_docstring = False))

def solve_955a34d9(I):
    x1 = partition(I)
    x2 = objects(I, T, F, F)
    x3 = argmax(x2, size)
    x4 = color(x3)


In [51]:
# NOTE(review): relies on `S`, which no visible cell defines.
S.num_lines()

7

### Experiment 4b, 10/26/2024: Totally random grids with higher temps

Using the parameters from experiment 4 gave only ~330 unique solvers per trial, which is a lot of repetition out of 512 solvers. Here we turn the temperature up to 3.0 to look for more varied results.

In [22]:
num_trials = 10

solvers_per_task = 512
generate_args = {
    'max_new_tokens': 512,
    'do_sample': True,
    'top_k': 50,
    'temperature': 3.0,
}

for trial in range(num_trials):
    results = {label: [] for label in experiment_labels}
    # Entries for texts that never became a Solver (score 1.1); shared by every task.
    results_starter = []

    # Generate random pairs to seed our solver generator
    random_pairs = [{'input': random_grid(10, 10), 'output': random_grid(10, 10)} for _ in range(4)]
    
    # Generate candidate solvers (generate_candidate_solvers takes the count
    # as `num_candidates`)
    candidates = generate_candidate_solvers(codegen, 
                                            codegen_tokenizer,
                                            random_pairs,
                                            generate_args = generate_args, 
                                            num_candidates = solvers_per_task,
                                           )
    candidates = candidates[:solvers_per_task]

    # Clean up candidates
    cleaned_candidates = []
    for c in candidates:
        cleaned = clean_candidate(c)
        if not cleaned:
            results_starter.append((1.1, c.replace(EOS_TOKEN, "").replace(BOS_TOKEN, "").replace(PAD_TOKEN, "")))
        else:
            cleaned_candidates.append(cleaned)

    # Parse the candidates into programs
    solvers = []
    for c in cleaned_candidates:
        try:
            solvers.append(Solver(c))
        except Exception:
            # Not a bare except: a kernel interrupt must still propagate.
            results_starter.append((1.1, c))

    # Now score our random solvers against each task
    for label in tqdm(experiment_labels):
        pairs = eval_tasks[label]['train']
        # A shallow copy is enough: the entries are immutable (score, text) tuples.
        results_this_round = list(results_starter)

        # Score the solvers
        scoring_results = score_solvers_vs_tasks(solvers, pairs, scoring_functions, solver_timeout = 1.0)
        scoring_results = [(score, solver.function_text) for score, solver in scoring_results]
        results_this_round += scoring_results
        
        # Add results
        results[label] = sorted(results_this_round, key = lambda x: x[0])
    
    # Log the results
    log_file = f"fitness data/random grid experiments/trial{trial}_temp3.json"
    # Make sure the target folder exists; open() does not create directories.
    Path(log_file).parent.mkdir(parents = True, exist_ok = True)
    with open(log_file, 'w') as f:
        json.dump(results, f)

  0%|          | 0/5 [00:00<?, ?it/s]

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7ed3372d6b10>>
Traceback (most recent call last):
  File "/home/amzi/installs/miniforge3/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

  File "/home/amzi/installs/miniforge3/lib/python3.12/site-packages/timeout_decorator/timeout_decorator.py", line 69, in handler
    _raise_exception(timeout_exception, exception_message)
  File "/home/amzi/installs/miniforge3/lib/python3.12/site-packages/timeout_decorator/timeout_decorator.py", line 45, in _raise_exception
    raise exception()
timeout_decorator.timeout_decorator.TimeoutError: 'Timed Out'


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

### Experiment 4, 10/26/2024: What happens with programs generated via totally random grids?

We use four pairs of 10x10 uniform random grids to generate 512 solvers, then run those solvers vs the tasks, for 10 trials.

In [21]:
num_trials = 10

solvers_per_task = 512
generate_args = {
    'max_new_tokens': 512,
    'do_sample': True,
    'top_k': 50,
    'temperature': 2.0,
}

for trial in range(num_trials):
    results = {label: [] for label in experiment_labels}
    # Entries for texts that never became a Solver (score 1.1); shared by every task.
    results_starter = []

    # Generate random pairs to seed our solver generator
    random_pairs = [{'input': random_grid(10, 10), 'output': random_grid(10, 10)} for _ in range(4)]
    
    # Generate candidate solvers (generate_candidate_solvers takes the count
    # as `num_candidates`)
    candidates = generate_candidate_solvers(codegen, 
                                            codegen_tokenizer,
                                            random_pairs,
                                            generate_args = generate_args, 
                                            num_candidates = solvers_per_task,
                                           )
    candidates = candidates[:solvers_per_task]

    # Clean up candidates
    cleaned_candidates = []
    for c in candidates:
        cleaned = clean_candidate(c)
        if not cleaned:
            results_starter.append((1.1, c.replace(EOS_TOKEN, "").replace(BOS_TOKEN, "").replace(PAD_TOKEN, "")))
        else:
            cleaned_candidates.append(cleaned)

    # Parse the candidates into programs
    solvers = []
    for c in cleaned_candidates:
        try:
            solvers.append(Solver(c))
        except Exception:
            # Not a bare except: a kernel interrupt must still propagate.
            results_starter.append((1.1, c))

    # Now score our random solvers against each task
    for label in tqdm(experiment_labels):
        pairs = eval_tasks[label]['train']
        # A shallow copy is enough: the entries are immutable (score, text) tuples.
        results_this_round = list(results_starter)

        # Score the solvers
        scoring_results = score_solvers_vs_tasks(solvers, pairs, scoring_functions, solver_timeout = 1.0)
        scoring_results = [(score, solver.function_text) for score, solver in scoring_results]
        results_this_round += scoring_results
        
        # Add results
        results[label] = sorted(results_this_round, key = lambda x: x[0])
    
    # Log the results
    log_file = f"fitness data/random grid experiments/trial{trial}.json"
    # Make sure the target folder exists; open() does not create directories.
    Path(log_file).parent.mkdir(parents = True, exist_ok = True)
    with open(log_file, 'w') as f:
        json.dump(results, f)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x7ed3372d6b10>>
Traceback (most recent call last):
  File "/home/amzi/installs/miniforge3/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

  File "/home/amzi/installs/miniforge3/lib/python3.12/site-packages/timeout_decorator/timeout_decorator.py", line 69, in handler
    _raise_exception(timeout_exception, exception_message)
  File "/home/amzi/installs/miniforge3/lib/python3.12/site-packages/timeout_decorator/timeout_decorator.py", line 45, in _raise_exception
    raise exception()
timeout_decorator.timeout_decorator.TimeoutError: 'Timed Out'


  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

### Experiment 3, 10/26/2024: How does top_k affect results?

In [None]:
# Sweep top_k with everything else fixed; one log file per value.
top_ks = [2, 4, 8, 16, 32, 64, 128]
for top_k in top_ks:
    top_k_sampling = {
        'max_new_tokens': 512,
        'do_sample': True,
        'top_k': top_k,
        'temperature': 2.0,
    }
    results = conduct_and_log_experiment(
        labels = experiment_labels,
        model = codegen,
        tokenizer = codegen_tokenizer,
        solvers_per_task = 512,
        generate_args = top_k_sampling,
        log_file = f'fitness data/top_k experiments/{top_k}.json',
    )

### Experiment 2, 10/26/2024: How does number of solvers affect results?

In [None]:
# Sweep the number of solvers per task with fixed sampling settings; one log file per count.
solver_counts = [32, 64, 128, 256, 512, 1024, 2048]
for num_solvers in solver_counts:
    fixed_sampling = {
        'max_new_tokens': 512,
        'do_sample': True,
        'top_k': 50,
        'temperature': 2.0,
    }
    results = conduct_and_log_experiment(
        labels = experiment_labels,
        model = codegen,
        tokenizer = codegen_tokenizer,
        solvers_per_task = num_solvers,
        generate_args = fixed_sampling,
        log_file = f'fitness data/num_solvers experiments/{num_solvers}.json',
    )

### Experiment 1, 10/25/2024: How does temperature affect results?

In [None]:
# Sweep the sampling temperature with everything else fixed; one log file per value.
temperatures = [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
for temp in temperatures:
    temperature_sampling = {
        'max_new_tokens': 512,
        'do_sample': True,
        'top_k': 50,
        'temperature': temp,
    }
    results = conduct_and_log_experiment(
        labels = experiment_labels,
        model = codegen,
        tokenizer = codegen_tokenizer,
        solvers_per_task = 256,
        generate_args = temperature_sampling,
        log_file = f'fitness data/temperature experiments/temp{temp}.json',
    )

# Scratch work

In [18]:
# Number of tasks that currently have recorded results.
len(full_scoring_results)

22

In [19]:
# Labels of the tasks that currently have recorded results.
list(full_scoring_results.keys())

['00576224',
 '009d5c81',
 '00dbd492',
 '03560426',
 '05a7bcf2',
 '0607ce86',
 '0692e18c',
 '070dd51e',
 '08573cc6',
 '0934a4d8',
 '09c534e7',
 '0a1d4ef5',
 '0a2355a6',
 '0b17323b',
 '0bb8deee',
 '0becf7df',
 '0c786b71',
 '0c9aba6e',
 '0d87d2a6',
 '0e671a1a',
 '0f63c0b9',
 '103eff5b']

In [23]:
# Lowest recorded score for every task, as (label, score) pairs.
[(label, min(score for score, _ in scored)) for label, scored in full_scoring_results.items()]

[('00576224', 1.0),
 ('009d5c81', 1.0),
 ('00dbd492', 1.0),
 ('03560426', 0.34803823090542035),
 ('05a7bcf2', 0.20541399619989428),
 ('0607ce86', 1.0),
 ('0692e18c', 1.0),
 ('070dd51e', 1.1),
 ('08573cc6', 1.0),
 ('0934a4d8', 1.1),
 ('09c534e7', 1.0),
 ('0a1d4ef5', 1.0),
 ('0a2355a6', 1.0),
 ('0b17323b', 1.0),
 ('0bb8deee', 1.0),
 ('0becf7df', 0.19372860470435668),
 ('0c786b71', 1.1),
 ('0c9aba6e', 1.1),
 ('0d87d2a6', 1.0),
 ('0e671a1a', 1.0),
 ('0f63c0b9', 1.0),
 ('103eff5b', 1.0)]

In [24]:
# Pull out the sorted (score, solver_text) list of a single task for inspection.
R = full_scoring_results['00576224']

In [25]:
# Display the full list (long output).
R

[(1.0,
  'def solve(I):\n    x1 = width(I)\n    x2 = multiply(x1, x1)\n    x3 = tojvec(x2)\n    x4 = palette(I)\n    x5 = remove(x3, x4)\n    x6 = lbind(colorcount, I)\n    x7 = argmax(x5, x6)\n    x8 = remove(x7, x5)\n    x9 = height(I)\n    x10 = increment(x9)\n    x11 = frontiers(I)\n    x12 = sfilter(x11, hline)\n    x13 = size(x12)\n    x14 = increment(x13)\n    x15 = divide(x10, x14)\n    x16 = width(I)\n    x17 = increment(x16)\n    x18 = frontiers(I)\n    x19 = sfilter(x18, vline)\n    x20 = size(x19)\n    x21 = increment(x20)\n    x22 = divide(x17, x21)\n    x23 = rbind(multiply, x15)\n    x24 = rbind(divide, x15)\n    x25 = compose(x23, x24)\n    x26 = fork(equality, identity, x25)\n    x27 = rbind(multiply, x22)\n    x28 = rbind(divide, x22)\n    x29 = compose(x27, x28)\n    x30 = fork(equality, identity, x29)\n    x31 = lbind(fork, both)\n    x32 = rbind(compose, first)\n    x33 = lbind(compose, x26)\n    x34 = lbind(rbind, subtract)\n    x35 = compose(x34, uppermost)\n    

In [27]:
# Print one recorded solver text with its escape sequences rendered, to read it as code.
print('def solve(I):\n    """\n    INPUT:\n    86\n    64\n    OUTPUT:\n    868686\n    646464\n    686868\n    464646\n    868686\n    646464\n    """\n    x1 = hconcat(I, I)\n    x2 = vconcat(x1, x1)\n    x3 = asindices(x2)\n    x4 = mostcolor(I)\n    x5 = ofcolor(x2, x4)\n    x6 = difference(x3, x5)\n    x7 = mapply(ineighbors, x6)\n    O = underfill(x2, EIGHT, x7)\n    return O')

def solve(I):
    """
    INPUT:
    86
    64
    OUTPUT:
    868686
    646464
    686868
    464646
    868686
    646464
    """
    x1 = hconcat(I, I)
    x2 = vconcat(x1, x1)
    x3 = asindices(x2)
    x4 = mostcolor(I)
    x5 = ofcolor(x2, x4)
    x6 = difference(x3, x5)
    x7 = mapply(ineighbors, x6)
    O = underfill(x2, EIGHT, x7)
    return O


In [28]:
# Scratch copy of the solver text printed above, defined as a real function.
# The docstring is the generation prompt (one INPUT/OUTPUT pair in compact form).
def solve(I):
    """
    INPUT:
    86
    64
    OUTPUT:
    868686
    646464
    686868
    464646
    868686
    646464
    """
    x1 = hconcat(I, I)
    x2 = vconcat(x1, x1)
    x3 = asindices(x2)
    x4 = mostcolor(I)
    x5 = ofcolor(x2, x4)
    x6 = difference(x3, x5)
    x7 = mapply(ineighbors, x6)
    O = underfill(x2, EIGHT, x7)
    return O

In [31]:
# Check that a solver text that still contains its prompt docstring passes through
# clean_candidate and is accepted by Solver; the cell displays the resulting object.
Solver(clean_candidate('''def solve(I):
    """
    INPUT:
    86
    64
    OUTPUT:
    868686
    646464
    686868
    464646
    868686
    646464
    """
    x1 = hconcat(I, I)
    x2 = vconcat(x1, x1)
    x3 = asindices(x2)
    x4 = mostcolor(I)
    x5 = ofcolor(x2, x4)
    x6 = difference(x3, x5)
    x7 = mapply(ineighbors, x6)
    O = underfill(x2, EIGHT, x7)
    return O'''))

def solve(I):
    """
    INPUT:
    86
    64
    OUTPUT:
    868686
    646464
    686868
    464646
    868686
    646464
    
    """
    x1 = hconcat(I, I)
    x2 = vconcat(x1, x1)
    x3 = asindices(x2)
    x4 = mostcolor(I)
    x5 = ofcolor(x2, x4)
    x6 = difference(x3, x5)
    x7 = mapply(ineighbors, x6)
    O = underfill(x2, EIGHT, x7)
    return O

In [None]:
# Avoiding OOM on 25094a63, task at index 60
l = eval_task_labels[60]
print(l)
in_out_pairs = eval_tasks[l]['train']

# Repeat the per-pair prompts until there are at least 200 of them.
prompts_one_pass = [create_generation_prompt(pair['input'], pair['output']) for pair in in_out_pairs]
prompts = []
while len(prompts) < 200:
    prompts.extend(prompts_one_pass)

print(prompts[4])
# Token count of the first prompt.
print(len(codegen_tokenizer.encode(prompts[0])))
# Requires 900+ tokens to represent grids, which leads to OOM

In [68]:
# Task under inspection. (An earlier assignment `l = '009d5c81'` was overwritten
# immediately by this line and has been removed.)
l = eval_task_labels[4]

pairs = eval_tasks[l]['train']
#pairs = [{'input': random_grid(20, 20), 'output': random_grid(20, 20)} for i in range(3)]

generate_args = {
    'max_new_tokens': 512,
    'do_sample': True,
    'temperature': 0.5,
    'top_k': 20,
    #'num_beams': 2, # Using multiple beams creates too much memory pressure
}

# Time the generation of 50 candidates (generate_candidate_solvers takes the
# count as `num_candidates`).
starttime = time.time()
cands = generate_candidate_solvers(codegen, codegen_tokenizer, pairs, generate_args = generate_args, num_candidates = 50)
print(f"Computed candidates in {(time.time()-starttime):.2f} seconds")

Computed candidates in 32.84 seconds


In [69]:
# One entry per candidate; clean_candidate gives None for texts without "return O".
cleaned_cands = list(map(clean_candidate, cands))

In [70]:
# Build a Solver for every usable candidate; anything Solver rejects is dropped.
progs = []
for cand in cleaned_cands:
    if cand is None:
        # clean_candidate found no "return O"; there is nothing to parse.
        continue
    try:
        progs.append(Solver(cand))
    except Exception:
        # Deliberate best-effort: skip texts that cannot be turned into a Solver.
        # `Exception` (not a bare except) so a kernel interrupt still propagates.
        continue

In [71]:
# Number of candidates that were successfully turned into Solver objects.
len(progs)

44

In [72]:
# Re-run the star import (fitness_scoring is already imported near the top of the notebook).
# NOTE(review): this re-binds names from the cached module; it does not reload edited source.
from fitness_scoring import *
       
# NOTE(review): the cells below use `scores` as a mapping (.values() / .items()), while
# other cells iterate this function's result as (score, solver) pairs — confirm which
# return shape the current fitness_scoring provides.
scores = score_solvers_vs_tasks(progs, pairs, scoring_functions, solver_timeout = 1)

In [73]:
# Keep only the entries that have a score; `is not None` is the identity check
# also used by the printing cell below.
non_null_scores = [score for score in scores.values() if score is not None]
sorted(non_null_scores)

[0.09617932940310235,
 0.10830729281693237,
 0.12581017833567684,
 0.12581017833567684,
 0.12581017833567684,
 0.1451198226642063,
 0.2276676681134889,
 0.24111676145747532,
 0.25464015450133604,
 0.26147469968565545,
 0.26147469968565545,
 0.26147469968565545,
 0.26147469968565545,
 0.26147469968565545,
 0.2667732786862824,
 0.29310876589370266,
 0.3650276645492598,
 0.36861214897799227,
 0.4521855666908194,
 0.5704644893874207,
 0.5704644893874207,
 0.5777334591509189,
 0.6694927403348179,
 0.6694927403348179,
 0.7600000000000001,
 0.7600000000000001,
 0.7942041298228831,
 0.9462068965517242,
 0.9597777777777777]

In [74]:
# Index the truthy programs by name: the text between the first space and the
# first "(" after it in str(prog). Later programs overwrite earlier ones with the same name.
prog_dict = {
    str(prog).split(" ", 1)[1].split("(")[0]: prog
    for prog in progs
    if prog
}

In [75]:
# Print every program scoring below 0.5, after blanking its docstring.
for p_name, score in scores.items():
    if score is None or not score < 0.5:
        continue
    selected_prog = prog_dict[p_name]
    selected_prog.update_docstring("")
    print(score, "\n", selected_prog)
    print("")

0.2276676681134889 
 def solve_b3cdade0(I):
    """
    
    """
    x1 = leastcolor(I)
    x2 = ofcolor(I, x1)
    x3 = shift(x2, NEG_UNITY)
    x4 = recolor(THREE, x3)
    x5 = shift(x2, UNITY)
    x6 = recolor(SEVEN, x5)
    x7 = shift(x2, DOWN_LEFT)
    x8 = recolor(EIGHT, x7)
    x9 = shift(x2, UP_RIGHT)
    x10 = recolor(SIX, x9)
    x11 = mostcolor(I)
    x12 = fill(I, x11, x2)
    x13 = combine(x4, x6)
    x14 = combine(x8, x10)
    x15 = combine(x13, x14)
    O = paint(x12, x15)
    return O

0.1451198226642063 
 def solve_0388b7f2(I):
    """
    
    """
    x1 = objects(I, T, F, T)
    x2 = colorfilter(x1, THREE)
    x3 = colorfilter(x1, TWO)
    x4 = lbind(recolor, TWO)
    x5 = rbind(shoot, DOWN)
    x6 = chain(x4, x5, lrcorner)
    x7 = lbind(recolor, ONE)
    x8 = rbind(shoot, UP_RIGHT)
    x9 = chain(x7, x8, urcorner)
    x10 = mapply(x6, x2)
    x11 = mapply(x9, x3)
    x12 = combine(x10, x11)
    O = underpaint(I, x12)
    return O

0.26147469968565545 
 def solve_86

In [76]:
# How many programs received a non-None score.
print(len(non_null_scores))

29


In [67]:
# Show the label of the task currently under inspection.
l

'05a7bcf2'