In [2]:
import numpy as np
import pandas as pd

import os
import sys
import json
from pathlib import Path
import gc
import inspect
import random
import time

import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np
from pathlib import Path

from copy import deepcopy
from copy import copy

from tqdm.notebook import tqdm

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import Dataset
import torch

In [4]:
sys.path.insert(0, '../DSL')
import solvers
from solver_class import Solver
import dsl
from dsl import *
from constants import *
from fitness_scoring import *

sys.path.insert(0, '..')
from visualization.visualization_utils import *
from misc_utils import *

In [5]:
data_path = Path('../data')
train_path = data_path / 'arc-agi_training_challenges.json'
train_sols_path = data_path / 'arc-agi_training_solutions.json'
eval_path = data_path / 'arc-agi_evaluation_challenges.json'
eval_sols_path = data_path / 'arc-agi_evaluation_solutions.json'

In [6]:
with open(train_path, 'r') as f:
    train_tasks = json.load(f)
with open(train_sols_path, 'r') as f:
    train_sols = json.load(f)
with open(eval_path, 'r') as f:
    eval_tasks = json.load(f)
with open(eval_sols_path, 'r') as f:
    eval_sols = json.load(f)

train_task_labels = sorted(train_tasks.keys())
eval_task_labels = sorted(eval_tasks.keys())

In [7]:
ast = lambda g: tuple(tuple(r) for r in g) # Converts grid to tuple format for DSL

# Convert all train and eval examples to tuples for DSL
for train_label in train_task_labels:
    num_train = len(train_tasks[train_label]['train']) 
    num_test = len(train_tasks[train_label]['test'])
    for i in range(num_train):
        train_tasks[train_label]['train'][i]['input'] = ast(train_tasks[train_label]['train'][i]['input'])
        train_tasks[train_label]['train'][i]['output'] = ast(train_tasks[train_label]['train'][i]['output'])
    for i in range(num_test):
        train_tasks[train_label]['test'][i]['input'] = ast(train_tasks[train_label]['test'][i]['input'])
        train_sols[train_label][i] = ast(train_sols[train_label][i])
for eval_label in eval_task_labels:
    num_train = len(eval_tasks[eval_label]['train']) 
    num_test = len(eval_tasks[eval_label]['test'])
    for i in range(num_train):
        eval_tasks[eval_label]['train'][i]['input'] = ast(eval_tasks[eval_label]['train'][i]['input'])
        eval_tasks[eval_label]['train'][i]['output'] = ast(eval_tasks[eval_label]['train'][i]['output'])
    for i in range(num_test):
        eval_tasks[eval_label]['test'][i]['input'] = ast(eval_tasks[eval_label]['test'][i]['input'])
        eval_sols[eval_label][i] = ast(eval_sols[eval_label][i])

In [8]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using {DEVICE}")

Using cuda


In [9]:
PRIMITIVES = ['add', 'adjacent', 'apply', 'argmax', 'argmin', 'asindices', 'asobject', 'astuple', 'backdrop', 'bordering', 'both', 'bottomhalf', 'box', 'branch', 'canvas', 'cellwise', 'center', 'centerofmass', 'chain', 'cmirror', 'color', 'colorcount', 'colorfilter', 'combine', 'compose', 'compress', 'connect', 'contained', 'corners', 'cover', 'crement', 'crop', 'decrement', 'dedupe', 'delta', 'difference', 'divide', 'dmirror', 'dneighbors', 'double', 'downscale', 'either', 'equality', 'even', 'extract', 'fgpartition', 'fill', 'first', 'flip', 'fork', 'frontiers', 'gravitate', 'greater', 'halve', 'hconcat', 'height', 'hfrontier', 'hline', 'hmatching', 'hmirror', 'hperiod', 'hsplit', 'hupscale', 'identity', 'inbox', 'increment', 'index', 'ineighbors', 'initset', 'insert', 'intersection', 'interval', 'invert', 'last', 'lbind', 'leastcolor', 'leastcommon', 'lefthalf', 'leftmost', 'llcorner', 'lowermost', 'lrcorner', 'manhattan', 'mapply', 'matcher', 'maximum', 'merge', 'mfilter', 'minimum', 'mostcolor', 'mostcommon', 'move', 'mpapply', 'multiply', 'neighbors', 'normalize', 'numcolors', 'objects', 'occurrences', 'ofcolor', 'order', 'other', 'outbox', 'paint', 'pair', 'palette', 'papply', 'partition', 'portrait', 'position', 'positive', 'power', 'prapply', 'product', 'rapply', 'rbind', 'recolor', 'remove', 'repeat', 'replace', 'righthalf', 'rightmost', 'rot180', 'rot270', 'rot90', 'sfilter', 'shape', 'shift', 'shoot', 'sign', 'size', 'sizefilter', 'square', 'subgrid', 'subtract', 'switch', 'toindices', 'toivec', 'tojvec', 'toobject', 'tophalf', 'totuple', 'trim', 'ulcorner', 'underfill', 'underpaint', 'uppermost', 'upscale', 'urcorner', 'valmax', 'valmin', 'vconcat', 'vfrontier', 'vline', 'vmatching', 'vmirror', 'vperiod', 'vsplit', 'vupscale', 'width']
COMMON_PRIMITIVES = ['compose', 'fork', 'lbind', 'rbind', 'fill', 'chain', 'objects', 'mapply', 'apply', 'ofcolor', 'astuple', 'paint', 'sfilter', 'matcher', 'branch', 'merge', 'combine', 'first', 'argmax', 'shift', 'canvas', 'mostcolor', 'palette', 'height', 'shape', 'width', 'remove', 'interval', 'difference', 'subgrid', 'colorfilter', 'size', 'replace', 'color', 'argmin', 'extract', 'leastcolor', 'decrement', 'increment', 'asobject', 'equality', 'asindices', 'add', 'hconcat', 'subtract', 'mfilter', 'initset', 'insert', 'normalize', 'vmirror']

# Load pretrained model

In [10]:
codegen_tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono",
                                                 padding_side='left', # For padding batches of input in decoder-only context
                                                 clean_up_tokenization_spaces = True,
                                                 ) 
codegen = AutoModelForCausalLM.from_pretrained("../CodeGen fine-tuning/outputs/v7/")
codegen = codegen.to(DEVICE)

In [11]:
EOS_TOKEN = codegen_tokenizer.eos_token
BOS_TOKEN = codegen_tokenizer.bos_token
PAD_TOKEN = '[PAD]'
codegen_tokenizer.add_special_tokens({'pad_token': PAD_TOKEN})

1

# Helper functions

In [12]:
def clean_candidates(candidates: list[str]) -> [int, list[str]]:
    failed = []
    cleaned = []
    for candidate in candidates:
        if "return O" not in candidate:
            failed.append(candidate)
        else:
            cleaned.append(candidate.split("return O")[0] + "return O")
    return failed, cleaned            

In [13]:
def compile_candidates(candidates: list[str]) -> [int, list[Solver]]:
    failed = []
    compiled = []
    for candidate in candidates:
        try:
            compiled.append(Solver(candidate))
        except:
            failed.append(candidate)
    return failed, compiled

In [14]:
def save_solutions(label, solvers) -> int:
    saved_count = 0
    solver_texts = set([solver.function_text.replace(solver.name, 'solve', 1) for solver in solvers])
    existing_solvers = ""
    try:
        with open(f"solvers generated/{label}.py", "r") as f:
            existing_solvers = f.read()
    except:
        pass # File didn't open, probably because it doesn't exist
    for solver_text in solver_texts:
        if solver_text not in existing_solvers:
            with open(f"solvers generated/{label}.py", "a") as f:
                f.write(solver_text + "\n\n\n")
                saved_count += 1
    if saved_count > 0:
        print("New solver found")
    return saved_count

# Generate solvers

In [15]:
def remove_docstring(solver_text):
    S = solver_text.split('    """\n', 2)
    if len(S) == 1:
        return S[0]
    else:
        return S[0] + S[-1]

In [16]:
%env TOKENIZERS_PARALLELISM true

env: TOKENIZERS_PARALLELISM=true


In [17]:
full_results = {}
NEW_SOLVERS_COUNT = 0
generate_args = {
    'max_new_tokens': 256,
    'num_beams': 64,
    'num_return_sequences': 64,
    #'do_sample': True,
    #'temperature': 2.0,
    #'low_memory': True # WAY TOO SLOW
}
beam_log = []

for i, label in enumerate(tqdm(eval_task_labels)):
    pairs = eval_tasks[label]['train']
    prompts = create_prompts_from_pairs(pairs, len(pairs))
    results = []

    tokens = [codegen_tokenizer(prompt, padding = True, return_tensors = 'pt').to(DEVICE) for prompt in prompts]

    candidates = []
    for t in tokens:
        outputs = []
        genargs = generate_args
        while len(outputs) == 0:
            try:
                outputs = codegen.generate(**t,
                         pad_token_id = codegen_tokenizer.pad_token_id,
                        **genargs,
                        )
            except torch.cuda.OutOfMemoryError:
                genargs['num_beams'] = genargs['num_beams']//2
                genargs['num_return_sequences'] = genargs['num_return_sequences']//2
        candidates.extend(codegen_tokenizer.batch_decode(outputs, skip_special_tokens = True))
        beam_log.append(genargs['num_beams'])

    failed, cleaned = clean_candidates(candidates)
    results.extend([(1.1, F) for F in failed])

    failed, compiled = compile_candidates(cleaned)
    results.extend([(1.1, F) for F in failed])

    scored = score_solvers_vs_tasks(compiled, pairs, scoring_functions, solver_timeout = 1.0)
    results.extend(scored)

    results = sorted(results, key = lambda x: x[0])
    
    full_results[label] = list(set([(score, remove_docstring(str(solver))) for score, solver in results]))

    # Save any solvers that fully succeeded
    idx = 0
    to_save = []
    while idx < len(results) and results[idx][0] == 0:
        to_save.append(results[idx][1])
        idx += 1
    if len(to_save) > 0:
        NEW_SOLVERS_COUNT += save_solutions(label, to_save)
    
    # Save every 10 tasks
    if (i+1)%10 == 0:
        with open('beam.json', 'w') as f:
            f.seek(0)
            json.dump(full_results, f, indent = 4)


with open('beam.json', 'w') as f:
    f.seek(0)
    json.dump(full_results, f, indent = 4)

  0%|          | 0/400 [00:00<?, ?it/s]

New solver found
New solver found
New solver found
New solver found


In [18]:
print(NEW_SOLVERS_COUNT)

8


In [19]:
pd.Series(beam_log).value_counts()

16    1348
64       9
32       6
Name: count, dtype: int64

# Scratch work

NOTES:
 * Using top_p = 0.9, temp = 1.5 instead of top_k narrows results down to things that actually compile and speeds up computation. Generally top_p seems better than top_k.

In [21]:
test_label = '48f8583b'
label = test_label
print(label)

48f8583b


In [22]:
pairs = eval_tasks[label]['train']

In [227]:
torch.cuda.empty_cache()

In [217]:
NEW_SOLVERS_COUNT = 0

In [218]:
prompts = create_prompts_from_pairs(pairs, len(pairs))

In [219]:
len(prompts)

6

In [220]:
tokens = [codegen_tokenizer(prompt, padding = True, return_tensors = 'pt').to(DEVICE) for prompt in prompts]

In [221]:
# 27 seconds with 32 beams and 768 new tokens, no OOM
# 15 seconds with 32 beams and 512 new tokens, no OOM
# 192 seconds with 32 beams and 512 new tokens and low_memory = True
# 
# Weirdly, temp 5.0 resulted in more compiled solvers and lower scores...

In [237]:
configs = [(32, 512), (64, 256), (16, 1024)]
full_results = {}

for NUM_BEAMS, MAX_NEW_TOKENS in tqdm(configs):
    generate_args = {
        'max_new_tokens': MAX_NEW_TOKENS,
        'num_beams': NUM_BEAMS,
        'num_return_sequences': NUM_BEAMS,
        'do_sample': True,
        'temperature': 2.0,
        #'low_memory': True # WAY TOO SLOW
    }
    results = []
    candidates = []
    for t in tokens:
        outputs = []
        try:
            outputs = codegen.generate(**t,
                     pad_token_id = codegen_tokenizer.pad_token_id,
                    **generate_args,
                    )
        except torch.cuda.OutOfMemoryError:
            print("OOM")
        candidates.extend(codegen_tokenizer.batch_decode(outputs, skip_special_tokens = True))

    failed, cleaned = clean_candidates(candidates)
    results.extend([(1.1, F) for F in failed])

    failed, compiled = compile_candidates(cleaned)
    results.extend([(1.1, F) for F in failed])

    scored = score_solvers_vs_tasks(compiled, pairs, scoring_functions, solver_timeout = 1.0)
    results.extend(scored)

    full_results[(NUM_BEAMS, MAX_NEW_TOKENS)] = sorted(results, key = lambda x: x[0])    


  0%|          | 0/3 [00:00<?, ?it/s]

In [238]:
for key, results in full_results.items():
    print(key)
    print(results[0][0], results[0][1])
    print("")

(32, 512)
0.276010101010101 def solve(I):
    """
    INPUT:
    717
    177
    717
    OUTPUT:
    000717000
    000177000
    000717000
    717000000
    177000000
    717000000
    000717000
    000177000
    000717000
    """
    x1 = ofcolor(I, ONE)
    x2 = shape(I)
    x3 = multiply(x2, x2)
    x4 = canvas(ZERO, x3)
    x5 = rbind(multiply, x2)
    x6 = apply(x5, x1)
    x7 = asobject(I)
    x8 = lbind(shift, x7)
    x9 = mapply(x8, x6)
    O = paint(x4, x9)
    return O

(64, 256)
0.0 def solve(I):
    """
    INPUT:
    717
    177
    717
    OUTPUT:
    000717000
    000177000
    000717000
    717000000
    177000000
    717000000
    000717000
    000177000
    000717000
    """
    x1 = leastcolor(I)
    x2 = ofcolor(I, x1)
    x3 = shape(I)
    x4 = multiply(x3, x3)
    x5 = canvas(ZERO, x4)
    x6 = rbind(multiply, x3)
    x7 = apply(x6, x2)
    x8 = asobject(I)
    x9 = lbind(shift, x8)
    x10 = mapply(x9, x7)
    O = paint(x5, x10)
    return O

(16, 1024)
0.3610256

<code>generate_args = {
        'max_new_tokens': MAX_NEW_TOKENS,
        'num_beams': NUM_BEAMS,
        'num_return_sequences': NUM_BEAMS,
        #'do_sample': True,
        #'temperature': 3.0,
        #'low_memory': True # WAY TOO SLOW
    }

In [236]:
for key, results in full_results.items():
    print(key)
    print(results[0][0], results[0][1])
    print("")

(32, 512)
0.276010101010101 def solve(I):
    """
    INPUT:
    717
    177
    717
    OUTPUT:
    000717000
    000177000
    000717000
    717000000
    177000000
    717000000
    000717000
    000177000
    000717000
    """
    x1 = ofcolor(I, ONE)
    x2 = shape(I)
    x3 = multiply(x2, x2)
    x4 = canvas(ZERO, x3)
    x5 = rbind(multiply, x2)
    x6 = apply(x5, x1)
    x7 = asobject(I)
    x8 = lbind(shift, x7)
    x9 = mapply(x8, x6)
    O = paint(x4, x9)
    return O

(64, 256)
0.0 def solve(I):
    """
    INPUT:
    717
    177
    717
    OUTPUT:
    000717000
    000177000
    000717000
    717000000
    177000000
    717000000
    000717000
    000177000
    000717000
    """
    x1 = leastcolor(I)
    x2 = ofcolor(I, x1)
    x3 = shape(I)
    x4 = multiply(x3, x3)
    x5 = canvas(ZERO, x4)
    x6 = rbind(multiply, x3)
    x7 = apply(x6, x2)
    x8 = asobject(I)
    x9 = lbind(shift, x8)
    x10 = mapply(x9, x7)
    O = paint(x5, x10)
    return O

(16, 1024)
0.3610256

In [222]:
start = time.time()
try:
    outputs = codegen.generate(**tokens[0],
                     pad_token_id = codegen_tokenizer.pad_token_id,
                    **generate_args,
                    )
except torch.cuda.OutOfMemoryError:
    print("OOM")
print(f"{time.time()-start:.2f} seconds")

10.32 seconds


In [223]:
candidates = codegen_tokenizer.batch_decode(outputs, skip_special_tokens = True)
print(len(candidates))

64


In [224]:
failed1, cleaned = clean_candidates(candidates)
print(len(cleaned))

35


In [225]:
failed2, compiled = compile_candidates(cleaned)
print(len(compiled))

35


In [226]:
scored = score_solvers_vs_tasks(compiled, pairs, scoring_functions, solver_timeout = 1.0)
scored = sorted(scored, key = lambda x: x[0])
print(sum([score < 1.0 for score, _ in scored]))
print([score for score, _ in scored])

16
[0.3239538982502369, 0.3391996891996892, 0.361025641025641, 0.361025641025641, 0.4003108003108003, 0.4186868686868687, 0.4585714285714286, 0.4593245640097597, 0.46058648192693497, 0.4626586983363246, 0.480519978106185, 0.480519978106185, 0.4814417895672325, 0.5378794364187622, 0.6142857142857142, 0.8571428571428571, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


In [147]:
print(scored[0][0], scored[0][1])

0.361025641025641 def solve(I):
    """
    INPUT:
    996
    388
    833
    OUTPUT:
    000000996
    000000388
    000000833
    000000000
    000000000
    000000000
    000000000
    000000000
    000000000
    """
    x1 = shape(I)
    x2 = multiply(x1, x1)
    x3 = canvas(ZERO, x2)
    x4 = mostcolor(I)
    x5 = ofcolor(I, x4)
    x6 = lbind(multiply, x1)
    x7 = apply(x6, x5)
    x8 = asobject(I)
    x9 = lbind(shift, x8)
    x10 = mapply(x9, x7)
    O = paint(x3, x10)
    return O
