In [1]:
import numpy as np
import pandas as pd

import os
import sys
import json
from pathlib import Path
import gc
import inspect
import random
import time

import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np
from pathlib import Path

from copy import deepcopy
from copy import copy

from tqdm.notebook import tqdm

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from trl import SFTTrainer, SFTConfig
from datasets import Dataset
import torch

In [3]:
# Locations of the ARC-AGI challenge and solution JSON files.
# The paths are relative, so they resolve against the current working directory.
data_path = Path('../data')
train_path = data_path / 'arc-agi_training_challenges.json'
train_sols_path = data_path / 'arc-agi_training_solutions.json'
eval_path = data_path / 'arc-agi_evaluation_challenges.json'
eval_sols_path = data_path / 'arc-agi_evaluation_solutions.json'

In [4]:
sys.path.insert(0, '..')
sys.path.insert(0, '../DSL')

from visualization.visualization_utils import *
import solvers
from solver_class import Solver
from dsl import *
from constants import *
from fitness_scoring import *

In [5]:
def _read_json(path):
    """Return the parsed contents of the JSON file at `path`."""
    with open(path, 'r') as handle:
        return json.load(handle)


# Challenges and their solutions for the training and evaluation splits.
train_tasks = _read_json(train_path)
train_sols = _read_json(train_sols_path)
eval_tasks = _read_json(eval_path)
eval_sols = _read_json(eval_sols_path)

# Task labels in sorted order, so every loop over tasks visits them in the same order.
train_task_labels = sorted(train_tasks)
eval_task_labels = sorted(eval_tasks)

In [6]:
def ast(g):
    """Convert a grid (a sequence of rows) to the tuple-of-tuples format used by the DSL."""
    return tuple(tuple(r) for r in g)


def _grids_to_tuples(tasks, sols, labels):
    """Replace, in place, every grid of the given tasks and their solutions by its tuple form."""
    for label in labels:
        task = tasks[label]
        for example in task['train']:
            example['input'] = ast(example['input'])
            example['output'] = ast(example['output'])
        # Test examples carry only an input; the matching output is stored in `sols`
        # under the same label and position.
        for position, example in enumerate(task['test']):
            example['input'] = ast(example['input'])
            sols[label][position] = ast(sols[label][position])


# Convert all train and eval examples to tuples for DSL
_grids_to_tuples(train_tasks, train_sols, train_task_labels)
_grids_to_tuples(eval_tasks, eval_sols, eval_task_labels)

In [7]:
# Device name used for the model and for tokenized inputs: the GPU when torch reports one, else the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using {DEVICE}")

Using cuda


# Load pretrained model

In [56]:
# The tokenizer is loaded under the base model name "Salesforce/codegen-350M-mono",
# while the model weights are read from a local fine-tuning checkpoint directory.
codegen_tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-mono",
                                                 padding_side='left', # For padding batches of input in decoder-only context
                                                 clean_up_tokenization_spaces = True,
                                                 ) 
codegen = AutoModelForCausalLM.from_pretrained("../CodeGen fine-tuning/outputs/checkpoint-2000/")
# Move the model to the device selected in DEVICE.
codegen = codegen.to(DEVICE)

In [57]:
# Special-token strings; clean_candidate removes them from decoded generations.
EOS_TOKEN = codegen_tokenizer.eos_token
BOS_TOKEN = codegen_tokenizer.bos_token
# Register '[PAD]' as the tokenizer's padding token (batches are tokenized with padding = True).
# NOTE(review): the model's embeddings are not resized in this notebook after adding the
# token; confirm that the id assigned to '[PAD]' is within the model's embedding table.
codegen_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
PAD_TOKEN = '[PAD]'
#codegen_tokenizer.pad_token = codegen_tokenizer.eos_token

# Helper functions

In [10]:
def grid_to_string(grid):
    """
    Render the grid with str() and drop every space character from the result.
    A 30-by-30 grid will use ~1800 tokens.
    """
    rendered = str(grid)
    return "".join(rendered.split(" "))

def grid_to_string_compact(grid):
    """
    One line of text per row; a row is its entries written back to back.
    Uses approximately 4-times fewer tokens than grid_to_string.
    """
    row_texts = ["".join(map(str, row)) for row in grid]
    return "\n".join(row_texts)

In [11]:
def create_generation_prompt(in_grid, out_grid, label = None):
    """
    Build the text prompt that the model completes into a solver function.

    The prompt consists of the header ``def solve_<label>(I):``, a docstring showing
    one example input grid and one example output grid (rendered by
    grid_to_string_compact, one grid row per line, indented by four spaces), and
    finally the four-space indentation at which the function body starts.

    Parameters
    ----------
    in_grid, out_grid
        Grids accepted by grid_to_string_compact.
    label : str, optional
        Suffix of the solver name. When None, random_label() supplies one.

    Returns
    -------
    str
        The prompt text.
    """
    if label is None:
        label = random_label()

    indent = "    "

    def indented_rows(grid):
        # The joins happen outside of any f-string: a backslash inside an f-string
        # replacement field is a SyntaxError on Python versions before 3.12.
        return ("\n" + indent).join(grid_to_string_compact(grid).split("\n"))

    prompt_lines = [
        f"def solve_{label}(I):",
        f'{indent}"""',
        f"{indent}Example input:",
        f"{indent}{indented_rows(in_grid)}",
        f"{indent}Example output:",
        f"{indent}{indented_rows(out_grid)}",
        f'{indent}"""',
        indent,  # the prompt ends with the indentation of the first body line
    ]
    return "\n".join(prompt_lines)

In [12]:
def random_label() -> str:
    """
    Build a random task label made of 8 lowercase hexadecimal digits.
    """
    alphabet = list("0123456789abcdef")
    label = ""
    for _ in range(8):
        label += random.choice(alphabet)
    return label

In [13]:
def random_grid(rows = None, cols = None, palette = None):
    """
    Create a random grid as a tuple of row tuples.

    Each falsy argument is replaced by a default: `rows` and `cols` by a random
    integer from 1 to 30 (rows is drawn first), `palette` by the values 0 to 9.
    Every cell is then drawn from `palette` with random.choice, row by row.
    """
    rows = rows or random.randint(1,30)
    cols = cols or random.randint(1,30)
    palette = palette or list(range(10))

    grid_rows = []
    for _ in range(rows):
        grid_rows.append(tuple(random.choice(palette) for _ in range(cols)))
    return tuple(grid_rows)

# Generate solvers

In [14]:
# Largest number of prompts that generate_candidate_solvers tokenizes and passes to model.generate in one call.
MAX_BATCH_SIZE = 25

In [15]:
def generate_candidate_solvers(model, tokenizer, in_out_pairs, generate_args = None, min_num_solvers = 100):
    """
    Sample candidate solver texts from the model for a set of example pairs.

    One prompt is built per pair with create_generation_prompt, and the whole set
    of pairs is repeated (building new prompts each time) until at least
    `min_num_solvers` prompts exist. The prompts are then processed in batches of
    at most MAX_BATCH_SIZE.

    Parameters
    ----------
    model
        Object whose ``generate(**inputs, ...)`` method produces output sequences.
    tokenizer
        Callable tokenizer that also provides ``eos_token_id`` and ``batch_decode``.
    in_out_pairs
        Sequence of dicts with an 'input' and an 'output' grid.
    generate_args : dict, optional
        Keyword arguments forwarded to ``model.generate``. Defaults to
        ``{'max_new_tokens': 512}``. A 'pad_token_id' entry, if present, overrides
        the default of ``tokenizer.eos_token_id``.
    min_num_solvers : int
        Minimum number of prompts (and therefore of returned candidates).

    Returns
    -------
    list of str
        The result of ``tokenizer.batch_decode`` for every generated sequence.

    Raises
    ------
    ValueError
        If `in_out_pairs` is empty although `min_num_solvers` is positive.
    """
    if generate_args is None:
        # Built per call so that no dict is shared between calls.
        generate_args = {'max_new_tokens': 512}

    if min_num_solvers > 0 and not in_out_pairs:
        # Without this check the loop below would never terminate.
        raise ValueError("in_out_pairs must contain at least one example pair")

    prompts = []
    while len(prompts) < min_num_solvers:
        prompts.extend(create_generation_prompt(pair['input'], pair['output']) for pair in in_out_pairs)

    candidates = []
    for start in range(0, len(prompts), MAX_BATCH_SIZE):
        batch = prompts[start : start + MAX_BATCH_SIZE]
        inputs = tokenizer(batch,
                           padding = True,
                           return_tensors = 'pt',
                          ).to(DEVICE)
        # Merged into one dict so a caller-supplied pad_token_id does not clash
        # with the default as a duplicate keyword argument.
        generation_kwargs = {'pad_token_id': tokenizer.eos_token_id, **generate_args}
        outputs = model.generate(**inputs, **generation_kwargs)

        candidates.extend(tokenizer.batch_decode(outputs))

    return candidates
        

In [16]:
def clean_candidate(candidate: str) -> str:
    """
    Strip special tokens from a decoded generation and cut it at its first "return ".

    Every occurrence of EOS_TOKEN, BOS_TOKEN and PAD_TOKEN is removed. When the
    remaining text contains no "return ", the result is None. Otherwise the text
    before the first "return " is kept and "return O" is appended to it.
    """
    for special_token in (EOS_TOKEN, BOS_TOKEN, PAD_TOKEN):
        candidate = candidate.replace(special_token, "")

    body, keyword, _ = candidate.partition("return ")
    if not keyword:
        # No return statement was generated.
        return None
    return body + "return O"

In [None]:
# Generate candidate solvers for every evaluation task and score them on the
# task's training pairs. For each task label, full_scoring_results holds the
# (solver name, score) pairs in ascending order of score.
full_scoring_results = {}

# Sampling settings, identical for every task.
generate_args = {
    'max_new_tokens': 512,
    'do_sample': True,
    'temperature': 5.0,
    'top_k': 2,
    #'num_beams': 2, # Using multiple beams creates too much memory pressure
}

starttime = time.time()

for i, label in enumerate(eval_task_labels):
    pairs = eval_tasks[label]['train']

    candidates = generate_candidate_solvers(codegen, codegen_tokenizer, pairs, generate_args = generate_args, min_num_solvers = 50)
    candidates = [clean_candidate(c) for c in candidates]
    programs = []
    for c in candidates:
        try:
            programs.append(Solver(c))
        except Exception:
            # Candidates that Solver rejects are dropped. Catching Exception rather
            # than everything lets KeyboardInterrupt stop this long-running loop.
            pass

    # TODO: Right now candidate solvers generate random names. Fix this to avoid potential collisions in dictionary
    scoring_results = score_solvers_vs_tasks(programs, pairs, scoring_functions, solver_timeout = 1.0)
    # A missing score is replaced by 1.0, which the "Valid scores" count below treats as invalid.
    scoring_results = {solver_name: score if score is not None else 1.0 for solver_name, score in scoring_results.items()}

    print(f"Completed task {i}, {label}.")
    print(f"Valid programs: {len(programs)}")
    print(f"Valid scores: {sum([(score < 1.0) for score in scoring_results.values()])}")
    # default = None keeps the run alive when a task yields no scored solver at all.
    print(f"Best score: {min(scoring_results.values(), default = None)}")
    print(f"Time elapsed: {(time.time()-starttime):.2f} seconds")
    print("")

    full_scoring_results[label] = sorted(scoring_results.items(), key = lambda x: x[1])

In [28]:
# Order the tasks by their lists of scores (each already ascending), compared element by element.
sorted_scoring_results = sorted(
    full_scoring_results.items(),
    key = lambda task_entry: [score for _solver_name, score in task_entry[1]],
)

In [31]:
# Persist the sorted per-task scores as indented JSON.
results_filename = 'first_pass_solvers_vs_eval.json'
with open(results_filename, 'w') as results_file:
    json.dump(sorted_scoring_results, results_file, indent = 4)

# Scratch work

In [68]:
# Scratch experiment on a single evaluation task.
# The first assignment to `l` has no effect: the next line overwrites it.
l = '009d5c81'
l = eval_task_labels[4]

# Training pairs of the selected task; they are used both for the prompts and for scoring.
pairs = eval_tasks[l]['train']
#pairs = [{'input': random_grid(20, 20), 'output': random_grid(20, 20)} for i in range(3)]

# Sampling settings for this experiment (this rebinds the global generate_args).
generate_args = {
    'max_new_tokens': 512,
    'do_sample': True,
    'temperature': 0.5,
    'top_k': 20,
    #'num_beams': 2, # Using multiple beams creates too much memory pressure
}

# Time the generation of at least 50 candidates.
starttime = time.time()
cands = generate_candidate_solvers(codegen, codegen_tokenizer, pairs, generate_args = generate_args, min_num_solvers = 50)
print(f"Computed candidates in {(time.time()-starttime):.2f} seconds")

Computed candidates in 32.84 seconds


In [69]:
# Entries are None for generations in which clean_candidate found no "return ".
cleaned_cands = [clean_candidate(cand) for cand in cands]

In [70]:
# Keep only the candidates from which a Solver can be constructed.
progs = []
for cand in cleaned_cands:
    try:
        progs.append(Solver(cand))
    except Exception:
        # Rejected candidates are dropped; KeyboardInterrupt is no longer swallowed.
        pass

In [71]:
# Number of candidates for which Solver construction succeeded.
len(progs)

44

In [72]:
from fitness_scoring import *
       
# Score every solver on the task's training pairs. The result is used below as a
# mapping from solver name to score, where a score may be None.
scores = score_solvers_vs_tasks(progs, pairs, scoring_functions, solver_timeout = 1)

In [73]:
# Drop the solvers without a score, then display the remaining scores in ascending order.
non_null_scores = [
    score
    for score in scores.values()
    if score is not None
]
sorted(non_null_scores)

[0.09617932940310235,
 0.10830729281693237,
 0.12581017833567684,
 0.12581017833567684,
 0.12581017833567684,
 0.1451198226642063,
 0.2276676681134889,
 0.24111676145747532,
 0.25464015450133604,
 0.26147469968565545,
 0.26147469968565545,
 0.26147469968565545,
 0.26147469968565545,
 0.26147469968565545,
 0.2667732786862824,
 0.29310876589370266,
 0.3650276645492598,
 0.36861214897799227,
 0.4521855666908194,
 0.5704644893874207,
 0.5704644893874207,
 0.5777334591509189,
 0.6694927403348179,
 0.6694927403348179,
 0.7600000000000001,
 0.7600000000000001,
 0.7942041298228831,
 0.9462068965517242,
 0.9597777777777777]

In [74]:
def _solver_name(prog):
    """Name found in str(prog): the text between its first space and the following "("."""
    after_first_space = str(prog).split(" ", 1)[1]
    return after_first_space.split("(")[0]


# Look-up table from solver name to solver object; falsy entries are skipped.
prog_dict = {_solver_name(prog): prog for prog in progs if prog}

In [75]:
# Print every solver that scored below 0.5.
for p_name, score in scores.items():
    if score is not None and score < 0.5:
        # Empty the docstring first so the printout is not filled with the example grids
        # from the prompt. This modifies the solver object stored in prog_dict.
        prog_dict[p_name].update_docstring("")
        print(score, "\n", prog_dict[p_name])
        print("")

0.2276676681134889 
 def solve_b3cdade0(I):
    """
    
    """
    x1 = leastcolor(I)
    x2 = ofcolor(I, x1)
    x3 = shift(x2, NEG_UNITY)
    x4 = recolor(THREE, x3)
    x5 = shift(x2, UNITY)
    x6 = recolor(SEVEN, x5)
    x7 = shift(x2, DOWN_LEFT)
    x8 = recolor(EIGHT, x7)
    x9 = shift(x2, UP_RIGHT)
    x10 = recolor(SIX, x9)
    x11 = mostcolor(I)
    x12 = fill(I, x11, x2)
    x13 = combine(x4, x6)
    x14 = combine(x8, x10)
    x15 = combine(x13, x14)
    O = paint(x12, x15)
    return O

0.1451198226642063 
 def solve_0388b7f2(I):
    """
    
    """
    x1 = objects(I, T, F, T)
    x2 = colorfilter(x1, THREE)
    x3 = colorfilter(x1, TWO)
    x4 = lbind(recolor, TWO)
    x5 = rbind(shoot, DOWN)
    x6 = chain(x4, x5, lrcorner)
    x7 = lbind(recolor, ONE)
    x8 = rbind(shoot, UP_RIGHT)
    x9 = chain(x7, x8, urcorner)
    x10 = mapply(x6, x2)
    x11 = mapply(x9, x3)
    x12 = combine(x10, x11)
    O = underpaint(I, x12)
    return O

0.26147469968565545 
 def solve_86

In [76]:
# Number of solvers that received a score.
print(len(non_null_scores))

29


In [67]:
# Show the task label selected for the scratch cells.
# NOTE(review): this cell's execution count (67) is lower than those of the cells above it,
# so it was run out of order; re-run the notebook top to bottom to confirm the value.
l

'05a7bcf2'