In [1]:
import os
import json
import torch
import pandas as pd
from PIL import Image 
from datasets import Dataset
from transformers import AutoProcessor 
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

In [2]:
# Collect the bare file names of every *.json task found while walking the
# ARC training directory. os.walk also visits sub-directories; only the file
# name is recorded, not the directory it was found in.
train_file_names = []

for _dirpath, _subdirs, names_here in os.walk("./ARC-AGI-master/data/training"):
    train_file_names.extend(
        entry for entry in names_here if entry.endswith('.json')
    )
            

In [10]:
def grid2str(grid):
    """Render a grid (iterable of rows) as text.

    Cells within a row are converted with ``str`` and separated by a single
    space; rows are separated by a newline. An empty grid gives ``""``.
    """
    rows_as_text = []
    for row in grid:
        rows_as_text.append(" ".join(map(str, row)))
    return "\n".join(rows_as_text)

In [11]:
ARC_TRAIN_DIR = "./ARC-AGI-master/data/training"


def _pair_as_text(pairs, index):
    """Return ``(input, output)`` of ``pairs[index]`` rendered with grid2str.

    Tasks do not all have the same number of examples, so an index past the
    end of ``pairs`` yields ``("", "")`` instead of raising.
    """
    if index >= len(pairs):
        return "", ""
    pair = pairs[index]
    return grid2str(pair["input"]), grid2str(pair["output"])


def _task_to_record(name, task):
    """Flatten one loaded task into a dict with one entry per table column.

    Parameters
    ----------
    name : str
        File name of the task, stored under ``problem_id``.
    task : dict
        Parsed task JSON with ``"train"`` and ``"test"`` lists of
        ``{"input": grid, "output": grid}`` pairs.

    Raises
    ------
    IndexError
        If the task has fewer than two training pairs or no test pair; those
        slots are mandatory and are indexed directly.
    """
    train, test = task["train"], task["test"]
    record = {"problem_id": name}

    # Examples 1-2 are mandatory (direct indexing).
    for slot in (1, 2):
        pair = train[slot - 1]
        record[f"problem_example_{slot}_input"] = grid2str(pair["input"])
        record[f"problem_example_{slot}_output"] = grid2str(pair["output"])

    # Examples 3-4 are optional and fall back to empty strings.
    for slot in (3, 4):
        given, expected = _pair_as_text(train, slot - 1)
        record[f"problem_example_{slot}_input"] = given
        record[f"problem_example_{slot}_output"] = expected

    # First test pair is mandatory, the second is optional.
    record["problem_test_1_input"] = grid2str(test[0]["input"])
    record["problem_test_1_output"] = grid2str(test[0]["output"])
    given, expected = _pair_as_text(test, 1)
    record["problem_test_2_input"] = given
    record["problem_test_2_output"] = expected
    return record


# One flat record per task file, in the order of train_file_names.
task_records = []
for name in train_file_names:
    with open(f"{ARC_TRAIN_DIR}/{name}") as task_file:
        task = json.load(task_file)
    task_records.append(_task_to_record(name, task))

# Build the table once from the records rather than concatenating one Series
# per task and transposing the result.
arc_train = pd.DataFrame(task_records)
arc_no_id = arc_train.loc[:, arc_train.columns != "problem_id"]
arc_no_id_hf = Dataset.from_pandas(arc_no_id)

In [12]:
# Load the first collected task and print its raw JSON structure.
first_task_path = f"./ARC-AGI-master/data/training/{train_file_names[0]}"
with open(first_task_path) as f:
    some_file = json.load(f)
print(some_file)
    #print(grid2str(some_file["test"][0]["input"]))

{'train': [{'input': [[0, 0, 5], [0, 5, 0], [5, 0, 0]], 'output': [[3, 3, 3], [4, 4, 4], [2, 2, 2]]}, {'input': [[0, 0, 5], [0, 0, 5], [0, 0, 5]], 'output': [[3, 3, 3], [3, 3, 3], [3, 3, 3]]}, {'input': [[5, 0, 0], [0, 5, 0], [5, 0, 0]], 'output': [[2, 2, 2], [4, 4, 4], [2, 2, 2]]}, {'input': [[0, 5, 0], [0, 0, 5], [0, 5, 0]], 'output': [[4, 4, 4], [3, 3, 3], [4, 4, 4]]}], 'test': [{'input': [[0, 0, 5], [5, 0, 0], [0, 5, 0]], 'output': [[3, 3, 3], [2, 2, 2], [4, 4, 4]]}]}


In [13]:
# Show the file name stored at index 0 of the collected task list.
train_file_names[0]

'a85d4709.json'

In [14]:
#tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct", 
# trust_remote_code=True)
#model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3.5-mini-instruct", 
# trust_remote_code=True)

# model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3.5-vision-instruct", trust_remote_code=True)
# tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-MoE-instruct",                                         trust_remote_code=True)

In [15]:
# Row of arc_no_id_hf (i.e. which task) the prompt is built from.
num = 0

# One "Puzzle k input / output" section per worked example; the prompt always
# shows examples 1-3 of the selected row.
example_sections = "".join(
    "Puzzle {k} input: \n\n{given} \n\n"
    "Puzzle {k} output: \n\n{expected} "
    "\n\n\n".format(
        k=k,
        given=arc_no_id_hf[f"problem_example_{k}_input"][num],
        expected=arc_no_id_hf[f"problem_example_{k}_output"][num],
    )
    for k in (1, 2, 3)
)

# Fixed hint: colour legend and the kind of transformations to expect.
hint_text = (
    "Hint: These puzzles can be imagined as grids and the digits represent different "
    "colors. 0: black, 1: blue, 2: red, 3: green, 4: yellow, 5: silver, 6: pink, 7: "
    "orange, 8: cyan, 9: brown. Each puzzle-solution pair has some transformation, for "
    "instance, adding  a particular digit/color-block in-between two other "
    "digits/color-blocks or splitting/combining the size of the overall grid. The "
    "solution may not have anything to with the digits themselves or some numerical "
    "computations. \n\n"
)

# The test grid to solve, followed by the answering instructions.
test_grid = arc_no_id_hf["problem_test_1_input"][num]
question_text = (
    "Based on the examples, could you solve the following test puzzle? \n\n"
    f"{test_grid} \n\n"
    "Use chain of thought reasoning, try to identify the pattern, and provide only the "
    "final answer to the test puzzle."
)

prompt = (
    "Here are three examples of a puzzle — \n"
    + example_sections
    + hint_text
    + question_text
)

print(prompt)
# prompt = f"Here's three examples of a puzzle \n"

# for i in range(4):
#    prompt += f'Puzzle {i + 1} input: \n\n {grid2str
#    (arc_no_id_hf["problem_example_{i}_input"][num])}'

Here are three examples of a puzzle — 
Puzzle 1 input: 

0 0 5
0 5 0
5 0 0 

Puzzle 1 output: 

3 3 3
4 4 4
2 2 2 


Puzzle 2 input: 

0 0 5
0 0 5
0 0 5 

Puzzle 2 output: 

3 3 3
3 3 3
3 3 3 


Puzzle 3 input: 

5 0 0
0 5 0
5 0 0 

Puzzle 3 output: 

2 2 2
4 4 4
2 2 2 


Hint: These puzzles can be imagined as grids and the digits represent different colors. 0: black, 1: blue, 2: red, 3: green, 4: yellow, 5: silver, 6: pink, 7: orange, 8: cyan, 9: brown. Each puzzle-solution pair has some transformation, for instance, adding  a particular digit/color-block in-between two other digits/color-blocks or splitting/combining the size of the overall grid. The solution may not have anything to with the digits themselves or some numerical computations. 

Based on the examples, could you solve the following test puzzle? 

0 0 5
5 0 0
0 5 0 

Use chain of thought reasoning, try to identify the pattern, and provide only the final answer to the test puzzle.


In [12]:
#if tokenizer.pad_token is None:
#    tokenizer.pad_token = tokenizer.eos_token
#
#def chat_with_model(prompt):
#    inputs = tokenizer.encode(prompt, return_tensors='pt').to("mps")
#    model.to("mps")
#    
#    output = model.generate(
#        inputs, 
#        max_new_tokens=300,
#        pad_token_id=tokenizer.pad_token_id
#    )
#    
#    response = tokenizer.decode(output[0], skip_special_tokens=True)
#    
#    return response #[len(prompt):].strip()

In [13]:
#response = chat_with_model(prompt)
#print(response)

In [16]:
# Print the stored output grid of the first test pair for row `num`, i.e. the
# reference answer to the test puzzle whose input is shown in the text prompt.
print(arc_no_id_hf["problem_test_1_output"][num])

3 3 3
2 2 2
4 4 4


In [3]:
# from unittest.mock import patch
# from transformers.dynamic_module_utils import get_imports
# 
# device = "cpu"
# 
# def fixed_get_imports(filename: str | os.PathLike) -> list[str]:
#     """Work around for https://huggingface.co/microsoft/phi-1_5/discussions/72."""
#     if not str(filename).endswith("/modeling_phimoe.py"):
#         return get_imports(filename)
#     imports = get_imports(filename)
#     imports.remove("flash_attn")
#     return imports

# Checkpoint shared by the model, the processor and the tokenizer.
model_name = "microsoft/Phi-3.5-vision-instruct"

# Keyword arguments common to all three from_pretrained calls.
hub_options = {"trust_remote_code": True}

# Load the model with eager attention requested explicitly. The get_imports
# patch sketched in the comments above is not applied, and the dtype / device
# placement options stay disabled:
#     torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
#     device_map='auto' if torch.cuda.is_available() else "mps",
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    _attn_implementation="eager",
    **hub_options,
)

# for best performance, use num_crops=4 for multi-frame, num_crops=16 for single-frame.
processor = AutoProcessor.from_pretrained(model_name, num_crops=4, **hub_options)

tokenizer = AutoTokenizer.from_pretrained(model_name, **hub_options)

# Seed torch's default generator; as the last expression of the cell, the
# returned Generator object is what the cell displays.
torch.random.manual_seed(0)

# model = AutoModelForCausalLM.from_pretrained(
#     model_name, 
#     device_map="mps", 
#     torch_dtype="auto", 
#     trust_remote_code=True,
#     attn_implementation="eager"
# )
# tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



<torch._C.Generator at 0x121aefdd0>

In [11]:
# Chat-style request: a generic system turn followed by the puzzle prompt.
system_turn = {"role": "system", "content": "You are a helpful AI assistant."}
user_turn = {"role": "user", "content": f"{prompt}"}
messages = [system_turn, user_turn]

# No `device` is passed to the pipeline. Author's note: enabling device="mps"
# drastically worsens model performance.
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Sampling settings; return_full_text=False asks for the completion only.
generation_args = dict(
    max_new_tokens=500,
    return_full_text=False,
    temperature=0.9,
    do_sample=True,
)

output = pipe(messages, **generation_args)
first_completion = output[0]
print(first_completion['generated_text'])

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


 By observing the given puzzle and the examples, it appears that the transformation might involve identifying a pattern or color block and then either splitting or combining the overall grid based on that pattern.

In the given puzzle, we can see a repeating pattern of "8 1 8" and "8 8 1." We can divide the puzzle into two parts:

Top half:
8 1 8
8 8 1

Bottom half:
8 1 8
8 8 1

We can see that these patterns are mirror images of each other. Thus, we can create a new grid based on these patterns:

8 1
8 8

So, the final answer for the test puzzle is:

8 1
8 8


In [17]:
def create_vllm_prompt(image_paths, grid_shape=(3, 3)):
    """Build the text prompt that accompanies a list of puzzle images.

    The prompt starts with one ``<|image_N|>`` placeholder per entry of
    ``image_paths`` (N is the 1-based position in that list), then lists the
    training pairs and names the test image by those same numbers.

    Parameters
    ----------
    image_paths : sequence of str
        Paths whose names contain ``'train_input'``, ``'train_output'`` or
        ``'test_input'``. Training inputs and outputs are paired in sorted
        name order; if their counts differ, the unmatched ones are not listed.
    grid_shape : tuple of (int, int), optional
        Rows and columns of the expected answer grid quoted in the question.
        Defaults to ``(3, 3)``, the size previously hard-coded.

    Returns
    -------
    str
        The prompt text, beginning and ending with a newline.

    Raises
    ------
    IndexError
        If no path contains ``'test_input'``.
    """
    # Image number of a path = its first 1-based position in image_paths,
    # matching the numbering of the placeholders below.
    image_number = {}
    for position, path in enumerate(image_paths, start=1):
        image_number.setdefault(path, position)

    train_inputs = sorted(p for p in image_paths if 'train_input' in p)
    train_outputs = sorted(p for p in image_paths if 'train_output' in p)
    test_inputs = [p for p in image_paths if 'test_input' in p]
    if not test_inputs:
        raise IndexError("image_paths must contain a 'test_input' image")
    test_number = image_number[test_inputs[0]]

    # image placeholders
    placeholders = '\n'.join(f'<|image_{i + 1}|>' for i in range(len(image_paths)))

    # One numbered line per training pair, referring to the real image numbers.
    pair_lines = [
        f"{k}. Input: Image {image_number[src]}, Output: Image {image_number[dst]}"
        for k, (src, dst) in enumerate(zip(train_inputs, train_outputs), start=1)
    ]

    rows, cols = grid_shape

    # Trailing spaces inside the literals are kept on purpose so the text for
    # the 4-pair / 3x3 case is identical to the previous hard-coded prompt.
    lines = [
        "",
        placeholders,
        "I will show you several pairs of input and output images representing a puzzle. ",
        "Each pair demonstrates a pattern or rule. Your task is to analyze these pairs, ",
        "understand the underlying pattern, and then apply it to a new input to predict its output.",
        "",
        "Here are the training pairs:",
        *pair_lines,
        "",
        f"Now, based on these examples, analyze the test input (Image {test_number}) and describe what ",
        "the output should look like. Explain your reasoning step by step, and answer the ",
        "following question: what should the colors be for each of the cells in the test output ",
        f"with {rows} x {cols} grids?",
        "",
        "Provide the answer in the form of a grid with digits in place of colors. For reference, ",
        "0: black, 1: blue, 2: red, 3: green, 4: yellow, 5: silver, 6: pink, 7: orange, 8: cyan,",
        " 9: brown.",
        "",
    ]
    prompt = "\n".join(lines)

    return prompt

# Rendered puzzle images: four (input, output) training pairs, interleaved in
# that order, followed by the single test input.
image_dir = './images_for_vllm'
image_paths = [
    f'{image_dir}/train_{role}_{pair_idx}.png'
    for pair_idx in range(4)
    for role in ('input', 'output')
]
image_paths.append(f'{image_dir}/test_input_0.png')

# Build the image-based prompt and show it.
# NOTE: this rebinds `prompt`, replacing the text-only puzzle prompt assigned
# earlier in the notebook under the same name.
prompt = create_vllm_prompt(image_paths)
print(prompt)


<|image_1|>
<|image_2|>
<|image_3|>
<|image_4|>
<|image_5|>
<|image_6|>
<|image_7|>
<|image_8|>
<|image_9|>
I will show you several pairs of input and output images representing a puzzle. 
Each pair demonstrates a pattern or rule. Your task is to analyze these pairs, 
understand the underlying pattern, and then apply it to a new input to predict its output.

Here are the training pairs:
1. Input: Image 1, Output: Image 2
2. Input: Image 3, Output: Image 4
3. Input: Image 5, Output: Image 6
4. Input: Image 7, Output: Image 8

Now, based on these examples, analyze the test input (Image 9) and describe what 
the output should look like. Explain your reasoning step by step, and answer the 
following question: what should the colors be for each of the cells in the test output 
with 3 x 3 grids?

Provide the answer in the form of a grid with digits in place of colors. For reference, 
0: black, 1: blue, 2: red, 3: green, 4: yellow, 5: silver, 6: pink, 7: orange, 8: cyan,
 9: brown.


In [18]:
#image_paths = []
#for root, dirs, files in os.walk("./images_for_vllm/", topdown=False):
#   for name in files:
#      image_paths.append(os.path.join(root, name))

# Open every puzzle image, keeping the order of image_paths.
images = list(map(Image.open, image_paths))

# Single user turn carrying the image prompt.
messages = [{"role": "user", "content": prompt}]

# Render the conversation to text with the tokenizer's chat template.
# NOTE: `prompt` is overwritten with the templated text here, so re-running
# this cell alone would feed already-templated text back into the template.
prompt = processor.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

inputs = processor(prompt, images, return_tensors="pt")

# Only the length limit is set; the temperature / do_sample overrides
# ("temperature": 0.0, "do_sample": False) remain disabled.
generation_args = dict(max_new_tokens=1000)

prompt_token_count = inputs['input_ids'].shape[1]

generate_ids = model.generate(
    **inputs,
    eos_token_id=processor.tokenizer.eos_token_id,
    **generation_args,
)

# remove input tokens so only the newly generated ones are decoded
generate_ids = generate_ids[:, prompt_token_count:]
decoded_batch = processor.batch_decode(
    generate_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=False,
)
response = decoded_batch[0]

print(response)

To solve this puzzle, we need to identify the pattern or rule that transforms the input image into the output image. Let's analyze the given pairs:

1. Input: Image 1 (black and white checkerboard)
   Output: Image 2 (green and yellow gradient)

2. Input: Image 3 (black and white checkerboard)
   Output: Image 4 (green and yellow gradient)

3. Input: Image 5 (black and white checkerboard)
   Output: Image 6 (green and yellow gradient)

4. Input: Image 7 (black and white checkerboard)
   Output: Image 8 (green and yellow gradient)

From the above analysis, we can observe that the pattern involves replacing the black squares with a gradient of green and yellow. Now, let's apply this pattern to the test input (Image 9):

Test Input: Image 9 (black and white checkerboard)

Following the pattern, we replace the black squares with a gradient of green and yellow. The output should have a 3 x 3 grid with the following colors:

0: black
1: green
2: yellow
3: green
4: yellow
5: green
6: yellow
7