In [1]:
# Import Required Packages
import torch
import os
import json
import sys
import re
import random
import importlib.util
from typing import *
from tqdm import tqdm 
from typing import List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap


from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


## Import Data sets

- I need to try different kind of labels: 
    - Multiclassification binaries [1,0,0,1]
    - To output the name 

In [9]:
with open("perceptions_training.json", "r") as f:
    dic_training = json.load(f)

with open("perceptions_testing.json", "r") as f:
    dic_testing = json.load(f)

In [10]:
def example_to_text(example):
    """
    Serialize the instruction, input, and output fields
    into a single text string that can be fed into the tokenizer.
    """
    instruction = example['instruction']
    # Convert the 'input' dictionary to a JSON-like string
    # so we can keep the structure visible for the model.
    # You could also manually flatten or format it differently.
    input_str = json.dumps(example['input'], indent=2)
    
    # The 'output' in your example is just a string ("symmetry"),
    # but if your output is also structured, you can similarly serialize it.
    output_str = str(example['output'])
    
    # A commonly-used format for instruction tuning:
    text = (
        f"### Instruction:\n{instruction}\n\n"  # or however you want to label it
        f"### Input:\n{input_str}\n\n"
        f"### Output:\n{output_str}\n"
    )
    return text

## Multi-label Classification

- The model's output layer should be a dense layer with 16 units (one for each label).
- Use a sigmoid activation function to get probabilities for each label.
- Use binary cross-entropy to calculate loss for each label independently:


In [76]:
label_list = [
    "Containment",
    "Depth",
    "Symmetry",
    "Categorical",
    "Spatial-Orientation",
    "Spatial-Ordinal",
    "Similarity",
    "Quantitative",
    "Replication",
    "Figure-Ground",
    "Continuity",
    "Size",
    "Closure",
    "Centroid",
    "Topological",
    "Motion",
]

In [77]:
def prepare_data_for_multilabel_classification(
    dic_training,
    label_list,
    inp_prefix="I: ",
    out_prefix="O: ",
    arr_sep="\n",
    exa_sep="\n---\n"
):
    llama_data = []

    # Dictionary with labels
    label_to_index = {label: i for i, label in enumerate(label_list)}

    for entry_id, content in dic_training.items():
        # Extract perceptions and encode as binary vector
        perceptions = content.get("perceptions", [])
        label_vector = [1 if label in perceptions else 0 for label in label_list]

        # Combine train and test examples
        examples = content.get("example", {}).get("train", []) + content.get("example", {}).get("test", [])

        # Format examples into a single input string
        formatted_examples = []
        for example in examples:
            input_data = f"{inp_prefix}{format_array(example['input'], arr_sep)}"
            output_data = f"{out_prefix}{format_array(example['output'], arr_sep)}"
            formatted_examples.append(f"{input_data}{exa_sep}{output_data}")

        # Combine all examples into a single text for the input
        combined_text = exa_sep.join(formatted_examples)

        # Add to dataset
        llama_data.append({
            "instruction": "Classify the relationship between the input and output sequences:",
            "input": combined_text,
            "output": label_vector,  # Binary vector for labels
        })

    return llama_data

def format_array(array, arr_sep="\n"):
    """
    Helper function to format a 2D array into a string with row-wise separation.
    """
    return arr_sep.join([" ".join(map(str, row)) for row in array])

llama_data = prepare_data_for_multilabel_classification(dic_training, label_list)
print(llama_data[0])


{'instruction': 'Classify the relationship between the input and output sequences:', 'input': 'I: 0 7 7\n7 7 7\n0 7 7\n---\nO: 0 0 0 0 7 7 0 7 7\n0 0 0 7 7 7 7 7 7\n0 0 0 0 7 7 0 7 7\n0 7 7 0 7 7 0 7 7\n7 7 7 7 7 7 7 7 7\n0 7 7 0 7 7 0 7 7\n0 0 0 0 7 7 0 7 7\n0 0 0 7 7 7 7 7 7\n0 0 0 0 7 7 0 7 7\n---\nI: 4 0 4\n0 0 0\n0 4 0\n---\nO: 4 0 4 0 0 0 4 0 4\n0 0 0 0 0 0 0 0 0\n0 4 0 0 0 0 0 4 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 4 0 4 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 4 0 0 0 0\n---\nI: 0 0 0\n0 0 2\n2 0 2\n---\nO: 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 2\n0 0 0 0 0 0 2 0 2\n0 0 0 0 0 0 0 0 0\n0 0 2 0 0 0 0 0 2\n2 0 2 0 0 0 2 0 2\n---\nI: 6 6 0\n6 0 0\n0 6 6\n---\nO: 6 6 0 6 6 0 0 0 0\n6 0 0 6 0 0 0 0 0\n0 6 6 0 6 6 0 0 0\n6 6 0 0 0 0 0 0 0\n6 0 0 0 0 0 0 0 0\n0 6 6 0 0 0 0 0 0\n0 0 0 6 6 0 6 6 0\n0 0 0 6 0 0 6 0 0\n0 0 0 0 6 6 0 6 6\n---\nI: 2 2 2\n0 0 0\n0 2 2\n---\nO: 2 2 2 2 2 2 2 2 2\n0 0 0 0 0 0 0 0 0\n0 2

### Fine tune Model with Llama

In [None]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

class MultiLabelClassifier(nn.Module):
    def __init__(self, base_model_name, num_labels=16):
        super(MultiLabelClassifier, self).__init__()
        # Load the base Transformer model
        self.transformer = AutoModel.from_pretrained(base_model_name)
        
        # Freeze Transformer layers (optional)
        for param in self.transformer.parameters():
            param.requires_grad = False
        
        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(self.transformer.config.hidden_size, num_labels),  # Linear layer
            nn.Sigmoid()  # Sigmoid activation for multi-label classification
        )

    def forward(self, input_ids, attention_mask):
        # Pass input through the Transformer
        transformer_output = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        
        # Take the last hidden state (CLS token embedding)
        cls_embedding = transformer_output.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
        
        # Pass through the classification head
        logits = self.classifier(cls_embedding)  # Shape: (batch_size, num_labels)
        return logits

base_model_name = "meta-llama/Llama-3.1-7B"  # Replace with your LLM
num_labels = 16  # Number of labels

In [None]:
model = MultiLabelClassifier(base_model_name, num_labels)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)


# Example data
texts = ["This is a test input.", "Another example sequence."]
labels = [[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # Binary vector for first text
          [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]  # Binary vector for second text

# Tokenize input
tokenized = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Convert labels to tensors
labels = torch.tensor(labels, dtype=torch.float32)

# Binary Cross-Entropy Loss for multi-label classification
criterion = nn.BCELoss()

# Optimizer (fine-tune the classification head)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

model.train()
epochs = 3

for epoch in range(epochs):
    optimizer.zero_grad()

    # Forward pass
    logits = model(input_ids=tokenized["input_ids"], attention_mask=tokenized["attention_mask"])
    
    # Compute loss
    loss = criterion(logits, labels)
    
    # Backward pass
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")


## Llama with known labels

In [68]:
def prepare_data_for_llama(
    dic_training,
    inp_prefix="I: ",
    out_prefix="O: ",
    arr_sep="\n",
    exa_sep="\n---\n",
    combine_token_start="[",
    combine_token_end="]"
):
    llama_data = []

    for entry_id, content in dic_training.items():
        # Extract perceptions (labels)
        perceptions = content.get("perceptions", [])
        if not perceptions:
            continue  # Skip entries without perceptions

        # Format the combined perceptions as a single token
        combined_perception = f"{combine_token_start}{', '.join(perceptions)}{combine_token_end}"

        # Combine train and test examples
        examples = content.get("example", {}).get("train", []) + content.get("example", {}).get("test", [])

        # Format the examples into a single text input
        formatted_examples = []
        for example in examples:
            input_data = f"{inp_prefix}{format_array(example['input'], arr_sep)}"
            output_data = f"{out_prefix}{format_array(example['output'], arr_sep)}"
            formatted_examples.append(f"{input_data}{exa_sep}{output_data}")

        # Combine all examples into a single text for the input
        combined_text = exa_sep.join(formatted_examples)

        # Create the final JSON structure for fine-tuning
        llama_data.append({
            "instruction": "Classify the relationship between the input and output sequences:",
            "input": combined_text,
            "output": f"{' | '.join(perceptions)} | {combined_perception}",
        })

    return llama_data

def format_array(array, arr_sep="\n"):
    """
    Helper function to format a 2D array into a string with row-wise separation.
    """
    return arr_sep.join([" ".join(map(str, row)) for row in array])


NameError: name 'prepare_data_for_multilabel_classification' is not defined

In [67]:
print(llama_data_training[0])

{'instruction': 'Classify the relationship between the input and output sequences:', 'input': 'I: 0 7 7\n7 7 7\n0 7 7\n---\nO: 0 0 0 0 7 7 0 7 7\n0 0 0 7 7 7 7 7 7\n0 0 0 0 7 7 0 7 7\n0 7 7 0 7 7 0 7 7\n7 7 7 7 7 7 7 7 7\n0 7 7 0 7 7 0 7 7\n0 0 0 0 7 7 0 7 7\n0 0 0 7 7 7 7 7 7\n0 0 0 0 7 7 0 7 7\n---\nI: 4 0 4\n0 0 0\n0 4 0\n---\nO: 4 0 4 0 0 0 4 0 4\n0 0 0 0 0 0 0 0 0\n0 4 0 0 0 0 0 4 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 4 0 4 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 4 0 0 0 0\n---\nI: 0 0 0\n0 0 2\n2 0 2\n---\nO: 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 2\n0 0 0 0 0 0 2 0 2\n0 0 0 0 0 0 0 0 0\n0 0 2 0 0 0 0 0 2\n2 0 2 0 0 0 2 0 2\n---\nI: 6 6 0\n6 0 0\n0 6 6\n---\nO: 6 6 0 6 6 0 0 0 0\n6 0 0 6 0 0 0 0 0\n0 6 6 0 6 6 0 0 0\n6 6 0 0 0 0 0 0 0\n6 0 0 0 0 0 0 0 0\n0 6 6 0 0 0 0 0 0\n0 0 0 6 6 0 6 6 0\n0 0 0 6 0 0 6 0 0\n0 0 0 0 6 6 0 6 6\n---\nI: 2 2 2\n0 0 0\n0 2 2\n---\nO: 2 2 2 2 2 2 2 2 2\n0 0 0 0 0 0 0 0 0\n0 2

In [66]:
llama_data_training = prepare_data_for_llama(dic_training)

for example in llama_data_training:
    print("Instruction:", example["instruction"])
    print("Input:", example["input"])
    print("Output:", example["output"])


Instruction: Classify the relationship between the input and output sequences:
Input: I: 0 7 7
7 7 7
0 7 7
---
O: 0 0 0 0 7 7 0 7 7
0 0 0 7 7 7 7 7 7
0 0 0 0 7 7 0 7 7
0 7 7 0 7 7 0 7 7
7 7 7 7 7 7 7 7 7
0 7 7 0 7 7 0 7 7
0 0 0 0 7 7 0 7 7
0 0 0 7 7 7 7 7 7
0 0 0 0 7 7 0 7 7
---
I: 4 0 4
0 0 0
0 4 0
---
O: 4 0 4 0 0 0 4 0 4
0 0 0 0 0 0 0 0 0
0 4 0 0 0 0 0 4 0
0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0
0 0 0 4 0 4 0 0 0
0 0 0 0 0 0 0 0 0
0 0 0 0 4 0 0 0 0
---
I: 0 0 0
0 0 2
2 0 2
---
O: 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 2
0 0 0 0 0 0 2 0 2
0 0 0 0 0 0 0 0 0
0 0 2 0 0 0 0 0 2
2 0 2 0 0 0 2 0 2
---
I: 6 6 0
6 0 0
0 6 6
---
O: 6 6 0 6 6 0 0 0 0
6 0 0 6 0 0 0 0 0
0 6 6 0 6 6 0 0 0
6 6 0 0 0 0 0 0 0
6 0 0 0 0 0 0 0 0
0 6 6 0 0 0 0 0 0
0 0 0 6 6 0 6 6 0
0 0 0 6 0 0 6 0 0
0 0 0 0 6 6 0 6 6
---
I: 2 2 2
0 0 0
0 2 2
---
O: 2 2 2 2 2 2 2 2 2
0 0 0 0 0 0 0 0 0
0 2 2 0 2 2 0 2 2
0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0
0 

In [None]:
from datasets import Dataset

# Convert the list to a Hugging Face Dataset
hf_dataset = Dataset.from_list(llama_data_training)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples["instruction"] + "\n" + examples["input"],
        text_target=examples["output"],
        truncation=True,
        max_length=512,
    )

tokenizer = AutoTokenizer.from_pretrained("path_to_llama_tokenizer")
tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)

from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# Load LLaMA model
model = AutoModelForCausalLM.from_pretrained("path_to_llama_model")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./llama_finetuned",
    evaluation_strategy="steps",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    logging_dir="./logs",
    logging_steps=100,
)

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Start training
trainer.train()


-------------

In [15]:
def transform_data_for_llama(dic_training):
    llama_data = []
    for entry_id, content in dic_training.items():
        # Extract train and test examples from 'example'
        example = content.get("example", {})
        train_examples = [
            {"input": example_entry["input"], "output": example_entry["output"]}
            for example_entry in example.get("train", [])
        ]
        test_examples = [
            {"input": example_entry["input"], "output": example_entry.get("output", [])}
            for example_entry in example.get("test", [])
        ]

        # Build the data entry in the compact format
        data_entry = {
            "instruction": "Answer based on the example below:",
            "input": {"train": train_examples, "test": test_examples},
            "output": content.get("perceptions", []),  # Get perceptions if available
        }
        llama_data.append(data_entry)

    return llama_data

llama_data_training = transform_data_for_llama(dic_training)

llama_data_training

[{'instruction': 'Answer based on the example below:',
  'input': {'train': [{'input': [[0, 7, 7], [7, 7, 7], [0, 7, 7]],
     'output': [[0, 0, 0, 0, 7, 7, 0, 7, 7],
      [0, 0, 0, 7, 7, 7, 7, 7, 7],
      [0, 0, 0, 0, 7, 7, 0, 7, 7],
      [0, 7, 7, 0, 7, 7, 0, 7, 7],
      [7, 7, 7, 7, 7, 7, 7, 7, 7],
      [0, 7, 7, 0, 7, 7, 0, 7, 7],
      [0, 0, 0, 0, 7, 7, 0, 7, 7],
      [0, 0, 0, 7, 7, 7, 7, 7, 7],
      [0, 0, 0, 0, 7, 7, 0, 7, 7]]},
    {'input': [[4, 0, 4], [0, 0, 0], [0, 4, 0]],
     'output': [[4, 0, 4, 0, 0, 0, 4, 0, 4],
      [0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 4, 0, 0, 0, 0, 0, 4, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 4, 0, 4, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 4, 0, 0, 0, 0]]},
    {'input': [[0, 0, 0], [0, 0, 2], [2, 0, 2]],
     'output': [[0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 0

In [4]:
def transform_data_for_llama(dic_training):
    llama_data = []
    for entry_id, content in dic_training.items():
        example_text = content["example"]  # e.g., can be a string or list
        if isinstance(example_text, list):
            example_text = "\n".join(example_text)  # or any other separator

        perceptions_text = content["perceptions"]  # e.g., can be a string or list
        if isinstance(perceptions_text, list):
            perceptions_text = "\n".join(perceptions_text)

        # Build the desired JSON entry
        data_entry = {
            "instruction": "Answer based on the example below:",
            "input": example_text,
            "output": perceptions_text
        }
        llama_data.append(data_entry)

    return llama_data

llama_data_training = transform_data_for_llama(dic_training)

llama_data_training

[{'instruction': 'Answer based on the example below:',
  'input': {'test': [{'input': [[7, 0, 7], [7, 0, 7], [7, 7, 0]],
     'output': [[7, 0, 7, 0, 0, 0, 7, 0, 7],
      [7, 0, 7, 0, 0, 0, 7, 0, 7],
      [7, 7, 0, 0, 0, 0, 7, 7, 0],
      [7, 0, 7, 0, 0, 0, 7, 0, 7],
      [7, 0, 7, 0, 0, 0, 7, 0, 7],
      [7, 7, 0, 0, 0, 0, 7, 7, 0],
      [7, 0, 7, 7, 0, 7, 0, 0, 0],
      [7, 0, 7, 7, 0, 7, 0, 0, 0],
      [7, 7, 0, 7, 7, 0, 0, 0, 0]]}],
   'train': [{'input': [[0, 7, 7], [7, 7, 7], [0, 7, 7]],
     'output': [[0, 0, 0, 0, 7, 7, 0, 7, 7],
      [0, 0, 0, 7, 7, 7, 7, 7, 7],
      [0, 0, 0, 0, 7, 7, 0, 7, 7],
      [0, 7, 7, 0, 7, 7, 0, 7, 7],
      [7, 7, 7, 7, 7, 7, 7, 7, 7],
      [0, 7, 7, 0, 7, 7, 0, 7, 7],
      [0, 0, 0, 0, 7, 7, 0, 7, 7],
      [0, 0, 0, 7, 7, 7, 7, 7, 7],
      [0, 0, 0, 0, 7, 7, 0, 7, 7]]},
    {'input': [[4, 0, 4], [0, 0, 0], [0, 4, 0]],
     'output': [[4, 0, 4, 0, 0, 0, 4, 0, 4],
      [0, 0, 0, 0, 0, 0, 0, 0, 0],
      [0, 4, 0, 0, 0, 0, 0, 4, 0],
  

# Fine-tuning:

## Lama 3.2 Transduction Model: LoRa

In [5]:
import torch
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [7]:
model = AutoModelForCausalLM.from_pretrained(
    "barc0/Llama-3.1-ARC-Potpourri-Transduction-8B"
)
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(
    "barc0/Llama-3.1-ARC-Potpourri-Transduction-8B"
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

lora_config = LoraConfig(
    r=16,              # smaller rank
    lora_dropout=0.1
)

model = get_peft_model(model, lora_config)
model.gradient_checkpointing_enable()   # optional but helps reduce memory

training_args = TrainingArguments(
    output_dir="./finetuned_model",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,      # also small for evaluation
    gradient_accumulation_steps=4,     # increase GA steps
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=2,
    seed=42,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_dir="./logs",
    logging_steps=50,
    report_to="tensorboard",
    fp16=torch.cuda.is_available(), 
    push_to_hub=False,
    ddp_find_unused_parameters=False,
)

serialized_texts = [example_to_text(item) for item in llama_data_training]

# E.g. reduce max_length
tokenized_data = tokenizer(
    serialized_texts,
    padding="max_length",
    truncation=True,
    max_length=256,
    return_tensors="pt"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,  # For a real scenario, use a Dataset object
    tokenizer=tokenizer,
)

trainer.train()


Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.93it/s]


AttributeError: 'list' object has no attribute 'keys'

In [None]:
# Update TrainingArguments
training_args = TrainingArguments(
    output_dir="./finetuned_model",
    evaluation_strategy="steps",      # Evaluate every X steps
    eval_steps=500,                   # Define steps
    learning_rate=1e-5,
    per_device_train_batch_size=2,    # Increase if possible
    per_device_eval_batch_size=2,     # Increase if possible
    gradient_accumulation_steps=4,    # Tune based on memory
    num_train_epochs=5,               # Increase epochs for better training
    save_strategy="steps",
    save_steps=500,                   # Save every 500 steps
    save_total_limit=2,
    seed=42,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_dir="./logs",
    logging_steps=50,
    report_to="tensorboard",
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    ddp_find_unused_parameters=False,
)

# Updated Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # Use Dataset object
    tokenizer=tokenizer,
)

trainer.train()