In [1]:
# Import Required Packages
import torch
import os
import json
import sys
import re
import random
import importlib.util
from typing import *
from tqdm import tqdm 
from typing import List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap


from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig
from datasets import Dataset

  from .autonotebook import tqdm as notebook_tqdm


# Fine-tune Models

1) Import Data Set
2) Transform them into correct format

### 1) Import Data sets and other quantities

In [2]:
with open("perceptions_training.json", "r") as f:
    dic_training = json.load(f)

with open("perceptions_testing.json", "r") as f:
    dic_testing = json.load(f)

label_list = [
    "Containment",
    "Depth",
    "Symmetry",
    "Categorical",
    "Spatial-Orientation",
    "Spatial-Ordinal",
    "Similarity",
    "Quantitative",
    "Replication",
    "Figure-Ground",
    "Continuity",
    "Size",
    "Closure",
    "Centroid",
    "Topological",
    "Motion",
]

### 2) Transform them into input,output

In [3]:
def prepare_data_for_multilabel_classification(
    dic_training,
    label_list,
    inp_prefix="I: ",
    out_prefix="O: ",
    arr_sep="\n",
    exa_sep="\n---\n"
):
    llama_data = []

    # Create a mapping from label to index
    label_to_index = {label: i for i, label in enumerate(label_list)}

    for entry_id, content in dic_training.items():
        # Extract perceptions (labels) and encode them as a binary vector
        perceptions = content.get("perceptions", [])
        
        label_vector = labels_to_binary(label_list,perceptions)

        # Combine train and test examples
        examples = content.get("example", {}).get("train", []) + content.get("example", {}).get("test", [])

        # Format examples into a single input string
        formatted_examples = []
        for example in examples:
            input_data = f"{inp_prefix}{format_array(example['input'], arr_sep)}"
            output_data = f"{out_prefix}{format_array(example['output'], arr_sep)}"
            formatted_examples.append(f"{input_data}{exa_sep}{output_data}")

        # Combine all examples into one input text
        combined_text = exa_sep.join(formatted_examples)

        # Add the structured data for fine-tuning
        llama_data.append({
            "instruction": "Classify the relationship between the input and output sequences:",
            "input": combined_text,
            "output": label_vector,  # Multi-label as binary vector
        })

    return llama_data

def format_array(array, arr_sep="\n"):
    """
    Helper function to format a 2D array into a string with row-wise separation.
    """
    return arr_sep.join([" ".join(map(str, row)) for row in array])

def labels_to_binary(label_list,input_string):

    # Convert input_string to lowercase for case-insensitive matching
    input_set = set(label.lower() for label in input_string)

    # Create a binary list where 1 indicates the label is present and 0 otherwise
    binary_list = [1 if label.lower() in input_set else 0 for label in label_list]

    return(binary_list)


In [4]:
llama_data_list = prepare_data_for_multilabel_classification(dic_training, label_list)

# Restructure llama_data
llama_data_dict = {
    "instruction": [item["instruction"] for item in llama_data_list],
    "input": [item["input"] for item in llama_data_list],
    "output": [item["output"] for item in llama_data_list],
}

llama_data_dataset = Dataset.from_dict(llama_data_dict)
llama_data_dataset[0]

{'instruction': 'Classify the relationship between the input and output sequences:',
 'input': 'I: 0 7 7\n7 7 7\n0 7 7\n---\nO: 0 0 0 0 7 7 0 7 7\n0 0 0 7 7 7 7 7 7\n0 0 0 0 7 7 0 7 7\n0 7 7 0 7 7 0 7 7\n7 7 7 7 7 7 7 7 7\n0 7 7 0 7 7 0 7 7\n0 0 0 0 7 7 0 7 7\n0 0 0 7 7 7 7 7 7\n0 0 0 0 7 7 0 7 7\n---\nI: 4 0 4\n0 0 0\n0 4 0\n---\nO: 4 0 4 0 0 0 4 0 4\n0 0 0 0 0 0 0 0 0\n0 4 0 0 0 0 0 4 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 4 0 4 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 4 0 0 0 0\n---\nI: 0 0 0\n0 0 2\n2 0 2\n---\nO: 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 2\n0 0 0 0 0 0 2 0 2\n0 0 0 0 0 0 0 0 0\n0 0 2 0 0 0 0 0 2\n2 0 2 0 0 0 2 0 2\n---\nI: 6 6 0\n6 0 0\n0 6 6\n---\nO: 6 6 0 6 6 0 0 0 0\n6 0 0 6 0 0 0 0 0\n0 6 6 0 6 6 0 0 0\n6 6 0 0 0 0 0 0 0\n6 0 0 0 0 0 0 0 0\n0 6 6 0 0 0 0 0 0\n0 0 0 6 6 0 6 6 0\n0 0 0 6 0 0 6 0 0\n0 0 0 0 6 6 0 6 6\n---\nI: 2 2 2\n0 0 0\n0 2 2\n---\nO: 2 2 2 2 2 2 2 2 2\n0 0 0 0 0 0 0 0 0\n0 

In [30]:
llama_data_list_testing = prepare_data_for_multilabel_classification(dic_testing, label_list)

# Restructure llama_data
llama_data_dict_testing = {
    "instruction": [item["instruction"] for item in llama_data_list_testing],
    "input": [item["input"] for item in llama_data_list_testing],
    "output": [item["output"] for item in llama_data_list_testing],
}

llama_data_dataset_testing = Dataset.from_dict(llama_data_dict_testing)
llama_data_dataset_testing[10]

{'instruction': 'Classify the relationship between the input and output sequences:',
 'input': 'I: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 0\n0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0\n0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0\n0 0 0 0 0 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0\n0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 0\n0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1\n0 0 0 0 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1\n0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1\n0 0 0 0 0 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1

### 3) Finetune encoders

In [6]:
from tokenizers import Tokenizer, models, trainers, pre_tokenizers

# Initialize a tokenizer
tokenizer = Tokenizer(models.BPE())

# Customize pre-tokenizer
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

# Train on your dataset
trainer = trainers.BpeTrainer(special_tokens=["<pad>", "<eos>", "<unk>", "I:", "O:", "---"])
files = ["path_to_your_data.txt"]
tokenizer.train(files, trainer)

# Save and load the tokenizer
tokenizer.save("custom_tokenizer.json")


Exception: No such file or directory (os error 2)

## Multi-label Classification

- The model's output layer should be a dense layer with 16 units (one for each label).
- Use a sigmoid activation function to get probabilities for each label.
- Use binary cross-entropy to calculate loss for each label independently:


### Fine tune Model with Llama

1) Fine Tune Tokenizer 
    - Save data as txt.file

In [26]:
import torch
from torch import nn
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from peft import get_peft_model, LoraConfig
from datasets import Dataset

# Load base model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "barc0/Llama-3.1-ARC-Potpourri-Transduction-8B"
)
#model.config.use_cache = False
#model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(
    "barc0/Llama-3.1-ARC-Potpourri-Transduction-8B"
)
#tokenizer.pad_token = tokenizer.eos_token
#tokenizer.padding_side = "right"

# Add LoRA config (optional for parameter efficiency)
lora_config = LoraConfig(r=64, lora_dropout=0.1)
model = get_peft_model(model, lora_config)

class ClassificationHead(nn.Module):
    def __init__(self, base_model, num_labels):
        super().__init__()
        self.base_model = base_model
        self.classifier = nn.Linear(base_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Make sure we request hidden states in the forward pass
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=True,
            output_hidden_states=True,
        )

        # Instead of outputs.last_hidden_state, we use outputs.hidden_states[-1]
        final_hidden_state = outputs.hidden_states[-1]   # shape: [batch_size, seq_len, hidden_size]
        # If you want the first token as a "CLS" representation:
        pooled_output = final_hidden_state[:, 0, :]
        
        # Or, if you'd rather pool over the entire sequence, you can do something like:
        # pooled_output = final_hidden_state.mean(dim=1)  # mean pooling

        logits = self.classifier(pooled_output)
        return logits


num_labels = 16  # Number of labels
model = ClassificationHead(model, num_labels)

# Enable gradient checkpointing for memory optimization
model.base_model.gradient_checkpointing_enable()

training_args = TrainingArguments(
    output_dir="./finetuned_model",
    evaluation_strategy="steps",  # Evaluate periodically
    eval_steps=500,              # Evaluate every 500 steps
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,  # Reduced
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=2,
    seed=42,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_dir="./logs",
    logging_steps=50,
    report_to="tensorboard",
    fp16=True,                    # Mixed-precision
    push_to_hub=False,
    ddp_find_unused_parameters=False,
)


Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.92it/s]


In [9]:
for item in llama_data_list_testing:
    input_tokens = tokenizer(item["input"], truncation=False)["input_ids"]
    print(f"Input Length: {len(input_tokens)} for {item['input'][:100]}...")


Input Length: 263 for I: 8 6
6 4
---
O: 8 6 8 6 8 6
6 4 6 4 6 4
6 8 6 8 6 8
4 6 4 6 4 6
8 6 8 6 8 6
6 4 6 4 6 4
---
I: 7 9...
Input Length: 4751 for I: 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 8 8 0 8 8 0 0
0 0 0 0 0 0 0 0 8 8 8 0 0 0
0 0 0 0 0 8 8...
Input Length: 3735 for I: 2 2 2 2 2 0 0
2 0 0 0 2 0 0
2 0 2 0 2 0 0
2 0 0 0 2 0 0
2 2 2 2 2 0 0
0 0 0 0 0 0 0
0 0 0 0 0 0 0...
Input Length: 1631 for I: 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0...
Input Length: 14431 for I: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0...
Input Length: 7951 for I: 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 3 0 0 0 0 0 0
0 0 6 6 6 6 6 6 0 0 6 6 6 6 3 6 0 0 0 0 0 0
0 0 8 8 3...
Input Length: 751 for I: 0 7 0
7 7 7
0 7 0
---
O: 0 0 0 7 0 7 0 0 0
0 0 0 0 0 0 0 0 0
0 0 0 7 0 7 0 0 0
7 0 7 7 0 7 7 0 7
...
Input Length: 4823 for I: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 0 0 

In [12]:
# Define the tokenizer function
def tokenize_function(example):
    tokenized = tokenizer(
        example["input"],
        padding="max_length",
        truncation=True,
        max_length=8192,
        return_tensors="pt"
    )
    
    return {
        "input_ids": tokenized["input_ids"].squeeze().tolist(),
        "attention_mask": tokenized["attention_mask"].squeeze().tolist(),
        "labels": example["output"]
    }

# Apply tokenization
tokenized_dataset = llama_data_dataset.map(tokenize_function, batched=True)
tokenized_dataset_testing = llama_data_dataset_testing.map(tokenize_function, batched=True)

print(tokenized_dataset[0])
print(tokenized_dataset_testing[0])

Map: 100%|██████████| 150/150 [00:00<00:00, 280.29 examples/s]
Map: 100%|██████████| 50/50 [00:00<00:00, 280.01 examples/s]

{'instruction': 'Classify the relationship between the input and output sequences:', 'input': 'I: 0 7 7\n7 7 7\n0 7 7\n---\nO: 0 0 0 0 7 7 0 7 7\n0 0 0 7 7 7 7 7 7\n0 0 0 0 7 7 0 7 7\n0 7 7 0 7 7 0 7 7\n7 7 7 7 7 7 7 7 7\n0 7 7 0 7 7 0 7 7\n0 0 0 0 7 7 0 7 7\n0 0 0 7 7 7 7 7 7\n0 0 0 0 7 7 0 7 7\n---\nI: 4 0 4\n0 0 0\n0 4 0\n---\nO: 4 0 4 0 0 0 4 0 4\n0 0 0 0 0 0 0 0 0\n0 4 0 0 0 0 0 4 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 4 0 4 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 4 0 0 0 0\n---\nI: 0 0 0\n0 0 2\n2 0 2\n---\nO: 0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 2\n0 0 0 0 0 0 2 0 2\n0 0 0 0 0 0 0 0 0\n0 0 2 0 0 0 0 0 2\n2 0 2 0 0 0 2 0 2\n---\nI: 6 6 0\n6 0 0\n0 6 6\n---\nO: 6 6 0 6 6 0 0 0 0\n6 0 0 6 0 0 0 0 0\n0 6 6 0 6 6 0 0 0\n6 6 0 0 0 0 0 0 0\n6 0 0 0 0 0 0 0 0\n0 6 6 0 0 0 0 0 0\n0 0 0 6 6 0 6 6 0\n0 0 0 6 0 0 6 0 0\n0 0 0 0 6 6 0 6 6\n---\nI: 2 2 2\n0 0 0\n0 2 2\n---\nO: 2 2 2 2 2 2 2 2 2\n0 0 0 0 0 0 0 0 0\n0 2




In [14]:
# Custom DataCollator for multi-label classification
class DataCollatorWithLabels:
    def __call__(self, features):
        # Convert input_ids and attention_mask to tensors
        input_ids = torch.stack([torch.tensor(f["input_ids"]) for f in features])
        attention_mask = torch.stack([torch.tensor(f["attention_mask"]) for f in features])
        labels = torch.tensor([f["labels"] for f in features], dtype=torch.float32)
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

data_collator = DataCollatorWithLabels()

# Custom loss function
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        logits = model(**inputs)
        loss_fn = nn.BCEWithLogitsLoss()  # Binary Cross Entropy Loss
        loss = loss_fn(logits, labels)
        return (loss, logits) if return_outputs else loss

# Define trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset_testing,  # <-- Provide your eval dataset here
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()

[2024-12-29 16:40:24,482] [INFO] [real_accelerator.py:219:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/home/jdelinea/anaconda3/envs/BarcHandbook/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/jdelinea/anaconda3/envs/BarcHandbook/compiler_compat/ld: /usr/local/cuda-12.6/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/jdelinea/anaconda3/envs/BarcHandbook/compiler_compat/ld: /usr/local/cuda-12.6/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/jdelinea/anaconda3/envs/BarcHandbook/compiler_compat/ld: /usr/local/cuda-12.6/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/jdelinea/anaconda3/envs/BarcHandbook/compiler_compat/ld: /usr/local/cuda-12.6/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/jdelinea/anaconda3/envs/BarcHandbook/compiler_compat/ld: /usr/local/cuda-12.6/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned 

Epoch,Training Loss,Validation Loss
0,No log,0.378866
2,0.182200,0.325917


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/internal/generation_utils#transformers.Cache)


TrainOutput(global_step=111, training_loss=0.3088789684278471, metrics={'train_runtime': 771.702, 'train_samples_per_second': 0.583, 'train_steps_per_second': 0.144, 'total_flos': 0.0, 'train_loss': 0.3088789684278471, 'epoch': 2.96})

-----------

In [15]:
eval_results = trainer.evaluate()


In [17]:
predictions = trainer.predict(tokenized_dataset_testing)

PredictionOutput(predictions=array([], shape=(0, 16), dtype=float32), label_ids=array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 

In [27]:
class DebugTrainer(CustomTrainer):
    def prediction_step(self, model, inputs, prediction_loss_only=False, ignore_keys=None):
        outputs = model(**inputs)
        print(f"Outputs: {outputs}")
        return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)

debug_trainer = DebugTrainer(
    model=model,
    args=training_args,
    eval_dataset=tokenized_dataset_testing,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

debug_trainer.predict(tokenized_dataset_testing)


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.96 GiB. GPU 0 has a total capacity of 79.20 GiB of which 594.62 MiB is free. Including non-PyTorch memory, this process has 78.62 GiB memory in use. Of the allocated memory 77.89 GiB is allocated by PyTorch, and 79.18 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [21]:
predictions

PredictionOutput(predictions=array([], shape=(0, 16), dtype=float32), label_ids=array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 

In [19]:
# Logits from the model
logits = predictions.predictions
probs = torch.softmax(torch.tensor(logits), dim=-1).numpy()
predicted_classes = np.argmax(probs, axis=-1)

# Ground truth labels
true_labels = predictions.label_ids

# Compare predictions and true labels
for i, (pred, true) in enumerate(zip(predicted_classes, true_labels)):
    print(f"Example {i}: Predicted={pred}, True={true}")



In [23]:
print(len(tokenized_dataset))  # Should be > 0
print(len(tokenized_dataset_testing))   # Inspect the first example


150
50


In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(true_labels, predicted_classes)
print(f"Accuracy: {accuracy}")


In [None]:
print(len(llama_data_dataset_testing))



In [33]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

class MultiLabelClassifier(nn.Module):
    def __init__(self, base_model_name, num_labels=16):
        super(MultiLabelClassifier, self).__init__()
        # Load the base Transformer model
        self.transformer = AutoModel.from_pretrained(base_model_name)
        
        # Freeze Transformer layers (optional)
        for param in self.transformer.parameters():
            param.requires_grad = False
        
        # Classification head
        self.classifier = nn.Sequential(
            nn.Linear(self.transformer.config.hidden_size, num_labels),  # Linear layeAr
            nn.Sigmoid()  # Sigmoid activation for multi-label classification
        )

    def forward(self, input_ids, attention_mask):
        # Pass input through the Transformer
        transformer_output = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        
        # Take the last hidden state (CLS token embedding)
        cls_embedding = transformer_output.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)
        
        # Pass through the classification head
        logits = self.classifier(cls_embedding)  # Shape: (batch_size, num_labels)
        return logits

base_model_name = "meta-llama/Llama-3.1-7B"  # Replace with your LLM
num_labels = 16  # Number of labels

In [None]:
model = MultiLabelClassifier(base_model_name, num_labels)
tokenizer = AutoTokenizer.from_pretrained(base_model_name)


# Example data
texts = ["This is a test input.", "Another example sequence."]
labels = [[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],  # Binary vector for first text
          [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]]  # Binary vector for second text

# Tokenize input
tokenized = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Convert labels to tensors
labels = torch.tensor(labels, dtype=torch.float32)

# Binary Cross-Entropy Loss for multi-label classification
criterion = nn.BCELoss()

# Optimizer (fine-tune the classification head)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

model.train()
epochs = 3

for epoch in range(epochs):
    optimizer.zero_grad()

    # Forward pass
    logits = model(input_ids=tokenized["input_ids"], attention_mask=tokenized["attention_mask"])
    
    # Compute loss
    loss = criterion(logits, labels)
    
    # Backward pass
    loss.backward()
    optimizer.step()

    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}")


------------------

### In this step we will keep the label in natural language

In [None]:
def prepare_data_for_llama(
    dic_training,
    inp_prefix="I: ",
    out_prefix="O: ",
    arr_sep="\n",
    exa_sep="\n---\n",
    combine_token_start="[",
    combine_token_end="]"
):
    llama_data = []

    for entry_id, content in dic_training.items():
        # Extract perceptions (labels)
        perceptions = content.get("perceptions", [])
        if not perceptions:
            continue  # Skip entries without perceptions

        # Format the combined perceptions as a single token
        combined_perception = f"{combine_token_start}{', '.join(perceptions)}{combine_token_end}"

        # Combine train and test examples
        examples = content.get("example", {}).get("train", []) + content.get("example", {}).get("test", [])

        # Format the examples into a single text input
        formatted_examples = []
        for example in examples:
            input_data = f"{inp_prefix}{format_array(example['input'], arr_sep)}"
            output_data = f"{out_prefix}{format_array(example['output'], arr_sep)}"
            formatted_examples.append(f"{input_data}{exa_sep}{output_data}")

        # Combine all examples into a single text for the input
        combined_text = exa_sep.join(formatted_examples)

        # Create the final JSON structure for fine-tuning
        llama_data.append({
            "instruction": "Classify the relationship between the input and output sequences:",
            "input": combined_text,
            "output": f"{' | '.join(perceptions)} | {combined_perception}",
        })

    return llama_data

def format_array(array, arr_sep="\n"):
    """
    Helper function to format a 2D array into a string with row-wise separation.
    """
    return arr_sep.join([" ".join(map(str, row)) for row in array])


In [None]:
llama_data_training = prepare_data_for_llama(dic_training)

print(llama_data_training[0])

for example in llama_data_training:
    print("Instruction:", example["instruction"])
    print("Input:", example["input"])
    print("Output:", example["output"])


In [None]:
from datasets import Dataset

# Convert the list to a Hugging Face Dataset
hf_dataset = Dataset.from_list(llama_data_training)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(
        examples["instruction"] + "\n" + examples["input"],
        text_target=examples["output"],
        truncation=True,
        max_length=512,
    )

tokenizer = AutoTokenizer.from_pretrained("path_to_llama_tokenizer")
tokenized_dataset = hf_dataset.map(tokenize_function, batched=True)

from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# Load LLaMA model
model = AutoModelForCausalLM.from_pretrained("path_to_llama_model")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./llama_finetuned",
    evaluation_strategy="steps",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_steps=500,
    logging_dir="./logs",
    logging_steps=100,
)

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Start training
trainer.train()


-------------

In [None]:
def transform_data_for_llama(dic_training):
    llama_data = []
    for entry_id, content in dic_training.items():
        # Extract train and test examples from 'example'
        example = content.get("example", {})
        train_examples = [
            {"input": example_entry["input"], "output": example_entry["output"]}
            for example_entry in example.get("train", [])
        ]
        test_examples = [
            {"input": example_entry["input"], "output": example_entry.get("output", [])}
            for example_entry in example.get("test", [])
        ]

        # Build the data entry in the compact format
        data_entry = {
            "instruction": "Answer based on the example below:",
            "input": {"train": train_examples, "test": test_examples},
            "output": content.get("perceptions", []),  # Get perceptions if available
        }
        llama_data.append(data_entry)

    return llama_data

llama_data_training = transform_data_for_llama(dic_training)

llama_data_training

In [None]:
def transform_data_for_llama(dic_training):
    llama_data = []
    for entry_id, content in dic_training.items():
        example_text = content["example"]  # e.g., can be a string or list
        if isinstance(example_text, list):
            example_text = "\n".join(example_text)  # or any other separator

        perceptions_text = content["perceptions"]  # e.g., can be a string or list
        if isinstance(perceptions_text, list):
            perceptions_text = "\n".join(perceptions_text)

        # Build the desired JSON entry
        data_entry = {
            "instruction": "Answer based on the example below:",
            "input": example_text,
            "output": perceptions_text
        }
        llama_data.append(data_entry)

    return llama_data

llama_data_training = transform_data_for_llama(dic_training)

llama_data_training

# Fine-tuning:

## Lama 3.2 Transduction Model: LoRa

In [5]:
import torch
torch.cuda.empty_cache()
torch.cuda.ipc_collect()

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    "barc0/Llama-3.1-ARC-Potpourri-Transduction-8B"
)
model.config.use_cache = False
model.config.pretraining_tp = 1

tokenizer = AutoTokenizer.from_pretrained(
    "barc0/Llama-3.1-ARC-Potpourri-Transduction-8B"
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

lora_config = LoraConfig(
    r=16,              # smaller rank
    lora_dropout=0.1
)

model = get_peft_model(model, lora_config)
model.gradient_checkpointing_enable()   # optional but helps reduce memory

training_args = TrainingArguments(
    output_dir="./finetuned_model",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,      # also small for evaluation
    gradient_accumulation_steps=4,     # increase GA steps
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=2,
    seed=42,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_dir="./logs",
    logging_steps=50,
    report_to="tensorboard",
    fp16=torch.cuda.is_available(), 
    push_to_hub=False,
    ddp_find_unused_parameters=False,
)

serialized_texts = [example_to_text(item) for item in llama_data_training]

# E.g. reduce max_length
tokenized_data = tokenizer(
    serialized_texts,
    padding="max_length",
    truncation=True,
    max_length=256,
    return_tensors="pt"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,  # For a real scenario, use a Dataset object
    tokenizer=tokenizer,
)

trainer.train()


In [None]:
# Update TrainingArguments
training_args = TrainingArguments(
    output_dir="./finetuned_model",
    evaluation_strategy="steps",      # Evaluate every X steps
    eval_steps=500,                   # Define steps
    learning_rate=1e-5,
    per_device_train_batch_size=2,    # Increase if possible
    per_device_eval_batch_size=2,     # Increase if possible
    gradient_accumulation_steps=4,    # Tune based on memory
    num_train_epochs=5,               # Increase epochs for better training
    save_strategy="steps",
    save_steps=500,                   # Save every 500 steps
    save_total_limit=2,
    seed=42,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_dir="./logs",
    logging_steps=50,
    report_to="tensorboard",
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    ddp_find_unused_parameters=False,
)

# Updated Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # Use Dataset object
    tokenizer=tokenizer,
)

trainer.train()