In [1]:
# Import Required Packages
import torch
import os
import json
import sys
import re
import random
import importlib.util
from typing import *
from tqdm import tqdm 
from typing import List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import ListedColormap


from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import get_peft_model, LoraConfig
from datasets import Dataset


from JS_Architects import *

  from .autonotebook import tqdm as notebook_tqdm


# Summary

We will use Architects model

1) Implement Custom pipeline, no prompting just prediction with shrekking of the embedding 

2) Automated Forsequence Classification
    1) No prompting
    2) Prompt: "Classify with labels encode in binary"
    3) Prompt: "Clasify with list label: ["depth",...]"
    4) Prompt "Classify with list label: ["depth",...]. Depth represents..., Containment represents...."

# Custom Pipeline 

## 1. Prepare Data

### Load 


In [2]:
with open("perceptions_training.json", "r") as f:
    dic_training = json.load(f)

with open("perceptions_testing.json", "r") as f:
    dic_testing = json.load(f)

label_list = [
    "containment",
    "depth",
    "symmetry",
    "categorical",
    "spatial-Orientation",
    "spatial-Ordinal",
    "similarity",
    "quantitative",
    "replication",
    "figure-Ground",
    "continuity",
    "size",
    "closure",
    "centroid",
    "topological",
    "motion",
]

### Format

In [3]:
from datasets import Dataset

def prepare_data_for_multilabel_classification(
    dic_training,
    instruction,
    label_list,
    inp_prefix="<I>",
    out_prefix="<O>",
    arr_sep="\n",
    exa_sep="\n---\n",
    bos_token="<|begin_of_text|>",
    eos_token="<|end_of_text|>"
):
    llama_data = []

    # Create a mapping from label to index
    label_to_index = {label: i for i, label in enumerate(label_list)}

    for entry_id, content in dic_training.items():
        # Extract perceptions (labels) and encode them as a binary vector
        perceptions = content.get("perceptions", [])
        label_vector = labels_to_binary(label_list, perceptions)

        # Combine train and test examples
        examples = content.get("example", {}).get("train", []) + content.get("example", {}).get("test", [])

        # Format examples into a single input string
        formatted_examples = []
        for example in examples:
            input_data = f"{inp_prefix}{format_array(example['input'], arr_sep)}"
            output_data = f"{out_prefix}{format_array(example['output'], arr_sep)}{eos_token}"
            formatted_examples.append(f"{input_data}{exa_sep}{output_data}")

        # Combine all examples into one input text and prepend the BOS token
        combined_text = f"{exa_sep.join(formatted_examples)}"

        # Add the structured data for fine-tuning
        llama_data.append({
            "instruction": f"{instruction}",
            "input": combined_text,
            "output": label_vector,  # Multi-label as binary vector
        })

    return llama_data

def format_array(array, arr_sep="\n"):
    """
    Helper function to format a 2D array into a string with row-wise separation.
    """
    return arr_sep.join([" ".join(map(str, row)) for row in array])

def labels_to_binary(label_list, input_labels):
    """
    Convert perceptions into a binary vector based on the label list.
    Handles both single strings and lists of strings for input_labels.
    """
    # Ensure input_labels is treated as a list
    if isinstance(input_labels, str):
        input_labels = [input_labels]
    
    # Create a set of lowercase input labels
    input_set = set(label.lower() for label in input_labels)
    
    # Generate the binary vector
    return [1 if label.lower() in input_set else 0 for label in label_list]


In [4]:
instruction0 = ""
instruction1 = "Classify the relationship between the input and output sequences based on perceptions"

# List
llama_data = prepare_data_for_multilabel_classification(dic_training,instruction0,label_list)

# Dict
llama_data_dict = {
    "instruction": [item["instruction"] for item in llama_data],
    "input": [item["input"] for item in llama_data],
    "output": [item["output"] for item in llama_data],
}

# Dataset
llama_data_dataset = Dataset.from_dict(llama_data_dict)

llama_data_dataset[0]


{'instruction': '',
 'input': '<I>0 7 7\n7 7 7\n0 7 7\n---\n<O>0 0 0 0 7 7 0 7 7\n0 0 0 7 7 7 7 7 7\n0 0 0 0 7 7 0 7 7\n0 7 7 0 7 7 0 7 7\n7 7 7 7 7 7 7 7 7\n0 7 7 0 7 7 0 7 7\n0 0 0 0 7 7 0 7 7\n0 0 0 7 7 7 7 7 7\n0 0 0 0 7 7 0 7 7<|end_of_text|>\n---\n<I>4 0 4\n0 0 0\n0 4 0\n---\n<O>4 0 4 0 0 0 4 0 4\n0 0 0 0 0 0 0 0 0\n0 4 0 0 0 0 0 4 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 4 0 4 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 4 0 0 0 0<|end_of_text|>\n---\n<I>0 0 0\n0 0 2\n2 0 2\n---\n<O>0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 0\n0 0 0 0 0 0 0 0 2\n0 0 0 0 0 0 2 0 2\n0 0 0 0 0 0 0 0 0\n0 0 2 0 0 0 0 0 2\n2 0 2 0 0 0 2 0 2<|end_of_text|>\n---\n<I>6 6 0\n6 0 0\n0 6 6\n---\n<O>6 6 0 6 6 0 0 0 0\n6 0 0 6 0 0 0 0 0\n0 6 6 0 6 6 0 0 0\n6 6 0 0 0 0 0 0 0\n6 0 0 0 0 0 0 0 0\n0 6 6 0 0 0 0 0 0\n0 0 0 6 6 0 6 6 0\n0 0 0 6 0 0 6 0 0\n0 0 0 0 6 6 0 6 6<|end_of_text|>\n---\n<I>2 2 2\n0 0 0\n0 2 2\n---\n<O>2 2 2 2 2 2 2 2 2\n0 0 0 0 0 0 0 0 0\n0 2 2 0

In [5]:
llama_data_list_testing = prepare_data_for_multilabel_classification(dic_testing,instruction0, label_list)

# Restructure llama_data
llama_data_dict_testing = {
    "instruction": [item["instruction"] for item in llama_data_list_testing],
    "input": [item["input"] for item in llama_data_list_testing],
    "output": [item["output"] for item in llama_data_list_testing],
}

llama_data_dataset_testing = Dataset.from_dict(llama_data_dict_testing)
llama_data_dataset_testing[0]

{'instruction': '',
 'input': '<I>8 6\n6 4\n---\n<O>8 6 8 6 8 6\n6 4 6 4 6 4\n6 8 6 8 6 8\n4 6 4 6 4 6\n8 6 8 6 8 6\n6 4 6 4 6 4<|end_of_text|>\n---\n<I>7 9\n4 3\n---\n<O>7 9 7 9 7 9\n4 3 4 3 4 3\n9 7 9 7 9 7\n3 4 3 4 3 4\n7 9 7 9 7 9\n4 3 4 3 4 3<|end_of_text|>\n---\n<I>3 2\n7 8\n---\n<O>3 2 3 2 3 2\n7 8 7 8 7 8\n2 3 2 3 2 3\n8 7 8 7 8 7\n3 2 3 2 3 2\n7 8 7 8 7 8<|end_of_text|>',
 'output': [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]}

## 2. Load Model and Tokenizer

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from peft import get_peft_model, LoraConfig

# Install bitsandbytes if not already installed
# !pip install bitsandbytes

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")

# Load quantized model
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    load_in_4bit=True,  # Use 4-bit quantization
    device_map="auto",
    bnb_4bit_compute_dtype=torch.float16  # Match input type

)

print("Predefined Special Tokens:")
print(f"EOS Token: {tokenizer.bos_token}, ID: {tokenizer.bos_token_id}")
print(len(tokenizer.vocab))

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.04it/s]


Predefined Special Tokens:
EOS Token: <|begin_of_text|>, ID: 128000
128256


## 3. Shrink the tokenizer and embedding 

In [7]:
special_tokens_dict = {
    "input": "<I>",
    "output": "<O>",
    "array_sep": "\n",
    "example_sep": "\n---\n",
    "eos_token": "<|end_of_text|>",
    "bos_token": "<|begin_of_text|>",
    "pad_token": "[PAD]"
}

# Add special tokens
tokenizer.add_special_tokens({
    "additional_special_tokens": [
        special_tokens_dict["input"],
        special_tokens_dict["output"],
        special_tokens_dict["array_sep"],
        special_tokens_dict["example_sep"]
    ],
    "eos_token": special_tokens_dict["eos_token"],
    "bos_token": special_tokens_dict["bos_token"],
    "pad_token": special_tokens_dict["pad_token"]
})

# Set the tokenizer pad token explicitly
tokenizer.pad_token = special_tokens_dict["pad_token"]

# Check the updated tokens
print(f"Special Tokens: {tokenizer.special_tokens_map}")
print(f"Vocabulary Size: {len(tokenizer)}")

# Resize model embeddings
model.resize_token_embeddings(len(tokenizer))


The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Special Tokens: {'bos_token': '<|begin_of_text|>', 'eos_token': '<|end_of_text|>', 'pad_token': '[PAD]', 'additional_special_tokens': ['<I>', '<O>', '\n', '\n---\n']}
Vocabulary Size: 128261


The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


Embedding(128261, 4096)

In [8]:
def build_corpus_for_shrinking(hf_dataset):
    """
    Concatenate the 'input' + 'instruction' from the dataset
    to ensure all relevant tokens appear.
    """
    corpus_list = []
    for sample in hf_dataset:
        text = (sample["instruction"] or "") + " " + (sample["input"] or "")
        corpus_list.append(text)
    # Combine into one big string
    corpus = "\n".join(corpus_list)
    return corpus

corpus = build_corpus_for_shrinking(llama_data_dataset)

len(corpus)

382576

In [9]:
shrink_embeddings(
            model=model,
            tokenizer=tokenizer,
            corpus=corpus,                  # ensures relevant tokens are kept
            keep_special_tokens=True,
            keep_normalizer=False,
            keep_token_order=True
        )

print("Tokenizer size after shrinking:", len(tokenizer.vocab))

tokenizer.vocab

Token indices sequence length is longer than the specified maximum sequence length for this model (366563 > 131072). Running this sequence through the model will result in indexing errors


Tokenizer size after shrinking: 18


{'[PAD]': 17,
 '3': 3,
 '0': 0,
 '6': 6,
 'Ġ': 10,
 '<|begin_of_text|>': 11,
 '8': 8,
 '<|end_of_text|>': 12,
 '2': 2,
 '4': 4,
 '<I>': 13,
 '7': 7,
 '\n---\n': 16,
 '5': 5,
 '\n': 15,
 '9': 9,
 '1': 1,
 '<O>': 14}

In [10]:
test_sequence = "<I>8 6\n6 4\n---\n<O>8 6 8 6 8 6\n6 4 6 4 6 4\n6 8 6 8 6 8\n4 6 4 6 4 6\n8 6 8 6 8 6\n6 4 6 4 6 4<|end_of_text|>\n---\n<I>7 9\n4 3\n---\n<O>7 9 7 9 7 9\n4 3 4 3 4 3\n9 7 9 7 9 7\n3 4 3 4 3 4\n7 9 7 9 7 9\n4 3 4 3 4 3<|end_of_text|>\n---\n<I>3 2\n7 8\n---\n<O>3 2 3 2 3 2\n7 8 7 8 7 8\n2 3 2 3 2 3\n8 7 8 7 8 7\n3 2 3 2 3 2\n7 8 7 8 7 8<|end_of_text|>"
encoded = tokenizer.encode(test_sequence)
decoded = tokenizer.decode(encoded)
print("Encoded:", encoded)
print("Decoded:", decoded)

Encoded: [11, 13, 8, 10, 6, 15, 6, 10, 4, 16, 14, 8, 10, 6, 10, 8, 10, 6, 10, 8, 10, 6, 15, 6, 10, 4, 10, 6, 10, 4, 10, 6, 10, 4, 15, 6, 10, 8, 10, 6, 10, 8, 10, 6, 10, 8, 15, 4, 10, 6, 10, 4, 10, 6, 10, 4, 10, 6, 15, 8, 10, 6, 10, 8, 10, 6, 10, 8, 10, 6, 15, 6, 10, 4, 10, 6, 10, 4, 10, 6, 10, 4, 12, 16, 13, 7, 10, 9, 15, 4, 10, 3, 16, 14, 7, 10, 9, 10, 7, 10, 9, 10, 7, 10, 9, 15, 4, 10, 3, 10, 4, 10, 3, 10, 4, 10, 3, 15, 9, 10, 7, 10, 9, 10, 7, 10, 9, 10, 7, 15, 3, 10, 4, 10, 3, 10, 4, 10, 3, 10, 4, 15, 7, 10, 9, 10, 7, 10, 9, 10, 7, 10, 9, 15, 4, 10, 3, 10, 4, 10, 3, 10, 4, 10, 3, 12, 16, 13, 3, 10, 2, 15, 7, 10, 8, 16, 14, 3, 10, 2, 10, 3, 10, 2, 10, 3, 10, 2, 15, 7, 10, 8, 10, 7, 10, 8, 10, 7, 10, 8, 15, 2, 10, 3, 10, 2, 10, 3, 10, 2, 10, 3, 15, 8, 10, 7, 10, 8, 10, 7, 10, 8, 10, 7, 15, 3, 10, 2, 10, 3, 10, 2, 10, 3, 10, 2, 15, 7, 10, 8, 10, 7, 10, 8, 10, 7, 10, 8, 12]
Decoded: <|begin_of_text|><I>8 6
6 4
---
<O>8 6 8 6 8 6
6 4 6 4 6 4
6 8 6 8 6 8
4 6 4 6 4 6
8 6 8 6 8 6
6 4 6 4 6 

## 4. Apply LoRA to the Shrunk Model

In [11]:
from peft import get_peft_model, LoraConfig
import torch.nn as nn
import torch

# Configure LoRA
peft_config = LoraConfig(
    r=64,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"],
    lora_alpha=16,
    lora_dropout=0.0,
    bias="none"
)


# Wrap the shrunk model with LoRA
model_shrinked = get_peft_model(model, peft_config)

# Make sure to unfreeze embeddings if you want to train them directly 
# (LoRA on embed_tokens will still add ranks; but if you want the base embedding 
#  weights to be trainable, do something like):
for param in model_shrinked.get_input_embeddings().parameters():
    param.requires_grad = True


## 5. Full architecture

### Add Classification head

In [12]:
from transformers import PreTrainedModel
from transformers.modeling_utils import unwrap_model

class LLMWithClassificationHead(PreTrainedModel):
    def __init__(self, base_model, config, num_labels):
        super().__init__(config)
        self.base_model = base_model
        self.num_labels = num_labels
        hidden_size = config.hidden_size

        self.classifier = nn.Linear(hidden_size, num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True
        )
        last_hidden = outputs.hidden_states[-1]
        pooled = last_hidden[:, -1, :]  # Taking the last token's hidden state
        logits = self.classifier(pooled)

        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels.float())

        return {
            "logits": logits,
            "loss": loss,
            "hidden_states": outputs.hidden_states,
        }


In [13]:
from transformers import PreTrainedModel, AutoConfig
import torch.nn as nn

class LLMWithClassificationHead(PreTrainedModel):
    def __init__(self, base_model, config, num_labels):
        super().__init__(config)
        self.base_model = base_model
        self.num_labels = num_labels
        hidden_size = config.hidden_size

        # Define a classification head
        self.classifier = nn.Linear(hidden_size, num_labels)

        # Initialize weights and other components
        self.post_init()

    def forward(self, input_ids, attention_mask=None, labels=None):
        # Forward pass through the base model
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True
        )
        last_hidden = outputs.hidden_states[-1]  # (batch_size, seq_len, hidden_size)

        # Pooling: take the last token's hidden state
        pooled = last_hidden[:, -1, :]  # (batch_size, hidden_size)

        # Classification head
        logits = self.classifier(pooled)  # (batch_size, num_labels)

        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels.float())

        return {
            "loss": loss,
            "logits": logits,
            "hidden_states": outputs.hidden_states,
        }

    @property
    def config(self):
        return self.base_model.config

    def resize_token_embeddings(self, new_num_tokens: int):
        self.base_model.resize_token_embeddings(new_num_tokens)


In [14]:
# Define the final model
from transformers import AutoConfig

base_config = AutoConfig.from_pretrained("meta-llama/Llama-3.1-8B")

num_labels = 16

# Instantiate the custom model
model_classification = LLMWithClassificationHead(
    base_model=model_shrinked,
    config=base_config,
    num_labels=num_labels
)

AttributeError: can't set attribute 'config'

### Tokenize data

In [None]:
def tokenize_function(examples):
    tokenized = tokenizer(
        examples["input"],
        padding="max_length",
        truncation=True,
        max_length=2048  # Reduced from 8192 to 2048
    )
    return {
        "input_ids": tokenized["input_ids"],
        "attention_mask": tokenized["attention_mask"],
        "labels": examples["output"]
    }


# Keep only the input output no instructions for now
final_dataset = llama_data_dataset.remove_columns(["instruction"])


tokenized_final_dataset = final_dataset.map(tokenize_function, batched=True)

tokenized_final_dataset

### Define Data Collector

class DataCollatorWithLabels:
    def __call__(self, features):
        input_ids = torch.tensor([f["input_ids"] for f in features], dtype=torch.long)
        attention_mask = torch.tensor([f["attention_mask"] for f in features], dtype=torch.long)
        labels = torch.tensor([f["labels"] for f in features], dtype=torch.float32)
        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

data_collator = DataCollatorWithLabels()


## 6. Train

In [15]:
from transformers import Trainer, TrainingArguments

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Extract labels
        labels = inputs.pop("labels")
        
        # Forward pass with additional arguments
        outputs = model(**inputs, labels=labels)
        
        # Extract loss
        loss = outputs["loss"]
        
        # Return loss and outputs if required
        return (loss, outputs) if return_outputs else loss


### Training

In [None]:
# %% [markdown]
# ## 6. Train - Optimized

# %%
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import torch
import gc

# Clear cache before training
gc.collect()
torch.cuda.empty_cache()

# Define TrainingArguments with optimizations
training_args = TrainingArguments(
    output_dir="./JS_finetuned_model",
    evaluation_strategy="no",
    learning_rate=1e-4,
    per_device_train_batch_size=1,  # Reduced batch size
    num_train_epochs=3,  # Increased epochs if feasible
    fp16=True,
    gradient_accumulation_steps=8,  # Increased gradient accumulation
    gradient_checkpointing=True,  # Enable gradient checkpointing
    save_strategy="epoch",
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    report_to="tensorboard",
    # Add any other necessary arguments
)

# Use built-in data collator with dynamic padding
data_collator = DataCollatorWithPadding(tokenizer, padding='longest')

# Initialize Trainer
trainer = CustomTrainer(
    model=model_classification,
    args=training_args,
    train_dataset=tokenized_final_dataset,
    eval_dataset=None,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start Training
trainer.train()

# Clear memory after training
gc.collect()
torch.cuda.empty_cache()


In [None]:
# Instantiate your custom model
model_Classification = LLMWithClassificationHead(base_model=model, num_labels=16)

training_args = TrainingArguments(
    output_dir="./finetuned_model",
    evaluation_strategy="steps",  # Evaluate periodically
    eval_steps=500,              # Evaluate every 500 steps
    learning_rate=1e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,  # Reduced
    num_train_epochs=3,
    save_strategy="epoch",
    save_total_limit=2,
    seed=42,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    logging_dir="./logs",
    logging_steps=50,
    report_to="tensorboard",
    fp16=True,                    # Mixed-precision
    push_to_hub=False,
    ddp_find_unused_parameters=False,
)

# Apply tokenization
tokenized_dataset = llama_data_dataset.map(tokenize_function, batched=True)
tokenized_dataset_testing = llama_data_dataset_testing.map(tokenize_function, batched=True)

data_collator = DataCollatorWithLabels()