In [None]:
# Install necessary libraries (only run once per environment)
!pip install -q transformers datasets accelerate peft


In [39]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from sklearn.model_selection import train_test_split

# --- Load and Prepare Data ---
DATA_PATH = "/Users/dhritichandan/Downloads/Training Data Public Upload/train_data_10k.json"
df = pd.read_json(DATA_PATH, lines=True)
df = df.rename(columns={"PFD": "text", "PID": "label"})

# Clean labels
def clean_label(text):
    return text.replace("(spltt)", "(splt)")

df["label"] = df["label"].apply(clean_label)

# Train-validation split
train_df, val_df = train_test_split(df, test_size=0.01, random_state=42)
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df.reset_index(drop=True))

# --- Tokenizer & Model ---
model_ckpt = "google/byt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

from datasets import DatasetDict

# --- Tokenize function with explicit return_tensors and safe label padding ---
def tokenize_fn(example):
    model_inputs = tokenizer(
        example["text"], max_length=512, padding="max_length", truncation=True
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example["label"], max_length=512, padding="max_length", truncation=True
        )

    # Replace padding token id's in labels with -100 to ignore them in loss
    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Map with `remove_columns` to avoid passing unnecessary keys to the model
tokenized_train = train_dataset.map(tokenize_fn, batched=True, remove_columns=train_dataset.column_names)
tokenized_val = val_dataset.map(tokenize_fn, batched=True, remove_columns=val_dataset.column_names)

# Set format for PyTorch tensors
tokenized_train.set_format("torch")
tokenized_val.set_format("torch")

tokenized_train = train_dataset.map(tokenize_fn, batched=True, batch_size=16)
tokenized_val = val_dataset.map(tokenize_fn, batched=True, batch_size=16)

# --- Data Collator ---
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# --- Training Arguments ---
training_args = Seq2SeqTrainingArguments(
    output_dir="./byt5-pfd-pid",
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=5,
    weight_decay=0.01,
    predict_with_generate=True,
    generation_max_length=512,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=2,
    fp16=torch.cuda.is_available()  # Use mixed precision if GPU available
)

# --- Trainer ---
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    data_collator=collator
)


Map:   0%|          | 0/9900 [00:00<?, ? examples/s]



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/9900 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  trainer = Seq2SeqTrainer(


In [38]:
# Check for any empty input_ids, attention_mask, or labels
def check_dataset_for_issues(dataset, name="dataset"):
    for i, sample in enumerate(dataset):
        if any(len(sample[k]) == 0 for k in ["input_ids", "attention_mask", "labels"]):
            print(f"❌ Zero-dim tensor at index {i} in {name}")
        elif any(sample[k]() == 0 for k in ["input_ids", "attention_mask", "labels"]):
            print(f"❌ Empty array at index {i} in {name}")
    print(f"✅ {name} check complete")

print(check_dataset_for_issues(tokenized_train, "train"))
print(check_dataset_for_issues(tokenized_val, "val"))

TypeError: 'list' object is not callable

In [40]:
def check_dataset_for_empty_inputs(dataset):
    for i, sample in enumerate(dataset):
        if "input_ids" not in sample or "attention_mask" not in sample or "labels" not in sample:
            print(f"Missing keys at index {i}: {sample.keys()}")
        elif (len(sample["input_ids"]) == 0 or
              len(sample["attention_mask"]) == 0 or
              len(sample["labels"]) == 0):
            print(f"Empty arrays at index {i}")
    print("Check complete.")

# Run the check
print(check_dataset_for_empty_inputs(tokenized_train))
print(check_dataset_for_empty_inputs(tokenized_val))

Check complete.
None
Check complete.
None


In [28]:
print(tokenized_train[0])
print(tokenized_train[10])
print(tokenized_train[100])

{'text': '(raw)(v)(tank)(pp)(v)(v)(r)<&|(raw)(v)(mix)<&|(raw)(v)&|(hex){1}(hex){2}&|[{tout}(v)(prod)]{bout}(v)(hex){3}(v)(hex){4}(rect)<1<2[{tout}(cond)(sep)[(v)(prod)](splt)[(v)(prod)](v)1]{bout}(splt)[(v)(prod)](hex){5}2n|(raw)(splt)[(hex){1}(mix)<3(prod)](v)3n|(raw)(splt)[(hex){2}(mix)<4(prod)](v)4n|(raw)(splt)[(hex){3}(mix)<5(prod)](v)5n|(raw)(v)(hex){4}(prod)n|(raw)(v)(hex){5}(prod)', 'label': '(raw)(C){FC}_1(v)<_1(mix)<&|(raw)(C){FC}_2(v)&<_2|(hex){1}(C){TC}_3(hex){2}(C){TC}_4(r)<_8<&|(raw)(v)<_5(tank)[(C){LC}_5](pp)[(C){M}](C){PI}(C){FC}_6(v)<_6(C){FC}_7(v)&<_7|[(C){TC}_8][(C){LC}_9][{tout}(C){PC}_10(v)<_10(prod)]{bout}(v)<_9(hex){3}(C){TC}_11(C){FC}_12(v)<_12(hex){4}(C){TC}_13(rect)<1<2[(C){PC}_14][(C){LC}_15][{tout}(cond)(sep)[(C){LC}_16][(v)<_14(prod)](splt)[(v)<_16(prod)](C){FC}_17(v)1<_17]{bout}(splt)[(C){FC}_18(v)<_18(prod)](hex){5}2n|(raw)(splt)[(hex){1}(mix)<3(prod)](v)3<_3n|(raw)(splt)[(hex){2}(mix)<4(prod)](v)4<_4n|(raw)(splt)[(hex){3}(mix)<5(prod)](v)5<_11n|(raw)(v)<_

In [41]:

# ---- Start Training ---
trainer.train()

# --- Evaluate ---
predictions, labels, _ = trainer.predict(tokenized_val)
decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

# --- Sample Output ---
print("\nSample predictions vs labels:")
for pred, label in zip(decoded_preds[:5], decoded_labels[:5]):
    print(f"Prediction:\n{pred}\n")
    print(f"Label:\n{label}\n")
    print("-" * 40)

ValueError: zero-dimensional arrays cannot be concatenated

In [2]:
DATA_PATH = "/Users/dhritichandan/Downloads/Training Data Public Upload/train_data_10k.json"

df = pd.read_json(DATA_PATH, lines=True)

print(f"Number of samples: {len(df)}")
print("Sample data:")
print(df.head(2))


NameError: name 'pd' is not defined

In [None]:
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
import torch

DATA_PATH = "/Users/dhritichandan/Downloads/Training Data Public Upload/train_data_10k.json"
df = pd.read_json(DATA_PATH, lines=True)

df = df.rename(columns={"PFD": "text", "PID": "label"})

dataset = Dataset.from_pandas(df)

model_ckpt = "google/byt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

max_length = 512

 
def tokenize_fn(batch):
    inputs = tokenizer(batch["text"], max_length=max_length, padding="max_length", truncation=True)
    labels = tokenizer(batch["label"], max_length=max_length, padding="max_length", truncation=True)

    new_labels = []
    for label in labels["input_ids"]:
        new_label = [token if token != tokenizer.pad_token_id else -100 for token in label]
        new_labels.append(new_label)
    inputs["labels"] = new_labels
    return inputs

tokenized_dataset = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text", "label"], 
)

print("Columns after tokenization:", tokenized_dataset.column_names)

train_test = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test["train"]
eval_dataset = train_test["test"]

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir="./byt5-pfd-pid",
    eval_strategy="steps",
    save_strategy="steps",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    learning_rate=5e-5,
    weight_decay=0.01,
    predict_with_generate=True,
    logging_dir="./logs",
    eval_steps=500,
    save_steps=500,
    logging_steps=200,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)




Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Columns after tokenization: ['input_ids', 'attention_mask', 'labels']


  trainer = Seq2SeqTrainer(


In [None]:

trainer.train()

model.save_pretrained("./byt5-pfd-pid")
tokenizer.save_pretrained("./byt5-pfd-pid")

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss,Validation Loss
500,0.3321,0.140744
1000,0.159,0.094095
1500,0.1326,0.075844
2000,0.1058,0.068132
2500,0.0989,0.060561
3000,0.084,0.054156
3500,0.0788,0.05076
4000,0.075,0.047932
4500,0.0739,0.044923
5000,0.0673,0.043966


('./byt5-pfd-pid/tokenizer_config.json',
 './byt5-pfd-pid/special_tokens_map.json',
 './byt5-pfd-pid/added_tokens.json')

In [16]:
from transformers import Text2TextGenerationPipeline

pipeline = Text2TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1  
)

sample_inputs = [df["text"].iloc[i] for i in range(5)]  
max_new_tokens=1000
for i, input_text in enumerate(sample_inputs):
    print(f"\nPFD Input {i+1}:\n{input_text}\n")
    
    outputs = pipeline(
        input_text,
        max_length=max_new_tokens,
        num_beams=5,
        num_return_sequences=5,
        early_stopping=True
    )

    for j, output in enumerate(outputs):
        print(f" Prediction {chr(65+j)}:\n{output['generated_text']}\n")
        print(f" Ground Truth PID:\n{df['label'].iloc[i]}\n")

Device set to use mps:0



PFD Input 1:
(raw)(hex){1}(hex){2}(mix)<2(r)[{tout}(v)(prod)]{bout}(v)(splt)[(hex){2}(hex){3}(pp)(v)(mix)<1(r)[{bout}(v)(prod)]{tout}(v)(splt)[(hex){4}(r)[{tout}(v)(prod)]{bout}(v)(hex){4}(prod)](v)1](v)2n|(raw)(hex){1}(v)(prod)n|(raw)(hex){3}(v)(prod)



Both `max_new_tokens` (=256) and `max_length`(=1000) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=1000) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


 Prediction A:
(raw)(hex){1}(C){TC}_1(hex){2}(mix)<2(r)<_2[(C){TC}_2][(C){LC}_3][{tout}(C){PC}_4(v)<_4(prod)]{bout}(v)<_3(splt)[(hex){2}(hex){3}(C){TC}_5(pp)[(C){M}](C){PI}(C){FC}_6(v)<_6(mix)<1(r)<_7[(C){TC}_7][(C){LC}_8][{bout}(v)<_8(prod)]{tout}(C){PC}_9(v)<_9(splt)[(

 Ground Truth PID:
(raw)(hex){1}(C){TC}_1(hex){2}(mix)<2(r)<_2[(C){TC}_2][(C){LC}_3][{tout}(C){PC}_4(v)<_4(prod)]{bout}(v)<_3(splt)[(hex){2}(hex){3}(C){TC}_5(pp)[(C){M}](C){PI}(C){FC}_6(v)<_6(mix)<1(r)<_7[(C){TC}_7][(C){LC}_8][{bout}(v)<_8(prod)]{tout}(C){PC}_9(v)<_9(splt)[(C){FC}_10(v)1<_10](hex){4}(r)<_11[(C){TC}_11][(C){LC}_12][{tout}(C){PC}_13(v)<_13(prod)]{bout}(v)<_12(hex){4}(prod)](C){FC}_14(v)2<_14n|(raw)(hex){1}(v)<_1(prod)n|(raw)(hex){3}(v)<_5(prod)

 Prediction B:
(raw)(hex){1}(C){TC}_1(hex){2}(mix)<2(r)<_2[(C){TC}_2][(C){LC}_3][{tout}(C){PC}_4(v)<_4(prod)]{bout}(v)<_3(splt)[(hex){2}(hex){3}(C){TC}_5(pp)[(C){M}](C){PI}(C){FC}_6(v)<_6(mix)<1(r)[(C){TI}][(C){LC}_7][{bout}(v)<_7(prod)]{tout}(C){PC}_8(v)<_8(spl

Both `max_new_tokens` (=256) and `max_length`(=1000) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


 Prediction A:
(raw)(hex){1}(C){TC}_1(comp)[(C){M}<_2](C){PC}_2(r)<_5<&|(raw)(pp)[(C){M}](C){PI}(C){FC}_3(v)<_3(C){FC}_4(v)&<_4|[(C){TC}_5][(C){LC}_6][{tout}(C){PC}_7(v)<_7(prod)]{bout}(v)<_6(hex){2}(C){TC}_8(prod)n|(raw)(C){FC}_9<_1(v)<_9(hex){1}(prod)n|(raw)(hex){2}(v)

 Ground Truth PID:
(raw)(hex){1}(C){TC}_1(comp)[(C){M}<_2](C){PC}_2(r)<_5<&|(raw)(pp)[(C){M}](C){PI}(C){FC}_3(v)<_3(C){FC}_4(v)&<_4|[(C){TC}_5][(C){LC}_6][{tout}(C){PC}_7(v)<_7(prod)]{bout}(v)<_6(hex){2}(C){TC}_8(prod)n|(raw)(C){FC}_9<_1(v)<_9(hex){1}(prod)n|(raw)(hex){2}(v)<_8(prod)

 Prediction B:
(raw)(hex){1}(C){TC}_1(comp)[(C){M}<_2](C){PC}_2(r)<_8<&|(raw)(pp)[(C){M}](C){PI}(C){FC}_3(v)<_3(C){FC}_4(v)&<_4|[(C){TC}_5][(C){LC}_6][{tout}(C){PC}_7(v)<_7(prod)]{bout}(v)<_6(hex){2}(C){TC}_8(prod)n|(raw)(C){FC}_9<_1(v)<_9(hex){1}(prod)n|(raw)(hex){2}(v)

 Ground Truth PID:
(raw)(hex){1}(C){TC}_1(comp)[(C){M}<_2](C){PC}_2(r)<_5<&|(raw)(pp)[(C){M}](C){PI}(C){FC}_3(v)<_3(C){FC}_4(v)&<_4|[(C){TC}_5][(C){LC}_6][{tout}(C){PC}

Both `max_new_tokens` (=256) and `max_length`(=1000) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


 Prediction A:
(raw)(C){FC}_1(v)<_1(hex){1}(C){TC}_2(rect)<1<4[(C){PC}_3][(C){LC}_4][{tout}(cond)(sep)[(C){LC}_5][(v)<_3(prod)](splt)[(v)<_5(hex){2}(C){TC}_6(prod)](C){FC}_7(v)1<_7]{bout}(splt)[(C){FC}_8(v)<_8(C){FC}_9(v)<_9(hex){3}(C){TC}_10(rect)<2<3[(C){PC}_11][(C){LC

 Ground Truth PID:
(raw)(C){FC}_1(v)<_1(hex){1}(C){TC}_2(rect)<1<4[(C){PC}_3][(C){LC}_4][(C){TC}_5][{tout}(cond)(sep)[(C){LC}_6][(v)<_3(prod)](splt)[(v)<_6(hex){2}(C){TC}_7(prod)](C){FC}_8(v)1<_8]{bout}(splt)[(C){FC}_9<_5(v)<_9(C){FC}_10(v)<_10(hex){3}(C){TC}_11(rect)<2<3[(C){PC}_12][(C){LC}_13][{tout}(cond)(sep)[(C){LC}_14][(v)<_12(prod)](splt)[(C){FC}_15(v)<_15(hex){4}(C){TC}_16(prod)](v)2<_14]{bout}(splt)[(v)<_13(prod)](hex){5}3](hex){6}4n|(raw)(splt)[(hex){2}(mix)<5(prod)](v)5<_7n|(raw)(splt)[(hex){4}(mix)<6(prod)](v)6<_16n|(raw)(C){FC}_17(v)<_17(hex){5}(prod)n|(raw)(v)<_2(hex){1}(prod)n|(raw)(v)<_4(hex){6}(prod)n|(raw)(v)<_11(hex){3}(prod)

 Prediction B:
(raw)(C){FC}_1(v)<_1(hex){1}(C){TC}_2(rect)<1<4[(C){PC}_3]

Both `max_new_tokens` (=256) and `max_length`(=1000) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


 Prediction A:
(raw)(comp)[(C){M}<_1](C){PC}_1(C){FC}_2(v)<_2(r)<&|(raw)(comp)[(C){M}<_3](C){PC}_3(C){FC}_4(v)&<_4|[(C){TI}][(C){LC}_5][{bout}(v)<_5(prod)]{tout}(C){PC}_6(v)<_6(pp)[(C){M}](C){PI}(C){FC}_7(v)<_7(prod)

 Ground Truth PID:
(raw)(comp)[(C){M}<_1](C){PC}_1(C){FC}_2(v)<_2(r)<&|(raw)(hex)<_3(C){TC}&_3|[(C){TI}][(C){LC}_4][{bout}(v)<_4(prod)]{tout}(C){PC}_5(v)<_5(pp)[(C){M}](C){PI}(C){FC}_6(v)<_6(prod)

 Prediction B:
(raw)(hex)<_1(C){TC}_1(r)<_5<&|(raw)(comp)[(C){M}<_2](C){PC}_2(C){FC}_3(v)&<_3|[(C){TI}][(C){LC}_4][{bout}(v)<_4(prod)]{tout}(C){PC}_5(v)<_5(pp)[(C){M}](C){PI}(C){FC}_6(v)<_6(prod)

 Ground Truth PID:
(raw)(comp)[(C){M}<_1](C){PC}_1(C){FC}_2(v)<_2(r)<&|(raw)(hex)<_3(C){TC}&_3|[(C){TI}][(C){LC}_4][{bout}(v)<_4(prod)]{tout}(C){PC}_5(v)<_5(pp)[(C){M}](C){PI}(C){FC}_6(v)<_6(prod)

 Prediction C:
(raw)(hex)<_1(C){TC}_1(r)<_5<&|(raw)(comp)[(C){M}<_2](C){PC}_2(C){FC}_3(v)&<_3|[(C){TC}_4][(C){LC}_5][{bout}(v)<_5(prod)]{tout}(C){PC}_6(v)<_6(pp)[(C){M}](C){PI}(C){FC}_7(v)<

In [7]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Text2TextGenerationPipeline
import pandas as pd
from datasets import Dataset


DATA_PATH = "/Users/dhritichandan/Downloads/Training Data Public Upload/train_data_10k.json"
df = pd.read_json(DATA_PATH, lines=True)

df = df.rename(columns={"PFD": "text", "PID": "label"})

dataset = Dataset.from_pandas(df)

model_ckpt = "google/byt5-small"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)

max_length = 512

 
def tokenize_fn(batch):
    inputs = tokenizer(batch["text"], max_length=max_length, padding="max_length", truncation=True)
    labels = tokenizer(batch["label"], max_length=max_length, padding="max_length", truncation=True)

    new_labels = []
    for label in labels["input_ids"]:
        new_label = [token if token != tokenizer.pad_token_id else -100 for token in label]
        new_labels.append(new_label)
    inputs["labels"] = new_labels
    return inputs

tokenized_dataset = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text", "label"], 
)

print("Columns after tokenization:", tokenized_dataset.column_names)

train_test = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test["train"]
eval_dataset = train_test["test"]



model_dir = "./byt5-pfd-pid" 

model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

pipeline = Text2TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

sample_inputs = [df["text"].iloc[i] for i in range(5)]  


for i, input_text in enumerate(sample_inputs):
    print(f"\n PFD Input {i+1}:\n{input_text}\n")

    outputs = pipeline(
        input_text,
        max_new_tokens=1000,
        num_beams=5,
        num_return_sequences=5,
        early_stopping=True
    )

    for j, output in enumerate(outputs):
        print(f" Prediction {chr(65+j)}:\n{output['generated_text']}\n")

    print(f" Ground Truth PID:\n{df['label'].iloc[i]}\n{'='*60}")

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Device set to use cpu


Columns after tokenization: ['input_ids', 'attention_mask', 'labels']

 PFD Input 1:
(raw)(hex){1}(hex){2}(mix)<2(r)[{tout}(v)(prod)]{bout}(v)(splt)[(hex){2}(hex){3}(pp)(v)(mix)<1(r)[{bout}(v)(prod)]{tout}(v)(splt)[(hex){4}(r)[{tout}(v)(prod)]{bout}(v)(hex){4}(prod)](v)1](v)2n|(raw)(hex){1}(v)(prod)n|(raw)(hex){3}(v)(prod)

 Prediction A:
(raw)(hex){1}(C){TC}_1(hex){2}(mix)<2(r)<_2[(C){TC}_2][(C){LC}_3][{tout}(C){PC}_4(v)<_4(prod)]{bout}(v)<_3(splt)[(hex){2}(hex){3}(C){TC}_5(pp)[(C){M}](C){PI}(C){FC}_6(v)<_6(mix)<1(r)<_7[(C){TC}_7][(C){LC}_8][{bout}(v)<_8(prod)]{tout}(C){PC}_9(v)<_9(splt)[(hex){4}(r)<_10[(C){TC}_10][(C){LC}_11][{tout}(C){PC}_12(v)<_12(prod)]{bout}(v)<_10(hex){4}(prod)](C){FC}_13(v)1<_13](C){FC}_14(v)2<_14n|(raw)(hex){1}(v)<_1(prod)n|(raw)(hex){3}(v)<_14(prod)

 Prediction B:
(raw)(hex){1}(C){TC}_1(hex){2}(mix)<2(r)<_2[(C){TC}_2][(C){LC}_3][{tout}(C){PC}_4(v)<_4(prod)]{bout}(v)<_3(splt)[(hex){2}(hex){3}(C){TC}_5(pp)[(C){M}](C){PI}(C){FC}_6(v)<_6(mix)<1(r)<_7[(C){TC}_7][

In [22]:
sample_input = "(raw)(pp)(v)(v)(mix)<&|(raw)(hex)(v)&|(mix)<&|(raw)(v)&|(pp)(v)(mix)<1(r)(v)(splt)[(prod)](v)1"

print(f"\nPFD Input:\n{sample_input}\n")

outputs = pipeline(
    sample_input,
    max_new_tokens=700,     # use max_new_tokens, not max_length
    num_beams=5,
    num_return_sequences=5,
    early_stopping=True
)

for j, output in enumerate(outputs):
    print(f"Prediction {chr(65+j)}:\n{output['generated_text']}\n")
    # If you want to print ground truth for that input, you can do so here:
    # print(f"Ground Truth PID:\n{df['label'].iloc[0]}\n")


PFD Input:
(raw)(pp)(v)(v)(mix)<&|(raw)(hex)(v)&|(mix)<&|(raw)(v)&|(pp)(v)(mix)<1(r)(v)(splt)[(prod)](v)1

Prediction A:
(raw)(hex)<_1(C){TC}_1(C){FC}_2(v)<_2(mix)<&|(raw)(hex)<_3(C){TC}_3(C){FC}_4(v)&<_4|(mix)<&|(raw)(C){FC}_5(v)&<_5|(pp)[(C){M}](C){PI}(C){FC}_6(v)<_6(mix)<1(r)<_7[(C){TC}_7][(C){LC}_8](v)<_8(splt)[(prod)](C){FC}_9(v)1<_9

Prediction B:
(raw)(hex)<_1(C){TC}_1(C){FC}_2(v)<_2(mix)<&|(raw)(C){FC}_3(v)&<_3|(mix)<&|(raw)(C){FC}_4(v)&<_4|(pp)[(C){M}](C){PI}(C){FC}_5(v)<_5(mix)<1(r)<_6[(C){TC}_6][(C){LC}_7](v)<_7(splt)[(prod)](C){FC}_8(v)1<_8

Prediction C:
(raw)(hex)<_1(C){TC}_1(C){FC}_2(v)<_2(mix)<&|(raw)(pp)[(C){M}](C){PI}(C){FC}_3(v)<_3(C){FC}_4(v)<_4(mix)<&|(raw)(C){FC}_5(v)&<_5|(pp)[(C){M}](C){PI}(C){FC}_6(v)<_6(mix)<1(r)<_7[(C){TC}_7][(C){LC}_8](v)<_8(splt)[(prod)](C){FC}_9(v)1<_9

Prediction D:
(raw)(hex)<_1(C){TC}_1(C){FC}_2(v)<_2(mix)<&|(raw)(hex)<_3(C){TC}_3(C){FC}_4(v)&<_4|(mix)<&|(raw)(C){FC}_5(v)&<_5|(pp)[(C){M}](C){PI}(C){FC}_6(v)<_6(mix)<1(r)<_7[(C){TC}_7][(C

✅ Ground Truth PID:
(raw)(comp)[(C){M}<_1](C){PC}_1(C){FC}_2(v)<_2(r)<&|(raw)(hex)<_3(C){TC}&_3|[(C){TI}][(C){LC}_4][{bout}(v)<_4(prod)]{tout}(C){PC}_5(v)<_5(pp)[(C){M}](C){PI}(C){FC}_6(v)<_6(prod)



In [46]:
for i in range(5):
    print(f"Sample {i}")
    print("input_ids length:", len(tokenized_train[i]['input_ids']))
    print("labels length:", len(tokenized_train[i]['labels']))
    print("input_ids type:", type(tokenized_train[i]['input_ids']))
    print("labels type:", type(tokenized_train[i]['labels']))

Sample 0
input_ids length: 512
labels length: 512
input_ids type: <class 'list'>
labels type: <class 'list'>
Sample 1
input_ids length: 512
labels length: 512
input_ids type: <class 'list'>
labels type: <class 'list'>
Sample 2
input_ids length: 512
labels length: 512
input_ids type: <class 'list'>
labels type: <class 'list'>
Sample 3
input_ids length: 512
labels length: 512
input_ids type: <class 'list'>
labels type: <class 'list'>
Sample 4
input_ids length: 512
labels length: 512
input_ids type: <class 'list'>
labels type: <class 'list'>


In [47]:
small_train = tokenized_train.select(range(10))
small_val = tokenized_val.select(range(5))

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=small_train,
    eval_dataset=small_val,
    tokenizer=tokenizer,
    data_collator=collator
)

trainer.train()

  trainer = Seq2SeqTrainer(


ValueError: zero-dimensional arrays cannot be concatenated