In [None]:
!pip install -q transformers datasets sentencepiece accelerate tensorboard


In [None]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
import torch

In [None]:

DATA_PATH = "/Users/dhritichandan/Downloads/Training Data Public Upload/train_data_10k.json"

df = pd.read_json(DATA_PATH, lines=True)

print(f"Number of samples: {len(df)}")
print("Sample data:")
print(df.head(2))


Number of samples: 10000
Sample data:
                                                 PFD  \
0  (raw)(hex){1}(hex){2}(mix)<2(r)[{tout}(v)(prod...   
1  (raw)(v)(tank)(pp)(v)(r)<1[{bout}(v)(prod)]{to...   

                                                 PID  
0  (raw)(hex){1}(C){TC}_1(hex){2}(mix)<2(r)<_2[(C...  
1  (raw)(v)<_1(tank)[(C){LC}_1](pp)[(C){M}<_2](C)...  


In [10]:
from datasets import Dataset

dataset = Dataset.from_pandas(df)


In [None]:


MODEL_NAME = "t5-small"
MAX_LENGTH = 256

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:

def preprocess_function(examples):
    inputs = [f"assign tag: {pfd}" for pfd in examples["PFD"]]
    targets = examples["PID"]
    
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length"
    )
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=MAX_LENGTH,
            truncation=True,
            padding="max_length"
        )
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



In [None]:

split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = split["train"]
eval_dataset = split["test"]

print(f"Train samples: {len(train_dataset)}, Eval samples: {len(eval_dataset)}")

model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

Train samples: 8000, Eval samples: 2000


T5ForConditionalGeneration(
  (shared): Embedding(32128, 512)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 512)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=512, out_features=512, bias=False)
              (k): Linear(in_features=512, out_features=512, bias=False)
              (v): Linear(in_features=512, out_features=512, bias=False)
              (o): Linear(in_features=512, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 8)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=512, out_features=2048, bias=False)
              (wo): Linear(in_features=2048, out_features=512, bias=False)
              (dropout): Drop

In [None]:

training_args = TrainingArguments(
    output_dir="./t5_pid_model",
    evaluation_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    save_steps=500,
    save_total_limit=2,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    report_to="none",  # Change to "tensorboard" if you want logs
    fp16=torch.cuda.is_available(),
)



In [15]:
#Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

trainer.train()

tokenizer.save_pretrained("./t5_pid_model")
model.save_pretrained("./t5_pid_model")


  0%|          | 0/1500 [00:00<?, ?it/s]

{'loss': 1.7464, 'grad_norm': 1.263898253440857, 'learning_rate': 4.666666666666667e-05, 'epoch': 0.2}
{'loss': 0.6815, 'grad_norm': 0.9688755869865417, 'learning_rate': 4.3333333333333334e-05, 'epoch': 0.4}
{'loss': 0.5145, 'grad_norm': 0.8053062558174133, 'learning_rate': 4e-05, 'epoch': 0.6}
{'loss': 0.4406, 'grad_norm': 0.5750955939292908, 'learning_rate': 3.6666666666666666e-05, 'epoch': 0.8}
{'loss': 0.3789, 'grad_norm': 0.9472246170043945, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.24132679402828217, 'eval_runtime': 33.9966, 'eval_samples_per_second': 58.829, 'eval_steps_per_second': 3.677, 'epoch': 1.0}
{'loss': 0.3405, 'grad_norm': 0.7601360082626343, 'learning_rate': 3e-05, 'epoch': 1.2}
{'loss': 0.3135, 'grad_norm': 0.4805811643600464, 'learning_rate': 2.6666666666666667e-05, 'epoch': 1.4}
{'loss': 0.2941, 'grad_norm': 0.6050412654876709, 'learning_rate': 2.3333333333333336e-05, 'epoch': 1.6}
{'loss': 0.2823, 'grad_norm': 0.501400887966156, 'learning_rate': 2e-05, 'epoch': 1.8}
{'loss': 0.2674, 'grad_norm': 0.5542858242988586, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.17346252501010895, 'eval_runtime': 33.6129, 'eval_samples_per_second': 59.501, 'eval_steps_per_second': 3.719, 'epoch': 2.0}
{'loss': 0.2586, 'grad_norm': 1.0297034978866577, 'learning_rate': 1.3333333333333333e-05, 'epoch': 2.2}
{'loss': 0.2505, 'grad_norm': 0.4768562614917755, 'learning_rate': 1e-05, 'epoch': 2.4}
{'loss': 0.2478, 'grad_norm': 0.679543137550354, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.6}
{'loss': 0.2435, 'grad_norm': 0.5156378149986267, 'learning_rate': 3.3333333333333333e-06, 'epoch': 2.8}
{'loss': 0.2421, 'grad_norm': 0.5669794678688049, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.15800225734710693, 'eval_runtime': 35.0827, 'eval_samples_per_second': 57.008, 'eval_steps_per_second': 3.563, 'epoch': 3.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


{'train_runtime': 1570.1296, 'train_samples_per_second': 15.285, 'train_steps_per_second': 0.955, 'train_loss': 0.43348087565104165, 'epoch': 3.0}


In [18]:
import traceback

device = torch.device("cpu")  # Force CPU to avoid MPS issues
model.to(device)

def pfd_to_pid(pfd_text):
    input_text = f"assign tag: {pfd_text}"
    inputs = tokenizer.encode(input_text, return_tensors="pt", max_length=MAX_LENGTH, truncation=True, padding="max_length")
    inputs = inputs.to(device)
    
    with torch.no_grad():
        outputs = model.generate(inputs, max_length=MAX_LENGTH, num_beams=5, early_stopping=True)
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print("\n=== Inference test ===")
sample_pfd = df.loc[0, "PFD"]
print("Input PFD:", sample_pfd)
print("Predicted PID:", pfd_to_pid(sample_pfd))
print("True PID:", df.loc[0, "PID"])


=== Inference test ===
Input PFD: (raw)(hex){1}(hex){2}(mix)<2(r)[{tout}(v)(prod)]{bout}(v)(splt)[(hex){2}(hex){3}(pp)(v)(mix)<1(r)[{bout}(v)(prod)]{tout}(v)(splt)[(hex){4}(r)[{tout}(v)(prod)]{bout}(v)(hex){4}(prod)](v)1](v)2n|(raw)(hex){1}(v)(prod)n|(raw)(hex){3}(v)(prod)
Predicted PID: (raw)(hex)1(C)TC_1(hex)2(C)TC_2(mix)2(r)_3[(C)TC_3][(C)LC_4][tout(C)PC_5(v)_5(prod)]bout(v)_5(splt)[(hex)2(hex)3(C)TC_7(pp)[(C)M](C)PI(C)FC_8(v)_8(mix)1(r)_9[(C)TC_9][(C)LC_10][bout(v)_10(prod)]tout(C)PC_11(v)_11(splt)[(hex)2(C)TC_12(pp)[(C)M
True PID: (raw)(hex){1}(C){TC}_1(hex){2}(mix)<2(r)<_2[(C){TC}_2][(C){LC}_3][{tout}(C){PC}_4(v)<_4(prod)]{bout}(v)<_3(splt)[(hex){2}(hex){3}(C){TC}_5(pp)[(C){M}](C){PI}(C){FC}_6(v)<_6(mix)<1(r)<_7[(C){TC}_7][(C){LC}_8][{bout}(v)<_8(prod)]{tout}(C){PC}_9(v)<_9(splt)[(C){FC}_10(v)1<_10](hex){4}(r)<_11[(C){TC}_11][(C){LC}_12][{tout}(C){PC}_13(v)<_13(prod)]{bout}(v)<_12(hex){4}(prod)](C){FC}_14(v)2<_14n|(raw)(hex){1}(v)<_1(prod)n|(raw)(hex){3}(v)<_5(prod)
