In [1]:
#!pip install -q transformers datasets sentencepiece accelerate tensorboard

from google.colab import files
uploaded = files.upload()



Saving eval_data_1k.json to eval_data_1k.json


In [2]:
from google.colab import files
uploaded = files.upload()

Saving test_data_1k.json to test_data_1k.json


In [3]:
from google.colab import files
uploaded = files.upload()

Saving train_data_10k.json to train_data_10k.json


In [5]:

import json
from datasets import Dataset

def load_json_lines(path):
    with open(path, "r") as f:
        return [json.loads(line) for line in f]

train_data = load_json_lines("train_data_10k.json")
eval_data = load_json_lines("eval_data_1k.json")
test_data = load_json_lines("test_data_1k.json")

train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(eval_data)
test_dataset = Dataset.from_list(test_data)

print(f"Train samples: {len(train_dataset)}")
print(f"Eval samples: {len(eval_dataset)}")
print(f"Test samples: {len(test_dataset)}")

from transformers import T5Tokenizer

MODEL_NAME = "t5-small"
MAX_INPUT_LENGTH = 256
MAX_TARGET_LENGTH = 256

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    inputs = [f"convert pfd to pid: {' '.join(list(pfd))}" for pfd in examples["PFD"]]
    targets = [' '.join(list(pid)) for pid in examples["PID"]]

    model_inputs = tokenizer(
        inputs,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=MAX_TARGET_LENGTH,
            truncation=True,
            padding="max_length"
        )

    model_inputs["labels"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
    ]

    return model_inputs

tokenized_train = train_dataset.map(preprocess_function, batched=True, remove_columns=train_dataset.column_names)
tokenized_eval = eval_dataset.map(preprocess_function, batched=True, remove_columns=eval_dataset.column_names)

from transformers import T5ForConditionalGeneration, TrainingArguments, Trainer
import torch

Train samples: 10000
Eval samples: 1000
Test samples: 1000


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:

model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

training_args = TrainingArguments(
    output_dir="./t5_pid_model",
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=100,
    save_steps=200,
    save_total_limit=2,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="tensorboard",
    fp16=torch.cuda.is_available(),
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
)

trainer.train()


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [14]:


tokenizer.save_pretrained("./t5_pid_model")
model.save_pretrained("./t5_pid_model")

def pfd_to_pid(pfd_text):
    input_text = f"convert pfd to pid: {pfd_text}"
    inputs = tokenizer.encode(
        input_text,
        return_tensors="pt",
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
        padding="max_length"
    ).to(device)

    outputs = model.generate(
        inputs,
        max_length=50000,
        num_beams=4,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)

import pandas as pd
df_test = pd.read_json("test_data_1k.json", lines=True)

sample_pfd = df_test.loc[3, "PFD"]
print("\n=== Inference Test ===")
print("Input PFD:", sample_pfd)
print("Predicted PID:", pfd_to_pid(sample_pfd))
print("True PID:", df_test.loc[3, "PID"])


=== Inference Test ===
Input PFD: (raw)(hex){1}(v)(mix)<&|(raw)(v)(mix)&<&|(raw)&||(r)[{bout}(v)(prod)]{tout}(v)(pp)(v)(prod)n|(raw)(splt)[(hex){1}(mix)<1(prod)](v)1
Predicted PID: (raw)(hex)1(v)(mix)&|(raw)[(raw)(splt)[(raw)(splt)[bout(v)1(mix)&||(raw)tout(v)(prod)][(raw)[tout(v)(v)1(mix)&|(raw)(v)tout(v)
True PID: (raw)(hex){1}(C){TC}_1(C){FC}_2(v)<_2(mix)<&|(raw)(C){FFC}_3<_4(v)<_3(mix)&<&|(raw)(C){FT}&_4||(r)[(C){TI}][(C){LC}_5][{bout}(v)<_5(prod)]{tout}(C){PC}_6(v)<_6(pp)[(C){M}](C){PI}(C){FC}_7(v)<_7(prod)n|(raw)(splt)[(hex){1}(mix)<1(prod)](v)1<_1
