In [1]:
from transformers import BartTokenizer, BartForConditionalGeneration

MODEL_NAME = "facebook/bart-base"  # or "facebook/bart-large"
tokenizer = BartTokenizer.from_pretrained(MODEL_NAME)
model = BartForConditionalGeneration.from_pretrained(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [2]:
from google.colab import files
uploaded = files.upload()

Saving test_data_1k.json to test_data_1k.json


In [3]:
from google.colab import files
uploaded = files.upload()

Saving eval_data_1k.json to eval_data_1k.json


In [4]:
from google.colab import files
uploaded = files.upload()

Saving train_data_10k.json to train_data_10k.json


In [5]:
import json
from datasets import Dataset

def load_json_lines(path):
    with open(path, "r") as f:
        return [json.loads(line) for line in f]

train_data = load_json_lines("train_data_10k.json")
eval_data = load_json_lines("eval_data_1k.json")
test_data = load_json_lines("test_data_1k.json")

train_dataset = Dataset.from_list(train_data)
eval_dataset = Dataset.from_list(eval_data)
test_dataset = Dataset.from_list(test_data)

print(f"Train samples: {len(train_dataset)}")
print(f"Eval samples: {len(eval_dataset)}")
print(f"Test samples: {len(test_dataset)}")


Train samples: 10000
Eval samples: 1000
Test samples: 1000


In [6]:
max_input_length = 256
max_target_length = 256

def preprocess_function(examples):
    inputs = examples['PFD']
    targets = examples['PID']

    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding='max_length')

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding='max_length')

    # Replace padding token ID in labels with -100
    labels_ids = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label_seq]
        for label_seq in labels["input_ids"]
    ]

    model_inputs["labels"] = labels_ids
    return model_inputs

In [7]:
train_dataset = train_dataset.map(preprocess_function, batched=True)
eval_dataset = eval_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [8]:
from transformers import Trainer, TrainingArguments, DataCollatorForSeq2Seq
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

training_args = TrainingArguments(
    output_dir="./bart_pid_model",
    eval_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    save_steps=100,
    save_total_limit=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    learning_rate=5e-5,
    num_train_epochs=10,
    weight_decay=0.01,
    report_to="tensorboard",
    fp16=torch.cuda.is_available()
)

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)

  trainer = Trainer(


In [9]:
trainer.train()
trainer.save_model('./bart_pid_model')
tokenizer.save_pretrained('./bart_pid_model')

Step,Training Loss,Validation Loss
100,0.862,0.205338
200,0.2219,0.129072
300,0.1459,0.098834
400,0.1199,0.07949
500,0.0981,0.077349
600,0.084,0.056554
700,0.0776,0.051816
800,0.069,0.049344
900,0.0662,0.047124
1000,0.0596,0.038517




Step,Training Loss,Validation Loss
100,0.862,0.205338
200,0.2219,0.129072
300,0.1459,0.098834
400,0.1199,0.07949
500,0.0981,0.077349
600,0.084,0.056554
700,0.0776,0.051816
800,0.069,0.049344
900,0.0662,0.047124
1000,0.0596,0.038517


('./bart_pid_model/tokenizer_config.json',
 './bart_pid_model/special_tokens_map.json',
 './bart_pid_model/vocab.json',
 './bart_pid_model/merges.txt',
 './bart_pid_model/added_tokens.json')

In [16]:
def pfd_to_pid_bart(pfd_text):
    input_ids = tokenizer.encode(pfd_text, return_tensors="pt", max_length=3000, truncation=True).to(device)
    output_ids = model.generate(input_ids, max_length=2000, num_beams=4, early_stopping=False)
    return tokenizer.decode(output_ids[0], skip_special_tokens=False)

# Example:
sample_index = 0
sample_pfd = test_dataset[sample_index]["PFD"]
print("Input:", sample_pfd)
print("Prediction:", pfd_to_pid_bart(sample_pfd))
print("True PID:", test_dataset[sample_index]["PID"])

Input: (raw)(pp)(v)(v)(mix)<&|(raw)(hex)(v)&|(mix)<&|(raw)(v)&|(pp)(v)(mix)<1(r)(v)(splt)[(prod)](v)1
Prediction: </s><s>(raw)(hex)(C){TC}_1(C){FC}_2(v)&<_2|(mix)<1(r)<_3<&|(raw(pp)[(C)M}](C){PI}(C(){FC}}_3(v]<_4(C {FC}&_5|[(C)[TC}][(C)(LC}_6][{tout}(raw)](v)(splt)[(prod)](CZI}_7<_7(v)]1<_8</s>
True PID: (raw)(hex)<_1(C){TC}_1(C){FC}_2(v)<_2(mix)<&|(raw)(pp)[(C){M}](C){PI}(C){FC}_3(v)<_3(C){FC}_4(v)&<_4|(mix)<&|(raw)(C){FC}_5(v)&<_5|(pp)[(C){M}](C){PI}(C){FC}_6(v)<_6(mix)<1(r)<_7[(C){TC}_7][(C){LC}_8](v)<_8(splt)[(prod)](C){FC}_9(v)1<_9
