In [1]:
from datasets import load_dataset

dataset = load_dataset("jiacheng-ye/nl2bash")

In [3]:
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['nl', 'bash'],
        num_rows: 8090
    })
    validation: Dataset({
        features: ['nl', 'bash'],
        num_rows: 609
    })
    test: Dataset({
        features: ['nl', 'bash'],
        num_rows: 606
    })
})


In [4]:
train_dataset = dataset['train']
test_dataset = dataset['test']
validation_dataset = dataset['validation']

print("Training Samples : ",len(train_dataset))
print("Testing Samples : ",len(test_dataset))
print("Validation Samples : ",len(validation_dataset))

Training Samples :  8090
Testing Samples :  606
Validation Samples :  609


In [5]:
model_name = "google/byt5-small"

In [6]:
# tokenizer 
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
def tokenization(example):
    inputs = tokenizer(
        ["Translate to Bash: " + nl for nl in example["nl"]],
        max_length=128,
        padding="max_length",
        truncation=True
    )
    
    targets = tokenizer(
        [bash for bash in example["bash"]], 
        max_length=128,
        padding="max_length",
        truncation=True
    )
    
    labels = targets["input_ids"]
    labels = [
        [(token if token != tokenizer.pad_token_id else -100) for token in label_seq]
        for label_seq in labels
    ]
    
    inputs["labels"] = labels
    return inputs


In [8]:
train_dataset = train_dataset.map(tokenization,batched=True)
test_dataset = test_dataset.map(tokenization,batched=True)
validation_dataset = validation_dataset.map(tokenization,batched=True)

print(train_dataset[0])

Map:   0%|          | 0/8090 [00:00<?, ? examples/s]

Map:   0%|          | 0/606 [00:00<?, ? examples/s]

Map:   0%|          | 0/609 [00:00<?, ? examples/s]

{'nl': "Do a dry run of renaming file extension '.andnav' to '.tile' for all files/directories under current directory tree", 'bash': 'find . -name "*.andnav" | rename -vn "s/\\.andnav$/.tile/"', 'input_ids': [87, 117, 100, 113, 118, 111, 100, 119, 104, 35, 119, 114, 35, 69, 100, 118, 107, 61, 35, 71, 114, 35, 100, 35, 103, 117, 124, 35, 117, 120, 113, 35, 114, 105, 35, 117, 104, 113, 100, 112, 108, 113, 106, 35, 105, 108, 111, 104, 35, 104, 123, 119, 104, 113, 118, 108, 114, 113, 35, 42, 49, 100, 113, 103, 113, 100, 121, 42, 35, 119, 114, 35, 42, 49, 119, 108, 111, 104, 42, 35, 105, 114, 117, 35, 100, 111, 111, 35, 105, 108, 111, 104, 118, 50, 103, 108, 117, 104, 102, 119, 114, 117, 108, 104, 118, 35, 120, 113, 103, 104, 117, 35, 102, 120, 117, 117, 104, 113, 119, 35, 103, 108, 117, 104, 102, 119, 114, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [9]:
# model 
from transformers import AutoModelForSeq2SeqLM


model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

2025-06-10 13:33:50.738862: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749562430.760918     232 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749562430.767674     232 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
hf_token = "your_hugging_face_token"

In [11]:
## login to hf 
from huggingface_hub import login
login(hf_token)

In [12]:
# evaluation metric 
from evaluate import load 
bleu_metric = load("bleu")

def compute_metrics(eval_preds):
    print(eval_preds)
    preds, labels = eval_preds
    # print("------- preds -----------",preds)
    # print("-------labels------------",labels)

    #replacing -100 with pad_token_id 
    preds = [
        [(token if token != -100 else tokenizer.pad_token_id) for token in preds_seq]
        for preds_seq in preds
    ]
    labels = [
        [(token if token!= -100 else tokenizer.pad_token_id) for token in labels_seq]
        for labels_seq in labels 
    ]
    # print("------- preds -----------",preds)
    # print("-------labels------------",labels)
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # BLEU expects list of references (list of list of strings) and list of predictions
    decoded_labels = [[label] for label in decoded_labels]

    result = bleu_metric.compute(predictions=decoded_preds, references=decoded_labels)
    print(result)
    return {"bleu": result["bleu"]}

In [13]:
from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="/kaggle/working/byt5-small-fine-tuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=2,
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    num_train_epochs=10,
    predict_with_generate=True,
    generation_max_length=64,
    disable_tqdm=False,       
    report_to="none", 
    hub_model_id="archan01/byt5-small-finetuned",
    hub_strategy="every_save",
)

In [14]:
from transformers import Seq2SeqTrainer,default_data_collator


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


In [15]:
trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss,Bleu
1,1.0741,0.745262,0.244871
2,0.6909,0.644243,0.284142
3,0.5903,0.587402,0.308416
4,0.5272,0.552282,0.32159
5,0.4884,0.53206,0.330956
6,0.4547,0.520522,0.347698
7,0.4327,0.512733,0.349912
8,0.4134,0.510413,0.351045
9,0.3975,0.507435,0.358093
10,0.3889,0.507995,0.357527


<transformers.trainer_utils.EvalPrediction object at 0x7ebfe97092d0>
{'bleu': 0.24487121212333413, 'precisions': [0.47383000437381545, 0.3128, 0.22155264090747961, 0.17225640007938084], 'brevity_penalty': 0.8928989883017904, 'length_ratio': 0.8982451545311682, 'translation_length': 6859, 'reference_length': 7636}




<transformers.trainer_utils.EvalPrediction object at 0x7ebfea5d9e50>
{'bleu': 0.2841416554230989, 'precisions': [0.5579325421611493, 0.3908541846419327, 0.28527370855821127, 0.22619307038570494], 'brevity_penalty': 0.8249935072748157, 'length_ratio': 0.8386589837611315, 'translation_length': 6404, 'reference_length': 7636}




<transformers.trainer_utils.EvalPrediction object at 0x7ebfe94d1550>
{'bleu': 0.308416282974707, 'precisions': [0.5798085828959555, 0.41352871017209064, 0.3083064056263068, 0.2502144082332762], 'brevity_penalty': 0.8363075287395868, 'length_ratio': 0.8483499214248298, 'translation_length': 6478, 'reference_length': 7636}




<transformers.trainer_utils.EvalPrediction object at 0x7ec15c809e50>
{'bleu': 0.321589598838855, 'precisions': [0.5936231012182283, 0.42185430463576157, 0.3116142094607031, 0.24819326863514352], 'brevity_penalty': 0.8620488193515043, 'length_ratio': 0.8707438449449973, 'translation_length': 6649, 'reference_length': 7636}




<transformers.trainer_utils.EvalPrediction object at 0x7ebfe030f710>
{'bleu': 0.330956103513284, 'precisions': [0.5980276714748307, 0.42021018593371057, 0.3116929698708752, 0.25145494681918523], 'brevity_penalty': 0.8834391336007419, 'length_ratio': 0.889732844421163, 'translation_length': 6794, 'reference_length': 7636}




<transformers.trainer_utils.EvalPrediction object at 0x7ebfe04a62d0>
{'bleu': 0.34769846661838993, 'precisions': [0.6086377754770005, 0.4413198959687906, 0.33489630297565376, 0.2726538849646821], 'brevity_penalty': 0.878605994180897, 'length_ratio': 0.8854112100576218, 'translation_length': 6761, 'reference_length': 7636}




<transformers.trainer_utils.EvalPrediction object at 0x7ebfe8138f50>
{'bleu': 0.3499115843316061, 'precisions': [0.6405472636815921, 0.4688304997424008, 0.3572249904177846, 0.29546436285097194], 'brevity_penalty': 0.82928694094871, 'length_ratio': 0.8423258250392875, 'translation_length': 6432, 'reference_length': 7636}




<transformers.trainer_utils.EvalPrediction object at 0x7ebfe953cbd0>
{'bleu': 0.3510446258968792, 'precisions': [0.6388888888888888, 0.4650414762146606, 0.3513870541611625, 0.2892948173322005], 'brevity_penalty': 0.8420763500985253, 'length_ratio': 0.8533263488737559, 'translation_length': 6516, 'reference_length': 7636}




<transformers.trainer_utils.EvalPrediction object at 0x7ebfe04e7410>
{'bleu': 0.35809304975273043, 'precisions': [0.6447870554113876, 0.4681925277684281, 0.3567679040119985, 0.2961246840775063], 'brevity_penalty': 0.8473651487383967, 'length_ratio': 0.857909900471451, 'translation_length': 6551, 'reference_length': 7636}




<transformers.trainer_utils.EvalPrediction object at 0x7ebfe021aa50>
{'bleu': 0.3575274945055306, 'precisions': [0.6376964933494559, 0.46295988013983685, 0.35198221563542054, 0.2913287585776669], 'brevity_penalty': 0.8571248357919484, 'length_ratio': 0.8664222105814563, 'translation_length': 6616, 'reference_length': 7636}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


TrainOutput(global_step=10120, training_loss=0.5457962609091295, metrics={'train_runtime': 8104.7915, 'train_samples_per_second': 9.982, 'train_steps_per_second': 1.249, 'total_flos': 1.85817101991936e+16, 'train_loss': 0.5457962609091295, 'epoch': 10.0})

In [16]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/archan01/byt5-small-finetuned/commit/08e6a7aac70d1f1519dab91eafcdc6af281b96d8', commit_message='End of training', commit_description='', oid='08e6a7aac70d1f1519dab91eafcdc6af281b96d8', pr_url=None, repo_url=RepoUrl('https://huggingface.co/archan01/byt5-small-finetuned', endpoint='https://huggingface.co', repo_type='model', repo_id='archan01/byt5-small-finetuned'), pr_revision=None, pr_num=None)