In [None]:
!pip install sacrebleu
!pip install datasets
!pip install evaluate
!pip install transformers --upgrade
!pip install peft
!pip install accelerate -U >=0.20.1

In [None]:
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoTokenizer
import evaluate
import numpy as np
from peft import LoraConfig, TaskType, get_peft_model
from huggingface_hub import notebook_login

In [None]:
# Fetch the dataset by its Hub identifier; the result is indexed by split name further down.
sentences = load_dataset(path="angelacao/gloss_to_spoken5")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/9.33k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [None]:
# Keys used to pick the source/target text out of each translation pair,
# and the prompt text prepended to every source sentence during preprocessing.
source_lang, target_lang = "en", "asl"
prefix = "translate spoken English to ASL: "

# Checkpoint name shared by the tokenizer here and the model loaded later on.
checkpoint = "facebook/bart-large"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
# Pull the three named splits out of the loaded dataset in one pass.
train_df, test_df, validation_df = (
    sentences[split_name] for split_name in ("train", "test", "validation")
)

In [None]:
def preprocess_function(examples):
    """Tokenize one batch of translation pairs.

    Args:
        examples: A batch whose ``"translation"`` entry is a sequence of
            mappings indexed by ``source_lang`` and ``target_lang``.

    Returns:
        The tokenizer's output for the prefixed source texts, with the target
        texts supplied as ``text_target``. The call uses ``max_length=128``
        with truncation enabled.
    """
    sources = []
    references = []
    for pair in examples["translation"]:
        # The prompt prefix is prepended to the source side only.
        sources.append(prefix + pair[source_lang])
        references.append(pair[target_lang])

    return tokenizer(
        sources,
        text_target=references,
        max_length=128,
        truncation=True,
    )

# Tokenize every split in batches with the same preprocessing function.
tokenized_train = train_df.map(preprocess_function, batched=True)
tokenized_validation = validation_df.map(preprocess_function, batched=True)
tokenized_test = test_df.map(preprocess_function, batched=True)

# `model` is intentionally not passed: the collator expects a model *instance*
# (it probes it for `prepare_decoder_input_ids_from_labels`), not a checkpoint
# name, and the model object is only created later in the notebook. A string
# there is silently ignored, so omitting it keeps behavior while removing the
# misleading argument.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

Map:   0%|          | 0/13 [00:00<?, ? examples/s]

In [None]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a list of stripped predictions, and a
        list in which each stripped reference is wrapped in its own
        single-element list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        # One reference per prediction, each in its own list.
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels

In [None]:
# LoRA settings for a sequence-to-sequence model.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=64,
    lora_alpha=128,
    lora_dropout=0.05,
)

# Load the base checkpoint, wrap it with the LoRA configuration, and report
# how many parameters are trainable after wrapping.
base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


trainable params: 9,437,184 || all params: 415,728,640 || trainable%: 2.270034607189921


In [None]:
# Log in to the Hugging Face Hub from inside the notebook (helper imported
# from huggingface_hub). The trainer below is configured with push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Load the "sacrebleu" metric through the `evaluate` library; compute_metrics
# below calls metric.compute(...) and reads its "score" field.
metric = evaluate.load("sacrebleu")
def compute_metrics(eval_preds):
    """Compute the SacreBLEU score and mean generated length for an evaluation run.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its
            first element is used.

    Returns:
        A dict with ``"bleu"`` (the metric's ``"score"``) and ``"gen_len"``
        (mean count of non-pad token ids per prediction), each rounded to
        four decimal places.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore id, not a vocabulary entry, so it cannot be
    # decoded. Labels always need this replacement; predictions need it too,
    # because generated batches of different lengths are padded with -100
    # when they are gathered. Sanitizing predictions also keeps those
    # positions out of the gen_len count below.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generated length = number of token ids that are not padding.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback

# Early stopping configured with a patience of 3 evaluations and an
# improvement threshold of 0.01 on the monitored metric.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01,
)

training_args = Seq2SeqTrainingArguments(
    output_dir="Spoken_to",
    # Optimisation schedule
    num_train_epochs=75,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, keeping at most 3 checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # The best checkpoint is chosen by evaluation loss and reloaded at the end
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    # Generate during evaluation so compute_metrics receives predicted sequences
    predict_with_generate=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

trainer.train()


In [None]:
# Run the trainer's evaluation loop on the tokenized test split and show the metrics dict.
results = trainer.evaluate(eval_dataset=tokenized_test)
print("Evaluation Results:", results)



Evaluation Results: {'eval_loss': 1.7310229539871216, 'eval_bleu': 21.3332, 'eval_gen_len': 9.1538, 'eval_runtime': 15.9366, 'eval_samples_per_second': 0.816, 'eval_steps_per_second': 0.063, 'epoch': 31.0}


In [None]:
# Save the model to the local "Spoken_to" directory.
# NOTE(review): `model` was wrapped by get_peft_model earlier, so this
# presumably writes only the LoRA adapter files rather than full weights — confirm.
model.save_pretrained("Spoken_to")

In [None]:
# Trainer.push_to_hub takes a commit message as its first argument, not a
# repository id. The destination repo comes from the training arguments
# (hub_model_id if set, otherwise the output_dir name under the logged-in
# account), so only a commit message is passed here.
trainer.push_to_hub(commit_message="End of training")

In [None]:
# Reload the model from the Hub repository and run one sample through it.
loaded_model = AutoModelForSeq2SeqLM.from_pretrained("angelacao/Spoken_to")

sample_input = "translate spoken English to ASL: the boy threw the ball"

# Encode the prompt, generate with default settings, decode the first sequence.
sample_input_ids = tokenizer(sample_input, return_tensors="pt")["input_ids"]
output_ids = loaded_model.generate(sample_input_ids)
output_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

print("Translated Output:", output_text)

Translated Output: YOU THROW THE BALL


In [None]:

# NOTE(review): preprocessing in this notebook only ever prepends
# "translate spoken English to ASL: ", so this reverse-direction prompt does
# not appear to have been seen in training — confirm it is intended.
sample_input = "translate ASL to spoken English: YOUR BATHROOM HAVE T-U-B"

# Encode the prompt, generate with default settings, decode the first sequence.
sample_input_ids = tokenizer(sample_input, return_tensors="pt")["input_ids"]
output_ids = loaded_model.generate(sample_input_ids)
output_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

print("Translated Output:", output_text)

In [None]:
sample_input = "translate spoken English to ASL: how many cups of water do you drink every day"

# Encode the prompt, generate with default settings, decode the first sequence.
sample_input_ids = tokenizer(sample_input, return_tensors="pt")["input_ids"]
output_ids = loaded_model.generate(sample_input_ids)
output_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

print("Translated Output:", output_text)



Translated Output: HOW MANY CUP YOU DRINK EVERY DAY


In [None]:
# NOTE(review): preprocessing in this notebook only ever prepends
# "translate spoken English to ASL: ", so this reverse-direction prompt does
# not appear to have been seen in training — confirm it is intended.
sample_input = "translate ASL to spoken English: A-S-L CLASS YOU PAST TAKE"

# Encode the prompt, generate with default settings, decode the first sequence.
sample_input_ids = tokenizer(sample_input, return_tensors="pt")["input_ids"]
output_ids = loaded_model.generate(sample_input_ids)
output_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

print("Translated Output:", output_text)


In [None]:

sample_input = "translate spoken English to ASL: is this yours"

# Encode the prompt, generate with default settings, decode the first sequence.
sample_input_ids = tokenizer(sample_input, return_tensors="pt")["input_ids"]
output_ids = loaded_model.generate(sample_input_ids)
output_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

print("Translated Output:", output_text)


Translated Output: YOU OWN THIS


In [None]:
# NOTE(review): preprocessing in this notebook only ever prepends
# "translate spoken English to ASL: ", so this reverse-direction prompt does
# not appear to have been seen in training — confirm it is intended.
sample_input = "translate ASL to spoken English: BOY"

# Encode the prompt, generate with default settings, decode the first sequence.
sample_input_ids = tokenizer(sample_input, return_tensors="pt")["input_ids"]
output_ids = loaded_model.generate(sample_input_ids)
output_text = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]

print("Translated Output:", output_text)
