In [14]:
!pip -qqq install 'transformers<=4.29.2' datasets 'mlflow>=2.4' evaluate rouge_score torch

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [3]:
from datasets import load_dataset

# From the user, for loading their data
DATASET = "billsum"
SPLIT = "ca_test"

# From the user, for training the model
INPUT = "text"  # the "X"
OUTPUT = "summary"  # the "y" or "label"
# From the user, with defaults. These values are the max length (in tokens) before the text is truncated
MAX_INPUT_LENGTH = 1024
MAX_OUTPUT_LENGTH = 128
# Fixed seed so the train/test split is identical on every run
SEED = 42

# Load through the constants above so that changing DATASET / SPLIT actually takes effect
raw_ds = load_dataset(DATASET, split=SPLIT)
# Train/test split 80/20
ds = raw_ds.train_test_split(test_size=0.2, seed=SEED)

# Preview the first 100 characters of every field of one training example
first_example = ds["train"][0]
for key, value in first_example.items():
    print(f"**{key}**: {value[:100]}...")

Found cached dataset billsum (/Users/benepstein/.cache/huggingface/datasets/billsum/default/3.0.0/75cf1719d38d6553aa0e0714c393c74579b083ae6e164b2543684e3e92e0c4cc)


**text**: The people of the State of California do enact as follows:


SECTION 1.
The Legislature finds and de...
**summary**: The Mobilehome Residency Law governs the terms and conditions of residency in mobilehome parks, and ...
**title**: An act to
add Section 798.17.5 to
repeal Section 798.17 of
the Civil Code, relating to mobilehome pa...


In [4]:
from transformers import AutoTokenizer

# User-supplied checkpoint name of the model to fine-tune
MODEL = "t5-small"

# User-supplied prompt prefix, defaulted here. Use "summarize: " for summarization,
# "translate: " for translation, and so on; a conversational task might instead
# use "answer the following question: "
TASK_PREFIX = "summarize: "

# Tokenizer belonging to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL)


def preprocess_function(examples):
    """Tokenize a batch of rows into model inputs with target labels.

    Args:
        examples: mapping holding a list of source documents under ``INPUT``
            and a list of target texts under ``OUTPUT``.

    Returns:
        The tokenizer output for the prefixed documents, with the token ids of
        the targets stored under ``"labels"``.
    """
    # Prepend the task prefix to every source document
    prompts = []
    for document in examples[INPUT]:
        prompts.append(TASK_PREFIX + document)

    # Sources are truncated to MAX_INPUT_LENGTH tokens
    encoded = tokenizer(prompts, truncation=True, max_length=MAX_INPUT_LENGTH)

    # Targets are tokenized separately and truncated to MAX_OUTPUT_LENGTH tokens
    target_encoding = tokenizer(
        text_target=examples[OUTPUT],
        truncation=True,
        max_length=MAX_OUTPUT_LENGTH,
    )
    encoded["labels"] = target_encoding["input_ids"]

    return encoded


# Tokenize every split; batched mode hands preprocess_function many rows per call, which is faster
tokenized_ds = ds.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [5]:
from transformers import DataCollatorForSeq2Seq
import evaluate
import numpy as np


# From the user, default ROUGE, with option for BLEU
EVAL_METRIC = "rouge"
# The collator's optional `model` argument expects a model object, not a checkpoint name.
# MODEL is only the name string, so it is not passed here.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)
metric = evaluate.load(EVAL_METRIC)


def compute_metrics(eval_pred):
    """Decode predicted and reference token ids and score them with ``metric``.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays. Any
            position equal to -100 in either array is treated as padding.

    Returns:
        dict with every value returned by ``metric.compute`` plus ``gen_len``
        (mean number of non-pad tokens per prediction), each rounded to 4
        decimal places.
    """
    predictions, labels = eval_pred
    pad_id = tokenizer.pad_token_id

    # -100 is not a valid token id, so swap it for the pad token before decoding.
    # This is done for predictions as well as labels so that decoding cannot fail
    # and gen_len does not count those positions as generated tokens.
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Length of each prediction, ignoring padding
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [6]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained sequence-to-sequence model for the MODEL checkpoint; this is the model handed to the trainer
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL)

In [11]:
import mlflow 
import mlflow.transformers


import torch

# Configure MLflow autologging for transformers: log input examples and model
# signatures, but neither models nor datasets
mlflow.transformers.autolog(
    log_input_examples=True,
    log_model_signatures=True,
    log_models=False,
    log_datasets=False,
)


# Sensible defaults; every value below can be taken from the user
training_args = Seq2SeqTrainingArguments(
    output_dir="run_1",
    evaluation_strategy="epoch",  # one of "no", "steps" or "epoch"
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    # Let evaluation generate up to the same length the labels are truncated to,
    # rather than the model's (shorter) default generation length
    generation_max_length=MAX_OUTPUT_LENGTH,
    # fp16 mixed precision needs a CUDA device; fall back to full precision elsewhere
    fp16=torch.cuda.is_available(),
    push_to_hub=False,
    report_to="mlflow",
)

# Evaluate on the held-out split of the 80/20 split, scoring with compute_metrics
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Trainer is attempting to log a value of "{'summarization': {'early_stopping': True, 'length_penalty': 2.0, 'max_length': 200, 'min_length': 30, 'no_repeat_ngram_size': 3, 'num_beams': 4, 'prefix': 'summarize: '}, 'translation_en_to_de': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to German: '}, 'translation_en_to_fr': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to French: '}, 'translation_en_to_ro': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to Romanian: '}}" for key "task_specific_params" as a parameter. MLflow's log_param() only accepts values no longer than 250 characters so we dropped this attribute. You can use `MLFLOW_FLATTEN_PARAMS` environment variable to flatten the parameters and avoid this message.
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to en

Epoch,Training Loss,Validation Loss



KeyboardInterrupt

