In [1]:
!pip -qqq install 'transformers<=4.29.2' datasets 'mlflow>=2.4' evaluate rouge_score torch accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m97.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m40.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.1/18.1 MB[0m [31m95.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m28.6 MB/s[0

In [1]:
from datasets import load_dataset

# From the user, for loading their data
DATASET = "billsum"
SPLIT = "ca_test"

# From the user, for training the model
INPUT = "text"  # the "X"
OUTPUT = "summary"  # the "y" or "label"
# From the user, with defaults. These values are the max length before the text is truncated
MAX_INPUT_LENGTH = 1024
MAX_OUTPUT_LENGTH = 128

ds = load_dataset(DATASET, split=SPLIT)
# Train/test split 80/20
ds = ds.train_test_split(test_size=0.2)

for key in ds["train"][0].keys():
    print(f"**{key}**: {ds['train'][0][key][:100]}...")



**text**: The people of the State of California do enact as follows:


SECTION 1.
Section 14186.36 of the Welf...
**summary**: Existing law provides for the Medi-Cal program, which is administered by the State Department of Hea...
**title**: An act to amend Section 14186.36 of the Welfare and Institutions Code, relating to Medi-Cal....


In [2]:
from transformers import AutoTokenizer

# From the user, the model to fine-tune
MODEL = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Taken from the user, with a sensible default. "summarize: " for summarization, "translate: " for translation, etc
# For certain tasks, like conversational, it may be "answer the following question: "
TASK_PREFIX = "summarize: "


def preprocess_function(examples):
    inputs = [TASK_PREFIX + doc for doc in examples[INPUT]]
    model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

    labels = tokenizer(text_target=examples[OUTPUT], max_length=MAX_OUTPUT_LENGTH, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# We batch to speed up processing by passing more than 1 row in at a time
tokenized_ds = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/989 [00:00<?, ? examples/s]

Map:   0%|          | 0/248 [00:00<?, ? examples/s]

In [25]:
from transformers import DataCollatorForSeq2Seq
import evaluate
import numpy as np


# From the user, default rouge, with option for "bleu". Must be one of "rouge" or "bleu"
EVAL_METRIC = "rouge"
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=MODEL)
metric = evaluate.load(EVAL_METRIC)


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    if EVAL_METRIC == "rouge":
        result = metric.compute(
            predictions=decoded_preds, references=decoded_labels, use_stemmer=True
        )
    else:
        result = metric.compute(
            predictions=decoded_preds, references=decoded_labels
        )

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    if EVAL_METRIC == "rouge":
        return {k: round(v, 4) for k, v in result.items()}
    else:
        return {k: v for k, v in result.items()}

In [26]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL)

In [27]:
import mlflow 
import mlflow.transformers


mlflow.transformers.autolog(
    log_input_examples=True, 
    log_model_signatures=True, 
    log_models=False, 
    log_datasets=False
)


# These are all the correct defaults, but can all be taken from the user
training_args = Seq2SeqTrainingArguments(
    output_dir="run_3",  
    evaluation_strategy="epoch",  # batch or epoch
    learning_rate=2e-5,  
    per_device_train_batch_size=16,  
    per_device_eval_batch_size=16, 
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=False,
    report_to="mlflow"
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Trainer is attempting to log a value of "{'summarization': {'early_stopping': True, 'length_penalty': 2.0, 'max_length': 200, 'min_length': 30, 'no_repeat_ngram_size': 3, 'num_beams': 4, 'prefix': 'summarize: '}, 'translation_en_to_de': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to German: '}, 'translation_en_to_fr': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to French: '}, 'translation_en_to_ro': {'early_stopping': True, 'max_length': 300, 'num_beams': 4, 'prefix': 'translate English to Romanian: '}}" for key "task_specific_params" as a parameter. MLflow's log_param() only accepts values no longer than 250 characters so we dropped this attribute. You can use `MLFLOW_FLATTEN_PARAMS` environment variable to flatten the parameters and avoid this message.


Epoch,Training Loss,Validation Loss,Bleu,Precisions,Brevity Penalty,Length Ratio,Translation Length,Reference Length,Gen Len
1,No log,2.814233,0.000279,"[0.4508324661810614, 0.11735261401557286, 0.06063321385902031, 0.0467741935483871]",0.002524,0.143224,3844,26839,19.0
2,No log,2.602874,0.000391,"[0.4622425629290618, 0.1413839891451832, 0.0788478324119872, 0.05957980558168705]",0.002956,0.14654,3933,26839,19.0
3,No log,2.539785,0.000493,"[0.45947973986993496, 0.15626666666666666, 0.09651627641347801, 0.0716041794714198]",0.003302,0.148962,3998,26839,19.0
4,No log,2.523765,0.00049,"[0.459344508381286, 0.15630834889303816, 0.09625821193944588, 0.07070396557024285]",0.003297,0.148925,3997,26839,19.0


Trainer is attempting to log a value of "[0.4508324661810614, 0.11735261401557286, 0.06063321385902031, 0.0467741935483871]" of type <class 'list'> for key "eval_precisions" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[0.4622425629290618, 0.1413839891451832, 0.0788478324119872, 0.05957980558168705]" of type <class 'list'> for key "eval_precisions" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[0.45947973986993496, 0.15626666666666666, 0.09651627641347801, 0.0716041794714198]" of type <class 'list'> for key "eval_precisions" as a metric. MLflow's log_metric() only accepts float and int types so we dropped this attribute.
Trainer is attempting to log a value of "[0.459344508381286, 0.15630834889303816, 0.09625821193944588, 0.07070396557024285]" of type <class 'list'> for key "eval_precisions" as a metri

TrainOutput(global_step=248, training_loss=3.0252737229870212, metrics={'train_runtime': 272.5009, 'train_samples_per_second': 14.517, 'train_steps_per_second': 0.91, 'total_flos': 1070824333246464.0, 'train_loss': 3.0252737229870212, 'epoch': 4.0})