In [1]:
!pip -qqq install 'transformers<=4.29.2' datasets 'mlflow>=2.4' evaluate rouge_score torch accelerate

In [2]:
from datasets import load_dataset, DatasetDict

# From the user, for loading their data
DATASET = "wikitext"
SPLIT = "train"           # Optional
NAME = "wikitext-103-v1"  # Optional

LOG_MODEL_MLFLOW = True
LOG_DATA_MLFLOW = True

# From the user, for training the model
TEXT_COL = "text"  # There is no label, the next word _is_ the label. [Default "text"]

# Controls for the train/test split created below when only one split is loaded
TEST_SIZE = 0.2  # Fraction of rows held out as the "test" split
SEED = 42        # Fixed shuffle seed so the split is the same on every run

ds = load_dataset(DATASET, name=NAME, split=SPLIT)
if not isinstance(ds, DatasetDict):
    # A single split was loaded, so carve a held-out "test" split out of it.
    # Seeding the shuffle keeps the split reproducible across kernel restarts.
    ds = ds.train_test_split(test_size=TEST_SIZE, seed=SEED)

ds

Found cached dataset wikitext (/Users/benepstein/.cache/huggingface/datasets/wikitext/wikitext-103-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 1441080
    })
    test: Dataset({
        features: ['text'],
        num_rows: 360270
    })
})

In [3]:
from transformers import AutoTokenizer

# From the user, the model to fine-tune.
# MODEL is the checkpoint name; the same name is used to load both the tokenizer
# (here) and the model weights (in a later cell).
MODEL = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(MODEL)


def preprocess_function(examples):
    """Tokenize a batch of text for conversational/causal language modeling (decoder-only).

    Here, the next token _is_ the label, so the entire tokenized text is returned as-is.

    By default the whole input doubles as the output, which makes the model learn the
    next word given the input string. This function can be modified for custom behavior,
    such as masking parts of the text so the model only focuses on certain words. For
    example, with question/answer style text you can mask out the "question" part by
    setting its tokens to -100 here, so the model only learns on the answer rather than
    learning the question itself. For input/output type data, an encoder/decoder
    architecture is also worth considering.

    Args:
        examples: A batch from the dataset; ``examples[TEXT_COL]`` is the text to tokenize.

    Returns:
        The result of calling ``tokenizer`` on that text with ``truncation=True``.
    """
    texts = examples[TEXT_COL]
    return tokenizer(texts, truncation=True)

# Tokenize in batches (more than one row per call) to speed up processing,
# spread over 4 worker processes, and drop the raw text column from the result.
tokenized_ds = ds.map(
    function=preprocess_function,
    remove_columns=TEXT_COL,
    num_proc=4,
    batched=True,
)
tokenized_ds

Map (num_proc=4):   0%|          | 0/1441080 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/360270 [00:00<?, ? examples/s]

In [4]:
# Load the data collator to dynamically pad the inputs
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the pad token so that every input (text) in a
# batch can be padded to the same length; the intent is that the model doesn't try
# to learn the padding.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False because we do not want to perform masked language modeling
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [5]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# Load the pretrained causal language model for the checkpoint named by MODEL
model = AutoModelForCausalLM.from_pretrained(MODEL)

Downloading model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
import mlflow 
import mlflow.transformers


# Configure MLflow autologging for transformers: input examples and model signatures
# are switched on, automatic model and dataset logging are switched off (the model
# and datasets are logged explicitly after training instead).
mlflow.transformers.autolog(
    log_models=False,
    log_datasets=False,
    log_input_examples=True,
    log_model_signatures=True,
)


# Training settings. These values are sensible defaults, but every one of them
# could instead be taken from the user.
RUN_NAME = "run1"
training_args = TrainingArguments(
    output_dir=RUN_NAME,
    num_train_epochs=3,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
)

# Wire the model, tokenizer, tokenized splits and collator into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
)

trainer.train()

In [17]:
import math

# Log the dataset and model to mlflow.
# Re-open the most recent MLflow run (presumably the one created during
# trainer.train()) so the extra metric/artifacts are attached to it.
# last_active_run() returns None when no run exists; in that case run_id=None
# makes start_run() create a fresh run instead of failing on `.info`.
last_run = mlflow.last_active_run()
run_id = last_run.info.run_id if last_run is not None else None
with mlflow.start_run(run_id=run_id):
    # Log the final perplexity (exp of the evaluation loss)
    eval_results = trainer.evaluate()
    try:
        perplexity = math.exp(eval_results['eval_loss'])
    except OverflowError:
        # math.exp overflows for very large losses; report infinity rather than crash
        perplexity = float("inf")
    print(f"Perplexity: {perplexity:.2f}")
    mlflow.log_metric("perplexity", perplexity)
    if LOG_MODEL_MLFLOW:
        # Log the fine-tuned model together with its tokenizer as one artifact
        model_config = {
            "model": trainer.model,
            "tokenizer": trainer.tokenizer
        }
        mlflow.transformers.log_model(model_config, artifact_path="model")
    if LOG_DATA_MLFLOW:
        # Record each raw (un-tokenized) split as a run input, tagged with its split name
        for split in ds.keys():
            data = mlflow.data.from_huggingface(ds[split])
            mlflow.log_input(data, context=split)
    

Perplexity: 0.50


Downloading (…)solve/main/README.md:   0%|          | 0.00/11.0k [00:00<?, ?B/s]