In [None]:
# pip install torch transformers datasets sentencepiece scikit-learn pandas numpy evaluate accelerate


In [1]:
import pandas as pd
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, EarlyStoppingCallback
from datasets import Dataset

In [None]:
sentences_train_io = pd.read_csv("sentences_train_io.csv")
sentences_test_io = pd.read_csv("sentences_test_io.csv")
sentences_valid_io = pd.read_csv("sentences_valid_io.csv")

In [None]:
def build_t5_style_removal_df(df):
    t5_df = pd.DataFrame()
    t5_df["input_text"] = "transfer from" + df["persona.age"] + " to neutral style: " + df["output.sentences"]
    t5_df["target_text"] = df["input.sentences"]
    return t5_df.dropna()

In [None]:
t5_sentences_train = build_t5_style_removal_df(sentences_train_io)
t5_sentences_test = build_t5_style_removal_df(sentences_test_io)
t5_sentences_valid = build_t5_style_removal_df(sentences_valid_io)

In [5]:
t5_sentences_train.head()

Unnamed: 0,input_text,target_text
0,transfer to neutral style: He had a great time...,co-worker and i in the new office
1,transfer to neutral style: Top end it with a d...,even the carosuel was lit up .
2,transfer to neutral style: Look at all of this...,the food was amazing .
3,transfer to neutral style: I took a picture of...,this is a picture of a bike .
4,transfer to neutral style: We danced while we ...,she danced with her beer .


In [6]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")
# tokenizer = T5Tokenizer.from_pretrained("t5-small")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [7]:

def build_t5_huggingface_ds(df, tokenizer):
    def tokenize_batch(batch):
        return tokenizer(batch["input_text"], padding="max_length", truncation=True, max_length=64)

    def tokenize_target(batch):
        with tokenizer.as_target_tokenizer():
            tokenized = tokenizer(batch["target_text"], padding="max_length", truncation=True, max_length=64)
        batch["labels"] = tokenized["input_ids"]
        return batch

    t5_ds = Dataset.from_pandas(df)
    t5_ds = t5_ds.map(tokenize_batch, batched=True)
    t5_ds = t5_ds.map(tokenize_target, batched=True)


    t5_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    return t5_ds


In [8]:
t5_sentences_train_ds = build_t5_huggingface_ds(t5_sentences_train, tokenizer)
t5_sentences_test_ds = build_t5_huggingface_ds(t5_sentences_test, tokenizer)
t5_sentences_valid_ds = build_t5_huggingface_ds(t5_sentences_valid, tokenizer)

Map:   0%|          | 0/31806 [00:00<?, ? examples/s]

Map:   0%|          | 0/31806 [00:00<?, ? examples/s]



Map:   0%|          | 0/3982 [00:00<?, ? examples/s]

Map:   0%|          | 0/3982 [00:00<?, ? examples/s]

Map:   0%|          | 0/3990 [00:00<?, ? examples/s]

Map:   0%|          | 0/3990 [00:00<?, ? examples/s]

In [9]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

model = T5ForConditionalGeneration.from_pretrained("t5-base")
# model = T5ForConditionalGeneration.from_pretrained("t5-small")
training_args = TrainingArguments(
    output_dir="./t5_styled_neutral",
    gradient_accumulation_steps=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=50,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    report_to="none"
)

In [10]:
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    preds, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    return {"avg_len": np.mean([len(p.split()) for p in decoded_preds])}


In [11]:
t5_sentences_valid_ds_small = t5_sentences_valid_ds.select(range(100))
t5_sentences_test_ds_small = t5_sentences_test_ds.select(range(1000))

In [12]:
trainer = Trainer(
    model=model.to("cuda"),
    args=training_args,
    train_dataset=t5_sentences_test_ds_small,
    eval_dataset=t5_sentences_valid_ds_small,
    tokenizer=tokenizer,
    # compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()

  trainer = Trainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,3.6122,0.716267
2,0.7102,0.59151


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=186, training_loss=1.5143428720453733, metrics={'train_runtime': 120.6127, 'train_samples_per_second': 24.873, 'train_steps_per_second': 1.542, 'total_flos': 225314419507200.0, 'train_loss': 1.5143428720453733, 'epoch': 2.96})

In [39]:
def generate_neutral(model, tokenizer, styled_sentence, age_label):
    prompt = f"transfer from {age_label} to neutral style: {styled_sentence}"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_length=64)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [41]:
output_sentences = generate_neutral(model, tokenizer, "I want to go park.", "Under12")
output_sentences

'i want to go park .'