In [None]:
# pip install torch transformers datasets sentencepiece scikit-learn pandas numpy evaluate accelerate


In [None]:
import pandas as pd
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, EarlyStoppingCallback
from datasets import Dataset

In [None]:
# Load the paired-sentence splits (train, test, valid — in that order) from
# CSV files located in the notebook's working directory.
in_out_sentences_train, in_out_sentences_test, in_out_sentences_valid = [
    pd.read_csv(csv_path)
    for csv_path in (
        "sentences_train_pairs.csv",
        "sentences_test_pairs.csv",
        "sentences_valid_pairs.csv",
    )
]

In [None]:
def build_t5_style_transfer_df(df):
    """Turn a frame of paired sentences into T5 prompt/target rows.

    Args:
        df: DataFrame providing the columns ``source_age``, ``target_age``,
            ``source_text`` and ``target_text``.

    Returns:
        A new DataFrame with two columns: ``input_text`` holding the prompt
        ``"transfer from <source_age> to <target_age> style: <source_text>"``
        and ``target_text`` copied from ``df``. Rows with a missing value in
        either column are dropped and the index is renumbered from 0.
    """
    # String concatenation propagates missing values, so a row lacking any of
    # the three prompt ingredients ends up with a missing prompt and is
    # removed by dropna() below.
    prompts = (
        "transfer from " + df["source_age"]
        + " to " + df["target_age"]
        + " style: " + df["source_text"]
    )
    pairs = pd.DataFrame({"input_text": prompts, "target_text": df["target_text"]})
    return pairs.dropna().reset_index(drop=True)

In [None]:
# Convert each raw split (train, test, valid) into prompt/target form.
t5_sentences_train, t5_sentences_test, t5_sentences_valid = [
    build_t5_style_transfer_df(raw_split)
    for raw_split in (in_out_sentences_train, in_out_sentences_test, in_out_sentences_valid)
]

In [None]:
# Preview the first five prompt/target rows of the training split.
t5_sentences_train.head(n=5)

In [None]:
# Tokenizer for the "t5-base" checkpoint; it is used below both to encode the
# datasets and to decode generated text.
tokenizer = T5Tokenizer.from_pretrained("t5-base")
# Lighter alternative checkpoint (presumably to be switched together with the
# model checkpoint loaded in the training cell — confirm):
# tokenizer = T5Tokenizer.from_pretrained("t5-small")


In [None]:

def build_t5_huggingface_ds(df, tokenizer, max_input_length=64, max_target_length=64):
    """Tokenize prompt/target pairs into a torch-formatted Hugging Face Dataset.

    Args:
        df: DataFrame with the string columns ``input_text`` and
            ``target_text``.
        tokenizer: Tokenizer used for both prompts and targets. It must accept
            the ``text_target`` keyword and expose ``pad_token_id``.
        max_input_length: Prompts are truncated/padded to this many tokens.
        max_target_length: Targets are truncated/padded to this many tokens.

    Returns:
        A ``Dataset`` whose torch-formatted columns are ``input_ids``,
        ``attention_mask`` and ``labels``. Padding positions inside ``labels``
        are set to -100 so that they are excluded from the training loss.
    """
    pad_token_id = tokenizer.pad_token_id

    def tokenize_batch(batch):
        # Encode the prompts: these become input_ids / attention_mask.
        model_inputs = tokenizer(
            batch["input_text"],
            padding="max_length",
            truncation=True,
            max_length=max_input_length,
        )
        # Encode the targets via `text_target`, which supersedes the
        # deprecated `as_target_tokenizer()` context manager.
        targets = tokenizer(
            text_target=batch["target_text"],
            padding="max_length",
            truncation=True,
            max_length=max_target_length,
        )
        # Labels are padded to a fixed length; without masking, the loss would
        # also be computed on pad tokens. -100 is the index the loss ignores.
        model_inputs["labels"] = [
            [
                -100 if pad_token_id is not None and token_id == pad_token_id else token_id
                for token_id in label_ids
            ]
            for label_ids in targets["input_ids"]
        ]
        return model_inputs

    t5_ds = Dataset.from_pandas(df).map(tokenize_batch, batched=True)

    # Expose only the tensors the model consumes.
    t5_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

    return t5_ds


In [None]:
# Tokenize each prompt/target split (train, test, valid) with the shared tokenizer.
t5_sentences_train_ds, t5_sentences_test_ds, t5_sentences_valid_ds = [
    build_t5_huggingface_ds(split_df, tokenizer)
    for split_df in (t5_sentences_train, t5_sentences_test, t5_sentences_valid)
]

In [None]:
from transformers import T5ForConditionalGeneration, Trainer, TrainingArguments

# Pretrained seq2seq checkpoint to fine-tune.
# Lighter alternative: T5ForConditionalGeneration.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-base")

training_args = TrainingArguments(
    # --- checkpoints and logging ---
    output_dir="./sentences_t5_paired_style_transfer",
    save_strategy="epoch",
    save_total_limit=1,
    logging_steps=50,
    report_to="none",
    # --- batching: 8 examples per device, gradients accumulated over 2 steps ---
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    # --- schedule and model selection: evaluate every epoch, keep the
    # checkpoint with the best evaluation loss ---
    num_train_epochs=4,
    eval_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="loss",
)

In [None]:
# Fine-tune on the training split, evaluating on the validation split; training
# stops early once the monitored metric fails to improve for 2 evaluations.
trainer = Trainer(
    # Trainer moves the model to the device selected by `training_args`
    # (GPU when available), so the model is passed as-is: a hard-coded
    # `.to("cuda")` would crash on machines without CUDA.
    model=model,
    args=training_args,
    train_dataset=t5_sentences_train_ds,
    eval_dataset=t5_sentences_valid_ds,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

trainer.train()

In [None]:
def generate_neutral(model, tokenizer, styled_sentence, age_label, source_age=None, max_length=64):
    """Rewrite ``styled_sentence`` into the style of the ``age_label`` group.

    Args:
        model: Seq2seq model exposing ``device`` and ``generate``.
        tokenizer: Tokenizer matching ``model``; used to encode the prompt and
            decode the generated ids.
        styled_sentence: Sentence to rewrite.
        age_label: Target age group, e.g. ``"55-74"``.
        source_age: Age group of the input sentence. When given, the prompt is
            ``"transfer from <source_age> to <age_label> style: <sentence>"``,
            i.e. the same format as the training prompts. When omitted, the
            legacy prompt ``"transfer to <age_label> style: <sentence>"`` is
            used; that format does not appear in the training data, so pass
            ``source_age`` whenever it is known.
        max_length: Maximum length forwarded to ``model.generate``.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        removed.
    """
    if source_age is not None:
        # Mirror the prompt layout the model was fine-tuned on.
        prompt = f"transfer from {source_age} to {age_label} style: {styled_sentence}"
    else:
        # Backward-compatible prompt for callers that do not know the source age.
        prompt = f"transfer to {age_label} style: {styled_sentence}"

    # Encode on the same device as the model so generate() sees matching tensors.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_length=max_length)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
# Smoke test: rewrite one sentence into the "55-74" style.
# NOTE(review): called this way, the prompt is "transfer to 55-74 style: ...",
# whereas the training prompts read "transfer from <age> to <age> style: ..." —
# confirm the mismatch is intended.
output_sentences = generate_neutral(
    model=model,
    tokenizer=tokenizer,
    styled_sentence="This ice cream is good.",
    age_label="55-74",
)
output_sentences

In [None]:
# Distinct age labels found in the raw training split, in order of first appearance.
# NOTE(review): the other cells only read source_age / target_age / source_text /
# target_text from this frame — confirm the CSV really has a "persona.age" column.
age_group = pd.unique(in_out_sentences_train["persona.age"]).tolist()
age_group