In [1]:
import json, random
from datetime import date, timedelta

EN = ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]
RU = ["янв","фев","мар","апр","май","июн","июл","авг","сен","окт","ноя","дек"]

def rdate(a=date(2000,1,1), b=date(2030,12,31)):
    d = a + timedelta(days=random.randint(0, (b-a).days))
    return d

def iso(d): return str(d.strftime("%Y-%m-%d"))

def ren(d):
    m = EN[d.month-1]
    opts = [f"{d.day} {m} {d.year}", d.strftime("%m/%d/%Y"), d.strftime("%d/%m/%Y"), f"{m} {d.day}, {d.year}"]
    return random.choice(opts)

def rru(d):
    m = RU[d.month-1]
    opts = [f"{d.day} {m} {d.year}", d.strftime("%d.%m.%Y")]
    return random.choice(opts)

def synth(n=50000, out="date_norm.jsonl"):
    with open(out, "w", encoding="utf-8") as f:
        for _ in range(n):
            d = rdate()
            shown = random.choice([ren(d), rru(d)])
            prompt = f"Convert to ISO-8601 (YYYY-MM-DD): {shown}"
            target = iso(d)
            f.write(json.dumps({"prompt": prompt, "target": target}, ensure_ascii=False)+"\n")

if __name__ == "__main__":
    synth()


In [2]:
import numpy as np
from datasets import load_dataset
from transformers import T5ForConditionalGeneration, T5TokenizerFast, Trainer, TrainingArguments
import os; os.environ["TOKENIZERS_PARALLELISM"]="false"
from datasets import Features, Value

model_name = "t5-small"
tok = T5TokenizerFast.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

features = Features({
    "prompt": Value("string"),
    "target": Value("string"),
})
ds = load_dataset("json", data_files={"train":"date_norm.jsonl","validation":"date_norm.jsonl"}, features=features)
def preprocess(ex):
    x = tok(ex["prompt"], truncation=True)
    y = tok(ex["target"], truncation=True)
    ex["input_ids"], ex["attention_mask"] = x["input_ids"], x["attention_mask"]
    ex["labels"] = y["input_ids"]
    return ex

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

In [3]:
print(ds["validation"][0], type(ds["validation"][0]["prompt"]), type(ds["validation"][0]["target"]))


{'prompt': 'Convert to ISO-8601 (YYYY-MM-DD): 07.03.2027', 'target': '2027-03-07 00:00:00'} <class 'str'> <class 'str'>


In [4]:
from datetime import datetime
def fix_types(batch):
    def to_iso(x) -> str:
        if isinstance(x, (datetime, date)):
            return x.strftime("%Y-%m-%d")
        return "" if x is None else str(x)

    return {
        "prompt": ["" if p is None else str(p) for p in batch["prompt"]],
        "target": [to_iso(t) for t in batch["target"]],
    }

ds = ds.map(fix_types, batched=True, load_from_cache_file=False)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
print(ds["train"][0], type(ds["train"][0]["prompt"]), type(ds["train"][0]["target"]))  # обе строки


{'prompt': 'Convert to ISO-8601 (YYYY-MM-DD): 07.03.2027', 'target': '2027-03-07 00:00:00'} <class 'str'> <class 'str'>


In [21]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import load_dataset
import numpy as np
import os
from datetime import datetime, date

os.environ["TOKENIZERS_PARALLELISM"] = "false"

model_name = "t5-small"
tok = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def tokenize(batch):
    return tok(
        batch["prompt"],
        text_target=batch["target"],
        padding="max_length",
        truncation=True,
        max_length=64,
    )

tokenized = ds.map(tokenize, batched=True, remove_columns=["prompt","target"])
collator = DataCollatorForSeq2Seq(tokenizer=tok, model=model)


Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [22]:
import torch

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval().to(device)

def predict(texts, max_new_tokens=16, num_beams=4):
    enc = tok(texts, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        out_ids = model.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            early_stopping=True
        )
    return tok.batch_decode(out_ids, skip_special_tokens=True)

samples = [
    "What is the date '14.02.23'?",
    "Standardize this — 12月5日2003年",
    "I need September 30, 1987 in standard format",
    "How to write '31st of December 1999' in normalized form?",
    "Turn 'Aug 9, 2024' into ISO format",
]

preds = predict(samples)
for s, p in zip(samples, preds):
    print(f"IN : {s}\nOUT: {p}\n")

IN : What is the date '14.02.23'?
OUT: Was ist die Datum '14.02.23'?

IN : Standardize this — 12月5日2003年
OUT: Standardize this — 1252003

IN : I need September 30, 1987 in standard format
OUT: I need September 30, 1987 in standard format

IN : How to write '31st of December 1999' in normalized form?
OUT: Wie schreiben Sie '31. Dezember 1999' in normalisierter Form

IN : Turn 'Aug 9, 2024' into ISO format
OUT: Turn 'Aug 9, 2024' into ISO format.



In [9]:
args = TrainingArguments(
    output_dir="t5_date_norm",
    per_device_train_batch_size=16,
    learning_rate=5e-4,
    num_train_epochs=3,
    eval_strategy="no",
    save_strategy="epoch",
    weight_decay=0.01,
    logging_steps=50,
    fp16=True,
    remove_unused_columns=False
)

def compute_acc(eval_preds):
    preds, labels = eval_preds
    pred_text = tok.batch_decode(preds, skip_special_tokens=True)
    lab_text  = tok.batch_decode(np.where(labels!=-100, labels, tok.pad_token_id), skip_special_tokens=True)
    acc = np.mean([p.strip()==l.strip() for p,l in zip(pred_text, lab_text)])
    return {"exact_match": float(acc)}

trainer = Trainer(model=model, args=args, train_dataset=tokenized["train"], eval_dataset=tokenized["validation"],
                  tokenizer=tok, compute_metrics=compute_acc, data_collator=collator)
trainer.train()
trainer.save_model("t5_date_norm"); tok.save_pretrained("t5_date_norm")

  trainer = Trainer(model=model, args=args, train_dataset=tokenized["train"], eval_dataset=tokenized["validation"],
[34m[1mwandb[0m: Currently logged in as: [33mshakeman78[0m ([33mshakeman78-hse-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.8328
100,0.0337
150,0.0179
200,0.014
250,0.0104
300,0.008
350,0.0063
400,0.0062
450,0.0054
500,0.0051


('t5_date_norm/tokenizer_config.json',
 't5_date_norm/special_tokens_map.json',
 't5_date_norm/spiece.model',
 't5_date_norm/added_tokens.json',
 't5_date_norm/tokenizer.json')

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.eval().to(device)

def predict(texts, max_new_tokens=16, num_beams=4):
    enc = tok(texts, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        out_ids = model.generate(
            **enc,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            early_stopping=True
        )
    return tok.batch_decode(out_ids, skip_special_tokens=True)

samples = [
    "What is the date '14.02.23'?",
    "Standardize this — 12月5日2003年",
    "I need September 30, 1987 in standard format",
    "How to write '31st of December 1999' in normalized form?",
    "Turn 'Aug 9, 2024' into ISO format",
]

preds = predict(samples)
for s, p in zip(samples, preds):
    print(f"IN : {s}\nOUT: {p}\n")

IN : What is the date '14.02.23'?
OUT: 2023-02-14 00:00:00

IN : Standardize this — 12月5日2003年
OUT: 2003-01-12 00:00:00

IN : I need September 30, 1987 in standard format
OUT: 1987-09-30 00:00:00

IN : How to write '31st of December 1999' in normalized form?
OUT: 1999-12-31 00:00:00

IN : Turn 'Aug 9, 2024' into ISO format
OUT: 2024-08-09 00:00:00

