In [77]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub id of the summarization checkpoint used by every later cell.
# "google/t5-efficient-tiny" used to be assigned here and was overwritten on
# the very next line, so only the effective value is kept; swap the id below
# to experiment with that model instead.
checkpoint = "shorecode/t5-efficient-tiny-summarizer-general-purpose"

# The tokenizer and the seq2seq model are both resolved from the same id.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [2]:
import pandas as pd
# Load the spreadsheet into a DataFrame; later cells read its `description`
# and `best_fit` columns. The path is relative to the notebook's working dir.
df = pd.read_excel('jobs_w_skills.xlsx')

In [4]:
# First description (by position), kept for ad-hoc inspection.
# Note: `text` is rebound in the inference section further down.
text = df.description.iloc[0]

In [3]:
import pandas as pd
import datasets
from datasets import Dataset, DatasetDict


# Fixed seed so the sampled rows (and therefore every metric computed from
# them) are identical after Restart Kernel -> Run All.
SAMPLE_SEED = 42

# Training rows are drawn from the first 900 rows (by position) and validation
# rows from the remainder, so the two 50-row samples cannot share a row.
tdf = df[['description', 'best_fit']].iloc[:900].sample(50, random_state=SAMPLE_SEED)
vdf = df[['description', 'best_fit']].iloc[900:].sample(50, random_state=SAMPLE_SEED)
tds = Dataset.from_pandas(tdf)
vds = Dataset.from_pandas(vdf)


# Bundle both splits under the names the trainer cell looks up later.
ds = DatasetDict()

ds['train'] = tds
ds['validation'] = vds

print(ds)

DatasetDict({
    train: Dataset({
        features: ['description', 'best_fit', '__index_level_0__'],
        num_rows: 50
    })
    validation: Dataset({
        features: ['description', 'best_fit', '__index_level_0__'],
        num_rows: 50
    })
})


In [10]:
prefix = "summarize: "


def preprocess_function(examples):
    """Tokenize a batch of rows into model inputs plus target label ids.

    Args:
        examples: a batch mapping with a "description" column (source texts)
            and a "best_fit" column (target texts).

    Returns:
        The tokenizer output for the prefixed descriptions (truncated to 1024
        tokens) with an added "labels" entry holding the token ids of the
        targets (truncated to 128 tokens).
    """
    # Prepend the task prefix to every source document.
    sources = []
    for description in examples["description"]:
        sources.append(prefix + description)

    encoded = tokenizer(sources, max_length=1024, truncation=True)

    # `text_target=` tokenizes the strings as decoder targets.
    targets = tokenizer(
        text_target=examples["best_fit"],
        max_length=128,
        truncation=True,
    )
    encoded["labels"] = targets["input_ids"]

    return encoded

In [5]:
# Tokenize every split of `ds`. With batched=True, preprocess_function is
# called with a batch of rows (column name -> list of values) per call.
tokenized_dataset = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [6]:
from transformers import DataCollatorForSeq2Seq

# Pads inputs and labels per batch at collation time.
# `model` must be the model object, not the checkpoint string: the collator
# only prepares `decoder_input_ids` when the object it receives exposes
# `prepare_decoder_input_ids_from_labels`, which a plain string never does.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [7]:
import evaluate

# ROUGE metric object; compute_metrics calls `rouge.compute(...)` on the
# decoded predictions and references.
rouge = evaluate.load("rouge")

In [11]:
import numpy as np

def compute_metrics(eval_pred):
    """Score generated sequences against reference labels with ROUGE.

    Args:
        eval_pred: a pair ``(predictions, labels)`` of token-id arrays.

    Returns:
        A dict of the ROUGE results plus ``gen_len`` (mean number of non-pad
        tokens per prediction), every value rounded to 4 decimals.
    """
    pad_id = tokenizer.pad_token_id
    raw_preds, raw_labels = eval_pred

    # Swap the -100 sentinel for the pad id before decoding (presumably -100
    # marks ignored/padded positions — it is not a real vocabulary id).
    pred_ids = np.where(raw_preds != -100, raw_preds, pad_id)
    label_ids = np.where(raw_labels != -100, raw_labels, pad_id)

    pred_texts = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_texts = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    scores = rouge.compute(
        predictions=pred_texts,
        references=label_texts,
        use_stemmer=True,
    )

    # Generation length = count of non-pad tokens in each predicted row.
    token_counts = [np.count_nonzero(row != pad_id) for row in pred_ids]
    scores["gen_len"] = np.mean(token_counts)

    rounded = {}
    for metric_name, value in scores.items():
        rounded[metric_name] = round(value, 4)
    return rounded

In [12]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Rebind `model` to a freshly loaded copy of the checkpoint, so the trainer
# below starts from the published weights rather than from whatever state an
# earlier `model` object was left in.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# Training configuration. Checkpoints and logs are written under `output_dir`;
# evaluation (and therefore compute_metrics) runs once per epoch.
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    fp16=False, #change to bf16=True for XPU
    push_to_hub=False,
    # `no_cuda=True` is deprecated; `use_cpu=True` is its replacement and
    # likewise forces training onto the CPU.
    use_cpu=True,
)

# Wire the model, tokenized splits, collator and metric function together.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


# Inference

In [70]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# NOTE(review): this loads the hub checkpoint again, so the generation cells
# below run on those weights rather than on the model trained by `trainer`
# above. Use `trainer.model` here if the fine-tuned model is what should be
# evaluated — confirm intent.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [78]:
# Same task prefix string that preprocess_function prepends to its inputs.
prefix = "summarize: "


# Prompt for a single row (position 5); this rebinds the earlier `text`.
text = prefix + df.description.iloc[5]

In [79]:
# Tokenize the prompt into PyTorch tensors and keep only the token ids
# (the attention mask is not carried forward). No truncation is requested.
inputs = tokenizer(text, return_tensors="pt").input_ids

In [84]:
# Generate with sampling disabled, capped at 300 newly generated tokens.
outputs = model.generate(inputs, max_new_tokens=300, do_sample=False)

In [81]:
# Display the raw generated token ids.
outputs

tensor([[    0,   419,  2244,     3, 18180,  2894,  4901,  5190,  4804,    89,
            17,   852,  7539,    12,  1715,     8,   372,    16,  1844,  8491,
             6,  4699,     5,  6760,  4280,    19,  1710,    69,  8521, 12262,
            11, 16857,     3,   935,   524,   232,  4890,   372,    55,   100,
           419,  2244,     3, 18180,  2894,  4901,   613,    56,    36,   294,
            13,     3,     9,   372, 16915,     8,  1096,     7,    13,   600,
          1367, 10481,     5,   216,    87,     7,    88,    56,    36,   464,
          8521,     7,     6,   335,  6218,     3,    18,   489,  4815,     6,
           740,     6,     3, 30221,    11,  9873, 29741,    11, 14613,     5,
         10476,    28,     3,     9,  2458,    16,     3,   935,   524,   232,
          4890]])

In [85]:
# Turn the first generated sequence back into text, dropping special tokens.
tokenizer.decode(outputs[0], skip_special_tokens=True)

'Reset Merchandiser Night Shift Now hiring to join the team in Post Falls, ID. SPAR is growing our overnight reset and remodel merchandising team! This Reset Merchandiser job will be part of a team remodeling the insides of big box retailers. He/she will be working overnights, 10PM - 7AM, building, assembling and installing shelving and fixtures. Anyone with a background in merchandising resets or remodels, installer, construction, or general labor, who could be a tool bag, who can be a tool bag,'

In [86]:
# Display the full prompt for comparison with the generated text above.
text

'summarize: Reset Merchandiser Night Shift\n\nNow hiring to join the team in Post Falls, ID.\n\nSPAR is growing our overnight reset and remodel merchandising team! This Reset Merchandiser job will be part of a team remodeling the insides of big box retailers. He/she will be working overnights, 10PM - 7AM, building, assembling and installing shelving and fixtures. Anyone with a background in merchandising resets or remodels, installer, construction, or general labor, this could be a perfect fit for you. The ideal merchandiser is; self-sufficient, highly motivated, knows their way around a tool bag, and have the drive to produce high quality results in a fast paced environment.\n\nJoin the best reset/ remodel merchandiser team in the business and APPLY TODAY!\n\nProjects include category resets and fixture installation.\n\nWhen One Project Ends, Another STARTS\n\nWhat We Offer:• Great TEAM\n• Ongoing project work – long term work\n• 10PM - 7AM Sunday -Thursday\n• Competitive pay\n• Daily