In [1]:
!pip install torch transformers datasets
!pip install accelerate -U
!pip install -U transformers





In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, TrainingArguments, Trainer, DataCollatorWithPadding
from datasets import load_dataset
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
max_token = 128
model_name = "gpt2"

## Preprocessing

In [3]:
# Instantiate the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [4]:
# Tokenization functions
def dialog_to_string(dialog: list[str]) -> str:
    formatted_dialogue = '<User> '+dialog[0]+'\n<Assistant> '+dialog[1]
    
    return formatted_dialogue

def tokenize_function(row):
    row["dialog"] = dialog_to_string(row["dialog"])
    return tokenizer(row["dialog"], max_length=max_token, truncation=True)

def is_shorter_than_max_token(row):
    """
    Return if a given row has more than max_token number of tokens
    """
    return len(row['input_ids']) <= max_token

In [5]:
# Load and tokenize dataset
dataset = load_dataset("daily_dialog")
dataset = dataset.map(tokenize_function)

# Filter the dataset
dataset = dataset.filter(is_shorter_than_max_token)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

## Training

In [6]:
# Instantiate the model
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [25]:
training_args = TrainingArguments(
    output_dir="output_dir",
    per_device_train_batch_size=25,
    gradient_accumulation_steps=20,
    num_train_epochs=20,
    learning_rate=2e-4,
    fp16=True,
    evaluation_strategy="epoch",
    warmup_steps=500,
    weight_decay=1e-3,
)

trainer = Trainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer
)
trainer.train()

Epoch,Training Loss,Validation Loss
0,No log,2.104626
1,No log,2.105188
2,No log,2.106166
4,No log,2.130331


KeyboardInterrupt: 

## Inference

In [30]:
def talk_with_llm(tweet: str) -> str:
    # Encode and move tensor into cuda if applicable.
    encoded_input = tokenizer(tweet, return_tensors='pt')
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    
    output = model.generate(**encoded_input, max_new_tokens=max_token, num_return_sequences=1)
    response = tokenizer.decode(output[0], skip_special_tokens=False)
    truncated_parts = response.split('\n', 2)[:2]
    response = '\n'.join(truncated_parts)
    return response

In [46]:
talk_with_llm("<User> Can you help me pick up my kids from school today? I'll need to run to a dentist appoinment. \n<Assistant>")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"<User> Can you help me pick up my kids from school today? I'll need to run to a dentist appoinment. \n<Assistant>  Sure. Where do you live? "