In [2]:
import torch
import numpy as np
import warnings
warnings.filterwarnings("ignore")

Pipeline

In [None]:
from transformers import pipeline

generator = pipeline(model="gpt2", device='cuda')
generator("Q: Who is the king of the jungle?\nA:")

Tokenizer

In [23]:
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained('gpt2')
encoding = tok('The biggest non-biodegradable waste is plastic')
print(encoding)
encoding.word_ids()

{'input_ids': [464, 4094, 1729, 12, 8482, 1098, 9744, 540, 7030, 318, 7309], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


[0, 1, 2, 3, 4, 4, 4, 4, 5, 6, 7]

In [17]:
tok.is_fast # These are fast tokenizers by default

True

Auto Model

In [21]:
from transformers import AutoModel 
# AutoModel only returns raw hidden states, and doesn't include the functionality for text generation.
model = AutoModel.from_pretrained('gpt2')

In [None]:
model

In [None]:
inputs = tok(['hello','bye'], padding=False, truncation=True, return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)

In [None]:
inputs

In [None]:
dir(outputs)

In [None]:
outputs['last_hidden_state'].shape

In [None]:
inputs = tok(['what is your name?','how old are you?'], padding=False, truncation=True, return_tensors="pt")
print(inputs)
out = model(**inputs)
# tok.decode(out['last_hidden_state'])

Running a model on a custom input

In [19]:
from transformers import AutoTokenizer, AutoModelForCausalLM 
import torch

tokenizer = AutoTokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained('gpt2') # 
sequence = "What is your name?"
model_inputs = tokenizer(sequence, return_tensors='pt')
model_inputs['input_ids']

tensor([[2061,  318,  534, 1438,   30]])

In [28]:
tokens = tokenizer.tokenize(sequence, padding=True)
ids = tokenizer.convert_tokens_to_ids(tokens)
ids, tokenizer.decode(ids), tokens

([2061, 318, 534, 1438, 30],
 'What is your name?',
 ['What', 'Ġis', 'Ġyour', 'Ġname', '?'])

In [22]:
print(tokenizer.decode(ids))

What is your name?


In [25]:
output = model.generate(
    input_ids=model_inputs['input_ids'], 
    max_length=50,  # Adjust max length as needed
    num_beams=5,    # Beam search for better results, can set to 1 for greedy decoding
    no_repeat_ngram_size=2,
    early_stopping=True
)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [27]:
tokenizer.decode(list(output[0]))

"What is your name?\n\nMy name is John. I'm a guy who's been doing this for a long time, and I've always wanted to be a writer. So I decided to write a book about my life. It's called"

Fine tuning

In [4]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

raw_datasets = load_dataset("glue", "mrpc")
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation=True)


tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Generating train split: 100%|██████████| 3668/3668 [00:00<00:00, 117395.70 examples/s]
Generating validation split: 100%|██████████| 408/408 [00:00<00:00, 215661.76 examples/s]
Generating test split: 100%|██████████| 1725/1725 [00:00<00:00, 607589.39 examples/s]
Map: 100%|██████████| 3668/3668 [00:00<00:00, 25873.61 examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 25281.08 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 29730.09 examples/s]


In [5]:
from transformers import TrainingArguments

training_args = TrainingArguments("test-trainer")

In [6]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [9]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33marjun_g_ravi[0m ([33meurekabotics[0m). Use [1m`wandb login --relogin`[0m to force relogin


 36%|███▋      | 500/1377 [00:20<00:35, 24.89it/s]

{'loss': 0.5278, 'grad_norm': 10.36193561553955, 'learning_rate': 3.184458968772695e-05, 'epoch': 1.09}


 56%|█████▌    | 769/1377 [00:33<00:25, 24.13it/s]

KeyboardInterrupt: 

 56%|█████▌    | 771/1377 [00:49<00:25, 24.13it/s]

In [10]:
predictions = trainer.predict(tokenized_datasets["validation"])
print(predictions.predictions.shape, predictions.label_ids.shape)



(408, 2) (408,)


In [11]:
import numpy as np

preds = np.argmax(predictions.predictions, axis=-1)
preds

array([1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,

In [14]:
import evaluate

metric = evaluate.load("glue", "mrpc")
metric.compute(predictions=preds, references=predictions.label_ids)

Downloading builder script: 100%|██████████| 5.75k/5.75k [00:00<00:00, 13.6MB/s]


{'accuracy': 0.8480392156862745, 'f1': 0.8973509933774835}

True