In [3]:
from transformers import AutoModel, AutoTokenizer, AutoModelForCausalLM, EncoderDecoderModel, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


## Text Generation


### GPT2


In [2]:
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [3]:
prompt = "The future of AI is "
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

In [4]:
gen_tokens = model.generate(
    input_ids,
    max_length=20,
    num_return_sequences=1,
    do_sample=True,
    temperature=0.9,
)
gen_text = tokenizer.batch_decode(gen_tokens)[0]
print(gen_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


The future of AI is vernacularly called 'the next chapter' or 'the next stage


### BertGeneration model (BERT based model)


In [5]:
sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")

Config of the encoder: <class 'transformers.models.bert_generation.modeling_bert_generation.BertGenerationEncoder'> is overwritten by shared encoder config: BertGenerationConfig {
  "architectures": [
    "BertGenerationDecoder"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": null,
  "directionality": "bidi",
  "eos_token_id": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert-generation",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "return_dict": false,
  "torch_dtype": "float32",
  "transformers_version": "4.49.0",
  "use_cache": true,
  "vocab_size": 50358
}

Config of the decoder: <class 'transformers.models.bert_generation.modeling_bert_generation.BertGenerationDecoder'> is overwritten by

In [6]:
input_ids = tokenizer(
    prompt, add_special_tokens=False, return_tensors="pt"
).input_ids
outputs = sentence_fuser.generate(input_ids)
print(tokenizer.decode(outputs[0]))

<s>The future of AI is the future.</s>


## Visualize Attention Weights using `bertviz`


### GPT2


In [7]:
model = AutoModel.from_pretrained("gpt2", output_attentions=True)
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [8]:
# Run the model to get the attention weights (not using generate here)
inputs = tokenizer.encode(prompt, return_tensors='pt')
outputs = model(inputs)
attention = outputs[-1]



In [9]:
tokens = tokenizer.convert_ids_to_tokens(inputs[0])
head_view(attention, tokens)
model_view(attention, tokens)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### BERT


In [10]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased", output_attentions=True)

In [11]:
inputs = tokenizer.encode(prompt, return_tensors='pt')
outputs = model(inputs)
attention = outputs[-1]
tokens = tokenizer.convert_ids_to_tokens(inputs[0]) 



In [12]:
head_view(attention, tokens)
model_view(attention, tokens)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Fine-Tune Models

In [4]:
dataset = load_dataset("yelp_review_full")
dataset["train"][100]

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

### GPT2

In [5]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [6]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(10))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(10))

In [7]:
model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=5, torch_dtype="auto")
training_args = TrainingArguments(output_dir="test_trainer")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
metric = evaluate.load("accuracy")

In [9]:
training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch", num_train_epochs=2)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

RuntimeError: MPS backend out of memory (MPS allocated: 17.72 GB, other allocations: 408.69 MB, max allowed: 18.13 GB). Tried to allocate 96.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

### BERT

In [10]:
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

In [11]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(10))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(10))

In [12]:
model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5, torch_dtype="auto")
training_args = TrainingArguments(output_dir="test_trainer")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
metric = evaluate.load("accuracy")

Using the latest cached version of the module from /Users/fernport/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Tue Mar 18 00:35:47 2025) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.


In [15]:
training_args = TrainingArguments(output_dir="test_trainer", eval_strategy="epoch", num_train_epochs=2)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.539728,0.3
2,No log,1.520594,0.3


TrainOutput(global_step=4, training_loss=1.6320981979370117, metrics={'train_runtime': 541.1323, 'train_samples_per_second': 0.037, 'train_steps_per_second': 0.007, 'total_flos': 5262362849280.0, 'train_loss': 1.6320981979370117, 'epoch': 2.0})

## Conclusion

GPT-2 performed better at text completion compared to the BERT model.

GPT-2 and BERT are built for different tasks:

- GPT-2 is awesome at generating text. It predicts the next word in a sequence, so it’s great for writing or continuing prompts.
- BERT (or the pre-trained model based on BERT) is better at understanding context. It’s perfect for tasks like classification and question answering, but not really for generating text.

Based on the training aspect it was not possible to fine-tune GPT2 with my computer. It was only possible to train BERT up to 10 eposch and 50 entries of data. BERT seemed to perform well up to 7 then it proceeded to degrade. Overall with the low testing samples and low epoch the highest level of accurary was 30.