In [1]:
# Step 1: Load GPT-2 Model
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.eval()


  from .autonotebook import tqdm as notebook_tqdm


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [2]:
# Step 2: Design 5 Prompts (Different Styles)
prompts = {
    "Direct": "Write a motivational quote about overcoming fear.",
    "Scenario": "Imagine you’re helping a friend who failed a test. Write something encouraging.",
    "Persona": "As a wise monk, write a quote about inner strength.",
    "Keyword": "Using the words 'growth', 'struggle', and 'hope', write something inspiring.",
    "Conversational": "User: I feel like giving up.\nGPT-2:"
}


In [3]:
# Step 3: Generate 3 Outputs per Prompt
def generate_outputs(prompt, num_outputs=3, max_length=50):
    outputs = []
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    
    for _ in range(num_outputs):
        with torch.no_grad():
            output = model.generate(input_ids, max_length=max_length, do_sample=True, top_k=50, top_p=0.95)
        decoded = tokenizer.decode(output[0], skip_special_tokens=True)
        outputs.append(decoded[len(prompt):].strip())
    return outputs

gpt2_outputs = {ptype: generate_outputs(ptext) for ptype, ptext in prompts.items()}


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask

In [4]:
# Step 4: Human-Written Reference
reference_text = "The struggle you’re in today is developing the strength you need for tomorrow."
reference_list = [reference_text] * 3  # Reuse for comparison


In [5]:
# Step 5: Evaluate Using BERTScore
from bert_score import score

bert_scores = []

for ptype, outputs in gpt2_outputs.items():
    P, R, F1 = score(outputs, reference_list, lang="en", verbose=True)
    for i, f1_score in enumerate(F1):
        bert_scores.append({
            "Prompt Type": ptype,
            "Output #": i + 1,
            "Generated Text": outputs[i],
            "BERTScore F1": f1_score.item()
        })


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  2.00it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 333.41it/s]


done in 0.51 seconds, 5.89 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.94it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 500.51it/s]


done in 0.52 seconds, 5.72 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.69it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 499.74it/s]


done in 0.60 seconds, 4.98 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.83it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 333.38it/s]


done in 0.55 seconds, 5.41 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 1/1 [00:00<00:00,  1.89it/s]


computing greedy matching.


100%|██████████| 1/1 [00:00<00:00, 336.49it/s]

done in 0.54 seconds, 5.60 sentences/sec





In [6]:
# Step 6: Display Results Table
import pandas as pd

results_df = pd.DataFrame(bert_scores)
print(results_df[['Prompt Type', 'Output #', 'BERTScore F1']])


       Prompt Type  Output #  BERTScore F1
0           Direct         1      0.819763
1           Direct         2      0.842310
2           Direct         3      0.845307
3         Scenario         1      0.836029
4         Scenario         2      0.835904
5         Scenario         3      0.840152
6          Persona         1      0.839988
7          Persona         2      0.835569
8          Persona         3      0.839448
9          Keyword         1      0.840682
10         Keyword         2      0.844775
11         Keyword         3      0.838593
12  Conversational         1      0.834269
13  Conversational         2      0.826776
14  Conversational         3      0.829342


In [7]:
# Save outputs if needed
results_df.to_csv("bert_score_results.csv", index=False)

with open("all_outputs.txt", "w") as f:
    for ptype, outputs in gpt2_outputs.items():
        f.write(f"{ptype} Prompt: {prompts[ptype]}\n")
        for i, out in enumerate(outputs, 1):
            f.write(f"Output {i}: {out}\n")
        f.write("\n")

with open("human_reference.txt", "w") as f:
    f.write(reference_text + "\n")
