# **Install transformers**

In [1]:
# -- Install transformers and graphviz
!pip install transformers

Collecting transformers
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.36.0-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2025.11.3-cp313-cp313-win_amd64.whl.metadata (41 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.7.0-cp38-abi3-win_amd64.whl.metadata (4.2 kB)
Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
   ---------------------------------------- 0.0/12.0 MB ? eta -:--:--
   ----------------------- ---------------- 7.1/12.0 MB 35.4 MB/s eta 0:00:01
   ---------------------------------------- 12.0/12.0 MB 45.2 MB/s  0:00:00
Downloading huggingface_hub-0.36.0-py3-none-any.whl (566 kB)
   ---------------------------------------- 0.0/566.1 

# **TextGen_GreedySearch**

1.   Input text
2.   Language model
3.   Inner probability distrubution
4.   Decoding algorithm : Greedy search
5.   Generate output text



In [2]:
# -- Importation : torch, numpy, GPT2Tokenizer, GPT2LMHeadModel
import torch
import numpy as np
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# -- A. Creat Main Class (mytokenizer) and (mymodel)
mytokenizer = GPT2Tokenizer.from_pretrained("gpt2")
mymodel = GPT2LMHeadModel.from_pretrained("gpt2")

# -- B. The input text (text) and max length of generated_text (nb_generated_tokens=length-length(input_text))
input_text = "The Olmec colossal heads are at least seventeen monumental stone representations of human heads sculpted from large basalt boulders. The heads date from at least before"
print(f"-0 >>  input_text: {input_text}\n")
length=128

# -- C. Encoding: 1. 'mytokenizer' translates each token in the input text (text) into a corresponding token ID (input_ids).
input_ids = mytokenizer.encode(input_text, return_tensors='pt')
print(f"-1 >>  input_ids: {input_ids}\n")

# -- C. Encoding: 2. generate output_ids sequence until the output length (which includes the context length)
output_ids = mymodel.generate(input_ids, max_length=length)
print(f"-2 >>  output_ids: {output_ids}\n")

# -- C. Encoding: 3. generate transition scores for each token generated with 'Greedy Search'
output_ids_ss = mymodel.generate(input_ids, max_length=length, return_dict_in_generate=True, output_scores=True)
print(f"-3 >>  output_ids_ss: {output_ids_ss}\n")
#output_ids_scores = mymodel.compute_transition_scores(output_ids_ss.sequences, output_ids_ss.scores, normalize_logits=True)
#print(f"-4 >>  output_ids_scores: {output_ids_scores}\n")

# -- C. Decoding: 1. Print the scores for each token generated with Greedy Search

###--inputs_length = the length of the input prompt, for decoder-only models / =1 --> for encoder-decoder models
inputs_length = 1 if mymodel.config.is_encoder_decoder else input_ids.shape[1]
generated_tokens = output_ids_ss.sequences[:, inputs_length:]

###--Print the scores for each token generated with Greedy Search
print(f"-5 >>  generated_tokens")
print(f"| token_ids | token string | logits | probability")
#for tok, score in zip(generated_tokens[0], output_ids_scores[0]):
#    print(f"| {tok:9d} | {mytokenizer.decode(tok):12s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")

# -- C. Decoding: 2. generate output_text which includes the input_text
generated_text = mytokenizer.decode(output_ids[0])
print(f"\n-6 >>  Generated text: {generated_text}")


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to ob

-0 >>  input_text: The Olmec colossal heads are at least seventeen monumental stone representations of human heads sculpted from large basalt boulders. The heads date from at least before

-1 >>  input_ids: tensor([[  464,  6544,    76,   721, 41197,  6665,   389,   379,  1551, 38741,
         36364,  7815, 24612,   286,  1692,  6665, 14747,   276,   422,  1588,
          1615,  2501, 47069,   364,    13,   383,  6665,  3128,   422,   379,
          1551,   878]])



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


-2 >>  output_ids: tensor([[  464,  6544,    76,   721, 41197,  6665,   389,   379,  1551, 38741,
         36364,  7815, 24612,   286,  1692,  6665, 14747,   276,   422,  1588,
          1615,  2501, 47069,   364,    13,   383,  6665,  3128,   422,   379,
          1551,   878,   262,   717, 39210, 11843,    13,   383,  1182,   286,
           262,  6544,    76,   721,   318,   257,  1588,    11, 41186,    12,
         16760,  1182,   351,   257,   890,    11,  6235,    11,  6235,    11,
           290,  6235,  1182,    13,   383,  1182,   318,   257,  1588,    11,
         41186,    12, 16760,  1182,   351,   257,   890,    11,  6235,    11,
           290,  6235,  1182,    13,   383,  1182,   318,   257,  1588,    11,
         41186,    12, 16760,  1182,   351,   257,   890,    11,  6235,    11,
           290,  6235,  1182,    13,   383,  1182,   318,   257,  1588,    11,
         41186,    12, 16760,  1182,   351,   257,   890,    11,  6235,    11,
           290,  6235,  1182,    