In [2]:
import torch

from transformers import AutoTokenizer
from transformers import GPT2LMHeadModel


def sample_from_model(text, num_samples=10, max_new_tokens=10, min_new_tokens=2, top_p=0.95):
    """Sample continuations of ``text`` from the language model and print them.

    Relies on the module-level ``tok`` (tokenizer) and ``model`` objects, which
    must be created before this function is called.

    Background on decoding strategies:
      * guide: https://huggingface.co/blog/how-to-generate
      * paper: https://arxiv.org/pdf/1904.09751.pdf
    Sampling is tricky and still very much an open problem.

    Args:
        text: prompt that every sample starts from; must encode to at least
            one token.
        num_samples: how many independent continuations to draw.
        max_new_tokens: upper bound on the number of tokens generated after
            the prompt.
        min_new_tokens: lower bound on the number of tokens generated after
            the prompt.
        top_p: nucleus-sampling threshold; only the most likely tokens whose
            cumulative probability reaches this value are kept.

    Returns:
        None. Each sample is printed as ``"<index>: <decoded text>"``.

    Raises:
        ValueError: if ``text`` encodes to zero tokens.

    Note:
        The output is random. Call ``torch.manual_seed(...)`` beforehand if
        reproducible samples are needed.
    """
    token_ids = tok.encode(text)
    if not token_ids:
        # generate() needs at least one prompt token; fail with a clear message
        # instead of an opaque shape/dtype error further down.
        raise ValueError("sample_from_model needs a prompt that encodes to at least one token")

    # shape (1, prompt_len): a batch holding the single prompt
    generated = torch.tensor(token_ids).unsqueeze(0)
    # keep the input on the same device as the model weights
    generated = generated.to(model.device)
    # a single, unpadded prompt: every position is a real token
    attention_mask = torch.ones_like(generated)
    prompt_len = generated.shape[1]

    # sample model
    sample_outputs = model.generate(
        # input seed; any pre-generation text we want to start with
        generated,

        # passed explicitly because it cannot be inferred when pad == EOS
        attention_mask=attention_mask,

        # just setting the pad token to the EOS token for sampling
        pad_token_id=tok.eos_token_id,

        # sample text using probabilities as opposed to greedy sampling / beam search
        do_sample=True,

        # higher temperature = sampling rarer words/tokens; low temperature = more conservative sample
        # temperature=0.9,

        # cap on the number of tokens that will be generated (prompt included)
        max_length=prompt_len + max_new_tokens,

        # forcing the model to generate at least `min_new_tokens` more tokens
        min_length=prompt_len + min_new_tokens,

        # sampling modalities
        # top_k only retains the top k tokens and randomly samples from those
        # top_k=200,

        # top p = nucleus sampling
        # defined in terms of probabilities...
        # only retains the top words until the specified probability region is reached
        top_p=top_p,

        # how many samples to generate
        num_return_sequences=num_samples,

        # params for beam search
        # number of hypotheses to evaluate in parallel
        # num_beams=5,

        # penalty for repeating ngrams in beam search
        # no_repeat_ngram_size=2,

        # quits beam search early if all beams have hit an EOS token
        # early_stopping=True,
    )
    for i, sample_output in enumerate(sample_outputs):
        ox = tok.decode(sample_output, skip_special_tokens=True)
        out = "{}: {}".format(i, ox)
        print(out)

if __name__ == '__main__':
    # Load the language model from a local checkpoint directory and the stock
    # 'gpt2' tokenizer. Both names are module-level and are used by
    # sample_from_model() and by later cells.
    model = GPT2LMHeadModel.from_pretrained('model_unique_best/')
    tok = AutoTokenizer.from_pretrained('gpt2')
    # register the end-of-text token as the pad token
    tok.add_special_tokens({'pad_token': '<|endoftext|>'})

    # **********************
    # REPLACE SENTENCE BELOW
    test_txt = 'to operate a vehicle, controlling its motion Word: '
    # **********************

    tokens = tok.tokenize(test_txt)
    model_inp = tok(test_txt, return_tensors='pt')
    print('Num tokens:', len(tokens))

    # Pure inference: skip autograd bookkeeping so the returned tensors carry
    # no graph (less memory, and they can be converted to NumPy directly).
    with torch.no_grad():
        outputs = model(**model_inp, output_hidden_states=True)
    logits = outputs.logits
    print('Logits (aka language modeling predictions) shape:', logits.shape)

    # One tensor per entry: index 0 is the embedding output, the remaining
    # entries are the outputs of the successive layers.
    hidden_states = outputs.hidden_states
    print('Hidden states (aka latent/internal representation) tuple length:', len(hidden_states))

    initial_embed = hidden_states[0]
    print('Initial embedding representation (before contextualization) shape:', initial_embed.shape)

    final_rep = hidden_states[-1]
    print('Final representation from GPT-2 shape:', final_rep.shape)

    for ix, rep in enumerate(hidden_states):
        print('\tLayer', ix, 'representation shape:', rep.shape)

    #sample_from_model(test_txt)
    #sample_from_model('Example: I am running so fast')

Num tokens: 11
Logits (aka language modeling predictions) shape: torch.Size([1, 11, 50257])
Hidden states (aka latent/internal representation) tuple length: 13
Initial embedding representation (before contextualization) shape: torch.Size([1, 11, 768])
Final representation from GPT-2 shape: torch.Size([1, 11, 768])
	Layer 0 representation shape: torch.Size([1, 11, 768])
	Layer 1 representation shape: torch.Size([1, 11, 768])
	Layer 2 representation shape: torch.Size([1, 11, 768])
	Layer 3 representation shape: torch.Size([1, 11, 768])
	Layer 4 representation shape: torch.Size([1, 11, 768])
	Layer 5 representation shape: torch.Size([1, 11, 768])
	Layer 6 representation shape: torch.Size([1, 11, 768])
	Layer 7 representation shape: torch.Size([1, 11, 768])
	Layer 8 representation shape: torch.Size([1, 11, 768])
	Layer 9 representation shape: torch.Size([1, 11, 768])
	Layer 10 representation shape: torch.Size([1, 11, 768])
	Layer 11 representation shape: torch.Size([1, 11, 768])
	Layer 12 

In [5]:
# You'll likely want to use information from layer 0 or layer 12.

# Index 0 is the initial embedding representation (before contextualization),
# not the output of a transformer layer. Shape here: (1, 11, 768).
hidden_states[0]

tensor([[[-0.0266, -0.4240,  0.1520,  ...,  0.0091,  0.1409,  0.0453],
         [ 0.2042, -0.0241,  0.0055,  ...,  0.0604, -0.1768,  0.1519],
         [-0.0492, -0.0615,  0.0961,  ...,  0.0639,  0.0917, -0.0516],
         ...,
         [-0.2003, -0.0107,  0.3483,  ...,  0.0400, -0.0633, -0.2089],
         [ 0.0466, -0.1890,  0.1468,  ...,  0.0198, -0.0906, -0.0346],
         [ 0.1001, -0.0854,  0.1697,  ..., -0.0434, -0.1635,  0.1311]]],
       grad_fn=<AddBackward0>)

In [6]:
# Get the last layer's encoding (index -1, i.e. entry 12 of the 13-entry tuple):
hidden_states[-1]

tensor([[[ 0.2599, -1.1141, -0.3229,  ...,  0.0250, -0.1107,  0.0189],
         [-0.3212, -0.4170, -1.9204,  ...,  0.3450,  0.0714,  0.1612],
         [-0.1505,  0.2780,  0.7379,  ...,  0.0901, -0.2187, -0.2929],
         ...,
         [-1.1537,  0.6237, -1.5018,  ..., -0.3143, -0.2512,  0.3086],
         [ 0.0774, -0.4346, -0.2052,  ...,  0.1735, -0.0155,  0.7894],
         [-0.3195, -0.2111, -0.0912,  ...,  0.8415,  0.0648,  0.1905]]],
       grad_fn=<ViewBackward0>)