# Analysing pre-trained language models with topic models

In [1]:
# Library imports
import numpy as np
import torch
from transformers import set_seed
from transformers import OpenAIGPTLMHeadModel, GPT2LMHeadModel, GPTNeoForCausalLM, TransfoXLLMHeadModel
from transformers import OpenAIGPTTokenizer, GPT2Tokenizer, TransfoXLTokenizer
from gensim.corpora.textcorpus import TextCorpus

# Seed for reproducibility (seeds the random generators used during sampling)
set_seed(42)

# Tensorflow or Pytorch
platform = "pt"     # "tf" but not configured for that

# Use GPU or CPU
use_gpu = False
# NOTE(review): this derives the new thread count from the current one, so
# re-running this cell without restarting the kernel keeps increasing it.
torch.set_num_threads(torch.get_num_threads()*2-1)
# CUDA is only selected when it is available AND explicitly requested above.
device = "cuda" if torch.cuda.is_available() and use_gpu else "cpu"

# params
# Number of documents to generate. Kept as an int (not the float 1e6) because
# it is a count that is handed to the generation call as a number of sequences.
corpus_size = 1_000_000   # depending on the number of topics created with lda

### Generating documents from selected pre-trained language models

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM

def create_corpus(
        corpus_size=1,
        model_name="openai-gpt",
        max_document_length=None,
        tokenizer_model=AutoTokenizer,
        lm_model=AutoModelForCausalLM,
        device="cpu",
        batch_size=None,
    ):
    r"""
    Generates sequences/documents/a corpus for models with a language modeling head.

    The documents are sampled unconditionally (no prompt is passed to `generate`) with
    every sampling modifier (top-k, top-p, temperature, repetition penalties, beam search)
    switched off. The first document is printed as a preview and all documents are returned.

    Parameters:
        corpus_size (`int`, *optional*, defaults to 1):
            The corpus size to be generated (number of documents). Integral floats such as
            `1e6` are accepted and converted to `int`.
        model_name (`str`, *optional*, defaults to "openai-gpt"):
            The model name of the pre-trained model, e.g. openai-gpt, gpt2, gpt2-medium, gpt2-large,
            gpt2-xl, transfo-xl-wt103, EleutherAI/gpt-neo-2.7B, ctrl
        max_document_length (`int`, *optional*, defaults to None):
            The maximum document length passed to `generate` as `max_length`. When None,
            `tokenizer.model_max_length` is used.
        tokenizer_model (`PreTrainedTokenizer`, *optional*, defaults to AutoTokenizer):
            The pre-trained tokenizer class
        lm_model (`PreTrainedModel`, *optional*, defaults to AutoModelForCausalLM):
            The pre-trained model class with language modeling head
        device (`str`, *optional*, defaults to "cpu"):
            The device the computations commence "cpu" or "cuda"
        batch_size (`int`, *optional*, defaults to None):
            The number of documents sampled per `generate` call. None generates the whole
            corpus in a single call.

    Returns:
        `list` of `str`: the decoded documents, one entry per generated sequence.

    Raises:
        ValueError: if `corpus_size` is not a positive whole number or `batch_size` is
            given but smaller than 1.
    """

    # `num_return_sequences` must be an int; accept values like 1e6 but reject 2.5 or 0.
    if corpus_size != int(corpus_size) or corpus_size < 1:
        raise ValueError(f"corpus_size must be a positive whole number, got {corpus_size!r}")
    corpus_size = int(corpus_size)

    if batch_size is None:
        batch_size = corpus_size
    elif batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size!r}")
    batch_size = int(batch_size)

    tokenizer = tokenizer_model.from_pretrained(model_name)
    model = lm_model.from_pretrained(model_name)

    # NOTE(review): some tokenizers may report a very large placeholder for
    # model_max_length when no limit is configured -- confirm per model and pass
    # max_document_length explicitly in that case.
    max_document_length = max_document_length if max_document_length is not None else tokenizer.model_max_length

    print(f"EOS: {tokenizer.eos_token} | BOS: {tokenizer.bos_token} | UNK: {tokenizer.unk_token}")
    # Fall back step by step for tokenizers that do not define EOS and/or BOS.
    eos_token_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else tokenizer.unk_token_id
    bos_token_id = tokenizer.bos_token_id if tokenizer.bos_token_id is not None else eos_token_id

    model = model.to(device)

    documents = []
    remaining = corpus_size
    while remaining > 0:
        current_batch = min(batch_size, remaining)
        encoded_output = model.generate(
            # all parameters have to be set as otherwise the config of the pretrained model will be taken
            max_length=max_document_length,
            do_sample=True,
            early_stopping=False,
            num_beams=1,                            # 1 deactivates beam_search
            temperature=1.0,                        # 1.0 deactivates temperature
            top_k=0,                                # 0 deactivates top_k sampling
            top_p=1.0,                              # 1.0 deactivates top_p sampling
            repetition_penalty=1.0,                 # 1.0 deactivates repetition_penalty
            pad_token_id=eos_token_id,              # For open-end generation set to eos_token_id
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            length_penalty=1.0,                     # 1.0 deactivates length_penalty
            no_repeat_ngram_size=0,                 # 0 deactivates no_repeat_ngram_size
            encoder_no_repeat_ngram_size=0,         # 0 deactivates encoder_no_repeat_ngram_size
            num_return_sequences=current_batch,     # The number of independently computed returned sequences for each element in the batch. No input means batch size of one.
            num_beam_groups=1,
            output_scores=False,                    # Will be important if you want the prediction scores!
        )
        # Decode every returned sequence, not only the first one.
        documents.extend(tokenizer.batch_decode(encoded_output, skip_special_tokens=True))
        remaining -= current_batch

    # Preview only the first document so that large corpora do not flood the output.
    print(documents[0])
    # TODO: persist the corpus, e.g. TextCorpus.save_corpus("./data/corpus-"+model_name)
    return documents


In [3]:
# Sample a single document with max_document_length=10 from the "openai-gpt" checkpoint.
create_corpus(
    corpus_size=1,
    model_name="openai-gpt",
    max_document_length=10,
    tokenizer_model=OpenAIGPTTokenizer,
    lm_model=OpenAIGPTLMHeadModel,
    device=device,
)

Some weights of OpenAIGPTLMHeadModel were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using eos_token, but it is not set yet.
Using bos_token, but it is not set yet.


EOS: None | BOS: None | UNK: <unk>
hm. priests stole our thirteen - year -


In [4]:
# Sample a single document with max_document_length=100 from the "gpt2" checkpoint.
create_corpus(
    corpus_size=1,
    model_name="gpt2",
    max_document_length=100,
    tokenizer_model=GPT2Tokenizer,
    lm_model=GPT2LMHeadModel,
    device=device,
)

EOS: <|endoftext|> | BOS: <|endoftext|> | UNK: <|endoftext|>
Bapa Pinata and Momages Paranormal Journal (February 2018)

no more scaling up construction and red flag (November 2018)

sanctioned readmission of redacted website of the National Policy Group Working Group moderated last month

an online violence website quietly downgraded to a disappeared link to the new shoddy work of the NGP and its 4th re.4 project (;)


In [5]:
# Sample a single document with max_document_length=100 from the
# "EleutherAI/gpt-neo-2.7B" checkpoint (loaded with the GPT-2 tokenizer class).
create_corpus(
    corpus_size=1,
    model_name="EleutherAI/gpt-neo-2.7B",
    max_document_length=100,
    tokenizer_model=GPT2Tokenizer,
    lm_model=GPTNeoForCausalLM,
    device=device,
)

EOS: <|endoftext|> | BOS: <|endoftext|> | UNK: <|endoftext|>
Producer Lexus" less

Cable Network Specials

After several months spent in the company of game shows like "The New Price Is Right" and fellow "This Old House" star Barbara Mandrell, "Desperate Housewives" Emmy-Award winner Felicity Huffman could be heard throughout "Carpool Karaoke" orchestrating a high-stake showdown for her home life. Top "Modern Family" actor Jesse Tyler Ferguson blended in one lucky week and


In [11]:
# Sample a single document with max_document_length=100 from the "transfo-xl-wt103" checkpoint.
create_corpus(
    corpus_size=1,
    model_name="transfo-xl-wt103",
    max_document_length=100,
    tokenizer_model=TransfoXLTokenizer,
    lm_model=TransfoXLLMHeadModel,
    device=device,
)

Using bos_token, but it is not set yet.


EOS: <eos> | BOS: None | UNK: <unk>
for reasons which include continued enthusiasm for the medium; frustration and frustration in Bernard discard all knowledge that the medium is capable of producing 32 million streams or fizzing advertisements for expensive medium.
