# Analysing pre-trained language models with topic models

In [1]:
# Library imports
import numpy as np
import torch
from transformers import set_seed
from transformers import OpenAIGPTLMHeadModel, GPT2LMHeadModel, GPTNeoForCausalLM, CTRLLMHeadModel, TransfoXLLMHeadModel
from transformers import OpenAIGPTTokenizer, GPT2Tokenizer, CTRLTokenizer, TransfoXLTokenizer

# Seed for reproducibility
set_seed(42)

# Tensor framework handed to the tokenizer via `return_tensors`.
platform = "pt"     # "tf" is not configured in this notebook

# Use GPU or CPU
use_gpu = False

# Ask PyTorch for twice its default CPU thread count. The default is captured only
# the first time this cell runs: reading the *current* count on every run would
# double it again on each re-execution of the cell (2x, 4x, 8x, ...).
if "_default_num_threads" not in globals():
    _default_num_threads = torch.get_num_threads()
torch.set_num_threads(_default_num_threads * 2)

# Only probe for CUDA when the GPU was actually requested.
device = "cuda" if use_gpu and torch.cuda.is_available() else "cpu"

### Generating documents from selected pre-trained language models

In [3]:
# GPT: sample a single document, prompted only with the tokenizer's BOS token.
model_name = "openai-gpt"
tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name)
model = OpenAIGPTLMHeadModel.from_pretrained(model_name).to(device)

# The prompt is whatever the tokenizer reports as its beginning-of-sequence token.
init_text = tokenizer.bos_token
encoded_input = tokenizer.encode(init_text, return_tensors=platform).to(device)

# Sampled (not greedy) generation, bounded by the tokenizer's reported maximum length.
encoded_output = model.generate(
    encoded_input,
    max_length=tokenizer.model_max_length,
    do_sample=True,
    top_k=0,
)

# Keep the decoded text of the first (only) sequence and report its token count.
decoded_output_gpt = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
print(encoded_output[0].size(0))

Some weights of OpenAIGPTLMHeadModel were not initialized from the model checkpoint at openai-gpt and are newly initialized: ['lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


512


In [4]:
# GPT-2 (XL checkpoint): sample a single document, prompted only with the BOS token.
model_name = "gpt2-xl"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# The EOS id is passed as the padding id when the model is loaded.
model = GPT2LMHeadModel.from_pretrained(
    model_name,
    pad_token_id=tokenizer.eos_token_id,
).to(device)

init_text = tokenizer.bos_token
encoded_input = tokenizer.encode(init_text, return_tensors=platform).to(device)

# Sampled (not greedy) generation, bounded by the tokenizer's reported maximum length.
encoded_output = model.generate(
    encoded_input,
    max_length=tokenizer.model_max_length,
    do_sample=True,
    top_k=0,
)

# Keep the decoded text of the first (only) sequence and report its token count.
decoded_output_gpt2 = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
print(encoded_output[0].size(0))

1024


In [5]:
# GPT-Neo (the notebook's stand-in for GPT-3): sample a single document,
# prompted only with the tokenizer's BOS token.
model_name = "EleutherAI/gpt-neo-2.7B"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPTNeoForCausalLM.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)
init_text = tokenizer.bos_token

encoded_input = tokenizer.encode(init_text, return_tensors=platform).to(device)
model = model.to(device)

# Take the length limit from the loaded checkpoint's configuration rather than
# hard-coding it (this was previously the literal 2048), so the bound follows the
# model if `model_name` is changed to a different GPT-Neo checkpoint.
encoded_output = model.generate(
    encoded_input,
    max_length=model.config.max_position_embeddings,
    do_sample=True,
    top_k=0,
)

# Keep the decoded text of the first (only) sequence and report its token count.
decoded_output_gpt3 = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
print(len(encoded_output[0]))

Downloading:   0%|          | 0.00/779k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/200 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

727


In [None]:
# CTRL: sample a single document.
# This cell used to fail when prompted with `tokenizer.bos_token` (presumably because
# the CTRL tokenizer defines no BOS token, so the prompt was None - TODO confirm).
# CTRL is meant to be prompted with a control code, so one is used as the fallback.
model_name = "ctrl"
tokenizer = CTRLTokenizer.from_pretrained(model_name)
model = CTRLLMHeadModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)

# Control code used as the start prompt when no BOS token exists. The choice steers
# the style/domain of the sample, so change it if another domain is wanted.
ctrl_control_code = "Wikipedia"
init_text = tokenizer.bos_token
if init_text is None:
    init_text = ctrl_control_code

encoded_input = tokenizer.encode(init_text, return_tensors=platform).to(device)
model = model.to(device)

encoded_output = model.generate(encoded_input, max_length=tokenizer.model_max_length, do_sample=True, top_k=0)

# NOTE: when the fallback is used, the decoded text begins with the control code itself.
decoded_output_ctrl = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
print(len(encoded_output[0]))

In [None]:
# Transfo-XL: sample a single document.
# This cell used to fail when prompted with `tokenizer.bos_token` (presumably because
# this tokenizer defines no BOS token, so the prompt was None - TODO confirm).
# The EOS token is used as the start symbol when no BOS token is available.
model_name = "transfo-xl-wt103"
tokenizer = TransfoXLTokenizer.from_pretrained(model_name)
model = TransfoXLLMHeadModel.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id)

init_text = tokenizer.bos_token
if init_text is None:
    init_text = tokenizer.eos_token
if init_text is None:
    # Fail with a clear message instead of an opaque tokenizer error.
    raise ValueError(f"Tokenizer for {model_name!r} defines neither a BOS nor an EOS token to start generation from.")

encoded_input = tokenizer.encode(init_text, return_tensors=platform).to(device)
model = model.to(device)

# `model_max_length` can be a very large placeholder when the checkpoint declares no
# limit; cap it so sampling is guaranteed to stop. 1024 matches the length of the
# GPT-2 sample in this notebook - adjust if a different document length is wanted.
max_sample_length = min(tokenizer.model_max_length, 1024)

encoded_output = model.generate(encoded_input, max_length=max_sample_length, do_sample=True, top_k=0)
decoded_output_txl = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
print(len(encoded_output[0]))