# Analysing pre-trained language models with topic models

In [2]:
# Library imports
import numpy as np
import io
import os.path
import logging
import re
import tarfile
import smart_open
from pathlib import Path
from tqdm import tqdm
import gc
import torch
from transformers import set_seed
from transformers import AutoModelForCausalLM, GPT2LMHeadModel, GPTNeoForCausalLM, XLMWithLMHeadModel
from transformers import AutoTokenizer, GPT2Tokenizer, XLMTokenizer
from transformers import TFOpenAIGPTLMHeadModel
from gensim import corpora

# Seed for reproducability
set_seed(42)

# Tensorflow or Pytorch
framework = "pt"     # "tf" but not configured for that

# Use GPU or CPU
use_gpu = False
torch.set_num_threads(torch.get_num_threads()*2-1)
device = "cuda" if torch.cuda.is_available() and use_gpu else "cpu"

# params
corpus_size = 1e6   # depending on the number of topics created with lda

In [5]:
def removing_wikitext_titles(file_name):
    r"""
    Removing Titles and replacing them with one empty line between each text

    :param file_name: location of the file with titles
    """
    str0 = Path(file_name).read_text(encoding='utf-8')
    heading_pattern = '( \n [=\s].*[=\s] \n)'
    str1 = re.sub(heading_pattern, '\n\n', str0)
    str2 = re.sub('\n\n+[\n]', '\n\n', str1)
    new_file_name = file_name+'_no_titles.txt'
    if not os.path.isfile(new_file_name):
        with open(file=new_file_name, mode='x', encoding='utf-8') as file:
            file.write(str2)
    else:
        print("ERROR: File already exists. Must be deleted manually.")

train = './data/data_wikitext-103-raw/wiki.train.raw'
test = './data/data_wikitext-103-raw/wiki.test.raw'
valid = './data/data_wikitext-103-raw/wiki.valid.raw'
removing_wikitext_titles(train)
removing_wikitext_titles(test)
removing_wikitext_titles(valid)

In [3]:
def create_corpus(
        tokenizer_name="gpt2",
        model_name="gpt2",
        max_document_length=None,
        device="cpu",
        corpus_size=1,
        tokenizer_model=AutoTokenizer,
        lm_model=AutoModelForCausalLM,
    ):
    r"""
    Generates sequences/documents/a corpus for models with a language modeling head.

    Parameters:
        corpus_size (`int`, *optional*, defaults to 1):
            The corpus size to be generated (number of documents)
        model_name (`str`, *optional*, defaults to "openai-gpt"):
            The model name of the pre-trained model: openai-gpt, gpt2-small, gpt2, gpt2-large, gpt2-xl, transfo-xl-wt103, EleutherAI/gpt-neo-2.7B, ctrl
        max_document_length (`int`, *optional*, defaults to None):
            The maximum document length, normally set to tokenizer.max_length
        tokenizer_model (`PreTrainedTokenizer`, *optional*, defaults to AutoTokenizer):
            The pre-trained tokenizer class
        lm_model (`PreTrainedModel`, *optional*, defaults to AutoModelForCausalLM):
            The pre-trained model class with language modeling head
        device (`str`, *optional*, defaults to "cpu"):
            The device the computations commence "cpu" or "cuda"
    """

    tokenizer = tokenizer_model.from_pretrained(tokenizer_name)
    model = lm_model.from_pretrained(model_name)

    max_document_length = max_document_length if max_document_length is not None else tokenizer.model_max_length

    # print(f"EOS: {tokenizer.eos_token} | BOS: {tokenizer.bos_token} | UNK: {tokenizer.unk_token}")

    model = model.to(device)
    encoded_output = model.generate(
        # all parameters have to be set as otherwise the config of the pretrained model will be taken
        input_ids=None,
        max_length=max_document_length,
        do_sample=True,                         # False implies Greedy search
        early_stopping=False,
        num_beams=1,                            # 1 deactivates beam_search
        temperature=1.0,                        # 1.0 deactivates temperature
        top_k=0,                                # 0 deactivates top_k sampling
        top_p=1.0,                              # 1.0 deactivates top_p sampling
        repetition_penalty=1.0,                 # 1.0 deactivates repetition_penalty
        #pad_token_id=eos_token_id,              # For open-end generation set to eos_token_id
        #bos_token_id=bos_token_id,
        #eos_token_id=eos_token_id,
        length_penalty=1.0,                     # 1.0 deactivates length_penalty
        no_repeat_ngram_size=0,                 # 0 deactivates no_repeat_ngram_size
        encoder_no_repeat_ngram_size=0,         # 0 deactivates encoder_no_repeat_ngram_size
        num_return_sequences=corpus_size,       # The number of independently computed returned sequences for each element in the batch. No input means batch size of one.
        num_beam_groups=1,
        output_scores=False,                    # Will be important if you want the prediction scores!
    )
    decoded_output = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
    print(decoded_output)
    gc.collect()
    #TextCorpus.save_corpus("./data/corpus-"+model_name)

### Generating documents from selected pre-trained language models

In [5]:
create_corpus(
    tokenizer_name="gpt2",
    model_name="./data/model_gpt2-baseline",
    max_document_length=100,
    device=device,
    corpus_size=1,
    tokenizer_model=GPT2Tokenizer,
    lm_model=GPT2LMHeadModel
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


EOS: <|endoftext|> | BOS: <|endoftext|> | UNK: <|endoftext|>
 of production and could have provided adequate clues to what would work in the wake of the Boston British Revolution, particularly those of the British colonies from the Holy Duustigo and of the 18th century. 
 Victoria's territorial affairs were strongly dependent on her isolation under the influence of the wealthier counties and Beatty's Lord Benjamin threaten the dominance of those European states on Britain and other land. Throughout her reign, 248 foreign supplies were handed down. Haiti was one of the principal carrier of


In [None]:
create_corpus(
    tokenizer_name="gpt2",
    model_name="gpt2",
    max_document_length=100,
    device=device,
    corpus_size=1,
    tokenizer_model=GPT2Tokenizer,
    lm_model=GPT2LMHeadModel
)

In [None]:
create_corpus(
    tokenizer_name="EleutherAI/gpt-neo-2.7B",
    model_name="EleutherAI/gpt-neo-2.7B",
    max_document_length=100,
    device=device,
    corpus_size=1,
    tokenizer_model=GPT2Tokenizer,
    lm_model=GPTNeoForCausalLM
)

### Latent Dirichlet Allocation

In [33]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [24]:
heading_pattern = '( \n \n [=\s]+.*[=\s]+ \n \n )'
train_data = Path('./data/data_wikitext-103-raw/wiki.train.raw').read_text(encoding='utf-8')
train_split = re.split(heading_pattern, train_data)
train_headings = [x[7:-7] for x in train_split[1::2]]
train_articles = [x for x in train_split[2::2]]

In [27]:
train_articles[2]

'Concept work for Valkyria Chronicles III began after development finished on Valkyria Chronicles II in early 2010 , with full development beginning shortly after this . The director of Valkyria Chronicles II , Takeshi Ozawa , returned to that role for Valkyria Chronicles III . Development work took approximately one year . After the release of Valkyria Chronicles II , the staff took a look at both the popular response for the game and what they wanted to do next for the series . Like its predecessor , Valkyria Chronicles III was developed for PlayStation Portable : this was due to the team wanting to refine the mechanics created for Valkyria Chronicles II , and they had not come up with the " revolutionary " idea that would warrant a new entry for the PlayStation 3 . Speaking in an interview , it was stated that the development team considered Valkyria Chronicles III to be the series \' first true sequel : while Valkyria Chronicles II had required a large amount of trial and error dur

In [20]:
# Tokenize the documents.
import nltk
from nltk.tokenize import RegexpTokenizer
nltk.download('wordnet')
nltk.download('omw-1.4')

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\franz\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\franz\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


In [21]:
# Lemmatize the documents.
from nltk.stem.wordnet import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [22]:
# Compute bigrams.
from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

2022-01-06 17:33:10,013 : INFO : collecting all words and their counts
2022-01-06 17:33:10,014 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2022-01-06 17:33:13,238 : INFO : collected 1120198 token types (unigram + bigrams) from a corpus of 4629808 words and 1740 sentences
2022-01-06 17:33:13,238 : INFO : merged Phrases<1120198 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000>
2022-01-06 17:33:13,239 : INFO : Phrases lifecycle event {'msg': 'built Phrases<1120198 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> in 3.23s', 'datetime': '2022-01-06T17:33:13.239787', 'gensim': '4.1.2', 'python': '3.9.9 (tags/v3.9.9:ccb0e6a, Nov 15 2021, 18:08:50) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}


In [23]:
# Remove rare and common tokens.
from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

2022-01-06 17:33:21,506 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2022-01-06 17:33:22,852 : INFO : built Dictionary(79429 unique tokens: ['1ooooo', '1st', '25oo', '2o00', '4ooo']...) from 1740 documents (total 4953968 corpus positions)
2022-01-06 17:33:22,852 : INFO : Dictionary lifecycle event {'msg': "built Dictionary(79429 unique tokens: ['1ooooo', '1st', '25oo', '2o00', '4ooo']...) from 1740 documents (total 4953968 corpus positions)", 'datetime': '2022-01-06T17:33:22.852095', 'gensim': '4.1.2', 'python': '3.9.9 (tags/v3.9.9:ccb0e6a, Nov 15 2021, 18:08:50) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}
2022-01-06 17:33:22,936 : INFO : discarding 70785 tokens: [('1ooooo', 1), ('25oo', 2), ('2o00', 6), ('4ooo', 2), ('64k', 6), ('a', 1740), ('aaditional', 1), ('above', 1114), ('abstract', 1740), ('acase', 1)]...
2022-01-06 17:33:22,937 : INFO : keeping 8644 tokens which were in no less than 20 and no more than 870 (=50.0

In [24]:
# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [25]:
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Number of unique tokens: 8644
Number of documents: 1740


In [26]:
# Train LDA model.
from gensim.models import LdaModel

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

2022-01-06 17:36:02,118 : INFO : using autotuned alpha, starting with [0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]
2022-01-06 17:36:02,119 : INFO : using serial LDA version on this node
2022-01-06 17:36:02,125 : INFO : running online (multi-pass) LDA training, 10 topics, 20 passes over the supplied corpus of 1740 documents, updating model once every 1740 documents, evaluating perplexity every 0 documents, iterating 400x with a convergence threshold of 0.001000
2022-01-06 17:36:02,125 : INFO : PROGRESS: pass 0, at document #1740/1740
2022-01-06 17:36:09,204 : INFO : optimized alpha [0.09021268, 0.06739636, 0.06263462, 0.06416626, 0.09577194, 0.08619204, 0.0767858, 0.08124751, 0.08339516, 0.059971448]
2022-01-06 17:36:09,209 : INFO : topic #9 (0.060): 0.005*"neuron" + 0.004*"node" + 0.004*"connection" + 0.004*"tree" + 0.003*"layer" + 0.003*"recognition" + 0.003*"bit" + 0.003*"cell" + 0.003*"hidden" + 0.003*"sequence"
2022-01-06 17:36:09,210 : INFO : topic #2 (0.063): 0.006*"control

In [27]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

2022-01-06 17:38:15,512 : INFO : CorpusAccumulator accumulated stats from 1000 documents


Average topic coherence: -1.0664.
[([(0.025379736, 'neuron'),
   (0.01119123, 'spike'),
   (0.010484127, 'cell'),
   (0.008240662, 'signal'),
   (0.0078855185, 'firing'),
   (0.0076666763, 'response'),
   (0.007181551, 'frequency'),
   (0.0065302374, 'stimulus'),
   (0.00642043, 'synaptic'),
   (0.0059812544, 'activity'),
   (0.0056552677, 'noise'),
   (0.0053080185, 'potential'),
   (0.0050892965, 'channel'),
   (0.004909962, 'phase'),
   (0.0043921797, 'delay'),
   (0.004097093, 'fig'),
   (0.003889047, 'temporal'),
   (0.0035831556, 'dynamic'),
   (0.0035198943, 'membrane'),
   (0.00322634, 'connection')],
  -0.9443947607900195),
 ([(0.0044680373, 'estimate'),
   (0.0042189835, 'rbf'),
   (0.0040761526, 'gradient'),
   (0.004031566, 'regression'),
   (0.003907613, 'nonlinear'),
   (0.0038104798, 'optimal'),
   (0.0037612363, 'basis'),
   (0.003714169, 'class'),
   (0.003556753, 'prediction'),
   (0.003537749, 'noise'),
   (0.0034812698, 'distance'),
   (0.0034325954, 'optimization')

In [39]:
# Document topic
list(model.get_document_topics(corpus))

[[(7, 0.027538907), (8, 0.7885712), (9, 0.18370885)],
 [(2, 0.08970444),
  (3, 0.06266751),
  (4, 0.08423867),
  (6, 0.7185407),
  (9, 0.044549678)],
 [(2, 0.1114885),
  (3, 0.45256642),
  (4, 0.09520804),
  (7, 0.06045989),
  (8, 0.17162195),
  (9, 0.10854828)],
 [(2, 0.21544413),
  (3, 0.14386836),
  (4, 0.051256046),
  (6, 0.41545925),
  (8, 0.17374407)],
 [(1, 0.020357646),
  (3, 0.1536337),
  (4, 0.3098781),
  (6, 0.12859046),
  (7, 0.057898894),
  (8, 0.3295172)],
 [(0, 0.027262814),
  (3, 0.1752114),
  (4, 0.021839688),
  (6, 0.67404944),
  (8, 0.054517623),
  (9, 0.04702115)],
 [(1, 0.024120308),
  (4, 0.013194161),
  (5, 0.07742534),
  (6, 0.13242401),
  (8, 0.67537737),
  (9, 0.07726645)],
 [(0, 0.9902481)],
 [(2, 0.09325667), (6, 0.1146867), (8, 0.7684829), (9, 0.023299834)],
 [(4, 0.99973786)],
 [(1, 0.14191191), (4, 0.60745645), (5, 0.045528833), (6, 0.20491715)],
 [(1, 0.10479327),
  (3, 0.3743599),
  (4, 0.40589106),
  (6, 0.04598542),
  (7, 0.06884143)],
 [(0, 0.2263369

In [37]:
# Term topic matrix
model.get_topics()

array([[3.7814287e-05, 5.3377448e-07, 1.9588144e-05, ..., 9.0079163e-07,
        1.2143894e-06, 9.7482152e-06],
       [2.6704827e-05, 5.1839191e-07, 4.4586886e-07, ..., 4.4865857e-07,
        6.9472344e-07, 4.5054776e-07],
       [8.6939772e-06, 6.6212277e-05, 2.9174205e-05, ..., 5.6456463e-07,
        7.5195197e-07, 5.7881141e-07],
       ...,
       [2.6498903e-06, 7.5508433e-06, 5.1832359e-07, ..., 5.1332523e-07,
        5.4682869e-07, 1.1029123e-05],
       [1.8741454e-04, 1.3542121e-05, 1.3956630e-05, ..., 4.1770716e-07,
        4.2835194e-07, 4.3115236e-07],
       [8.3786483e-07, 7.3207235e-07, 1.8793445e-06, ..., 8.0135692e-07,
        1.1450361e-06, 2.7130816e-05]], dtype=float32)

### Visualize one LDA

In [41]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

vis_data = gensimvis.prepare(model, corpus, dictionary)
pyLDAvis.display(vis_data)

  default_term_info = default_term_info.sort_values(


### Compare two LDA's