# Analysing pre-trained language models with topic models

In [2]:
# Library imports
import numpy as np
import os.path
import logging
from gensim.test.utils import datapath
import re
import json
from gensim.models.ldamulticore import LdaMulticore
from nltk.tokenize import RegexpTokenizer
from pprint import pprint
from gensim.corpora import Dictionary
from pathlib import Path
from tqdm.auto import tqdm
import gc
import torch
from datasets import load_dataset
from transformers import set_seed
from transformers import AutoModelForCausalLM, GPT2LMHeadModel, GPTNeoForCausalLM
from transformers import AutoTokenizer, GPT2Tokenizer

from gensim.models import Phrases
from nltk.stem.wordnet import WordNetLemmatizer

# Seed for reproducability
set_seed(42)

# Tensorflow or Pytorch
framework = "pt"     # "tf" but not configured for that

# Use GPU or CPU
use_gpu = True
torch.set_num_threads(torch.get_num_threads()*2-2)
device = "cuda" if torch.cuda.is_available() and use_gpu else "cpu"

# params
corpus_size = 1e6   # depending on the number of topics created with lda

gc.collect()
torch.cuda.empty_cache()

### Removing Wikitext 103 Titles

In [2]:
def removing_wikitext_titles(file_name):
    r"""
    Removing Titles and replacing them with one empty line between each text

    :param file_name: location of the file with titles
    """
    new_file_name = file_name+'_no_titles.txt'
    if not os.path.isfile(new_file_name):
        str0 = Path(file_name).read_text(encoding='utf-8')
        heading_pattern = '( \n [=\s].*[=\s] \n)'
        str1 = re.sub(heading_pattern, '\n\n', str0)
        str2 = re.sub('\n\n+[\n]', '\n\n', str1)
        with open(file=new_file_name, mode='x', encoding='utf-8') as file:
            file.write(str2)
    else:
        print("ERROR: File already exists. Must be deleted manually.")

train = './data/data_wikitext-103-raw/wiki.train.raw'
test = './data/data_wikitext-103-raw/wiki.test.raw'
valid = './data/data_wikitext-103-raw/wiki.valid.raw'
removing_wikitext_titles(train)
removing_wikitext_titles(test)
removing_wikitext_titles(valid)

ERROR: File already exists. Must be deleted manually.
ERROR: File already exists. Must be deleted manually.
ERROR: File already exists. Must be deleted manually.


### Training your own language model

`python run_clm.py --model_type gpt2 --tokenizer_name gpt2 --dataset_name wikitext --dataset_config_name wikitext-103-raw-v1 --output_dir data/test --do_eval --do_train --block_size 1024 --overwrite_output_dir`

or

`python run_clm.py --model_type gpt2 --tokenizer_name gpt2 --output_dir data/test --do_eval --do_train --block_size 1024 --overwrite_output_dir --train_file .\data\data_wikitext-103-raw\wiki.train.raw.txt --validation_file .\data\data_wikitext-103-raw\wiki.valid.raw.txt`

**For validation/perplexity score only:**
`python run_clm.py --model_type gpt2 --tokenizer_name gpt2 --output_dir data/test --do_eval --block_size 1024 --overwrite_output_dir --validation_file .\data\data_wikitext-103-raw\wiki.valid.raw.txt`

### Generating documents from selected pre-trained language models

In [3]:
def create_corpus(
        tokenizer_name="gpt2",
        model_name="gpt2",
        max_document_length=None,
        device="cpu",
        corpus_size=1,
        tokenizer_model=AutoTokenizer,
        lm_model=AutoModelForCausalLM,
        pad_token_id=None,
        save_path="data/test",
        load_size=1
    ):
    r"""
    Generates sequences/documents/a corpus for models with a language modeling head.

    Parameters:
        corpus_size (`int`, *optional*, defaults to 1):
            The corpus size to be generated (number of documents)
        model_name (`str`, *optional*, defaults to "openai-gpt"):
            The model name of the pre-trained model: openai-gpt, gpt2-small, gpt2, gpt2-large, gpt2-xl, transfo-xl-wt103, EleutherAI/gpt-neo-2.7B, ctrl
        max_document_length (`int`, *optional*, defaults to None):
            The maximum document length, normally set to tokenizer.max_length
        tokenizer_model (`PreTrainedTokenizer`, *optional*, defaults to AutoTokenizer):
            The pre-trained tokenizer class
        lm_model (`PreTrainedModel`, *optional*, defaults to AutoModelForCausalLM):
            The pre-trained model class with language modeling head
        device (`str`, *optional*, defaults to "cpu"):
            The device the computations commence "cpu" or "cuda"
    """

    if os.path.isfile(save_path):
        print("ERROR: file already exist, please remove manually before running again.")
        return

    tokenizer = tokenizer_model.from_pretrained(tokenizer_name)
    model = lm_model.from_pretrained(model_name)

    max_document_length = max_document_length if max_document_length is not None else tokenizer.model_max_length
    if pad_token_id is not None:
        if pad_token_id == 'eos_token_id':
            pad_token_id = tokenizer.eos_token_id
        else:
            print("ERROR: Undefinded/unimplemented pad_token_id")

    # print(f"EOS: {tokenizer.eos_token} | BOS: {tokenizer.bos_token} | UNK: {tokenizer.unk_token}")

    model = model.to(device)

    decoded_output = []

    for i in tqdm(range(0, corpus_size, load_size)):
        step_size = min(load_size, corpus_size-i)
        encoded_output = model.generate(
            # all parameters have to be set as otherwise the config of the pretrained model will be taken
            input_ids=None,
            max_length=max_document_length,
            do_sample=True,                         # False implies Greedy search
            early_stopping=False,
            num_beams=1,                            # 1 deactivates beam_search
            temperature=1.0,                        # 1.0 deactivates temperature
            top_k=0,                                # 0 deactivates top_k sampling
            top_p=1.0,                              # 1.0 deactivates top_p sampling
            repetition_penalty=1.0,                 # 1.0 deactivates repetition_penalty
            pad_token_id=pad_token_id,              # For open-end generation set to eos_token_id
            #bos_token_id=bos_token_id,
            #eos_token_id=eos_token_id,
            length_penalty=1.0,                     # 1.0 deactivates length_penalty
            no_repeat_ngram_size=0,                 # 0 deactivates no_repeat_ngram_size
            encoder_no_repeat_ngram_size=0,         # 0 deactivates encoder_no_repeat_ngram_size
            num_return_sequences=step_size,       # The number of independently computed returned sequences for each element in the batch. No input means batch size of one.
            num_beam_groups=1,
            output_scores=False,                    # Will be important if you want the prediction scores!
        )

        for j in range(load_size):
            decoded_output.append(tokenizer.decode(encoded_output[j], skip_special_tokens=True))

    with open(save_path, 'w') as file:
        json.dump(decoded_output, file, indent=2)

    gc.collect()
    torch.cuda.empty_cache()

  if pad_token_id is 'eos_token_id':


In [4]:
create_corpus(
    tokenizer_name="gpt2",
    model_name="./data/model_gpt2-baseline_no_header",
    max_document_length=None,
    device=device,
    corpus_size=1000,
    tokenizer_model=GPT2Tokenizer,
    lm_model=GPT2LMHeadModel,
    pad_token_id='eos_token_id',
    save_path="data/data_gpt2-baseline_no_header.json",
    load_size=10
)

ERROR: file already exist, please remove manually before running again.


In [5]:
create_corpus(
    tokenizer_name="gpt2",
    model_name="./data/model_gpt2-baseline_self",
    max_document_length=None,
    device=device,
    corpus_size=1000,
    tokenizer_model=GPT2Tokenizer,
    lm_model=GPT2LMHeadModel,
    pad_token_id='eos_token_id',
    save_path="data/data_gpt2-baseline_self.json",
    load_size=10
)

  0%|          | 0/100 [00:00<?, ?it/s]

In [6]:
create_corpus(
    tokenizer_name="gpt2",
    model_name="./data/model_gpt2-baseline",
    max_document_length=None,
    device=device,
    corpus_size=1000,
    tokenizer_model=GPT2Tokenizer,
    lm_model=GPT2LMHeadModel,
    pad_token_id='eos_token_id',
    save_path="data/data_gpt2-baseline.json",
    load_size=10
)

  0%|          | 0/100 [00:00<?, ?it/s]

In [7]:
create_corpus(
    tokenizer_name="gpt2",
    model_name="gpt2",
    max_document_length=None,
    device=device,
    corpus_size=1000,
    tokenizer_model=GPT2Tokenizer,
    lm_model=GPT2LMHeadModel,
    pad_token_id='eos_token_id',
    save_path="data/data_gpt2.json",
    load_size=10
)

  0%|          | 0/100 [00:00<?, ?it/s]

In [4]:
create_corpus(
    tokenizer_name="EleutherAI/gpt-neo-2.7B",
    model_name="EleutherAI/gpt-neo-2.7B",
    max_document_length=2048,
    device=device,
    corpus_size=1000,
    tokenizer_model=GPT2Tokenizer,
    lm_model=GPTNeoForCausalLM,
    pad_token_id=None,
    save_path="data/data_gpt2neo.json",
    load_size=5
)

RuntimeError: CUDA out of memory. Tried to allocate 100.00 MiB (GPU 0; 8.00 GiB total capacity; 7.22 GiB already allocated; 0 bytes free; 7.23 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

### Perplexity of the self trained language model

In [4]:
# TODO: understand compare to perplexity calculation in run_clm.py >> check the stride!

def calculate_perplexity(model_id='gpt2', tokenizer_id=None, test=None, stride=1024):
    if tokenizer_id is None:
        tokenizer_id = model_id
    model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_id)
    encodings = tokenizer('\n\n'.join(test), return_tensors='pt')

    max_length = model.config.n_positions

    nlls = []
    for i in tqdm(range(0, encodings.input_ids.size(1), stride)):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, encodings.input_ids.size(1))
        trg_len = end_loc - i    # may be different from stride on last loop
        input_ids = encodings.input_ids[:,begin_loc:end_loc].to(device)
        target_ids = input_ids.clone()
        target_ids[:,:-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            neg_log_likelihood = outputs[0] * trg_len

        nlls.append(neg_log_likelihood)

    print(torch.exp(torch.stack(nlls).sum() / end_loc))
    gc.collect()
    torch.cuda.empty_cache()

In [5]:
run_perplexity = False
if run_perplexity:
    wikitext103test = load_dataset('wikitext', 'wikitext-103-raw-v1', split='test')
    wikitext103test_self = Path('./data/data_wikitext-103-raw/wiki.test.raw.txt').read_text(encoding='utf-8')
    wikitext103valid_self = Path('./data/data_wikitext-103-raw/wiki.valid.raw.txt').read_text(encoding='utf-8')

    calculate_perplexity(model_id='gpt2', test=wikitext103test['text'], stride=512)# 25.1705

    calculate_perplexity(model_id='gpt2', test=wikitext103test_self, stride=512)   # 3.1743
    calculate_perplexity(model_id='gpt2', test=wikitext103valid_self, stride=512)  # 3.1934

    calculate_perplexity(model_id='./data/model_gpt2-baseline', tokenizer_id='gpt2', test=wikitext103test['text'], stride=512) # 87.9691, 160.2229
    calculate_perplexity(model_id='./data/model_gpt2-baseline', tokenizer_id='gpt2', test=wikitext103test_self, stride=512)    # way off
    calculate_perplexity(model_id='./data/model_gpt2-baseline', tokenizer_id='gpt2', test=wikitext103valid_self, stride=512)   # way off

    #python run_clm.py --model_name_or_path gpt2 --output_dir data/test --do_eval --block_size 64 --overwrite_output_dir --validation_file .\data\data_wikitext-103-raw\wiki.test.raw.txt  #

### Load Corpora

In [3]:
heading_pattern = '( \n [=\s].*[=\s] \n)'
train_data = Path('./data/data_wikitext-103-raw/wiki.train.raw').read_text(encoding='utf-8')
train_split = re.split(heading_pattern, train_data)
train_headings = [x[7:-7] for x in train_split[1::2]]
train_articles = [x for x in train_split[2::2]]

In [5]:
len(train_articles)

305108

In [None]:
file0 = 'data/gpt2-baseline'
file1 = 'data/gpt2'
file2 = 'data/gpt2neo'

In [None]:
with open(file0, 'r') as file:
    train_articles = json.load(file)

In [None]:
size = 10000
train_articles = train_articles[:size]

### Latent Dirichlet Allocation

In [6]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [7]:
def tokenize(docs):
    # Tokenize the documents.
    # Split the documents into tokens.
    tokenizer = RegexpTokenizer(r'\w+')
    for idx in range(len(docs)):
        docs[idx] = docs[idx].lower()  # Convert to lowercase.
        docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

    # Remove numbers, but not words that contain numbers.
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

    # Remove words that are only one character.
    docs = [[token for token in doc if len(token) > 1] for doc in docs]

    # Lemmatize the documents.
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

    # Compute bigrams
    # Add bigrams and trigrams to docs (only ones that appear 20 times or more).
    bigram = Phrases(docs, min_count=20)
    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                docs[idx].append(token)

    # Remove rare and common tokens.
    # Create a dictionary representation of the documents.
    dictionary = Dictionary(docs)

    # Filter out words that occur less than 20 documents, or more than 50% of the documents.
    dictionary.filter_extremes(no_below=20, no_above=0.5)

    # Bag-of-words representation of the documents.
    corpus = [dictionary.doc2bow(doc) for doc in docs]
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))

    return dictionary, corpus



In [9]:
def train_lda(dictionary, corpus):
    # Train LDA model.

    # Set training parameters.
    num_topics = 100
    chunksize = 20000
    passes = 20
    iterations = 400
    eval_every = None  # Don't evaluate model perplexity, takes too much time.

    # Make a index to word dictionary.
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    model = LdaMulticore(
        corpus=corpus,
        num_topics=num_topics,
        id2word=id2word,
        workers=4,
        chunksize=chunksize,
        passes=passes,
        alpha='symmetric',
        eta='auto',
        eval_every=eval_every,
        iterations=iterations,
    )

    top_topics = model.top_topics(corpus) #, num_words=20)

    # Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)

    pprint(top_topics)

    return model

In [11]:
dictionary, corpus = tokenize(train_articles)

2022-01-19 22:29:58,110 : INFO : collecting all words and their counts
2022-01-19 22:29:58,111 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2022-01-19 22:29:59,838 : INFO : collected 1007308 token types (unigram + bigrams) from a corpus of 2586127 words and 10000 sentences
2022-01-19 22:29:59,839 : INFO : merged Phrases<1007308 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000>
2022-01-19 22:29:59,840 : INFO : Phrases lifecycle event {'msg': 'built Phrases<1007308 vocab, min_count=20, threshold=10.0, max_vocab_size=40000000> in 1.73s', 'datetime': '2022-01-19T22:29:59.840561', 'gensim': '4.1.2', 'python': '3.9.9 (tags/v3.9.9:ccb0e6a, Nov 15 2021, 18:08:50) [MSC v.1929 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19044-SP0', 'event': 'created'}
2022-01-19 22:30:02,584 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2022-01-19 22:30:03,643 : INFO : built Dictionary(75444 unique tokens: ['a', 'adapted', 'adjustment', 'after', 'against']

Number of unique tokens: 8606
Number of documents: 10000


In [None]:
# Train LDA model.

# Set training parameters.
num_topics = 100
chunksize = 1000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaMulticore(
    corpus=corpus,
    num_topics=num_topics,
    id2word=id2word,
    workers=4,
    chunksize=chunksize,
    passes=passes,
    alpha='symmetric',
    eta='auto',
    eval_every=eval_every,
    iterations=iterations,
)

2022-01-19 22:26:07,816 : INFO : using symmetric alpha at 0.01
2022-01-19 22:26:07,818 : INFO : using serial LDA version on this node
2022-01-19 22:26:07,867 : INFO : running online LDA training, 100 topics, 20 passes over the supplied corpus of 10000 documents, updating every 6000 documents, evaluating every ~0 documents, iterating 400x with a convergence threshold of 0.001000
2022-01-19 22:26:07,868 : INFO : training LDA model using 6 processes


In [None]:
top_topics = model.top_topics(corpus) #, num_words=20)

# Average topic coherence is the sum of topic coherences of all topics, divided by the number of topics.
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print('Average topic coherence: %.4f.' % avg_topic_coherence)

pprint(top_topics)

In [12]:
model = train_lda(dictionary, corpus)

2022-01-19 22:30:05,705 : INFO : using symmetric alpha at 0.01
2022-01-19 22:30:05,707 : INFO : using serial LDA version on this node
2022-01-19 22:30:05,754 : INFO : running online LDA training, 100 topics, 20 passes over the supplied corpus of 10000 documents, updating every 80000 documents, evaluating every ~0 documents, iterating 400x with a convergence threshold of 0.001000
2022-01-19 22:30:05,755 : INFO : training LDA model using 4 processes
2022-01-19 22:30:08,446 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #10000/10000, outstanding queue size 1
2022-01-19 22:37:26,496 : INFO : topic #11 (0.010): 0.010*"song" + 0.009*"gun" + 0.008*"were" + 0.007*"an" + 0.005*"new" + 0.005*"mm" + 0.005*"inch" + 0.004*"be" + 0.004*"which" + 0.004*"two"
2022-01-19 22:37:26,497 : INFO : topic #52 (0.010): 0.018*"he" + 0.015*"game" + 0.010*"his" + 0.008*"were" + 0.007*"had" + 0.006*"an" + 0.005*"but" + 0.005*"not" + 0.005*"which" + 0.005*"first"
2022-01-19 22:37:26,497 : INFO : t

Average topic coherence: -2.8149.
[([(0.024418907, 'his'),
   (0.021815605, 'he'),
   (0.013642517, 'not'),
   (0.009322977, 'this'),
   (0.009298386, 'had'),
   (0.009271459, 'be'),
   (0.0092388345, 'were'),
   (0.0089136055, 'have'),
   (0.008059746, 'they'),
   (0.00804267, 'but'),
   (0.007590425, 'an'),
   (0.00631842, 'would'),
   (0.006020743, 'or'),
   (0.006013721, 'who'),
   (0.0054081506, 'one'),
   (0.0053862743, 'we'),
   (0.005330682, 'their'),
   (0.005019074, 'said'),
   (0.0048992634, 'which'),
   (0.0048097465, 'been')],
  -0.8969028612559897),
 ([(0.03185711, 'he'),
   (0.024843147, 'his'),
   (0.01748927, 'had'),
   (0.012349786, 'not'),
   (0.009952355, 'they'),
   (0.009772469, 'him'),
   (0.009557116, 'would'),
   (0.009189236, 'their'),
   (0.008261574, 'been'),
   (0.007988241, 'be'),
   (0.00791669, 'but'),
   (0.0067887264, 'who'),
   (0.0063364143, 'were'),
   (0.0063237045, 'an'),
   (0.0062571624, 'have'),
   (0.0061000264, 'this'),
   (0.005226616, 'said

In [None]:
# Document topic
list(model.get_document_topics(corpus))

In [None]:
# Term topic matrix
model.get_topics()

### Load/Save LDA Models

In [None]:
path0 = "data/model_lda-model_wikitext"
path1 = "data/model_lda-model_gpt2-baseline"
path2 = "data/model_lda-model_gpt2-baseline_no_header"
path3 = "data/model_lda-model_gpt2-baseline_self"
lda = model

In [None]:
# Save model to disk.
temp_file = datapath(path0)
lda.save(temp_file)

In [None]:
# Load a potentially pretrained model from disk.
lda = LdaMulticore.load(temp_file)

### Visualize one LDA

In [14]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

vis_data = gensimvis.prepare(model, corpus, dictionary)
pyLDAvis.display(vis_data)

  default_term_info = default_term_info.sort_values(


### Compare two LDA's