In [204]:
import numpy as np
import torch
import time
import nltk

from transformers import AutoTokenizer
from pytorch_pretrained_bert import (GPT2LMHeadModel, GPT2Tokenizer, BertForMaskedLM, TransfoXLTokenizer, TransfoXLLMHeadModel, OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)

# from matplotlib import pyplot as plt

In [205]:
class LM():

    def gen_tensor(self, in_text):
        pass

    def check_probabilities(self, in_text):
        '''
        Function that GLTR interacts with to check the probabilities of words

        Params:
        - in_text: str -- The text that you want to check
        - topk: int -- Your desired truncation of the head of the distribution

        Output:
        - payload: dict -- The wrapper for results in this function, described below

        Payload values
        ==============
        real_topk: list of tuples -- (ranking, prob) of each token
        '''

        context = self.gen_tensor(in_text)
        
        # Forward through the model
        # logits, _ = self.model(context) #gpt2
        # logits = self.model(context, masked_lm_labels=None) #bert # run our tensor through model 'self.model' output is list of list of logits for each word. each list of probabilities is the length of tokenized words
        logits = self.model(context)
        print(logits)

        # construct target and pred
        yhat = torch.softmax(logits[0, 1:-1], dim=-1)    #softmax on first column and drop last element (end of text token's logit array)
        
        print("max prob of ind 0: ", yhat[0][1012])
        print("max prob of second to last: ", yhat[0][517])


        y = context[0, 1:-1]  #tensor without end of text token, essentially massive list of tokens representing words
        

        # Sort the predictions for each timestep
        sorted_preds = np.argsort(-yhat.data.cpu().numpy())     #creates an array of the indexes corresponding to the yhat probabilities if they were sorted greatest to least

        print(sorted_preds)

        # [pos, ...]
        real_topk_pos = list(
            [int(np.where(sorted_preds[i] == y[i].item())[0][0])
             for i in range(y.shape[0])])   #create vector of which place the word in text was in the returned predicted probability of every word to follow. if word was the 10th most likely, put 10 in vector
        
        print(real_topk_pos)

        real_topk_probs = yhat[np.arange(0, y.shape[0], 1), y].data.cpu().numpy().tolist()  #make list of probabilities of each of the tokens at each index


        real_topk_probs = list(map(lambda x: round(x, 5), real_topk_probs))     #round off decimals

        # create [(pos, prob), ...] from [pos, ...] and [prob, ...]
        real_topk = list(zip(real_topk_pos, real_topk_probs))       #zip position in probability list and proabilities together
        
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        return real_topk

In [206]:
class LM_BERT(LM):
    def __init__(self):
        '''
        In the subclass, you need to load all necessary components
        for the other functions.
        Typically, this will comprise a tokenizer and a model.
        '''
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        super(LM, self).__init__()
        self.model = BertForMaskedLM.from_pretrained("bert-base-uncased")
        self.enc = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.start_token = "[CLS]"
        self.end_token = "[SEP]"
        self.model.to(self.device)
        self.model.eval()

        print("Loaded BERT model!")

    def gen_tensor(self, in_text):
        # Process input

        # print(self.enc.encode(in_text))

        start_t = torch.full((1, 1),
                             self.enc.encode(self.start_token)[0],
                             device=self.device,
                             dtype=torch.long)  #create a tensor to stick words in and feed to model

        end_t = torch.full((1, 1),
                            self.enc.encode(self.end_token)[0],
                            device=self.device,
                            dtype=torch.long)        #create tensor with end_token

        context = self.enc.encode(in_text)

        context = torch.tensor(context,
                               device=self.device,
                               dtype=torch.long).unsqueeze(0)   #encode then add words into tensor
        return torch.cat([start_t, context, end_t], dim=1)  #join tensors, endoftext at beginning

class LM_GPT2(LM):
    def __init__(self):
        '''
        In the subclass, you need to load all necessary components
        for the other functions.
        Typically, this will comprise a tokenizer and a model.
        '''
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        super(LM, self).__init__()
        self.enc = GPT2Tokenizer.from_pretrained("gpt2")
        self.model = GPT2LMHeadModel.from_pretrained("gpt2")
        self.start_token = '<|endoftext|>'
        self.end_token = self.start_token
        self.model.to(self.device)
        self.model.eval()

        print("Loaded GPT-2 model!")
    
    def gen_tensor(self, in_text):
        # Process input
        start_t = torch.full((1, 1),
                             self.enc.encoder[self.start_token],
                             device=self.device,
                             dtype=torch.long)  #create a tensor to stick words in and feed to model

        # print(self.enc.encoder[self.start_token])

        end_t = torch.full((1, 1),
                            self.enc.encoder[self.end_token],
                            device=self.device,
                            dtype=torch.long)        #create tensor with end_token

        context = self.enc.encode(in_text)

        context = torch.tensor(context,
                               device=self.device,
                               dtype=torch.long).unsqueeze(0)   #encode then add words into tensor
        return torch.cat([start_t, context, end_t], dim=1)  #join tensors, endoftext at beginning

class LM_TRANXL(LM):
    def __init__(self):
        '''
        In the subclass, you need to load all necessary components
        for the other functions.
        Typically, this will comprise a tokenizer and a model.
        '''
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        super(LM, self).__init__()
        self.enc = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
        self.model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
        self.start_token = '<|endoftext|>'
        self.end_token = self.start_token
        self.model.to(self.device)
        self.model.eval()

        print("Loaded GPT-2 model!")
    
    def gen_tensor(self, in_text):
        # Process input
        # start_t = torch.full((1, 1),
        #                      self.enc.encoder[self.start_token],
        #                      device=self.device,
        #                      dtype=torch.long)  #create a tensor to stick words in and feed to model

        # print(self.enc.encoder[self.start_token])

        # end_t = torch.full((1, 1),
        #                     self.enc.encoder[self.end_token],
        #                     device=self.device,
        #                     dtype=torch.long)        #create tensor with end_token

        context = self.enc.encode(in_text)

        context = torch.tensor(context,
                               device=self.device,
                               dtype=torch.long).unsqueeze(0)   #encode then add words into tensor
        # return torch.cat([start_t, context, end_t], dim=1)  #join tensors, endoftext at beginning
        return context

class LM_GPT(LM):
    def __init__(self):
        '''
        In the subclass, you need to load all necessary components
        for the other functions.
        Typically, this will comprise a tokenizer and a model.
        '''
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        super(LM, self).__init__()
        self.enc = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        self.model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
        self.start_token = '<|endoftext|>'
        self.end_token = self.start_token
        self.model.to(self.device)
        self.model.eval()

        print("Loaded GPT-2 model!")
    
    def gen_tensor(self, in_text):
        # Process input
        # start_t = torch.full((1, 1),
        #                      self.enc.encoder[self.start_token],
        #                      device=self.device,
        #                      dtype=torch.long)  #create a tensor to stick words in and feed to model

        # print(self.enc.encoder[self.start_token])

        # end_t = torch.full((1, 1),
        #                     self.enc.encoder[self.end_token],
        #                     device=self.device,
        #                     dtype=torch.long)        #create tensor with end_token

        context = self.enc.encode(in_text)

        context = torch.tensor(context,
                               device=self.device,
                               dtype=torch.long).unsqueeze(0)   #encode then add words into tensor
        # return torch.cat([start_t, context, end_t], dim=1)  #join tensors, endoftext at beginning
        return context

In [207]:
def main_code(raw_text):
    # lm = LM_GPT2()
    # lm = LM_BERT()
    # lm = LM_TRANXL()
    lm = LM_GPT()
    start = time.time()
    real_topk = lm.check_probabilities(raw_text)
    end = time.time()
    ranks = [i[0] for i in real_topk]
    preds = [i[1] for i in real_topk]
    print(real_topk)
    print("{:.2f} Seconds for a check with GPT-2".format(end - start))

GPT2 Results:
[(11296, 0.0), (126, 0.00018), (327, 0.00036), (15, 0.003), (151, 6e-05), (737, 7e-05), (708, 0.00028), (57, 0.00159), (577, 8e-05), (1048, 5e-05), (631, 8e-05), (192, 0.00012), (257, 0.00055), (27, 0.00159), (316, 0.00012), (568, 0.00014), (63, 0.0008), (1402, 2e-05), (2677, 0.0), (336, 0.00026), (1200, 1e-05), (284, 9e-05), (937, 2e-05), (150, 0.00017), (355, 9e-05), (332, 6e-05), (965, 6e-05), (153, 1e-05), (277, 7e-05), (249, 0.00039), (99, 0.00045), (511, 6e-05), (239, 0.0004), (4856, 0.0), (642, 0.0), (901, 0.0), (211, 0.0), (185, 0.00021), (298, 0.00048), (530, 0.00024), (1837, 2e-05), (819, 0.00015), (1385, 5e-05), (70, 0.00199), (55, 0.00044), (1122, 2e-05), (12806, 0.0), (50, 0.00227), (226, 0.00026), (150, 3e-05), (326, 0.00018), (2916, 1e-05), (1407, 4e-05), (91, 8e-05), (598, 0.00017), (875, 2e-05), (1647, 1e-05)]



GPT Results: [(409, 2e-05), (5099, 2e-05), (914, 0.0), (1294, 1e-05), (6321, 1e-05), (19, 0.00335), (1635, 2e-05), (1509, 2e-05), (1175, 4e-05), (213, 6e-05), (2725, 3e-05), (5, 0.0066), (1055, 8e-05), (1067, 0.00011), (119, 0.00024), (129, 0.00035), (62, 0.00033), (3652, 3e-05), (786, 2e-05), (1214, 6e-05), (2352, 1e-05), (140, 0.00015), (2089, 2e-05), (93, 0.00055), (5465, 0.0), (56, 9e-05), (270, 9e-05), (1510, 7e-05), (127, 0.00054), (957, 4e-05), (1924, 2e-05), (443, 0.00023), (267, 1e-05), (14, 1e-05), (435, 0.00011), (1818, 5e-05), (124, 0.00111), (492, 0.0001), (192, 0.00065), (1354, 3e-05), (17, 0.00178), (251, 3e-05), (177, 0.00025), (285, 6e-05), (16, 0.0061), (636, 0.00011), (176, 0.00015), (1128, 5e-05), (4150, 1e-05), (987, 0.00011), (720, 0.0), (549, 0.00011), (944, 1e-05)]

BERT softmax'd logits:

tensor([[1.0506e-07, 1.1879e-07, 1.1111e-07,  ..., 1.7469e-07, 2.5401e-07,
         1.5148e-06],
        [1.0723e-15, 3.0842e-15, 9.0170e-16,  ..., 1.6667e-14, 4.4826e-14,
         1.9831e-13],
        [1.5042e-10, 1.5468e-10, 8.2493e-11,  ..., 5.6238e-11, 3.6081e-10,
         3.0738e-10],
        ...,
        [1.3800e-08, 1.1116e-08, 1.0551e-08,  ..., 1.2808e-08, 3.0984e-08,
         3.9100e-10],
        [1.5904e-17, 3.0755e-17, 1.3421e-17,  ..., 8.0877e-17, 8.6377e-17,
         2.2796e-14],
        [7.4714e-12, 8.7586e-12, 6.0399e-12,  ..., 1.3769e-11, 2.8122e-10,
         3.9264e-09]], grad_fn=<SoftmaxBackward0>)

GPT-2 softmax'd logits:

tensor([[1.6370e-05, 1.4382e-05, 1.4753e-07,  ..., 6.0292e-08, 2.8607e-08,
         1.4753e-05],
        [8.6838e-05, 1.5199e-05, 7.4894e-07,  ..., 8.9443e-08, 5.7484e-09,
         2.5310e-05],
        [1.4357e-06, 8.5909e-07, 8.2984e-08,  ..., 2.4151e-07, 1.5402e-08,
         6.7930e-06],
        ...,
        [1.0720e-05, 5.9266e-06, 4.9893e-07,  ..., 1.6310e-07, 2.0822e-08,
         1.9812e-05],
        [3.6592e-04, 2.0784e-05, 6.0956e-06,  ..., 2.8720e-08, 5.3717e-08,
         1.5519e-04],
...
        [4.9105e-06, 1.5616e-06, 1.6863e-04,  ..., 5.8378e-09, 4.4392e-09,
         1.3820e-02]], grad_fn=<SoftmaxBackward0>)

In [208]:
txt = "To check the smoothness of a text, I will plot the rank of every word. If the ranks of words in a text are higher, the text will be unsmooth according to the GPT-2 Language Model. Following code is used to create these plots for texts."
# print(len(txt.split()))
main_code(txt)

100%|██████████| 815973/815973 [00:00<00:00, 2975081.73B/s]
100%|██████████| 458495/458495 [00:00<00:00, 2086021.12B/s]
ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.
100%|██████████| 478750579/478750579 [00:37<00:00, 12698161.25B/s]
100%|██████████| 656/656 [00:00<00:00, 151964.18B/s]


Loaded GPT-2 model!
tensor([[[ -8.4633,  -6.6491, -15.5382,  ...,  -8.9018, -10.4672,  -2.5969],
         [ -9.9728,  -8.4973, -19.6219,  ...,  -9.9638, -11.8911,  -1.9545],
         [ -7.1261,  -3.4169, -12.6065,  ..., -11.2287,  -5.5814,  -1.1759],
         ...,
         [ -5.9414,  -3.3718, -15.9036,  ..., -10.6002,  -7.5471,  -2.3585],
         [ -6.7730,  -5.1471, -20.4338,  ..., -13.8276, -15.5916,  -2.4700],
         [ -6.1527,  -4.7163, -17.3600,  ...,  -9.7287, -11.0065,   4.5684]]],
       grad_fn=<UnsafeViewBackward0>)
max prob of ind 0:  tensor(1.6933e-08, grad_fn=<SelectBackward0>)
max prob of second to last:  tensor(2.3744e-06, grad_fn=<SelectBackward0>)
[[  504   481   551 ... 32953  4471  8225]
 [  720  4786  1353 ... 21529  5818  4118]
 [  498   488   239 ... 27888 16728 16481]
 ...
 [  239   240   488 ... 34610 22079 25910]
 [  481   246  1183 ... 37510 21210 16024]
 [  239   240   485 ... 12319 16481 22079]]
[409, 5099, 914, 1294, 6321, 19, 1635, 1509, 1175, 213, 272