In [1]:
from functools import *
from itertools import *

import pandas as pd
import torch
import torch.nn.functional as F
from transformers import *

chaini = chain.from_iterable

In [2]:
device = torch.device('cuda:0')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')
tokenizer.pad_token = tokenizer.eos_token
lm = GPT2LMHeadModel.from_pretrained('gpt2-large').eval().to(device)

loading file https://huggingface.co/gpt2-large/resolve/main/vocab.json from cache at /home/yugai/.cache/huggingface/transformers/79f5e05af067df502528a0d902e82c24c3f1df9ae570c91fcc38e1f3c0af4c45.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2-large/resolve/main/merges.txt from cache at /home/yugai/.cache/huggingface/transformers/7f7bf8a7802a708af08a812bfbdec9335f2c30f761ec14a8cd17b0d61c818876.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2-large/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2-large/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/gpt2-large/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/gpt2-large/resolve/main/config.json from cache at /home/yugai/.cache/huggingface/transformers/d82fb41558a2cc40bb6e10a57bbfbd9ff2f3c

In [3]:
query = 'Trace-based Just-in-Time Type Specialization for Dynamic'
cands = [
    'Languages',
    'Andreas Gal ∗ +, Brendan Eich ∗, Mike Shaver ∗, David Anderson ∗, David Mandelin ∗,',
    'Mohammad R. Haghighat $, Blake Kaplan ∗, Graydon Hoare ∗, Boris Zbarsky ∗, Jason Orendorff ∗,',
]

# query = 'Permission to make digital or hard copies of all or part of this work for personal or'
# cands = [
#     'classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. To copy otherwise, to republish, to post on servers or to redistribute to lists, requires prior specific permission and/or a fee. PLDI’09, June 15–20, 2009, Dublin, Ireland.',
#     'and is used for the application logic of browser-based productivity applications such as Google Mail, Google Docs and Zimbra Collaboration Suite. In this domain, in order to provide a fluid user experience and enable a new generation of applications, virtual machines must provide a low startup time and high performance.',
#     'We present a trace-based compilation technique for dynamic languages that reconciles speed of compilation with excellent performance of the generated machine code. Our system uses a mixedmode execution approach: the system starts running JavaScript in a fast-starting bytecode interpreter. As the program runs, the system identifies hot (frequently executed) bytecode sequences, records them, and compiles them to fast native code. We call such a sequence of instructions a trace. Unlike method-based dynamic compilers, our dynamic compiler operates at the granularity of individual loops. This design choice is based on the expectation that programs spend most of their time in hot loops. Even in dynamically typed languages, we expect hot loops to be mostly type-stable, meaning that the types of values are invariant. (12) For example, we would expect loop counters that start as integers to remain integers for all iterations. When both of these expectations hold, a trace-based compiler can cover the program execution with a small number of type-specialized, efficiently compiled traces.'
# ]

# query = 'Dynamic languages such as JavaScript, Python, and Ruby, are popular since they are expressive, accessible to non-experts, and make deployment as easy as distributing a source file. They are used for small scripts as well as for complex applications. JavaScript, for example, is the de facto standard for client-side web programming'
# cands = [
#     'Permission to make digital or hard copies of all or part of this work for personal or',
#     'classroom use is granted without fee provided that copies are not made or distributed for profit or commercial advantage and that copies bear this notice and the full citation on the first page. To copy otherwise, to republish, to post on servers or to redistribute to lists, requires prior specific permission and/or a fee. PLDI’09, June 15–20, 2009, Dublin, Ireland.',
#     'Copyright c © 2009 ACM 978-1-60558-392-1/09/06... $5.00',
#     'and is used for the application logic of browser-based productivity applications such as Google Mail, Google Docs and Zimbra Collaboration Suite. In this domain, in order to provide a fluid user experience and enable a new generation of applications, virtual machines must provide a low startup time and high performance.',
#     'Compilers for statically typed languages rely on type information to generate efficient machine code. In a dynamically typed programming language such as JavaScript, the types of expressions may vary at runtime. This means that the compiler can no longer easily transform operations into machine instructions that operate on one specific type. Without exact type information, the compiler must emit slower generalized machine code that can deal with all potential type combinations. While compile-time static type inference might be able to gather type information to generate optimized machine code, traditional static analysis is very expensive and hence not well suited for the highly interactive environment of a web browser.'
# ]

In [4]:
def forward(text, tokenizer, lm):
    inputs = tokenizer.batch_encode_plus(text,
                                         padding=True,
                                         truncation=True,
                                         return_tensors='pt',
                                         return_attention_mask=False,
                                         return_token_type_ids=True).to(device)
    with torch.no_grad():
        outputs = lm(inputs['input_ids'])
        input_ids = F.pad(inputs['input_ids'], [-1, 1, 0, 0],
                          value=tokenizer.eos_token_id)[:, :, None]
        log_probs = outputs['logits'].log_softmax(2).gather(
            2, input_ids).squeeze(2)
    return inputs, outputs, input_ids, log_probs


def display_as_df(input_ids, log_probs):
    df = pd.DataFrame(
        dict(
            chaini([[
                f'input_ids[{i}]',
                tokenizer.convert_ids_to_tokens(input_ids[i])
            ], [f'log_probs[{i}]', [round(_.item(), 3) for _ in log_probs[i]]]]
                   for i in range(len(input_ids)))))
    with pd.option_context('display.max_rows', len(df) + 1):
        display(df)

In [5]:
# text = [f'{tokenizer.eos_token} {query} {cand}' for cand in cands]
# text = [[f'{tokenizer.eos_token} {query}', cand] for cand in cands]
text = [[f'{tokenizer.eos_token} {query}', f' {cand}'] for cand in cands]
inputs, outputs, input_ids, log_probs = forward(
    text, tokenizer, lm)
display_as_df(input_ids, log_probs)

Unnamed: 0,input_ids[0],log_probs[0],input_ids[1],log_probs[1],input_ids[2],log_probs[2]
0,ĠTrace,-20.299,ĠTrace,-20.299,ĠTrace,-20.299
1,-,-5.195,-,-5.195,-,-5.195
2,based,-4.134,based,-4.134,based,-4.134
3,ĠJust,-11.263,ĠJust,-11.263,ĠJust,-11.263
4,-,-0.626,-,-0.626,-,-0.626
5,in,-0.734,in,-0.734,in,-0.734
6,-,-0.007,-,-0.007,-,-0.007
7,Time,-0.157,Time,-0.157,Time,-0.157
8,ĠType,-7.906,ĠType,-7.906,ĠType,-7.906
9,ĠSpecial,-10.564,ĠSpecial,-10.564,ĠSpecial,-10.564


In [6]:
win_size = 5
cumsum = inputs['token_type_ids'].cumsum(1)
mask = F.pad((0 < cumsum) & (cumsum <= win_size),
                [-1, 1, 0, 0])
scores = outputs['logits'].log_softmax(2).gather(
    2,
    F.pad(inputs['input_ids'],
            [-1, 1, 0, 0])[:, :, None]).squeeze(2).masked_fill(
                ~mask, 0).sum(1) / mask.sum(1)
scores

tensor([-11.4124,  -8.7616,  -6.4605], device='cuda:0')