In [5]:
import sys
import numpy as np
import torch
# Allocate a one-element zero tensor and, when a CUDA device is present, move it
# to the GPU. The guard keeps this cell runnable on CPU-only machines, where an
# unconditional `.cuda()` call raises.
# NOTE(review): presumably this exists to initialise the CUDA context before
# `transformers` is imported -- confirm whether it is still needed at all.
dummy = torch.zeros(1)
if torch.cuda.is_available():
    dummy = dummy.cuda()
from transformers import *

In [6]:
# The library exposes one unified API across architectures. Each row below is a
# (model class, tokenizer class, pretrained-weights shortcut) triple; ten
# architectures are listed.
MODELS = [
    (BertModel, BertTokenizer, "bert-base-uncased"),
    (OpenAIGPTModel, OpenAIGPTTokenizer, "openai-gpt"),
    (GPT2Model, GPT2Tokenizer, "gpt2"),
    (CTRLModel, CTRLTokenizer, "ctrl"),
    (TransfoXLModel, TransfoXLTokenizer, "transfo-xl-wt103"),
    (XLNetModel, XLNetTokenizer, "xlnet-base-cased"),
    (XLMModel, XLMTokenizer, "xlm-mlm-enfr-1024"),
    (DistilBertModel, DistilBertTokenizer, "distilbert-base-cased"),
    (RobertaModel, RobertaTokenizer, "roberta-base"),
    (XLMRobertaModel, XLMRobertaTokenizer, "xlm-roberta-base"),
]

In [3]:
# Turn one sample sentence into a sequence of hidden states with every
# (model, tokenizer, weights) triple listed in MODELS.
for model_class, tokenizer_class, pretrained_weights in MODELS:
    # Tokenizer and model are both built from the same pretrained shortcut name.
    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    model = model_class.from_pretrained(pretrained_weights)

    # add_special_tokens=True lets the tokenizer insert its own markers
    # ([CLS], [SEP], <s>, ...) in the right way for each model. The encoded ids
    # are wrapped in an outer list so the tensor has a leading batch axis of 1.
    input_ids = torch.tensor(
        [tokenizer.encode("Here is some text to encode", add_special_tokens=True)]
    )

    # Inference only, so gradient tracking is switched off. Model outputs are
    # tuples; element 0 is kept as the last hidden states.
    with torch.no_grad():
        last_hidden_states = model(input_ids)[0]

In [7]:
# BERT model classes iterated over by the loop that follows in this cell.
BERT_MODEL_CLASSES = [
    BertModel,
    BertForPreTraining,
    BertForMaskedLM,
    BertForNextSentencePrediction,
    BertForSequenceClassification,
    BertForTokenClassification,
    BertForQuestionAnswering,
]

# For every BERT class, print the shapes of the per-layer hidden states and
# attention weights produced for one sample sentence.
pretrained_weights = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
for model_class in BERT_MODEL_CLASSES:
    print(model_class)

    # Load the checkpoint exactly once, already configured so the model returns
    # the full list of hidden states and attention weights at each layer.
    # (A plain load followed by this configured load would only do the work twice.)
    model = model_class.from_pretrained(pretrained_weights,
                                        output_hidden_states=True,
                                        output_attentions=True)

    # Special tokens are requested explicitly, matching the earlier encoding cell;
    # the outer list gives the tensor a leading batch axis of 1.
    input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text",
                                               add_special_tokens=True)])

    # Inference only: no autograd graph is needed just to inspect shapes.
    with torch.no_grad():
        # The last two elements of the output tuple are taken as
        # (all hidden states, all attentions).
        all_hidden_states, all_attentions = model(input_ids)[-2:]

    print([hidden.shape for hidden in all_hidden_states])
    print([attention.shape for attention in all_attentions])

<class 'transformers.modeling_bert.BertModel'>
[torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768])]
[torch.Size([1, 12, 16, 16]), torch.Size([1, 12, 16, 16]), torch.Size([1, 12, 16, 16]), torch.Size([1, 12, 16, 16]), torch.Size([1, 12, 16, 16]), torch.Size([1, 12, 16, 16]), torch.Size([1, 12, 16, 16]), torch.Size([1, 12, 16, 16]), torch.Size([1, 12, 16, 16]), torch.Size([1, 12, 16, 16]), torch.Size([1, 12, 16, 16]), torch.Size([1, 12, 16, 16])]
<class 'transformers.modeling_bert.BertForPreTraining'>
[torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), torch.Size([1, 16, 768]), 