In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2
# 多行输出
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all" 

[huggingface/pytorch-transformers: 👾 A library of state-of-the-art pretrained models for Natural Language Processing (NLP)](https://github.com/huggingface/pytorch-transformers)

In [25]:
import torch
from pytorch_transformers import *
import logging
logging.basicConfig(level=logging.INFO)
from pathlib import Path
import os

In [3]:
root = Path('/home/lyc/.torch/models/bert-base-uncased/')
# os.makedirs(root)

## BERT 结构

In [26]:
# Load pre-trained model tokenizer (vocabulary)  使用预训练的词典
config = BertConfig.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/lyc/.torch/pytorch_transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
INFO:pytorch_transformers.modeling_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:pytorch_transformers.tokenization_utils:loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt fr

In [5]:
tokenizer.vocab_size

30522

In [6]:
# Tokenize input
text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
tokenized_text = tokenizer.tokenize(text)

In [7]:
tokenized_text

['[CLS]',
 'who',
 'was',
 'jim',
 'henson',
 '?',
 '[SEP]',
 'jim',
 'henson',
 'was',
 'a',
 'puppet',
 '##eer',
 '[SEP]']

In [8]:
# Mask a token that we will try to predict back with `BertForMaskedLM`
masked_index = 8
tokenized_text[masked_index] = '[MASK]'
assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']

In [9]:
tokenized_text

['[CLS]',
 'who',
 'was',
 'jim',
 'henson',
 '?',
 '[SEP]',
 'jim',
 '[MASK]',
 'was',
 'a',
 'puppet',
 '##eer',
 '[SEP]']

In [10]:
# Convert token to vocabulary indices
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

In [11]:
indexed_tokens  # 索引 stoi

[101,
 2040,
 2001,
 3958,
 27227,
 1029,
 102,
 3958,
 103,
 2001,
 1037,
 13997,
 11510,
 102]

In [12]:
# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

In [13]:
# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])
tokens_tensor
segments_tensors

tensor([[  101,  2040,  2001,  3958, 27227,  1029,   102,  3958,   103,  2001,
          1037, 13997, 11510,   102]])

tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]])

In [14]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased', cached_dir=root)

INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /home/lyc/.torch/pytorch_transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c
INFO:pytorch_transformers.modeling_utils:Model config {
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

INFO:pytorch_transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin not found in ca

In [15]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=

In [16]:
# Set the model in evaluation mode to desactivate the DropOut modules
# This is IMPORTANT to have reproductible results during evaluation!
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor = tokens_tensor.to('cuda')
segments_tensors = segments_tensors.to('cuda')
model.to('cuda');

In [17]:
# Predict hidden states features for each layer
with torch.no_grad():
    # See the models docstrings for the detail of the inputs
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    # PyTorch-Transformers models always output tuples.
    # See the models docstrings for the detail of all the outputs
    # In our case, the first element is the hidden state of the last layer of the Bert model
    encoded_layers = outputs[0]
# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)

In [18]:
len(outputs)
outputs[0].size()
outputs[1].size()

2

torch.Size([1, 14, 768])

torch.Size([1, 768])

In [19]:
encoded_layers

tensor([[[-0.5570,  0.2839, -0.6436,  ..., -0.7274,  0.4557,  0.6204],
         [-1.1572,  0.0354,  0.0355,  ...,  0.0591,  0.1097, -0.3150],
         [ 0.1008, -0.5286, -0.4688,  ..., -0.0431,  0.4889,  0.4134],
         ...,
         [ 0.1948,  0.0761,  0.2893,  ..., -0.0807,  0.7071,  0.0502],
         [-0.1119,  0.0714,  0.6101,  ...,  0.4044,  0.1614, -0.3569],
         [ 0.8296,  0.2729, -0.3090,  ...,  0.2452, -0.3581, -0.1970]]],
       device='cuda:0')

LM

In [20]:
# Predict all tokens
with torch.no_grad():
    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
    predictions = outputs[0]

# confirm we were able to predict 'henson'
predicted_index = torch.argmax(predictions[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]

In [21]:
predicted_token

'[unused171]'

In [30]:
model1 = BertForSequenceClassification(config)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1

In [31]:
model1

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1)
            )
          )
          (intermediat

In [33]:
outputs = model1(input_ids, labels=labels)
loss, logits = outputs[:2]

In [41]:
outputs[0].detach().item()

0.8445613384246826

In [43]:
outputs[1].detach().data

tensor([[0.5092, 0.2263]])

In [44]:
model = BertForMultipleChoice(config)
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
labels = torch.tensor(1).unsqueeze(0)  # Batch size 1

In [45]:
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]

In [46]:
outputs

(tensor(0.4529, grad_fn=<NllLossBackward>),
 tensor([[-0.4277,  0.1295]], grad_fn=<ViewBackward>))

In [48]:
input_ids
labels

tensor([[[ 7592,  1010,  2026,  3899,  2003, 10140],
         [ 7592,  1010,  2026,  4937,  2003,  6429]]])

tensor([1])

In [50]:
[tokenizer.tokenize(s) for s in choices]

[['hello', ',', 'my', 'dog', 'is', 'cute'],
 ['hello', ',', 'my', 'cat', 'is', 'amazing']]

In [51]:
config

{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

## GPT2

In [52]:
config = GPT2Config.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model(config)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1

INFO:pytorch_transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json not found in cache, downloading to /tmp/tmpj__borob
100%|██████████| 176/176 [00:00<00:00, 199620.74B/s]
INFO:pytorch_transformers.file_utils:copying /tmp/tmpj__borob to cache at /home/lyc/.torch/pytorch_transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.085d5f6a8e7812ea05ff0e6ed0645ab2e75d80387ad55c1ad9806ee70d272f80
INFO:pytorch_transformers.file_utils:creating metadata file for /home/lyc/.torch/pytorch_transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6aaeb95649fcf.085d5f6a8e7812ea05ff0e6ed0645ab2e75d80387ad55c1ad9806ee70d272f80
INFO:pytorch_transformers.file_utils:removing temp file /tmp/tmpj__borob
INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json from cache at /home/lyc/.torch/pytorch_transformers/4be02c5697d91738003fb1685c9872f284166aa32e061576bbe6a

In [53]:
outputs = model(input_ids)
last_hidden_states = outputs[0]

In [61]:
outputs[0].size()
[x.size() for x in outputs[1]]

torch.Size([1, 6, 768])

[torch.Size([2, 1, 12, 6, 64]),
 torch.Size([2, 1, 12, 6, 64]),
 torch.Size([2, 1, 12, 6, 64]),
 torch.Size([2, 1, 12, 6, 64]),
 torch.Size([2, 1, 12, 6, 64]),
 torch.Size([2, 1, 12, 6, 64]),
 torch.Size([2, 1, 12, 6, 64]),
 torch.Size([2, 1, 12, 6, 64]),
 torch.Size([2, 1, 12, 6, 64]),
 torch.Size([2, 1, 12, 6, 64]),
 torch.Size([2, 1, 12, 6, 64]),
 torch.Size([2, 1, 12, 6, 64])]

In [62]:
model

GPT2Model(
  (wte): Embedding(50257, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1)
  (h): ModuleList(
    (0): Block(
      (ln_1): BertLayerNorm()
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_2): BertLayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
    )
    (1): Block(
      (ln_1): BertLayerNorm()
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_2): BertLayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
    )
    (2): Block(
      (ln_1): BertLayerNorm()
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)


- GPT2 使用12个transformer的decoder

## Transformer-XL

In [63]:
config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
model = TransfoXLModel(config)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
outputs = model(input_ids)
last_hidden_states, mems = outputs[:2]

INFO:pytorch_transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json not found in cache, downloading to /tmp/tmprc0lzrpf
100%|██████████| 606/606 [00:00<00:00, 781355.13B/s]
INFO:pytorch_transformers.file_utils:copying /tmp/tmprc0lzrpf to cache at /home/lyc/.torch/pytorch_transformers/a6dfd6a3896b3ae4c1a3c5f26ff1f1827c26c15b679de9212a04060eaf1237df.aef76fb1064c932cd6a2a2be3f23ebbfa5f9b6e29e8e87b571c45b4a5d5d1b90
INFO:pytorch_transformers.file_utils:creating metadata file for /home/lyc/.torch/pytorch_transformers/a6dfd6a3896b3ae4c1a3c5f26ff1f1827c26c15b679de9212a04060eaf1237df.aef76fb1064c932cd6a2a2be3f23ebbfa5f9b6e29e8e87b571c45b4a5d5d1b90
INFO:pytorch_transformers.file_utils:removing temp file /tmp/tmprc0lzrpf
INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/transfo-xl-wt103-config.json from cache at /home/lyc/.torch/pytorch_transformers/a6dfd6a3896b3ae4c1a3c5f26ff1f

In [64]:
config

{
  "adaptive": true,
  "attn_type": 0,
  "clamp_len": 1000,
  "cutoffs": [
    20000,
    40000,
    200000
  ],
  "d_embed": 1024,
  "d_head": 64,
  "d_inner": 4096,
  "d_model": 1024,
  "div_val": 4,
  "dropatt": 0.0,
  "dropout": 0.1,
  "ext_len": 0,
  "finetuning_task": null,
  "init": "normal",
  "init_range": 0.01,
  "init_std": 0.02,
  "mem_len": 1600,
  "n_head": 16,
  "n_layer": 18,
  "n_token": 267735,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pre_lnorm": false,
  "proj_init_std": 0.01,
  "same_length": true,
  "sample_softmax": -1,
  "tgt_len": 128,
  "tie_projs": [
    false,
    true,
    true,
    true
  ],
  "tie_weight": true,
  "torchscript": false,
  "untie_r": true
}

In [65]:
model

TransfoXLModel(
  (word_emb): AdaptiveEmbedding(
    (emb_layers): ModuleList(
      (0): Embedding(20000, 1024)
      (1): Embedding(20000, 256)
      (2): Embedding(160000, 64)
      (3): Embedding(67735, 16)
    )
    (emb_projs): ParameterList(
        (0): Parameter containing: [torch.FloatTensor of size 1024x1024]
        (1): Parameter containing: [torch.FloatTensor of size 1024x256]
        (2): Parameter containing: [torch.FloatTensor of size 1024x64]
        (3): Parameter containing: [torch.FloatTensor of size 1024x16]
    )
  )
  (drop): Dropout(p=0.1)
  (layers): ModuleList(
    (0): RelPartialLearnableDecoderLayer(
      (dec_attn): RelPartialLearnableMultiHeadAttn(
        (qkv_net): Linear(in_features=1024, out_features=3072, bias=False)
        (drop): Dropout(p=0.1)
        (dropatt): Dropout(p=0.0)
        (o_net): Linear(in_features=1024, out_features=1024, bias=False)
        (layer_norm): BertLayerNorm()
        (r_net): Linear(in_features=1024, out_features=1024,

In [68]:
outputs[0].size()

torch.Size([1, 5, 1024])

In [69]:
[x.size() for x in outputs[1]]

[torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024]),
 torch.Size([1600, 1, 1024])]

## XLNet

In [85]:
config = XLNetConfig.from_pretrained('xlnet-base-cased')
tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
model = XLNetModel(config)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1

INFO:pytorch_transformers.file_utils:https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json not found in cache, downloading to /tmp/tmpzhdpubiu
100%|██████████| 641/641 [00:00<00:00, 736992.56B/s]
INFO:pytorch_transformers.file_utils:copying /tmp/tmpzhdpubiu to cache at /home/lyc/.torch/pytorch_transformers/c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.ef1824921bc0786e97dc88d55eb17aabf18aac90f24bd34c0650529e7ba27d6f
INFO:pytorch_transformers.file_utils:creating metadata file for /home/lyc/.torch/pytorch_transformers/c9cc6e53904f7f3679a31ec4af244f4419e25ebc8e71ebf8c558a31cbcf07fc8.ef1824921bc0786e97dc88d55eb17aabf18aac90f24bd34c0650529e7ba27d6f
INFO:pytorch_transformers.file_utils:removing temp file /tmp/tmpzhdpubiu
INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json from cache at /home/lyc/.torch/pytorch_transformers/c9cc6e53904f7f3679a31ec4af244

In [86]:
config

{
  "attn_type": "bi",
  "bi_data": false,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 3072,
  "d_model": 768,
  "dropout": 0.1,
  "end_n_top": 5,
  "ff_activation": "gelu",
  "finetuning_task": null,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "n_head": 12,
  "n_layer": 12,
  "n_token": 32000,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "torchscript": false,
  "untie_r": true
}

In [87]:
model

XLNetModel(
  (word_embedding): Embedding(32000, 768)
  (layer): ModuleList(
    (0): XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): XLNetLayerNorm()
        (dropout): Dropout(p=0.1)
      )
      (ff): XLNetFeedForward(
        (layer_norm): XLNetLayerNorm()
        (layer_1): Linear(in_features=768, out_features=3072, bias=True)
        (layer_2): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.1)
      )
      (dropout): Dropout(p=0.1)
    )
    (1): XLNetLayer(
      (rel_attn): XLNetRelativeAttention(
        (layer_norm): XLNetLayerNorm()
        (dropout): Dropout(p=0.1)
      )
      (ff): XLNetFeedForward(
        (layer_norm): XLNetLayerNorm()
        (layer_1): Linear(in_features=768, out_features=3072, bias=True)
        (layer_2): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.1)
      )
      (dropout): Dropout(p=0.1)
    )
    (2): XLNetLayer(
      (rel_attn): XLNetR

In [88]:
outputs = model(input_ids)
last_hidden_states = outputs[0]

In [89]:
outputs[0].detach()
outputs[0].size()

tensor([[[ 0.4631,  2.5091,  1.6458,  ..., -0.1804, -0.0000, -0.4700],
         [ 0.2742, -0.2042, -0.5669,  ...,  2.8734, -0.3431,  0.2065],
         [ 0.9146,  0.5399,  0.0661,  ...,  1.4533,  0.4497, -0.0844],
         ...,
         [ 1.5695,  0.0000,  3.2084,  ...,  2.5973, -1.4548,  0.1073],
         [ 1.3308,  1.4569, -0.1415,  ...,  1.9561, -1.5834,  0.5102],
         [-1.2122, -0.2389, -0.1772,  ...,  0.4195, -0.1349, -0.2906]]])

torch.Size([1, 7, 768])

In [90]:
model = XLNetForSequenceClassification(config)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1

In [91]:
model

XLNetForSequenceClassification(
  (transformer): XLNetModel(
    (word_embedding): Embedding(32000, 768)
    (layer): ModuleList(
      (0): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): XLNetLayerNorm()
          (dropout): Dropout(p=0.1)
        )
        (ff): XLNetFeedForward(
          (layer_norm): XLNetLayerNorm()
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1)
        )
        (dropout): Dropout(p=0.1)
      )
      (1): XLNetLayer(
        (rel_attn): XLNetRelativeAttention(
          (layer_norm): XLNetLayerNorm()
          (dropout): Dropout(p=0.1)
        )
        (ff): XLNetFeedForward(
          (layer_norm): XLNetLayerNorm()
          (layer_1): Linear(in_features=768, out_features=3072, bias=True)
          (layer_2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout

In [92]:
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]

In [97]:
outputs

(tensor(0.9764, grad_fn=<NllLossBackward>),
 tensor([[ 0.0759, -0.4278]], grad_fn=<AddmmBackward>),
 (None, None, None, None, None, None, None, None, None, None, None, None))

In [123]:
-torch.log_softmax(outputs[1].detach(), 1)

tensor([[0.4727, 0.9764]])

## XLM

In [None]:
config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
model = XLMForSequenceClassification(config)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1

INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json from cache at /home/lyc/.torch/pytorch_transformers/063cbd65bb7d2e7fa034126477f72870c897d51a5e29a6baf2ebe35acf00810c.a9584498ff24d6bef104dcc2693a9efab757d2e5ad782c797c29c89fa445b552
INFO:pytorch_transformers.modeling_utils:Model config {
  "asm": false,
  "attention_dropout": 0.1,
  "bos_index": 0,
  "causal": false,
  "dropout": 0.1,
  "emb_dim": 2048,
  "embed_init_std": 0.02209708691207961,
  "end_n_top": 5,
  "eos_index": 1,
  "finetuning_task": null,
  "gelu_activation": true,
  "init_std": 0.02,
  "is_encoder": true,
  "layer_norm_eps": 1e-12,
  "mask_index": 5,
  "max_position_embeddings": 512,
  "n_heads": 16,
  "n_langs": 1,
  "n_layers": 12,
  "n_words": 30145,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pad_index": 2,
  "sinusoidal_embeddings": false,
  "start_n_top": 5,
  "summary_activat

In [None]:
model

In [None]:
outputs = model(input_ids, labels=labels)
loss, logits = outputs[:2]

## GPT

In [133]:
config = OpenAIGPTConfig.from_pretrained('openai-gpt')
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model = OpenAIGPTModel(config)
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1

INFO:pytorch_transformers.modeling_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json from cache at /home/lyc/.torch/pytorch_transformers/a27bb7c70e9002d7558d2682d5a95f3c0a8b31034616309459e0b51ef07ade09.f59b19eb0e361a0230a1106b66b8c6e7a994cb200cd63d9190cda8d56d75ff85
INFO:pytorch_transformers.modeling_utils:Model config {
  "afn": "gelu",
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "finetuning_task": null,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "n_ctx": 512,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 512,
  "n_special": 0,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "predict_special_tokens": true,
  "resid_pdrop": 0.1,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "token_ids",
  "summary_use_proj": true,
  "torchscript": false,
  "vocab_size": 40478
}

INFO:pytorch_transformers

In [134]:
model

OpenAIGPTModel(
  (tokens_embed): Embedding(40478, 768)
  (positions_embed): Embedding(512, 768)
  (drop): Dropout(p=0.1)
  (h): ModuleList(
    (0): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): BertLayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
      (ln_2): BertLayerNorm()
    )
    (1): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
        (resid_dropout): Dropout(p=0.1)
      )
      (ln_1): BertLayerNorm()
      (mlp): MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1)
      )
      (ln_2): BertLayerNorm()
    )
    (2): Block(
      (attn): Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1)
     

In [135]:
outputs = model(input_ids)
last_hidden_states = outputs[0]

In [136]:
outputs

(tensor([[[-0.8225,  0.3185,  1.3487,  ..., -1.8885,  0.2741,  1.0912],
          [-0.8317,  0.8078, -0.4801,  ..., -0.0639,  0.0191,  1.9491],
          [-1.2643,  0.1680,  1.3317,  ..., -1.1416,  0.1503,  1.3841],
          [-0.5614, -0.8596,  0.3522,  ...,  0.1271, -0.3651,  0.2104],
          [ 0.5176,  2.5435,  0.3551,  ..., -0.5990, -0.7403,  0.4764],
          [-0.1666,  0.7315,  1.0842,  ..., -0.7447, -1.6451, -1.4017]]],
        grad_fn=<ViewBackward>),)