# Test why RDP overestimates in the small-state case

In [1]:
from data_utils import MSCOCOData
from gpt2net import GPT2NetModel

In [2]:
import torch 
import numpy as np 
from tqdm import tqdm 
from time import time 

In [3]:
# Build the MSCOCO dataset wrapper with train/test batch size 10 and
# subsample=True. Construction reads and preprocesses the caption files,
# so this cell prints loading progress.
dataset = MSCOCOData(batch_size=10, test_batch_size=10, subsample=True)

Processing mscoco data ... 


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Reading mscoco raw data .. 
  data path: ../data/mscoco/captions_train2014.json


  0%|          | 0/414113 [00:00<?, ?it/s]

414113 sentences in total, 0.65sec


100%|██████████| 414113/414113 [00:00<00:00, 577006.65it/s]


Reading mscoco raw data .. 
  data path: ../data/mscoco/captions_val2014.json


  0%|          | 0/202654 [00:00<?, ?it/s]

202654 sentences in total, 0.38sec


100%|██████████| 202654/202654 [00:00<00:00, 479769.62it/s]
  1%|          | 62/5000 [00:00<00:08, 574.45it/s]

82586
82581 82573
Processing sentence


100%|██████████| 5000/5000 [00:04<00:00, 1096.40it/s]


Padding to max sentence length 18
Padding to max bow length 9


100%|██████████| 2000/2000 [00:00<00:00, 5635.06it/s]
100%|██████████| 1000/1000 [00:00<00:00, 5645.75it/s]


In [4]:
# Get the training dataloader from the dataset wrapper.
loader = dataset.train_dataloader()

In [5]:
# Take the first batch from a fresh iterator over the loader. Which examples
# it contains depends on the loader's ordering/shuffling (not visible here).
batch = next(iter(loader))

In [6]:
# Inspect which fields a batch provides.
batch.keys()

dict_keys(['input_ids', 'bow'])

In [7]:
# Instantiate the model under test. The keyword arguments are grouped by name
# only; their exact semantics are defined in GPT2NetModel (not visible here).
# NOTE(review): the forward_approx call later in this notebook passes
# sum_size=25 explicitly rather than the 20 configured here — confirm which
# value is intended for the comparison.
model = GPT2NetModel(
    # --- vocabulary / decoding limits, taken from the dataset wrapper ---
    vocab_size=len(dataset.tokenizer),
    pad_id=dataset.pad_id,
    bos_id=dataset.bos_id,
    max_dec_len=dataset.max_slen,
    # --- latent states and potentials ---
    num_state=50,
    transition_init_scale=0.01,
    potential_normalization='minmax',
    potential_scale=10.,
    # --- approximate-sum / sampling settings ---
    exact_rsample=True,
    sum_size=20,
    sample_size=5,
    proposal='softmax',
    transition_proposal='none',
    topk_sum=False,
    ent_approx=True,
    # --- task and regularisation switches ---
    task='paraphrasing',
    use_bow=True,
    use_copy=True,
    mask_z=False,
    z_st=True,
    dropout=0.2,
    word_dropout_decay=False,
    # --- runtime ---
    device='cuda',
    cache_dir='',
)

Some weights of GPT2Model were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.6.attn.masked_bias', 'h.9.attn.masked_bias', 'h.3.attn.masked_bias', 'h.10.attn.masked_bias', 'h.0.attn.masked_bias', 'h.2.attn.masked_bias', 'h.1.attn.masked_bias', 'h.8.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.11.attn.masked_bias', 'h.7.attn.masked_bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Move the model to the GPU. `.to()` returns the module, so the cell displays
# its (long) repr; append `;` to the line to suppress that output.
model.to('cuda')

GPT2NetModel(
  (encoder): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
       

In [9]:
# Encode the batch with the model's encoder (a GPT2Model per the repr above).
sent = batch['input_ids'].to('cuda')
# Skip the first position of every sequence
# (presumably the BOS token — TODO confirm against the data pipeline).
x_enc = sent[:, 1:]
# 1.0 at real tokens, 0.0 at padding positions.
attention_mask = x_enc.ne(model.pad_id).float()
# Per-sequence count of non-pad tokens.
x_lens = attention_mask.sum(dim=-1).long()
# Keep only the encoder's last_hidden_state output.
x_emb = model.encoder(
    x_enc,
    attention_mask=attention_mask,
).last_hidden_state

In [10]:
# Run the model's weight_norm step on the encoder states. It returns four
# values; the cells shown below use state_matrix, transition and emission.
# (What each one holds is defined in GPT2NetModel.weight_norm — not visible here.)
state_matrix, emission_seq, transition, emission = model.weight_norm(x_emb)

In [11]:
# Shape check. The recorded output is (10, 17, 50): 10 and 50 match the batch
# size and num_state configured above.
emission.shape

torch.Size([10, 17, 50])

In [12]:
# Reference value from crf.forward_sum over the transition/emission potentials.
# The second return value is kept as the exact log-partition (log Z) per
# sequence — naming per this notebook; TODO confirm against crf.forward_sum.
# NOTE(review): binding `_` shadows IPython's last-output variable.
_, log_z_exact = model.crf.forward_sum(transition, emission, x_lens)

In [13]:
# Exact value for the first sequence in the batch; the baseline that the
# approximate estimates below are compared against.
log_z_exact[0]

tensor(142.5574, device='cuda:0', grad_fn=<SelectBackward>)

In [17]:
# Call crf.forward_approx 100 times and record, for each call, the estimate
# for the first sequence in the batch plus the returned sampled indices.
# sum_size / sample_size are passed explicitly here (25 / 5).
log_z_est_non_trans, sampled_index_all = [], []
for _ in range(100):
    est, sampled_index = model.crf.forward_approx(
        state_matrix,
        emission,
        x_lens,
        sum_size=25,
        proposal='softmax',
        transition_proposal='none',
        sample_size=5,
        return_sampled_idx=True,
    )
    # .item() already yields a host-side Python float.
    log_z_est_non_trans.append(est[0].item())
    sampled_index_all.append(sampled_index)

In [18]:
# Mean of the recorded estimates for sequence 0. In the recorded run it agrees
# with log_z_exact[0] to the 4 decimals that value is printed with.
np.mean(log_z_est_non_trans)

142.55740051269532

In [19]:
# Shape of the first entry of sampled_index from the last loop iteration.
# Recorded: (17, 30); 30 presumably = sum_size 25 + sample_size 5 —
# TODO confirm against crf.forward_approx.
sampled_index[0].shape

torch.Size([17, 30])