In [1]:
from collections import namedtuple
import os
import ast
import numpy as np
import tqdm
import torch
from models import TransformerLM, TransformerConditionedLM
import torch.nn.functional as F
from fairseq import checkpoint_utils, options, tasks, utils


In [2]:
uLM_checkpoint_path = "/net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/uLM/hubert100_lm/checkpoint_best.pt"
checkpoint = torch.load(uLM_checkpoint_path)
uLM_state_dict = checkpoint["model"]

In [134]:
model = TransformerLM(
    vocab_size=104,
    d_model=1024,
    nhead=16,
    num_layers=12,
    activation="relu",
    layer_norm_eps=1e-5,
    batch_first=True,
    norm_first=True,
    classifier_bias=False,
)

In [135]:
for k, v in model.state_dict().items():
    print(k)
    print(v)
    break

embed.weight
tensor([[ 0.0393, -0.0569,  0.0812,  ...,  0.0961,  0.0389, -0.0317],
        [-0.0721, -0.0568, -0.0007,  ...,  0.0997, -0.0921, -0.0186],
        [ 0.0017,  0.0506,  0.0050,  ..., -0.0804,  0.0531, -0.0834],
        ...,
        [ 0.0706,  0.0569,  0.0035,  ...,  0.0178,  0.0548,  0.0307],
        [-0.0836,  0.0109, -0.0821,  ...,  0.0065, -0.0768,  0.0746],
        [-0.0314,  0.0458, -0.0280,  ..., -0.0913, -0.0764,  0.0452]])


In [136]:
# for k, v in model.state_dict().items():
#     print(k)
#     print(model.state_dict()[k].shape)
#     #print(v.shape)

In [137]:
# for k, v in uLM_state_dict.items():
#     print(k)
#     print(v.shape)

In [138]:
"""
For Initial Embedding:
prefix
    None                       decoder.

    embed.weight:              embed_tokens.weight

For Each Layer:
prefix: 
    LM_decoder.layers.[].      decoder.layers.[].

    self_attn.in_proj_weight : self_attn.q_proj.weight
                               self_attn.k_proj.weight
                               self_attn.v_proj.weight
    self_attn.in_proj_bias :   self_attn.q_proj.bias
                               self_attn.k_proj.bias
                               self_attn.v_proj.bias
    self_attn.out_proj.weight: self_attn.out_proj.weight
    self_attn.out_proj.bias:   self_attn.out_proj.bias
    linear1.weight:            fc1.weight
    linear1.bias:              fc1.bias
    linear2.weight:            fc2.weight
    linear2.bias:              fc2.bias
    norm1.weight:              self_attn_layer_norm.weight
    norm1.bias:                self_attn_layer_norm.bias
    norm2.weight:              final_layer_norm.weight
    norm2.bias:                final_layer_norm.bias

For Final Layer:
prefix:
    LM_decoder.                decoder.

    norm.weight:               layer_norm.weight
    norm.bias:                 layer_norm.bias

Classifier:
prefix:
    None                       decoder.
    
    classifier.weight:         output_projection.weight
"""

'\nFor Initial Embedding:\nprefix\n    None                       decoder.\n\n    embed.weight:              embed_tokens.weight\n\nFor Each Layer:\nprefix: \n    LM_decoder.layers.[].      decoder.layers.[].\n\n    self_attn.in_proj_weight : self_attn.q_proj.weight\n                               self_attn.k_proj.weight\n                               self_attn.v_proj.weight\n    self_attn.in_proj_bias :   self_attn.q_proj.bias\n                               self_attn.k_proj.bias\n                               self_attn.v_proj.bias\n    self_attn.out_proj.weight: self_attn.out_proj.weight\n    self_attn.out_proj.bias:   self_attn.out_proj.bias\n    linear1.weight:            fc1.weight\n    linear1.bias:              fc1.bias\n    linear2.weight:            fc2.weight\n    linear2.bias:              fc2.bias\n    norm1.weight:              self_attn_layer_norm.weight\n    norm1.bias:                self_attn_layer_norm.bias\n    norm2.weight:              final_layer_norm.weight\n  

In [139]:
def load_key(tgt_state_dict, key, value):
    model = tgt_state_dict
    # Make sure the loaded values won't cause errors
    assert model[key].shape == value.shape, f"Key {key}, need shape {model[key].shape}, get shape {value.shape}"
    assert model[key].dtype == value.dtype, f"Key {key}, need type {model[key].dtype}, get type {value.dtype}"
    model[key] = value
    return model

In [140]:
def load_layer(tgt_state_dict, src_state_dict, layer_id):
    # Load in_proj weight and bias
    tgt_prefix = f"LM_decoder.layers.{int(layer_id)}."
    src_prefix = f"decoder.layers.{int(layer_id)}."
    src_q_weight = src_state_dict[src_prefix + "self_attn.q_proj.weight"]
    src_q_bias = src_state_dict[src_prefix + "self_attn.q_proj.bias"]
    src_k_weight = src_state_dict[src_prefix + "self_attn.k_proj.weight"]
    src_k_bias = src_state_dict[src_prefix + "self_attn.k_proj.bias"]
    src_v_weight = src_state_dict[src_prefix + "self_attn.v_proj.weight"]
    src_v_bias = src_state_dict[src_prefix + "self_attn.v_proj.bias"]
    
    in_proj_weight = torch.cat((src_q_weight,src_k_weight,src_v_weight), dim=0)
    in_proj_bias = torch.cat((src_q_bias,src_k_bias,src_v_bias), dim=0)
    
    tgt_state_dict = load_key(tgt_state_dict, tgt_prefix+"self_attn.in_proj_weight", in_proj_weight)
    tgt_state_dict = load_key(tgt_state_dict, tgt_prefix+"self_attn.in_proj_bias", in_proj_bias)

    # load others
    tgt_state_dict = load_key(tgt_state_dict, tgt_prefix+"self_attn.out_proj.weight", src_state_dict[src_prefix + "self_attn.out_proj.weight"])
    tgt_state_dict = load_key(tgt_state_dict, tgt_prefix+"self_attn.out_proj.bias", src_state_dict[src_prefix + "self_attn.out_proj.bias"])
    tgt_state_dict = load_key(tgt_state_dict, tgt_prefix+"linear1.weight", src_state_dict[src_prefix + "fc1.weight"])
    tgt_state_dict = load_key(tgt_state_dict, tgt_prefix+"linear1.bias", src_state_dict[src_prefix + "fc1.bias"])
    tgt_state_dict = load_key(tgt_state_dict, tgt_prefix+"linear2.weight", src_state_dict[src_prefix + "fc2.weight"])
    tgt_state_dict = load_key(tgt_state_dict, tgt_prefix+"linear2.bias", src_state_dict[src_prefix + "fc2.bias"])
    tgt_state_dict = load_key(tgt_state_dict, tgt_prefix+"norm1.weight", src_state_dict[src_prefix + "self_attn_layer_norm.weight"])
    tgt_state_dict = load_key(tgt_state_dict, tgt_prefix+"norm1.bias", src_state_dict[src_prefix + "self_attn_layer_norm.bias"])
    tgt_state_dict = load_key(tgt_state_dict, tgt_prefix+"norm2.weight", src_state_dict[src_prefix + "final_layer_norm.weight"])
    tgt_state_dict = load_key(tgt_state_dict, tgt_prefix+"norm2.bias", src_state_dict[src_prefix + "final_layer_norm.bias"])
    # print(tgt_state_dict[tgt_prefix+"self_attn.in_proj_weight"])
    # print(src_q_weight)
    #print(src_v_weight)
    return tgt_state_dict

def load_embed(tgt_state_dict, src_state_dict):
    tgt_state_dict = load_key(tgt_state_dict, "embed.weight", src_state_dict["decoder.embed_tokens.weight"])
    tgt_state_dict = load_key(tgt_state_dict, "classifier.weight", src_state_dict["decoder.output_projection.weight"])
    return tgt_state_dict

def load_final_norm(tgt_state_dict, src_state_dict):
    tgt_state_dict = load_key(tgt_state_dict, "LM_decoder.norm.weight", src_state_dict["decoder.layer_norm.weight"])
    tgt_state_dict = load_key(tgt_state_dict, "LM_decoder.norm.bias", src_state_dict["decoder.layer_norm.bias"])
    return tgt_state_dict

In [141]:
model_state_dict = model.state_dict()

In [142]:
model_state_dict = load_embed(model_state_dict, uLM_state_dict)
model_state_dict = load_final_norm(model_state_dict, uLM_state_dict)
for i in range(12):
    model_state_dict = load_layer(model_state_dict, uLM_state_dict, i)

In [143]:
for k, v in model.state_dict().items():
    print(k)
    print(v)
    break

embed.weight
tensor([[ 0.0393, -0.0569,  0.0812,  ...,  0.0961,  0.0389, -0.0317],
        [-0.0721, -0.0568, -0.0007,  ...,  0.0997, -0.0921, -0.0186],
        [ 0.0017,  0.0506,  0.0050,  ..., -0.0804,  0.0531, -0.0834],
        ...,
        [ 0.0706,  0.0569,  0.0035,  ...,  0.0178,  0.0548,  0.0307],
        [-0.0836,  0.0109, -0.0821,  ...,  0.0065, -0.0768,  0.0746],
        [-0.0314,  0.0458, -0.0280,  ..., -0.0913, -0.0764,  0.0452]])


In [158]:
model.load_state_dict(model_state_dict)

<All keys matched successfully>

In [159]:
for name, param in model.named_parameters():
    print(name)
    print(param)
    break

embed.weight
Parameter containing:
tensor([[-0.0534,  0.0380, -0.1919,  ..., -0.1920, -0.0940, -0.0767],
        [-0.0703,  0.0638, -0.1917,  ..., -0.1895, -0.0715, -0.0443],
        [-0.0107, -0.0076, -0.0311,  ..., -0.0222, -0.1194, -0.0261],
        ...,
        [-0.0319,  0.0427,  0.0266,  ...,  0.0098, -0.0749,  0.0232],
        [ 0.0108,  0.0111,  0.0388,  ..., -0.0373, -0.0581, -0.0033],
        [ 0.0684, -0.0652, -0.0375,  ..., -0.0121, -0.0370, -0.0023]],
       requires_grad=True)


In [160]:
model.eval()

TransformerLM(
  (embed): Embedding(104, 1024)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (LM_decoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
        )
        (linear1): Linear(in_features=1024, out_features=4096, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=4096, out_features=1024, bias=True)
        (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
      (1): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024,

In [167]:
k = 10
k_prev_words = torch.LongTensor([[2, 75, 16]] * k)
seqs = k_prev_words  # (k, 1)
print(seqs)
# Tensor to store top k sequences' scores; now they're just 0
top_k_scores = torch.zeros(k, 1)  # (k, 1)
# Lists to store completed sequences and scores
complete_seqs = list()
complete_seqs_scores = list()
# Start decoding
step = 5

x = model.embed(seqs)
x = model.pos_encoder(x)
output = model.LM_decoder(x)
print(output)
scores = model.classifier(output[:,-1,:])
scores = F.log_softmax(scores, dim=1)
# Add
scores = top_k_scores.expand_as(scores) + scores  # (s, vocab_size)
if step == 1:
    top_k_scores, top_k_words = scores[0].topk(k, 0, True, True)  # (s)
else:
    # Unroll and find top scores, and their unrolled indices
    top_k_scores, top_k_words = scores.view(-1).topk(k, 0, True, True)  # (s)

prev_word_inds = torch.div(top_k_words, 104, rounding_mode="floor")
next_word_inds = top_k_words % 104  # (s)

tensor([[ 2, 75, 16],
        [ 2, 75, 16],
        [ 2, 75, 16],
        [ 2, 75, 16],
        [ 2, 75, 16],
        [ 2, 75, 16],
        [ 2, 75, 16],
        [ 2, 75, 16],
        [ 2, 75, 16],
        [ 2, 75, 16]])
tensor([[[-0.2129,  1.1702,  2.6507,  ...,  0.2212,  0.1652, -0.0551],
         [-0.2535,  1.1923,  2.6548,  ...,  0.1870,  0.1670, -0.0985],
         [-0.3158,  1.2271,  2.6061,  ...,  0.1849,  0.1543, -0.1002]],

        [[-0.2129,  1.1702,  2.6507,  ...,  0.2212,  0.1652, -0.0551],
         [-0.2535,  1.1923,  2.6548,  ...,  0.1870,  0.1670, -0.0985],
         [-0.3158,  1.2271,  2.6061,  ...,  0.1849,  0.1543, -0.1002]],

        [[-0.2129,  1.1702,  2.6507,  ...,  0.2212,  0.1652, -0.0551],
         [-0.2535,  1.1923,  2.6548,  ...,  0.1870,  0.1670, -0.0985],
         [-0.3158,  1.2271,  2.6061,  ...,  0.1849,  0.1543, -0.1002]],

        ...,

        [[-0.2129,  1.1702,  2.6507,  ...,  0.2212,  0.1652, -0.0551],
         [-0.2535,  1.1923,  2.6548,  ...,  0.187

In [168]:
prev_word_inds

tensor([3, 1, 2, 6, 4, 7, 8, 9, 5, 0])

In [132]:
k = 1
k_prev_words = torch.LongTensor([[2, 75]] * k)
seqs = k_prev_words  # (k, 1)
print(seqs)
# Tensor to store top k sequences' scores; now they're just 0
top_k_scores = torch.zeros(k, 1)  # (k, 1)
# Lists to store completed sequences and scores
complete_seqs = list()
complete_seqs_scores = list()
# Start decoding

x = model.embed(seqs)
x = model.pos_encoder(x)
output = model.LM_decoder(x)
print(output)
scores = model.classifier(output)
scores = F.log_softmax(scores, dim=1)

tensor([[ 2, 75]])
tensor([[[-0.2359,  1.1805,  1.9810,  ...,  0.0401, -0.1347,  0.2386],
         [-0.2661,  1.1961,  1.9554,  ...,  0.0164, -0.1628,  0.2259]]],
       grad_fn=<NativeLayerNormBackward0>)


In [170]:
import torch
from torcheval.metrics.text import Perplexity

In [176]:
metric=Perplexity()
input = torch.tensor([[[0.3659, 0.7025, 0.3104]], [[0.0097, 0.6577, 0.1947]],[[0.5659, 0.0025, 0.0104]], [[0.9097, 0.0577, 0.7947]]])
target = torch.tensor([[2],  [1], [2],  [1]])
metric.update(input, target)
metric.compute()

tensor(3.5257, dtype=torch.float64)

In [204]:
input = torch.tensor([[[0.3659, 0.1025, 0.1104]], [[0.8097, 0.6577, 0.1947]],[[0.5659, 0.0025, 0.0104]], [[0.9097, 0.0577, 0.7947]]])
target = torch.tensor([[2],  [1], [2],  [1]])
metric.update(input, target)
#pp = metric.compute()

<torcheval.metrics.text.perplexity.Perplexity at 0x7f291103e160>

In [205]:
metric.compute()

tensor(3.6732, dtype=torch.float64)