### HuggingFace Interface

In [30]:
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

from allennlp.modules.elmo import Elmo, batch_to_ids
from sacremoses import MosesTokenizer
from transformers import (
    PretrainedConfig,
    PreTrainedModel,
    PreTrainedTokenizer,
)

class ElmoConfig(PretrainedConfig):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = 1024
        self.num_hidden_layers = 1
        self.is_encoder_decoder = False

class ElmoModel(PreTrainedModel):
    def __init__(self, options_file, weights_file):
        super().__init__(config=ElmoConfig())
        self.elmo_model = Elmo(options_file, weights_file, 1, dropout=0)

    def forward(
        self,
        input_ids,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        labels=None,
    ):
        return self.elmo_model(input_ids)["elmo_representations"][0]

    @staticmethod
    def from_pretrained(path):
        options_file = (
            "/home/kyle/elk/elk/rnn/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json"
        )
        weights_file = (
            "/home/kyle/elk/elk/rnn/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5"
        )
        return ElmoModel(options_file, weights_file)

class ElmoTokenizer(PreTrainedTokenizer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.tokenizer = MosesTokenizer()
        self.wnl = WordNetLemmatizer()

    def __call__(self, text, return_tensors, truncation):
        sequences = text if isinstance(text, list) else [text]
        tokens = [
            [self.wnl.lemmatize(token) for token in self.tokenizer.tokenize(sequence, escape=False)] for sequence in sequences
        ]
        character_ids = batch_to_ids(text)  # type: ignore
        return character_ids

    @staticmethod
    def from_pretrained(path):
        return ElmoTokenizer()


[nltk_data] Downloading package wordnet to /home/kyle/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
sequences = ["Is this an ELMo representation? Yes", "Is this an ELMo representation? No", "Is this an ELMo representation? Maybe"]
elmo_tokenizer = ElmoTokenizer()
tokenized_sequences = elmo_tokenizer(sequences, return_tensors="pt", truncation=True)
tokenized_sequences.shape

torch.Size([3, 37, 50])

In [None]:

elmo_model = ElmoModel.from_pretrained("elmo")
embeddings = elmo_model(tokenized_sequences)
print(embeddings.shape)
embeddings[0]

torch.Size([3, 7, 1024])


tensor([[-0.8412, -0.1165,  0.5267,  ...,  0.4512, -0.1392,  0.2720],
        [-0.6446, -0.0498,  0.3769,  ...,  0.4740, -0.1965,  0.0583],
        [-0.2392,  1.1553,  0.1504,  ..., -0.0441, -0.4352,  0.2593],
        ...,
        [-0.3401, -0.1522,  0.0504,  ...,  0.1691, -0.0028, -0.2495],
        [-0.1372,  0.0519,  0.0663,  ...,  1.1651,  0.6015, -0.1530],
        [-0.2489, -0.0674,  0.5087,  ...,  0.5525, -0.2275,  0.5592]],
       grad_fn=<SelectBackward0>)

In [35]:
embeddings[0].mean(dim=1)

tensor([ 0.0004, -0.0064,  0.0055, -0.0080, -0.0164, -0.0010,  0.0106],
       grad_fn=<MeanBackward1>)

In [None]:
embeddings[1][-2]

tensor([-0.1372,  0.0519,  0.0663,  ...,  1.1888,  0.6638, -0.0508],
       grad_fn=<SelectBackward0>)

## Isolate LSTM

In [None]:
bilm_lstm = elmo_model.elmo_model._elmo_lstm
lstm_token_embedder = bilm_lstm._token_embedder
internal_embeddings = lstm_token_embedder(tokenized_sequences)
print(internal_embeddings["token_embedding"].shape)
internal_embeddings

torch.Size([3, 9, 512])


{'mask': tensor([[True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True],
         [True, True, True, True, True, True, True, True, True]]),
 'token_embedding': tensor([[[-8.5709e+00, -9.9289e+00,  4.0575e+00,  ...,  9.1735e+00,
            8.3215e+00, -6.9233e+00],
          [-1.3800e-02, -2.2660e-01, -3.4851e-02,  ...,  1.9179e-01,
            4.6514e-02, -2.2795e-02],
          [-6.0722e-02,  5.1529e-02,  1.8452e-02,  ...,  2.3105e-01,
            3.6155e-03,  4.1534e-02],
          ...,
          [ 6.4416e-01, -7.7645e-02,  7.0715e-02,  ...,  6.5416e-01,
            5.1353e-01, -2.2917e-02],
          [-3.8933e-01,  8.6538e-01,  1.1729e-01,  ...,  4.5468e-01,
           -9.4481e-02,  2.5647e-01],
          [-1.2410e-01, -3.2147e+00, -6.1114e+00,  ...,  9.8041e+00,
            1.9380e+00, -8.1160e+00]],
 
         [[-8.5709e+00, -9.9289e+00,  4.0575e+00,  ...,  9.1735e+00,
            8.3215e+00, -6.9233e+00],
         

In [None]:
activations = bilm_lstm(tokenized_sequences)["activations"]
activations[0].shape

torch.Size([3, 9, 1024])

In [None]:
internal_bilm = bilm_lstm._elmo_lstm
bilm_hidden_states = internal_bilm(inputs=internal_embeddings["token_embedding"], mask=internal_embeddings["mask"])
print(bilm_hidden_states.shape)
print(internal_bilm.hidden_size)

# Calling the internal BiLM directly gives the hidden states for each layer. There are two layers.
# The output tensor is of shape (2, batch_size, sequence_length, 1012 (512 * 2?))
bilm_hidden_states

torch.Size([2, 3, 9, 1024])
512


tensor([[[[ 2.2756e-02,  2.9971e-02,  2.5870e-02,  ...,  1.2941e-01,
            1.2591e-01,  2.0738e-02],
          [-6.6910e-01, -6.9574e-03,  5.6632e-01,  ...,  3.4151e-01,
           -3.7841e-01,  2.6081e-01],
          [-8.1336e-01, -2.1257e-03,  1.5202e-01,  ...,  3.8662e-01,
           -6.2787e-01,  1.0230e-02],
          ...,
          [-4.7419e-01,  2.3019e-01, -5.5697e-02,  ...,  1.1738e-01,
           -2.6359e-01,  9.8556e-03],
          [ 9.7131e-02, -3.3274e-02,  7.3773e-01,  ...,  2.8967e-01,
            1.6298e-01,  3.8194e-01],
          [ 2.4646e-01,  6.4309e-01,  1.6108e-01,  ...,  1.9356e-01,
           -6.8029e-03,  1.8191e-03]],

         [[ 2.2720e-02,  3.0274e-02,  2.5760e-02,  ...,  1.3126e-01,
            1.2724e-01,  2.0480e-02],
          [-6.6751e-01, -6.3711e-03,  5.6428e-01,  ...,  3.5123e-01,
           -3.4959e-01,  2.6068e-01],
          [-8.1317e-01, -2.1860e-03,  1.5058e-01,  ...,  4.0542e-01,
           -5.8162e-01, -4.1161e-03],
          ...,
     

In [None]:
bilm_hidden_states[0][0][0]

tensor([0.0228, 0.0300, 0.0259,  ..., 0.1294, 0.1259, 0.0207])

In [None]:
# Output activations from the internal BiLM
print(activations[0].shape)
bilm_lstm(tokenized_sequences)["activations"]

torch.Size([3, 9, 1024])


[tensor([[[-8.5709e+00, -9.9289e+00,  4.0575e+00,  ...,  9.1735e+00,
            8.3215e+00, -6.9233e+00],
          [-1.3800e-02, -2.2660e-01, -3.4851e-02,  ...,  1.9179e-01,
            4.6514e-02, -2.2795e-02],
          [-6.0722e-02,  5.1529e-02,  1.8452e-02,  ...,  2.3105e-01,
            3.6155e-03,  4.1534e-02],
          ...,
          [ 6.4416e-01, -7.7645e-02,  7.0715e-02,  ...,  6.5416e-01,
            5.1353e-01, -2.2917e-02],
          [-3.8933e-01,  8.6538e-01,  1.1729e-01,  ...,  4.5468e-01,
           -9.4481e-02,  2.5647e-01],
          [-1.2410e-01, -3.2147e+00, -6.1114e+00,  ...,  9.8041e+00,
            1.9380e+00, -8.1160e+00]],
 
         [[-8.5709e+00, -9.9289e+00,  4.0575e+00,  ...,  9.1735e+00,
            8.3215e+00, -6.9233e+00],
          [-1.3800e-02, -2.2660e-01, -3.4851e-02,  ...,  1.9179e-01,
            4.6514e-02, -2.2795e-02],
          [-6.0722e-02,  5.1529e-02,  1.8452e-02,  ...,  2.3105e-01,
            3.6155e-03,  4.1534e-02],
          ...,
    

## biLM HuggingFace Port

In [None]:
class ElmoBiLMTokenizer(PreTrainedTokenizer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.elmo_tokenizer = ElmoTokenizer()
        self.token_embedder = ElmoModel.from_pretrained("elmo").elmo_model._elmo_lstm._token_embedder

    def __call__(self, text, return_tensors, truncation):
        tokenized_text = self.elmo_tokenizer(text, return_tensors, truncation)
        embeddings = self.token_embedder(tokenized_text)
        return embeddings
        
    @staticmethod
    def from_pretrained(path):
        return ElmoBiLMTokenizer()


class ElmoBiLMConfig(PretrainedConfig):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.hidden_size = 1024
        self.num_hidden_layers = 2
        self.is_encoder_decoder = False

    
class ElmoBiLM(PreTrainedModel):
    def __init__(self):
        super().__init__(config=ElmoBiLMConfig())
        self.elmo_lstm = ElmoModel.from_pretrained("elmo").elmo_model._elmo_lstm._elmo_lstm
        
    def forward(
        self,
        input_ids,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        labels=None,
    ):
        inputs = input_ids
        return self.elmo_lstm(inputs=input_ids, mask=attention_mask)

bilm_tokenizer = ElmoBiLMTokenizer()
tokenizer_batch = bilm_tokenizer(sequences, return_tensors="pt", truncation=True)
bilm_model = ElmoBiLM()
hidden_states = bilm_model(tokenizer_batch["token_embedding"], attention_mask=tokenizer_batch["mask"])
hidden_states

tensor([[[[ 2.7004e-02, -5.8437e-03,  3.8740e-02,  ...,  1.1279e-01,
            8.2891e-02,  3.0009e-02],
          [-9.0796e-01, -1.6039e-01,  8.3736e-01,  ...,  4.3962e-01,
           -1.0700e-01,  2.9674e-01],
          [-6.9313e-01,  1.0666e-02,  3.2420e-01,  ...,  3.0632e-01,
           -3.9687e-01, -1.9246e-02],
          ...,
          [-4.8125e-01,  1.9718e-01,  3.3368e-03,  ...,  2.3145e-01,
           -3.2376e-02, -2.3980e-01],
          [ 1.3262e-01, -1.0354e-01,  7.3106e-01,  ...,  6.5551e-01,
           -3.6035e-01,  1.1051e+00],
          [ 2.4790e-01,  5.9119e-01,  1.9945e-01,  ...,  1.6856e-01,
           -3.2241e-02,  4.7876e-02]],

         [[ 2.7004e-02, -5.8437e-03,  3.8740e-02,  ...,  1.1832e-01,
            9.5615e-02,  2.6538e-02],
          [-9.0796e-01, -1.6039e-01,  8.3736e-01,  ...,  4.0804e-01,
            7.6256e-03,  3.4817e-01],
          [-6.9313e-01,  1.0666e-02,  3.2420e-01,  ...,  2.6200e-01,
           -2.6565e-01,  9.1764e-02],
          ...,
     

In [None]:
hidden_states.shape

torch.Size([2, 3, 9, 1024])