from typing import Dict, List

from overrides import overrides
import torch

from allennlp.common.util import pad_sequence_to_length
from allennlp.data.tokenizers import Token
from allennlp.data.token_indexers.token_indexer import TokenIndexer, IndexedTokenList
from allennlp.data.vocabulary import Vocabulary


def _make_bos_eos(
    character: int,
    padding_character: int,
    beginning_of_word_character: int,
    end_of_word_character: int,
    max_word_length: int,
):
    char_ids = [padding_character] * max_word_length
    char_ids[0] = beginning_of_word_character
    char_ids[1] = character
    char_ids[2] = end_of_word_character
    return char_ids
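
# For illustration: with the constants defined on ELMoCharacterMapper below,
# _make_bos_eos(256, 260, 258, 259, 50) returns the length-50 list
# [258, 256, 259, 260, 260, ...]: begin-of-word, the sentence-boundary character,
# end-of-word, then padding for the remaining slots.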


class ELMoCharacterMapper:
    """
    Maps individual tokens to sequences of character ids, compatible with ELMo.
    To be consistent with previously trained models, we include it here as a special case of
    the existing character indexers.

    Optional additional special tokens with designated character ids can be added via
    `tokens_to_add`.
    """

    max_word_length = 50

    # char ids 0-255 come from utf-8 encoding bytes
    # assign 256-300 to special chars
    beginning_of_sentence_character = 256  # <begin sentence>
    end_of_sentence_character = 257  # <end sentence>
    beginning_of_word_character = 258  # <begin word>
    end_of_word_character = 259  # <end word>
    padding_character = 260  # <padding>

    beginning_of_sentence_characters = _make_bos_eos(
        beginning_of_sentence_character,
        padding_character,
        beginning_of_word_character,
        end_of_word_character,
        max_word_length,
    )
    end_of_sentence_characters = _make_bos_eos(
        end_of_sentence_character,
        padding_character,
        beginning_of_word_character,
        end_of_word_character,
        max_word_length,
    )
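    # These sequences hold un-shifted ids; convert_word_to_char_ids adds one to every id
    # before returning, so that id 0 stays reserved for masking.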
bos_token = "<S>"
eos_token = "</S>"

    def __init__(self, tokens_to_add: Dict[str, int] = None) -> None:
        self.tokens_to_add = tokens_to_add or {}

    def convert_word_to_char_ids(self, word: str) -> List[int]:
        if word in self.tokens_to_add:
            char_ids = [ELMoCharacterMapper.padding_character] * ELMoCharacterMapper.max_word_length
            char_ids[0] = ELMoCharacterMapper.beginning_of_word_character
            char_ids[1] = self.tokens_to_add[word]
            char_ids[2] = ELMoCharacterMapper.end_of_word_character
        elif word == ELMoCharacterMapper.bos_token:
            char_ids = ELMoCharacterMapper.beginning_of_sentence_characters
        elif word == ELMoCharacterMapper.eos_token:
            char_ids = ELMoCharacterMapper.end_of_sentence_characters
        else:
            word_encoded = word.encode("utf-8", "ignore")[
                : (ELMoCharacterMapper.max_word_length - 2)
            ]
            char_ids = [ELMoCharacterMapper.padding_character] * ELMoCharacterMapper.max_word_length
            char_ids[0] = ELMoCharacterMapper.beginning_of_word_character
            for k, chr_id in enumerate(word_encoded, start=1):
                char_ids[k] = chr_id
            char_ids[len(word_encoded) + 1] = ELMoCharacterMapper.end_of_word_character

        # add one to each id so that 0 can be used for masking
        return [c + 1 for c in char_ids]

    def __eq__(self, other) -> bool:
        if isinstance(self, other.__class__):
            return self.__dict__ == other.__dict__
        return NotImplemented
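
# Worked example, for illustration: ELMoCharacterMapper().convert_word_to_char_ids("cat")
# yields a length-50 list starting with [259, 100, 98, 117, 260, 261, 261, ...]:
# the begin-of-word marker, the UTF-8 bytes for "c", "a", "t", the end-of-word marker,
# and padding, each shifted up by one.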
@TokenIndexer.register("elmo_characters")
class ELMoTokenCharactersIndexer(TokenIndexer):
"""
Convert a token to an array of character ids to compute ELMo representations.
Registered as a `TokenIndexer` with name "elmo_characters".
# Parameters
namespace : `str`, optional (default=`elmo_characters`)
tokens_to_add : `Dict[str, int]`, optional (default=`None`)
If not None, then provides a mapping of special tokens to character
ids. When using pre-trained models, then the character id must be
less then 261, and we recommend using un-used ids (e.g. 1-32).
token_min_padding_length : `int`, optional (default=`0`)
See :class:`TokenIndexer`.
"""

    def __init__(
        self,
        namespace: str = "elmo_characters",
        tokens_to_add: Dict[str, int] = None,
        token_min_padding_length: int = 0,
    ) -> None:
        super().__init__(token_min_padding_length)
        self._namespace = namespace
        self._mapper = ELMoCharacterMapper(tokens_to_add)

    @overrides
    def count_vocab_items(self, token: Token, counter: Dict[str, Dict[str, int]]):
        pass

    @overrides
    def get_empty_token_list(self) -> IndexedTokenList:
        return {"elmo_tokens": []}

    @overrides
    def tokens_to_indices(
        self, tokens: List[Token], vocabulary: Vocabulary
    ) -> Dict[str, List[List[int]]]:
        # TODO(brendanr): Retain the token to index mappings in the vocabulary and remove this
        # https://github.com/allenai/allennlp/blob/master/allennlp/data/token_indexers/wordpiece_indexer.py#L113
        return {
            "elmo_tokens": [self._mapper.convert_word_to_char_ids(t.ensure_text()) for t in tokens]
        }

    @overrides
    def as_padded_tensor_dict(
        self, tokens: IndexedTokenList, padding_lengths: Dict[str, int]
    ) -> Dict[str, torch.Tensor]:
        # Overriding this method only because we need a different padding token than the default.
        tensor_dict = {}

        def padding_token():
            return [0] * ELMoCharacterMapper.max_word_length

        tensor_dict["elmo_tokens"] = torch.LongTensor(
            pad_sequence_to_length(
                tokens["elmo_tokens"], padding_lengths["elmo_tokens"], default_value=padding_token
            )
        )
        return tensor_dict
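

if __name__ == "__main__":
    # Minimal usage sketch, for illustration only (not part of the library API): it assumes
    # AllenNLP and its dependencies are installed and simply exercises the indexer end to end.
    indexer = ELMoTokenCharactersIndexer()
    tokens = [Token("I"), Token("like"), Token("ELMo")]

    # Each token is mapped to a list of 50 character ids; no vocabulary lookups are needed.
    indexed = indexer.tokens_to_indices(tokens, Vocabulary())
    print(len(indexed["elmo_tokens"]), len(indexed["elmo_tokens"][0]))  # 3 50

    # Pad the token dimension to length 5 and convert to a (5, 50) LongTensor.
    padded = indexer.as_padded_tensor_dict(indexed, padding_lengths={"elmo_tokens": 5})
    print(padded["elmo_tokens"].shape)  # torch.Size([5, 50])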