-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1 from JT-Ushio/master
release(antu): ready to release antu v0.0.1 [Tao Ji].
- Loading branch information
Showing
47 changed files
with
2,158 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,24 @@ | ||
# pyAnt | ||
Universal data IO module in NLP tasks (for AntNLP Group) | ||
# AntU | ||
Universal data IO and neural network modules in NLP tasks. | ||
|
||
+ **data IO** is a universal module for Natural Language Processing systems and is not based on any framework (such as TensorFlow, PyTorch, MXNet, DyNet...). | ||
+ **neural network** module contains the neural network structures commonly used in NLP tasks. We want to design commonly used structures for each neural network framework. We will continue to develop this module. | ||
|
||
|
||
|
||
# Requirements | ||
|
||
+ Python>=3.6 | ||
+ bidict | ||
+ overrides | ||
|
||
##### If you need the DyNet neural network module: | ||
|
||
+ dynet>=2.0 | ||
|
||
|
||
|
||
# Resources | ||
|
||
+ [Documentation](https://wait) | ||
+ [Source Code](https://github.com/AntNLP/antu) |
Empty file.
Empty file.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
from typing import Dict, List | ||
from abc import ABCMeta, abstractmethod | ||
from antu.io.instance import Instance | ||
|
||
|
||
class DatasetReader(metaclass=ABCMeta):
    """
    Abstract interface for dataset readers.

    A concrete reader has to implement both ``read`` and
    ``input_to_instance``; the class cannot be instantiated otherwise.
    """

    @abstractmethod
    def read(self, file_path: str) -> List[Instance]:
        """
        Abstract hook; the base implementation does nothing.

        Parameters
        ----------
        file_path : ``str``
            Presumably the location of the data to read (TODO confirm against
            concrete readers).

        Returns
        -------
        instances : ``List[Instance]``
            Declared return type for implementations. The base method itself
            returns ``None``.
        """

    @abstractmethod
    def input_to_instance(self, inputs: str) -> Instance:
        """
        Abstract hook; the base implementation does nothing.

        Parameters
        ----------
        inputs : ``str``
            Presumably one raw input to be converted (TODO confirm against
            concrete readers).

        Returns
        -------
        instance : ``Instance``
            Declared return type for implementations. The base method itself
            returns ``None``.
        """
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
from typing import Dict, List | ||
from abc import ABCMeta, abstractmethod | ||
from antu.io.vocabulary import Vocabulary | ||
from antu.io.instance import Instance | ||
|
||
|
||
class Dataset(metaclass=ABCMeta):
    """
    Abstract base class holding a vocabulary set and named instance lists.

    Subclasses must implement ``read`` and ``input_to_instance``.
    """

    # NOTE(review): both attributes are class-level mutable dicts, so every
    # subclass or instance that does not rebind them shares the same objects.
    # ``vocabulary_set`` is annotated as ``Vocabulary`` but starts out as a
    # plain ``dict`` -- confirm which one is intended.
    vocabulary_set: Vocabulary = {}
    datasets: Dict[str, List[Instance]] = {}

    @abstractmethod
    def read(self):
        """
        Abstract hook; the base implementation does nothing.

        It takes ``self`` so that a bound call (for example ``super().read()``
        from a subclass) does not fail with a ``TypeError``.
        """

    @abstractmethod
    def input_to_instance(self):
        """
        Abstract hook; the base implementation does nothing.

        It takes ``self`` so that a bound call from a subclass does not fail
        with a ``TypeError``.
        """
|
||
|
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,39 @@ | ||
from typing import List, Dict | ||
from abc import ABCMeta, abstractmethod | ||
|
||
from antu.io.vocabulary import Vocabulary | ||
|
||
|
||
class Field(metaclass=ABCMeta):
    """
    Abstract building block of a data instance.

    The interface consists of two steps: contributing the field's items to
    vocabulary counters (``count_vocab_items``) and converting the field's
    content to indices with a ``Vocabulary`` (``index``).
    """

    @abstractmethod
    def count_vocab_items(self, counter: Dict[str, Dict[str, int]]) -> None:
        """
        Add this field's items to ``counter``.

        Implementations with nothing to count may simply do nothing.

        Parameters
        ----------
        counter : ``Dict[str, Dict[str, int]]``
            Nested counts. The outer key is the vocabulary namespace and the
            inner key is the item string.
        """

    @abstractmethod
    def index(self, vocab: Vocabulary) -> None:
        """
        Map the elements of this field to indices.

        Parameters
        ----------
        vocab : ``Vocabulary``
            Vocabulary used to look up the index of each item.
        """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,59 @@ | ||
from typing import List, Dict, Iterator | ||
from overrides import overrides | ||
from antu.io.token_indexers.token_indexer import TokenIndexer | ||
from antu.io.vocabulary import Vocabulary | ||
from antu.io.fields.field import Field | ||
|
||
|
||
class IndexField(Field):
    """
    A field that keeps its content untouched: it neither contributes to
    vocabulary counters nor converts anything to indices. It is meant for
    storing data IDs.

    Parameters
    ----------
    name : ``str``
        Field name. It should be unique among the fields of an instance.
    tokens : ``List[str]``
        Field content.
    indexers : ``List[TokenIndexer]``, optional (default=``None``)
        Stored on the field but not used by any method of this class.
    """

    def __init__(
            self,
            name: str,
            tokens: List[str],
            indexers: List[TokenIndexer] = None):
        self.name = name
        self.tokens = tokens
        self.indexers = indexers

    def __iter__(self) -> Iterator[str]:
        """Iterate over the stored tokens."""
        return iter(self.tokens)

    def __getitem__(self, idx: int) -> str:
        """Return the token at position ``idx``."""
        return self.tokens[idx]

    def __len__(self) -> int:
        """Return the number of stored tokens."""
        return len(self.tokens)

    @overrides
    def count_vocab_items(
            self,
            counters: Dict[str, Dict[str, int]]) -> None:
        """
        No-op: an ``IndexField`` contributes nothing to ``counters``.
        """

    @overrides
    def index(
            self,
            vocab: Vocabulary) -> None:
        """
        No-op: an ``IndexField`` is not indexed.

        NOTE(review): unlike ``TextField.index``, this does not create an
        ``indexes`` attribute, although ``Instance.index_fields`` reads
        ``field.indexes`` for every field -- confirm this is intended.
        """
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
from typing import List, Iterator, Dict | ||
from overrides import overrides | ||
from antu.io.token_indexers.token_indexer import TokenIndexer | ||
from antu.io.vocabulary import Vocabulary | ||
from antu.io.fields.field import Field | ||
|
||
|
||
class SequenceLabelField(Field):
    """
    A field holding a sequence of label strings together with the indexers
    that count and index them.

    Parameters
    ----------
    name : ``str``
        Field name.
    tokens : ``List[str]``
        Field content.
    indexers : ``List[TokenIndexer]``
        Indexers applied to ``tokens`` when counting and indexing.
    """

    def __init__(
            self,
            name: str,
            tokens: List[str],
            indexers: List[TokenIndexer]):
        self.name = name
        self.tokens = tokens
        self.indexers = indexers

    def __iter__(self) -> Iterator[str]:
        """Iterate over the stored tokens."""
        return iter(self.tokens)

    def __getitem__(self, idx: int) -> str:
        """Return the token at position ``idx``."""
        return self.tokens[idx]

    def __len__(self) -> int:
        """Return the number of stored tokens."""
        return len(self.tokens)

    @overrides
    def count_vocab_items(
            self,
            counters: Dict[str, Dict[str, int]]) -> None:
        """
        Pass every token to every indexer so it can update ``counters``.

        Parameters
        ----------
        counters : ``Dict[str, Dict[str, int]]``
            Counters handed to each indexer's ``count_vocab_items``.
        """
        for indexer in self.indexers:
            for label in self.tokens:
                indexer.count_vocab_items(label, counters)

    @overrides
    def index(
            self,
            vocab: Vocabulary) -> None:
        """
        Rebuild ``self.indexes`` from scratch by merging the result of each
        indexer's ``tokens_to_indices``.

        Parameters
        ----------
        vocab : ``Vocabulary``
            Vocabulary handed to each indexer.
        """
        self.indexes = {}
        for indexer in self.indexers:
            mapping = indexer.tokens_to_indices(self.tokens, vocab)
            self.indexes.update(mapping)
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
from typing import List, Iterator, Dict | ||
from overrides import overrides | ||
from antu.io.token_indexers.token_indexer import TokenIndexer | ||
from antu.io.vocabulary import Vocabulary | ||
from antu.io.fields.field import Field | ||
|
||
|
||
class TextField(Field):
    """
    A ``TextField`` is a data field that is commonly used in NLP tasks, and we
    can use it to store text sequences such as sentences, paragraphs, POS tags,
    and so on.

    Parameters
    ----------
    name : ``str``
        Field name. This is necessary and must be unique (not the same as other
        field names).
    tokens : ``List[str]``
        Field content that contains a list of string.
    indexers : ``List[TokenIndexer]``, optional (default=``None``)
        Indexer list that defines the vocabularies associated with the field.
        When omitted, the field gets its own new empty list.
    """

    def __init__(
            self,
            name: str,
            tokens: List[str],
            indexers: List[TokenIndexer] = None):
        self.name = name
        self.tokens = tokens
        # A ``list()`` default would be created once and shared by every field
        # built without indexers, so use a ``None`` sentinel and create a new
        # list per instance instead.
        self.indexers = [] if indexers is None else indexers

    def __iter__(self) -> Iterator[str]:
        """Iterate over the stored tokens."""
        return iter(self.tokens)

    def __getitem__(self, idx: int) -> str:
        """Return the token at position ``idx``."""
        return self.tokens[idx]

    def __len__(self) -> int:
        """Return the number of stored tokens."""
        return len(self.tokens)

    @overrides
    def count_vocab_items(
            self,
            counters: Dict[str, Dict[str, int]]) -> None:
        """
        Pass every token to every indexer so it can update ``counters``.
        Nothing happens when the field has no indexers.

        Parameters
        ----------
        counters : ``Dict[str, Dict[str, int]]``
            Element statistics for datasets. Each indexer of this field
            receives every token together with these counters.
        """
        for idxer in self.indexers:
            for token in self.tokens:
                idxer.count_vocab_items(token, counters)

    @overrides
    def index(
            self,
            vocab: Vocabulary) -> None:
        """
        Gets one or more index mappings for each element in the Field and
        stores them in ``self.indexes``.

        Parameters
        ----------
        vocab : ``Vocabulary``
            ``vocab`` is used to get the index of each item.
        """
        # Start from an empty dict so repeated calls do not keep stale entries.
        self.indexes = {}
        for idxer in self.indexers:
            self.indexes.update(idxer.tokens_to_indices(self.tokens, vocab))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
import pytest | ||
from antu.io.fields.text_field import TextField | ||
|
||
|
||
class TestTextField:
    """Checks positional access on a ``TextField`` built without indexers."""

    def test_textfield(self):
        words = ['This', 'is', 'a', 'test', 'sentence', '.']
        field = TextField('sentence', words)
        # First and last token must be reachable by positive/negative index.
        assert field[0] == 'This'
        assert field[-1] == '.'
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
from typing import Dict, MutableMapping, Mapping, TypeVar, List | ||
|
||
from antu.io.vocabulary import Vocabulary | ||
from antu.io.fields.field import Field | ||
|
||
# Index payload of a field: a flat list of ids or a list of id lists.
Indices = TypeVar("Indices", List[int], List[List[int]])


class Instance(Mapping[str, Field]):
    """
    An ``Instance`` is a collection (list) of multiple data fields.

    Fields can be retrieved by position (``instance[0]``) or by field name
    (``instance['sentence']``). Iteration yields the ``Field`` objects
    themselves, in insertion order.

    Parameters
    ----------
    fields : ``List[Field]``, optional (default=``None``)
        A list of multiple data fields. When omitted, the instance starts
        with its own empty list.
    """

    def __init__(self, fields: List[Field] = None) -> None:
        # ``None`` becomes a new empty list so that ``add_field``, ``len()``
        # and iteration work on an instance created without fields.
        self.fields = [] if fields is None else fields
        self.indexed = False  # Indicates whether the instance has been indexed
        self._vocab = None  # Vocabulary remembered by ``index_fields``

    def __getitem__(self, key: str) -> Field:
        """
        Return a field by name (``str`` key) or by position (any other key).

        Raises
        ------
        KeyError
            If ``key`` is a string and no field has that name.
        """
        if isinstance(key, str):
            for field in self.fields:
                if field.name == key:
                    return field
            raise KeyError(key)
        # Positional access keeps working exactly as list indexing does.
        return self.fields[key]

    def __iter__(self):
        """Iterate over the stored ``Field`` objects."""
        return iter(self.fields)

    def __len__(self) -> int:
        """Return the number of fields."""
        return len(self.fields)

    def add_field(self, field: Field) -> None:
        """
        Add the field to the existing ``Instance``.

        If the instance has already been indexed, the new field is indexed
        immediately with the vocabulary that ``index_fields`` was called with.

        Parameters
        ----------
        field : ``Field``
            Which field needs to be added.
        """
        self.fields.append(field)
        if self.indexed:
            field.index(self._vocab)

    def count_vocab_items(self, counter: Dict[str, Dict[str, int]]) -> None:
        """
        Increments counts in the given ``counter`` for all of the vocabulary
        items in all of the ``Fields`` in this ``Instance``.

        Parameters
        ----------
        counter : ``Dict[str, Dict[str, int]]``
            We count the number of strings if the string needs to be counted to
            some counters.
        """
        for field in self.fields:
            field.count_vocab_items(counter)

    def index_fields(self, vocab: Vocabulary) -> Dict[str, Dict[str, Indices]]:
        """
        Indexes all fields in this ``Instance`` using the provided ``Vocabulary``.
        This `mutates` the current object, it does not return a new ``Instance``.
        The ``indexed`` flag makes sure that indexing only happens once, so if
        you modify your vocabulary after you've indexed your instances, you
        might get unexpected behavior.

        Parameters
        ----------
        vocab : ``Vocabulary``
            ``vocab`` is used to get the index of each item.

        Returns
        -------
        res : ``Dict[str, Dict[str, Indices]]``
            Returns the Indices corresponding to the instance. The first key is
            field name and the second key is the vocabulary name.
        """
        if not self.indexed:
            for field in self.fields:
                field.index(vocab)
            # Remember the vocabulary for fields added later, and only mark
            # the instance as indexed once every field was indexed without
            # raising.
            self._vocab = vocab
            self.indexed = True
        return {field.name: field.indexes for field in self.fields}
|
Oops, something went wrong.