Merge pull request #1 from JT-Ushio/master
release(antu): ready to release antu v0.0.1 [Tao Ji].
JT-Ushio committed Dec 28, 2018
2 parents 01a4e3e + 3ee6e55 commit 1819cfd
Showing 47 changed files with 2,158 additions and 2 deletions.
26 changes: 24 additions & 2 deletions README.md
@@ -1,2 +1,24 @@
# pyAnt
Universal data IO module in NLP tasks (for AntNLP Group)
# AntU
Universal data IO and neural network modules in NLP tasks.

+ **data IO** is a universal module for Natural Language Processing systems and is not based on any framework (such as TensorFlow, PyTorch, MXNet, or DyNet).
+ The **neural network** module contains neural network structures commonly used in NLP tasks. We aim to provide commonly used structures for each neural network framework, and we will continue to develop this module.



# Requirements

+ Python>=3.6
+ bidict
+ overrides

##### If you need the DyNet neural network module:

+ dynet>=2.0



# Resources

+ [Documentation](https://wait)
+ [Source Code](https://github.com/AntNLP/antu)
Empty file added antu/__init__.py
Empty file.
Empty file added antu/io/__init__.py
Empty file.
Empty file added antu/io/dataset_readers/__init__.py
Empty file.
14 changes: 14 additions & 0 deletions antu/io/dataset_readers/dataset_reader.py
@@ -0,0 +1,14 @@
from typing import Dict, List
from abc import ABCMeta, abstractmethod
from antu.io.instance import Instance


class DatasetReader(metaclass=ABCMeta):
    """
    A ``DatasetReader`` reads a data file and converts it to a collection of
    ``Instance`` s. Subclasses implement ``read`` for a whole file and
    ``input_to_instance`` for a single raw input.
    """

    @abstractmethod
    def read(self, file_path: str) -> List[Instance]:
        pass

    @abstractmethod
    def input_to_instance(self, inputs: str) -> Instance:
        pass
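
A minimal sketch of how a concrete reader could implement this interface. The ``WhitespaceReader`` class and its whitespace tokenization below are hypothetical illustrations, not part of this commit:

from typing import List
from antu.io.instance import Instance
from antu.io.fields.text_field import TextField
from antu.io.dataset_readers.dataset_reader import DatasetReader


class WhitespaceReader(DatasetReader):
    """Hypothetical reader: each non-empty line becomes one whitespace-tokenized Instance."""

    def read(self, file_path: str) -> List[Instance]:
        instances = []
        with open(file_path, 'r') as fp:
            for line in fp:
                line = line.strip()
                if line:
                    instances.append(self.input_to_instance(line))
        return instances

    def input_to_instance(self, inputs: str) -> Instance:
        # Whitespace tokenization is only a placeholder for a real tokenizer.
        tokens = inputs.split()
        return Instance([TextField('sentence', tokens)])
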
Empty file added antu/io/datasets/__init__.py
Empty file.
20 changes: 20 additions & 0 deletions antu/io/datasets/dataset.py
@@ -0,0 +1,20 @@
from typing import Dict, List
from abc import ABCMeta, abstractmethod
from antu.io.vocabulary import Vocabulary
from antu.io.instance import Instance


class Dataset(metaclass=ABCMeta):
    """
    A ``Dataset`` holds one or more named collections of ``Instance`` s
    (e.g. train/dev/test splits) together with the associated ``Vocabulary``.
    """

    vocabulary_set: Vocabulary = {}
    datasets: Dict[str, List[Instance]] = {}

    @abstractmethod
    def read(self):
        pass

    @abstractmethod
    def input_to_instance(self):
        pass


Empty file added antu/io/fields/__init__.py
Empty file.
39 changes: 39 additions & 0 deletions antu/io/fields/field.py
@@ -0,0 +1,39 @@
from typing import List, Dict
from abc import ABCMeta, abstractmethod

from antu.io.vocabulary import Vocabulary


class Field(metaclass=ABCMeta):
    """
    A ``Field`` is an ingredient of a data instance. In most NLP tasks, a ``Field``
    stores data of string types. It contains one or more indexers that map string
    data to the corresponding indices. Data instances are collections of fields.
    """
    @abstractmethod
    def count_vocab_items(self, counter: Dict[str, Dict[str, int]]) -> None:
        """
        Counts any strings in this field that need to be mapped to one or more
        integers. Implementations may simply ``pass`` if there is nothing to count.
        Parameters
        ----------
        counter : ``Dict[str, Dict[str, int]]``
            ``counter`` is used to count the number of each item. The first key
            represents the namespace of the vocabulary, and the second key
            represents the string of the item.
        """
        pass

    @abstractmethod
    def index(self, vocab: Vocabulary) -> None:
        """
        Gets one or more index mappings for each element in the Field.
        Parameters
        ----------
        vocab : ``Vocabulary``
            ``vocab`` is used to get the index of each item.
        """
        pass
59 changes: 59 additions & 0 deletions antu/io/fields/index_field.py
@@ -0,0 +1,59 @@
from typing import List, Dict, Iterator
from overrides import overrides
from antu.io.token_indexers.token_indexer import TokenIndexer
from antu.io.vocabulary import Vocabulary
from antu.io.fields.field import Field


class IndexField(Field):
    """
    An ``IndexField`` is an integer field, and we can use it to store data IDs.
    Parameters
    ----------
    name : ``str``
        Field name. This is necessary and must be unique (not the same as other
        field names).
    tokens : ``List[str]``
        Field content that contains a list of strings.
    indexers : ``List[TokenIndexer]``, optional (default=``None``)
        Indexer list that defines the vocabularies associated with the field.
    """
    def __init__(
            self,
            name: str,
            tokens: List[str],
            indexers: List[TokenIndexer] = None):
        self.name = name
        self.tokens = tokens
        self.indexers = indexers

    def __iter__(self) -> Iterator[str]:
        return iter(self.tokens)

    def __getitem__(self, idx: int) -> str:
        return self.tokens[idx]

    def __len__(self) -> int:
        return len(self.tokens)

    @overrides
    def count_vocab_items(
            self,
            counters: Dict[str, Dict[str, int]]) -> None:
        """
        ``IndexField`` does not need vocabulary counting.
        """
        pass

    @overrides
    def index(
            self,
            vocab: Vocabulary) -> None:
        """
        ``IndexField`` does not need an index operation.
        """
        pass



46 changes: 46 additions & 0 deletions antu/io/fields/sequence_label_field.py
@@ -0,0 +1,46 @@
from typing import List, Iterator, Dict
from overrides import overrides
from antu.io.token_indexers.token_indexer import TokenIndexer
from antu.io.vocabulary import Vocabulary
from antu.io.fields.field import Field


class SequenceLabelField(Field):
    """
    A ``SequenceLabelField`` stores a sequence of labels (e.g. POS tags), and
    counts and indexes them with the given indexers.
    Parameters
    ----------
    name : ``str``
        Field name. This is necessary and must be unique (not the same as other
        field names).
    tokens : ``List[str]``
        Field content that contains a list of strings.
    indexers : ``List[TokenIndexer]``
        Indexer list that defines the vocabularies associated with the field.
    """

    def __init__(
            self,
            name: str,
            tokens: List[str],
            indexers: List[TokenIndexer]):
        self.name = name
        self.tokens = tokens
        self.indexers = indexers

    def __iter__(self) -> Iterator[str]:
        return iter(self.tokens)

    def __getitem__(self, idx: int) -> str:
        return self.tokens[idx]

    def __len__(self) -> int:
        return len(self.tokens)

    @overrides
    def count_vocab_items(
            self,
            counters: Dict[str, Dict[str, int]]) -> None:
        for idxer in self.indexers:
            for token in self.tokens:
                idxer.count_vocab_items(token, counters)

    @overrides
    def index(
            self,
            vocab: Vocabulary) -> None:
        self.indexes = {}
        for idxer in self.indexers:
            self.indexes.update(idxer.tokens_to_indices(self.tokens, vocab))




76 changes: 76 additions & 0 deletions antu/io/fields/text_field.py
@@ -0,0 +1,76 @@
from typing import List, Iterator, Dict
from overrides import overrides
from antu.io.token_indexers.token_indexer import TokenIndexer
from antu.io.vocabulary import Vocabulary
from antu.io.fields.field import Field


class TextField(Field):
    """
    A ``TextField`` is a data field that is commonly used in NLP tasks, and we
    can use it to store text sequences such as sentences, paragraphs, POS tags,
    and so on.
    Parameters
    ----------
    name : ``str``
        Field name. This is necessary and must be unique (not the same as other
        field names).
    tokens : ``List[str]``
        Field content that contains a list of strings.
    indexers : ``List[TokenIndexer]``, optional (default=``list()``)
        Indexer list that defines the vocabularies associated with the field.
    """
    def __init__(
            self,
            name: str,
            tokens: List[str],
            indexers: List[TokenIndexer] = list()):
        self.name = name
        self.tokens = tokens
        self.indexers = indexers

    def __iter__(self) -> Iterator[str]:
        return iter(self.tokens)

    def __getitem__(self, idx: int) -> str:
        return self.tokens[idx]

    def __len__(self) -> int:
        return len(self.tokens)

    @overrides
    def count_vocab_items(
            self,
            counters: Dict[str, Dict[str, int]]) -> None:
        """
        Counts the tokens in this field into the given counters via the field's
        indexers.
        Parameters
        ----------
        counters : ``Dict[str, Dict[str, int]]``
            Element statistics for the datasets. If the field's indexers indicate
            that this field is related to some counters, the field content is used
            to update those counters.
        """
        for idxer in self.indexers:
            for token in self.tokens:
                idxer.count_vocab_items(token, counters)

    @overrides
    def index(
            self,
            vocab: Vocabulary) -> None:
        """
        Gets one or more index mappings for each element in the Field.
        Parameters
        ----------
        vocab : ``Vocabulary``
            ``vocab`` is used to get the index of each item.
        """
        self.indexes = {}
        for idxer in self.indexers:
            self.indexes.update(idxer.tokens_to_indices(self.tokens, vocab))
11 changes: 11 additions & 0 deletions antu/io/fields/text_field_test.py
@@ -0,0 +1,11 @@
import pytest
from antu.io.fields.text_field import TextField


class TestTextField:

    def test_textfield(self):
        sentence = ['This', 'is', 'a', 'test', 'sentence', '.']
        sent = TextField('sentence', sentence)
        assert sent[0] == 'This'
        assert sent[-1] == '.'
87 changes: 87 additions & 0 deletions antu/io/instance.py
@@ -0,0 +1,87 @@
from typing import Dict, MutableMapping, Mapping, TypeVar, List

from antu.io.vocabulary import Vocabulary
from antu.io.fields.field import Field

Indices = TypeVar("Indices", List[int], List[List[int]])


class Instance(Mapping[str, Field]):
    """
    An ``Instance`` is a collection (list) of multiple data fields.
    Parameters
    ----------
    fields : ``List[Field]``, optional (default=``None``)
        A list of multiple data fields.
    """

    def __init__(self, fields: List[Field] = None) -> None:
        self.fields = fields
        self.indexed = False  # Indicates whether the instance has been indexed

    def __getitem__(self, key: str) -> Field:
        return self.fields[key]

    def __iter__(self):
        return iter(self.fields)

    def __len__(self) -> int:
        return len(self.fields)

    def add_field(self, field: Field) -> None:
        """
        Add the field to the existing ``Instance``.
        Parameters
        ----------
        field : ``Field``
            Which field needs to be added.
        """
        self.fields.append(field)
        if self.indexed:
            # Index the new field with the vocabulary remembered by index_fields().
            field.index(self.vocab)

    def count_vocab_items(self, counter: Dict[str, Dict[str, int]]) -> None:
        """
        Increments counts in the given ``counter`` for all of the vocabulary
        items in all of the ``Fields`` in this ``Instance``.
        Parameters
        ----------
        counter : ``Dict[str, Dict[str, int]]``
            Counts of items, keyed first by vocabulary namespace and then by
            item string.
        """
        for field in self.fields:
            field.count_vocab_items(counter)

    def index_fields(self, vocab: Vocabulary) -> Dict[str, Dict[str, Indices]]:
        """
        Indexes all fields in this ``Instance`` using the provided ``Vocabulary``.
        This `mutates` the current object, it does not return a new ``Instance``.
        A ``DataIterator`` will call this on each pass through a dataset; we use the ``indexed``
        flag to make sure that indexing only happens once.
        This means that if for some reason you modify your vocabulary after you've
        indexed your instances, you might get unexpected behavior.
        Parameters
        ----------
        vocab : ``Vocabulary``
            ``vocab`` is used to get the index of each item.
        Returns
        -------
        res : ``Dict[str, Dict[str, Indices]]``
            Returns the Indices corresponding to the instance. The first key is
            the field name and the second key is the vocabulary namespace.
        """
        if not self.indexed:
            self.indexed = True
            self.vocab = vocab  # Remembered so that fields added later can be indexed.
            for field in self.fields:
                field.index(vocab)
        res = {}
        for field in self.fields:
            res[field.name] = field.indexes
        return res
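
An end-to-end sketch of how these pieces compose. ``ToyIndexer`` below is a hypothetical stand-in, duck-typed against how the fields call their indexers; the real ``TokenIndexer`` implementations and the ``Vocabulary`` constructor are not part of this section, so an actual pipeline would pass a real ``Vocabulary`` to ``index_fields`` instead of ``None``:

from antu.io.instance import Instance
from antu.io.fields.text_field import TextField


class ToyIndexer:
    """Hypothetical indexer; the real TokenIndexer interface may differ."""

    def count_vocab_items(self, token, counters):
        # Count each token under a single 'word' namespace.
        counters.setdefault('word', {})
        counters['word'][token] = counters['word'].get(token, 0) + 1

    def tokens_to_indices(self, tokens, vocab):
        # A real indexer would look tokens up in `vocab`; positions are placeholders.
        return {'word': list(range(len(tokens)))}


sent = TextField('sentence', ['This', 'is', 'a', 'test', '.'], [ToyIndexer()])
inst = Instance([sent])

counters = {}
inst.count_vocab_items(counters)         # {'word': {'This': 1, 'is': 1, ...}}
indices = inst.index_fields(vocab=None)  # {'sentence': {'word': [0, 1, 2, 3, 4]}}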
