Merge pull request #1 from JT-Ushio/master

release(antu): ready to release antu v0.0.1 [Tao Ji].
AntNLP · Dec 28, 2018 · 1819cfd · 1819cfd
2 parents 01a4e3e + 3ee6e55
commit 1819cfd
Show file tree

Hide file tree

Showing 47 changed files with 2,158 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -1,2 +1,24 @@
-# pyAnt
-Universal data IO module in NLP tasks (for AntNLP Group)
+# AntU
+Universal data IO and neural network modules in NLP tasks.
+
++ **data IO** is a universal module for Natural Language Processing systems and is not based on any framework (such as TensorFlow, PyTorch, MXNet, DyNet...).
++ **neural network** module contains the neural network structures commonly used in NLP tasks. We want to design commonly used structures for each neural network framework. We will continue to develop this module.
+
+
+
+# Requirements
+
++ Python>=3.6
++ bidict
++ overrides
+
+##### If you need the DyNet neural network module:
+
++ dynet>=2.0
+
+
+
+# Resources
+
++ [Documentation](https://wait)
++ [Source Code](https://github.com/AntNLP/antu)
diff --git a/antu/__init__.py b/antu/__init__.py
diff --git a/antu/io/__init__.py b/antu/io/__init__.py
diff --git a/antu/io/dataset_readers/__init__.py b/antu/io/dataset_readers/__init__.py
diff --git a/antu/io/dataset_readers/dataset_reader.py b/antu/io/dataset_readers/dataset_reader.py
@@ -0,0 +1,14 @@
+from typing import Dict, List
+from abc import ABCMeta, abstractmethod
+from antu.io.instance import Instance
+
+
+class DatasetReader(metaclass=ABCMeta):
+
+    @abstractmethod
+    def read(self, file_path: str) -> List[Instance]:
+        pass
+
+    @abstractmethod
+    def input_to_instance(self, inputs: str) -> Instance:
+        pass
diff --git a/antu/io/datasets/__init__.py b/antu/io/datasets/__init__.py
diff --git a/antu/io/datasets/dataset.py b/antu/io/datasets/dataset.py
@@ -0,0 +1,20 @@
+from typing import Dict, List
+from abc import ABCMeta, abstractmethod
+from antu.io.vocabulary import Vocabulary
+from antu.io.instance import Instance
+
+
+class Dataset(metaclass=ABCMeta):
+
+    vocabulary_set: Vocabulary = {}
+    datasets: Dict[str, List[Instance]] = {}
+
+    @abstractmethod
+    def read():
+        pass
+
+    @abstractmethod
+    def input_to_instance():
+        pass
+
+
diff --git a/antu/io/fields/__init__.py b/antu/io/fields/__init__.py
diff --git a/antu/io/fields/field.py b/antu/io/fields/field.py
@@ -0,0 +1,39 @@
+from typing import List, Dict
+from abc import ABCMeta, abstractmethod
+
+from antu.io.vocabulary import Vocabulary
+
+
+class Field(metaclass=ABCMeta):
+    """
+    A ``Field`` is an ingredient of a data instance. In most NLP tasks, ``Field``
+    stores data of string types. It contains one or more indexers that map string
+    data to the corresponding index. Data instances are collections of fields.
+    """
+    @abstractmethod
+    def count_vocab_items(self, counter: Dict[str, Dict[str, int]]) -> None:
+        """
+        We count the number of strings if the string needs to be mapped to one
+        or more integers. You can pass directly if there is no string that needs
+        to be mapped.
+
+        Parameters
+        ----------
+        counter : ``Dict[str, Dict[str, int]]``
+        ``counter`` is used to count the number of each item. The first key
+        represents the namespace of the vocabulary, and the second key represents
+        the string of the item.
+        """
+        pass
+
+    @abstractmethod
+    def index(self, vocab: Vocabulary) -> None:
+        """
+        Gets one or more index mappings for each element in the Field.
+
+        Parameters
+        ----------
+        vocab : ``Vocabulary``
+        ``vocab`` is used to get the index of each item.
+        """
+        pass
diff --git a/antu/io/fields/index_field.py b/antu/io/fields/index_field.py
@@ -0,0 +1,59 @@
+from typing import List, Dict, Iterator
+from overrides import overrides
+from antu.io.token_indexers.token_indexer import TokenIndexer
+from antu.io.vocabulary import Vocabulary
+from antu.io.fields.field import Field
+
+
+class IndexField(Field):
+    """
+    A ``IndexField`` is an integer field, and we can use it to store data ID.
+
+    Parameters
+    ----------
+    name : ``str``
+        Field name. This is necessary and must be unique (not the same as other
+        field names).
+    tokens : ``List[str]``
+        Field content that contains a list of string.
+    indexers : ``List[TokenIndexer]``, optional (default=``list()``)
+        Indexer list that defines the vocabularies associated with the field.
+    """
+    def __init__(
+        self,
+        name: str,
+        tokens: List[str],
+        indexers: List[TokenIndexer] = None):
+        self.name = name
+        self.tokens = tokens
+        self.indexers = indexers
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self.tokens)
+
+    def __getitem__(self, idx: int) -> str:
+        return self.tokens[idx]
+
+    def __len__(self) -> int:
+        return len(self.tokens)
+
+    @overrides
+    def count_vocab_items(
+        self,
+        counters: Dict[str, Dict[str, int]]) -> None:
+        """
+        ``IndexField`` doesn't need index operation.
+        """
+        pass
+
+    @overrides
+    def index(
+        self,
+        vocab: Vocabulary) -> None:
+        """
+        ``IndexField`` doesn't need index operation.
+        """
+        pass
+
+
+
diff --git a/antu/io/fields/sequence_label_field.py b/antu/io/fields/sequence_label_field.py
@@ -0,0 +1,46 @@
+from typing import List, Iterator, Dict
+from overrides import overrides
+from antu.io.token_indexers.token_indexer import TokenIndexer
+from antu.io.vocabulary import Vocabulary
+from antu.io.fields.field import Field
+
+
+class SequenceLabelField(Field):
+
+    def __init__(
+        self,
+        name: str,
+        tokens: List[str],
+        indexers: List[TokenIndexer]):
+        self.name = name
+        self.tokens = tokens
+        self.indexers = indexers
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self.tokens)
+
+    def __getitem__(self, idx: int) -> str:
+        return self.tokens[idx]
+
+    def __len__(self) -> int:
+        return len(self.tokens)
+
+    @overrides
+    def count_vocab_items(
+        self,
+        counters: Dict[str, Dict[str, int]]) -> None:
+        for idxer in self.indexers:
+            for token in self.tokens:
+                idxer.count_vocab_items(token, counters)
+
+    @overrides
+    def index(
+        self,
+        vocab: Vocabulary) -> None:
+        self.indexes = {}
+        for idxer in self.indexers:
+            self.indexes.update(idxer.tokens_to_indices(self.tokens, vocab))
+
+
+
+
diff --git a/antu/io/fields/text_field.py b/antu/io/fields/text_field.py
@@ -0,0 +1,76 @@
+from typing import List, Iterator, Dict
+from overrides import overrides
+from antu.io.token_indexers.token_indexer import TokenIndexer
+from antu.io.vocabulary import Vocabulary
+from antu.io.fields.field import Field
+
+
+class TextField(Field):
+    """
+    A ``TextField`` is a data field that is commonly used in NLP tasks, and we
+    can use it to store text sequences such as sentences, paragraphs, POS tags,
+    and so on.
+
+    Parameters
+    ----------
+    name : ``str``
+        Field name. This is necessary and must be unique (not the same as other
+        field names).
+    tokens : ``List[str]``
+        Field content that contains a list of string.
+    indexers : ``List[TokenIndexer]``, optional (default=``list()``)
+        Indexer list that defines the vocabularies associated with the field.
+    """
+    def __init__(
+        self,
+        name: str,
+        tokens: List[str],
+        indexers: List[TokenIndexer] = list()):
+        self.name = name
+        self.tokens = tokens
+        self.indexers = indexers
+
+    def __iter__(self) -> Iterator[str]:
+        return iter(self.tokens)
+
+    def __getitem__(self, idx: int) -> str:
+        return self.tokens[idx]
+
+    def __len__(self) -> int:
+        return len(self.tokens)
+
+    @overrides
+    def count_vocab_items(
+        self,
+        counters: Dict[str, Dict[str, int]]) -> None:
+        """
+        We count the number of strings if the string needs to be counted to some
+         counters. You can pass directly if there is no string that needs
+        to be counted.
+
+        Parameters
+        ----------
+        counters : ``Dict[str, Dict[str, int]]``
+            Element statistics for datasets. if field indexers indicate that
+            this field is related to some counters, we use field content to
+            update the counters.
+        """
+        for idxer in self.indexers:
+            for token in self.tokens:
+                idxer.count_vocab_items(token, counters)
+
+    @overrides
+    def index(
+        self,
+        vocab: Vocabulary) -> None:
+        """
+        Gets one or more index mappings for each element in the Field.
+
+        Parameters
+        ----------
+        vocab : ``Vocabulary``
+            ``vocab`` is used to get the index of each item.
+        """
+        self.indexes = {}
+        for idxer in self.indexers:
+            self.indexes.update(idxer.tokens_to_indices(self.tokens, vocab))
diff --git a/antu/io/fields/text_field_test.py b/antu/io/fields/text_field_test.py
@@ -0,0 +1,11 @@
+import pytest
+from antu.io.fields.text_field import TextField
+
+
+class TestTextField:
+
+    def test_textfield(self):
+        sentence = ['This', 'is', 'a', 'test', 'sentence', '.']
+        sent = TextField('sentence', sentence)
+        assert sent[0] == 'This'
+        assert sent[-1] == '.'
diff --git a/antu/io/instance.py b/antu/io/instance.py
@@ -0,0 +1,87 @@
+from typing import Dict, MutableMapping, Mapping, TypeVar, List
+
+from antu.io.vocabulary import Vocabulary
+from antu.io.fields.field import Field
+
+Indices = TypeVar("Indices", List[int], List[List[int]])
+
+
+class Instance(Mapping[str, Field]):
+    """
+    An ``Instance`` is a collection (list) of multiple data fields.
+
+    Parameters
+    ----------
+    fields : ``List[Field]``, optional (default=``None``)
+        A list of multiple data fields.
+    """
+
+    def __init__(self, fields: List[Field]=None) -> None:
+        self.fields = fields
+        self.indexed = False  # Indicates whether the instance has been indexed
+
+    def __getitem__(self, key: str) -> Field:
+        return self.fields[key]
+
+    def __iter__(self):
+        return iter(self.fields)
+
+    def __len__(self) -> int:
+        return len(self.fields)
+
+    def add_field(self, field: Field) -> None:
+        """
+        Add the field to the existing ``Instance``.
+
+        Parameters
+        ----------
+        field : ``Field``
+            Which field needs to be added.
+        """
+        self.fields.append(field)
+        if self.indexed:
+            field.index(vocab)
+
+    def count_vocab_items(self, counter: Dict[str, Dict[str, int]]) -> None:
+        """
+        Increments counts in the given ``counter`` for all of the vocabulary
+        items in all of the ``Fields`` in this ``Instance``.
+
+        Parameters
+        ----------
+        counter : ``Dict[str, Dict[str, int]]``
+            We count the number of strings if the string needs to be counted to
+            some counters.
+        """
+        for field in self.fields:
+            field.count_vocab_items(counter)
+
+    def index_fields(self, vocab: Vocabulary) -> Dict[str, Dict[str, Indices]]:
+        """
+        Indexes all fields in this ``Instance`` using the provided ``Vocabulary``.
+        This `mutates` the current object, it does not return a new ``Instance``.
+        A ``DataIterator`` will call this on each pass through a dataset; we use the ``indexed``
+        flag to make sure that indexing only happens once.
+        This means that if for some reason you modify your vocabulary after you've
+        indexed your instances, you might get unexpected behavior.
+
+        Parameters
+        ----------
+        vocab : ``Vocabulary``
+            ``vocab`` is used to get the index of each item.
+
+        Returns
+        -------
+        res : ``Dict[str, Dict[str, Indices]]``
+            Returns the Indices corresponding to the instance. The first key is
+            field name and the second key is the vocabulary name.
+        """
+        if not self.indexed:
+            self.indexed = True
+            for field in self.fields:
+                field.index(vocab)
+        res = {}
+        for field in self.fields:
+            res[field.name] = field.indexes
+        return res
+