
Merge branch 'release/1.0.2'

lukostaz committed Apr 19, 2019
2 parents cf54fa8 + 0a4db34 commit 1aa91a4b32081bcaad4e7386032b9ac85deb99d6
Showing with 501,775 additions and 663 deletions.
  1. +6 −1 .gitignore
  2. +9 −9 README.md
  3. +1 −1 ampligraph/__init__.py
  4. +105 −49 ampligraph/datasets/datasets.py
  5. +224 −74 ampligraph/evaluation/protocol.py
  6. +4 −5 ampligraph/latent_features/__init__.py
  7. +87 −11 ampligraph/latent_features/loss_functions.py
  8. +0 −110 ampligraph/latent_features/model_utils.py
  9. +396 −202 ampligraph/latent_features/models.py
  10. +499,999 −1 ampligraph/latent_features/prime_number_list.txt
  11. +3 −8 ampligraph/latent_features/regularizers.py
  12. +4 −4 ampligraph/logger.conf
  13. +7 −0 ampligraph/utils/__init__.py
  14. +234 −0 ampligraph/utils/model_utils.py
  15. +5 −0 docs/ampligraph.datasets.rst
  16. +5 −10 docs/ampligraph.latent_features.rst
  17. +31 −0 docs/ampligraph.utils.rst
  18. +1 −0 docs/api.rst
  19. +20 −1 docs/changelog.md
  20. +5 −3 docs/contacts.md
  21. +2 −2 docs/dev.md
  22. +82 −10 docs/examples.md
  23. +139 −69 docs/experiments.rst
  24. +20 −0 docs/generated/ampligraph.latent_features.NLLMulticlass.rst
  25. +0 −6 docs/generated/ampligraph.latent_features.restore_model.rst
  26. +0 −6 docs/generated/ampligraph.latent_features.save_model.rst
  27. +6 −0 docs/generated/ampligraph.utils.create_tensorboard_visualizations.rst
  28. +6 −0 docs/generated/ampligraph.utils.restore_model.rst
  29. +6 −0 docs/generated/ampligraph.utils.save_model.rst
  30. BIN docs/img/kg_lp.png
  31. +0 −1 docs/index.rst
  32. +1 −1 docs/install.md
  33. +27 −0 docs/references.bib
  34. +10 −3 experiments/config.json
  35. +1 −1 experiments/predictive_performance.py
  36. +10 −4 jenkins.sh
  37. +7 −3 tests/ampligraph/datasets/test_datasets.py
  38. +198 −14 tests/ampligraph/evaluation/test_protocol.py
  39. +1 −1 tests/ampligraph/latent_features/test_misc.py
  40. +50 −53 tests/ampligraph/latent_features/test_models.py
  41. 0 tests/ampligraph/utils/__init__.py
  42. +63 −0 tests/ampligraph/utils/test_model_utils.py
@@ -121,4 +121,9 @@ experiments/result.csv
experiments/playground.py
experiments/fb237.txt

.pytest_cache
.pytest_cache

playground


.vscode/settings.json
@@ -114,18 +114,18 @@ pip install -e .
```python
>> import ampligraph
>> ampligraph.__version__
'1.0.1'
'1.0.2'
```


## Predictive Power Evaluation (MRR Filtered)

| |FB15k |WN18 |WN18RR |FB15K-237|
|----------|------|-------|-------|---------|
| TransE | 0.55 | 0.50 | 0.23 | 0.31 |
| DistMult | 0.79 | 0.83 | 0.44 | 0.29 |
| ComplEx | 0.79 | 0.94 | 0.44 | 0.30 |
| HolE | 0.80 | 0.94 | 0.47 | 0.28 |
| |FB15k |WN18 |WN18RR |FB15K-237|YAGO3-10 |
|----------|------|-------|-------|---------|---------|
| TransE | 0.55 | 0.50 | 0.23 | 0.31 | 0.24 |
| DistMult | 0.79 | 0.83 | 0.44 | 0.29 | 0.49 |
| ComplEx | 0.79 | 0.94 | 0.44 | 0.30 | 0.50 |
| HolE | 0.80 | 0.94 | 0.47 | 0.28 | 0.50 |
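
As an aside, a filtered-MRR figure like those in the table can be computed with the library's evaluation API. The sketch below is illustrative only: the hyperparameters are placeholders, not the tuned settings used for the table (see `experiments/config.json` and `experiments/predictive_performance.py` in this commit).

```python
import numpy as np

from ampligraph.datasets import load_wn18rr
from ampligraph.latent_features import ComplEx
from ampligraph.evaluation import evaluate_performance, mrr_score

X = load_wn18rr()

# Placeholder hyperparameters; not the settings behind the table above.
model = ComplEx(k=100, eta=20, epochs=10, batches_count=100, seed=0, verbose=True)
model.fit(X['train'])

# Filter against all known triples so corruptions that are true facts
# do not count against the model (the "filtered" protocol).
filter_triples = np.concatenate((X['train'], X['valid'], X['test']))
ranks = evaluate_performance(X['test'], model=model, filter_triples=filter_triples, verbose=True)

print('Filtered MRR: {:.2f}'.format(mrr_score(ranks)))
```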


## Documentation
@@ -159,7 +159,7 @@ If you instead use AmpliGraph in an academic publication, cite as:
Sumit Pai and
Chan Le Van and
Rory McGrath and
Nick McCarthy},
Nicholas McCarthy},
title = {{AmpliGraph: a Library for Representation Learning on Knowledge Graphs}},
month = mar,
year = 2019,
@@ -168,6 +168,6 @@ If you instead use AmpliGraph in an academic publication, cite as:
}
```

## Licence
## License

AmpliGraph is licensed under the Apache 2.0 License.
@@ -2,7 +2,7 @@
import logging.config
import pkg_resources

__version__ = '1.0.1'
__version__ = '1.0.2'
__all__ = ['datasets', 'latent_features', 'evaluation']

logging.config.fileConfig(pkg_resources.resource_filename(__name__, 'logger.conf'), disable_existing_loggers=False)
@@ -5,15 +5,12 @@
import urllib
import zipfile
from pathlib import Path
import hashlib
from collections import namedtuple

AMPLIGRAPH_ENV_NAME = 'AMPLIGRAPH_DATA_HOME'
REMOTE_DATASET_SERVER = 'https://s3-eu-west-1.amazonaws.com/ampligraph/datasets/'
DATASET_FILE_NAME = {'WN18': 'wn18.zip',
                     'WN18RR': 'wn18RR.zip',
                     'FB15K': 'fb15k.zip',
                     'FB15K_237': 'fb15k-237.zip',
                     'YAGO3_10': 'YAGO3-10.zip',
                     }

DatasetMetadata = namedtuple('DatasetMetadata',['dataset_name','filename','url','train_name','valid_name','test_name','train_checksum','valid_checksum','test_checksum'])

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
@@ -121,8 +118,18 @@ def _get_data_home(data_home=None):
    logger.debug('data_home is set to {}'.format(data_home))
    return data_home


def _unzip_dataset(source, destination):
def _md5(file_path):
    md5hash = hashlib.md5()
    chunk_size = 4096
    with open(file_path,'rb') as f:
        content_buffer = f.read(chunk_size)
        while content_buffer:
            md5hash.update(content_buffer)
            content_buffer = f.read(chunk_size)
    return md5hash.hexdigest()


def _unzip_dataset(remote, source, destination, check_md5hash=False):
"""Unzip a file from a source location to a destination.
Parameters
@@ -139,10 +146,19 @@ def _unzip_dataset(source, destination):
    with zipfile.ZipFile(source, 'r') as zip_ref:
        logger.debug('Unzipping {} to {}'.format(source, destination))
        zip_ref.extractall(destination)
    if check_md5hash:
        for file_name,remote_checksum in [[remote.train_name,remote.train_checksum],[remote.valid_name,remote.valid_checksum],[remote.test_name,remote.test_checksum]]:
            file_path = os.path.join(destination,remote.dataset_name,file_name)
            checksum = _md5(file_path)
            if checksum != remote_checksum:
                os.remove(source)
                msg = '{} has an md5 checksum of ({}) which is different from the expected ({}), the file may be corrupted.'.format(file_path,checksum,remote_checksum)
                logger.error(msg)
                raise IOError(msg)
    os.remove(source)
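
A minimal standalone sketch of the same chunked MD5 verification pattern, using only the standard library; the file path and expected digest below are hypothetical.

```python
import hashlib

def md5_of(file_path, chunk_size=4096):
    """Stream a file through MD5 in fixed-size chunks to avoid loading it whole."""
    digest = hashlib.md5()
    with open(file_path, 'rb') as f:
        chunk = f.read(chunk_size)
        while chunk:
            digest.update(chunk)
            chunk = f.read(chunk_size)
    return digest.hexdigest()

# Hypothetical file and checksum, for illustration only.
expected = '7d68324d293837ac165c3441a6c8b0eb'
if md5_of('/tmp/ampligraph_datasets/wn18/train.txt') != expected:
    raise IOError('Checksum mismatch: the downloaded file may be corrupted.')
```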


def _fetch_remote_data(url, download_dir, data_home):
def _fetch_remote_data(remote, download_dir, data_home, check_md5hash=False):
"""Download a remote datasets.
Parameters
@@ -159,12 +175,12 @@ def _fetch_remote_data(url, download_dir, data_home):

    file_path = '{}.zip'.format(download_dir)
    if not Path(file_path).exists():
        urllib.request.urlretrieve(url, file_path)
        urllib.request.urlretrieve(remote.url, file_path)
        # TODO - add error checking
    _unzip_dataset(file_path, data_home)
    _unzip_dataset(remote, file_path, data_home, check_md5hash)


def _fetch_dataset(dataset_name, data_home=None, url=None):
def _fetch_dataset(remote, data_home=None, check_md5hash=False):
"""Get a dataset.
Gets the directory of a dataset. If the dataset is not found
@@ -187,13 +203,14 @@ def _fetch_dataset(dataset_name, data_home=None, url=None):
The location of the dataset.
"""
    data_home = _get_data_home(data_home)
    dataset_dir = os.path.join(data_home, dataset_name)
    dataset_dir = os.path.join(data_home, remote.dataset_name)
    if not os.path.exists(dataset_dir):
        if url is None:
        if remote.url is None:
            msg = 'No dataset at {} and no url provided.'.format(dataset_dir)
            logger.error(msg)
            raise Exception(msg)
        _fetch_remote_data(url, dataset_dir, data_home)

        _fetch_remote_data(remote, dataset_dir, data_home, check_md5hash)
    return dataset_dir
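
For context, `_get_data_home` resolves the cache directory from the `AMPLIGRAPH_DATA_HOME` environment variable, and each dataset ends up under `<data_home>/<dataset_name>` as the code above shows. A hedged sketch of steering the cache location (the path is hypothetical):

```python
import os

# Point the dataset cache at a writable directory of your choice
# (hypothetical path; if unset, the library falls back to its default location).
os.environ['AMPLIGRAPH_DATA_HOME'] = '/data/ampligraph_datasets'

from ampligraph.datasets import load_fb15k_237

# The first call downloads and unzips into /data/ampligraph_datasets/fb15k-237;
# later calls reuse the cached files.
X = load_fb15k_237(check_md5hash=True)
```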


@@ -261,24 +278,37 @@ def load_from_csv(directory_path, file_name, sep='\t', header=None):
    return df.values


def load_dataset(dataset_name=None, url=None, data_home=None, train_name='train.txt', valid_name='valid.txt',
                 test_name='test.txt'):
    if dataset_name is None:
        if url is None:
            raise ValueError('The dataset name or url must be provided to load a dataset.')
        dataset_name = url[url.rfind('/') + 1:url.rfind('.')]
    dataset_path = _fetch_dataset(dataset_name, data_home, url)
    train = load_from_csv(dataset_path, train_name)
    valid = load_from_csv(dataset_path, valid_name)
    test = load_from_csv(dataset_path, test_name)
    return {'train': train, 'valid': valid, 'test': test}
def _load_dataset(dataset_metadata, data_home=None, check_md5hash=False):
"""Load a dataset from the details provided.
DatasetMetadata = namedtuple('DatasetMetadata',['dataset_name','filename','url','train_name','valid_name','test_name','train_checksum','valid_checksum','test_checksum'])
Parameters
----------
dataset_metadata : DatasetMetadata
Named tuple containing remote datasets meta information: dataset name, dataset filename,
url, train filename, validation filename, test filename, train checksum, valid checksum, test checksum.
data_home : str
The location to save the dataset to. Defaults to None.
def _load_core_dataset(dataset_key, data_home=None):
    return load_dataset(url='{}{}'.format(REMOTE_DATASET_SERVER, DATASET_FILE_NAME[dataset_key]), data_home=data_home)
check_md5hash : boolean
If true, check the md5 hash of the files after they are downloaded.
"""

    if dataset_metadata.dataset_name is None:
        if dataset_metadata.url is None:
            raise ValueError('The dataset name or url must be provided to load a dataset.')
        dataset_name = dataset_metadata.url[dataset_metadata.url.rfind('/') + 1:dataset_metadata.url.rfind('.')]
    dataset_path = _fetch_dataset(dataset_metadata, data_home, check_md5hash)

    train = load_from_csv(dataset_path, dataset_metadata.train_name)
    valid = load_from_csv(dataset_path, dataset_metadata.valid_name)
    test = load_from_csv(dataset_path, dataset_metadata.test_name)

    return {'train': train, 'valid': valid, 'test': test}

def load_wn18():
def load_wn18(check_md5hash=False):
"""Load the WN18 dataset
WN18 is a subset of Wordnet. It was first presented by :cite:`bordes2013translating`.
@@ -323,11 +353,16 @@ def load_wn18():
['10217831', '_hyponym', '10682169']], dtype=object)
"""

    WN18 = DatasetMetadata(dataset_name='wn18', filename='wn18.zip', url='https://s3-eu-west-1.amazonaws.com/ampligraph/datasets/wn18.zip',
                           train_name='train.txt', valid_name='valid.txt', test_name='test.txt',
                           train_checksum='7d68324d293837ac165c3441a6c8b0eb', valid_checksum='f4f66fec0ca83b5ebe7ad7003404e61d',
                           test_checksum='b035247a8916c7ec3443fa949e1ff02c')

    return _load_core_dataset('WN18', data_home=None)
    return _load_dataset(WN18, data_home=None, check_md5hash=check_md5hash)
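
A brief usage sketch for the loader above (illustrative; the printed shapes depend on the split sizes):

```python
from ampligraph.datasets import load_wn18

# check_md5hash=True verifies train/valid/test against the checksums
# recorded in the WN18 DatasetMetadata tuple above.
X = load_wn18(check_md5hash=True)

print(X.keys())          # dict_keys(['train', 'valid', 'test'])
print(X['train'].shape)  # (n_train, 3): one (subject, predicate, object) triple per row
print(X['train'][0])     # a single triple as a numpy object array
```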


def load_wn18rr(clean_unseen=True):
def load_wn18rr(check_md5hash=False, clean_unseen=True):
""" Load the WN18RR dataset
The dataset is described in :cite:`DettmersMS018`.
@@ -374,14 +409,18 @@ def load_wn18rr(clean_unseen=True):
array(['02174461', '_hypernym', '02176268'], dtype=object)
"""


    WN18RR = DatasetMetadata(dataset_name='wn18RR', filename='wn18RR.zip', url='https://s3-eu-west-1.amazonaws.com/ampligraph/datasets/wn18RR.zip',
                             train_name='train.txt', valid_name='valid.txt', test_name='test.txt',
                             train_checksum='35e81af3ae233327c52a87f23b30ad3c', valid_checksum='74a2ee9eca9a8d31f1a7d4d95b5e0887',
                             test_checksum='2b45ba1ba436b9d4ff27f1d3511224c9')
    if clean_unseen:
        return _clean_data(_load_core_dataset('WN18RR', data_home=None), throw_valid=True)
        return _clean_data(_load_dataset(WN18RR, data_home=None, check_md5hash=check_md5hash), throw_valid=True)
    else:
        _load_core_dataset('WN18RR', data_home=None)
        return _load_dataset(WN18RR, data_home=None, check_md5hash=check_md5hash)


def load_fb15k():
def load_fb15k(check_md5hash=False):
"""Load the FB15k dataset
FB15k is a split of Freebase, first proposed by :cite:`bordes2013translating`.
@@ -430,11 +469,16 @@ def load_fb15k():
'/m/05lf_']], dtype=object)
"""

    FB15K = DatasetMetadata(dataset_name='fb15k', filename='fb15k.zip', url='https://s3-eu-west-1.amazonaws.com/ampligraph/datasets/fb15k.zip',
                            train_name='train.txt', valid_name='valid.txt', test_name='test.txt',
                            train_checksum='5a87195e68d7797af00e137a7f6929f2', valid_checksum='275835062bb86a86477a3c402d20b814',
                            test_checksum='71098693b0efcfb8ac6cd61cf3a3b505')

    return _load_core_dataset('FB15K', data_home=None)
    return _load_dataset(FB15K, data_home=None, check_md5hash=check_md5hash)


def load_fb15k_237(clean_unseen=True):
def load_fb15k_237(check_md5hash=False, clean_unseen=True):
"""Load the FB15k-237 dataset
FB15k-237 is a reduced version of FB15K. It was first proposed by :cite:`toutanova2015representing`.
@@ -481,13 +525,18 @@ def load_fb15k_237(clean_unseen=True):
dtype=object)
"""

    FB15K_237 = DatasetMetadata(dataset_name='fb15k-237', filename='fb15k-237.zip', url='https://s3-eu-west-1.amazonaws.com/ampligraph/datasets/fb15k-237.zip',
                                train_name='train.txt', valid_name='valid.txt', test_name='test.txt',
                                train_checksum='c05b87b9ac00f41901e016a2092d7837', valid_checksum='6a94efd530e5f43fcf84f50bc6d37b69',
                                test_checksum='f5bdf63db39f455dec0ed259bb6f8628')

    if clean_unseen:
        return _clean_data(_load_core_dataset('FB15K_237', data_home=None), throw_valid=True)
        return _clean_data(_load_dataset(FB15K_237, data_home=None, check_md5hash=check_md5hash), throw_valid=True)
    else:
        _load_core_dataset('FB15K_237', data_home=None)
        return _load_dataset(FB15K_237, data_home=None, check_md5hash=check_md5hash)


def load_yago3_10():
def load_yago3_10(check_md5hash=False, clean_unseen=True):
""" Load the YAGO3-10 dataset
The dataset is a split of YAGO3 :cite:`mahdisoltani2013yago3`, and has been first presented in :cite:`DettmersMS018`.
@@ -526,16 +575,23 @@ def load_yago3_10():
array(['Mikheil_Khutsishvili', 'playsFor', 'FC_Merani_Tbilisi'], dtype=object)
"""
    YAGO3_10 = DatasetMetadata(dataset_name='YAGO3-10', filename='YAGO3-10.zip', url='https://s3-eu-west-1.amazonaws.com/ampligraph/datasets/YAGO3-10.zip',
                               train_name='train.txt', valid_name='valid.txt', test_name='test.txt',
                               train_checksum='a9da8f583ec3920570eeccf07199229a', valid_checksum='2d679a906f2b1ac29d74d5c948c1ad09',
                               test_checksum='14bf97890b2fee774dbce5f326acd189')

    return _load_core_dataset('YAGO3_10', data_home=None)

    if clean_unseen:
        return _clean_data(_load_dataset(YAGO3_10, data_home=None, check_md5hash=check_md5hash), throw_valid=True)
    else:
        return _load_dataset(YAGO3_10, data_home=None, check_md5hash=check_md5hash)
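
The loaders above all follow the same pattern: build a `DatasetMetadata` tuple, then pass it to the module-private `_load_dataset` helper. Purely as an illustration, a hypothetical entry for a self-hosted dataset could be assembled the same way; every field below is made up, and `load_from_csv` remains the public route for local files.

```python
from ampligraph.datasets.datasets import DatasetMetadata, _load_dataset

# Hypothetical metadata for a self-hosted dataset; URL, filenames, and
# checksums are placeholders, not real resources.
MY_KG = DatasetMetadata(dataset_name='my_kg',
                        filename='my_kg.zip',
                        url='https://example.org/datasets/my_kg.zip',
                        train_name='train.txt',
                        valid_name='valid.txt',
                        test_name='test.txt',
                        train_checksum='d41d8cd98f00b204e9800998ecf8427e',
                        valid_checksum='d41d8cd98f00b204e9800998ecf8427e',
                        test_checksum='d41d8cd98f00b204e9800998ecf8427e')

X = _load_dataset(MY_KG, data_home=None, check_md5hash=True)
```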


def load_all_datasets():
    load_wn18()
    load_wn18rr()
    load_fb15k()
    load_fb15k_237()
    load_yago3_10()
def load_all_datasets(check_md5hash=False):
    load_wn18(check_md5hash)
    load_wn18rr(check_md5hash)
    load_fb15k(check_md5hash)
    load_fb15k_237(check_md5hash)
    load_yago3_10(check_md5hash)


def load_from_rdf(folder_name, file_name, format='nt', data_home=None):
