Merge pull request #24 from BaderLab/fix-embedding-loading
Improve pre-trained embeddings loading

Former-commit-id: 709d6860a493c67fbf188ef8e52c47dbb547b368 [formerly 877737dc69044c4faa6d2c97bd7616493a67db7d] [formerly e86cfaf5d153397b298d203ce2cef18c827a9af9 [formerly d7a63b2]]
Former-commit-id: d26724f8c28f149560b644c3a8fa331980c8e1d9 [formerly 7c938cafeb9ec2347d6009db378e41722f8c1fbb]
Former-commit-id: 0362b1994af23ba803124dd9606b7317e3ff2c03
JohnGiorgi authored Aug 14, 2018
2 parents d0b4228 + 3c29919 commit 2667931
Showing 7 changed files with 60 additions and 81 deletions.
30 changes: 19 additions & 11 deletions README.md
@@ -242,25 +242,33 @@ Corpora are collected in [here](https://github.com/BaderLab/Biomedical-Corpora)

### Word embeddings

When training new models, you can (and should) provide your own pre-trained word embeddings with the `token_pretrained_embedding_filepath` argument (either at the command line or in the configuration file). [Pyysalo _et al_. 2013](https://pdfs.semanticscholar.org/e2f2/8568031e1902d4f8ee818261f0f2c20de6dd.pdf) provide word embeddings that work quite well in the biomedical domain, which can be downloaded [here](http://bio.nlplab.org).

Once downloaded, you will need to convert them from `.bin` to `.txt` format:
When training new models, you can (and should) provide your own pre-trained word embeddings with the `pretrained_embeddings` argument (either at the command line or in the configuration file). Saber expects all word embeddings to be in the `word2vec` file format. [Pyysalo _et al_. 2013](https://pdfs.semanticscholar.org/e2f2/8568031e1902d4f8ee818261f0f2c20de6dd.pdf) provide word embeddings that work quite well in the biomedical domain, which can be downloaded [here](http://bio.nlplab.org). Alternatively, download them from the command line:

```bash
(saber) $ cd saber
(saber) $ pip install gensim
(saber) $ python
>> from generic_utils import bin_to_txt
>> path_to_embeddings = '/path/to/wikipedia-pubmed-and-PMC-w2v.bin'
>> bin_to_txt('wikipedia-pubmed-and-PMC-w2v.bin', output_dir='path/to/word_embeddings')

mkdir saber/word_embeddings
cd saber/word_embeddings
# Note: this file is over 4GB
wget http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin
```
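
If you want to sanity-check the download, here is a minimal sketch (not part of this commit) using `gensim`, which the new loading code also relies on; the small `limit` is only to keep the check fast:

```python
from gensim.models import KeyedVectors

# Load just the first 10,000 vectors of the ~4GB binary file as a quick format check.
vectors = KeyedVectors.load_word2vec_format(
    'wikipedia-pubmed-and-PMC-w2v.bin', binary=True, limit=10000)
print(vectors.vector_size)   # embedding dimensionality
print(len(vectors.vocab))    # number of vectors actually loaded (10,000 here)
```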

> Note: you do not need to download pre-trained word embeddings if you only plan on using Saber's pre-trained models.
#### GloVe

To use [GloVe](https://nlp.stanford.edu/projects/glove/) embeddings, just convert them to the [word2vec](https://code.google.com/archive/p/word2vec/) format first:

```bash
(saber) $ python
>>> from gensim.scripts.glove2word2vec import glove2word2vec
>>> glove_input_file = 'glove.txt'
>>> word2vec_output_file = 'word2vec.txt'
>>> glove2word2vec(glove_input_file, word2vec_output_file)
```
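
To confirm the converted file is usable, it should load with the same gensim reader (a quick check only; the file name is taken from the snippet above):

```python
from gensim.models import KeyedVectors

# The converted GloVe vectors are plain-text word2vec, hence binary=False.
vectors = KeyedVectors.load_word2vec_format('word2vec.txt', binary=False)
print(vectors.vector_size)   # should match the original GloVe dimensionality
```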

## Running tests

Saber's test suite can be found in `Saber/saber/tests`. In order to run the tests, you'll usually want to clone the repository locally. Make sure to install all required development dependencies defined in ``requirements.txt`` (see [Installation](#Installation) for more help). Additionally, you will need to install ``pytest``:

```bash
(saber) $ pip install pytest
```
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,4 +1,5 @@
flask>=1.0.2
gensim>=3.4.0
# Keras contrib repo
git+https://www.github.com/keras-team/keras-contrib.git#egg=keras_contrib
# Spacy small english language model
64 changes: 32 additions & 32 deletions saber/sequence_processor.py
@@ -7,6 +7,7 @@
import os
import time

from gensim.models import KeyedVectors
import numpy as np
from spacy import displacy

@@ -292,9 +293,13 @@ def _load_compound_dataset(self):

return compound_ds

def load_embeddings(self):
def load_embeddings(self, binary=True):
"""Coordinates the loading of pre-trained token embeddings.
Args:
binary (bool): True if pre-trained embeddings are in C binary format, False if they are
in C text format.
Raises:
MissingStepException: if no dataset has been loaded.
ValueError: If 'self.config.pretrained_embeddings' is None.
@@ -308,7 +313,7 @@ def load_embeddings(self):
self.log.error('ValueError: %s', err_msg)
raise ValueError(err_msg)

self._load_token_embeddings()
self._load_token_embeddings(binary)

return self

@@ -387,57 +392,52 @@ def fit(self):
# train_history = pd.DataFrame(train_history.history)
# return train_history

def _load_token_embeddings(self):
def _load_token_embeddings(self, binary=True):
"""Coordinates the loading of pre-trained token embeddings.
Coordinates the loading of pre-trained token embeddings by reading in the file containing
the token embeddings and created an embedding matrix whos ith row corresponds to the token
the token embeddings and creating an embedding matrix whose ith row corresponds to the token
embedding for the ith word in the model's word-to-idx mapping.
Args:
binary (bool): True if pre-trained embeddings are in C binary format, False if they are
in C text format.
"""
start_time = time.time()
start = time.time()
print('Loading embeddings... ', end='', flush=True)

# prepare the embedding indices
embedding_idx = self._prepare_token_embedding_layer()
embedding_idx = self._prepare_token_embedding_layer(binary)
embedding_dim = len(list(embedding_idx.values())[0])
# create the embedding matrix, update attribute
embedding_matrix = self._prepare_token_embedding_matrix(embedding_idx, embedding_dim)
self.token_embedding_matrix = embedding_matrix

elapsed_time = time.time() - start_time
print('Done ({0:.2f} seconds)'.format(elapsed_time))
end = time.time() - start
print('Done ({0:.2f} seconds)'.format(end))
print('Found {} word vectors of dimension {}'.format(len(embedding_idx), embedding_dim))
self.log.info('Loaded %i word vectors of dimension %i', len(embedding_idx), embedding_dim)

def _prepare_token_embedding_layer(self):
def _prepare_token_embedding_layer(self, binary=True):
"""Creates an embedding index using pretrained token embeddings.
For the models given pretrained token embeddings, creates and returns a dictionary mapping
words to known embeddings.
For the pretrained word embeddings given at `self.config.pretrained_embeddings`, creates
and returns a dictionary mapping words to embeddings, or word vectors. Note that if
`self.config.debug` is True, only the first 10K vectors are loaded.
Args:
binary (bool): True if pre-trained embeddings are in C binary format, False if they are
in C text format.
Returns:
embedding_idx (dict): mapping of words to pre-trained token
embeddings
embed_idx (dict): mapping of words to pre-trained word embeddings
"""
# acc
embedding_idx = {}

# open pre-trained token embedding file for reading
with open(self.config.pretrained_embeddings, 'r') as pte:
for i, line in enumerate(pte):
# split line, get word and its embedding
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')

# update our embedding index
embedding_idx[word] = coefs

# if debug, load a small, arbitrary number of word embeddings
if i >= 10000 and self.config.debug:
break

return embedding_idx
limit = 10000 if self.config.debug else None
vectors = KeyedVectors.load_word2vec_format(self.config.pretrained_embeddings,
binary=binary,
limit=limit)
embed_idx = {word: vectors[word] for word in vectors.vocab}
return embed_idx

def _prepare_token_embedding_matrix(self, embedding_idx, embedding_dim):
"""Creates an embedding matrix using pretrained token embeddings.
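For context, roughly how the new `binary` flag is exercised end to end — a sketch only, assuming `sp` is a `SequenceProcessor` with a dataset already loaded and `config.pretrained_embeddings` pointing at a word2vec-format file (mirroring the updated tests below):

```python
# Text-format embeddings (e.g. GloVe converted to word2vec) use binary=False;
# the default binary=True expects the C binary .bin format.
sp.load_embeddings(binary=False)
print(sp.token_embedding_matrix.shape)  # (vocabulary size, embedding dimension)
```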
@@ -1,4 +1,5 @@
4 2
</s> 0.00170094 0.000166173
the 0.155801 -0.0710875
, -0.0593159 0.102907
. -0.010395 0.120041
. -0.010395 0.120041
8 changes: 3 additions & 5 deletions saber/tests/test_sequence_processor.py
@@ -118,15 +118,13 @@ def test_token_embeddings_load(sp_sing_ds_no_embed, sp_compound_ds_no_embed):
"""Asserts that pre-trained token embeddings are loaded correctly when
SequenceProcessor.load_embeddings() is called"""
# load embeddings for each model
sp_sing_ds_no_embed.load_embeddings()
sp_compound_ds_no_embed.load_embeddings()

print(sp_compound_ds_no_embed.token_embedding_matrix)
sp_sing_ds_no_embed.load_embeddings(binary=False)
sp_compound_ds_no_embed.load_embeddings(binary=False)

# check type
assert isinstance(sp_sing_ds_no_embed.token_embedding_matrix, numpy.ndarray)
assert isinstance(sp_compound_ds_no_embed.token_embedding_matrix, numpy.ndarray)
# check value
# check shape
assert sp_sing_ds_no_embed.token_embedding_matrix.shape == DUMMY_EMBEDDINGS_MATRIX_SHAPE
assert sp_compound_ds_no_embed.token_embedding_matrix.shape == DUMMY_EMBEDDINGS_MATRIX_SHAPE

30 changes: 0 additions & 30 deletions saber/utils/generic_utils.py
@@ -1,6 +1,5 @@
"""A collection of generic helper/utility functions.
"""
import codecs
import errno
import logging
import os
@@ -74,35 +73,6 @@ def compress_model(dir_path):

return True

def bin_to_txt(filepath, output_dir=os.getcwd()):
"""Converts word embeddings given in the binary C format (w2v) to a simple
text format that can be used with Saber.
Args:
filepath (str): path to the word vectors file in binary C (w2v) format
output_dir (str): path to save converted word vectors file (defaults to
current working directory)
"""
# bad practice, but requires gensim to be installed when using
# SequenceProcessor if import statement is at the top of this file
from gensim.models.keyedvectors import KeyedVectors

# load word vectors provided in C binary format
word_vectors = KeyedVectors.load_word2vec_format(filepath, binary=True)
vocab = word_vectors.vocab

# create a new filepath
base_name = os.path.splitext(os.path.basename(filepath))[0]
output_filepath = os.path.join(output_dir, base_name + '.txt')

# write contents of input_file to new file output_filepath in txt format
with codecs.open(output_filepath, 'w+', encoding='utf-8') as out_file:
for word in vocab:
vector = word_vectors[word]
out_file.write("%s %s\n" %(word, " ".join(str(v) for v in vector)))

print('[INFO] Converted C binary file saved to {}'.format(output_filepath))

def get_pretrained_model_dir(config):
"""Returns path to top-level directory to save a pretrained model.
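With `bin_to_txt` gone, a text copy of the embeddings can still be produced directly with gensim if one is ever needed — a sketch, not part of this commit:

```python
from gensim.models import KeyedVectors

# Read the C-binary embeddings and write them back out in word2vec text format.
vectors = KeyedVectors.load_word2vec_format('wikipedia-pubmed-and-PMC-w2v.bin', binary=True)
vectors.save_word2vec_format('wikipedia-pubmed-and-PMC-w2v.txt', binary=False)
```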
5 changes: 3 additions & 2 deletions setup.py
@@ -12,7 +12,7 @@
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/BaderLab/Saber",
python_requires='>=3.5',
python_requires='>=3.6',
packages=setuptools.find_packages(),
classifiers=(
"Programming Language :: Python :: 3",
@@ -21,7 +21,8 @@
),
install_requires=[
'flask>=1.0.2',
'keras>=2.2.0',
'gensim>=3.4.0',
'keras>=2.2.2',
'PTable',
'scikit-learn>=0.19.1',
'spacy>=2.0.11',
