Merge pull request #24 from BaderLab/fix-embedding-loading
Improve pre-trained embeddings loading

Former-commit-id: 709d6860a493c67fbf188ef8e52c47dbb547b368 [formerly 877737dc69044c4faa6d2c97bd7616493a67db7d] [formerly e86cfaf5d153397b298d203ce2cef18c827a9af9 [formerly d7a63b2]]
Former-commit-id: d26724f8c28f149560b644c3a8fa331980c8e1d9 [formerly 7c938cafeb9ec2347d6009db378e41722f8c1fbb]
Former-commit-id: 0362b1994af23ba803124dd9606b7317e3ff2c03
JohnGiorgi authored Aug 14, 2018
2 parents d0b4228 + 3c29919 commit 2667931
Showing 7 changed files with 60 additions and 81 deletions.
30 changes: 19 additions & 11 deletions README.md
@@ -242,25 +242,33 @@ Corpora are collected in [here](https://github.com/BaderLab/Biomedical-Corpora)

### Word embeddings

When training new models, you can (and should) provide your own pre-trained word embeddings with the `token_pretrained_embedding_filepath` argument (either at the command line or in the configuration file). [Pyysalo _et al_. 2013](https://pdfs.semanticscholar.org/e2f2/8568031e1902d4f8ee818261f0f2c20de6dd.pdf) provide word embeddings that work quite well in the biomedical domain, which can be downloaded [here](http://bio.nlplab.org).

Once downloaded, you will need to convert them from `.bin` to `.txt` format:
When training new models, you can (and should) provide your own pre-trained word embeddings with the `pretrained_embeddings` argument (either at the command line or in the configuration file). Saber expects all word embeddings to be in the `word2vec` file format. [Pyysalo _et al_. 2013](https://pdfs.semanticscholar.org/e2f2/8568031e1902d4f8ee818261f0f2c20de6dd.pdf) provide word embeddings that work quite well in the biomedical domain, which can be downloaded [here](http://bio.nlplab.org). Alternatively, download them from the command line:

```bash
(saber) $ cd saber
(saber) $ pip install gensim
(saber) $ python
>> from generic_utils import bin_to_txt
>> path_to_embeddings = '/path/to/wikipedia-pubmed-and-PMC-w2v.bin'
>> bin_to_txt('wikipedia-pubmed-and-PMC-w2v.bin', output_dir='path/to/word_embeddings')

mkdir saber/word_embeddings
cd saber/word_embeddings
# Note: this file is over 4GB
wget http://evexdb.org/pmresources/vec-space-models/wikipedia-pubmed-and-PMC-w2v.bin
```
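
If you want to sanity-check the download, here is a minimal sketch (not part of this commit) using `gensim`, which the new loading code also relies on; the small `limit` is only to keep the check fast:

```python
from gensim.models import KeyedVectors

# Load just the first 10,000 vectors of the ~4GB binary file as a quick format check.
vectors = KeyedVectors.load_word2vec_format(
    'wikipedia-pubmed-and-PMC-w2v.bin', binary=True, limit=10000)
print(vectors.vector_size)   # embedding dimensionality
print(len(vectors.vocab))    # number of vectors actually loaded (10,000 here)
```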

> Note: you do not need to download pre-trained word embeddings if you only plan on using Saber's pre-trained models.
#### GloVe

To use [GloVe](https://nlp.stanford.edu/projects/glove/) embeddings, just convert them to the [word2vec](https://code.google.com/archive/p/word2vec/) format first:

```bash
(saber) $ python
>>> from gensim.scripts.glove2word2vec import glove2word2vec
>>> glove_input_file = 'glove.txt'
>>> word2vec_output_file = 'word2vec.txt'
>>> glove2word2vec(glove_input_file, word2vec_output_file)
```
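
To confirm the converted file is usable, it should load with the same gensim reader (a quick check only; the file name is taken from the snippet above):

```python
from gensim.models import KeyedVectors

# The converted GloVe vectors are plain-text word2vec, hence binary=False.
vectors = KeyedVectors.load_word2vec_format('word2vec.txt', binary=False)
print(vectors.vector_size)   # should match the original GloVe dimensionality
```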

## Running tests

Saber's test suite can be found in `Saber/saber/tests`. In order to run the tests, you'll usually want to clone the repository locally. Make sure to install all required development dependencies defined in ``requirements.txt`` (see [Installation](#Installation) for more help). Additionally, you will need to install ``pytest``:

```bash
(saber) $ pip install pytest
```
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,4 +1,5 @@
flask>=1.0.2
gensim>=3.4.0
# Keras contrib repo
git+https://www.github.com/keras-team/keras-contrib.git#egg=keras_contrib
# Spacy small english language model
64 changes: 32 additions & 32 deletions saber/sequence_processor.py
@@ -7,6 +7,7 @@
import os
import time

from gensim.models import KeyedVectors
import numpy as np
from spacy import displacy

@@ -292,9 +293,13 @@ def _load_compound_dataset(self):

return compound_ds

def load_embeddings(self):
def load_embeddings(self, binary=True):
"""Coordinates the loading of pre-trained token embeddings.
Args:
binary (bool): True if pre-trained embeddings are in C binary format, False if they are
in C text format.
Raises:
MissingStepException: if no dataset has been loaded.
ValueError: If 'self.config.pretrained_embeddings' is None.
@@ -308,7 +313,7 @@ def load_embeddings(self):
self.log.error('ValueError: %s', err_msg)
raise ValueError(err_msg)

self._load_token_embeddings()
self._load_token_embeddings(binary)

return self

@@ -387,57 +392,52 @@ def fit(self):
# train_history = pd.DataFrame(train_history.history)
# return train_history

def _load_token_embeddings(self):
def _load_token_embeddings(self, binary=True):
"""Coordinates the loading of pre-trained token embeddings.
Coordinates the loading of pre-trained token embeddings by reading in the file containing
the token embeddings and created an embedding matrix whos ith row corresponds to the token
the token embeddings and creating an embedding matrix whose ith row corresponds to the token
embedding for the ith word in the model's word-to-idx mapping.
Args:
binary (bool): True if pre-trained embeddings are in C binary format, False if they are
in C text format.
"""
start_time = time.time()
start = time.time()
print('Loading embeddings... ', end='', flush=True)

# prepare the embedding indices
embedding_idx = self._prepare_token_embedding_layer()
embedding_idx = self._prepare_token_embedding_layer(binary)
embedding_dim = len(list(embedding_idx.values())[0])
# create the embedding matrix, update attribute
embedding_matrix = self._prepare_token_embedding_matrix(embedding_idx, embedding_dim)
self.token_embedding_matrix = embedding_matrix

elapsed_time = time.time() - start_time
print('Done ({0:.2f} seconds)'.format(elapsed_time))
end = time.time() - start
print('Done ({0:.2f} seconds)'.format(end))
print('Found {} word vectors of dimension {}'.format(len(embedding_idx), embedding_dim))
self.log.info('Loaded %i word vectors of dimension %i', len(embedding_idx), embedding_dim)

def _prepare_token_embedding_layer(self):
def _prepare_token_embedding_layer(self, binary=True):
"""Creates an embedding index using pretrained token embeddings.
For the models given pretrained token embeddings, creates and returns a dictionary mapping
words to known embeddings.
For the pretrained word embeddings given at `self.config.pretrained_embeddings`, creates
and returns a dictionary mapping words to embeddings, or word vectors. Note that if
`self.config.debug` is True, only the first 10K vectors are loaded.
Args:
binary (bool): True if pre-trained embeddings are in C binary format, False if they are
in C text format.
Returns:
embedding_idx (dict): mapping of words to pre-trained token
embeddings
embed_idx (dict): mapping of words to pre-trained word embeddings
"""
# acc
embedding_idx = {}

# open pre-trained token embedding file for reading
with open(self.config.pretrained_embeddings, 'r') as pte:
for i, line in enumerate(pte):
# split line, get word and its embedding
values = line.split()
word = values[0]
coefs = np.asarray(values[1:], dtype='float32')

# update our embedding index
embedding_idx[word] = coefs

# if debug, load a small, arbitrary number of word embeddings
if i >= 10000 and self.config.debug:
break

return embedding_idx
limit = 10000 if self.config.debug else None
vectors = KeyedVectors.load_word2vec_format(self.config.pretrained_embeddings,
binary=binary,
limit=limit)
embed_idx = {word: vectors[word] for word in vectors.vocab}
return embed_idx

def _prepare_token_embedding_matrix(self, embedding_idx, embedding_dim):
"""Creates an embedding matrix using pretrained token embeddings.
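For context, roughly how the new `binary` flag is exercised end to end — a sketch only, assuming `sp` is a `SequenceProcessor` with a dataset already loaded and `config.pretrained_embeddings` pointing at a word2vec-format file (mirroring the updated tests below):

```python
# Text-format embeddings (e.g. GloVe converted to word2vec) use binary=False;
# the default binary=True expects the C binary .bin format.
sp.load_embeddings(binary=False)
print(sp.token_embedding_matrix.shape)  # (vocabulary size, embedding dimension)
```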
@@ -1,4 +1,5 @@
4 2
</s> 0.00170094 0.000166173
the 0.155801 -0.0710875
, -0.0593159 0.102907
. -0.010395 0.120041
. -0.010395 0.120041
8 changes: 3 additions & 5 deletions saber/tests/test_sequence_processor.py
@@ -118,15 +118,13 @@ def test_token_embeddings_load(sp_sing_ds_no_embed, sp_compound_ds_no_embed):
"""Asserts that pre-trained token embeddings are loaded correctly when
SequenceProcessor.load_embeddings() is called"""
# load embeddings for each model
sp_sing_ds_no_embed.load_embeddings()
sp_compound_ds_no_embed.load_embeddings()

print(sp_compound_ds_no_embed.token_embedding_matrix)
sp_sing_ds_no_embed.load_embeddings(binary=False)
sp_compound_ds_no_embed.load_embeddings(binary=False)

# check type
assert isinstance(sp_sing_ds_no_embed.token_embedding_matrix, numpy.ndarray)
assert isinstance(sp_compound_ds_no_embed.token_embedding_matrix, numpy.ndarray)
# check value
# check shape
assert sp_sing_ds_no_embed.token_embedding_matrix.shape == DUMMY_EMBEDDINGS_MATRIX_SHAPE
assert sp_compound_ds_no_embed.token_embedding_matrix.shape == DUMMY_EMBEDDINGS_MATRIX_SHAPE

30 changes: 0 additions & 30 deletions saber/utils/generic_utils.py
@@ -1,6 +1,5 @@
"""A collection of generic helper/utility functions.
"""
import codecs
import errno
import logging
import os
@@ -74,35 +73,6 @@ def compress_model(dir_path):

return True

def bin_to_txt(filepath, output_dir=os.getcwd()):
"""Converts word embeddings given in the binary C format (w2v) to a simple
text format that can be used with Saber.
Args:
filepath (str): path to the word vectors file in binary C (w2v) format
output_dir (str): path to save converted word vectors file (defaults to
current working directory)
"""
# bad practice, but requires gensim to be installed when using
# SequenceProcessor if import statement is at the top of this file
from gensim.models.keyedvectors import KeyedVectors

# load word vectors provided in C binary format
word_vectors = KeyedVectors.load_word2vec_format(filepath, binary=True)
vocab = word_vectors.vocab

# create a new filepath
base_name = os.path.splitext(os.path.basename(filepath))[0]
output_filepath = os.path.join(output_dir, base_name + '.txt')

# write contents of input_file to new file output_filepath in txt format
with codecs.open(output_filepath, 'w+', encoding='utf-8') as out_file:
for word in vocab:
vector = word_vectors[word]
out_file.write("%s %s\n" %(word, " ".join(str(v) for v in vector)))

print('[INFO] Converted C binary file saved to {}'.format(output_filepath))

def get_pretrained_model_dir(config):
"""Returns path to top-level directory to save a pretrained model.
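With `bin_to_txt` gone, a text copy of the embeddings can still be produced directly with gensim if one is ever needed — a sketch, not part of this commit:

```python
from gensim.models import KeyedVectors

# Read the C-binary embeddings and write them back out in word2vec text format.
vectors = KeyedVectors.load_word2vec_format('wikipedia-pubmed-and-PMC-w2v.bin', binary=True)
vectors.save_word2vec_format('wikipedia-pubmed-and-PMC-w2v.txt', binary=False)
```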
5 changes: 3 additions & 2 deletions setup.py
@@ -12,7 +12,7 @@
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/BaderLab/Saber",
python_requires='>=3.5',
python_requires='>=3.6',
packages=setuptools.find_packages(),
classifiers=(
"Programming Language :: Python :: 3",
@@ -21,7 +21,8 @@
),
install_requires=[
'flask>=1.0.2',
'keras>=2.2.0',
'gensim>=3.4.0',
'keras>=2.2.2',
'PTable',
'scikit-learn>=0.19.1',
'spacy>=2.0.11',
