In [None]:
import torch
torch.cuda.is_available()

True

In [None]:
%pip install pyxdameraulevenshtein
from collections import defaultdict
import numpy as np
import difflib
import pandas as pd

try:
    from pyxdameraulevenshtein import damerau_levenshtein_distance_withNPArray
except ImportError:
    pass


class Corpus():
    _keys_frequency = None

    def __init__(self, out_of_vocabulary=-1, skip=-2):
        """ The Corpus helps with tasks involving integer representations of
        words. This object is used to filter, subsample, and convert loose
        word indices to compact word indices.

        'Loose' word arrays are word indices given by a tokenizer. The word
        index is not necessarily representative of word's frequency rank, and
        so loose arrays tend to have 'gaps' of unused indices, which can make
        models less memory efficient. As a result, this class helps convert
        a loose array to a 'compact' one where the most common words have low
        indices, and the most infrequent have high indices.

        Corpus maintains a count of how many of each word it has seen so
        that it can later selectively filter frequent or rare words. However,
        since word popularity rank could change with incoming data the word
        index count must be updated fully and `self.finalize()` must be called
        before any filtering and subsampling operations can happen.

        Arguments
        ---------
        out_of_vocabulary : int, default=-1
            Token index to replace whenever we encounter a rare or unseen word.
            Instead of skipping the token, we mark as an out of vocabulary
            word.
        skip : int, default=-2
            Token index to replace whenever we want to skip the current frame.
            Particularly useful when subsampling words or when padding a
            sentence.

        Examples
        --------
        >>> corpus = Corpus()
        >>> words_raw = np.random.randint(100, size=25)
        >>> corpus.update_word_count(words_raw)
        >>> corpus.finalize()
        >>> words_compact = corpus.to_compact(words_raw)
        >>> words_pruned = corpus.filter_count(words_compact, min_count=2)
        >>> # words_sub = corpus.subsample_frequent(words_pruned, thresh=1e-5)
        >>> words_loose = corpus.to_loose(words_pruned)
        >>> not_oov = words_loose > -1
        >>> np.all(words_loose[not_oov] == words_raw[not_oov])
        True
        """
        self.counts_loose = defaultdict(int)
        self._finalized = False
        self.specials = dict(out_of_vocabulary=out_of_vocabulary,
                             skip=skip)

    @property
    def n_specials(self):
        return len(self.specials)

    def update_word_count(self, loose_array):
        """ Update the corpus word counts given a loose array of word indices.
        Can be called multiple times, but once `finalize` is called the word
        counts cannot be updated.

        Arguments
        ---------
        loose_array : int array
            Array of word indices.

        Examples
        --------
        >>> corpus = Corpus()
        >>> corpus.update_word_count(np.arange(10))
        >>> corpus.update_word_count(np.arange(8))
        >>> corpus.counts_loose[0]
        2
        >>> corpus.counts_loose[9]
        1
        """
        self._check_unfinalized()
        uniques, counts = np.unique(np.ravel(loose_array), return_counts=True)
        msg = "Loose arrays cannot have elements below the values of special "
        msg += "tokens as these indices are reserved"
        assert uniques.min() >= min(self.specials.values()), msg
        for k, v in zip(uniques, counts):
            self.counts_loose[k] += v

    def _loose_keys_ordered(self):
        """ Get the loose keys in order of decreasing frequency"""
        loose_counts = sorted(self.counts_loose.items(), key=lambda x: x[1],
                              reverse=True)
        keys = np.array(loose_counts)[:, 0]
        counts = np.array(loose_counts)[:, 1]
        order = np.argsort(counts)[::-1].astype('int32')
        keys, counts = keys[order], counts[order]
        # Add in the specials as a prefix to the other keys
        specials = np.sort(self.specials.values())
        keys = np.concatenate((specials, keys))
        empty = np.zeros(len(specials), dtype='int32')
        counts = np.concatenate((empty, counts))
        n_keys = keys.shape[0]
        assert counts.min() >= 0
        return keys, counts, n_keys

    def finalize(self):
        """ Call `finalize` once done updating word counts. This means the
        object will no longer accept new word count data, but the loose
        to compact index mapping can be computed. This frees the object to
        filter, subsample, and compactify incoming word arrays.

        Examples
        --------
        >>> corpus = Corpus()
        >>> # We'll update the word counts, making sure that word index 2
        >>> # is the most common word index.
        >>> corpus.update_word_count(np.arange(1) + 2)
        >>> corpus.update_word_count(np.arange(3) + 2)
        >>> corpus.update_word_count(np.arange(10) + 2)
        >>> corpus.update_word_count(np.arange(8) + 2)
        >>> corpus.counts_loose[2]
        4
        >>> # The corpus has not been finalized yet, and so the compact mapping
        >>> # has not yet been computed.
        >>> corpus.keys_counts[0]
        Traceback (most recent call last):
            ...
        AttributeError: Corpus instance has no attribute 'keys_counts'
        >>> corpus.finalize()
        >>> corpus.n_specials
        2
        >>> # The special tokens are mapped to the first compact indices
        >>> corpus.compact_to_loose[0]
        -2
        >>> corpus.compact_to_loose[0] == corpus.specials['skip']
        True
        >>> corpus.compact_to_loose[1] == corpus.specials['out_of_vocabulary']
        True
        >>> corpus.compact_to_loose[2]  # Most popular token is mapped next
        2
        >>> corpus.loose_to_compact[3]  # 2nd most popular token is mapped next
        4
        >>> first_non_special = corpus.n_specials
        >>> corpus.keys_counts[first_non_special] # First normal token
        4
        """
        # Return the loose keys and counts in descending count order
        # so that the counts arrays is already in compact order
        self.keys_loose, self.keys_counts, n_keys = self._loose_keys_ordered()
        self.keys_compact = np.arange(n_keys).astype('int32')
        self.loose_to_compact = {l: c for l, c in
                                 zip(self.keys_loose, self.keys_compact)}
        self.compact_to_loose = {c: l for l, c in
                                 self.loose_to_compact.items()}
        self.specials_to_compact = {s: self.loose_to_compact[i]
                                    for s, i in self.specials.items()}
        self.compact_to_special = {c: s for c, s in
                                   self.specials_to_compact.items()}
        self._finalized = True

    @property
    def keys_frequency(self):
        if self._keys_frequency is None:
            f = self.keys_counts * 1.0 / np.sum(self.keys_counts)
            self._keys_frequency = f
        return self._keys_frequency

    def _check_finalized(self):
        msg = "self.finalized() must be called before any other array ops"
        assert self._finalized, msg

    def _check_unfinalized(self):
        msg = "Cannot update word counts after self.finalized()"
        msg += "has been called"
        assert not self._finalized, msg

    def filter_count(self, words_compact, min_count=15, max_count=0,
                     max_replacement=None, min_replacement=None):
        """ Replace word indices below min_count with the pad index.

        Arguments
        ---------
        words_compact: int array
            Source array whose values will be replaced. This is assumed to
            already be converted into a compact array with `to_compact`.
        min_count : int
            Replace words less frequently occuring than this count. This
            defines the threshold for what words are very rare
        max_count : int
            Replace words occuring more frequently than this count. This
            defines the threshold for very frequent words
        min_replacement : int, default is out_of_vocabulary
            Replace words less than min_count with this.
        max_replacement : int, default is out_of_vocabulary
            Replace words greater than max_count with this.

        Examples
        --------
        >>> corpus = Corpus()
        >>> # Make 1000 word indices with index < 100 and
        >>> # update the word counts.
        >>> word_indices = np.random.randint(100, size=1000)
        >>> corpus.update_word_count(word_indices)
        >>> corpus.finalize()  # any word indices above 99 will be filtered
        >>> # Now create a new text, but with some indices above 100
        >>> word_indices = np.random.randint(200, size=1000)
        >>> word_indices.max() < 100
        False
        >>> # Remove words that have never appeared in the original corpus.
        >>> filtered = corpus.filter_count(word_indices, min_count=1)
        >>> filtered.max() < 100
        True
        >>> # We can also remove highly frequent words.
        >>> filtered = corpus.filter_count(word_indices, max_count=2)
        >>> len(np.unique(word_indices)) > len(np.unique(filtered))
        True
        """
        self._check_finalized()
        ret = words_compact.copy()
        if min_replacement is None:
            min_replacement = self.specials_to_compact['out_of_vocabulary']
        if max_replacement is None:
            max_replacement = self.specials_to_compact['out_of_vocabulary']
        not_specials = np.ones(self.keys_counts.shape[0], dtype='bool')
        not_specials[:self.n_specials] = False
        if min_count:
            # Find first index with count less than min_count
            min_idx = np.argmax(not_specials & (self.keys_counts < min_count))
            # Replace all indices greater than min_idx
            ret[ret > min_idx] = min_replacement
        if max_count:
            # Find first index with count less than max_count
            max_idx = np.argmax(not_specials & (self.keys_counts < max_count))
            # Replace all indices less than max_idx
            ret[ret < max_idx] = max_replacement
        return ret

    def subsample_frequent(self, words_compact, threshold=1e-5):
        r""" Randomly drop occurrences of frequent words.

        Each element of `words_compact` is replaced by the compact `skip`
        token with a probability that grows with the word's relative
        frequency :math:`f(w)` in the counted corpus:

        .. math::
            p(w) = 1.0 - \left(\sqrt{threshold\over f(w)}
                               + {threshold\over f(w)}\right)

        A word is skipped when :math:`p(w)` exceeds a fresh uniform draw in
        [0, 1), so words for which the expression is zero or negative are
        always kept.

        Arguments
        ---------
        words_compact: int array
            The input array to subsample. It is not modified; a copy is
            returned.
        threshold: float in [0, 1]
            Words with frequencies higher than this will be increasingly
            subsampled.

        Returns
        -------
        sampled : int array
            Copy of `words_compact` with the dropped positions set to the
            compact index of the `skip` token.

        Examples
        --------
        >>> corpus = Corpus()
        >>> word_indices = (np.random.power(5.0, size=1000) * 100).astype('i')
        >>> corpus.update_word_count(word_indices)
        >>> corpus.finalize()
        >>> compact = corpus.to_compact(word_indices)
        >>> sampled = corpus.subsample_frequent(compact, threshold=1e-2)
        >>> skip = corpus.specials_to_compact['skip']
        >>> np.sum(compact == skip)  # No skips in the compact tokens
        0
        >>> np.sum(sampled == skip) > 0  # Many skips in the sampled tokens
        True

        .. [1] Distributed Representations of Words and Phrases and
               their Compositionality. Mikolov, Tomas and Sutskever, Ilya
               and Chen, Kai and Corrado, Greg S and Dean, Jeff
               Advances in Neural Information Processing Systems 26
        """
        self._check_finalized()
        # The small epsilon keeps zero-count keys (the specials) from
        # dividing by zero.
        ratio = threshold / (self.keys_frequency + 1e-10)
        skip_probability = 1.0 - (np.sqrt(ratio) + ratio)
        # Spread the per-key probability out to one value per input element.
        per_token = fast_replace(words_compact, self.keys_compact,
                                 skip_probability)
        uniform = np.random.uniform(size=per_token.shape)
        sampled = words_compact.copy()
        skip_token = self.specials_to_compact['skip']
        sampled[per_token > uniform] = skip_token
        return sampled

    def to_compact(self, word_loose):
        """ Translate a loose word array into compact indices.

        Uses the fixed loose-to-compact mapping built by `finalize`. Loose
        indices that were never counted are mapped to the compact index of
        the out-of-vocabulary token. Because the special tokens occupy the
        first compact slots, the most common counted word ends up at index
        `n_specials`, the next most common one after it, and so on.

        Arguments
        ---------
        word_loose : int array
            Input loose word array to be converted into a compact array.

        Returns
        -------
        compact : int array
            Array of compact indices with the same shape as `word_loose`.

        Raises
        ------
        AssertionError
            If the corpus is not finalized, or if a negative compact index
            is produced.

        Examples
        --------
        >>> corpus = Corpus()
        >>> word_indices = np.random.randint(100, size=1000)
        >>> n_words = len(np.unique(word_indices))
        >>> corpus.update_word_count(word_indices)
        >>> corpus.finalize()
        >>> word_compact = corpus.to_compact(word_indices)
        >>> # The most common word in the training set will be mapped to be
        >>> # right after all the special tokens, so 2 in this case.
        >>> np.argmax(np.bincount(word_compact)) == 2
        True
        >>> most_common = np.argmax(np.bincount(word_indices))
        >>> corpus.loose_to_compact[most_common] == 2
        True
        >>> # Out of vocabulary indices will be mapped to 1
        >>> word_indices = np.random.randint(150, size=1000)
        >>> word_compact_oov = corpus.to_compact(word_indices)
        >>> oov = corpus.specials_to_compact['out_of_vocabulary']
        >>> oov
        1
        >>> oov in word_compact
        False
        >>> oov in word_compact_oov
        True
        """
        self._check_finalized()
        present = np.unique(word_loose)
        # Loose indices in the input that the corpus has never counted.
        unseen = np.setdiff1d(present, self.keys_loose, assume_unique=True)
        oov_token = self.specials_to_compact['out_of_vocabulary']
        unseen_targets = np.zeros_like(unseen) + oov_token
        # Extend the lookup table so every unseen index resolves to OOV.
        lookup_keys = np.concatenate((self.keys_loose, unseen))
        lookup_values = np.concatenate((self.keys_compact, unseen_targets))
        compact = fast_replace(word_loose, lookup_keys, lookup_values)
        msg = "Error: all compact indices should be non-negative"
        assert compact.min() >= 0, msg
        return compact

    def to_loose(self, word_compact):
        """ Convert a compacted array back into a loose array.

        Arguments
        ---------
        word_compact : int array
            Input compacted word array to be converted into a loose array.


        Examples
        --------
        >>> corpus = Corpus()
        >>> word_indices = np.random.randint(100, size=1000)
        >>> corpus.update_word_count(word_indices)
        >>> corpus.finalize()
        >>> word_compact = corpus.to_compact(word_indices)
        >>> word_loose = corpus.to_loose(word_compact)
        >>> np.all(word_loose == word_indices)
        True
        """
        self._check_finalized()
        uniques = np.unique(word_compact)
        # Find the out of vocab indices
        oov = np.setdiff1d(uniques, self.keys_compact, assume_unique=True)
        msg = "Found keys in `word_compact` not present in the"
        msg += "training corpus. Is this actually a compacted array?"
        assert np.all(oov < 0), msg
        loose = fast_replace(word_compact, self.keys_compact, self.keys_loose)
        return loose

    def compact_to_flat(self, word_compact, *components):
        """ Ravel a 2D compact array of documents (rows) and word
        positions (columns) into a 1D array of words. Leave out special
        tokens and ravel the component arrays in the same fashion.

        Arguments
        ---------
        word_compact : int array
            Array of word indices in documents. Has shape (n_docs, max_length)
        components : list of arrays
            A list of arrays detailing per-document properties. Each array
            must n_docs long.

        Returns
        -------
        flat : int array
            An array of all words unravelled into a 1D shape
        components : list of arrays
            Each array here is also unravelled into the same shape

        Examples
        --------
        >>> corpus = Corpus()
        >>> word_indices = np.random.randint(100, size=1000)
        >>> corpus.update_word_count(word_indices)
        >>> corpus.finalize()
        >>> doc_texts = np.arange(8).reshape((2, 4))
        >>> doc_texts[:, -1] = -2  # Mark as skips
        >>> doc_ids = np.arange(2)
        >>> compact = corpus.to_compact(doc_texts)
        >>> oov = corpus.specials_to_compact['out_of_vocabulary']
        >>> compact[1, 3] = oov  # Mark the last word as OOV
        >>> flat = corpus.compact_to_flat(compact)
        >>> flat.shape[0] == 6  # 2 skips were dropped from 8 words
        True
        >>> flat[-1] == corpus.loose_to_compact[doc_texts[1, 2]]
        True
        >>> flat, (flat_id,) = corpus.compact_to_flat(compact, doc_ids)
        >>> flat_id
        array([0, 0, 0, 1, 1, 1])
        """
        self._check_finalized()
        n_docs = word_compact.shape[0]
        max_length = word_compact.shape[1]
        idx = word_compact > self.n_specials
        components_raveled = []
        msg = "Length of each component must much `word_compact` size"
        for component in components:
            raveled = np.tile(component[:, None], max_length)[idx]
            components_raveled.append(raveled)
            assert len(component) == n_docs, msg
        if len(components_raveled) == 0:
            return word_compact[idx]
        else:
            return word_compact[idx], components_raveled

    def word_list(self, vocab, max_compact_index=None, oov_token='<OoV>'):
        """ Translate compact keys back into string representations for a word.

        Arguments
        ---------
        vocab : dict
            The vocab object has loose indices as keys and word strings as
            values.

        max_compact_index : int
            Only return words up to this index. If None, defaults to the number
            of compact indices available

        oov_token : str
            Returns this string if a compact index does not have a word in the
            vocab dictionary provided.

        Returns
        -------
        word_list : list
            A list of strings representations corresponding to word indices
            zero to `max_compact_index`

        Examples
        --------

        >>> vocab = {0: 'But', 1: 'the', 2: 'night', 3: 'was', 4: 'warm'}
        >>> word_indices = np.zeros(50).astype('int32')
        >>> word_indices[:25] = 0  # 'But' shows 25 times
        >>> word_indices[25:35] = 1  # 'the' is in 10 times
        >>> word_indices[40:46] = 2  # 'night' is in 6 times
        >>> word_indices[46:49] = 3  # 'was' is in 3 times
        >>> word_indices[49:] = 4  # 'warm' in in 2 times
        >>> corpus = Corpus()
        >>> corpus.update_word_count(word_indices)
        >>> corpus.finalize()
        >>> # Build a vocabulary of word indices
        >>> corpus.word_list(vocab)
        ['skip', 'out_of_vocabulary', 'But', 'the', 'night', 'was', 'warm']
        """
        # Translate the compact keys into string words
        oov = self.specials['out_of_vocabulary']
        words = []
        if max_compact_index is None:
            max_compact_index = self.keys_compact.shape[0]
        index_to_special = {i: s for s, i in self.specials.items()}
        for compact_index in range(max_compact_index):
            loose_index = self.compact_to_loose.get(compact_index, oov)
            special = index_to_special.get(loose_index, oov_token)
            string = vocab.get(loose_index, special)
            words.append(string)
        return words

    def compact_word_vectors(self, vocab, filename=None, array=None,
                             top=20000):
        """ Retrieve pretrained word vectors for our vocabulary.
        The returned word array has row indices corresponding to the
        compact index of a word, and columns corresponding to the word
        vector.

        Vectors are loaded from a binary word2vec file through gensim. Each
        vocabulary word is looked up as-is, then with spaces replaced by
        underscores, then title-cased with underscores. If none of those
        match, the closest model key by Damerau-Levenshtein distance (among
        keys whose length is within 3 characters) is used and the
        substitution is printed.

        Arguments
        ---------
        vocab : dict
            Dictionary where keys are the loose index, and values are
            the word string.

        filename : str
            Path of the binary word2vec file handed to gensim.
            NOTE(review): the default of None is passed straight to the
            loader; presumably callers always supply a path -- confirm.

        array : numpy float array, optional
            If given, rows are written into this array instead of a freshly
            drawn random matrix.

        top : int
            Only compact indices in `range(top)` are considered.

        Returns
        -------
        data : numpy float array
            Array such that data[compact_index, :] = word_vector
        s : int
            Number of words for which a vector was written.
        f : int
            Number of words for which no vector could be found. Compact
            indices missing from the corpus or from `vocab` are counted in
            neither `s` nor `f`.

        Examples
        --------
        >>> import numpy.linalg as nl
        >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
        >>> word_indices = np.zeros(50).astype('int32')
        >>> word_indices[:25] = 19  # 'Shuttle' shows 25 times
        >>> word_indices[25:35] = 5  # 'astronomy' is in 10 times
        >>> word_indices[40:46] = 7  # 'cold' is in 6 times
        >>> word_indices[46:] = 3  # 'hot' is in 4 times
        >>> corpus = Corpus()
        >>> corpus.update_word_count(word_indices)
        >>> corpus.finalize()
        >>> # Needs a word2vec binary on disk, hence skipped as a doctest
        >>> v, s, f = corpus.compact_word_vectors(
        ...     vocab, filename='vectors.bin')  # doctest: +SKIP
        >>> sim = lambda x, y: np.dot(x, y) / nl.norm(x) / nl.norm(y)
        >>> vocab[corpus.compact_to_loose[2]]
        'shuttle'
        >>> vocab[corpus.compact_to_loose[3]]
        'astronomy'
        >>> vocab[corpus.compact_to_loose[4]]
        'cold'
        >>> sim_shuttle_astro = sim(v[2, :], v[3, :])  # doctest: +SKIP
        >>> sim_shuttle_cold = sim(v[2, :], v[4, :])  # doctest: +SKIP
        >>> sim_shuttle_astro > sim_shuttle_cold  # doctest: +SKIP
        True
        """
        n_words = len(self.compact_to_loose)
        # Imported lazily so gensim is only required when this method runs.
        # NOTE(review): `load_word2vec_format`, `syn0` and `vocab` on the
        # model look like an older gensim API -- confirm the pinned version.
        from gensim.models.word2vec import Word2Vec
        model = Word2Vec.load_word2vec_format(filename, binary=True)
        n_dim = model.syn0.shape[1]
        # Random fallback rows for words that end up without a pretrained
        # vector, rescaled to the standard deviation of the model's vectors.
        # NOTE(review): the mean shift is applied before the rescale, so the
        # final mean is not exactly `model.syn0.mean()` -- confirm intended.
        data = np.random.normal(size=(n_words, n_dim)).astype('float32')
        data -= data.mean()
        data += model.syn0.mean()
        data /= data.std()
        data *= model.syn0.std()
        if array is not None:
            # Caller-supplied storage replaces the random matrix entirely.
            data = array
            n_words = data.shape[0]
        # Model keys plus their lengths, used for the fuzzy fallback below.
        keys_raw = model.vocab.keys()
        keys = [s.encode('ascii', 'ignore') for s in keys_raw]
        lens = [len(s) for s in model.vocab.keys()]
        choices = np.array(keys, dtype='S')
        lengths = np.array(lens, dtype='int32')
        # s counts words that received a vector, f counts words that did not.
        s, f = 0, 0
        # Spelling variants tried in order before falling back to fuzzy
        # matching.
        rep0 = lambda w: w
        rep1 = lambda w: w.replace(' ', '_')
        rep2 = lambda w: w.title().replace(' ', '_')
        reps = [rep0, rep1, rep2]
        for compact in np.arange(top):
            loose = self.compact_to_loose.get(compact, None)
            if loose is None:
                continue
            word = vocab.get(loose, None)
            if word is None:
                continue
            word = word.strip()
            vector = None
            for rep in reps:
                clean = rep(word)
                if clean in model.vocab:
                    vector = model[clean]
                    break
            if vector is None:
                try:
                    # NOTE(review): `unicode` is a Python 2 builtin; on
                    # Python 3 this raises NameError, which the
                    # `except IndexError` below does not catch. Indexing
                    # `np.array(keys_raw)` likewise assumes `keys()` returns
                    # a list -- confirm the target interpreter.
                    word = unicode(word)
                    # Only compare against keys of similar length.
                    idx = lengths >= len(word) - 3
                    idx &= lengths <= len(word) + 3
                    sel = choices[idx]
                    d = damerau_levenshtein_distance_withNPArray(word, sel)
                    choice = np.array(keys_raw)[idx][np.argmin(d)]
                    # choice = difflib.get_close_matches(word, choices)[0]
                    vector = model[choice]
                    print(compact, word, ' --> ', choice)
                except IndexError:
                    # Best effort: the word is counted as failed below.
                    pass
            if vector is None:
                f += 1
                continue
            s += 1
            data[compact, :] = vector[:]
        return data, s, f

    def compact_to_bow(self, word_compact, max_compact_index=None):
        """ Given a 2D array of compact indices, return the bag of words
        representation where the column is the word index, row is the document
        index, and the value is the number of times that word appears in that
        document.

        >>> import numpy.linalg as nl
        >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'}
        >>> word_indices = np.zeros(50).astype('int32')
        >>> word_indices[:25] = 19  # 'Shuttle' shows 25 times
        >>> word_indices[25:35] = 5  # 'astronomy' is in 10 times
        >>> word_indices[40:46] = 7  # 'cold' is in 6 times
        >>> word_indices[46:] = 3  # 'hot' is in 3 times
        >>> corpus = Corpus()
        >>> corpus.update_word_count(word_indices)
        >>> corpus.finalize()
        >>> v = corpus.compact_to_bow(word_indices)
        >>> len(v)
        20
        >>> v[:6]
        array([ 5,  0,  0,  4,  0, 10])
        >>> v[19]
        25
        >>> v.sum()
        50
        >>> words = [[0, 0, 0, 3, 4], [1, 1, 1, 4, 5]]
        >>> words = np.array(words)
        >>> bow = corpus.compact_to_bow(words)
        >>> bow.shape
        (2, 6)
        """
        if max_compact_index is None:
            max_compact_index = word_compact.max()

        def bincount(x):
            return np.bincount(x, minlength=max_compact_index + 1)
        axis = len(word_compact.shape) - 1
        bow = np.apply_along_axis(bincount, axis, word_compact)
        return bow

    def compact_to_coocurrence(self, word_compact, indices, window_size=10):
        """ From an array of compact tokens and aligned array of document indices
        compute (word, word, document) co-occurrences within a moving window.

        Arguments
        ---------
        word_compact: int array
        Sequence of tokens.

        indices: dict of int arrays
        Each array in this dictionary should represent the document index it
        came from.

        window_size: int
        Indicates the moving window size around which all co-occurrences will
        be computed.

        Returns
        -------
        counts : DataFrame
        Returns a DataFrame with two columns for word index A and B,
        one extra column for each document index, and a final column for counts
        in that key.

        >>> compact = np.array([0, 1, 1, 1, 2, 2, 3, 0])
        >>> doc_idx = np.array([0, 0, 0, 0, 1, 1, 1, 1])
        >>> corpus = Corpus()
        >>> counts = corpus.compact_to_coocurrence(compact, {'doc': doc_idx})
        >>> counts.counts.sum()
        24
        >>> counts.query('doc == 0').counts.values
        array([3, 3, 6])
        >>> compact = np.array([0, 1, 1, 1, 2, 2, 3, 0])
        >>> doc_idx = np.array([0, 0, 0, 1, 1, 2, 2, 2])
        >>> corpus = Corpus()
        >>> counts = corpus.compact_to_coocurrence(compact, {'doc': doc_idx})
        >>> counts.counts.sum()
        14
        >>> counts.query('doc == 0').word_index_x.values
        array([0, 1, 1])
        >>> counts.query('doc == 0').word_index_y.values
        array([1, 0, 1])
        >>> counts.query('doc == 0').counts.values
        array([2, 2, 2])
        >>> counts.query('doc == 1').counts.values
        array([1, 1])
        """
        tokens = pd.DataFrame(dict(word_index=word_compact)).reset_index()
        for name, index in indices.items():
            tokens[name] = index
        a, b = tokens.copy(), tokens.copy()
        mask = lambda x: np.prod([x[k + '_x'] == x[k + '_y']
                                  for k in indices.keys()], axis=0)
        group_keys = ['word_index_x', 'word_index_y', ]
        group_keys += [k + '_x' for k in indices.keys()]
        total = []
        a['frame'] = a['index'].copy()
        for frame in range(-window_size, window_size + 1):
            if frame == 0:
                continue
            b['frame'] = b['index'] + frame
            matches = (a.merge(b, on='frame')
                        .assign(same_doc=mask)
                        .pipe(lambda df: df[df['same_doc'] == 1])
                        .groupby(group_keys)['frame']
                        .count()
                        .reset_index())
            total.append(matches)
        counts = (pd.concat(total)
                    .groupby(group_keys)['frame']
                    .sum()
                    .reset_index()
                    .rename(columns={k + '_x': k for k in indices.keys()})
                    .rename(columns=dict(frame='counts')))
        return counts


def fast_replace(data, keys, values, skip_checks=False):
    """ Vectorized search-and-replace on an array.

    Each element of `data` is located among the sorted `keys` and swapped
    for the value paired with that key. Elements that are not exact keys
    are not rejected: they resolve to the nearest key that is greater than
    or equal to them.

    Arguments
    ---------
    data : int array
        Array of integers
    keys : int array
        Array of keys inside of `data` to be replaced
    values : int array
        Array of values that replace the `keys` array; must have the same
        shape as `keys`.
    skip_checks : bool, default=False
        Skip the check that no element of `data` exceeds the largest key.

    Returns
    -------
    new_data : array
        Array shaped like `data` holding the replacement values.

    Examples
    --------
    >>> fast_replace(np.arange(5), np.arange(5), np.arange(5)[::-1])
    array([4, 3, 2, 1, 0])
    """
    assert np.allclose(keys.shape, values.shape)
    checks_enabled = not skip_checks
    if checks_enabled:
        msg = "data has elements not in keys"
        assert data.max() <= keys.max(), msg
    # Sort keys and values together so positions can be found by bisection.
    order = np.argsort(keys)
    sorted_keys = keys[order]
    sorted_values = values[order]
    # right=True places an element equal to a key at that key's own position.
    positions = np.digitize(data, sorted_keys, right=True)
    return sorted_values[positions]

In [None]:
!pip install chainer
import chainer.functions as F
from chainer import Variable


def dirichlet_likelihood(weights, alpha=None):
    """ Calculate the Dirichlet prior loss of the topic proportions.

    The weights are mapped onto the probability simplex with a
    log-softmax, and the returned value is the negative of the sum of
    ``(alpha - 1) * log(proportions)`` over all entries. It is therefore
    a loss: lower values mean the proportions are more likely under the
    prior.

    Args:
        weights (chainer.Variable or link with a ``W`` attribute):
            Unnormalized weight matrix. A Variable (or any subclass of
            it) is used directly; for anything else the ``W`` attribute
            is used. The number of topics is read from axis 1.
        alpha (float): The Dirichlet concentration parameter. Alpha
            greater than 1.0 results in very dense topic weights such
            that each document belongs to many topics. Alpha < 1.0 results
            in sparser topic weights. The default is to set alpha to
            1.0 / n_topics, effectively enforcing the prior belief that a
            document belongs to very few topics at once.

    Returns:
        ~chainer.Variable: Output loss variable.
    """
    # `isinstance` rather than an exact type comparison, so subclasses of
    # Variable are treated as raw weights instead of being asked for a
    # `.W` attribute they do not have.
    matrix = weights if isinstance(weights, Variable) else weights.W
    n_topics = matrix.data.shape[1]
    if alpha is None:
        alpha = 1.0 / n_topics
    log_proportions = F.log_softmax(matrix)
    loss = (alpha - 1.0) * log_proportions
    return -F.sum(loss)

In [None]:
import chainer
import chainer.links as L


def _orthogonal_matrix(shape):
    # Stolen from blocks:
    # github.com/mila-udem/blocks/blob/master/blocks/initialization.py
    M1 = np.random.randn(shape[0], shape[0])
    M2 = np.random.randn(shape[1], shape[1])

    # QR decomposition of matrix with entries in N(0, 1) is random
    Q1, R1 = np.linalg.qr(M1)
    Q2, R2 = np.linalg.qr(M2)
    # Correct that NumPy doesn't force diagonal of R to be non-negative
    Q1 = Q1 * np.sign(np.diag(R1))
    Q2 = Q2 * np.sign(np.diag(R2))

    n_min = min(shape[0], shape[1])
    return np.dot(Q1[:, :n_min], Q2[:n_min, :])


class EmbedMixture(chainer.Chain):
    r""" A single document is encoded as a multinomial mixture of latent topics.
    The mixture is defined on simplex, so that mixture weights always sum
    to 100%. The latent topic vectors resemble word vectors whose elements are
    defined over all real numbers.

    For example, a single document mix may be :math:`[0.9, 0.1]`, indicating
    that it is 90% in the first topic, 10% in the second. An example topic
    vector looks like :math:`[1.5e1, -1.3e0, +3.4e0, -0.2e0]`, which is
    largely uninterpretable until you measure the words most similar to this
    topic vector.

    A single document vector :math:`\vec{e}` is composed as weights :math:`c_j`
    over topic vectors :math:`\vec{T_j}`:

    .. math::

        \vec{e}=\Sigma_{j=0}^{j=n\_topics}c_j\vec{T_j}

    This is usually paired with regularization on the weights :math:`c_j`.
    If using a Dirichlet prior with low alpha, these weights will be sparse.

    Args:
        n_documents (int): Total number of documents
        n_topics (int): Number of topics per document
        n_dim (int): Number of dimensions per topic vector (should match word
            vector size)
        dropout_ratio (float): Dropout ratio applied to the topic vectors
            every time document vectors are computed.
        temperature (float): Factor the unnormalized topic weights are
            multiplied by before the softmax.

    Attributes:
        weights : chainer.links.EmbedID
            Unnormalized topic weights (:math:`c_j`). To normalize these
            weights, use `F.softmax(weights)`.
        factors : chainer.links.Parameter
            Topic vector matrix (:math:`T_j`)

    .. seealso:: :func:`lda2vec.dirichlet_likelihood`
    """

    def __init__(self, n_documents, n_topics, n_dim, dropout_ratio=0.2,
                 temperature=1.0):
        self.n_documents = n_documents
        self.n_topics = n_topics
        self.n_dim = n_dim
        self.dropout_ratio = dropout_ratio
        # Topic vectors start from a random orthogonal matrix, scaled down
        # by sqrt(n_topics + n_dim)
        factors = _orthogonal_matrix((n_topics, n_dim)).astype('float32')
        factors /= np.sqrt(n_topics + n_dim)
        super(EmbedMixture, self).__init__(
            weights=L.EmbedID(n_documents, n_topics),
            factors=L.Parameter(factors))
        self.temperature = temperature
        # Shrink the initial document weights in place
        self.weights.W.data[...] /= np.sqrt(n_documents + n_topics)

    def __call__(self, doc_ids, update_only_docs=False):
        """ Given an array of document integer indices, returns a vector
        for each document. The vector is composed of topic weights projected
        onto topic vectors.

        Args:
            doc_ids : chainer.Variable
                One-dimensional batch vectors of IDs
            update_only_docs : bool
                When True the topic vectors are cut off from the backward
                graph, so gradients only reach the document weights.

        Returns:
            doc_vector : chainer.Variable
                Batch of two-dimensional embeddings for every document.
        """
        # (batchsize, ) --> (batchsize, multinomial)
        proportions = self.proportions(doc_ids, softmax=True)
        # (batchsize, n_factors) * (n_factors, n_dim) --> (batchsize, n_dim)
        factors = F.dropout(self.factors(), ratio=self.dropout_ratio)
        if update_only_docs:
            factors.unchain_backward()
        w_sum = F.matmul(proportions, factors)
        return w_sum

    def proportions(self, doc_ids, softmax=False):
        """ Given an array of document indices, return the topic weights of
        each document.

        With ``softmax=False`` the raw, unnormalized weights are returned.
        With ``softmax=True`` the weights are multiplied by the temperature,
        passed through a softmax, multiplied element-wise by a random 0/1
        mask and renormalized so each row sums to (approximately) one.

        Returns:
            doc_weights : chainer.Variable
                Two dimensional topic weights of each document.
        """
        w = self.weights(doc_ids)
        if softmax:
            size = w.data.shape
            # randint's upper bound is exclusive, so (0, 2) draws from {0, 1}
            # exactly like the deprecated random_integers(0, 1) did.
            mask = self.xp.random.randint(0, 2, size=size)
            y = (F.softmax(w * self.temperature) *
                 Variable(mask.astype('float32')))
            norm, y = F.broadcast(F.expand_dims(F.sum(y, axis=1), 1), y)
            # The epsilon keeps a fully masked row (norm == 0) from dividing
            # by zero; such a row comes out as all zeros.
            return y / (norm + 1e-7)
        else:
            return w

In [None]:
from numpy.random import random_sample


def orthogonal_matrix(shape):
    """ Build a random (shape[0], shape[1]) matrix from two random
    orthogonal factors, so that its rows or its columns (whichever are
    fewer) are orthonormal.
    """
    # Adapted from blocks:
    # github.com/mila-udem/blocks/blob/master/blocks/initialization.py
    sides = (shape[0], shape[1])
    # Draw both Gaussian matrices first, in this order
    gaussians = [np.random.randn(side, side) for side in sides]

    signed_qs = []
    for gaussian in gaussians:
        # Q of a standard normal matrix is a random orthogonal matrix
        q, r = np.linalg.qr(gaussian)
        # NumPy leaves the sign of R's diagonal free; fold it back into Q
        signed_qs.append(q * np.sign(np.diag(r)))
    left, right = signed_qs

    k = min(sides)
    return np.dot(left[:, :k], right[:k, :])


def softmax(w):
    """ Row-wise softmax of a two-dimensional array-like.

    Each row has its maximum subtracted before exponentiation so large
    inputs do not overflow; every row of the result sums to one.
    """
    # Based on https://gist.github.com/stober/1946926
    logits = np.array(w)
    row_max = np.amax(logits, axis=1).reshape(logits.shape[0], 1)
    exps = np.exp(logits - row_max)
    return exps / exps.sum(axis=1)[:, None]


def sample(values, probabilities, size):
    """ Draw `size` items from `values` according to `probabilities`.

    Arguments
    ---------
    values : array
        Candidates to draw from; indexed with an integer array.
    probabilities : 1-D float array
        Probability of each candidate. Must sum to one (checked with
        `np.allclose`).
    size : int or tuple of ints
        Shape of the random draw.

    Returns
    -------
    Array of drawn values with the shape given by `size`.
    """
    assert np.allclose(np.sum(probabilities, axis=-1), 1.0)
    # Cumulative probabilities act as the right edges of each value's bin
    bins = np.add.accumulate(probabilities)
    picks = np.digitize(random_sample(size), bins)
    # Rounding can leave bins[-1] a hair below 1.0; a uniform draw above it
    # would index one past the end, so assign it to the last value instead.
    picks = np.minimum(picks, len(bins) - 1)
    return values[picks]


def fake_data(n_docs, n_words, n_sent_length, n_topics):
    """ Generate latent topic vectors for words and documents and then
    draw one sentence per document. Each word of a document is drawn with
    probability given by a softmax over the document-word dot products.

    Arguments
    ---------
    n_docs : int
        Number of documents
    n_words : int
        Number of words in the vocabulary
    n_sent_length : int
        Number of words to draw for each document
    n_topics : int
        Number of topics that a single document can belong to.

    Returns
    -------
    sentences : int array
        Array of word indices of shape (n_docs, n_sent_length).

    """
    # Log ratios for the document and word topics
    doc_topics = orthogonal_matrix([n_docs, n_topics])
    wrd_topics = orthogonal_matrix([n_topics, n_words])
    # One row of word probabilities per document
    word_probs = softmax(np.dot(doc_topics, wrd_topics))
    vocabulary = np.arange(n_words).astype('int32')
    # Realize each document by sampling word indices from its own row
    drawn = [sample(vocabulary, row, n_sent_length) for row in word_probs]
    return np.array(drawn).astype('int32')

In [None]:
import numpy
import six

from chainer import cuda
from chainer import function
from chainer.utils import type_check


class NegativeSamplingFunction(function.Function):
    """ Negative-sampling loss over inputs ``(x, t, W)``.

    For every example one positive target (from ``t``) and ``sample_size``
    sampled negatives are scored against the input vector. Examples whose
    target equals ``ignore_label`` contribute neither loss nor gradient.
    """

    # Target value that marks an example to be skipped
    ignore_label = -1

    def __init__(self, sampler, sample_size):
        # sampler: callable taking a shape and returning an integer array
        # sample_size: number of negatives drawn per example
        self.sampler = sampler
        self.sample_size = sample_size

    def _make_samples(self, t):
        """ Draw and cache the sample matrix for targets ``t``. """
        if hasattr(self, 'samples'):
            return self.samples  # for testing

        size = int(t.shape[0])
        # first one is the positive, and others are sampled negatives
        samples = self.sampler((size, self.sample_size + 1))
        samples[:, 0] = t
        self.samples = samples

    def check_type_forward(self, in_types):
        type_check.expect(in_types.size() == 3)
        x_type, t_type, w_type = in_types

        type_check.expect(
            x_type.dtype == numpy.float32,
            x_type.ndim == 2,
            t_type.dtype == numpy.int32,
            t_type.ndim == 1,
            x_type.shape[0] == t_type.shape[0],
            w_type.dtype == numpy.float32,
            w_type.ndim == 2,
        )

    def forward_cpu(self, inputs):
        x, t, W = inputs
        # True for the examples that take part in the loss
        self.ignore_mask = (t != self.ignore_label)
        self._make_samples(t)

        loss = numpy.float32(0.0)
        for i, (ix, k) in enumerate(six.moves.zip(x[self.ignore_mask],
                                    self.samples[self.ignore_mask])):
            w = W[k]
            f = w.dot(ix)
            f[0] *= -1  # positive sample
            # logaddexp(f, 0) == log(1 + exp(f))
            loss += numpy.sum(numpy.logaddexp(f, 0))
        return numpy.array(loss, numpy.float32),

    def forward_gpu(self, inputs):
        x, t, W = inputs
        self.ignore_mask = (t != self.ignore_label)
        n_in = x.shape[1]
        self._make_samples(t)

        # Dot product of each example with each of its sampled rows of W;
        # left at zero for ignored examples. Kept for the backward pass.
        self.wx = cuda.elementwise(
            'raw T W, raw T x, bool mask, S k, int32 c, int32 m', 'T wx',
            '''
            T f = 0;
            if (mask == 1){
                for (int j = 0; j < c; ++j) {
                  int x_ind[] = {(i / m), j};
                  int w_ind[] = {k, j};
                  f += x[x_ind] * W[w_ind];
                }
            }
            wx = f;
            ''',
            'negative_sampling_wx'
            )(W, x, self.ignore_mask[:, None], self.samples, n_in,
              self.sample_size + 1)

        # Softplus of the score, with the sign flipped for the positive
        # sample (first column of every row).
        y = cuda.elementwise(
            'T wx, int32 c, int32 m', 'T y',
            '''
            T f = wx;
            if (i % m == 0) {
              f = -f;
            }
            T loss;
            if (f < 0) {
              loss = __logf(1 + __expf(f));
            } else {
              loss = f + __logf(1 + __expf(-f));
            }
            y = loss;
            ''',
            'negative_sampling_forward'
        )(self.wx, n_in, self.sample_size + 1)
        # TODO(okuta): merge elementwise
        loss = cuda.cupy.sum(y * self.ignore_mask[:, None].astype('float32'))
        return loss,

    def backward_cpu(self, inputs, grads):
        x, t, W = inputs
        gloss, = grads

        gx = numpy.zeros_like(x)
        gW = numpy.zeros_like(W)
        # Iterate over the ORIGINAL row indices of the kept examples.
        # Enumerating the mask-filtered rows would write each gradient to
        # the wrong row of gx as soon as one example is ignored.
        for i in numpy.flatnonzero(self.ignore_mask):
            ix = x[i]
            k = self.samples[i]
            w = W[k]
            f = w.dot(ix)

            # g == -y * gloss / (1 + exp(yf))
            f[0] *= -1
            g = gloss / (1 + numpy.exp(-f))
            g[0] *= -1

            gx[i] = g.dot(w)
            for ik, ig in six.moves.zip(k, g):
                gW[ik] += ig * ix
        return gx, None, gW

    def backward_gpu(self, inputs, grads):
        cupy = cuda.cupy
        x, t, W = inputs
        gloss, = grads

        n_in = x.shape[1]
        # Per-sample gradient of the loss with respect to the score
        g = cuda.elementwise(
            'T wx, raw T gloss, int32 m', 'T g',
            '''
            T y;
            if (i % m == 0) {
              y = 1;
            } else {
              y = -1;
            }

            g = -y * gloss[0] / (1.0f + __expf(wx * y));
            ''',
            'negative_sampling_calculate_g'
        )(self.wx, gloss, self.sample_size + 1)
        gx = cupy.zeros_like(x)
        # Gradient with respect to x; stays zero for ignored examples
        cuda.elementwise(
            'raw T g, raw T W, bool mask, raw S k, int32 c, int32 m', 'T gx',
            '''
            int d = i / c;
            T w = 0;
            if (mask == 1){
                for (int j = 0; j < m; ++j) {
                  w += g[d * m + j] * W[k[d * m + j] * c + i % c];
                }
            }
            gx = w;
            ''',
            'negative_sampling_calculate_gx'
            )(g, W, self.ignore_mask[:, None], self.samples, n_in,
              self.sample_size + 1, gx)
        gW = cupy.zeros_like(W)
        # Gradient with respect to W, accumulated atomically because the
        # same row of W can be sampled more than once
        cuda.elementwise(
            'T g, raw T x, S k, bool mask, int32 c, int32 m',
            'raw T gW',
            '''
            T gi = g;
            if (mask == 1) {
                for (int j = 0; j < c; ++j) {
                  atomicAdd(&gW[k * c + j], gi * x[(i / m) * c + j]);
                }
            }
            ''',
            'negative_sampling_calculate_gw'
            )(g, x, self.samples, self.ignore_mask[:, None], n_in,
              self.sample_size + 1, gW)
        return gx, None, gW


def negative_sampling(x, t, W, sampler, sample_size):
    r"""Negative sampling loss function.

    In natural language processing, especially language modeling, the
    vocabulary can be very large, which makes computing the gradient of the
    whole embedding matrix expensive. With the negative sampling trick the
    gradient is only needed for the positive example and a few sampled
    negative examples.

    The objective function is:

    .. math::

       f(x, p) = \log \sigma(x^\top w_p) +
       k E_{i \sim P(i)}[\log \sigma(- x^\top w_i)],

    where :math:`\sigma(\cdot)` is a sigmoid function, :math:`w_i` is the
    weight vector for the word :math:`i`, and :math:`p` is a positive example.
    It is approximated with :math:`k` examples :math:`N` sampled from
    probability :math:`P(i)`:

    .. math::

       f(x, p) \approx \log \sigma(x^\top w_p) +
       \sum_{n \in N} \log \sigma(-x^\top w_n).

    Each sample of :math:`N` is drawn from the word distribution :math:`P(w)`.
    This is calculated as :math:`P(w) = \frac{1}{Z} c(w)^\alpha`, where
    :math:`c(w)` is the unigram count of the word :math:`w`, :math:`\alpha` is
    a hyper-parameter, and :math:`Z` is the normalization constant.

    Args:
        x (~chainer.Variable): Batch of input vectors.
        t (~chainer.Variable): Vector of groundtruth labels.
        W (~chainer.Variable): Weight matrix.
        sampler (function): Sampling function. It takes a shape and returns an
            integer array of the shape. Each element of this array is a sample
            from the word distribution. A :class:`~chainer.utils.WalkerAlias`
            object built with the power distribution of word frequency is
            recommended.
        sample_size (int): Number of samples.

    See: `Distributed Representations of Words and Phrases and their
    Compositionality <http://arxiv.org/abs/1310.4546>`_

    .. seealso:: :class:`~chainer.links.NegativeSampling`.

    """
    # Build the function object first, then apply it to the three inputs
    loss_function = NegativeSamplingFunction(sampler, sample_size)
    return loss_function(x, t, W)


# Monkey-patch the chainer code to replace the negative sampling
# with the one used here
import chainer.links as L
import chainer.functions as F
# Tag the replacement so a patched function can be told apart from the
# stock one (NOTE(review): nothing in this section reads `patched`).
negative_sampling.patched = True
# Install the replacement as an attribute of the NegativeSampling link class
# (NOTE(review): confirm the link actually looks the function up there).
L.NegativeSampling.negative_sampling = negative_sampling
# Install the replacement in the chainer.functions namespace
F.negative_sampling = negative_sampling

In [None]:
# !pip install spacy
from spacy.lang.en import English
from spacy.attrs import LOWER, LIKE_URL, LIKE_EMAIL

import numpy as np


def tokenize(texts, max_length, skip=-2, attr='idx', merge=False, nlp=None,
             **kwargs):
    """ Uses spaCy to quickly tokenize text and return an array
    of indices.

    If no `nlp` object is passed in, a new `English()` pipeline is created
    on every call; pass one in to avoid reinstantiating it.

    Parameters
    ----------
    texts : list of unicode strings
        These are the input documents. There can be multiple sentences per
        item in the list. Every text is lowercased before it is processed.
    max_length : int
        This is the maximum number of words per document. If the document is
        shorter than this number it will be padded to this length; longer
        documents are truncated.
    skip : int, optional
        Short documents will be padded with this variable up until max_length.
        Tokens that look like an email address or URL are also replaced with
        this value.
    attr : attribute accepted by `Doc.to_array`, default 'idx'
        What to transform the token to. Common choices from spacy.attrs are
        (LOWER, LEMMA).
        NOTE(review): the vocabulary lookup at the end treats the values as
        keys of `nlp.vocab` -- confirm that the default 'idx' is intended.
    merge : bool, optional
        Merge noun phrases into a single token. Useful for turning 'New York'
        into a single token.
    nlp : None
        A spaCy NLP object. Useful for not reinstantiating the object multiple
        times.
    kwargs : dict, optional
        Any further argument is forwarded to `nlp.pipe`. Which options are
        accepted depends on the installed spaCy version.

    Returns
    -------
    arr : 2D array of ints
        Has shape (len(texts), max_length). Each value represents
        the word index.
    vocab : dict
        Keys are the word index, and values are the string. The pad index
        (`skip`) gets mapped to '<SKIP>'.
    """
    if nlp is None:
        nlp = English()
    # Start with every slot marked as padding
    data = np.zeros((len(texts), max_length), dtype='int32')
    data[:] = skip
    bad_deps = ('amod', 'compound')
    for row, doc in enumerate(nlp.pipe([text.lower() for text in texts], **kwargs)):
        if merge:
            # from the spaCy blog, an example on how to merge
            # noun phrases into single tokens
            for phrase in doc.noun_chunks:
                # Only keep adjectives and nouns, e.g. "good ideas"
                while len(phrase) > 1 and phrase[0].dep_ not in bad_deps:
                    phrase = phrase[1:]
                if len(phrase) > 1:
                    # Merge the tokens, e.g. good_ideas
                    phrase.merge(phrase.root.tag_, phrase.text,
                                 phrase.root.ent_type_)
                # Iterate over named entities
                # NOTE(review): this loop sits inside the noun-chunk loop, so
                # it runs once per noun chunk -- confirm that is intended.
                for ent in doc.ents:
                    if len(ent) > 1:
                        # Merge them into single tokens
                        ent.merge(ent.root.tag_, ent.text, ent.label_)
        # One row per token; columns are (attr, LIKE_EMAIL, LIKE_URL)
        dat = doc.to_array([attr, LIKE_EMAIL, LIKE_URL]).astype('int32')
        if len(dat) > 0:
            dat = dat.astype('int32')
            msg = "Negative indices reserved for special tokens"
            assert dat.min() >= 0, msg
            # Replace email and URL tokens
            idx = (dat[:, 1] > 0) | (dat[:, 2] > 0)
            dat[idx] = skip
            # Copy at most max_length tokens; the rest of the row stays `skip`
            length = min(len(dat), max_length)
            data[row, :length] = dat[:length, 0].ravel()
    # Map every index that occurs in the output back to its lowercase string
    uniques = np.unique(data)
    vocab = {v: nlp.vocab[v].lower_ for v in uniques if v != skip}
    vocab[skip] = '<SKIP>'
    return data, vocab


# if __name__ == "__main__":
#     import doctest
#     doctest.testmod()

In [None]:
import requests
import multiprocessing

def _softmax(x):
    """ Softmax over all entries of `x`; the global maximum is subtracted
    first so the exponentials cannot overflow.
    """
    shifted = x - np.max(x)
    weights = np.exp(shifted)
    return weights / weights.sum()

def _softmax_2d(x):
    """ Row-wise softmax of a two-dimensional array.

    The row maximum is subtracted before exponentiation for numerical
    stability. The exponential is computed out of place so that integer
    input works too (an in-place `np.exp` cannot store floats in an integer
    buffer). The input array is never modified.
    """
    x = np.asarray(x)
    shifted = x - x.max(axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=1, keepdims=True)

def prob_words(context, vocab, temperature=1.0):
    """ Probability of every word given a context vector: a softmax over
    the vocabulary of the word-context dot products divided by
    `temperature`.
    """
    scores = np.dot(vocab, context)
    return _softmax(scores / temperature)

def prepare_topics(weights, factors, word_vectors, vocab, temperature=1.0,
                   doc_lengths=None, term_frequency=None, normalize=False):
    """ Collects a dictionary of word, document and topic distributions.

    Arguments
    ---------
    weights : float array
        This must be an array of unnormalized log-odds of document-to-topic
        weights. Shape should be [n_documents, n_topics]
    factors : float array
        Should be an array of topic vectors. These topic vectors live in the
        same space as word vectors and will be used to find the most similar
        words to each topic. Shape should be [n_topics, n_dim].
    word_vectors : float array
        This must be a matrix of word vectors. Should be of shape
        [n_words, n_dim]. The array passed in is never modified.
    vocab : list of str
        These must be the strings for words corresponding to
        indices [0, n_words)
    temperature : float
        Divides the topic-word dot products before the softmax. Higher
        temperatures make more rare words more likely.
    doc_lengths : int array
        An array indicating the number of words in the nth document.
        Must be of shape [n_documents]. Required by pyLDAvis.
    term_frequency : int array
        An array indicating the overall number of times each token appears
        in the corpus. Must be of shape [n_words]. Required by pyLDAvis.
    normalize : bool
        If True, every word vector is scaled to unit L2 norm before the
        topic-word distributions are computed.

    Returns
    -------
    data : dict
        This dictionary is readily consumed by pyLDAVis for topic
        visualization.
    """
    msg = "Vocabulary size did not match size of word vectors"
    assert len(vocab) == word_vectors.shape[0], msg
    if normalize:
        # Rebind rather than divide in place: the caller's array (possibly
        # the live model parameters) must not be changed by a reporting
        # helper.
        word_vectors = (word_vectors /
                        np.linalg.norm(word_vectors, axis=1)[:, None])
    # Map each factor vector to a distribution over words
    topic_to_word = np.array([
        np.ravel(prob_words(factor_vector, word_vectors,
                            temperature=temperature))
        for factor_vector in factors])
    msg = "Not all rows in topic_to_word sum to 1"
    assert np.allclose(np.sum(topic_to_word, axis=1), 1), msg
    # Collect document-to-topic distributions, e.g. theta
    doc_to_topic = _softmax_2d(weights)
    msg = "Not all rows in doc_to_topic sum to 1"
    assert np.allclose(np.sum(doc_to_topic, axis=1), 1), msg
    data = {'topic_term_dists': topic_to_word,
            'doc_topic_dists': doc_to_topic,
            'doc_lengths': doc_lengths,
            'vocab': vocab,
            'term_frequency': term_frequency}
    return data

def print_top_words_per_topic(data, top_n=10, do_print=True):
    """ List (and optionally print) the most probable words of every topic.

    Arguments
    ---------
    data : dict
        A dict object that summarizes topic data and has been made using
        `prepare_topics`. Only 'topic_term_dists' and 'vocab' are read.
    top_n : int
        Number of words kept per topic.
    do_print : bool
        Print one line per topic when True.

    Returns
    -------
    List with one list of top words per topic. Words are stripped and inner
    spaces are replaced by underscores.
    """
    all_top_words = []
    for topic_idx, word_probs in enumerate(data['topic_term_dists']):
        # Indices of the words, most probable first
        ranked = np.argsort(word_probs)[::-1][:top_n]
        words = [data['vocab'][i].strip().replace(' ', '_') for i in ranked]
        if do_print:
            print("Top words in topic %i " % topic_idx + ' '.join(words))
        all_top_words.append(words)
    return all_top_words

def get_request(url):
    """ Fetch `url` and parse the response body as a float.

    The request is attempted up to five times. Returns the parsed float of
    the first attempt that succeeds, or None when every attempt fails.
    """
    for _ in range(5):
        try:
            return float(requests.get(url).text)
        except Exception:
            # Best effort: a failed request or an unparsable body just
            # triggers the next attempt. KeyboardInterrupt and SystemExit
            # are deliberately not caught so the process can be stopped.
            continue
    return None

def topic_coherence(lists, services=['ca', 'cp', 'cv', 'npmi', 'uci',
                                     'umass']):
    """ Requests the topic coherence from AKSW Palmetto

    Arguments
    ---------
    lists : list of lists
        A list of lists with one list of top words for each topic. Only the
        first ten words of each list are sent.
    services : list of str
        Names of the Palmetto coherence services to query. The default list
        is only read, never modified.

    Returns
    -------
    dict mapping (topic index, service name) to the coherence value, or to
    None when the request for that pair failed.

    >>> topic_words = [['cake', 'apple', 'banana', 'cherry', 'chocolate']]
    >>> topic_coherence(topic_words, services=['cv'])
    {(0, 'cv'): 0.5678879445677241}
    """
    url = u'http://palmetto.aksw.org/palmetto-webapp/service/{}?words={}'
    reqs = [url.format(s, '%20'.join(top[:10]))
            for s in services for top in lists]
    pool = multiprocessing.Pool()
    try:
        coherences = pool.map(get_request, reqs)
    finally:
        # Always tear the workers down, even when map() raises
        pool.terminate()
        pool.join()
    # Same nesting order as `reqs`, so keys and results line up
    keys = [(j, s) for s in services for j, _ in enumerate(lists)]
    return dict(zip(keys, coherences))

In [None]:
from sklearn.linear_model import LinearRegression

class Tracking:
    # Class-level fallbacks. `cache` is kept here only for backward
    # compatibility; every instance gets its own dict in __init__ so that
    # trackers no longer share their values.
    cache = {}
    calls = 0
    slope = 0.0

    def __init__(self, n=5000):
        """ The tracking class keeps a cache of the most recent values

        Parameters
        ----------
        n: int
        Number of items to keep per key.
        """
        self.n = n
        self.cache = {}

    def add(self, key, item):
        """ Add an item with a particular key to the cache. Once a key holds
        more than `n` items, the oldest ones are dropped.

        >>> tracker = Tracking()
        >>> tracker.add('log_perplexity', 55.6)
        >>> tracker.cache['log_perplexity']
        [55.6]
        >>> tracker.add('log_perplexity', 55.2)
        >>> tracker.add('loss', -12.1)
        >>> tracker.cache['log_perplexity']
        [55.6, 55.2]
        >>> tracker.cache['loss']
        [-12.1]
        """
        items = self.cache.setdefault(key, [])
        items.append(item)
        excess = len(items) - self.n
        if excess > 0:
            # Keep the newest n items
            self.cache[key] = items[excess:]

    def stats(self, key):
        """ Get the statistics for items with a particular key

        Returns the mean, the standard deviation and the slope of a linear
        fit of the values against their position. The slope is refitted only
        on every 100th call, and the value returned is the one stored before
        this call's refit, so the very first call reports 0.0.

        >>> tracker = Tracking()
        >>> tracker.add('log_perplexity', 55.6)
        >>> tracker.add('log_perplexity', 55.2)
        >>> tracker.stats('log_perplexity')
        (55.400000000000006, 0.19999999999999929, 0.0)
        """
        data = self.cache[key]
        mean = np.mean(data)
        std = np.std(data)
        slope = self.slope
        if self.calls % 100 == 0:
            lr = LinearRegression()
            x = np.arange(len(data)).astype('float32')
            lr.fit(x[:, None], np.array(data))
            self.slope = lr.coef_[0]
        self.calls += 1
        return mean, std, slope

# if __name__ == "__main__":
#     import doctest
#     doctest.testmod()

In [None]:
from chainer import Variable
import random

def move(xp, *args):
    """ Lazily wrap each array in a chainer Variable on the array module
    `xp`: float arrays become float32, integer arrays become int32. Any
    other dtype fails the assertion.
    """
    for array in args:
        dtype_name = str(array.dtype)
        if 'float' not in dtype_name:
            assert 'int' in dtype_name
            target = 'int32'
        else:
            target = 'float32'
        yield Variable(xp.asarray(array, dtype=target))

def most_similar(embeddings, word_index):
    """ Score every embedding against the vector stored at `word_index`.

    The query vector is read from `embeddings.W` and then passed to
    `embeddings.dot`.
    NOTE(review): the lookup uses `.W` while the product is called on
    `embeddings` itself -- confirm which object callers pass in.
    """
    query = embeddings.W[word_index]
    return embeddings.dot(query)

def chunks(n, *args):
    """Yield n-sized chunks of every sequence in `args`, in random order.

    All sequences are cut at the same positions (derived from the length of
    the first one). Each yielded item is a list holding the matching slice
    of every sequence; the last chunk may be shorter than `n`.
    """
    # From stackoverflow question 312443
    # `range` instead of the Python-2-only `xrange`
    keypoints = [(start, start + n) for start in range(0, len(args[0]), n)]
    random.shuffle(keypoints)
    for a, b in keypoints:
        yield [arg[a: b] for arg in args]

class MovingAverage():
    """ Collects values and reports statistics over the last `lastn` of
    them. All values ever added are retained in `points`.
    """

    def __init__(self, lastn=100):
        self.lastn = lastn
        self.points = np.array([])

    def _window(self):
        # Trailing slice that the statistics are computed on
        return self.points[-self.lastn:]

    def add(self, x):
        """ Append `x` to the stored points. """
        self.points = np.append(self.points, x)

    def mean(self):
        """ Mean of the trailing window. """
        return np.mean(self._window())

    def std(self):
        """ Standard deviation of the trailing window. """
        return np.std(self._window())

    def get_stats(self):
        """ Return (mean, std) of the trailing window. """
        recent = self._window()
        return (np.mean(recent), np.std(recent))

In [None]:
import numpy as np
import pytest
import os

# True only when the CI environment variable is exactly the string 'true'
on_ci = os.environ.get('CI', False) == 'true'
# Module-level spaCy English pipeline, used by the test below for lookups
nlp = English()

@pytest.mark.skipif(on_ci, reason='SpaCy install fails on TravisCI')
def test_tokenize():
    """ Tokenize two short sentences into a (2, 10) int32 array and check
    padding, dtype and that the first index maps back to the first word.
    """
    texts = [u'Do you recall, not long ago',
             u'We would walk on the sidewalk?']
    arr, vocab = tokenize(texts, 10)
    print(arr, vocab)
    # The first two tokens are different words
    assert arr[0, 0] != arr[0, 1]
    # One row per text, padded to the requested length
    assert arr.shape[0] == 2
    assert arr.shape[1] == 10
    # The tail of a short document holds the default skip value
    assert arr[0, -1] == -2
    assert arr.dtype == np.dtype('int32')
    # The first index must resolve to the lowercased first word
    expected = texts[0].split(' ')[0].lower()
    recovered = nlp.vocab[arr[0, 0]].lower_
    print(expected, "hello", recovered)
    assert expected == recovered

# Run the test right away when this cell is executed
test_tokenize()

['\t', 'en', '\n', ' ', "'", "''", '"', "'Cause", 'because', "'cause", 'use', "'Xxxxx", 'Cause', 'cause', 'C', 'Xxxxx', "'Cos", "'cos", 'Cos', "'Xxx", 'cos', 'Xxx', "'Coz", "'coz", 'Coz', 'coz', "'Cuz", "'cuz", 'Cuz', 'cuz', "'S", "'s", "'X", 'S', 's', "'bout", 'about', 'out', "'xxxx", 'bout', 'b', 'xxxx', 'c', "'xxx", 'xxx', "'d", "'x", 'd', 'x', "'em", 'them', "'xx", 'em', 'e', 'xx', "'ll", 'will', 'll', 'l', "'nuff", 'enough', 'uff', 'nuff', 'n', "'re", 'are', 're', 'r', '(*_*)', '(', '_*)', ')', '*', '(-8', '(-d', '-8', '-', '-d', '(-:', ':', '(-;', ';', '(-_-)', '_-)', '-_-', '(._.)', '_.)', '.', '(:', '(;', '(=', '=', '(>_<)', '_<)', '>', '<', '(^_^)', '_^)', '^_^', '^', '(o:', '(x:', 'o', '(¬_¬)', '_¬)', '¬_¬', '¬', '(ಠ_ಠ)', '_ಠ)', '(x_x)', 'ಠ_ಠ', 'ಠ', 'x_x', '(╯°□°）╯︵┻━┻', '┻━┻', '┻', '╯', '━', '°', '□', '）', '︵', ')-:', '):', '-__-', '__-', '._.', '0.0', '0', 'd.d', '0.o', 'd.x', '0_0', 'd_d', '0_o', 'd_x', '10', '1', 'dd', 'a.m.', 'a', '.m.', 'x.x.', '10a.m', 'a.m', 'ddx.x', 

AttributeError: ignored

In [None]:
""