From 6dddc2a6f3ef4623fdb7a06dec965ddd139b5d76 Mon Sep 17 00:00:00 2001 From: Slobodan Ilic Date: Fri, 3 Aug 2018 18:25:19 +0200 Subject: [PATCH 1/3] Invert valid_indices logic --- src/cr/cube/dimension.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/src/cr/cube/dimension.py b/src/cr/cube/dimension.py index 4f9a010f2..c9e2d3010 100644 --- a/src/cr/cube/dimension.py +++ b/src/cr/cube/dimension.py @@ -211,7 +211,7 @@ def labels(self, include_missing=False, include_transforms=False, (self._get_name(el), el.get('id', -1)) ) for (i, el) in enumerate(self._elements) - if i in valid_indices + if include_missing or i not in self.invalid_indices ] # Create subtotals names and insert them in labels after @@ -272,10 +272,12 @@ def elements(self, include_missing=False): internally. For other variable types, actual 'elements' of the Crunch Cube JSON response are returned. ''' - valid_indices = self.valid_indices(include_missing) + if include_missing: + return self._elements + return [ el for (i, el) in enumerate(self._elements) - if i in valid_indices + if i not in self.invalid_indices ] def valid_indices(self, include_missing): @@ -289,8 +291,18 @@ def valid_indices(self, include_missing): if include_missing: return [i for (i, el) in enumerate(self._elements)] else: - return [i for (i, el) in enumerate(self._elements) - if not el.get('missing')] + return [ + i for (i, el) in enumerate(self._elements) + if not el.get('missing') + ] + + @lazyproperty + def invalid_indices(self): + return set([ + i for (i, el) in enumerate(self._elements) + if el.get('missing') + ]) + @lazyproperty def shape(self): From 1c77c2ec4f6ea853a9627abd1e81ffb45153388b Mon Sep 17 00:00:00 2001 From: Slobodan Ilic Date: Fri, 3 Aug 2018 18:55:41 +0200 Subject: [PATCH 2/3] Optimize inserted H&S indices --- src/cr/cube/dimension.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cr/cube/dimension.py b/src/cr/cube/dimension.py index c9e2d3010..b8ab51289 100644 --- a/src/cr/cube/dimension.py +++ b/src/cr/cube/dimension.py @@ -115,7 +115,7 @@ def _elements(self): @property def inserted_hs_indices(self): '''Returns inserted H&S indices for the dimension.''' - if self.type == 'categorical_array': + if (self.type == 'categorical_array' or not self.subtotals): return [] # For CA subvariables, we don't do H&S insertions element_ids = [element['id'] for element in self.elements()] From 85159b71a7a545fddab5c797915f390835eebd60 Mon Sep 17 00:00:00 2001 From: percious Date: Fri, 3 Aug 2018 11:16:28 -0600 Subject: [PATCH 3/3] performance improvements --- src/cr/cube/dimension.py | 30 +++++++++--- src/cr/cube/utils/__init__.py | 92 ++++++++++++++++++++++++++++++++++- 2 files changed, 112 insertions(+), 10 deletions(-) diff --git a/src/cr/cube/dimension.py b/src/cr/cube/dimension.py index 4f9a010f2..932ce04eb 100644 --- a/src/cr/cube/dimension.py +++ b/src/cr/cube/dimension.py @@ -3,7 +3,7 @@ import numpy as np from .subtotal import Subtotal -from .utils import lazyproperty +from .utils import lazyproperty, memoize class Dimension(object): @@ -118,7 +118,8 @@ def inserted_hs_indices(self): if self.type == 'categorical_array': return [] # For CA subvariables, we don't do H&S insertions - element_ids = [element['id'] for element in self.elements()] + elements = self.elements() + element_ids = [element['id'] for element in elements] tops = [st for st in self.subtotals if st.anchor == 'top'] bottoms = [st for st in self.subtotals if st.anchor == 'bottom'] @@ -130,7 +131,7 @@ def inserted_hs_indices(self): for index, insertion in enumerate(middles) ] bottom_indexes = [ - index + len(tops) + len(middles) + len(self.elements()) + index + len(tops) + len(middles) + len(elements) for index, insertion in enumerate(bottoms) ] return top_indexes + middle_indexes + bottom_indexes @@ -211,7 +212,7 @@ def labels(self, include_missing=False, include_transforms=False, (self._get_name(el), el.get('id', -1)) ) for (i, el) in enumerate(self._elements) - if i in valid_indices + if include_missing or i not in self.invalid_indices ] # Create subtotals names and insert them in labels after @@ -265,6 +266,7 @@ def _include_in_labels(label_with_ind, valid_indices): return label_with_ind['ind'] in valid_indices + @memoize def elements(self, include_missing=False): '''Get elements of the crunch Dimension. @@ -272,12 +274,16 @@ def elements(self, include_missing=False): internally. For other variable types, actual 'elements' of the Crunch Cube JSON response are returned. ''' - valid_indices = self.valid_indices(include_missing) + + if include_missing: + return self._elements + return [ el for (i, el) in enumerate(self._elements) - if i in valid_indices + if i not in self.invalid_indices ] + @memoize def valid_indices(self, include_missing): '''Gets valid indices of Crunch Cube Dimension's elements. @@ -289,8 +295,16 @@ def valid_indices(self, include_missing): if include_missing: return [i for (i, el) in enumerate(self._elements)] else: - return [i for (i, el) in enumerate(self._elements) - if not el.get('missing')] + return [ + i for (i, el) in enumerate(self._elements) + if not el.get('missing') + ] + + @lazyproperty + def invalid_indices(self): + return set([i for (i, el) in enumerate(self._elements) + if el.get('missing') + ]) @lazyproperty def shape(self): diff --git a/src/cr/cube/utils/__init__.py b/src/cr/cube/utils/__init__.py index 8c049b2d3..3617609b4 100644 --- a/src/cr/cube/utils/__init__.py +++ b/src/cr/cube/utils/__init__.py @@ -1,10 +1,19 @@ '''Utility functions for crunch cube, as well as other modules.''' -import os +import collections +import functools +from itertools import ifilterfalse import json +import os + + +class Counter(dict): + """Mapping where default values are zero""" + def __missing__(self, key): + return 0 def load_fixture(fixtures_directory, filename): - '''Loads fixtures for CrunchCube integration tests.''' + """Loads fixtures for CrunchCube integration tests.""" with open(os.path.join(fixtures_directory, filename)) as ctx_file: fixture = json.load(ctx_file) return fixture @@ -32,3 +41,82 @@ def get_prop_value(obj): return value return property(get_prop_value, doc=docstring) + + +def lru_cache(maxsize=100): + '''Least-recently-used cache decorator. + + Arguments to the cached function must be hashable. + Cache performance statistics stored in f.hits and f.misses. + Clear the cache with f.clear(). + http://en.wikipedia.org/wiki/Cache_algorithms#Least_Recently_Used + + ''' + maxqueue = maxsize * 10 + + def decorating_function(user_function, + len=len, iter=iter, tuple=tuple, sorted=sorted, KeyError=KeyError): + cache = {} # mapping of args to results + queue = collections.deque() # order that keys have been used + refcount = Counter() # times each key is in the queue + sentinel = object() # marker for looping around the queue + kwd_mark = object() # separate positional and keyword args + + # lookup optimizations (ugly but fast) + queue_append, queue_popleft = queue.append, queue.popleft + queue_appendleft, queue_pop = queue.appendleft, queue.pop + + @functools.wraps(user_function) + def wrapper(*args, **kwds): + # cache key records both positional and keyword args + key = args + if kwds: + key += (kwd_mark,) + tuple(sorted(kwds.items())) + + # record recent use of this key + queue_append(key) + refcount[key] += 1 + + # get cache entry or compute if not found + try: + result = cache[key] + wrapper.hits += 1 + except KeyError: + result = user_function(*args, **kwds) + cache[key] = result + wrapper.misses += 1 + + # purge least recently used cache entry + if len(cache) > maxsize: + key = queue_popleft() + refcount[key] -= 1 + while refcount[key]: + key = queue_popleft() + refcount[key] -= 1 + del cache[key], refcount[key] + + # periodically compact the queue by eliminating duplicate keys + # while preserving order of most recent access + if len(queue) > maxqueue: + refcount.clear() + queue_appendleft(sentinel) + for key in ifilterfalse(refcount.__contains__, + iter(queue_pop, sentinel)): + queue_appendleft(key) + refcount[key] = 1 + + return result + + def clear(): + cache.clear() + queue.clear() + refcount.clear() + wrapper.hits = wrapper.misses = 0 + + wrapper.hits = wrapper.misses = 0 + wrapper.clear = clear + return wrapper + return decorating_function + + +memoize = lru_cache(100)