Multiple output column features with primitive refactor (#376)
* initial support

* cfm tests passing

* linting

* progress

* progress

* add lint-fix and autopep

* more passing

* progress

* progress

* progress

* don't require initialization

* separate override tests

* lint

* cleanup

* more tests passing

* one more test passing

* all but one test passing

* all tests pass

* linting

* add to lint-fix

* all tests pass!

* cleanup

* fix tests

* fix test

* fix linting

* docs building

* actually fix docs

* updates

* Primitive refactor updates (#365)

* remove incorrect commutative attributes

* handle rsub override and test reverse overrides

* rename weekend primitive to is_weekend

* updated weekend to is_weekend in docs

* test values for scalar_subtract_numeric

* rename subtract_numeric and scalar_subtract_numeric to subtract_numeric_feature and scalar_subtract_numeric_feature

* revert subtract_numeric_feature to subtract_numeric

* update rsub

* fix test

* replace expanding with number_output_features

* remove old line from encode features

* have dfs initialize un-initialized primitives

* fix feature name assertion

* remove get_feature_names on primitive

* cleanup

* test direct features of multi output features

* linting

* remove unnecessary primitive initializations

* rename NMostCommon primitive function

* remove unnecessary default value from NMostCommon

* allow NMostCommon to stack

* don't set primitive for DirectFeature

* make number_output_features a property of FeatureBase

* change parameter name in check_stacking

* rename ready_primitive to handle_primitive and add check that input is a primitive

* update variable name in where primitive check

* switch to using feature.number_output_features in logic

* add warning in encode_features for multi output features

* make helper function for adding feature values to existing data

* clean up direct feature tests on multi-output features

* addressed review comments

* remove logic to do direct features of grandparents
rwedge committed Jan 19, 2019
1 parent 36ce3c3 commit f80b5ae
Showing 15 changed files with 431 additions and 102 deletions.
80 changes: 50 additions & 30 deletions featuretools/computational_backends/pandas_backend.py
@@ -250,12 +250,20 @@ def calculate_all_features(self, instance_ids, time_last,
df = df.append(default_df, sort=True)

df.index.name = self.entityset[self.target_eid].index
- return df[[feat.get_name() for feat in self.features]]
+ column_list = []
+ for feat in self.features:
+     column_list.extend(feat.get_feature_names())
+ return df[column_list]

def generate_default_df(self, instance_ids, extra_columns=None):
index_name = self.features[0].entity.index
- default_row = [f.default_value for f in self.features]
- default_cols = [f.get_name() for f in self.features]
+ default_row = []
+ default_cols = []
+ for f in self.features:
+     for name in f.get_feature_names():
+         default_cols.append(name)
+         default_row.append(f.default_value)

default_matrix = [default_row] * len(instance_ids)
default_df = pd.DataFrame(default_matrix,
columns=default_cols,
@@ -311,10 +319,17 @@ def _calculate_transform_features(self, features, entity_frames):
values = feature_func(*variable_data)

# if we don't get just the values, the assignment breaks when indexes don't match
- if isinstance(values, pd.Series):
-     values = values.values
- frame[f.get_name()] = values
+ def strip_values_if_series(values):
+     if isinstance(values, pd.Series):
+         values = values.values
+     return values
+
+ if f.number_output_features > 1:
+     values = [strip_values_if_series(value) for value in values]
+ else:
+     values = [strip_values_if_series(values)]
+ update_feature_columns(f, frame, values)
return frame

def _calculate_direct_features(self, features, entity_frames):
@@ -341,9 +356,11 @@ def _calculate_direct_features(self, features, entity_frames):
# Sometimes entityset._add_multigenerational_links adds link variables
# that would ordinarily get calculated as direct features,
# so we make sure not to attempt to calculate again
- if f.get_name() in child_df.columns:
-     continue
- col_map[f.base_features[0].get_name()] = f.get_name()
+ base_names = f.base_features[0].get_feature_names()
+ for name, base_name in zip(f.get_feature_names(), base_names):
+     if name in child_df.columns:
+         continue
+     col_map[base_name] = name

# merge the identity feature from the parent entity into the child
merge_df = parent_df[list(col_map.keys())].rename(columns=col_map)
Expand Down Expand Up @@ -472,23 +489,18 @@ def last_n(df):
left_index=True, right_index=True, how='left')

# Handle default values
- # 1. handle non scalar default values
- iterfeats = [f for f in features
-              if hasattr(f.default_value, '__iter__')]
- for f in iterfeats:
-     nulls = pd.isnull(frame[f.get_name()])
-     for ni in nulls[nulls].index:
-         frame.at[ni, f.get_name()] = f.default_value
-
- # 2. handle scalars default values
- fillna_dict = {f.get_name(): f.default_value for f in features
-                if f not in iterfeats}
+ fillna_dict = {}
+ for f in features:
+     feature_defaults = {name: f.default_value
+                         for name in f.get_feature_names()}
+     fillna_dict.update(feature_defaults)

frame.fillna(fillna_dict, inplace=True)

# convert boolean dtypes to floats as appropriate
# pandas behavior: https://github.com/pydata/pandas/issues/3752
for f in features:
- if (not f.expanding and
+ if (f.number_output_features == 1 and
f.variable_type == variable_types.Numeric and
frame[f.get_name()].dtype.name in ['object', 'bool']):
frame[f.get_name()] = frame[f.get_name()].astype(float)
@@ -505,8 +517,8 @@ def _can_agg(feature):

if feature.primitive.uses_calc_time:
return False

- return len(base_features) == 1 and not feature.expanding
+ single_output = feature.primitive.number_output_features == 1
+ return len(base_features) == 1 and single_output


def agg_wrapper(feats, time_last):
@@ -518,17 +530,25 @@ def wrap(df):
args = [df[v] for v in variable_ids]

if f.primitive.uses_calc_time:
- d[f.get_name()] = func(*args, time=time_last)
+ values = func(*args, time=time_last)
else:
- d[f.get_name()] = func(*args)
+ values = func(*args)

+ if f.number_output_features == 1:
+     values = [values]
+ update_feature_columns(f, d, values)

return pd.Series(d)
return wrap


def set_default_column(frame, f):
- default = f.default_value
- if hasattr(default, '__iter__'):
-     length = frame.shape[0]
-     default = [f.default_value] * length
- frame[f.get_name()] = default
+ for name in f.get_feature_names():
+     frame[name] = f.default_value


def update_feature_columns(feature, data, values):
    names = feature.get_feature_names()
    assert len(names) == len(values)
    for name, value in zip(names, values):
        data[name] = value
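For illustration only (not part of this commit), a minimal sketch of what the new update_feature_columns helper does: it pairs each array returned by a multi-output primitive with one of the names from get_feature_names(). The feature names and values below are invented.

```python
import pandas as pd

# Hypothetical: a three-output feature returns one array of values per column.
names = ["N_MOST_COMMON(sessions.device)__{}".format(i) for i in range(3)]
values = [["desktop", "mobile"], ["tablet", "desktop"], [None, "mobile"]]

frame = pd.DataFrame(index=[1, 2])
for name, value in zip(names, values):  # mirrors update_feature_columns
    frame[name] = value

print(list(frame.columns))
# ['N_MOST_COMMON(sessions.device)__0',
#  'N_MOST_COMMON(sessions.device)__1',
#  'N_MOST_COMMON(sessions.device)__2']
```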
48 changes: 27 additions & 21 deletions featuretools/feature_base/feature_base.py
@@ -23,14 +23,13 @@

class FeatureBase(object):
_name = None
- expanding = False

def __init__(self, entity, base_features, primitive):
"""Base class for all features
Args:
entity (Entity): entity this feature is being calculated for
- base_featres (list[FeatureBase]): list of base features for primitive
+ base_features (list[FeatureBase]): list of base features for primitive
primitive (:class:`.PrimitiveBase`): primitive to calculate. if not initialized when passed, gets initialized with no arguments
"""
assert all(isinstance(f, FeatureBase) for f in base_features), \
Expand Down Expand Up @@ -63,6 +62,14 @@ def get_name(self):
return self._name
return self.generate_name()

def get_feature_names(self):
    n = self.number_output_features
    if n == 1:
        names = [self.get_name()]
    else:
        names = [self.get_name() + "__{}".format(i) for i in range(n)]
    return names

def get_function(self):
return self.primitive.get_function()

@@ -132,6 +139,10 @@ def entity(self):
"""Entity this feature belongs too"""
return self.entityset[self.entity_id]

@property
def number_output_features(self):
return self.primitive.number_output_features

def __repr__(self):
ret = "<Feature: %s>" % (self.get_name())

@@ -322,33 +333,24 @@ class DirectFeature(FeatureBase):

def __init__(self, base_feature, child_entity):
base_feature = _check_feature(base_feature)
- if base_feature.expanding:
-     self.expanding = True
- self.base_feature = base_feature
-
- # M TODO what does this do?
- path = child_entity.entityset.find_forward_path(child_entity.id, base_feature.entity.id)
- if len(path) > 1:
-     parent_entity_id = path[1].child_entity.id
-     parent_entity = child_entity.entityset[parent_entity_id]
-     parent_feature = DirectFeature(base_feature, parent_entity)
- else:
-     parent_feature = base_feature
-
- self.parent_entity = parent_feature.entity
- super(DirectFeature, self).__init__(child_entity, [parent_feature], primitive=PrimitiveBase)
+ self.parent_entity = base_feature.entity
+ super(DirectFeature, self).__init__(child_entity, [base_feature], primitive=PrimitiveBase)

@property
def variable(self):
- return self.base_feature.variable
+ return self.base_features[0].variable

@property
def number_output_features(self):
return self.base_features[0].primitive.number_output_features

@property
def default_value(self):
return self.base_features[0].default_value

def copy(self):
"""Return copy of feature"""
- return DirectFeature(self.base_feature, self.entity)
+ return DirectFeature(self.base_features[0], self.entity)

@property
def variable_type(self):
@@ -358,6 +360,10 @@ def generate_name(self):
return u"%s.%s" % (self.parent_entity.id,
self.base_features[0].get_name())

def get_feature_names(self):
return [u"%s.%s" % (self.parent_entity.id, base_name)
for base_name in self.base_features[0].get_feature_names()]


class AggregationFeature(FeatureBase):
# Feature to condition this feature by in
@@ -440,8 +446,8 @@ def __init__(self, base_features, primitive):
else:
base_features = [_check_feature(base_features)]

- if any(bf.expanding for bf in base_features):
-     self.expanding = True
+ # R TODO handle stacking on sub-features
+ assert (bf.number_output_features == 1 for bf in base_features)

super(TransformFeature, self).__init__(base_features[0].entity,
base_features, primitive=primitive)
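A hedged usage sketch of the new naming scheme (the demo entityset and the ft.Feature call are assumptions based on the post-merge API; neither appears in this diff):

```python
import featuretools as ft
from featuretools.primitives import NMostCommon

es = ft.demo.load_mock_customer(return_entityset=True)

feat = ft.Feature(es["transactions"]["product_id"],
                  parent_entity=es["sessions"],
                  primitive=NMostCommon(n=2))

feat.get_name()
# 'N_MOST_COMMON(transactions.product_id)'
feat.get_feature_names()
# ['N_MOST_COMMON(transactions.product_id)__0',
#  'N_MOST_COMMON(transactions.product_id)__1']
```

A direct feature built on feat would prefix each of those names with the parent entity id (e.g. sessions.N_MOST_COMMON(transactions.product_id)__0), per DirectFeature.get_feature_names above.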
6 changes: 5 additions & 1 deletion featuretools/primitives/base/aggregation_primitive_base.py
@@ -27,7 +27,7 @@ def make_agg_primitive(function, input_types, return_type, name=None,
stack_on_exclude=None, base_of=None,
base_of_exclude=None, description='A custom primitive',
cls_attributes=None, uses_calc_time=False,
- commutative=False):
+ commutative=False, number_output_features=1):
'''Returns a new aggregation primitive class. The primitive infers default
values by passing in empty data.
@@ -68,6 +68,9 @@ def make_agg_primitive(function, input_types, return_type, name=None,
commutative (bool): If True, will only make one feature per unique set
of base features.
number_output_features (int): The number of output features (columns in
the matrix) associated with this feature.
Example:
.. ipython :: python
@@ -100,6 +103,7 @@ def time_since_last(values, time=None):
new_class.base_of = base_of
new_class.base_of_exclude = base_of_exclude
new_class.commutative = commutative
new_class.number_output_features = number_output_features
new_class, default_kwargs = inspect_function_args(new_class,
function,
uses_calc_time)
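A hedged sketch (not from this PR) of a custom aggregation primitive using the new number_output_features argument; the min_and_max function and its empty-input handling are invented for illustration:

```python
import numpy as np
from featuretools.primitives import make_agg_primitive
from featuretools.variable_types import Numeric


def min_and_max(values):
    """Return the minimum and maximum of a numeric column as two outputs."""
    if len(values) == 0:
        # make_agg_primitive probes with empty data to infer a default value
        return np.array([np.nan, np.nan])
    return np.array([np.min(values), np.max(values)])


MinMax = make_agg_primitive(function=min_and_max,
                            input_types=[Numeric],
                            return_type=Numeric,
                            name="min_max",
                            number_output_features=2)
```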
5 changes: 2 additions & 3 deletions featuretools/primitives/base/primitive_base.py
@@ -21,9 +21,8 @@ class PrimitiveBase(object):
#: (int): Maximum number of features in the largest chain proceeding
# downward from this feature's base features.
max_stack_depth = None
- #: (bool): If True, feature will expand into multiple values during
- # calculation
- expanding = False
+ #: (int): Number of columns in feature matrix associated with this feature
+ number_output_features = 1
# whitelist of primitives can have this primitive in input_types
base_of = None
# blacklist of primitives can have this primitive in input_types
6 changes: 5 additions & 1 deletion featuretools/primitives/base/transform_primitive_base.py
@@ -22,7 +22,7 @@ def generate_name(self, base_feature_names):
def make_trans_primitive(function, input_types, return_type, name=None,
description='A custom transform primitive',
cls_attributes=None, uses_calc_time=False,
- commutative=False):
+ commutative=False, number_output_features=1):
'''Returns a new transform primitive class
Args:
Expand All @@ -48,6 +48,9 @@ def make_trans_primitive(function, input_types, return_type, name=None,
commutative (bool): If True, will only make one feature per unique set
of base features.
number_output_features (int): The number of output features (columns in
the matrix) associated with this feature.
Example:
.. ipython :: python
@@ -84,6 +87,7 @@ def isin_generate_name(self):
new_class.input_types = input_types
new_class.return_type = return_type
new_class.commutative = commutative
new_class.number_output_features = number_output_features
new_class, default_kwargs = inspect_function_args(new_class,
function,
uses_calc_time)
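Similarly, a hedged sketch of a multi-output transform primitive built with make_trans_primitive; the sine/cosine example is invented for illustration:

```python
import numpy as np
from featuretools.primitives import make_trans_primitive
from featuretools.variable_types import Numeric


def sine_and_cosine(column):
    """Return the sine and the cosine of each value as two output columns."""
    return np.array([np.sin(column), np.cos(column)])


SineCosine = make_trans_primitive(function=sine_and_cosine,
                                  input_types=[Numeric],
                                  return_type=Numeric,
                                  name="sine_cosine",
                                  number_output_features=2)
```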
19 changes: 19 additions & 0 deletions featuretools/primitives/standard/aggregation_primitives.py
@@ -132,6 +132,25 @@ def percent_true(s):
return percent_true


class NMostCommon(AggregationPrimitive):
    """Finds the N most common elements in a categorical feature."""
    name = "n_most_common"
    input_types = [Discrete]
    return_type = Discrete

    def __init__(self, n=3):
        self.number_output_features = n

    def get_function(self):
        def n_most_common(x, n=self.number_output_features):
            array = np.array(x.value_counts()[:n].index)
            if len(array) < n:
                filler = np.full(n - len(array), np.nan)
                array = np.append(array, filler)
            return array
        return n_most_common


class AvgTimeBetween(AggregationPrimitive):
"""Computes the average time between consecutive events.
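End to end, a multi-output primitive now contributes several columns to the feature matrix. A hedged sketch (the demo entityset and exact column names are assumptions, not part of this diff):

```python
import featuretools as ft
from featuretools.primitives import NMostCommon

es = ft.demo.load_mock_customer(return_entityset=True)

feature_matrix, feature_defs = ft.dfs(entityset=es,
                                      target_entity="sessions",
                                      agg_primitives=[NMostCommon(n=2)],
                                      trans_primitives=[])

# feature_matrix should gain columns along the lines of:
#   N_MOST_COMMON(transactions.product_id)__0
#   N_MOST_COMMON(transactions.product_id)__1
```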
(Diff truncated: the remaining changed files are not shown.)
