Multiple output column features with primitive refactor (#376)
* initial support

* cfm tests passing

* linting

* progress

* progress

* add lint-fix and autopep

* more passing

* progress

* progress

* progress

* don't require initialization

* separate override tests

* lint

* cleanup

* more tests passing

* one more test passing

* all but one test passing

* all tests pass

* linting

* add to lint-fix

* all tests pass!

* cleanup

* fix tests

* fix test

* fix linting

* docs building

* actually fix docs

* updates

* Primitive refactor updates (#365)

* remove incorrect commutative attributes

* handle rsub override and test reverse overrides

* rename weekend primitive to is_weekend

* updated weekend to is_weekend in docs

* test values for scalar_subtract_numeric

* rename subtract_numeric and scalar_subtract_numeric to subtract_numeric_feature and scalar_subtract_numeric_feature

* revert subtract_numeric_feature to subtract_numeric

* update rsub

* fix test

* replace expanding with number_output_features

* remove old line from encode features

* have dfs initialize un-initialized primitives

* fix feature name assertion

* remove get_feature_names on primitive

* cleanup

* test direct features of multi output features

* linting

* remove unnecessary primitive initializations

* rename NMostCommon primitive function

* remove unnecessary default value from NMostCommon

* allow NMostCommon to stack

* don't set primitive for DirectFeature

* make number_output_features a property of FeatureBase

* change parameter name in check_stacking

* rename ready_primitive to handle_primitive and add check that input is a primitive

* update variable name in where primitive check

* switch to using feature.number_output_features in logic

* add warning in encode_features for multi output features

* make helper function for adding feature values to existing data

* clean up direct feature tests on multi-output features

* addressed review comments

* remove logic to do direct features of grandparents
rwedge committed Jan 19, 2019
1 parent 36ce3c3 commit f80b5ae
Showing 15 changed files with 431 additions and 102 deletions.
80 changes: 50 additions & 30 deletions featuretools/computational_backends/pandas_backend.py
@@ -250,12 +250,20 @@ def calculate_all_features(self, instance_ids, time_last,
df = df.append(default_df, sort=True)

df.index.name = self.entityset[self.target_eid].index
- return df[[feat.get_name() for feat in self.features]]
+ column_list = []
+ for feat in self.features:
+     column_list.extend(feat.get_feature_names())
+ return df[column_list]

def generate_default_df(self, instance_ids, extra_columns=None):
index_name = self.features[0].entity.index
- default_row = [f.default_value for f in self.features]
- default_cols = [f.get_name() for f in self.features]
+ default_row = []
+ default_cols = []
+ for f in self.features:
+     for name in f.get_feature_names():
+         default_cols.append(name)
+         default_row.append(f.default_value)

default_matrix = [default_row] * len(instance_ids)
default_df = pd.DataFrame(default_matrix,
columns=default_cols,
@@ -311,10 +319,17 @@ def _calculate_transform_features(self, features, entity_frames):
values = feature_func(*variable_data)

# if we don't get just the values, the assignment breaks when indexes don't match
- if isinstance(values, pd.Series):
-     values = values.values
- frame[f.get_name()] = values
+ def strip_values_if_series(values):
+     if isinstance(values, pd.Series):
+         values = values.values
+     return values
+
+ if f.number_output_features > 1:
+     values = [strip_values_if_series(value) for value in values]
+ else:
+     values = [strip_values_if_series(values)]
+ update_feature_columns(f, frame, values)
return frame

def _calculate_direct_features(self, features, entity_frames):
@@ -341,9 +356,11 @@ def _calculate_direct_features(self, features, entity_frames):
# Sometimes entityset._add_multigenerational_links adds link variables
# that would ordinarily get calculated as direct features,
# so we make sure not to attempt to calculate again
- if f.get_name() in child_df.columns:
-     continue
- col_map[f.base_features[0].get_name()] = f.get_name()
+ base_names = f.base_features[0].get_feature_names()
+ for name, base_name in zip(f.get_feature_names(), base_names):
+     if name in child_df.columns:
+         continue
+     col_map[base_name] = name

# merge the identity feature from the parent entity into the child
merge_df = parent_df[list(col_map.keys())].rename(columns=col_map)
Expand Down Expand Up @@ -472,23 +489,18 @@ def last_n(df):
left_index=True, right_index=True, how='left')

# Handle default values
- # 1. handle non scalar default values
- iterfeats = [f for f in features
-              if hasattr(f.default_value, '__iter__')]
- for f in iterfeats:
-     nulls = pd.isnull(frame[f.get_name()])
-     for ni in nulls[nulls].index:
-         frame.at[ni, f.get_name()] = f.default_value
-
- # 2. handle scalars default values
- fillna_dict = {f.get_name(): f.default_value for f in features
-                if f not in iterfeats}
+ fillna_dict = {}
+ for f in features:
+     feature_defaults = {name: f.default_value
+                         for name in f.get_feature_names()}
+     fillna_dict.update(feature_defaults)

frame.fillna(fillna_dict, inplace=True)

# convert boolean dtypes to floats as appropriate
# pandas behavior: https://github.com/pydata/pandas/issues/3752
for f in features:
- if (not f.expanding and
+ if (f.number_output_features == 1 and
f.variable_type == variable_types.Numeric and
frame[f.get_name()].dtype.name in ['object', 'bool']):
frame[f.get_name()] = frame[f.get_name()].astype(float)
@@ -505,8 +517,8 @@ def _can_agg(feature):

if feature.primitive.uses_calc_time:
return False

- return len(base_features) == 1 and not feature.expanding
+ single_output = feature.primitive.number_output_features == 1
+ return len(base_features) == 1 and single_output


def agg_wrapper(feats, time_last):
@@ -518,17 +530,25 @@ def wrap(df):
args = [df[v] for v in variable_ids]

if f.primitive.uses_calc_time:
- d[f.get_name()] = func(*args, time=time_last)
+ values = func(*args, time=time_last)
else:
- d[f.get_name()] = func(*args)
+ values = func(*args)

+ if f.number_output_features == 1:
+     values = [values]
+ update_feature_columns(f, d, values)

return pd.Series(d)
return wrap


def set_default_column(frame, f):
- default = f.default_value
- if hasattr(default, '__iter__'):
-     length = frame.shape[0]
-     default = [f.default_value] * length
- frame[f.get_name()] = default
+ for name in f.get_feature_names():
+     frame[name] = f.default_value


def update_feature_columns(feature, data, values):
    names = feature.get_feature_names()
    assert len(names) == len(values)
    for name, value in zip(names, values):
        data[name] = value
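For illustration only (not part of this commit), a minimal sketch of what the new update_feature_columns helper does: it pairs each array returned by a multi-output primitive with one of the names from get_feature_names(). The feature names and values below are invented.

```python
import pandas as pd

# Hypothetical: a three-output feature returns one array of values per column.
names = ["N_MOST_COMMON(sessions.device)__{}".format(i) for i in range(3)]
values = [["desktop", "mobile"], ["tablet", "desktop"], [None, "mobile"]]

frame = pd.DataFrame(index=[1, 2])
for name, value in zip(names, values):  # mirrors update_feature_columns
    frame[name] = value

print(list(frame.columns))
# ['N_MOST_COMMON(sessions.device)__0',
#  'N_MOST_COMMON(sessions.device)__1',
#  'N_MOST_COMMON(sessions.device)__2']
```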
48 changes: 27 additions & 21 deletions featuretools/feature_base/feature_base.py
@@ -23,14 +23,13 @@

class FeatureBase(object):
_name = None
- expanding = False

def __init__(self, entity, base_features, primitive):
"""Base class for all features
Args:
entity (Entity): entity this feature is being calculated for
- base_featres (list[FeatureBase]): list of base features for primitive
+ base_features (list[FeatureBase]): list of base features for primitive
primitive (:class:`.PrimitiveBase`): primitive to calculate. if not initialized when passed, gets initialized with no arguments
"""
assert all(isinstance(f, FeatureBase) for f in base_features), \
Expand Down Expand Up @@ -63,6 +62,14 @@ def get_name(self):
return self._name
return self.generate_name()

def get_feature_names(self):
    n = self.number_output_features
    if n == 1:
        names = [self.get_name()]
    else:
        names = [self.get_name() + "__{}".format(i) for i in range(n)]
    return names

def get_function(self):
return self.primitive.get_function()

@@ -132,6 +139,10 @@ def entity(self):
"""Entity this feature belongs too"""
return self.entityset[self.entity_id]

@property
def number_output_features(self):
return self.primitive.number_output_features

def __repr__(self):
ret = "<Feature: %s>" % (self.get_name())

@@ -322,33 +333,24 @@ class DirectFeature(FeatureBase):

def __init__(self, base_feature, child_entity):
base_feature = _check_feature(base_feature)
- if base_feature.expanding:
-     self.expanding = True
- self.base_feature = base_feature
-
- # M TODO what does this do?
- path = child_entity.entityset.find_forward_path(child_entity.id, base_feature.entity.id)
- if len(path) > 1:
-     parent_entity_id = path[1].child_entity.id
-     parent_entity = child_entity.entityset[parent_entity_id]
-     parent_feature = DirectFeature(base_feature, parent_entity)
- else:
-     parent_feature = base_feature
-
- self.parent_entity = parent_feature.entity
- super(DirectFeature, self).__init__(child_entity, [parent_feature], primitive=PrimitiveBase)
+ self.parent_entity = base_feature.entity
+ super(DirectFeature, self).__init__(child_entity, [base_feature], primitive=PrimitiveBase)

@property
def variable(self):
- return self.base_feature.variable
+ return self.base_features[0].variable

@property
def number_output_features(self):
return self.base_features[0].primitive.number_output_features

@property
def default_value(self):
return self.base_features[0].default_value

def copy(self):
"""Return copy of feature"""
- return DirectFeature(self.base_feature, self.entity)
+ return DirectFeature(self.base_features[0], self.entity)

@property
def variable_type(self):
@@ -358,6 +360,10 @@ def generate_name(self):
return u"%s.%s" % (self.parent_entity.id,
self.base_features[0].get_name())

def get_feature_names(self):
return [u"%s.%s" % (self.parent_entity.id, base_name)
for base_name in self.base_features[0].get_feature_names()]


class AggregationFeature(FeatureBase):
# Feature to condition this feature by in
@@ -440,8 +446,8 @@ def __init__(self, base_features, primitive):
else:
base_features = [_check_feature(base_features)]

- if any(bf.expanding for bf in base_features):
-     self.expanding = True
+ # R TODO handle stacking on sub-features
+ assert (bf.number_output_features == 1 for bf in base_features)

super(TransformFeature, self).__init__(base_features[0].entity,
base_features, primitive=primitive)
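A hedged usage sketch of the new naming scheme (the demo entityset and the ft.Feature call are assumptions based on the post-merge API; neither appears in this diff):

```python
import featuretools as ft
from featuretools.primitives import NMostCommon

es = ft.demo.load_mock_customer(return_entityset=True)

feat = ft.Feature(es["transactions"]["product_id"],
                  parent_entity=es["sessions"],
                  primitive=NMostCommon(n=2))

feat.get_name()
# 'N_MOST_COMMON(transactions.product_id)'
feat.get_feature_names()
# ['N_MOST_COMMON(transactions.product_id)__0',
#  'N_MOST_COMMON(transactions.product_id)__1']
```

A direct feature built on feat would prefix each of those names with the parent entity id (e.g. sessions.N_MOST_COMMON(transactions.product_id)__0), per DirectFeature.get_feature_names above.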
6 changes: 5 additions & 1 deletion featuretools/primitives/base/aggregation_primitive_base.py
@@ -27,7 +27,7 @@ def make_agg_primitive(function, input_types, return_type, name=None,
stack_on_exclude=None, base_of=None,
base_of_exclude=None, description='A custom primitive',
cls_attributes=None, uses_calc_time=False,
- commutative=False):
+ commutative=False, number_output_features=1):
'''Returns a new aggregation primitive class. The primitive infers default
values by passing in empty data.
@@ -68,6 +68,9 @@ def make_agg_primitive(function, input_types, return_type, name=None,
commutative (bool): If True, will only make one feature per unique set
of base features.
number_output_features (int): The number of output features (columns in
the matrix) associated with this feature.
Example:
.. ipython :: python
@@ -100,6 +103,7 @@ def time_since_last(values, time=None):
new_class.base_of = base_of
new_class.base_of_exclude = base_of_exclude
new_class.commutative = commutative
new_class.number_output_features = number_output_features
new_class, default_kwargs = inspect_function_args(new_class,
function,
uses_calc_time)
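A hedged sketch (not from this PR) of a custom aggregation primitive using the new number_output_features argument; the min_and_max function and its empty-input handling are invented for illustration:

```python
import numpy as np
from featuretools.primitives import make_agg_primitive
from featuretools.variable_types import Numeric


def min_and_max(values):
    """Return the minimum and maximum of a numeric column as two outputs."""
    if len(values) == 0:
        # make_agg_primitive probes with empty data to infer a default value
        return np.array([np.nan, np.nan])
    return np.array([np.min(values), np.max(values)])


MinMax = make_agg_primitive(function=min_and_max,
                            input_types=[Numeric],
                            return_type=Numeric,
                            name="min_max",
                            number_output_features=2)
```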
5 changes: 2 additions & 3 deletions featuretools/primitives/base/primitive_base.py
@@ -21,9 +21,8 @@ class PrimitiveBase(object):
#: (int): Maximum number of features in the largest chain proceeding
# downward from this feature's base features.
max_stack_depth = None
- #: (bool): If True, feature will expand into multiple values during
- # calculation
- expanding = False
+ #: (int): Number of columns in feature matrix associated with this feature
+ number_output_features = 1
# whitelist of primitives can have this primitive in input_types
base_of = None
# blacklist of primitives can have this primitive in input_types
6 changes: 5 additions & 1 deletion featuretools/primitives/base/transform_primitive_base.py
@@ -22,7 +22,7 @@ def generate_name(self, base_feature_names):
def make_trans_primitive(function, input_types, return_type, name=None,
description='A custom transform primitive',
cls_attributes=None, uses_calc_time=False,
- commutative=False):
+ commutative=False, number_output_features=1):
'''Returns a new transform primitive class
Args:
Expand All @@ -48,6 +48,9 @@ def make_trans_primitive(function, input_types, return_type, name=None,
commutative (bool): If True, will only make one feature per unique set
of base features.
number_output_features (int): The number of output features (columns in
the matrix) associated with this feature.
Example:
.. ipython :: python
@@ -84,6 +87,7 @@ def isin_generate_name(self):
new_class.input_types = input_types
new_class.return_type = return_type
new_class.commutative = commutative
new_class.number_output_features = number_output_features
new_class, default_kwargs = inspect_function_args(new_class,
function,
uses_calc_time)
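Similarly, a hedged sketch of a multi-output transform primitive built with make_trans_primitive; the sine/cosine example is invented for illustration:

```python
import numpy as np
from featuretools.primitives import make_trans_primitive
from featuretools.variable_types import Numeric


def sine_and_cosine(column):
    """Return the sine and the cosine of each value as two output columns."""
    return np.array([np.sin(column), np.cos(column)])


SineCosine = make_trans_primitive(function=sine_and_cosine,
                                  input_types=[Numeric],
                                  return_type=Numeric,
                                  name="sine_cosine",
                                  number_output_features=2)
```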
19 changes: 19 additions & 0 deletions featuretools/primitives/standard/aggregation_primitives.py
@@ -132,6 +132,25 @@ def percent_true(s):
return percent_true


class NMostCommon(AggregationPrimitive):
    """Finds the N most common elements in a categorical feature."""
    name = "n_most_common"
    input_types = [Discrete]
    return_type = Discrete

    def __init__(self, n=3):
        self.number_output_features = n

    def get_function(self):
        def n_most_common(x, n=self.number_output_features):
            array = np.array(x.value_counts()[:n].index)
            if len(array) < n:
                filler = np.full(n - len(array), np.nan)
                array = np.append(array, filler)
            return array
        return n_most_common


class AvgTimeBetween(AggregationPrimitive):
"""Computes the average time between consecutive events.
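End to end, a multi-output primitive now contributes several columns to the feature matrix. A hedged sketch (the demo entityset and exact column names are assumptions, not part of this diff):

```python
import featuretools as ft
from featuretools.primitives import NMostCommon

es = ft.demo.load_mock_customer(return_entityset=True)

feature_matrix, feature_defs = ft.dfs(entityset=es,
                                      target_entity="sessions",
                                      agg_primitives=[NMostCommon(n=2)],
                                      trans_primitives=[])

# feature_matrix should gain columns along the lines of:
#   N_MOST_COMMON(transactions.product_id)__0
#   N_MOST_COMMON(transactions.product_id)__1
```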
(Diff truncated: the remaining changed files are not shown.)
