Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Groupby transform feature #455

Merged
merged 34 commits into from Mar 25, 2019
Merged
Changes from 26 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
87194f2
add GroupByTransformPrimitive
rwedge Feb 25, 2019
862590c
update cumulative primitives to inherit from GBTP
rwedge Feb 25, 2019
6948485
add GroupByTransformFeature
rwedge Feb 25, 2019
9f0a2ff
add GBTF to uses_full_entity logic
rwedge Feb 25, 2019
55b6b16
add groupby handler to pandas backend
rwedge Feb 25, 2019
6398a78
move cumulative tests to new test_groupby_primitives file
rwedge Feb 25, 2019
3748402
Merge branch 'master' into grouby-transform-feature
rwedge Feb 26, 2019
09b84fc
remove GroupByTransformPrimitive
rwedge Feb 26, 2019
9254060
rework groupby logic
rwedge Feb 26, 2019
e3a77d3
update test cases for cumulative primitives
rwedge Feb 26, 2019
bde94de
handled 'nan' group of groupby
rwedge Feb 27, 2019
8dc9b62
redo groupby_feature.get_name
rwedge Mar 6, 2019
f89167d
linting
rwedge Mar 6, 2019
f17e6e6
make GBTF a subclass of TransformFeature
rwedge Mar 6, 2019
607ccb2
change groupby restriction from Id to Discrete
rwedge Mar 6, 2019
927e62d
test categorical direct feature as groupby in GBTF
rwedge Mar 6, 2019
67f1fcf
test GBTF.copy
rwedge Mar 6, 2019
fe5400f
test groupby with empty data
rwedge Mar 6, 2019
d8db338
test uses_calc_time with GBTF
rwedge Mar 7, 2019
e55fc51
linting
rwedge Mar 7, 2019
a866d73
Merge branch 'master' into grouby-transform-feature
rwedge Mar 15, 2019
9dd526c
rename test file
rwedge Mar 15, 2019
06dac4c
have feature tree separate features by groupby
rwedge Mar 15, 2019
6a9b8d5
check exact class instead of allowing subclasses in feature handlers
rwedge Mar 15, 2019
d70dc6d
add groupby to base_features earlier
rwedge Mar 15, 2019
84bc534
Merge branch 'master' into grouby-transform-feature
rwedge Mar 18, 2019
95d7f3e
change implementation of cumulative count
rwedge Mar 22, 2019
03f1a41
add comments about time where we exclude the groupby feature when usi…
rwedge Mar 22, 2019
a3ee88f
reassign index when primitive function returns series
rwedge Mar 22, 2019
d9af1bf
let pandas fill in null values for instances without a group
rwedge Mar 22, 2019
403ad54
Merge branch 'master' into grouby-transform-feature
rwedge Mar 22, 2019
2a41be1
linting
rwedge Mar 22, 2019
10e0ecc
Update cum_transform_feature.py
kmax12 Mar 25, 2019
49aa411
Update cum_transform_feature.py
kmax12 Mar 25, 2019
File filter...
Filter file types
Jump to…
Jump to file or symbol
Failed to load files and symbols.
+473 −267
Diff settings

Always

Just for now

@@ -9,6 +9,7 @@
from featuretools import variable_types
from featuretools.feature_base import (
AggregationFeature,
GroupByTransformFeature,
This conversation was marked as resolved by kmax12

This comment has been minimized.

Copy link
@kmax12

kmax12 Mar 14, 2019

Member

should we add a handler so GroupByTransformFeature get split by the feature.groupby value?

IdentityFeature,
TransformFeature
)
@@ -167,7 +168,8 @@ def key_func(f):
_get_use_previous(f),
_get_where(f),
self.input_frames_type(f),
self.output_frames_type(f))
self.output_frames_type(f),
_get_groupby(f))

# Sort the list of features by the complex key function above, then
# group them by the same key
@@ -213,7 +215,8 @@ def _get_feature_depths(self, entity_id):
return list(features.values()), out

def uses_full_entity(self, feature):
if isinstance(feature, TransformFeature) and feature.primitive.uses_full_entity:
if (isinstance(feature, (GroupByTransformFeature, TransformFeature)) and
feature.primitive.uses_full_entity):
return True
return self._dependent_uses_full_entity(feature)

@@ -280,3 +283,10 @@ def _get_base_entity_id(f):
else:
# Assume all of f's base_features belong to the same entity
return f.base_features[0].entity_id


def _get_groupby(f):
    """Return a hashable grouping key for feature f's groupby.

    GroupByTransformFeature instances hash their groupby feature so that
    features sharing the same groupby are batched together by key_func;
    every other feature type maps to the sentinel -1.
    """
    if not isinstance(f, GroupByTransformFeature):
        return -1
    return f.groupby.hash()
@@ -18,6 +18,7 @@
from featuretools.feature_base import (
AggregationFeature,
DirectFeature,
GroupByTransformFeature,
IdentityFeature,
TransformFeature
)
@@ -104,7 +105,9 @@ def calculate_all_features(self, instance_ids, time_last,
training_window=training_window,
verbose=verbose)
large_eframes_by_filter = None
if any([f.primitive.uses_full_entity for f in self.feature_tree.all_features if isinstance(f, TransformFeature)]):
if any([f.primitive.uses_full_entity
for f in self.feature_tree.all_features
if isinstance(f, (GroupByTransformFeature, TransformFeature))]):
large_necessary_columns = self.feature_tree.necessary_columns_for_all_values_features
large_eframes_by_filter = \
self.entityset.get_pandas_data_slice(filter_entity_ids=ordered_entities,
@@ -277,13 +280,15 @@ def generate_default_df(self, instance_ids, extra_columns=None):
return default_df

def _feature_type_handler(self, f):
    """Return the calculation routine for feature f's exact class.

    Exact-class comparison (``type(f) == ...``) is deliberate here, not an
    isinstance check: GroupByTransformFeature subclasses TransformFeature
    but must be dispatched to its own groupby handler, so subclass-matching
    would mis-route it.

    Raises:
        UnknownFeature: if f is not one of the handled feature classes.
    """
    if type(f) == TransformFeature:
        return self._calculate_transform_features
    elif type(f) == GroupByTransformFeature:
        return self._calculate_groupby_features
    elif type(f) == DirectFeature:
        return self._calculate_direct_features
    elif type(f) == AggregationFeature:
        return self._calculate_agg_features
    elif type(f) == IdentityFeature:
        return self._calculate_identity_features
    else:
        raise UnknownFeature(u"{} feature unknown".format(f.__class__))
@@ -318,11 +323,6 @@ def _calculate_transform_features(self, features, entity_frames):
values = feature_func(*variable_data)

# if we don't get just the values, the assignment breaks when indexes don't match
def strip_values_if_series(values):
if isinstance(values, pd.Series):
values = values.values
return values

if f.number_output_features > 1:
values = [strip_values_if_series(value) for value in values]
else:
@@ -331,6 +331,48 @@ def strip_values_if_series(values):

return frame

def _calculate_groupby_features(self, features, entity_frames):
    """Calculate a batch of GroupByTransformFeatures that share one entity
    and one groupby feature, writing results into the entity's frame.

    Args:
        features: list of GroupByTransformFeature, all on the same entity.
        entity_frames: dict mapping entity id -> pandas DataFrame.

    Returns:
        The updated DataFrame for the features' entity.
    """
    entity_id = features[0].entity.id
    assert len(set([f.entity.id for f in features])) == 1, \
        "features must share base entity"
    assert entity_id in entity_frames

    frame = entity_frames[entity_id]

    # handle when no data
    if frame.shape[0] == 0:
        for f in features:
            set_default_column(frame, f)
        return frame

    groupby = features[0].groupby.get_name()
    group_values = {f.hash(): [] for f in features}
    for index, group in frame.groupby(groupby):
        for f in features:
            column_names = [bf.get_name() for bf in f.base_features]
            # drop the last base feature: it is the groupby feature
            # itself, which is not an argument to the primitive function
            variable_data = [group[name] for name in column_names[:-1]]
            feature_func = f.get_function()

            # apply the function to the relevant dataframe slice and add
            # the feature row to the results dataframe.
            if f.primitive.uses_calc_time:
                values = feature_func(*variable_data, time=self.time_last)
            else:
                values = feature_func(*variable_data)

            # realign the result to the group's original row index so the
            # per-group pieces can be concatenated and sorted back into
            # frame order (reassign even when the primitive already
            # returned a Series, per review feedback)
            if isinstance(values, pd.Series):
                values.index = variable_data[0].index
            else:
                values = pd.Series(values, index=variable_data[0].index)
            group_values[f.hash()].append(values)

    # rows whose groupby value is null were skipped by frame.groupby;
    # append their (null) values so sort_index reproduces full frame order
    null_group = frame[pd.isnull(frame[groupby])]
    for f in features:
        values = group_values[f.hash()]
        values.append(null_group[groupby])
        values = pd.concat(values)
        update_feature_columns(f, frame, [values.sort_index().values])

    return frame

def _calculate_direct_features(self, features, entity_frames):
entity_id = features[0].entity.id
parent_entity_id = features[0].parent_entity.id
@@ -557,3 +599,9 @@ def update_feature_columns(feature, data, values):
assert len(names) == len(values)
for name, value in zip(names, values):
data[name] = value


def strip_values_if_series(values):
    """Return the underlying ndarray of a pandas Series, or the input
    unchanged when it is not a Series.

    Assigning raw values instead of a Series avoids index-alignment
    surprises when the result is written into a dataframe column.
    """
    return values.values if isinstance(values, pd.Series) else values
@@ -4,6 +4,7 @@
DirectFeature,
Feature,
FeatureBase,
GroupByTransformFeature,
IdentityFeature,
TransformFeature
)
@@ -14,6 +14,7 @@
Categorical,
Datetime,
DatetimeTimeIndex,
Discrete,
Id,
Numeric,
NumericTimeIndex,
@@ -462,13 +463,39 @@ def generate_name(self):
return self.primitive.generate_name(base_feature_names=[bf.get_name() for bf in self.base_features])


class GroupByTransformFeature(TransformFeature):
    """Transform feature whose primitive is applied separately within each
    group defined by a Discrete groupby feature.

    The groupby feature is stored as the LAST element of base_features so
    dependency resolution pulls it in; code that passes base features to
    the primitive must slice it off with ``[:-1]``.
    """

    def __init__(self, base_features, primitive, groupby):
        if not isinstance(groupby, FeatureBase):
            groupby = IdentityFeature(groupby)
        assert issubclass(groupby.variable_type, Discrete)
        self.groupby = groupby

        # copy before appending so the caller's list is not mutated
        if hasattr(base_features, '__iter__'):
            base_features = list(base_features) + [groupby]
        else:
            base_features = [base_features, groupby]

        super(GroupByTransformFeature, self).__init__(base_features,
                                                      primitive=primitive)

    def copy(self):
        # exclude the trailing groupby feature: __init__ appends it again
        return GroupByTransformFeature(self.base_features[:-1],
                                       self.primitive,
                                       self.groupby)

    def generate_name(self):
        # exclude the trailing groupby feature from the primitive's name;
        # it is rendered separately in the "by" clause below
        base_names = [bf.get_name() for bf in self.base_features[:-1]]
        _name = self.primitive.generate_name(base_names)
        return "{} by {}".format(_name, self.groupby.get_name())


class Feature(object):
"""
Alias to create feature. Infers the feature type based on init parameters.
"""

def __new__(self, base, entity=None,
parent_entity=None, primitive=None, use_previous=None, where=None):
def __new__(self, base, entity=None, groupby=None, parent_entity=None,
primitive=None, use_previous=None, where=None):

# either direct or indentity
if primitive is None and entity is None:
@@ -481,7 +508,12 @@ def __new__(self, base, entity=None,
use_previous=use_previous, where=where,
primitive=primitive)
elif primitive is not None:
assert isinstance(primitive, TransformPrimitive) or issubclass(primitive, TransformPrimitive)
assert (isinstance(primitive, TransformPrimitive) or
issubclass(primitive, TransformPrimitive))
if groupby is not None:
return GroupByTransformFeature(base,
primitive=primitive,
groupby=groupby)
return TransformFeature(base, primitive=primitive)

raise Exception("Unrecognized feature initialization")
@@ -1,3 +1,5 @@
import pandas as pd

from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import Discrete, Id, Numeric

class CumSum(TransformPrimitive):
    """Returns the cumulative sum after grouping"""

    name = "cum_sum"
    input_types = [Numeric]
    return_type = Numeric
    uses_full_entity = True

    def get_function(self):
        # grouping is handled by GroupByTransformFeature; the primitive
        # only ever sees one group's values at a time
        def cum_sum(values):
            return values.cumsum()

        return cum_sum


class CumCount(TransformPrimitive):
"""Returns the cumulative count after grouping"""
@@ -31,67 +29,51 @@ class CumCount(TransformPrimitive):

def get_function(self):
    def cum_count(values):
        # 1..n positional count over the group's rows; preserve the
        # original index so results align when written back to the frame
        return pd.Series(range(1, len(values) + 1), index=values.index)

    return cum_count

def generate_name(self, base_feature_names):
    # no "by" clause here; the grouping appears in the name produced by
    # the enclosing GroupByTransformFeature
    return "CUM_COUNT(%s)" % (base_feature_names[0])


class CumMean(TransformPrimitive):
    """Returns the cumulative mean after grouping"""

    name = "cum_mean"
    input_types = [Numeric]
    return_type = Numeric
    uses_full_entity = True

    def get_function(self):
        def cum_mean(values):
            # running sum divided by running count (1..n), index-aligned
            # so the division matches element-wise
            return values.cumsum() / pd.Series(range(1, len(values) + 1),
                                               index=values.index)

        return cum_mean


class CumMin(TransformPrimitive):
    """Returns the cumulative min after grouping"""

    name = "cum_min"
    input_types = [Numeric]
    return_type = Numeric
    uses_full_entity = True

    def get_function(self):
        # grouping is handled by GroupByTransformFeature; the primitive
        # only ever sees one group's values at a time
        def cum_min(values):
            return values.cummin()

        return cum_min


class CumMax(TransformPrimitive):
    """Returns the cumulative max after grouping"""

    name = "cum_max"
    input_types = [Numeric]
    return_type = Numeric
    uses_full_entity = True

    def get_function(self):
        # grouping is handled by GroupByTransformFeature; the primitive
        # only ever sees one group's values at a time
        def cum_max(values):
            return values.cummax()

        return cum_max
Oops, something went wrong.
ProTip! Use n and p to navigate between commits in a pull request.
You can’t perform that action at this time.