Skip to content

Commit

Permalink
Cumulative primitives (#410)
Browse files Browse the repository at this point in the history
* Updated Cumulative Primitives
  • Loading branch information
CharlesBradshaw committed Feb 11, 2019
1 parent 69b6e04 commit d93ff0d
Show file tree
Hide file tree
Showing 5 changed files with 293 additions and 523 deletions.
2 changes: 1 addition & 1 deletion Makefile
Expand Up @@ -8,7 +8,7 @@ lint:
flake8 featuretools && isort --check-only --recursive featuretools

lint-fix:
autopep8 --in-place --recursive --max-line-length=100 --exclude="*/migrations/*" --select="E225,E303,E302,E203,E128,E231,E251,E271,E127,E126" featuretools
autopep8 --in-place --recursive --max-line-length=100 --exclude="*/migrations/*" --select="E225,E303,E302,E203,E128,E231,E251,E271,E127,E126,E301,W293,E226" featuretools
isort --recursive featuretools


Expand Down
1 change: 1 addition & 0 deletions featuretools/entityset/entity.py
Expand Up @@ -34,6 +34,7 @@ class Entity(object):
:class:`.Relationship`, :class:`.Variable`, :class:`.EntitySet`
"""

def __init__(self, id, df, entityset, variable_types=None,
index=None, time_index=None, secondary_time_index=None,
last_time_index=None, already_sorted=False, make_index=False,
Expand Down
1 change: 1 addition & 0 deletions featuretools/entityset/entityset.py
Expand Up @@ -34,6 +34,7 @@ class EntitySet(object):
metadata
"""

def __init__(self, id=None, entities=None, relationships=None):
"""Creates EntitySet
Expand Down
309 changes: 67 additions & 242 deletions featuretools/primitives/standard/cum_transform_feature.py
@@ -1,272 +1,97 @@
# import uuid
# from builtins import str
from featuretools.primitives.base import TransformPrimitive
from featuretools.variable_types import Discrete, Id, Numeric

# import numpy as np
# import pandas as pd

# from ..base.primitive_base import IdentityFeature, PrimitiveBase
# from ..base.transform_primitive_base import TransformPrimitive
# from .aggregation_primitives import Count, Max, Mean, Min, Sum
# from .utils import apply_dual_op_from_feat
class CumSum(TransformPrimitive):
    """Returns the cumulative sum after grouping.

    The first input is the numeric value to accumulate; the second is the
    variable to group by. Within each group, a running sum is computed in
    row order.

    Note: any row with a NaN value in the group-by feature will have a
    NaN value in the output (pandas groupby excludes NaN keys).
    """
    name = "cum_sum"
    input_types = [[Numeric, Id],
                   [Numeric, Discrete]]
    return_type = Numeric
    # Computed over the full entity so the running sum covers all rows,
    # not just the subset currently being calculated.
    uses_full_entity = True

    def get_function(self):
        def cum_sum(values, groups):
            # values/groups are pandas Series aligned by index
            return values.groupby(groups).cumsum()

        return cum_sum

    def generate_name(self, base_feature_names):
        return "CUM_SUM(%s by %s)" % (base_feature_names[0], base_feature_names[1])
class CumCount(TransformPrimitive):
    """Returns the cumulative count after grouping.

    Takes a single Id or Discrete input and, within each group of equal
    values, numbers the rows 1, 2, 3, ... in row order.
    """
    name = "cum_count"
    input_types = [[Id], [Discrete]]
    # Counts are returned as numbers so they can feed numeric primitives.
    return_type = Numeric
    # Computed over the full entity so counts cover all rows, not just
    # the subset currently being calculated.
    uses_full_entity = True

    def get_function(self):
        def cum_count(values):
            # cumcount() is zero-based; add 1 so the first occurrence
            # of a value counts as 1 rather than 0
            return values.groupby(values).cumcount() + 1

        return cum_count

    def generate_name(self, base_feature_names):
        return "CUM_COUNT(%s)" % (base_feature_names[0])
class CumMean(TransformPrimitive):
    """Returns the cumulative mean after grouping.

    The first input is the numeric value to average; the second is the
    variable to group by. Within each group, the running mean up to and
    including the current row is computed in row order.
    """
    name = "cum_mean"
    input_types = [[Numeric, Id],
                   [Numeric, Discrete]]
    return_type = Numeric
    # Computed over the full entity so the running mean covers all rows,
    # not just the subset currently being calculated.
    uses_full_entity = True

    def get_function(self):
        def cum_mean(values, groups):
            temp = values.groupby(groups)
            # running mean = running sum / running row count
            # (cumcount is zero-based, hence the +1)
            # NOTE(review): cumsum skips NaN values while cumcount still
            # counts those rows, so groups containing NaN values may get
            # a deflated mean — confirm this is intended
            return temp.cumsum() / (temp.cumcount() + 1)

        return cum_mean

    def generate_name(self, base_feature_names):
        return "CUM_MEAN(%s by %s)" % (base_feature_names[0], base_feature_names[1])
class CumMin(TransformPrimitive):
    """Returns the cumulative min after grouping.

    The first input is the numeric value; the second is the variable to
    group by. Within each group, the running minimum up to and including
    the current row is computed in row order.
    """
    name = "cum_min"
    input_types = [[Numeric, Id],
                   [Numeric, Discrete]]
    return_type = Numeric
    # Computed over the full entity so the running minimum covers all
    # rows, not just the subset currently being calculated.
    uses_full_entity = True

    def get_function(self):
        def cum_min(values, groups):
            # values/groups are pandas Series aligned by index
            return values.groupby(groups).cummin()

        return cum_min

    def generate_name(self, base_feature_names):
        return "CUM_MIN(%s by %s)" % (base_feature_names[0], base_feature_names[1])
class CumMax(TransformPrimitive):
    """Returns the cumulative max after grouping.

    The first input is the numeric value; the second is the variable to
    group by. Within each group, the running maximum up to and including
    the current row is computed in row order.
    """
    name = "cum_max"
    input_types = [[Numeric, Id],
                   [Numeric, Discrete]]
    return_type = Numeric
    # Computed over the full entity so the running maximum covers all
    # rows, not just the subset currently being calculated.
    uses_full_entity = True

    def get_function(self):
        def cum_max(values, groups):
            # values/groups are pandas Series aligned by index
            return values.groupby(groups).cummax()

        return cum_max

    def generate_name(self, base_feature_names):
        return "CUM_MAX(%s by %s)" % (base_feature_names[0], base_feature_names[1])

0 comments on commit d93ff0d

Please sign in to comment.