Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding non-feature columns to calculated feature matrix #78

Merged
merged 12 commits into from Feb 5, 2018
2 changes: 1 addition & 1 deletion .circleci/config.yml
Expand Up @@ -3,7 +3,7 @@ jobs:
build:
working_directory: ~/featuretools
docker:
- image: themattrix/tox
- image: painless/tox
steps:
- checkout
- run: pyenv local 2.7.13 3.5.2 3.6.0
Expand Down
10 changes: 10 additions & 0 deletions docs/source/automated_feature_engineering/handling_time.rst
Expand Up @@ -71,8 +71,18 @@ There is one row in the feature matrix corresponding to a row in ``cutoff_times`
cutoff_time_in_index=True)
feature_matrix

It is often the case that we want our labels in our calculated feature matrix so that the ordering is consistent between the labels and the rows of the feature matrix. However, adding labels to the initial dataframe means that you would have to explicitly prohibit ``dfs`` from building features with that column. To bypass this, we can provide additional columns to cutoff times which will be added directly to the feature matrix. While the first two columns will be used as an index and cutoff time regardless of their order in the dataframe, any additional columns will appear as features in the resulting feature matrix.
Copy link
Contributor

@kmax12 kmax12 Jan 30, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we may support mixing the order up if you name things correctly, but let’s document instance id first, followed by cutoff time. just to make sure people are consistent


.. ipython:: python

cutoff_times['label'] = pd.Series([0, 0, 1, 0, 1])

feature_matrix, features = ft.dfs(entityset=es,
target_entity="customers",
cutoff_time=cutoff_times,
cutoff_time_in_index=True)

feature_matrix['label']

Running DFS with training windows
---------------------------------
Expand Down
85 changes: 56 additions & 29 deletions featuretools/computational_backends/calculate_feature_matrix.py
Expand Up @@ -43,7 +43,8 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,
the features for each instance at. Can either be a DataFrame with
'instance_id' and 'time' columns, DataFrame with the name of the
index variable in the target entity and a time column, a list of values, or a single
value to calculate for all instances.
value to calculate for all instances. If the dataframe has more than two columns, any additional
columns will be added to the resulting feature matrix.

instance_ids (list): list of instances to calculate features on. Only
used if cutoff_time is a single datetime.
Expand Down Expand Up @@ -99,6 +100,7 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,

entityset = features[0].entityset
target_entity = features[0].entity
pass_columns = []

if not isinstance(cutoff_time, pd.DataFrame):
if cutoff_time is None:
Expand Down Expand Up @@ -130,6 +132,7 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,
# take the first column that isn't instance_id and assume it is time
not_instance_id = [c for c in cutoff_time.columns if c != "instance_id"]
cutoff_time.rename(columns={not_instance_id[0]: "time"}, inplace=True)
pass_columns = [column_name for column_name in cutoff_time.columns[2:]]

# Get dictionary of features to approximate
if approximate is not None:
Expand Down Expand Up @@ -186,7 +189,7 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,
training_window, profile, verbose,
save_progress, backend,
no_unapproximated_aggs, cutoff_df_time_var,
target_time)
target_time, pass_columns)
feature_matrix.append(_feature_matrix)
# Do a manual garbage collection in case objects from calculate_batch
# weren't collected automatically
Expand All @@ -202,21 +205,27 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,
return feature_matrix


def calculate_batch(features, group, approximate, entityset, backend_verbose, training_window,
profile, verbose, save_progress, backend,
no_unapproximated_aggs, cutoff_df_time_var, target_time):
def calculate_batch(features, group, approximate, entityset, backend_verbose,
training_window, profile, verbose, save_progress, backend,
no_unapproximated_aggs, cutoff_df_time_var, target_time,
pass_columns):
# if approximating, calculate the approximate features
if approximate is not None:
precalculated_features, all_approx_feature_set = approximate_features(features,
group,
window=approximate,
entityset=entityset,
training_window=training_window,
verbose=backend_verbose,
profile=profile)
precalculated_features, all_approx_feature_set = approximate_features(
features,
group,
window=approximate,
entityset=entityset,
training_window=training_window,
verbose=backend_verbose,
profile=profile
)
else:
precalculated_features = None
all_approx_feature_set = None

# if backend verbose wasn't set explicitly, set to True if verbose is true
# and there is only 1 cutoff time
if backend_verbose is None:
one_cutoff_time = group[cutoff_df_time_var].nunique() == 1
backend_verbose = verbose and one_cutoff_time
Expand All @@ -231,6 +240,7 @@ def calc_results(time_last, ids, precalculated_features=None, training_window=No
verbose=backend_verbose)
return matrix

# if all aggregations have been approximated, can calculate all together
if no_unapproximated_aggs and approximate is not None:
grouped = [[datetime.now(), group]]
else:
Expand All @@ -242,29 +252,46 @@ def calc_results(time_last, ids, precalculated_features=None, training_window=No

feature_matrix = []
for _time_last_to_calc, group in grouped:
time_last = group[cutoff_df_time_var].iloc[0]
# sort group by instance id
ids = group['instance_id'].sort_values().values

time_last = group[cutoff_df_time_var].iloc[0]
if no_unapproximated_aggs and approximate is not None:
window = None
else:
window = training_window

_feature_matrix = calc_results(_time_last_to_calc, ids, precalculated_features=precalculated_features, training_window=window)

# this can occur when the features for an instance are calculated at
# multiple cutoff times which were binned to the same frequency.
if len(_feature_matrix) != len(group):
indexer = group[['instance_id', cutoff_df_time_var]]
_feature_matrix = (indexer.merge(_feature_matrix,
left_on=['instance_id'],
right_index=True,
how='left')
.set_index('instance_id')
.drop([cutoff_df_time_var], axis=1))

time_index = pd.DatetimeIndex([time_last] * _feature_matrix.shape[0], name='time')
_feature_matrix.set_index(time_index, append=True, inplace=True)
# calculate values for those instances at time _time_last_to_calc
_feature_matrix = calc_results(_time_last_to_calc,
ids,
precalculated_features=precalculated_features,
training_window=window)

id_name = _feature_matrix.index.name

# if approximate, merge feature matrix with group frame to get original
# cutoff times and passed columns
if approximate:
indexer = group[['instance_id', target_time] + pass_columns]
_feature_matrix = indexer.merge(_feature_matrix,
left_on=['instance_id'],
right_index=True,
how='left')
_feature_matrix.set_index(['instance_id', target_time], inplace=True)
_feature_matrix.index.set_names([id_name, 'time'], inplace=True)
_feature_matrix.sort_index(level=1, kind='mergesort', inplace=True)
else:
# all rows have same cutoff time. set time and add passed columns
num_rows = _feature_matrix.shape[0]
time_index = pd.DatetimeIndex([time_last] * num_rows, name='time')
_feature_matrix.set_index(time_index, append=True, inplace=True)
if len(pass_columns) > 0:
pass_through = group[['instance_id', cutoff_df_time_var] + pass_columns]
pass_through.rename(columns={'instance_id': id_name,
cutoff_df_time_var: 'time'},
inplace=True)
pass_through.set_index([id_name, 'time'], inplace=True)
for col in pass_columns:
_feature_matrix[col] = pass_through[col]
feature_matrix.append(_feature_matrix)

feature_matrix = pd.concat(feature_matrix)
Expand Down
3 changes: 2 additions & 1 deletion featuretools/synthesis/dfs.py
Expand Up @@ -51,7 +51,8 @@ def dfs(entities=None,
the features for each instance at. Can either be a DataFrame with
'instance_id' and 'time' columns, DataFrame with the name of the
index variable in the target entity and a time column, a list of values, or a single
value to calculate for all instances.
value to calculate for all instances. If the dataframe has more than two columns, any additional
columns will be added to the resulting feature matrix.

instance_ids (list): list of instances to calculate features on. Only
used if cutoff_time is a single datetime.
Expand Down
Expand Up @@ -490,3 +490,85 @@ def test_cutoff_time_naming(entityset):

with pytest.raises(AttributeError):
calculate_feature_matrix([dfeat], cutoff_time=cutoff_df_wrong_index_name)


def test_cutoff_time_extra_columns(entityset):
    """Extra (non-index, non-time) columns in the cutoff-time dataframe
    should be passed through into the calculated feature matrix, both with
    and without approximation, and sorted consistently with the other rows.
    """
    es = entityset

    agg_feat = Count(es['customers']['id'], es['regions'])
    dfeat = DirectFeature(agg_feat, es['customers'])

    # 'label' is the extra column; 'time' and 'instance_id' are consumed as
    # the cutoff time and index, so only 'label' should pass through.
    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'),
                                       pd.Timestamp('2011-04-09 10:30:03'),
                                       pd.Timestamp('2011-04-08 10:30:00')],
                              'instance_id': [0, 1, 0],
                              'label': [True, True, False]},
                             columns=['time', 'instance_id', 'label'])
    fm = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df)
    # check column was added to end of matrix
    assert 'label' == fm.columns[-1]
    # check column was sorted by time like the rest of the feature matrix
    true_series = pd.Series([False, True, True], index=[0, 1, 0])
    assert (fm['label'] == true_series).all()

    fm_2 = calculate_feature_matrix([dfeat],
                                    cutoff_time=cutoff_df,
                                    approximate="2 days")
    # check column was added to end of matrix
    assert 'label' in fm_2.columns
    # check column was sorted by time like the rest of the feature matrix
    true_series = pd.Series([False, True, True], index=[0, 1, 0])
    assert (fm_2['label'] == true_series).all()


def test_cfm_returns_original_time_indexes(entityset):
    """The feature matrix's (instance_id, time) MultiIndex should always hold
    the original cutoff times, regardless of whether approximation binned
    several cutoffs into the same window.
    """
    es = entityset

    agg_feat = Count(es['customers']['id'], es['regions'])
    dfeat = DirectFeature(agg_feat, es['customers'])
    agg_feat_2 = Count(es['sessions']['id'], es['customers'])
    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'),
                                       pd.Timestamp('2011-04-09 10:30:03'),
                                       pd.Timestamp('2011-04-08 10:30:00')],
                              'instance_id': [0, 1, 0]})
    sorted_df = cutoff_df.sort_values(['time', 'instance_id'], kind='mergesort')

    def assert_index_matches(matrix):
        # level 0 carries instance ids, level 1 the original cutoff times;
        # both must line up with the time-sorted cutoff dataframe
        instance_level_vals = matrix.index.get_level_values(0).values
        time_level_vals = matrix.index.get_level_values(1).values
        assert (instance_level_vals == sorted_df['instance_id'].values).all()
        assert (time_level_vals == sorted_df['time'].values).all()

    # no approximate
    fm = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df,
                                  cutoff_time_in_index=True)
    assert_index_matches(fm)

    # approximate, in different windows, no unapproximated aggs
    fm2 = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True, approximate="1 m")
    assert_index_matches(fm2)

    # approximate, in different windows, unapproximated aggs
    fm2 = calculate_feature_matrix([dfeat, agg_feat_2], cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True, approximate="1 m")
    assert_index_matches(fm2)

    # approximate, in same window, no unapproximated aggs
    fm3 = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True, approximate="2 d")
    assert_index_matches(fm3)

    # approximate, in same window, unapproximated aggs
    fm3 = calculate_feature_matrix([dfeat, agg_feat_2], cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True, approximate="2 d")
    assert_index_matches(fm3)