Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding non-feature columns to calculated feature matrix #78

Merged
merged 12 commits into from Feb 5, 2018
2 changes: 1 addition & 1 deletion .circleci/config.yml
Expand Up @@ -3,7 +3,7 @@ jobs:
build:
working_directory: ~/featuretools
docker:
- image: themattrix/tox
- image: painless/tox
steps:
- checkout
- run: pyenv local 2.7.13 3.5.2 3.6.0
Expand Down
10 changes: 10 additions & 0 deletions docs/source/automated_feature_engineering/handling_time.rst
Expand Up @@ -71,8 +71,18 @@ There is one row in the feature matrix corresponding to a row in ``cutoff_times`
cutoff_time_in_index=True)
feature_matrix

It is often the case that we want our labels in our calculated feature matrix so that the ordering is consistent between the labels and the rows of the feature matrix. However, adding labels to the initial dataframe means that you would have to explicitly prohibit ``dfs`` from building features with that column. To bypass this, we can provide additional columns to cutoff times which will be added directly to the feature matrix. While the first two columns will be used as an index and cutoff time regardless of their order in the dataframe, any additional columns will appear as features in the resulting feature matrix.
Copy link
Contributor

@kmax12 kmax12 Jan 30, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we may support mixing the order up if you name things correctly, but let’s document instance id first, followed by cutoff time. just to make sure people are consistent


.. ipython:: python

cutoff_times['label'] = pd.Series([0, 0, 1, 0, 1])

feature_matrix, features = ft.dfs(entityset=es,
target_entity="customers",
cutoff_time=cutoff_times,
cutoff_time_in_index=True)

feature_matrix['label']

Running DFS with training windows
---------------------------------
Expand Down
85 changes: 56 additions & 29 deletions featuretools/computational_backends/calculate_feature_matrix.py
Expand Up @@ -43,7 +43,8 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,
the features for each instance at. Can either be a DataFrame with
'instance_id' and 'time' columns, DataFrame with the name of the
index variable in the target entity and a time column, a list of values, or a single
value to calculate for all instances.
value to calculate for all instances. If the dataframe has more than two columns, any additional
columns will be added to the resulting feature matrix.

instance_ids (list): list of instances to calculate features on. Only
used if cutoff_time is a single datetime.
Expand Down Expand Up @@ -99,6 +100,7 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,

entityset = features[0].entityset
target_entity = features[0].entity
pass_columns = []

if not isinstance(cutoff_time, pd.DataFrame):
if cutoff_time is None:
Expand Down Expand Up @@ -130,6 +132,7 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,
# take the first column that isn't instance_id and assume it is time
not_instance_id = [c for c in cutoff_time.columns if c != "instance_id"]
cutoff_time.rename(columns={not_instance_id[0]: "time"}, inplace=True)
pass_columns = [column_name for column_name in cutoff_time.columns[2:]]

# Get dictionary of features to approximate
if approximate is not None:
Expand Down Expand Up @@ -186,7 +189,7 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,
training_window, profile, verbose,
save_progress, backend,
no_unapproximated_aggs, cutoff_df_time_var,
target_time)
target_time, pass_columns)
feature_matrix.append(_feature_matrix)
# Do a manual garbage collection in case objects from calculate_batch
# weren't collected automatically
Expand All @@ -202,21 +205,27 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,
return feature_matrix


def calculate_batch(features, group, approximate, entityset, backend_verbose, training_window,
profile, verbose, save_progress, backend,
no_unapproximated_aggs, cutoff_df_time_var, target_time):
def calculate_batch(features, group, approximate, entityset, backend_verbose,
training_window, profile, verbose, save_progress, backend,
no_unapproximated_aggs, cutoff_df_time_var, target_time,
pass_columns):
# if approximating, calculate the approximate features
if approximate is not None:
precalculated_features, all_approx_feature_set = approximate_features(features,
group,
window=approximate,
entityset=entityset,
training_window=training_window,
verbose=backend_verbose,
profile=profile)
precalculated_features, all_approx_feature_set = approximate_features(
features,
group,
window=approximate,
entityset=entityset,
training_window=training_window,
verbose=backend_verbose,
profile=profile
)
else:
precalculated_features = None
all_approx_feature_set = None

# if backend verbose wasn't set explicitly, set to True if verbose is true
# and there is only 1 cutoff time
if backend_verbose is None:
one_cutoff_time = group[cutoff_df_time_var].nunique() == 1
backend_verbose = verbose and one_cutoff_time
Expand All @@ -231,6 +240,7 @@ def calc_results(time_last, ids, precalculated_features=None, training_window=No
verbose=backend_verbose)
return matrix

# if all aggregations have been approximated, can calculate all together
if no_unapproximated_aggs and approximate is not None:
grouped = [[datetime.now(), group]]
else:
Expand All @@ -242,29 +252,46 @@ def calc_results(time_last, ids, precalculated_features=None, training_window=No

feature_matrix = []
for _time_last_to_calc, group in grouped:
time_last = group[cutoff_df_time_var].iloc[0]
# sort group by instance id
ids = group['instance_id'].sort_values().values

time_last = group[cutoff_df_time_var].iloc[0]
if no_unapproximated_aggs and approximate is not None:
window = None
else:
window = training_window

_feature_matrix = calc_results(_time_last_to_calc, ids, precalculated_features=precalculated_features, training_window=window)

# this can occur when the features for an instance are calculated at
# multiple cutoff times which were binned to the same frequency.
if len(_feature_matrix) != len(group):
indexer = group[['instance_id', cutoff_df_time_var]]
_feature_matrix = (indexer.merge(_feature_matrix,
left_on=['instance_id'],
right_index=True,
how='left')
.set_index('instance_id')
.drop([cutoff_df_time_var], axis=1))

time_index = pd.DatetimeIndex([time_last] * _feature_matrix.shape[0], name='time')
_feature_matrix.set_index(time_index, append=True, inplace=True)
# calculate values for those instances at time _time_last_to_calc
_feature_matrix = calc_results(_time_last_to_calc,
ids,
precalculated_features=precalculated_features,
training_window=window)

id_name = _feature_matrix.index.name

# if approximate, merge feature matrix with group frame to get original
# cutoff times and passed columns
if approximate:
indexer = group[['instance_id', target_time] + pass_columns]
_feature_matrix = indexer.merge(_feature_matrix,
left_on=['instance_id'],
right_index=True,
how='left')
_feature_matrix.set_index(['instance_id', target_time], inplace=True)
_feature_matrix.index.set_names([id_name, 'time'], inplace=True)
_feature_matrix.sort_index(level=1, kind='mergesort', inplace=True)
else:
# all rows have same cutoff time. set time and add passed columns
num_rows = _feature_matrix.shape[0]
time_index = pd.DatetimeIndex([time_last] * num_rows, name='time')
_feature_matrix.set_index(time_index, append=True, inplace=True)
if len(pass_columns) > 0:
pass_through = group[['instance_id', cutoff_df_time_var] + pass_columns]
pass_through.rename(columns={'instance_id': id_name,
cutoff_df_time_var: 'time'},
inplace=True)
pass_through.set_index([id_name, 'time'], inplace=True)
for col in pass_columns:
_feature_matrix[col] = pass_through[col]
feature_matrix.append(_feature_matrix)

feature_matrix = pd.concat(feature_matrix)
Expand Down
3 changes: 2 additions & 1 deletion featuretools/synthesis/dfs.py
Expand Up @@ -51,7 +51,8 @@ def dfs(entities=None,
the features for each instance at. Can either be a DataFrame with
'instance_id' and 'time' columns, DataFrame with the name of the
index variable in the target entity and a time column, a list of values, or a single
value to calculate for all instances.
value to calculate for all instances. If the dataframe has more than two columns, any additional
columns will be added to the resulting feature matrix.

instance_ids (list): list of instances to calculate features on. Only
used if cutoff_time is a single datetime.
Expand Down
Expand Up @@ -490,3 +490,85 @@ def test_cutoff_time_naming(entityset):

with pytest.raises(AttributeError):
calculate_feature_matrix([dfeat], cutoff_time=cutoff_df_wrong_index_name)


def test_cutoff_time_extra_columns(entityset):
    """Extra (non-index, non-time) columns in the cutoff-time dataframe
    should be passed through into the calculated feature matrix, both with
    and without approximation, and sorted consistently with the other rows.
    """
    es = entityset

    agg_feat = Count(es['customers']['id'], es['regions'])
    dfeat = DirectFeature(agg_feat, es['customers'])

    # 'label' is the extra column; 'time' and 'instance_id' are consumed as
    # the cutoff time and index, so only 'label' should pass through.
    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'),
                                       pd.Timestamp('2011-04-09 10:30:03'),
                                       pd.Timestamp('2011-04-08 10:30:00')],
                              'instance_id': [0, 1, 0],
                              'label': [True, True, False]},
                             columns=['time', 'instance_id', 'label'])
    fm = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df)
    # check column was added to end of matrix
    assert 'label' == fm.columns[-1]
    # check column was sorted by time like the rest of the feature matrix
    true_series = pd.Series([False, True, True], index=[0, 1, 0])
    assert (fm['label'] == true_series).all()

    fm_2 = calculate_feature_matrix([dfeat],
                                    cutoff_time=cutoff_df,
                                    approximate="2 days")
    # check column was added to end of matrix
    assert 'label' in fm_2.columns
    # check column was sorted by time like the rest of the feature matrix
    true_series = pd.Series([False, True, True], index=[0, 1, 0])
    assert (fm_2['label'] == true_series).all()


def test_cfm_returns_original_time_indexes(entityset):
    """The feature matrix's (instance_id, time) MultiIndex should always hold
    the original cutoff times, regardless of whether approximation binned
    several cutoffs into the same window.
    """
    es = entityset

    agg_feat = Count(es['customers']['id'], es['regions'])
    dfeat = DirectFeature(agg_feat, es['customers'])
    agg_feat_2 = Count(es['sessions']['id'], es['customers'])
    cutoff_df = pd.DataFrame({'time': [pd.Timestamp('2011-04-09 10:30:06'),
                                       pd.Timestamp('2011-04-09 10:30:03'),
                                       pd.Timestamp('2011-04-08 10:30:00')],
                              'instance_id': [0, 1, 0]})
    sorted_df = cutoff_df.sort_values(['time', 'instance_id'], kind='mergesort')

    def assert_index_matches(matrix):
        # level 0 carries instance ids, level 1 the original cutoff times;
        # both must line up with the time-sorted cutoff dataframe
        instance_level_vals = matrix.index.get_level_values(0).values
        time_level_vals = matrix.index.get_level_values(1).values
        assert (instance_level_vals == sorted_df['instance_id'].values).all()
        assert (time_level_vals == sorted_df['time'].values).all()

    # no approximate
    fm = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df,
                                  cutoff_time_in_index=True)
    assert_index_matches(fm)

    # approximate, in different windows, no unapproximated aggs
    fm2 = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True, approximate="1 m")
    assert_index_matches(fm2)

    # approximate, in different windows, unapproximated aggs
    fm2 = calculate_feature_matrix([dfeat, agg_feat_2], cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True, approximate="1 m")
    assert_index_matches(fm2)

    # approximate, in same window, no unapproximated aggs
    fm3 = calculate_feature_matrix([dfeat], cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True, approximate="2 d")
    assert_index_matches(fm3)

    # approximate, in same window, unapproximated aggs
    fm3 = calculate_feature_matrix([dfeat, agg_feat_2], cutoff_time=cutoff_df,
                                   cutoff_time_in_index=True, approximate="2 d")
    assert_index_matches(fm3)