Change feature calculation to return in order of instance ids provided #676

Merged · 51 commits · Sep 30, 2019
533fb2f
first pass
kmax12 Jun 30, 2019
7f3c697
chunk size implemented
kmax12 Jul 1, 2019
820d7e2
call progress callback as each feature is calculated when possible
kmax12 Jul 1, 2019
8a97b76
add time remaining updates
kmax12 Jul 3, 2019
7e60330
updates to docs
kmax12 Jul 8, 2019
791b419
test chunk size
kmax12 Jul 9, 2019
cce3bc1
test _chunk_dataframe_groups
kmax12 Jul 9, 2019
2f6c414
centralize progress bar logic and add one more callback call to featu…
kmax12 Jul 10, 2019
f6d9fa4
passing tests
kmax12 Jul 10, 2019
1293efe
merged
kmax12 Jul 10, 2019
a431038
Update changelog.rst
kmax12 Jul 10, 2019
535c096
Merge branch 'master' into update-progress-bar
kmax12 Jul 10, 2019
8223e06
update docs
kmax12 Jul 10, 2019
e81c663
Merge branch 'update-progress-bar' of https://github.com/Featuretools…
kmax12 Jul 10, 2019
66865eb
Merge branch 'master' into update-progress-bar
kmax12 Jul 10, 2019
e9142d8
update handling verbose
kmax12 Jul 10, 2019
7f70ef4
Merge branch 'update-progress-bar' of https://github.com/Featuretools…
kmax12 Jul 10, 2019
fc44211
fix linting
kmax12 Jul 10, 2019
a784ac8
add progress callback test
kmax12 Jul 10, 2019
41ab53d
update test
kmax12 Jul 10, 2019
4117a7f
test when no data
kmax12 Jul 10, 2019
30e9abb
fix test
kmax12 Jul 11, 2019
12294b3
add test
kmax12 Jul 11, 2019
292f6a3
add an explicit refresh
kmax12 Jul 11, 2019
791b99a
add close
kmax12 Jul 11, 2019
aea864a
test
kmax12 Jul 11, 2019
672af36
undo test
kmax12 Jul 11, 2019
7a0793e
Merge branch 'master' into update-progress-bar
kmax12 Jul 15, 2019
759e4b0
Update performance.rst
kmax12 Jul 15, 2019
3df97a2
Update performance.rst
kmax12 Jul 15, 2019
ed0f1f4
add reinex and fix tests
kmax12 Jul 17, 2019
473bcbb
Update changelog.rst
kmax12 Jul 17, 2019
3105707
Merge branch 'master' into update-progress-bar
kmax12 Jul 19, 2019
1b94065
Update performance.rst
kmax12 Jul 19, 2019
08b36fb
Merge branch 'update-progress-bar' into reindex
kmax12 Jul 19, 2019
b6f2743
add test
kmax12 Jul 19, 2019
2423ba7
only sort input dataframe if time index is present
kmax12 Jul 23, 2019
96349a3
lint
kmax12 Jul 23, 2019
46251cf
Merge branch 'master' into reindex
kmax12 Aug 15, 2019
b5318e9
fix test by specifying instance id order
kmax12 Aug 15, 2019
0594f8d
Changed schema version
jeremyliweishih Aug 15, 2019
2c55513
Revert schema and changed s3 urls
jeremyliweishih Aug 15, 2019
9ec6ec7
Merge branch 'master' into reindex
kmax12 Sep 16, 2019
a02416d
Update calculate_feature_matrix.py
kmax12 Sep 16, 2019
8a0878e
Update changelog.rst
kmax12 Sep 16, 2019
fdb26b0
Merge branch 'master' into reindex
rwedge Sep 25, 2019
2e39061
Update changelog.rst
kmax12 Sep 26, 2019
fed55b7
Merge branch 'master' into reindex
kmax12 Sep 26, 2019
9cec1ca
fix test
kmax12 Sep 26, 2019
09fcce9
Merge branch 'master' into reindex
rwedge Sep 30, 2019
29406ba
add breaking change warning
rwedge Sep 30, 2019
5 changes: 4 additions & 1 deletion docs/source/changelog.rst
@@ -19,6 +19,7 @@ Changelog
* Don't delete the whole destination folder while saving entityset (:pr:`717`)
* Changes
* Raise warning and not error on schema version mismatch (:pr:`718`)
+* Change feature calculation to return in order of instance ids provided (:pr:`676`)
* Removed time remaining from displayed progress bar in dfs() and calculate_feature_matrix() (:pr:`739`)
* Raise warning in normalize_entity() when time_index of base_entity has an invalid type (:pr:`749`)
* Remove toolz as a direct dependency (:pr:`755`)
@@ -31,6 +32,9 @@ Changelog
Thanks to the following people for contributing to this release:
:user:`jeff-hernandez`, :user:`chidauri`, :user:`christopherbunn`, :user:`kmax12`, :user:`MarcoGorelli`, :user:`angela97lin`, :user:`frances-h`, :user:`rwedge`, :user:`thehomebrewnerd`

+**Breaking Changes**
+
+* Feature calculations will return in the order of instance ids provided instead of the order of time points instances are calculated at.

**v0.10.1 Aug 25, 2019**
* Fixes
@@ -83,7 +87,6 @@ Changelog
:user:`CJStadler`, :user:`ctduffy`, :user:`gsheni`, :user:`jeff-hernandez`,
:user:`jeremyliweishih`, :user:`kmax12`, :user:`rwedge`, :user:`zhxt95`,


**v0.9.1 July 3, 2019**
* Enhancements
* Speedup groupby transform calculations (:pr:`609`)
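To make the breaking change concrete, here is a minimal sketch of the new ordering contract. It uses the mock customer dataset bundled with featuretools; the specific cutoff timestamps and the [1, 3, 2] ordering are illustrative assumptions, not values taken from this PR's tests.

import pandas as pd
import featuretools as ft

# Demo entityset shipped with featuretools, used purely for illustration.
es = ft.demo.load_mock_customer(return_entityset=True)

# Cutoff times deliberately NOT ordered by time: instance 3 has the earliest
# cutoff, so the pre-#676 behavior would have returned its row first.
cutoff_time = pd.DataFrame({
    "instance_id": [1, 3, 2],
    "time": [pd.Timestamp("2014-01-01 05:00:00"),
             pd.Timestamp("2014-01-01 03:00:00"),
             pd.Timestamp("2014-01-01 04:00:00")],
})

fm, feature_defs = ft.dfs(entityset=es,
                          target_entity="customers",
                          cutoff_time=cutoff_time,
                          max_depth=1)

# After this PR, rows come back in the order the instance ids were provided.
assert list(fm.index) == [1, 3, 2]
# Before this PR, rows came back sorted by cutoff time, i.e. [3, 2, 1].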
3 changes: 2 additions & 1 deletion featuretools/computational_backends/calculate_feature_matrix.py
@@ -282,7 +282,8 @@ def calculate_feature_matrix(features, entityset=None, cutoff_time=None, instanc
progress_bar=progress_bar,
progress_callback=progress_callback)

-feature_matrix.sort_index(level='time', kind='mergesort', inplace=True)
+# ensure rows are sorted by input order
+feature_matrix = feature_matrix.reindex(cutoff_time[["instance_id", "time"]])
if not cutoff_time_in_index:
feature_matrix.reset_index(level='time', drop=True, inplace=True)

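The reindex call above is the heart of the change. Below is a self-contained pandas sketch of the same step, with made-up ids, times, and a hypothetical COUNT(log) column; the production line passes the two cutoff_time columns straight to reindex, while this sketch builds the MultiIndex explicitly.

import pandas as pd

# A feature matrix as produced internally: indexed by (instance_id, time),
# currently in calculation order rather than the caller's input order.
fm = pd.DataFrame(
    {"COUNT(log)": [7, 2, 5]},
    index=pd.MultiIndex.from_tuples(
        [(2, pd.Timestamp("2019-01-01")),
         (0, pd.Timestamp("2019-01-02")),
         (1, pd.Timestamp("2019-01-03"))],
        names=["instance_id", "time"]))

# The caller's cutoff_time frame defines the desired row order.
cutoff_time = pd.DataFrame({
    "instance_id": [0, 1, 2],
    "time": [pd.Timestamp("2019-01-02"),
             pd.Timestamp("2019-01-03"),
             pd.Timestamp("2019-01-01")]})

# Reindexing on both levels restores input order without re-sorting by time.
ordered = fm.reindex(pd.MultiIndex.from_frame(cutoff_time[["instance_id", "time"]]))
print(ordered["COUNT(log)"].tolist())  # [2, 5, 7]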
6 changes: 2 additions & 4 deletions featuretools/entityset/entity.py
@@ -82,8 +82,7 @@ def __init__(self, id, df, entityset, variable_types=None,
self.time_index = None
if time_index:
self.set_time_index(time_index, already_sorted=already_sorted)
-elif not already_sorted:
-    self.df.sort_index(kind="mergesort", inplace=True)
+
self.set_secondary_time_index(secondary_time_index)

def __repr__(self):
@@ -331,8 +330,7 @@ def update_data(self, df, already_sorted=False,
self.set_index(self.index)
if self.time_index is not None:
self.set_time_index(self.time_index, already_sorted=already_sorted)
-elif not already_sorted:
-    self.df.sort_index(kind="mergesort", inplace=True)
+
self.set_secondary_time_index(self.secondary_time_index)
if recalculate_last_time_indexes and self.last_time_index is not None:
self.entityset.add_last_time_indexes(updated_entities=[self.id])
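These two hunks drop the fallback sort for entities without a time index: previously the dataframe was stable-sorted by its index on load and on update_data(); now it is left exactly as the user supplied it. A toy pandas sketch of the difference (the index and values are made up):

import pandas as pd

# A dataframe handed to an entity that has no time index, in caller order.
df = pd.DataFrame({"value": [10, 30, 20]}, index=[0, 2, 1])

# Old behavior: a stable mergesort by index rearranged the rows.
print(df.sort_index(kind="mergesort").index.tolist())  # [0, 1, 2]

# New behavior: no sort is applied, so caller-supplied row order survives.
print(df.index.tolist())  # [0, 2, 1]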
featuretools/tests/computational_backend/test_calculate_feature_matrix.py
@@ -103,6 +103,14 @@ def test_calc_feature_matrix(es):
entityset=es,
cutoff_time=cutoff_times_dup)

+cutoff_reordered = cutoff_time.iloc[[-1, 10, 1]]  # 3 ids not ordered by cutoff time
+feature_matrix = calculate_feature_matrix([property_feature],
+                                          es,
+                                          cutoff_time=cutoff_reordered,
+                                          verbose=True)
+
+assert all(feature_matrix.index == cutoff_reordered["id"].values)
+

def test_cfm_approximate_correct_ordering():
trips = {
@@ -242,7 +250,7 @@ def test_cutoff_time_correctly(es):
es,
cutoff_time=cutoff_time)

-labels = [0, 10, 5]
+labels = [10, 5, 0]
assert (feature_matrix[property_feature.get_name()] == labels).values.all()


@@ -681,19 +689,17 @@ def test_cutoff_time_extra_columns(es):
fm = calculate_feature_matrix([dfeat], es, cutoff_time=cutoff_df)
# check column was added to end of matrix
assert 'label' == fm.columns[-1]
-# check column was sorted by time like the rest of the feature matrix
-true_series = pd.Series([False, True, True], index=[0, 1, 0])
-assert (fm['label'] == true_series).all()

+assert (fm['label'].values == cutoff_df['label'].values).all()

fm_2 = calculate_feature_matrix([dfeat],
es,
cutoff_time=cutoff_df,
approximate="2 days")
# check column was added to end of matrix
assert 'label' in fm_2.columns
-# check column was sorted by time like the rest of the feature matrix
-true_series = pd.Series([False, True, True], index=[0, 1, 0])
-assert (fm_2['label'] == true_series).all()

+assert (fm_2['label'].values == cutoff_df['label'].values).all()


def test_instances_after_cutoff_time_removed(es):
@@ -706,7 +712,7 @@

# Customer with id 1 should be removed
actual_ids = [id for (id, _) in fm.index]
-assert actual_ids == [0, 2]
+assert set(actual_ids) == set([2, 0])


def test_instances_with_id_kept_after_cutoff(es):
@@ -721,7 +727,7 @@
# Customer #1 is after cutoff, but since it is included in instance_ids it
# should be kept.
actual_ids = [id for (id, _) in fm.index]
-assert actual_ids == [0, 1, 2]
+assert set(actual_ids) == set([0, 1, 2])


def test_cfm_returns_original_time_indexes(es):
@@ -732,48 +738,47 @@
pd.Timestamp('2011-04-09 10:30:03'),
pd.Timestamp('2011-04-08 10:30:00')],
'instance_id': [0, 1, 0]})
-sorted_df = cutoff_df.sort_values(['time', 'instance_id'], kind='mergesort')

# no approximate
fm = calculate_feature_matrix([dfeat],
es, cutoff_time=cutoff_df,
cutoff_time_in_index=True)
instance_level_vals = fm.index.get_level_values(0).values
time_level_vals = fm.index.get_level_values(1).values
-assert (instance_level_vals == sorted_df['instance_id'].values).all()
-assert (time_level_vals == sorted_df['time'].values).all()
+assert (instance_level_vals == cutoff_df['instance_id'].values).all()
+assert (time_level_vals == cutoff_df['time'].values).all()

# approximate, in different windows, no unapproximated aggs
fm2 = calculate_feature_matrix([dfeat], es, cutoff_time=cutoff_df,
cutoff_time_in_index=True, approximate="1 m")
instance_level_vals = fm2.index.get_level_values(0).values
time_level_vals = fm2.index.get_level_values(1).values
-assert (instance_level_vals == sorted_df['instance_id'].values).all()
-assert (time_level_vals == sorted_df['time'].values).all()
+assert (instance_level_vals == cutoff_df['instance_id'].values).all()
+assert (time_level_vals == cutoff_df['time'].values).all()

# approximate, in different windows, unapproximated aggs
fm2 = calculate_feature_matrix([dfeat, agg_feat_2], es, cutoff_time=cutoff_df,
cutoff_time_in_index=True, approximate="1 m")
instance_level_vals = fm2.index.get_level_values(0).values
time_level_vals = fm2.index.get_level_values(1).values
-assert (instance_level_vals == sorted_df['instance_id'].values).all()
-assert (time_level_vals == sorted_df['time'].values).all()
+assert (instance_level_vals == cutoff_df['instance_id'].values).all()
+assert (time_level_vals == cutoff_df['time'].values).all()

# approximate, in same window, no unapproximated aggs
fm3 = calculate_feature_matrix([dfeat], es, cutoff_time=cutoff_df,
cutoff_time_in_index=True, approximate="2 d")
instance_level_vals = fm3.index.get_level_values(0).values
time_level_vals = fm3.index.get_level_values(1).values
-assert (instance_level_vals == sorted_df['instance_id'].values).all()
-assert (time_level_vals == sorted_df['time'].values).all()
+assert (instance_level_vals == cutoff_df['instance_id'].values).all()
+assert (time_level_vals == cutoff_df['time'].values).all()

# approximate, in same window, unapproximated aggs
fm3 = calculate_feature_matrix([dfeat, agg_feat_2], es, cutoff_time=cutoff_df,
cutoff_time_in_index=True, approximate="2 d")
instance_level_vals = fm3.index.get_level_values(0).values
time_level_vals = fm3.index.get_level_values(1).values
-assert (instance_level_vals == sorted_df['instance_id'].values).all()
-assert (time_level_vals == sorted_df['time'].values).all()
+assert (instance_level_vals == cutoff_df['instance_id'].values).all()
+assert (time_level_vals == cutoff_df['time'].values).all()


def test_dask_kwargs(es):
@@ -1135,6 +1140,7 @@ def test_some_instances_not_in_data(es):
a_time = datetime(2011, 4, 10, 10, 41, 9) # only valid data
b_time = datetime(2011, 4, 10, 11, 10, 5) # some missing data
c_time = datetime(2011, 4, 10, 12, 0, 0) # all missing data

times = [a_time, b_time, a_time, a_time, b_time, b_time] + [c_time] * 4
cutoff_time = pd.DataFrame({"instance_id": list(range(12, 22)),
"time": times})
@@ -1149,12 +1155,11 @@
entityset=es,
cutoff_time=cutoff_time)

-index_answer = [12, 14, 15, 13, 16, 17, 18, 19, 20, 21]
-ifeat_answer = [0, 14, np.nan, 7] + [np.nan] * 6
-prop_answer = [0, 1, np.nan, 0, 0] + [np.nan] * 5
-dfeat_answer = [14, 14, np.nan, 14] + [np.nan] * 6
+ifeat_answer = [0, 7, 14, np.nan] + [np.nan] * 6
+prop_answer = [0, 0, 1, np.nan, 0] + [np.nan] * 5
+dfeat_answer = [14, 14, 14, np.nan] + [np.nan] * 6

-assert all(fm.index.values == index_answer)
+assert all(fm.index.values == cutoff_time["instance_id"].values)
for x, y in zip(fm.columns, [ifeat_answer, prop_answer, dfeat_answer]):
np.testing.assert_array_equal(fm[x], y)

@@ -1163,10 +1168,11 @@
cutoff_time=cutoff_time,
approximate="5 seconds")

-dfeat_answer[0:2] = [7, 7]  # approximate calculated before 14 appears
-prop_answer[2] = 0  # no_unapproximated_aggs code ignores cutoff time
+dfeat_answer[0] = 7  # approximate calculated before 14 appears
+dfeat_answer[2] = 7  # approximate calculated before 14 appears
+prop_answer[3] = 0  # no_unapproximated_aggs code ignores cutoff time

-assert all(fm.index.values == index_answer)
+assert all(fm.index.values == cutoff_time["instance_id"].values)
for x, y in zip(fm.columns, [ifeat_answer, prop_answer, dfeat_answer]):
np.testing.assert_array_equal(fm[x], y)

2 changes: 1 addition & 1 deletion featuretools/tests/entityset_tests/test_entity.py
@@ -82,7 +82,7 @@ def test_update_data(es):
df = es["sessions"].df.copy(deep=True)
df["id"].iloc[1:3] = [2, 1]
es["sessions"].update_data(df.copy(deep=True))
-assert es["sessions"].df["id"].iloc[1] == 1
+assert es["sessions"].df["id"].iloc[1] == 2  # no sorting since time index not defined
es["sessions"].update_data(df.copy(deep=True), already_sorted=True)
assert es["sessions"].df["id"].iloc[1] == 2

3 changes: 2 additions & 1 deletion featuretools/tests/entityset_tests/test_es.py
@@ -444,6 +444,7 @@ def test_concat_entitysets(es):
es_1 = copy.deepcopy(es)
es_2 = copy.deepcopy(es)

+# map of what rows to take from es_1 and es_2 for each entity
emap = {
'log': [list(range(10)) + [14, 15, 16], list(range(10, 14)) + [15, 16]],
'sessions': [[0, 1, 2, 5], [1, 3, 4, 5]],
@@ -480,7 +481,7 @@
assert old_es_1.__eq__(es_1, deep=True)
assert old_es_2.__eq__(es_2, deep=True)

-assert es_3.__eq__(es, deep=True)
+assert es_3.__eq__(es)
Review comment (Contributor):
If this is due to no longer sorting data for entities without time indexes, we could potentially not split up the data of non-time entities and then we could still do a deep comparison

Reply (Contributor Author):
below, we check for the fact that the dataframes have the same data after concatenation, so I think it's fine to just remove the deep check here and not mess with the test

for entity in es.entities:
df = es[entity.id].df.sort_index()
df_3 = es_3[entity.id].df.sort_index()
4 changes: 2 additions & 2 deletions featuretools/tests/entityset_tests/test_serialization.py
@@ -21,8 +21,8 @@
BUCKET_NAME = "test-bucket"
WRITE_KEY_NAME = "test-key"
TEST_S3_URL = "s3://{}/{}".format(BUCKET_NAME, WRITE_KEY_NAME)
-S3_URL = "s3://featuretools-static/test_serialization_data_1.0.0.tar"
-URL = 'https://featuretools-static.s3.amazonaws.com/test_serialization_data_1.0.0.tar'
+S3_URL = "s3://featuretools-static/test_serialization_data_2.0.0.tar"
+URL = 'https://featuretools-static.s3.amazonaws.com/test_serialization_data_2.0.0.tar'
TEST_KEY = "test_access_key_es"


27 changes: 11 additions & 16 deletions featuretools/tests/primitive_tests/test_agg_feats.py
@@ -592,24 +592,17 @@ def pd_top3(x):

fm, features = ft.dfs(entityset=es,
target_entity="customers",
+instance_ids=[0, 1, 2],
agg_primitives=[NMostCommoner],
trans_primitives=[])

-true_results = pd.DataFrame([
-    ['coke zero', 'toothpaste', "car"],
-    ['coke zero', 'Haribo sugar-free gummy bears', np.nan],
-    ['taco clock', np.nan, np.nan]
-])
df = fm[["PD_TOP3(log.product_id)[%s]" % i for i in range(3)]]
-for i in range(df.shape[0]):
-    if i == 0:
-        # coke zero and toothpaste have same number of occurrences
-        # so just check that the top two match
-        assert set(true_results.iloc[i].values[:2]) == set(df.iloc[i].values[:2])
-        assert df.iloc[0].values[2] in ("brown bag", "car")
-    else:
-        for i1, i2 in zip(true_results.iloc[i], df.iloc[i]):
-            assert (pd.isnull(i1) and pd.isnull(i2)) or (i1 == i2)

+assert set(df.iloc[0].values[:2]) == set(['coke zero', 'toothpaste'])  # coke zero and toothpaste have same number of occurrences
+assert df.iloc[0].values[2] in ['car', 'brown bag']  # so just check that the top two match
+
+assert df.iloc[1].reset_index(drop=True).equals(pd.Series(['coke zero', 'Haribo sugar-free gummy bears', np.nan]))
+assert df.iloc[2].reset_index(drop=True).equals(pd.Series(['taco clock', np.nan, np.nan]))


def test_stacking_multi(es):
@@ -620,7 +613,7 @@
for i in range(3):
stacked.append(ft.Feature(tc[i], parent_entity=es['customers'], primitive=NumUnique))

-fm = ft.calculate_feature_matrix(stacked, entityset=es)
+fm = ft.calculate_feature_matrix(stacked, entityset=es, instance_ids=[0, 1, 2])

correct_vals = [[3, 2, 1], [2, 1, 0], [0, 0, 0]]
correct_vals1 = [[3, 1, 1], [2, 1, 0], [0, 0, 0]]
@@ -640,7 +633,9 @@ def test_use_previous_pd_dateoffset(es):
use_previous=pd.DateOffset(hours=47, minutes=60),
primitive=Count)

-feature_matrix = ft.calculate_feature_matrix([total_events_pd], es, cutoff_time=pd.Timestamp('2011-04-11 10:31:30'))
+feature_matrix = ft.calculate_feature_matrix([total_events_pd], es,
+                                             cutoff_time=pd.Timestamp('2011-04-11 10:31:30'),
+                                             instance_ids=[0, 1, 2])
col_name = list(feature_matrix.head().keys())[0]
assert (feature_matrix[col_name] == [1, 5, 2]).all()

12 changes: 4 additions & 8 deletions featuretools/tests/synthesis/test_dfs_method.py
@@ -111,14 +111,10 @@ def test_approximate_features(entities, relationships):
direct_agg_feat_name = 'cards.PERCENT_TRUE(transactions.fraud)'
assert len(feature_matrix.index) == 6
assert len(feature_matrix.columns) == len(features)
-truth_index = pd.MultiIndex.from_arrays([[1, 3, 1, 5, 3, 6],
-                                         [11, 16, 16, 26, 17, 22]],
-                                        names=('id', 'time'))
-truth_values = pd.Series(data=[1.0, 0.5, 0.5, 1.0, 0.5, 1.0],
-                         index=truth_index)
-truth_values.sort_index(level='time', kind='mergesort', inplace=True)

-assert (feature_matrix[direct_agg_feat_name] == truth_values).all()

+truth_values = pd.Series(data=[1.0, 0.5, 0.5, 1.0, 0.5, 1.0])
+
+assert (feature_matrix[direct_agg_feat_name] == truth_values.values).all()


def test_all_variables(entities, relationships):
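The switch to comparing against truth_values.values rather than an index-aligned Series is deliberate: pandas == between two Series aligns on index labels and raises when they differ, which would bite now that the matrix keeps input order instead of the old time-sorted MultiIndex. A toy illustration with made-up values:

import pandas as pd

result = pd.Series([1.0, 0.5], index=[3, 1])  # feature matrix column, input order
truth = pd.Series([1.0, 0.5])                 # positional truth, default index

# Series == Series requires identical labels and raises otherwise, which is
# why the updated test compares positionally instead.
try:
    result == truth
except ValueError as err:
    print(err)  # Can only compare identically-labeled Series objects

# Comparing .values sidesteps index alignment entirely.
assert (result.values == truth.values).all()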