Skip to content

Commit

Permalink
No last time index update on normalize (#169)
Browse files Browse the repository at this point in the history
* do not calculate last time indexes on normalize

* update last_time_indexes at end of concat

* add tests for last_time_index element of update_data and Entityset.concat

* recalculate last time index only if last time index was present before

* fix concat es test
  • Loading branch information
rwedge authored and kmax12 committed Jun 18, 2018
1 parent 488c60f commit 200d7ce
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 3 deletions.
2 changes: 1 addition & 1 deletion featuretools/entityset/entity.py
Expand Up @@ -572,7 +572,7 @@ def update_data(self, df=None, data=None, already_sorted=False,
self.set_secondary_time_index(self.secondary_time_index)
if reindex:
self.index_data()
if recalculate_last_time_indexes:
if recalculate_last_time_indexes and self.last_time_index is not None:
self.entityset.add_last_time_indexes(updated_entities=[self.id])
self.add_all_variable_statistics()

Expand Down
9 changes: 8 additions & 1 deletion featuretools/entityset/entityset.py
Expand Up @@ -856,6 +856,8 @@ def concat(self, other, inplace=False):
combined_es = self
else:
combined_es = copy.deepcopy(self)

has_last_time_index = []
for entity in self.entities:
self_df = entity.df
other_df = other[entity.id].df
Expand All @@ -871,9 +873,14 @@ def concat(self, other, inplace=False):
combined_df.sort_values([entity.time_index, entity.index], inplace=True)
else:
combined_df.sort_index(inplace=True)
if (entity.last_time_index is not None or
other[entity.id].last_time_index is not None):
has_last_time_index.append(entity.id)
combined_es[entity.id].update_data(df=combined_df,
reindex=True,
recalculate_last_time_indexes=True)
recalculate_last_time_indexes=False)

combined_es.add_last_time_indexes(updated_entities=has_last_time_index)
return combined_es

###########################################################################
Expand Down
37 changes: 36 additions & 1 deletion featuretools/tests/entityset_tests/test_es.py
Expand Up @@ -430,6 +430,8 @@ def test_concat_entitysets(entityset):
make_index=True,
variable_types=vtypes,
dataframe=df)
entityset.add_last_time_indexes()

assert entityset.__eq__(entityset)
entityset_1 = copy.deepcopy(entityset)
entityset_2 = copy.deepcopy(entityset)
Expand All @@ -441,12 +443,21 @@ def test_concat_entitysets(entityset):
'test_entity': [[0, 1], [0, 2]],
}

entityset.add_last_time_indexes()
assert entityset.__eq__(entityset_1, deep=True)
assert entityset.__eq__(entityset_2, deep=True)

for i, es in enumerate([entityset_1, entityset_2]):
for entity, rows in emap.items():
df = es[entity].df
es[entity].update_data(df=df.loc[rows[i]])

assert 10 not in entityset_1['log'].last_time_index.index
assert 10 in entityset_2['log'].last_time_index.index
assert 9 in entityset_1['log'].last_time_index.index
assert 9 not in entityset_2['log'].last_time_index.index
assert not entityset.__eq__(entityset_1, deep=True)
assert not entityset.__eq__(entityset_2, deep=True)

# make sure internal indexes work before concat
regions = entityset_1['customers'].query_by_values(['United States'], variable_id=u'région_id')
assert regions.index.isin(entityset_1['customers'].df.index).all()
Expand All @@ -468,6 +479,30 @@ def test_concat_entitysets(entityset):
for column in df:
for x, y in zip(df[column], df_3[column]):
assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))
orig_lti = entityset[entity.id].last_time_index.sort_index()
new_lti = entityset_3[entity.id].last_time_index.sort_index()
for x, y in zip(orig_lti, new_lti):
assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))

entityset_1['stores'].last_time_index = None
entityset_1['test_entity'].last_time_index = None
entityset_2['test_entity'].last_time_index = None
entityset_4 = entityset_1.concat(entityset_2)
assert not entityset_4.__eq__(entityset, deep=True)
for entity in entityset.entities:
df = entityset[entity.id].df.sort_index()
df_4 = entityset_4[entity.id].df.sort_index()
for column in df:
for x, y in zip(df[column], df_4[column]):
assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))

if entity.id != 'test_entity':
orig_lti = entityset[entity.id].last_time_index.sort_index()
new_lti = entityset_4[entity.id].last_time_index.sort_index()
for x, y in zip(orig_lti, new_lti):
assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))
else:
assert entityset_4[entity.id].last_time_index is None


def test_set_time_type_on_init():
Expand Down

0 comments on commit 200d7ce

Please sign in to comment.