Skip to content

Commit

Permalink
No last time index update on normalize (#169)
Browse files Browse the repository at this point in the history
* do not calculate last time indexes on normalize

* update last_time_indexes at end of concat

* add tests for last_time_index element of update_data and Entityset.concat

* recalculate last time index only if last time index was present before

* fix concat es test
  • Loading branch information
rwedge authored and kmax12 committed Jun 18, 2018
1 parent 488c60f commit 200d7ce
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 3 deletions.
2 changes: 1 addition & 1 deletion featuretools/entityset/entity.py
Expand Up @@ -572,7 +572,7 @@ def update_data(self, df=None, data=None, already_sorted=False,
self.set_secondary_time_index(self.secondary_time_index)
if reindex:
self.index_data()
if recalculate_last_time_indexes:
if recalculate_last_time_indexes and self.last_time_index is not None:
self.entityset.add_last_time_indexes(updated_entities=[self.id])
self.add_all_variable_statistics()

Expand Down
9 changes: 8 additions & 1 deletion featuretools/entityset/entityset.py
Expand Up @@ -856,6 +856,8 @@ def concat(self, other, inplace=False):
combined_es = self
else:
combined_es = copy.deepcopy(self)

has_last_time_index = []
for entity in self.entities:
self_df = entity.df
other_df = other[entity.id].df
Expand All @@ -871,9 +873,14 @@ def concat(self, other, inplace=False):
combined_df.sort_values([entity.time_index, entity.index], inplace=True)
else:
combined_df.sort_index(inplace=True)
if (entity.last_time_index is not None or
other[entity.id].last_time_index is not None):
has_last_time_index.append(entity.id)
combined_es[entity.id].update_data(df=combined_df,
reindex=True,
recalculate_last_time_indexes=True)
recalculate_last_time_indexes=False)

combined_es.add_last_time_indexes(updated_entities=has_last_time_index)
return combined_es

###########################################################################
Expand Down
37 changes: 36 additions & 1 deletion featuretools/tests/entityset_tests/test_es.py
Expand Up @@ -430,6 +430,8 @@ def test_concat_entitysets(entityset):
make_index=True,
variable_types=vtypes,
dataframe=df)
entityset.add_last_time_indexes()

assert entityset.__eq__(entityset)
entityset_1 = copy.deepcopy(entityset)
entityset_2 = copy.deepcopy(entityset)
Expand All @@ -441,12 +443,21 @@ def test_concat_entitysets(entityset):
'test_entity': [[0, 1], [0, 2]],
}

entityset.add_last_time_indexes()
assert entityset.__eq__(entityset_1, deep=True)
assert entityset.__eq__(entityset_2, deep=True)

for i, es in enumerate([entityset_1, entityset_2]):
for entity, rows in emap.items():
df = es[entity].df
es[entity].update_data(df=df.loc[rows[i]])

assert 10 not in entityset_1['log'].last_time_index.index
assert 10 in entityset_2['log'].last_time_index.index
assert 9 in entityset_1['log'].last_time_index.index
assert 9 not in entityset_2['log'].last_time_index.index
assert not entityset.__eq__(entityset_1, deep=True)
assert not entityset.__eq__(entityset_2, deep=True)

# make sure internal indexes work before concat
regions = entityset_1['customers'].query_by_values(['United States'], variable_id=u'région_id')
assert regions.index.isin(entityset_1['customers'].df.index).all()
Expand All @@ -468,6 +479,30 @@ def test_concat_entitysets(entityset):
for column in df:
for x, y in zip(df[column], df_3[column]):
assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))
orig_lti = entityset[entity.id].last_time_index.sort_index()
new_lti = entityset_3[entity.id].last_time_index.sort_index()
for x, y in zip(orig_lti, new_lti):
assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))

entityset_1['stores'].last_time_index = None
entityset_1['test_entity'].last_time_index = None
entityset_2['test_entity'].last_time_index = None
entityset_4 = entityset_1.concat(entityset_2)
assert not entityset_4.__eq__(entityset, deep=True)
for entity in entityset.entities:
df = entityset[entity.id].df.sort_index()
df_4 = entityset_4[entity.id].df.sort_index()
for column in df:
for x, y in zip(df[column], df_4[column]):
assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))

if entity.id != 'test_entity':
orig_lti = entityset[entity.id].last_time_index.sort_index()
new_lti = entityset_4[entity.id].last_time_index.sort_index()
for x, y in zip(orig_lti, new_lti):
assert ((pd.isnull(x) and pd.isnull(y)) or (x == y))
else:
assert entityset_4[entity.id].last_time_index is None


def test_set_time_type_on_init():
Expand Down

0 comments on commit 200d7ce

Please sign in to comment.