Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add relationship_path to agg and direct features #544

Merged
merged 24 commits into from
May 29, 2019
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
785b618
Add relationship_path to agg and direct features
CJStadler May 13, 2019
a1f7caf
Fix infinite loop in depth first search
CJStadler May 16, 2019
502e0fe
Check entity equality using names
CJStadler May 16, 2019
3d087ea
Set relationship_path on FeatureBase
CJStadler May 20, 2019
6e0b30f
Update features JSON schema version
CJStadler May 22, 2019
f561279
Remove unnecessary generate_name
CJStadler May 22, 2019
476c039
Consolidate relationship serialization
CJStadler May 22, 2019
5dbed8a
Keyword arguments
CJStadler May 22, 2019
9ea907c
Make parent_name and child_name properties
CJStadler May 22, 2019
2a19712
Replace find_forward_path with find_forward_paths
CJStadler May 22, 2019
7494473
Fix docs
CJStadler May 22, 2019
1f356ef
Add test that loops are ignored
CJStadler May 22, 2019
1842822
Fix imports
CJStadler May 22, 2019
83031c1
Merge branch 'master' into feature-relationship-paths
CJStadler May 22, 2019
28e2412
Merge branch 'master' into feature-relationship-paths
CJStadler May 24, 2019
1762b59
Make parent_entity required for AggregationFeature
CJStadler May 28, 2019
7ba7907
Extract _handle_relationship_path to simplify constructors
CJStadler May 28, 2019
47cb131
Make child_entity required for DirectFeature constructor
CJStadler May 28, 2019
1d127e8
Add EntitySet.has_unique_path
CJStadler May 28, 2019
6f636b5
Add tests for finding paths with multiple relationships
CJStadler May 28, 2019
73a09f0
Rename next to next_entity
CJStadler May 28, 2019
eb38f8e
Remove Mean.generate_name
CJStadler May 28, 2019
e643cf3
Update feature copy to use path
CJStadler May 28, 2019
d86b9dc
Replace if with else
CJStadler May 29, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 16 additions & 2 deletions featuretools/entityset/entityset.py
Original file line number Diff line number Diff line change
Expand Up @@ -463,10 +463,11 @@ def _forward_entity_paths(self, start_entity_id, seen_entities=None):
yield start_entity_id, []

for relationship in self.get_forward_relationships(start_entity_id):
next = relationship.parent_entity.id
next_entity = relationship.parent_entity.id
# Copy seen entities for each next node to allow multiple paths (but
# not cycles).
for sub_entity_id, sub_path in self._forward_entity_paths(next, seen_entities.copy()):
descendants = self._forward_entity_paths(next_entity, seen_entities.copy())
for sub_entity_id, sub_path in descendants:
yield sub_entity_id, [relationship] + sub_path

def get_forward_entities(self, entity_id, deep=False):
Expand Down Expand Up @@ -565,6 +566,19 @@ def path_relationships(self, path, start_entity_id):
prev_entity = r.parent_variable.entity.id
return rels

def has_unique_forward_path(self, start_entity_id, end_entity_id):
    """
    Is the forward path from start to end unique?

    Args:
        start_entity_id: id of the entity the forward path starts at.
        end_entity_id: id of the entity the forward path ends at.

    Returns:
        bool: True if ``find_forward_paths`` yields exactly one path,
        False if it yields more than one.

    Raises:
        StopIteration: if ``find_forward_paths`` yields no path at all.
    """
    paths = self.find_forward_paths(start_entity_id, end_entity_id)

    # Consume the first path; when there is none this deliberately lets
    # StopIteration propagate, as documented above.
    next(paths)

    # Use a private sentinel rather than truthiness so that a second path
    # which happens to be falsy (e.g. an empty list) still counts as a path.
    no_more_paths = object()
    return next(paths, no_more_paths) is no_more_paths

###########################################################################
# Entity creation methods ##############################################
###########################################################################
Expand Down
105 changes: 49 additions & 56 deletions featuretools/feature_base/feature_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,45 +371,47 @@ class DirectFeature(FeatureBase):
input_types = [Variable]
return_type = None

def __init__(self, base_feature, child_entity=None, relationship_path=None):
def __init__(self, base_feature, child_entity, relationship_path=None):
"""relationship_path is a forward path from child to parent."""
base_feature = _check_feature(base_feature)

self.parent_entity = base_feature.entity

relationship_path, self._path_is_unique = \
self._handle_relationship_path(child_entity, relationship_path)

super(DirectFeature, self).__init__(entity=child_entity,
base_features=[base_feature],
relationship_path=relationship_path,
primitive=PrimitiveBase)

def _handle_relationship_path(self, child_entity, relationship_path):
if relationship_path:
CJStadler marked this conversation as resolved.
Show resolved Hide resolved
first_child = relationship_path[0].child_entity
if child_entity:
assert child_entity == first_child, \
'child_entity must match the first relationship'
else:
child_entity = first_child
assert child_entity == first_child, \
'child_entity must match the first relationship'

assert self.parent_entity == relationship_path[-1].parent_entity, \
'Base feature must be defined on the entity at the end of relationship_path'

self._is_unique_path = _is_unique_forward_path(child_entity.id,
self.parent_entity.id,
child_entity.entityset)
if not relationship_path:
assert child_entity, 'child_entity or relationship_path must be provided'
path_is_unique = child_entity.entityset \
kmax12 marked this conversation as resolved.
Show resolved Hide resolved
.has_unique_forward_path(child_entity.id, self.parent_entity.id)

if not relationship_path:
relationship_path = _find_path(child_entity.id,
self.parent_entity.id,
child_entity.entityset)
self._is_unique_path = True
path_is_unique = True

super(DirectFeature, self).__init__(entity=child_entity,
base_features=[base_feature],
relationship_path=relationship_path,
primitive=PrimitiveBase)
return relationship_path, path_is_unique

@classmethod
def from_dictionary(cls, arguments, entityset, dependencies, primitives_deserializer):
base_feature = dependencies[arguments['base_feature']]
relationship_path = [Relationship.from_dictionary(r, entityset)
for r in arguments['relationship_path']]
return cls(base_feature, relationship_path=relationship_path)
child_entity = relationship_path[0].child_entity
return cls(base_feature, child_entity, relationship_path=relationship_path)

@property
def variable(self):
Expand All @@ -432,7 +434,7 @@ def variable_type(self):
return self.base_features[0].variable_type

def generate_name(self):
if self._is_unique_path:
if self._path_is_unique:
relationship_path_name = self.parent_entity.id
else:
relationship_names = [r.parent_name for r in self.relationship_path]
Expand Down Expand Up @@ -460,9 +462,8 @@ class AggregationFeature(FeatureBase):
# each time point during calculation
use_previous = None

def __init__(self, base_features, primitive,
relationship_path=None, parent_entity=None,
use_previous=None, where=None):
def __init__(self, base_features, parent_entity, primitive,
relationship_path=None, use_previous=None, where=None):
if hasattr(base_features, '__iter__'):
base_features = [_check_feature(bf) for bf in base_features]
msg = "all base features must share the same entity"
Expand All @@ -472,27 +473,8 @@ def __init__(self, base_features, primitive,

self.child_entity = base_features[0].entity

if relationship_path:
first_parent = relationship_path[0].parent_entity
if parent_entity:
assert parent_entity == first_parent, \
'parent_entity must match first relationship in path.'
else:
parent_entity = first_parent

assert self.child_entity == relationship_path[-1].child_entity, \
'Base feature must be defined on the entity at the end of relationship_path'

self._is_unique_path = _is_unique_forward_path(self.child_entity.id,
parent_entity.id,
parent_entity.entityset)
else:
assert parent_entity, "parent_entity or relationship_path must be provided."
relationship_path = _find_path(parent_entity.id,
self.child_entity.id,
parent_entity.entityset,
backward=True)
self._is_unique_path = True
relationship_path, self._path_is_unique = \
self._handle_relationship_path(parent_entity, relationship_path)

self.parent_entity = parent_entity.entityset.metadata[parent_entity.id]

Expand Down Expand Up @@ -520,11 +502,34 @@ def __init__(self, base_features, primitive,
relationship_path=relationship_path,
primitive=primitive)

def _handle_relationship_path(self, parent_entity, relationship_path):
    """Validate the given relationship path, or look one up if none is given.

    Reads ``self.child_entity``, so it must be set before this is called.

    Returns:
        tuple: ``(relationship_path, path_is_unique)``. When a path is
        supplied it is returned unchanged, and uniqueness is whatever
        ``has_unique_forward_path`` reports for child -> parent. When no
        path is supplied, the path comes from ``_find_path`` and is
        flagged as unique.
    """
    if not relationship_path:
        # No explicit path: search backward from the parent to the child.
        found_path = _find_path(parent_entity.id,
                                self.child_entity.id,
                                parent_entity.entityset,
                                backward=True)
        return found_path, True

    # The supplied path must start at parent_entity ...
    head_parent = relationship_path[0].parent_entity
    assert parent_entity == head_parent, \
        'parent_entity must match first relationship in path.'

    # ... and end at the entity the base features are defined on.
    assert self.child_entity == relationship_path[-1].child_entity, \
        'Base feature must be defined on the entity at the end of relationship_path'

    entityset = parent_entity.entityset
    unique = entityset.has_unique_forward_path(self.child_entity.id,
                                               parent_entity.id)
    return relationship_path, unique

@classmethod
def from_dictionary(cls, arguments, entityset, dependencies, primitives_deserializer):
base_features = [dependencies[name] for name in arguments['base_features']]
relationship_path = [Relationship.from_dictionary(r, entityset)
for r in arguments['relationship_path']]
parent_entity = relationship_path[0].parent_entity

primitive = primitives_deserializer.deserialize_primitive(arguments['primitive'])

use_previous_data = arguments['use_previous']
Expand All @@ -533,7 +538,7 @@ def from_dictionary(cls, arguments, entityset, dependencies, primitives_deserial
where_name = arguments['where']
where = where_name and dependencies[where_name]

return cls(base_features, primitive, relationship_path=relationship_path,
return cls(base_features, parent_entity, primitive, relationship_path=relationship_path,
use_previous=use_previous, where=where)

def copy(self):
Expand All @@ -555,7 +560,7 @@ def _use_prev_str(self):
return use_prev_str

def generate_name(self):
if self._is_unique_path:
if self._path_is_unique:
relationship_path_name = self.child_entity.id
else:
relationship_names = [r.child_name for r in self.relationship_path]
Expand Down Expand Up @@ -722,15 +727,3 @@ def _find_path(start_entity_id, end_entity_id, es, backward=False):
raise RuntimeError(message)
else:
return path


def _is_unique_forward_path(start_entity_id, end_entity_id, es):
    """
    Report whether es has exactly one forward path from start to end.

    StopIteration propagates if es.find_forward_paths yields no path.
    """
    path_iter = es.find_forward_paths(start_entity_id, end_entity_id)

    # Discard the first path; only the presence of a second one matters.
    next(path_iter)
    return not next(path_iter, None)
21 changes: 21 additions & 0 deletions featuretools/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,24 @@ def diamond_es():
return ft.EntitySet(id='ecommerce_diamond',
entities=entities,
relationships=relationships)


@pytest.fixture
def home_games_es():
    """EntitySet of teams and games related only through games.home_team_id."""
    team_frame = pd.DataFrame({'id': range(3)})
    game_frame = pd.DataFrame({
        'id': range(5),
        'home_team_id': [2, 2, 1, 0, 1],
        'away_team_id': [1, 0, 2, 1, 0],
    })
    return ft.EntitySet(
        entities={'teams': (team_frame, 'id'), 'games': (game_frame, 'id')},
        relationships=[('teams', 'id', 'games', 'home_team_id')])


@pytest.fixture
def games_es(home_games_es):
    """home_games_es with a second teams -> games relationship on away_team_id.

    Returns the result of ``add_relationship``; the tests using this fixture
    treat it as an EntitySet.
    """
    teams_id = home_games_es['teams']['id']
    away_team_id = home_games_es['games']['away_team_id']
    away_relationship = ft.Relationship(teams_id, away_team_id)
    return home_games_es.add_relationship(away_relationship)
51 changes: 49 additions & 2 deletions featuretools/tests/entityset_tests/test_es_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def test_find_forward_paths(es):
assert path[1].parent_entity.id == 'customers'


def test_find_forward_paths_multiple(diamond_es):
def test_find_forward_paths_multiple_paths(diamond_es):
paths = list(diamond_es.find_forward_paths('transactions', 'regions'))
assert len(paths) == 2

Expand All @@ -101,6 +101,27 @@ def test_find_forward_paths_multiple(diamond_es):
assert r2.parent_entity.id == 'regions'


def test_find_forward_paths_multiple_relationships(games_es):
    """Two relationships between games and teams give two one-step forward paths."""
    found = list(games_es.find_forward_paths('games', 'teams'))
    assert len(found) == 2

    for path in found:
        assert len(path) == 1
    home_rel, away_rel = found[0][0], found[1][0]
    both = (home_rel, away_rel)

    # Both relationships connect the same pair of entities ...
    for rel in both:
        assert rel.child_entity.id == 'games'
    for rel in both:
        assert rel.parent_entity.id == 'teams'

    # ... and differ only in the child variable they use.
    assert home_rel.child_variable.id == 'home_team_id'
    assert away_rel.child_variable.id == 'away_team_id'
    for rel in both:
        assert rel.parent_variable.id == 'id'


def test_find_forward_paths_ignores_loops():
employee_df = pd.DataFrame({'id': [0], 'manager_id': [0]})
entities = {'employees': (employee_df, 'id')}
Expand All @@ -125,7 +146,7 @@ def test_find_backward_paths(es):
assert path[1].parent_entity.id == 'sessions'


def test_find_backward_paths_multiple(diamond_es):
def test_find_backward_paths_multiple_paths(diamond_es):
paths = list(diamond_es.find_backward_paths('regions', 'transactions'))
assert len(paths) == 2

Expand All @@ -144,6 +165,27 @@ def test_find_backward_paths_multiple(diamond_es):
assert r2.parent_entity.id == 'customers'


def test_find_backward_paths_multiple_relationships(games_es):
    """Two relationships between teams and games give two one-step backward paths."""
    found = list(games_es.find_backward_paths('teams', 'games'))
    assert len(found) == 2

    for path in found:
        assert len(path) == 1
    home_rel, away_rel = found[0][0], found[1][0]
    both = (home_rel, away_rel)

    # Both relationships connect the same pair of entities ...
    for rel in both:
        assert rel.child_entity.id == 'games'
    for rel in both:
        assert rel.parent_entity.id == 'teams'

    # ... and differ only in the child variable they use.
    assert home_rel.child_variable.id == 'home_team_id'
    assert away_rel.child_variable.id == 'away_team_id'
    for rel in both:
        assert rel.parent_variable.id == 'id'


def test_find_path(es):
path, forward = es.find_path('products', 'customers',
include_num_forward=True)
Expand Down Expand Up @@ -177,6 +219,11 @@ def test_find_path_no_path_found(es):
es.find_path('products', 'customers')


def test_has_unique_path(diamond_es):
    """customers -> regions is unique; transactions -> regions is not."""
    single_route = diamond_es.has_unique_forward_path('customers', 'regions')
    assert single_route

    several_routes = diamond_es.has_unique_forward_path('transactions', 'regions')
    assert not several_routes


def test_raise_key_error_missing_entity(es):
error_text = "Entity this entity doesn't exist does not exist in ecommerce"
with pytest.raises(KeyError, match=error_text):
Expand Down
24 changes: 0 additions & 24 deletions featuretools/tests/entityset_tests/test_relationship.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,6 @@
import pandas as pd
import pytest

import featuretools as ft


@pytest.fixture
def home_games_es():
    """EntitySet of teams and games related only through games.home_team_id."""
    team_frame = pd.DataFrame({'id': range(3)})
    game_frame = pd.DataFrame({
        'id': range(5),
        'home_team_id': [2, 2, 1, 0, 1],
        'away_team_id': [1, 0, 2, 1, 0],
    })
    return ft.EntitySet(
        entities={'teams': (team_frame, 'id'), 'games': (game_frame, 'id')},
        relationships=[('teams', 'id', 'games', 'home_team_id')])


@pytest.fixture
def games_es(home_games_es):
    """home_games_es with a second teams -> games relationship on away_team_id.

    Returns the result of ``add_relationship``; the tests using this fixture
    treat it as an EntitySet.
    """
    teams_id = home_games_es['teams']['id']
    away_team_id = home_games_es['games']['away_team_id']
    away_relationship = ft.Relationship(teams_id, away_team_id)
    return home_games_es.add_relationship(away_relationship)


def test_names_when_multiple_relationships_between_entities(games_es):
relationship = ft.Relationship(games_es['teams']['id'],
games_es['games']['home_team_id'])
Expand Down
Loading