Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

No EntitySet required in loading/saving features #141

Merged
merged 22 commits on May 9, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/source/guides/deployment.rst
Expand Up @@ -41,11 +41,11 @@ Now, we can use :meth:`featuretools.save_features` to save a list of features.
Calculating Feature Matrix for New Data
***************************************

We can use :meth:`featuretools.load_features` to read in a list of saved features for our new entity set.
We can use :meth:`featuretools.load_features` to read in a list of saved features to calculate for our new entity set.

.. ipython:: python

saved_features = ft.load_features('feature_definitions', es_test)
saved_features = ft.load_features('feature_definitions')

.. ipython:: python
:suppress:
Expand Down
18 changes: 7 additions & 11 deletions featuretools/computational_backends/calculate_feature_matrix.py
Expand Up @@ -28,8 +28,8 @@
logger = logging.getLogger('featuretools.computational_backend')


def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,
entities=None, relationships=None, entityset=None,
def calculate_feature_matrix(features, entityset=None, cutoff_time=None, instance_ids=None,
entities=None, relationships=None,
cutoff_time_in_index=False,
training_window=None, approximate=None,
save_progress=None, verbose=False,
Expand All @@ -40,6 +40,9 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,
Args:
features (list[PrimitiveBase]): Feature definitions to be calculated.

entityset (EntitySet): An already initialized entityset. Required if `entities` and `relationships`
not provided

cutoff_time (pd.DataFrame or Datetime): Specifies at which time to calculate
the features for each instance. Can either be a DataFrame with
'instance_id' and 'time' columns, DataFrame with the name of the
Expand All @@ -58,9 +61,6 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,
between entities. List items are a tuple with the format
(parent entity id, parent variable, child entity id, child variable).

entityset (EntitySet): An already initialized entityset. Required if
entities and relationships are not defined.

cutoff_time_in_index (bool): If True, return a DataFrame with a MultiIndex
where the second index is the cutoff time (first is instance id).
DataFrame will be sorted by (time, instance_id).
Expand Down Expand Up @@ -100,12 +100,7 @@ def calculate_feature_matrix(features, cutoff_time=None, instance_ids=None,
if entities is not None and relationships is not None:
entityset = EntitySet("entityset", entities, relationships)

if entityset is not None:
for f in features:
f.entityset = entityset

entityset = features[0].entityset
target_entity = features[0].entity
target_entity = entityset[features[0].entity.id]
pass_columns = []

if not isinstance(cutoff_time, pd.DataFrame):
Expand Down Expand Up @@ -462,6 +457,7 @@ def approximate_features(features, cutoff_time, window, entityset, backend,

cutoff_time_to_pass.drop_duplicates(inplace=True)
approx_fm = calculate_feature_matrix(approx_features,
entityset,
cutoff_time=cutoff_time_to_pass,
training_window=training_window,
approximate=None,
Expand Down
66 changes: 25 additions & 41 deletions featuretools/entityset/base_entity.py
@@ -1,12 +1,11 @@
from __future__ import print_function

import logging
from builtins import map

import pandas as pd
from past.builtins import basestring

from featuretools import variable_types as vtypes
from featuretools.utils.wrangle import _dataframes_equal

logger = logging.getLogger('featuretools.entityset')

Expand Down Expand Up @@ -105,24 +104,22 @@ def shape(self):
return self.get_shape()

def __eq__(self, other, deep=False):
if not deep:
if isinstance(other, self.__class__):
return self.id == other.id
if self.index != other.index:
return False
else:
if self.index != other.index:
return False
if self.time_index != other.time_index:
return False
if self.secondary_time_index != other.secondary_time_index:
return False
if len(self.variables) != len(other.variables):
if self.time_index != other.time_index:
return False
if self.secondary_time_index != other.secondary_time_index:
return False
if len(self.variables) != len(other.variables):
return False
for v in self.variables:
if v not in other.variables:
return False
for v in self.variables:
if v not in other.variables:
return False
if deep:
if self.indexed_by is None and other.indexed_by is not None:
return False
elif self.indexed_by is not None and other.indexed_by is None:
return False
else:
for v, index_map in self.indexed_by.items():
if v not in other.indexed_by:
Expand All @@ -136,7 +133,18 @@ def __eq__(self, other, deep=False):
# checked for equality, but don't care about the order.
if not set(related) == set(other.indexed_by[v][i]):
return False
return True
if self.last_time_index is None and other.last_time_index is not None:
return False
elif self.last_time_index is not None and other.last_time_index is None:
return False
elif self.last_time_index is not None and other.last_time_index is not None:
if not self.last_time_index.equals(other.last_time_index):
return False

if not _dataframes_equal(self.df, other.df):
return False

return True

def __hash__(self):
    """Hash consistent with ``__eq__``, which compares entities by ``id``.

    The previous implementation returned ``id(self.id)`` — the CPython
    object address of the id attribute — so two entities that compare
    equal could hash differently, breaking dict/set membership.  Hash
    the identifier's value instead.
    """
    return hash(self.id)
Expand Down Expand Up @@ -177,30 +185,6 @@ def show_instance(self, instance_ids):
def get_shape():
raise NotImplementedError()

def head(self, n=10, cutoff_time=None):
    """See first n instances in entity.

    Args:
        n (int) : Number of instances to return.
        cutoff_time (optional) : A single cutoff time applied to every
            instance when computing the row features.  When None, the
            raw head of the backing entityset is returned instead.

    Returns:
        :class:`pd.DataFrame` : Pandas DataFrame
    """
    if cutoff_time is None:
        df = self.entityset.head(self.id, n=n)
    else:
        # Imported lazily — presumably to avoid a circular import at
        # module load time; TODO confirm.
        from featuretools.computational_backends.calculate_feature_matrix import calculate_feature_matrix
        from featuretools.features import Feature

        row = list(map(Feature, self.variables))
        instance_ids = self.entityset.get_top_n_instances(self.id, n)
        # BUG FIX: the original rebound the `cutoff_time` parameter to
        # this DataFrame and then did `cutoff_time['time'] = cutoff_time`,
        # discarding the caller's cutoff value.  Build the cutoff frame
        # under a fresh name so the parameter survives.
        cutoff_df = pd.DataFrame({'instance_id': instance_ids})
        cutoff_df['time'] = cutoff_time
        df = calculate_feature_matrix(row, cutoff_time=cutoff_df)
    return df

@property
def variable_types(self):
return {v.id: type(v) for v in self.variables}
Expand Down
13 changes: 6 additions & 7 deletions featuretools/entityset/base_entityset.py
Expand Up @@ -54,22 +54,21 @@ def __init__(self, id, verbose):
self.time_type = None

def __eq__(self, other, deep=False):
if not deep:
if isinstance(other, type(self)):
return self.id == other.id
return False
if len(self.entity_stores) != len(other.entity_stores):
return False
for eid, e in self.entity_stores.items():
if eid not in other.entity_stores:
return False
if not e.__eq__(other[eid], deep=True):
if not e.__eq__(other[eid], deep=deep):
return False
for r in self.relationships:
for r in other.relationships:
if r not in other.relationships:
return False
return True

def __ne__(self, other, deep=False):
    """Inverse of ``__eq__``; the ``deep`` flag is forwarded unchanged."""
    is_equal = self.__eq__(other, deep=deep)
    return not is_equal

def __getitem__(self, entity_id):
"""Get entity instance from entityset

Expand Down Expand Up @@ -183,7 +182,6 @@ def add_relationship(self, relationship):
# _operations?

# this is a new pair of entities
self.relationships.append(relationship)
child_e = relationship.child_entity
child_v = relationship.child_variable.id
parent_e = relationship.parent_entity
Expand All @@ -197,6 +195,7 @@ def add_relationship(self, relationship):
new_type=vtypes.Index,
convert_data=False)

self.relationships.append(relationship)
self.index_data(relationship)
return self

Expand Down
73 changes: 32 additions & 41 deletions featuretools/entityset/entity.py
Expand Up @@ -57,17 +57,46 @@ def __init__(self, id, df, entityset, variable_types=None, name=None,

"""
assert len(df.columns) == len(set(df.columns)), "Duplicate column names"
self.df = df
self.data = {"df": df,
"last_time_index": last_time_index,
"indexed_by": {}
}
self.encoding = encoding
self.indexed_by = {}
self._verbose = verbose
self.created_index = created_index
self.convert_variable_types(variable_types)
self.attempt_cast_index_to_int(index)
self.last_time_index = last_time_index
super(Entity, self).__init__(id, entityset, variable_types, name, index,
time_index, secondary_time_index, relationships, already_sorted)

@property
def is_metadata(self):
# Read-only delegation to the owning entityset's is_metadata flag.
return self.entityset.is_metadata

@property
def df(self):
# The backing dataframe is kept in the shared self.data dict so that
# all entity state lives in one place.
return self.data["df"]

@df.setter
def df(self, _df):
self.data["df"] = _df

@property
def last_time_index(self):
# Stored in the self.data dict alongside the dataframe.
return self.data["last_time_index"]

@last_time_index.setter
def last_time_index(self, lti):
self.data["last_time_index"] = lti

@property
def indexed_by(self):
# Mapping of index data, kept in the self.data dict (initialized to {}).
return self.data["indexed_by"]

@indexed_by.setter
def indexed_by(self, idx):
self.data["indexed_by"] = idx

def attempt_cast_index_to_int(self, index_var):
dtype_name = self.df[index_var].dtype.name
if (dtype_name.find('int') == -1 and
Expand Down Expand Up @@ -118,41 +147,6 @@ def is_index_column(self, varname):

return False

def head(self, n=10, cutoff_time=None):
    """Return the first ``n`` rows of this entity's dataframe.

    Args:
        n (int) : Number of instances to return.
        cutoff_time (pd.Timestamp,pd.DataFrame) : Timestamp(s) to restrict rows.
            A DataFrame must hold an instance-id column followed by a
            time column, in that order.

    Returns:
        :class:`pd.DataFrame` : A Pandas DataFrame.
    """
    if cutoff_time is None:
        subset = self.df
    elif isinstance(cutoff_time, (pd.Timestamp, datetime)):
        # Single cutoff: keep only rows strictly before it.
        subset = self.df[self.df[self.time_index] < cutoff_time]
    elif isinstance(cutoff_time, pd.DataFrame):
        id_col, time_col = list(cutoff_time)
        # TODO filtering the top n during "isin" would be more efficient
        wanted = self.df[self.index].isin(cutoff_time[id_col])
        subset = self.df[wanted]
        subset = subset[subset[self.time_index] < cutoff_time[time_col]]
    else:
        raise ValueError(
            'cutoff_time must be None, a Datetime, a pd.Timestamp, or a pd.DataFrame')
    return subset.head(n)

def get_column_type(self, column_id):
    """Return the dtype name of ``column_id`` in the backing dataframe."""
    column = self.df[column_id]
    return column.dtype.name
Expand Down Expand Up @@ -573,9 +567,6 @@ def set_secondary_time_index(self, secondary_time_index):

super(Entity, self).set_secondary_time_index(secondary_time_index)

def set_last_time_index(self, last_time_index):
# Simple setter: assigns through the instance's last_time_index
# attribute (a property on Entity that stores into self.data).
self.last_time_index = last_time_index

def _vals_to_series(self, instance_vals, variable_id):
"""
instance_vals may be a pd.Dataframe, a pd.Series, a list, a single
Expand Down