Fix serialization after adding interesting values (#798)
* initial fix

* add tests and deep variable equality

* fix deep equality checks

* lint and changelog

* fix changelog and use pandas series instead of extending json encoder

* add manual interesting value checks and remove try/except blocks when reading json

* bump entity schema version

* update feature serialization filename

* update S3 URLs

* break up URLs by adding TEST_FILE constant

* add additional tests to improve codecov

* reorder tests to hit interesting_values equality test
frances-h committed Nov 18, 2019
1 parent 91b8943 commit bd5973f
Showing 12 changed files with 79 additions and 26 deletions.
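Before the file-by-file diff, a hedged sketch of the round trip this commit fixes. The demo entityset and temp path are illustrative, not taken from the commit; the serialize/deserialize calls mirror the tests below.

```python
import featuretools as ft
from featuretools.entityset import deserialize

es = ft.demo.load_mock_customer(return_entityset=True)
es.add_interesting_values()

# Per the commit message, serializing after add_interesting_values previously
# failed during JSON encoding; interesting values are now stored as a pandas
# Series and round-tripped through Series.to_json / pd.read_json.
es.to_pickle("/tmp/es_with_interesting_values")
new_es = deserialize.read_entityset("/tmp/es_with_interesting_values")
assert es.__eq__(new_es, deep=True)
```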
1 change: 1 addition & 0 deletions docs/source/changelog.rst
@@ -5,6 +5,7 @@ Changelog
**Future Release**
* Enhancements
* Fixes
* Fix issue with converting to pickle or parquet after adding interesting values (:pr:`798`)
* Changes
* Remove python 2.7 support from serialize.py (:pr:`812`)
* Documentation Changes
14 changes: 10 additions & 4 deletions featuretools/entityset/deserialize.py
@@ -35,7 +35,8 @@ def description_to_variable(description, entity=None):
if entity is not None:
kwargs = {} if is_type_string else description['type']
variable = variable(description['id'], entity, **kwargs)
variable.interesting_values = description['properties']['interesting_values']
interesting_values = pd.read_json(description['properties']['interesting_values'])
variable.interesting_values = interesting_values
return variable


@@ -51,14 +52,19 @@ def description_to_entity(description, entityset, path=None):
dataframe = read_entity_data(description, path=path)
else:
dataframe = empty_dataframe(description)
variable_types = {variable['id']: description_to_variable(variable) for variable in description['variables']}
entityset.entity_from_dataframe(
variable_types = {variable['id']: (description_to_variable(variable), variable)
for variable in description['variables']}
es = entityset.entity_from_dataframe(
description['id'],
dataframe,
index=description.get('index'),
time_index=description.get('time_index'),
secondary_time_index=description['properties'].get('secondary_time_index'),
variable_types=variable_types)
variable_types={variable: variable_types[variable][0] for variable in variable_types})
for variable in es[description['id']].variables:
interesting_values = variable_types[variable.id][1]['properties']['interesting_values']
interesting_values = pd.read_json(interesting_values, typ="series")
variable.interesting_values = interesting_values


def description_to_entityset(description, **kwargs):
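Both hunks above rebuild `interesting_values` from the JSON emitted by `Variable.to_data_description` (see variable.py below). A minimal sketch of that round trip, assuming only pandas:

```python
import pandas as pd

values = pd.Series(["coke_zero", "taco_clock"])  # hypothetical interesting values
encoded = values.to_json()                       # '{"0":"coke_zero","1":"taco_clock"}'

# typ="series" restores a Series rather than a DataFrame. (Newer pandas
# versions prefer wrapping a literal JSON string in io.StringIO.)
decoded = pd.read_json(encoded, typ="series")
assert decoded.equals(values)
```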
14 changes: 9 additions & 5 deletions featuretools/entityset/entity.py
@@ -116,10 +116,14 @@ def __eq__(self, other, deep=False):
elif self.last_time_index is not None and other.last_time_index is not None:
if not self.last_time_index.equals(other.last_time_index):
return False

if not _dataframes_equal(self.df, other.df):
return False

variables = {variable: (variable, ) for variable in self.variables}
for variable in other.variables:
variables[variable] += (variable, )
for self_var, other_var in variables.values():
if not self_var.__eq__(other_var, deep=True):
return False
return True

def __sizeof__(self):
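The deep check pairs each of this entity's variables with its counterpart from `other` via the dict key: `Variable.__hash__` and shallow `__eq__` use `(id, entity_id)` (see variable.py below), so both copies of a variable collect into one tuple. A minimal standalone illustration of the trick, assuming both sides have the same variable ids:

```python
class Var:
    """Stand-in for Variable: hashes and compares on id only."""
    def __init__(self, id, payload):
        self.id, self.payload = id, payload

    def __hash__(self):
        return hash(self.id)

    def __eq__(self, other):
        return self.id == other.id

mine = [Var("a", 1), Var("b", 2)]
theirs = [Var("b", 20), Var("a", 10)]

pairs = {v: (v,) for v in mine}
for v in theirs:
    pairs[v] += (v,)  # same id -> same dict key -> tuple of (ours, theirs)

for self_var, other_var in pairs.values():
    assert self_var.id == other_var.id
```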
@@ -342,7 +346,7 @@ def add_interesting_values(self, max_values=5, verbose=False):
for variable in self.variables:
# some heuristics to find basic 'where'-able variables
if isinstance(variable, vtypes.Discrete):
variable.interesting_values = []
variable.interesting_values = pd.Series()

# TODO - consider removing this constraint
# don't add interesting values for entities in relationships
@@ -370,15 +374,15 @@ def add_interesting_values(self, max_values=5, verbose=False):
msg = "Variable {}: Marking {} as an "
msg += "interesting value"
logger.info(msg.format(variable.id, idx))
variable.interesting_values += [idx]
variable.interesting_values = variable.interesting_values.append(pd.Series([idx]))
else:
fraction = counts[idx] / total_count
if fraction > 0.05 and fraction < 0.95:
if verbose:
msg = "Variable {}: Marking {} as an "
msg += "interesting value"
logger.info(msg.format(variable.id, idx))
variable.interesting_values += [idx]
variable.interesting_values = variable.interesting_values.append(pd.Series([idx]))
# total_count -= counts[idx]
else:
break
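One portability note on the hunks above: `Series.append` was removed in pandas 2.0, so on modern pandas the accumulation would use `pd.concat` instead. A hedged equivalent of the loop body:

```python
import pandas as pd

interesting_values = pd.Series(dtype=object)
for idx in ["coke_zero", "taco_clock"]:  # stand-ins for the counted index values
    interesting_values = pd.concat(
        [interesting_values, pd.Series([idx])], ignore_index=True
    )
```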
2 changes: 1 addition & 1 deletion featuretools/entityset/serialize.py
@@ -10,7 +10,7 @@
from featuretools.utils.wrangle import _is_s3, _is_url

FORMATS = ['csv', 'pickle', 'parquet']
SCHEMA_VERSION = "1.0.0"
SCHEMA_VERSION = "2.0.0"


def entity_to_description(entity):
4 changes: 3 additions & 1 deletion featuretools/synthesis/deep_feature_synthesis.py
@@ -1,6 +1,8 @@
import logging
from collections import defaultdict

import pandas as pd

from featuretools import primitives, variable_types
from featuretools.entityset.relationship import RelationshipPath
from featuretools.feature_base import (
@@ -515,7 +517,7 @@ def _build_where_clauses(self, all_features, entity):
# Features can contain a stale EntitySet reference without
# interesting_values
variable = self.es[feat.variable.entity.id][feat.variable.id]
if variable.interesting_values is None:
if variable.interesting_values.equals(pd.Series()):
continue

for val in variable.interesting_values:
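For context, interesting values are what feeds these `where` clauses. A hedged usage sketch showing the end-to-end effect (demo data and primitive choices are illustrative):

```python
import featuretools as ft

es = ft.demo.load_mock_customer(return_entityset=True)
es.add_interesting_values(max_values=3)

# With interesting values present, DFS can emit conditional aggregations
# such as COUNT(sessions WHERE device = <interesting value>).
features = ft.dfs(
    entityset=es,
    target_entity="customers",
    agg_primitives=["count"],
    where_primitives=["count"],
    features_only=True,
)
```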
6 changes: 6 additions & 0 deletions featuretools/tests/entityset_tests/test_entity.py
@@ -6,6 +6,7 @@

import featuretools as ft
from featuretools import variable_types
from featuretools.tests.testing_utils import make_ecommerce_entityset


def test_enforces_variable_id_is_str(es):
@@ -43,11 +44,16 @@ def test_variable_ordering_matches_column_ordering(es):


def test_eq(es):
other_es = make_ecommerce_entityset()
latlong = es['log'].df['latlong'].copy()

assert es['log'].__eq__(es['log'], deep=True)
assert es['log'].__eq__(other_es['log'], deep=True)
assert (es['log'].df['latlong'] == latlong).all()

other_es['log'].add_interesting_values()
assert not es['log'].__eq__(other_es['log'], deep=True)

es['log'].id = 'customers'
es['log'].index = 'notid'
assert not es['customers'].__eq__(es['log'], deep=True)
2 changes: 1 addition & 1 deletion featuretools/tests/entityset_tests/test_es.py
@@ -903,7 +903,7 @@ def test_version(major, minor, patch, raises=True):
version = '.'.join([str(v) for v in [major, minor, patch]])
if raises:
warning_text = ('The schema version of the saved entityset'
'(%s) is no longer supported by this version'
'(%s) is no longer supported by this version '
'of featuretools. Attempting to load entityset ...'
% (version))
else:
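The one-character string fixes here and in the two files below all address the same gotcha: adjacent Python string literals concatenate with no separator, so each line needs its own trailing space. For example:

```python
warning_text = ('The schema version of the saved entityset'
                '(1.0.0) is no longer supported by this version'
                'of featuretools.')
# -> 'The schema version of the saved entityset(1.0.0) is no longer
#     supported by this versionof featuretools.'
```

(Note that the space before `'(%s)'` is still missing even after this change.)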
33 changes: 31 additions & 2 deletions featuretools/tests/entityset_tests/test_serialization.py
@@ -17,8 +17,9 @@
BUCKET_NAME = "test-bucket"
WRITE_KEY_NAME = "test-key"
TEST_S3_URL = "s3://{}/{}".format(BUCKET_NAME, WRITE_KEY_NAME)
S3_URL = "s3://featuretools-static/test_serialization_data_2.0.0.tar"
URL = 'https://featuretools-static.s3.amazonaws.com/test_serialization_data_2.0.0.tar'
TEST_FILE = "test_serialization_data_entityset_schema_2.0.0.tar"
S3_URL = "s3://featuretools-static/" + TEST_FILE
URL = "https://featuretools-static.s3.amazonaws.com/" + TEST_FILE
TEST_KEY = "test_access_key_es"


@@ -116,6 +117,20 @@ def test_to_pickle(es, tmpdir):
assert type(new_es['log'].df['latlong'][0]) == tuple


def test_to_pickle_interesting_values(es, tmpdir):
es.add_interesting_values()
es.to_pickle(str(tmpdir))
new_es = deserialize.read_entityset(str(tmpdir))
assert es.__eq__(new_es, deep=True)


def test_to_pickle_manual_interesting_values(es, tmpdir):
es['log']['product_id'].interesting_values = ["coke_zero"]
es.to_pickle(str(tmpdir))
new_es = deserialize.read_entityset(str(tmpdir))
assert es.__eq__(new_es, deep=True)


def test_to_parquet(es, tmpdir):
es.to_parquet(str(tmpdir))
new_es = deserialize.read_entityset(str(tmpdir))
@@ -124,6 +139,20 @@ def test_to_parquet(es, tmpdir):
assert type(new_es['log'].df['latlong'][0]) == tuple


def test_to_parquet_manual_interesting_values(es, tmpdir):
es['log']['product_id'].interesting_values = ["coke_zero"]
es.to_parquet(str(tmpdir))
new_es = deserialize.read_entityset(str(tmpdir))
assert es.__eq__(new_es, deep=True)


def test_to_parquet_interesting_values(es, tmpdir):
es.add_interesting_values()
es.to_parquet(str(tmpdir))
new_es = deserialize.read_entityset(str(tmpdir))
assert es.__eq__(new_es, deep=True)


def test_to_parquet_with_lti(tmpdir):
es = load_mock_customer(return_entityset=True, random_seed=0)
es.to_parquet(str(tmpdir))
@@ -16,8 +16,9 @@
BUCKET_NAME = "test-bucket"
WRITE_KEY_NAME = "test-key"
TEST_S3_URL = "s3://{}/{}".format(BUCKET_NAME, WRITE_KEY_NAME)
S3_URL = "s3://featuretools-static/test_feature_serialization_1.0.0"
URL = "https://featuretools-static.s3.amazonaws.com/test_feature_serialization_1.0.0"
TEST_FILE = "test_feature_serialization_feature_schema_3.2.0_entityset_schema_2.0.0.json"
S3_URL = "s3://featuretools-static/" + TEST_FILE
URL = "https://featuretools-static.s3.amazonaws.com/" + TEST_FILE
TEST_CONFIG = "CheckConfigPassesOn"
TEST_KEY = "test_access_key_features"

@@ -123,7 +123,7 @@ def test_version(major, minor, patch, raises=True):

if raises:
warning_text = ('The schema version of the saved features'
'(%s) is no longer supported by this version'
'(%s) is no longer supported by this version '
'of featuretools. Attempting to load features ...'
% (version))
else:
2 changes: 1 addition & 1 deletion featuretools/utils/gen_utils.py
@@ -98,7 +98,7 @@ def check_schema_version(cls, cls_type):
break

warning_text_outdated = ('The schema version of the saved %s'
'(%s) is no longer supported by this version'
'(%s) is no longer supported by this version '
'of featuretools. Attempting to load %s ...'
% (cls_type, version_string, cls_type))
# Check if saved has older major version.
20 changes: 12 additions & 8 deletions featuretools/variable_types/variable.py
@@ -28,16 +28,20 @@ def __init__(self, id, entity, name=None):
self.entity_id = entity.id
assert entity.entityset is not None, "Entity must contain reference to EntitySet"
self.entity = entity
self._interesting_values = None
self._interesting_values = pd.Series()

@property
def entityset(self):
return self.entity.entityset

def __eq__(self, other):
return isinstance(other, self.__class__) and \
def __eq__(self, other, deep=False):
shallow_eq = isinstance(other, self.__class__) and \
self.id == other.id and \
self.entity_id == other.entity_id
if not deep:
return shallow_eq
else:
return shallow_eq and set(self.interesting_values.values) == set(other.interesting_values.values)

def __hash__(self):
return hash((self.id, self.entity_id))
@@ -78,7 +82,7 @@ def interesting_values(self):

@interesting_values.setter
def interesting_values(self, interesting_values):
self._interesting_values = interesting_values
self._interesting_values = pd.Series(interesting_values)

@property
def series(self):
@@ -93,7 +97,7 @@ def to_data_description(self):
'properties': {
'name': self.name,
'entity': self.entity.id,
'interesting_values': self._interesting_values
'interesting_values': self._interesting_values.to_json()
},
}

@@ -108,7 +112,7 @@ class Discrete(Variable):

def __init__(self, id, entity, name=None):
super(Discrete, self).__init__(id, entity, name)
self._interesting_values = []
self._interesting_values = pd.Series()

@property
def interesting_values(self):
@@ -118,8 +122,8 @@ def interesting_values(self):
def interesting_values(self, values):
seen = set()
seen_add = seen.add
self._interesting_values = [v for v in values
if not (v in seen or seen_add(v))]
self._interesting_values = pd.Series([v for v in values if not
(v in seen or seen_add(v))])


class Boolean(Variable):
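Net effect of this file's changes: `interesting_values` is always a pandas Series (empty by default instead of `None`), it serializes via `to_json()`, and the `Discrete` setter de-duplicates while preserving order. A quick sketch of the setter's behavior in isolation:

```python
import pandas as pd

def discrete_interesting_values(values):
    # Mirrors the Discrete setter: order-preserving de-duplication,
    # wrapped in a Series.
    seen = set()
    seen_add = seen.add
    return pd.Series([v for v in values if not (v in seen or seen_add(v))])

print(discrete_interesting_values(["a", "b", "a", "c"]).tolist())  # ['a', 'b', 'c']
```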
