Fix serialization after adding interesting values (#798)
* initial fix

* add tests and deep variable equality

* fix deep equality checks

* lint and changelog

* fix changelog and use pandas series instead of extending json encoder

* add manual interesting value checks and remove try/except blocks when reading json

* bump entity schema version

* update feature serialization filename

* update S3 URLs

* break up URLs by adding TEST_FILE constant

* add additional tests to improve codecov

* reorder tests to hit interesting_values equality test
frances-h committed Nov 18, 2019
1 parent 91b8943 commit bd5973f
Showing 12 changed files with 79 additions and 26 deletions.
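Before the file-by-file diff, a hedged sketch of the round trip this commit fixes. The demo entityset and temp path are illustrative, not taken from the commit; the serialize/deserialize calls mirror the tests below.

```python
import featuretools as ft
from featuretools.entityset import deserialize

es = ft.demo.load_mock_customer(return_entityset=True)
es.add_interesting_values()

# Per the commit message, serializing after add_interesting_values previously
# failed during JSON encoding; interesting values are now stored as a pandas
# Series and round-tripped through Series.to_json / pd.read_json.
es.to_pickle("/tmp/es_with_interesting_values")
new_es = deserialize.read_entityset("/tmp/es_with_interesting_values")
assert es.__eq__(new_es, deep=True)
```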
1 change: 1 addition & 0 deletions docs/source/changelog.rst
@@ -5,6 +5,7 @@ Changelog
**Future Release**
* Enhancements
* Fixes
* Fix issue with converting to pickle or parquet after adding interesting values (:pr:`798`)
* Changes
* Remove python 2.7 support from serialize.py (:pr:`812`)
* Documentation Changes
14 changes: 10 additions & 4 deletions featuretools/entityset/deserialize.py
@@ -35,7 +35,8 @@ def description_to_variable(description, entity=None):
if entity is not None:
kwargs = {} if is_type_string else description['type']
variable = variable(description['id'], entity, **kwargs)
variable.interesting_values = description['properties']['interesting_values']
interesting_values = pd.read_json(description['properties']['interesting_values'])
variable.interesting_values = interesting_values
return variable


@@ -51,14 +52,19 @@ def description_to_entity(description, entityset, path=None):
dataframe = read_entity_data(description, path=path)
else:
dataframe = empty_dataframe(description)
variable_types = {variable['id']: description_to_variable(variable) for variable in description['variables']}
entityset.entity_from_dataframe(
variable_types = {variable['id']: (description_to_variable(variable), variable)
for variable in description['variables']}
es = entityset.entity_from_dataframe(
description['id'],
dataframe,
index=description.get('index'),
time_index=description.get('time_index'),
secondary_time_index=description['properties'].get('secondary_time_index'),
variable_types=variable_types)
variable_types={variable: variable_types[variable][0] for variable in variable_types})
for variable in es[description['id']].variables:
interesting_values = variable_types[variable.id][1]['properties']['interesting_values']
interesting_values = pd.read_json(interesting_values, typ="series")
variable.interesting_values = interesting_values


def description_to_entityset(description, **kwargs):
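Both hunks above rebuild `interesting_values` from the JSON emitted by `Variable.to_data_description` (see variable.py below). A minimal sketch of that round trip, assuming only pandas:

```python
import pandas as pd

values = pd.Series(["coke_zero", "taco_clock"])  # hypothetical interesting values
encoded = values.to_json()                       # '{"0":"coke_zero","1":"taco_clock"}'

# typ="series" restores a Series rather than a DataFrame. (Newer pandas
# versions prefer wrapping a literal JSON string in io.StringIO.)
decoded = pd.read_json(encoded, typ="series")
assert decoded.equals(values)
```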
14 changes: 9 additions & 5 deletions featuretools/entityset/entity.py
@@ -116,10 +116,14 @@ def __eq__(self, other, deep=False):
elif self.last_time_index is not None and other.last_time_index is not None:
if not self.last_time_index.equals(other.last_time_index):
return False

if not _dataframes_equal(self.df, other.df):
return False

variables = {variable: (variable, ) for variable in self.variables}
for variable in other.variables:
variables[variable] += (variable, )
for self_var, other_var in variables.values():
if not self_var.__eq__(other_var, deep=True):
return False
return True

def __sizeof__(self):
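The deep check pairs each of this entity's variables with its counterpart from `other` via the dict key: `Variable.__hash__` and shallow `__eq__` use `(id, entity_id)` (see variable.py below), so both copies of a variable collect into one tuple. A minimal standalone illustration of the trick, assuming both sides have the same variable ids:

```python
class Var:
    """Stand-in for Variable: hashes and compares on id only."""
    def __init__(self, id, payload):
        self.id, self.payload = id, payload

    def __hash__(self):
        return hash(self.id)

    def __eq__(self, other):
        return self.id == other.id

mine = [Var("a", 1), Var("b", 2)]
theirs = [Var("b", 20), Var("a", 10)]

pairs = {v: (v,) for v in mine}
for v in theirs:
    pairs[v] += (v,)  # same id -> same dict key -> tuple of (ours, theirs)

for self_var, other_var in pairs.values():
    assert self_var.id == other_var.id
```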
@@ -342,7 +346,7 @@ def add_interesting_values(self, max_values=5, verbose=False):
for variable in self.variables:
# some heuristics to find basic 'where'-able variables
if isinstance(variable, vtypes.Discrete):
variable.interesting_values = []
variable.interesting_values = pd.Series()

# TODO - consider removing this constraint
# don't add interesting values for entities in relationships
@@ -370,15 +374,15 @@ def add_interesting_values(self, max_values=5, verbose=False):
msg = "Variable {}: Marking {} as an "
msg += "interesting value"
logger.info(msg.format(variable.id, idx))
variable.interesting_values += [idx]
variable.interesting_values = variable.interesting_values.append(pd.Series([idx]))
else:
fraction = counts[idx] / total_count
if fraction > 0.05 and fraction < 0.95:
if verbose:
msg = "Variable {}: Marking {} as an "
msg += "interesting value"
logger.info(msg.format(variable.id, idx))
variable.interesting_values += [idx]
variable.interesting_values = variable.interesting_values.append(pd.Series([idx]))
# total_count -= counts[idx]
else:
break
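One portability note on the hunks above: `Series.append` was removed in pandas 2.0, so on modern pandas the accumulation would use `pd.concat` instead. A hedged equivalent of the loop body:

```python
import pandas as pd

interesting_values = pd.Series(dtype=object)
for idx in ["coke_zero", "taco_clock"]:  # stand-ins for the counted index values
    interesting_values = pd.concat(
        [interesting_values, pd.Series([idx])], ignore_index=True
    )
```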
2 changes: 1 addition & 1 deletion featuretools/entityset/serialize.py
@@ -10,7 +10,7 @@
from featuretools.utils.wrangle import _is_s3, _is_url

FORMATS = ['csv', 'pickle', 'parquet']
SCHEMA_VERSION = "1.0.0"
SCHEMA_VERSION = "2.0.0"


def entity_to_description(entity):
4 changes: 3 additions & 1 deletion featuretools/synthesis/deep_feature_synthesis.py
@@ -1,6 +1,8 @@
import logging
from collections import defaultdict

import pandas as pd

from featuretools import primitives, variable_types
from featuretools.entityset.relationship import RelationshipPath
from featuretools.feature_base import (
@@ -515,7 +517,7 @@ def _build_where_clauses(self, all_features, entity):
# Features can contain a stale EntitySet reference without
# interesting_values
variable = self.es[feat.variable.entity.id][feat.variable.id]
if variable.interesting_values is None:
if variable.interesting_values.equals(pd.Series()):
continue

for val in variable.interesting_values:
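For context, interesting values are what feeds these `where` clauses. A hedged usage sketch showing the end-to-end effect (demo data and primitive choices are illustrative):

```python
import featuretools as ft

es = ft.demo.load_mock_customer(return_entityset=True)
es.add_interesting_values(max_values=3)

# With interesting values present, DFS can emit conditional aggregations
# such as COUNT(sessions WHERE device = <interesting value>).
features = ft.dfs(
    entityset=es,
    target_entity="customers",
    agg_primitives=["count"],
    where_primitives=["count"],
    features_only=True,
)
```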
6 changes: 6 additions & 0 deletions featuretools/tests/entityset_tests/test_entity.py
@@ -6,6 +6,7 @@

import featuretools as ft
from featuretools import variable_types
from featuretools.tests.testing_utils import make_ecommerce_entityset


def test_enforces_variable_id_is_str(es):
@@ -43,11 +44,16 @@ def test_variable_ordering_matches_column_ordering(es):


def test_eq(es):
other_es = make_ecommerce_entityset()
latlong = es['log'].df['latlong'].copy()

assert es['log'].__eq__(es['log'], deep=True)
assert es['log'].__eq__(other_es['log'], deep=True)
assert (es['log'].df['latlong'] == latlong).all()

other_es['log'].add_interesting_values()
assert not es['log'].__eq__(other_es['log'], deep=True)

es['log'].id = 'customers'
es['log'].index = 'notid'
assert not es['customers'].__eq__(es['log'], deep=True)
2 changes: 1 addition & 1 deletion featuretools/tests/entityset_tests/test_es.py
@@ -903,7 +903,7 @@ def test_version(major, minor, patch, raises=True):
version = '.'.join([str(v) for v in [major, minor, patch]])
if raises:
warning_text = ('The schema version of the saved entityset'
'(%s) is no longer supported by this version'
'(%s) is no longer supported by this version '
'of featuretools. Attempting to load entityset ...'
% (version))
else:
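The one-character string fixes here and in the two files below all address the same gotcha: adjacent Python string literals concatenate with no separator, so each line needs its own trailing space. For example:

```python
warning_text = ('The schema version of the saved entityset'
                '(1.0.0) is no longer supported by this version'
                'of featuretools.')
# -> 'The schema version of the saved entityset(1.0.0) is no longer
#     supported by this versionof featuretools.'
```

(Note that the space before `'(%s)'` is still missing even after this change.)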
33 changes: 31 additions & 2 deletions featuretools/tests/entityset_tests/test_serialization.py
@@ -17,8 +17,9 @@
BUCKET_NAME = "test-bucket"
WRITE_KEY_NAME = "test-key"
TEST_S3_URL = "s3://{}/{}".format(BUCKET_NAME, WRITE_KEY_NAME)
S3_URL = "s3://featuretools-static/test_serialization_data_2.0.0.tar"
URL = 'https://featuretools-static.s3.amazonaws.com/test_serialization_data_2.0.0.tar'
TEST_FILE = "test_serialization_data_entityset_schema_2.0.0.tar"
S3_URL = "s3://featuretools-static/" + TEST_FILE
URL = "https://featuretools-static.s3.amazonaws.com/" + TEST_FILE
TEST_KEY = "test_access_key_es"


@@ -116,6 +117,20 @@ def test_to_pickle(es, tmpdir):
assert type(new_es['log'].df['latlong'][0]) == tuple


def test_to_pickle_interesting_values(es, tmpdir):
es.add_interesting_values()
es.to_pickle(str(tmpdir))
new_es = deserialize.read_entityset(str(tmpdir))
assert es.__eq__(new_es, deep=True)


def test_to_pickle_manual_interesting_values(es, tmpdir):
es['log']['product_id'].interesting_values = ["coke_zero"]
es.to_pickle(str(tmpdir))
new_es = deserialize.read_entityset(str(tmpdir))
assert es.__eq__(new_es, deep=True)


def test_to_parquet(es, tmpdir):
es.to_parquet(str(tmpdir))
new_es = deserialize.read_entityset(str(tmpdir))
@@ -124,6 +139,20 @@ def test_to_parquet(es, tmpdir):
assert type(new_es['log'].df['latlong'][0]) == tuple


def test_to_parquet_manual_interesting_values(es, tmpdir):
es['log']['product_id'].interesting_values = ["coke_zero"]
es.to_parquet(str(tmpdir))
new_es = deserialize.read_entityset(str(tmpdir))
assert es.__eq__(new_es, deep=True)


def test_to_parquet_interesting_values(es, tmpdir):
es.add_interesting_values()
es.to_parquet(str(tmpdir))
new_es = deserialize.read_entityset(str(tmpdir))
assert es.__eq__(new_es, deep=True)


def test_to_parquet_with_lti(tmpdir):
es = load_mock_customer(return_entityset=True, random_seed=0)
es.to_parquet(str(tmpdir))
@@ -16,8 +16,9 @@
BUCKET_NAME = "test-bucket"
WRITE_KEY_NAME = "test-key"
TEST_S3_URL = "s3://{}/{}".format(BUCKET_NAME, WRITE_KEY_NAME)
S3_URL = "s3://featuretools-static/test_feature_serialization_1.0.0"
URL = "https://featuretools-static.s3.amazonaws.com/test_feature_serialization_1.0.0"
TEST_FILE = "test_feature_serialization_feature_schema_3.2.0_entityset_schema_2.0.0.json"
S3_URL = "s3://featuretools-static/" + TEST_FILE
URL = "https://featuretools-static.s3.amazonaws.com/" + TEST_FILE
TEST_CONFIG = "CheckConfigPassesOn"
TEST_KEY = "test_access_key_features"

@@ -123,7 +123,7 @@ def test_version(major, minor, patch, raises=True):

if raises:
warning_text = ('The schema version of the saved features'
'(%s) is no longer supported by this version'
'(%s) is no longer supported by this version '
'of featuretools. Attempting to load features ...'
% (version))
else:
2 changes: 1 addition & 1 deletion featuretools/utils/gen_utils.py
@@ -98,7 +98,7 @@ def check_schema_version(cls, cls_type):
break

warning_text_outdated = ('The schema version of the saved %s'
'(%s) is no longer supported by this version'
'(%s) is no longer supported by this version '
'of featuretools. Attempting to load %s ...'
% (cls_type, version_string, cls_type))
# Check if saved has older major version.
20 changes: 12 additions & 8 deletions featuretools/variable_types/variable.py
@@ -28,16 +28,20 @@ def __init__(self, id, entity, name=None):
self.entity_id = entity.id
assert entity.entityset is not None, "Entity must contain reference to EntitySet"
self.entity = entity
self._interesting_values = None
self._interesting_values = pd.Series()

@property
def entityset(self):
return self.entity.entityset

def __eq__(self, other):
return isinstance(other, self.__class__) and \
def __eq__(self, other, deep=False):
shallow_eq = isinstance(other, self.__class__) and \
self.id == other.id and \
self.entity_id == other.entity_id
if not deep:
return shallow_eq
else:
return shallow_eq and set(self.interesting_values.values) == set(other.interesting_values.values)

def __hash__(self):
return hash((self.id, self.entity_id))
@@ -78,7 +82,7 @@ def interesting_values(self):

@interesting_values.setter
def interesting_values(self, interesting_values):
self._interesting_values = interesting_values
self._interesting_values = pd.Series(interesting_values)

@property
def series(self):
@@ -93,7 +97,7 @@ def to_data_description(self):
'properties': {
'name': self.name,
'entity': self.entity.id,
'interesting_values': self._interesting_values
'interesting_values': self._interesting_values.to_json()
},
}

@@ -108,7 +112,7 @@ class Discrete(Variable):

def __init__(self, id, entity, name=None):
super(Discrete, self).__init__(id, entity, name)
self._interesting_values = []
self._interesting_values = pd.Series()

@property
def interesting_values(self):
@@ -118,8 +122,8 @@ def interesting_values(self):
def interesting_values(self, values):
seen = set()
seen_add = seen.add
self._interesting_values = [v for v in values
if not (v in seen or seen_add(v))]
self._interesting_values = pd.Series([v for v in values if not
(v in seen or seen_add(v))])


class Boolean(Variable):
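Net effect of this file's changes: `interesting_values` is always a pandas Series (empty by default instead of `None`), it serializes via `to_json()`, and the `Discrete` setter de-duplicates while preserving order. A quick sketch of the setter's behavior in isolation:

```python
import pandas as pd

def discrete_interesting_values(values):
    # Mirrors the Discrete setter: order-preserving de-duplication,
    # wrapped in a Series.
    seen = set()
    seen_add = seen.add
    return pd.Series([v for v in values if not (v in seen or seen_add(v))])

print(discrete_interesting_values(["a", "b", "a", "c"]).tolist())  # ['a', 'b', 'c']
```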
