Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Workaround categorical merge #231

Merged
merged 3 commits on Aug 27, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion featuretools/computational_backends/pandas_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import numpy as np
import pandas as pd
import pandas.api.types as pdtypes
from future import standard_library

from .base_backend import ComputationalBackend
Expand Down Expand Up @@ -454,8 +455,15 @@ def last_n(df):
observed=True, sort=False).agg(to_agg)
# rename columns to the correct feature names
to_merge.columns = [agg_rename["-".join(x)] for x in to_merge.columns.ravel()]
to_merge = to_merge[list(agg_rename.values())]

frame = pd.merge(left=frame, right=to_merge[list(agg_rename.values())],
# workaround for pandas bug where categories are in the wrong order
# see: https://github.com/pandas-dev/pandas/issues/22501
if pdtypes.is_categorical_dtype(frame.index):
categories = pdtypes.CategoricalDtype(categories=frame.index.categories)
to_merge.index = to_merge.index.astype(object).astype(categories)

frame = pd.merge(left=frame, right=to_merge,
left_index=True, right_index=True, how='left')

# Handle default values
Expand Down
45 changes: 25 additions & 20 deletions featuretools/entityset/entity.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import division, print_function

import copy
import logging
from builtins import range
from datetime import datetime
Expand Down Expand Up @@ -351,7 +350,7 @@ def query_by_values(self, instance_vals, variable_id=None, columns=None,
"training window must be an absolute Timedelta"

if instance_vals is None:
df = self.df
df = self.df.copy()

elif instance_vals.shape[0] == 0:
df = self.df.head(0)
Expand All @@ -361,19 +360,25 @@ def query_by_values(self, instance_vals, variable_id=None, columns=None,
df.dropna(subset=[self.index], inplace=True)

else:
df = self.df.merge(instance_vals.to_frame(),
how="inner", left_on=variable_id,
right_on=variable_id).set_index(self.index, drop=False)
df = self.df.merge(instance_vals.to_frame(variable_id),
how="inner", on=variable_id)
df = df.set_index(self.index, drop=False)

# ensure filtered df has same categories as original
# workaround for issue below
# github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
if pdtypes.is_categorical_dtype(self.df[variable_id]):
categories = pd.api.types.CategoricalDtype(categories=self.df[variable_id].cat.categories)
df[variable_id] = df[variable_id].astype(categories)

return self._filter_and_sort(df=df,
time_last=time_last,
training_window=training_window,
columns=columns)
df = self._handle_time(df=df,
time_last=time_last,
training_window=training_window)

if columns is not None:
df = df[columns]

return df

def infer_variable_types(self, ignore=None, link_vars=None):
"""Extracts the variables from a dataframe
Expand Down Expand Up @@ -648,16 +653,19 @@ def _vals_to_series(self, instance_vals, variable_id):
elif type(instance_vals) == pd.Series:
out_vals = instance_vals.rename(variable_id)
else:
out_vals = pd.Series(instance_vals, name=variable_id)
out_vals = pd.Series(instance_vals)

# we've had weird problem with pandas read-only errors
out_vals = copy.deepcopy(out_vals)
# no duplicates or NaN values
return pd.Series(out_vals).drop_duplicates().dropna()
out_vals = out_vals.drop_duplicates().dropna()

# want index to have no name for the merge in query_by_values
out_vals.index.name = None

def _filter_and_sort(self, df, time_last=None,
training_window=None,
columns=None):
return out_vals

def _handle_time(self, df, time_last=None,
training_window=None,
columns=None):
"""
Filter a dataframe for all instances before time_last.
If this entity does not have a time index, return the original
Expand Down Expand Up @@ -686,10 +694,7 @@ def _filter_and_sort(self, df, time_last=None,
second_time_index_columns = self.secondary_time_index[secondary_time_index]
df.loc[mask, second_time_index_columns] = np.nan

if columns is not None:
df = df[columns]

return df.copy()
return df


def col_is_datetime(col):
Expand Down
10 changes: 6 additions & 4 deletions featuretools/entityset/entityset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1020,11 +1020,14 @@ def add_last_time_indexes(self, updated_entities=None):
if child_e.last_time_index is None:
continue
link_var = child_vars[entity.id][child_e.id].id

lti_df = pd.DataFrame({'last_time': child_e.last_time_index,
entity.index: child_e.df[link_var]})

# sort by time and keep only the most recent
lti_df.sort_values(['last_time', entity.index],
kind="mergesort", inplace=True)

lti_df.drop_duplicates(entity.index,
keep='last',
inplace=True)
Expand Down Expand Up @@ -1245,7 +1248,6 @@ def _add_multigenerational_link_vars(self, frames, start_entity_id,
"""

# caller can pass either a path or a start/end entity pair

assert start_entity_id is not None
if path is None:
assert end_entity_id is not None
Expand Down Expand Up @@ -1300,9 +1302,9 @@ def _add_multigenerational_link_vars(self, frames, start_entity_id,
merge_df = parent_df[list(col_map.keys())].rename(columns=col_map)

# merge the dataframe, adding the link variable to the child
frames[child_entity.id] = pd.merge(left=merge_df,
right=child_df,
on=r.child_variable.id)
frames[child_entity.id] = merge_df.merge(child_df,
left_index=True,
right_on=r.child_variable.id)

@classmethod
def _load_dummy_entity_data_and_variable_types(cls, metadata):
Expand Down