Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Workaround categorical merge #231

Merged
merged 3 commits on Aug 27, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion featuretools/computational_backends/pandas_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

import numpy as np
import pandas as pd
import pandas.api.types as pdtypes
from future import standard_library

from .base_backend import ComputationalBackend
Expand Down Expand Up @@ -454,8 +455,15 @@ def last_n(df):
observed=True, sort=False).agg(to_agg)
# rename columns to the correct feature names
to_merge.columns = [agg_rename["-".join(x)] for x in to_merge.columns.ravel()]
to_merge = to_merge[list(agg_rename.values())]

frame = pd.merge(left=frame, right=to_merge[list(agg_rename.values())],
# workaround for pandas bug where categories are in the wrong order
# see: https://github.com/pandas-dev/pandas/issues/22501
if pdtypes.is_categorical_dtype(frame.index):
categories = pdtypes.CategoricalDtype(categories=frame.index.categories)
to_merge.index = to_merge.index.astype(object).astype(categories)

frame = pd.merge(left=frame, right=to_merge,
left_index=True, right_index=True, how='left')

# Handle default values
Expand Down
45 changes: 25 additions & 20 deletions featuretools/entityset/entity.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import division, print_function

import copy
import logging
from builtins import range
from datetime import datetime
Expand Down Expand Up @@ -351,7 +350,7 @@ def query_by_values(self, instance_vals, variable_id=None, columns=None,
"training window must be an absolute Timedelta"

if instance_vals is None:
df = self.df
df = self.df.copy()

elif instance_vals.shape[0] == 0:
df = self.df.head(0)
Expand All @@ -361,19 +360,25 @@ def query_by_values(self, instance_vals, variable_id=None, columns=None,
df.dropna(subset=[self.index], inplace=True)

else:
df = self.df.merge(instance_vals.to_frame(),
how="inner", left_on=variable_id,
right_on=variable_id).set_index(self.index, drop=False)
df = self.df.merge(instance_vals.to_frame(variable_id),
how="inner", on=variable_id)
df = df.set_index(self.index, drop=False)

# ensure filtered df has same categories as original
# workaround for issue below
# github.com/pandas-dev/pandas/issues/22501#issuecomment-415982538
if pdtypes.is_categorical_dtype(self.df[variable_id]):
categories = pd.api.types.CategoricalDtype(categories=self.df[variable_id].cat.categories)
df[variable_id] = df[variable_id].astype(categories)

return self._filter_and_sort(df=df,
time_last=time_last,
training_window=training_window,
columns=columns)
df = self._handle_time(df=df,
time_last=time_last,
training_window=training_window)

if columns is not None:
df = df[columns]

return df

def infer_variable_types(self, ignore=None, link_vars=None):
"""Extracts the variables from a dataframe
Expand Down Expand Up @@ -648,16 +653,19 @@ def _vals_to_series(self, instance_vals, variable_id):
elif type(instance_vals) == pd.Series:
out_vals = instance_vals.rename(variable_id)
else:
out_vals = pd.Series(instance_vals, name=variable_id)
out_vals = pd.Series(instance_vals)

# we've had weird problem with pandas read-only errors
out_vals = copy.deepcopy(out_vals)
# no duplicates or NaN values
return pd.Series(out_vals).drop_duplicates().dropna()
out_vals = out_vals.drop_duplicates().dropna()

# want index to have no name for the merge in query_by_values
out_vals.index.name = None

def _filter_and_sort(self, df, time_last=None,
training_window=None,
columns=None):
return out_vals

def _handle_time(self, df, time_last=None,
training_window=None,
columns=None):
"""
Filter a dataframe for all instances before time_last.
If this entity does not have a time index, return the original
Expand Down Expand Up @@ -686,10 +694,7 @@ def _filter_and_sort(self, df, time_last=None,
second_time_index_columns = self.secondary_time_index[secondary_time_index]
df.loc[mask, second_time_index_columns] = np.nan

if columns is not None:
df = df[columns]

return df.copy()
return df


def col_is_datetime(col):
Expand Down
10 changes: 6 additions & 4 deletions featuretools/entityset/entityset.py
Original file line number Diff line number Diff line change
Expand Up @@ -1020,11 +1020,14 @@ def add_last_time_indexes(self, updated_entities=None):
if child_e.last_time_index is None:
continue
link_var = child_vars[entity.id][child_e.id].id

lti_df = pd.DataFrame({'last_time': child_e.last_time_index,
entity.index: child_e.df[link_var]})

# sort by time and keep only the most recent
lti_df.sort_values(['last_time', entity.index],
kind="mergesort", inplace=True)

lti_df.drop_duplicates(entity.index,
keep='last',
inplace=True)
Expand Down Expand Up @@ -1245,7 +1248,6 @@ def _add_multigenerational_link_vars(self, frames, start_entity_id,
"""

# caller can pass either a path or a start/end entity pair

assert start_entity_id is not None
if path is None:
assert end_entity_id is not None
Expand Down Expand Up @@ -1300,9 +1302,9 @@ def _add_multigenerational_link_vars(self, frames, start_entity_id,
merge_df = parent_df[list(col_map.keys())].rename(columns=col_map)

# merge the dataframe, adding the link variable to the child
frames[child_entity.id] = pd.merge(left=merge_df,
right=child_df,
on=r.child_variable.id)
frames[child_entity.id] = merge_df.merge(child_df,
left_index=True,
right_on=r.child_variable.id)

@classmethod
def _load_dummy_entity_data_and_variable_types(cls, metadata):
Expand Down