Improve handling time documentation page #512

Merged: 14 commits, May 3, 2019
4 changes: 2 additions & 2 deletions docs/source/api_reference.rst
@@ -185,7 +185,7 @@ Feature methods

Feature calculation
~~~~~~~~~~~~~~~~~~~~
.. currentmodule:: featuretools.computational_backends
.. currentmodule:: featuretools
.. autosummary::
:toctree: generated/

@@ -194,7 +194,7 @@ Feature calculation

Feature encoding
~~~~~~~~~~~~~~~~~
.. currentmodule:: featuretools.synthesis
.. currentmodule:: featuretools
.. autosummary::
:toctree: generated/

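Context for this change: the functions in these sections (presumably ``calculate_feature_matrix`` and ``encode_features``) are re-exported from the top-level ``featuretools`` namespace, which is what the updated ``currentmodule`` directives now point at. A minimal sketch showing that the names resolve at the top level:

    import featuretools as ft

    # Both names resolve on the top-level package, matching the updated docs.
    print(ft.calculate_feature_matrix)
    print(ft.encode_features)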
312 changes: 156 additions & 156 deletions docs/source/automated_feature_engineering/handling_time.rst

Large diffs are not rendered by default.

4 changes: 4 additions & 0 deletions docs/source/usage_tips/glossary.rst
@@ -16,6 +16,10 @@ Glossary
variable
Equivalent to a column in a relational database. Represented by the :class:`.Variable` class.


cutoff time
The last point in time data is allowed to be used when calculating a feature

entity
Equivalent to a table in relational database. Represented by the :class:`.Entity` class.

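To make the new "cutoff time" entry concrete: cutoff times are usually supplied one per instance. A minimal sketch of the two-column DataFrame format referenced elsewhere in this PR (the ids and timestamps are made up for illustration):

    import pandas as pd

    # One row per instance: only data up to and including 'time' may be used
    # when calculating that instance's features.
    cutoff_times = pd.DataFrame({
        "instance_id": [1, 2, 3],
        "time": pd.to_datetime(["2019-01-05", "2019-01-12", "2019-02-01"]),
    })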
13 changes: 8 additions & 5 deletions featuretools/computational_backends/calculate_feature_matrix.py
@@ -56,7 +56,8 @@ def calculate_feature_matrix(features, entityset=None, cutoff_time=None, instanc
not provided

cutoff_time (pd.DataFrame or Datetime): Specifies at which time to calculate
the features for each instance. Can either be a DataFrame with
the features for each instance. The resulting feature matrix will use data
up to and including the cutoff_time. Can either be a DataFrame with
'instance_id' and 'time' columns, DataFrame with the name of the
index variable in the target entity and a time column, or a single
value to calculate for all instances. If the dataframe has more than two columns, any additional
@@ -77,9 +78,10 @@ def calculate_feature_matrix(features, entityset=None, cutoff_time=None, instanc
where the second index is the cutoff time (first is instance id).
DataFrame will be sorted by (time, instance_id).

training_window (Timedelta, optional):
Window defining how much older than the cutoff time data
can be to be included when calculating the feature. If None, all older data is used.
training_window (Timedelta or str, optional):
Window defining how much time before the cutoff time data
can be used when calculating features. If ``None``, all data before cutoff time is used.
Defaults to ``None``.

approximate (Timedelta or str): Frequency to group instances with similar
cutoff times by for features with costly calculations. For example,
@@ -402,7 +404,8 @@ def approximate_features(features, cutoff_time, window, entityset, backend,
for the aggregation feature will be calculated

cutoff_time (pd.DataFrame): specifies what time to calculate
the features for each instance at. A DataFrame with
the features for each instance at. The resulting feature matrix will use data
up to and including the cutoff_time. A DataFrame with
'instance_id' and 'time' columns.

window (Timedelta or str): frequency to group instances with similar
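As a usage sketch for the ``cutoff_time`` and ``training_window`` parameters documented above (assuming the bundled mock customer demo entityset; the cutoff dates and window length are illustrative, and the string form of ``training_window`` is the one this docstring now advertises):

    import pandas as pd
    import featuretools as ft

    es = ft.demo.load_mock_customer(return_entityset=True)
    es.add_last_time_indexes()  # recommended when using a training window

    # Build the feature definitions only, then compute them at explicit cutoff times.
    features = ft.dfs(entityset=es, target_entity="customers", features_only=True)

    cutoff_times = pd.DataFrame({
        "customer_id": [1, 2, 3],
        "time": pd.to_datetime(["2014-01-02", "2014-01-03", "2014-01-04"]),
    })

    # Each row uses data up to and including its cutoff time; the training
    # window further restricts it to the trailing two days.
    fm = ft.calculate_feature_matrix(features=features,
                                     entityset=es,
                                     cutoff_time=cutoff_times,
                                     training_window="2 days")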
4 changes: 2 additions & 2 deletions featuretools/computational_backends/pandas_backend.py
@@ -60,8 +60,8 @@ def calculate_all_features(self, instance_ids, time_last,
time_last (pd.Timestamp): Last allowed time. Data from exactly this
time not allowed.

training_window (Timedelta, optional): Data older than
time_last by more than this will be ignored.
training_window (Timedelta, optional): Window defining how much time before the cutoff time data
can be used when calculating features. If None, all data before cutoff time is used.

profile (bool): Enable profiler if True.

42 changes: 32 additions & 10 deletions featuretools/demo/flight.py
@@ -90,22 +90,22 @@ def load_flight(month_filter=None,

def make_es(data):
es = ft.EntitySet('Flight Data')
labely_columns = ['arr_delay', 'dep_delay', 'carrier_delay', 'weather_delay',
'national_airspace_delay', 'security_delay',
'late_aircraft_delay', 'cancelled', 'diverted',
'taxi_in', 'taxi_out', 'air_time', 'dep_time']
arr_time_columns = ['arr_delay', 'dep_delay', 'carrier_delay', 'weather_delay',
'national_airspace_delay', 'security_delay',
'late_aircraft_delay', 'canceled', 'diverted',
'taxi_in', 'taxi_out', 'air_time', 'dep_time']

variable_types = {'flight_num': vtypes.Categorical,
'distance_group': vtypes.Ordinal,
'cancelled': vtypes.Boolean,
'canceled': vtypes.Boolean,
'diverted': vtypes.Boolean}

es.entity_from_dataframe('trip_logs',
data,
index='trip_log_id',
make_index=True,
time_index='time_index',
secondary_time_index={'arr_time': labely_columns},
time_index='date_scheduled',
secondary_time_index={'arr_time': arr_time_columns},
variable_types=variable_types)

es.normalize_entity('trip_logs', 'flights', 'flight_id',
@@ -134,7 +134,8 @@ def _clean_data(data):
'crs_elapsed_time': 'scheduled_elapsed_time',
'nas_delay': 'national_airspace_delay',
'origin_city_name': 'origin_city',
'dest_city_name': 'dest_city'})
'dest_city_name': 'dest_city',
'cancelled': 'canceled'})

# Combine strings like 0130 (1:30 AM) with dates (2017-01-01)
clean_data['scheduled_dep_time'] = clean_data['scheduled_dep_time'].apply(lambda x: str(x)) + clean_data['flight_date'].astype('str')
@@ -148,7 +149,7 @@ def _clean_data(data):
clean_data = _reconstruct_times(clean_data)

# Create a time index 6 months before scheduled_dep
clean_data.loc[:, 'time_index'] = clean_data['scheduled_dep_time'] - \
clean_data.loc[:, 'date_scheduled'] = clean_data['scheduled_dep_time'].dt.date - \
pd.Timedelta('120d')

[Reviewer comment: renamed to something more meaningful]

# A null entry for a delay means no delay
@@ -163,13 +164,34 @@ def _clean_data(data):
clean_data.loc[:, 'flight_id'] = clean_data['carrier'] + '-' + \
clean_data['flight_num'].apply(lambda x: str(x)) + ':' + clean_data['origin'] + '->' + clean_data['dest']

[Reviewer comment: updated column order to improve print out in the docs]

column_order = [
'flight_id',
'flight_num',
'date_scheduled',
'scheduled_dep_time',
'scheduled_arr_time',
'carrier',
'origin', 'origin_city', 'origin_state',
'dest', 'dest_city', 'dest_state',
'distance_group',
'dep_time',
'arr_time',
'dep_delay', 'taxi_out', 'taxi_in', 'arr_delay',
'diverted', 'scheduled_elapsed_time', 'air_time', 'distance',
'carrier_delay', 'weather_delay',
'national_airspace_delay', 'security_delay', 'late_aircraft_delay',
'canceled'
]

clean_data = clean_data[column_order]

return clean_data


def _fill_labels(clean_data):
labely_columns = ['arr_delay', 'dep_delay', 'carrier_delay', 'weather_delay',
'national_airspace_delay', 'security_delay',
'late_aircraft_delay', 'cancelled', 'diverted',
'late_aircraft_delay', 'canceled', 'diverted',
'taxi_in', 'taxi_out', 'air_time']
for col in labely_columns:
clean_data.loc[:, col] = clean_data[col].fillna(0)
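A quick way to see the effect of the demo changes (this mirrors the updated test below; the data is downloaded on first use, and the ``df`` / ``secondary_time_index`` attribute names are assumptions based on the current ``Entity`` API):

    import featuretools as ft

    # Small sample of the flight data; columns now use 'canceled' and the
    # primary time index is 'date_scheduled'.
    es = ft.demo.load_flight(nrows=1000, return_single_table=False)

    trip_logs = es["trip_logs"]
    print(trip_logs.df.columns)
    print(trip_logs.secondary_time_index)  # delay/label columns keyed to 'arr_time'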
3 changes: 2 additions & 1 deletion featuretools/entityset/entity.py
@@ -230,7 +230,8 @@ def query_by_values(self, instance_vals, variable_id=None, columns=None,
time_last (pd.TimeStamp) : Query data up to and including this
time. Only applies if entity has a time index.
training_window (Timedelta, optional):
Data older than time_last by more than this will be ignored
Window defining how much time before the cutoff time data
can be used when calculating features. If None, all data before cutoff time is used.

Returns:
pd.DataFrame : instances that match constraints with ids in order of underlying dataframe
10 changes: 6 additions & 4 deletions featuretools/synthesis/dfs.py
@@ -55,7 +55,8 @@ def dfs(entities=None,
target_entity (str): Entity id of entity on which to make predictions.

cutoff_time (pd.DataFrame or Datetime): Specifies times at which to
calculate each instance. Can either be a DataFrame with
calculate each instance. The resulting feature matrix will use data
up to and including the cutoff_time. Can either be a DataFrame with
'instance_id' and 'time' columns, a DataFrame with the name of the
index variable in the target entity and a time column, a
list of values, or a single
@@ -116,9 +117,10 @@
where the second index is the cutoff time (first is instance id).
DataFrame will be sorted by (time, instance_id).

training_window (Timedelta, optional):
Window defining how much older than the cutoff time data
can be to be included when calculating the feature. If None, all older data is used.
training_window (Timedelta or str, optional):
Window defining how much time before the cutoff time data
can be used when calculating features. If ``None`` , all data before cutoff time is used.
Defaults to ``None``.

approximate (Timedelta): Bucket size to group instances with similar
cutoff times by for features with costly calculations. For example,
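A companion sketch for ``dfs`` with the same ``cutoff_time`` / ``training_window`` semantics (again assuming the mock customer demo data; the cutoff timestamp and window length are arbitrary):

    import pandas as pd
    import featuretools as ft

    es = ft.demo.load_mock_customer(return_entityset=True)
    es.add_last_time_indexes()  # recommended when using a training window

    # One cutoff time applied to every customer; only data from the three
    # days before it is used, and the cutoff is kept in the output index.
    fm, feature_defs = ft.dfs(entityset=es,
                              target_entity="customers",
                              cutoff_time=pd.Timestamp("2014-01-04"),
                              training_window="3 days",
                              cutoff_time_in_index=True)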
2 changes: 1 addition & 1 deletion featuretools/tests/demo_tests/test_demo_data.py
@@ -25,6 +25,6 @@ def test_load_flight():
return_single_table=False, nrows=1000)

entity_names = ['airports', 'flights', 'trip_logs', 'airlines']
realvals = [(11, 3), (13, 9), (103, 22), (1, 1)]
realvals = [(11, 3), (13, 9), (103, 21), (1, 1)]
for i, name in enumerate(entity_names):
assert es[name].shape == realvals[i]