In [11]:
import featuretools as ft
import pandas as pd
from sklearn.preprocessing import Imputer

from featuretools.features import AvgTimeBetween, Mean, Sum, Count, Day

# Prepare Data

In [3]:
# Build the "retail" entity set. Everything starts from one CSV of purchase
# line items; the other entities are split off from it by normalize_entity.
es = ft.EntitySet("retail")

# Raw line items, time-indexed on InvoiceDate (parsed as a date column).
# make_id_variable=True together with id_variable="item_purchase_id"
# presumably generates that id column because the CSV has none -- TODO confirm.
es.entity_from_csv(
    "item_purchases",
    csv_path="s3://featuretools-static/uk_online_retail.csv",
    time_index="InvoiceDate",
    parse_date_cols=["InvoiceDate"],
    id_variable="item_purchase_id",
    make_id_variable=True)

# items: split off item_purchases, keyed by StockCode, carrying Description.
es.normalize_entity(
    base_entity_id="item_purchases",
    new_entity_id="items",
    id_variable="StockCode",
    additional_variables=["Description"])

# invoices: split off item_purchases, keyed by InvoiceNo, carrying the
# customer reference and the country.
es.normalize_entity(
    base_entity_id="item_purchases",
    new_entity_id="invoices",
    id_variable="InvoiceNo",
    additional_variables=["CustomerID", "Country"])

# customers: split off invoices (so this call must come after the invoices
# normalization), keyed by CustomerID, carrying Country.
es.normalize_entity(
    base_entity_id="invoices",
    new_entity_id="customers",
    id_variable="CustomerID",
    additional_variables=["Country"])

# Per-customer cutoff table with columns (instance_id, time), where time is
# the customer's first_invoices_time shifted forward by 365 days.
# NOTE(review): cutoff_times is not referenced by any later visible cell; in
# particular it is not handed to ft.dfs -- confirm whether that is intended.
cutoff_times = (
    es["customers"].df[["CustomerID", "first_invoices_time"]]
    .rename(columns={"CustomerID": "instance_id",
                     "first_invoices_time": "time"})
    .assign(time=lambda frame: frame["time"] + pd.Timedelta("365 days"))
)

# Deep Feature Synthesis with "customers" as the prediction entity: returns
# the computed feature matrix and the list of feature definitions.
# NOTE(review): no cutoff time is supplied to this call even though a
# cutoff_times frame is prepared in this cell -- confirm whether the features
# are meant to be computed over all available data.
feature_matrix, features = ft.dfs(
    entityset=es,
    prediction_entity="customers",
    agg_primitives=[AvgTimeBetween, Mean, Sum, Count],
    trans_primitives=[Day],
    max_depth=5,
    verbose=True)

calulate_feature_matrix: 100%|██████████| 2/2 [00:57<00:00, 28.79s/it]


In [9]:
# Encode the features so the matrix can be used numerically. The displayed
# list shows Country expanded into one "Country = <value>" feature per value
# (plus an "unknown" bucket). Returns the encoded feature definitions first
# and the matching encoded matrix second.
f_encoded, fm_encoded = ft.encode_features(features, feature_matrix)

# Show the encoded feature definitions as the cell output.
f_encoded

[<Feature: DAY(first_invoices_time)>,
 <Feature: Country = United Kingdom>,
 <Feature: Country = Germany>,
 <Feature: Country = France>,
 <Feature: Country = Spain>,
 <Feature: Country = Belgium>,
 <Feature: Country = Switzerland>,
 <Feature: Country = Portugal>,
 <Feature: Country = Italy>,
 <Feature: Country = Finland>,
 <Feature: Country = Norway>,
 <Feature: Country = unknown>,
 <Feature: COUNT(invoices)>,
 <Feature: AVG_TIME_BETWEEN(item_purchases)>,
 <Feature: AVG_TIME_BETWEEN(invoices)>,
 <Feature: MEAN(item_purchases.Quantity)>,
 <Feature: COUNT(item_purchases)>,
 <Feature: MEAN(item_purchases.UnitPrice)>,
 <Feature: SUM(item_purchases.UnitPrice)>,
 <Feature: SUM(item_purchases.Quantity)>,
 <Feature: MEAN(invoices.SUM(item_purchases.UnitPrice))>,
 <Feature: MEAN(invoices.SUM(item_purchases.Quantity))>,
 <Feature: MEAN(invoices.MEAN(item_purchases.UnitPrice))>,
 <Feature: MEAN(invoices.AVG_TIME_BETWEEN(item_purchases))>,
 <Feature: MEAN(invoices.MEAN(item_purchases.Quantity))>,


In [14]:
# Fill missing feature values with the per-column mean.
#
# A column with no observed value at all has no mean. With axis=0 the legacy
# Imputer silently discards such columns on transform, so the imputed array
# would have fewer columns than fm_encoded and labelling it would fail with a
# length mismatch. Drop those columns up front and label from the same frame
# so values and column names always line up. When no column is entirely
# missing this is a no-op and the result is unchanged.
fm_imputable = fm_encoded.dropna(axis=1, how="all")

imp = Imputer(missing_values='NaN', strategy='mean', axis=0)

# NOTE(review): building the frame from a bare array gives it a fresh
# 0..n-1 index, so whatever index fm_encoded carries (presumably the customer
# ids) is not kept in fm_cleaned -- confirm the downstream notebook does not
# need it.
fm_cleaned = pd.DataFrame(imp.fit_transform(fm_imputable.values),
                          columns=fm_imputable.columns)

# Export Data
Because the data will be processed by a library that requires a different Python version, the feature matrix is exported at this point. In the next step, it is imported into a separate notebook.

In [15]:
# Write the imputed feature matrix to a CSV file in the working directory
# (to_csv also writes the frame's index as the first column by default).
fm_cleaned.to_csv("./example_data.csv")