Feature primitives are the building blocks of Featuretools. They define individual computations that can be applied to raw datasets to create new features. Because a primitive only constrains its input and output data types, primitives can be applied across datasets and stacked to create new calculations.

In [1]:
import featuretools as ft

# Load the bundled mock-customer demo data; return_entityset=True requests it
# packaged as a single object, which is passed to ft.dfs as `entityset` below.
es = ft.demo.load_mock_customer(return_entityset=True)

In [2]:
# With features_only=True the call hands back just the feature definitions,
# i.e. the list of Feature objects displayed below.
feature_defs = ft.dfs(
    entityset=es,
    target_entity='customers',
    agg_primitives=['mean'],
    trans_primitives=['time_since_previous'],
    features_only=True,
)
feature_defs

[<Feature: zip_code>,
 <Feature: MEAN(transactions.amount)>,
 <Feature: TIME_SINCE_PREVIOUS(join_date)>,
 <Feature: MEAN(sessions.TIME_SINCE_PREVIOUS(session_start))>,
 <Feature: MEAN(sessions.MEAN(transactions.amount))>]

In [3]:
# Same target entity, now with five aggregation primitives and without
# features_only, so the call returns two values: the feature matrix and the
# feature definitions (rebinding `feature_defs` from the previous cell).
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_entity='customers',
    agg_primitives=['mean', 'max', 'min', 'std', 'skew'],
    trans_primitives=['time_since_previous'],
)

In [4]:
# Display the feature matrix computed above (indexed by customer_id).
feature_matrix

Unnamed: 0_level_0,zip_code,MEAN(transactions.amount),MAX(transactions.amount),MIN(transactions.amount),STD(transactions.amount),SKEW(transactions.amount),TIME_SINCE_PREVIOUS(join_date),MEAN(sessions.MEAN(transactions.amount)),MEAN(sessions.STD(transactions.amount)),MEAN(sessions.TIME_SINCE_PREVIOUS(session_start)),...,STD(sessions.MEAN(transactions.amount)),STD(sessions.TIME_SINCE_PREVIOUS(session_start)),STD(sessions.MIN(transactions.amount)),STD(sessions.MAX(transactions.amount)),STD(sessions.SKEW(transactions.amount)),SKEW(sessions.MEAN(transactions.amount)),SKEW(sessions.STD(transactions.amount)),SKEW(sessions.TIME_SINCE_PREVIOUS(session_start)),SKEW(sessions.MIN(transactions.amount)),SKEW(sessions.MAX(transactions.amount))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,60091,80.375443,149.02,7.55,44.09563,-0.025941,,78.705187,43.312326,1007.5,...,11.007471,157.884451,4.961414,7.928001,0.415426,0.335175,0.204548,-1.507217,-0.47041,-0.333796
4,60091,80.070459,149.95,5.73,45.068765,-0.036348,22948824.0,81.207189,44.515729,999.375,...,13.027258,308.688904,16.960575,3.514421,0.387884,1.980948,-1.065663,1.065177,2.10351,0.027256
1,60091,71.631905,139.43,5.81,40.442059,0.019698,744019.0,72.77414,39.093244,966.875,...,13.759314,171.754341,6.954507,7.322191,0.589386,-0.424949,-0.312355,-0.254557,2.440005,-0.780493
3,13244,67.06043,149.15,5.89,43.683296,0.41823,10212841.0,67.539577,42.883316,888.333333,...,11.174282,177.613813,5.424407,10.724241,0.429374,0.678544,-0.245703,0.434581,1.000771,-0.941078
2,13244,77.422366,146.81,8.73,37.705178,0.098259,21282510.0,78.415122,36.957218,725.833333,...,11.477071,194.638554,15.874374,17.221593,0.509798,0.235296,0.013087,0.162631,2.154929,-1.539467


In [5]:
# Narrow the matrix to the stacked features: each aggregation applied to the
# per-session TIME_SINCE_PREVIOUS(session_start) transform.
feature_matrix[
    [
        "MEAN(sessions.TIME_SINCE_PREVIOUS(session_start))",
        "MAX(sessions.TIME_SINCE_PREVIOUS(session_start))",
        "MIN(sessions.TIME_SINCE_PREVIOUS(session_start))",
        "STD(sessions.TIME_SINCE_PREVIOUS(session_start))",
        "SKEW(sessions.TIME_SINCE_PREVIOUS(session_start))",
    ]
]

Unnamed: 0_level_0,MEAN(sessions.TIME_SINCE_PREVIOUS(session_start)),MAX(sessions.TIME_SINCE_PREVIOUS(session_start)),MIN(sessions.TIME_SINCE_PREVIOUS(session_start)),STD(sessions.TIME_SINCE_PREVIOUS(session_start)),SKEW(sessions.TIME_SINCE_PREVIOUS(session_start))
customer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
5,1007.5,1170.0,715.0,157.884451,-1.507217
4,999.375,1625.0,650.0,308.688904,1.065177
1,966.875,1170.0,715.0,171.754341,-0.254557
3,888.333333,1170.0,650.0,177.613813,0.434581
2,725.833333,975.0,520.0,194.638554,0.162631


Aggregation primitives: These primitives take related instances as input and output a single value. They are applied across a parent-child relationship in an entity set. E.g., "count", "sum", "avg_time_between".

Transform primitives: These primitives take one or more variables from an entity as input and output a new variable for that entity. They are applied to a single entity. E.g., "hour", "time_since_previous", "absolute".

In [6]:
# Show the available primitives; the resulting table has name, type
# (aggregation or transform) and description columns.
ft.list_primitives()

Unnamed: 0,name,type,description
0,skew,aggregation,Computes the extent to which a distribution di...
1,n_most_common,aggregation,Determines the `n` most common elements.
2,time_since_first,aggregation,Calculates the time elapsed since the first da...
3,num_true,aggregation,Counts the number of `True` values.
4,all,aggregation,Calculates if all values are 'True' in a list.
...,...,...,...
73,subtract_numeric,transform,Element-wise subtraction of two lists.
74,greater_than_scalar,transform,Determines if values are greater than a given ...
75,isin,transform,Determines whether a value is present in a pro...
76,cum_mean,transform,Calculates the cumulative mean.


# Simple Custom Primitives

In [11]:
from featuretools.primitives import make_agg_primitive, make_trans_primitive

In [12]:
from featuretools.variable_types import Text, Numeric

In [13]:
def absolute(column):
    """Return the absolute value of ``column``.

    Delegates to the built-in ``abs``, so it works for anything that
    supports it (a plain number as well as a pandas Series).
    """
    magnitudes = abs(column)
    return magnitudes

# Wrap `absolute` as a transform primitive: one Numeric input, Numeric output.
Absolute = make_trans_primitive(
    function=absolute,
    input_types=[Numeric],
    return_type=Numeric,
)

In [14]:
def maximum(column):
    """Return the largest non-missing value in ``column``.

    The built-in ``max`` compares elements pairwise, and every comparison
    against NaN is False, so its result depends on where NaN sits in the
    data (``max([nan, 1, 2])`` is NaN but ``max([1, nan, 2])`` is 2).
    Delegating to ``Series.max`` skips missing values instead.

    Args:
        column: Numeric values to aggregate (a pandas Series or any
            sequence accepted by ``pd.Series``).

    Returns:
        The maximum of the non-missing values; NaN when every value is
        missing.
    """
    # Imported locally so this cell does not rely on the pandas import that
    # only appears in a later cell of the notebook.
    import pandas as pd

    return pd.Series(column).max()

# Wrap `maximum` as an aggregation primitive: one Numeric input, Numeric output.
Maximum = make_agg_primitive(
    function=maximum,
    input_types=[Numeric],
    return_type=Numeric,
)

In [15]:
def word_count(column):
    """Count the whitespace-separated words in each entry of ``column``.

    Args:
        column: Iterable of text values. Entries that are not strings
            (e.g. ``None`` or NaN standing in for missing text) are
            tolerated rather than raising ``AttributeError``.

    Returns:
        list: One entry per input value, in order: the word count for a
        string, or NaN for a non-string entry.
    """
    missing = float('nan')
    # str.split() with no separator splits on runs of whitespace and drops
    # empty strings, so "" and "   " both count as zero words.
    return [
        len(value.split()) if isinstance(value, str) else missing
        for value in column
    ]

# Wrap `word_count` as a transform primitive: one Text input, Numeric output.
WordCount = make_trans_primitive(
    function=word_count,
    input_types=[Text],
    return_type=Numeric,
)

In [16]:
from featuretools.tests.testing_utils import make_ecommerce_entityset

# NOTE: this rebinds `es`, replacing the mock-customer data loaded at the top
# of the notebook; every later cell that passes `es` uses this ecommerce one.
es = make_ecommerce_entityset()

In [17]:
# The custom WordCount primitive is passed as an object, alongside built-in
# aggregation primitives that are referred to by name.
feature_matrix, features = ft.dfs(
    entityset=es,
    target_entity='sessions',
    agg_primitives=['sum', 'mean', 'std'],
    trans_primitives=[WordCount],
)

In [15]:
# Pick out the features built from WORD_COUNT: one taken directly from the
# customers entity and three aggregated over the log entity.
feature_matrix[
    [
        "customers.WORD_COUNT(favorite_quote)",
        "STD(log.WORD_COUNT(comments))",
        "SUM(log.WORD_COUNT(comments))",
        "MEAN(log.WORD_COUNT(comments))",
    ]
]

Unnamed: 0_level_0,customers.WORD_COUNT(favorite_quote),STD(log.WORD_COUNT(comments)),SUM(log.WORD_COUNT(comments)),MEAN(log.WORD_COUNT(comments))
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,9,540.43686,2500,500
1,9,583.70255,1732,433
2,9,,246,246
3,6,883.883476,1256,628
4,6,0.0,9,3
5,12,19.79899,68,34


# Multiple Input Types

In [18]:
from featuretools.variable_types import Datetime, Timedelta, Variable

import pandas as pd

In [19]:
def mean_sunday(numeric, datetime):
    """Average the ``numeric`` values whose paired ``datetime`` is a Sunday.

    Args:
        numeric: Values to average.
        datetime: Timestamps paired with ``numeric``; must be convertible
            by ``pd.DatetimeIndex``.

    Returns:
        Mean of the values that fall on a Sunday, or NaN if none do.
    """
    # DatetimeIndex.weekday numbers the days Monday=0 ... Sunday=6.
    weekdays = pd.DatetimeIndex(datetime).weekday.values
    frame = pd.DataFrame({'numeric': numeric, 'time': weekdays})
    on_sunday = frame['time'] == 6
    return frame.loc[on_sunday, 'numeric'].mean()

# Wrap `mean_sunday` as an aggregation primitive taking two inputs, in the
# same order as the function's parameters: a Numeric and a Datetime.
MeanSunday = make_agg_primitive(
    function=mean_sunday,
    input_types=[Numeric, Datetime],
    return_type=Numeric,
)

In [20]:
# Use MeanSunday as the only aggregation primitive, with no transform
# primitives and max_depth=1.
feature_matrix, features = ft.dfs(
    entityset=es,
    target_entity='sessions',
    agg_primitives=[MeanSunday],
    trans_primitives=[],
    max_depth=1,
)

In [21]:
# Each MEAN_SUNDAY feature pairs a numeric log column with the log datetime.
feature_matrix[
    [
        "MEAN_SUNDAY(log.value, datetime)",
        "MEAN_SUNDAY(log.value_2, datetime)",
    ]
]

Unnamed: 0_level_0,"MEAN_SUNDAY(log.value, datetime)","MEAN_SUNDAY(log.value_2, datetime)"
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,,
1,,
2,,
3,2.5,1.0
4,7.0,3.0
5,,
