In [1]:
# Notebook configuration: fill in every placeholder before running the cells below.
# source_dir: directory whose contents ('%s/*' % source_dir) are read as CSV.
source_dir = ''
# feature_set_name: suffix appended to every path this notebook writes.
feature_set_name = ''
# features_dir: base directory for the exported datasets.
features_dir = ''
split_date = '' # date on which to split dataset into train and test data (alternatively, use Spark's randomSplit)

In [2]:
# Read every file under source_dir as CSV with a header row.
# The Spark CSV option is spelled "inferSchema"; the previous "schemaInfer" key is
# not a recognised option, so no schema inference took place.
ddf = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv('%s/*' % source_dir)
)

Example column map: maps source data column names to relay42-industry concepts.

In [5]:
# Template mapping of source column name (key) -> concept name used in this notebook (value).
# Replace each '' key with the real source column name. As written, every key is the
# same empty string, so the literal collapses to a single entry until they are filled in.
# NOTE(review): later cells treat 'item_round_trip', 'user_infant' and 'user_age_bracket'
# as categorical features, but no entry here produces those columns — confirm whether
# they need to be mapped as well.
column_map = {
    '': 'user_id',
    '': 'item_id',
    '': 'item_booking_date',
    '': 'item_departure_date',
    '': 'item_travel_class',
    '': 'item_pax',
    '': 'user_member',
    '': 'user_gender',
    '': 'num_bookings'
}

In [6]:
from pyspark.sql.functions import col

# Keep only the mapped source columns, renamed to their concept names, then
# replace nulls with 0 via fillna.
renamed_columns = [
    col(source_name).alias(concept_name)
    for source_name, concept_name in column_map.items()
]
ddf = ddf.select(renamed_columns).fillna(0)

In [7]:
from pyspark.sql.functions import datediff

# user_dbd: datediff between the departure date and the booking date of each row.
booking_to_departure = datediff(ddf['item_departure_date'], ddf['item_booking_date'])
ddf = ddf.withColumn('user_dbd', booking_to_departure)

In [8]:
# Names of the columns that feed the feature vectors.
# NOTE(review): the cast stage of the pipeline also produces 'user_dbd_int', which is
# not listed here — confirm whether it was meant to be a numerical feature.
numerical_features = ['item_pax_int']
categorical_features = [
    'item_travel_class',
    'user_gender',
    'item_round_trip',
    'user_member',
    'user_infant',
    'user_age_bracket',
]

# Per categorical feature: '<name>_idx' is the indexed column, '<name>_' the encoded one.
indexed_cat_cols = ['%s_idx' % name for name in categorical_features]
ohe_cat_cols = [name.replace('_idx', '_') for name in indexed_cat_cols]

# Split the encoded + numerical columns into user-side and item-side by name prefix.
all_feature_cols = ohe_cat_cols + numerical_features
user_features = [name for name in all_feature_cols if name.startswith('user_')]
item_features = [name for name in all_feature_cols if name.startswith('item_')]

In [9]:
from pyspark.ml.feature import StringIndexer, SQLTransformer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml import Pipeline

# Stage 0: add integer-typed copies of item_pax and user_dbd.
casts = SQLTransformer(statement="SELECT *, CAST(item_pax AS int) item_pax_int, CAST(user_dbd AS int) user_dbd_int FROM __THIS__")

# Stages 1 and 2: index user and item ids. Their positions in the pipeline matter,
# because the fitted indexers are later fetched by stage number when they are saved.
user_indexer = StringIndexer(inputCol='user_id', outputCol='user_idx', handleInvalid='keep')
item_indexer = StringIndexer(inputCol='item_id', outputCol='item_idx', handleInvalid='keep')

# One indexer per categorical feature, writing '<feature>_idx'.
# The loop variable is deliberately not called `col`: that name is the imported
# pyspark.sql.functions.col, and on Python 2 a comprehension variable leaks into the
# enclosing scope and would replace it.
feature_indexers = [
    StringIndexer(inputCol=feature, outputCol=feature + '_idx', handleInvalid='keep')
    for feature in categorical_features
]

# One-hot encode all indexed categorical columns in a single stage.
onehotencoders = OneHotEncoderEstimator(inputCols=indexed_cat_cols, outputCols=ohe_cat_cols, handleInvalid='keep', dropLast=False)

# Assemble the item-side and user-side columns into one vector column each.
item_feature_assembler = VectorAssembler(inputCols=item_features, outputCol='item_features')
user_feature_assembler = VectorAssembler(inputCols=user_features, outputCol='user_features')

features_pipeline = Pipeline(
    stages=[casts, user_indexer, item_indexer]
    + feature_indexers
    + [onehotencoders, item_feature_assembler, user_feature_assembler]
)

In [10]:
# Fit the feature pipeline on ddf, i.e. on all rows, before any train/test split is applied.
feature_model = features_pipeline.fit(ddf)

In [11]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType

def _vector_to_list(vector):
    """Return the vector's values as a plain Python list (via toArray().tolist())."""
    return vector.toArray().tolist()


# Spark UDF that turns a vector column into an array<double> column.
array_udf = udf(_vector_to_list, ArrayType(DoubleType()))

In [12]:
# Apply the fitted pipeline, then keep one row per booking with integer user/item
# indices, the two date columns, and the feature vectors converted to plain arrays.
transformed_ddf = feature_model.transform(ddf)
feature_ddf = transformed_ddf.select(
    col('user_idx').cast('int').alias('user'),
    col('item_idx').cast('int').alias('item'),
    'item_booking_date',
    'item_departure_date',
    array_udf('item_features').alias('single_item_features'),
    array_udf('user_features').alias('single_user_features')
)

In [13]:
# Persist the fitted user and item indexers so the integer indices can be mapped back
# to the original ids. Stages 1 and 2 are the user and item StringIndexer models, in
# the order they were added to the pipeline.
# Fix: the output directory variable is `features_dir`; `feature_dir` was never defined
# and raised a NameError.
user_index_path = '%s/user_index_%s' % (features_dir, feature_set_name)
item_index_path = '%s/item_index_%s' % (features_dir, feature_set_name)
feature_model.stages[1].write().overwrite().save(user_index_path)
feature_model.stages[2].write().overwrite().save(item_index_path)

In [14]:
# Sanity check: print the schema of the per-booking feature frame.
feature_ddf.printSchema()

In [15]:
# Sanity check: preview rows of the per-booking feature frame.
feature_ddf.show()

In [16]:
# Count the distinct user and item indices present in the feature frame.
unique_users = feature_ddf.select('user').distinct().count()
unique_items = feature_ddf.select('item').distinct().count()
print('unique users: %s; unique items: %s' % (unique_users, unique_items))

In [17]:
# Rows per departure date.
departures_ddf = (
    feature_ddf
    .groupBy('item_departure_date')
    .count()
    .withColumnRenamed('count', 'departures')
)

# Rows per booking date.
conversions_ddf = (
    feature_ddf
    .groupBy('item_booking_date')
    .count()
    .withColumnRenamed('count', 'conversions')
)

# Line the two series up on calendar date. No join type is given, so only dates
# present in both series are kept.
same_date = departures_ddf.item_departure_date == conversions_ddf.item_booking_date
daily_ddf = (
    departures_ddf
    .join(conversions_ddf, same_date)
    .withColumnRenamed('item_booking_date', 'date')
    .select('date', 'conversions', 'departures')
    .orderBy(col('date').asc())
)
display(daily_ddf)

In [18]:
# Time-based split: bookings before split_date go to train, bookings on or after it to test.
booking_date = feature_ddf['item_booking_date']
train_ddf = feature_ddf.where(booking_date < split_date)
test_ddf = feature_ddf.where(booking_date >= split_date)

In [19]:
# Report how many rows landed on each side of the split.
train_count = train_ddf.count()
test_count = test_ddf.count()
print('train observations: %s; evaluation observations: %s' % (train_count, test_count))

In [20]:
from pyspark.sql.functions import size

# Read the length of each feature array from the first row; the aggregation cells
# use these lengths to address the array elements by position.
item_size_row = feature_ddf.select(size('single_item_features').alias('nif')).first()
user_size_row = feature_ddf.select(size('single_user_features').alias('nuf')).first()
num_item_features = item_size_row['nif']
num_user_features = user_size_row['nuf']
print('item features: %s\nuser features: %s' % (num_item_features, num_user_features))

In [21]:
from pyspark.sql.functions import udf
from pyspark.sql.types import  StringType

def _join_values(values):
    """Return the str() of each element of `values`, joined with ', '."""
    return ', '.join(str(element) for element in values)


# Spark UDF that renders an array column as a single comma-separated string.
vec_to_string_udf = udf(_join_values, StringType())

In [22]:
from pyspark.sql.functions import col, array, sum, count

agg_test_ddf = test_ddf.groupBy('user', 'item').agg(array(*[sum(col('single_item_features')[i]) for i in range(num_item_features)]).alias('item_features_vector'), array(*[sum(col('single_user_features')[i]) for i in range(num_user_features)]).alias('user_features_vector'), count(col('item')).alias('num_bookings')).select('user', 'item', 'num_bookings', vec_to_string_udf('item_features_vector').alias('item_features'), vec_to_string_udf('user_features_vector').alias('user_features'))

In [23]:
agg_train_ddf = train_ddf.groupBy('user', 'item').agg(array(*[sum(col('single_item_features')[i]) for i in range(num_item_features)]).alias('item_features_vector'), array(*[sum(col('single_user_features')[i]) for i in range(num_user_features)]).alias('user_features_vector'), count(col('item')).alias('num_bookings')).select('user', 'item', 'num_bookings', vec_to_string_udf('item_features_vector').alias('item_features'), vec_to_string_udf('user_features_vector').alias('user_features'))

In [24]:
agg_full_ddf = feature_ddf.groupBy('user', 'item').agg(array(*[sum(col('single_item_features')[i]) for i in range(num_item_features)]).alias('item_features_vector'), array(*[sum(col('single_user_features')[i]) for i in range(num_user_features)]).alias('user_features_vector'), count(col('item')).alias('num_bookings')).select('user', 'item', 'num_bookings', vec_to_string_udf('item_features_vector').alias('item_features'), vec_to_string_udf('user_features_vector').alias('user_features'))

In [25]:
# Sanity check: preview rows of the aggregated test set.
agg_test_ddf.show()

In [26]:
# Sanity check: print the schema of the aggregated test set.
agg_test_ddf.printSchema()

In [27]:
# Carve a small development sample (weight 0.05) out of the aggregated training set,
# using a fixed seed, and write it as a tab-separated file with a header.
dev_ddf, rest_ddf = agg_train_ddf.randomSplit(weights=[0.05, 0.95], seed=411)
dev_path = '%s/dev_%s' % (features_dir, feature_set_name)
dev_ddf.coalesce(1).write.mode('overwrite').csv(dev_path, sep='\t', header='true')

In [28]:
# Write the aggregated test set as a tab-separated file with a header, replacing any previous output.
evaluation_path = '%s/evaluation_%s' % (features_dir, feature_set_name)
(
    agg_test_ddf
    .coalesce(1)
    .write
    .mode('overwrite')
    .csv(evaluation_path, sep='\t', header='true')
)

In [29]:
# Write the aggregated training set as a tab-separated file with a header, replacing any previous output.
train_path = '%s/train_%s' % (features_dir, feature_set_name)
(
    agg_train_ddf
    .coalesce(1)
    .write
    .mode('overwrite')
    .csv(train_path, sep='\t', header='true')
)

In [30]:
# Write the aggregated full dataset as a tab-separated file with a header, replacing any previous output.
full_path = '%s/full_%s' % (features_dir, feature_set_name)
(
    agg_full_ddf
    .coalesce(1)
    .write
    .mode('overwrite')
    .csv(full_path, sep='\t', header='true')
)

Historical bookings in the training data

In [32]:
from pyspark.sql.functions import concat_ws, collect_list

# One row per user: the item indices of all that user's training bookings, joined
# into a comma-separated string.
# 'item' is an integer column, while concat_ws is documented for string / array<string>
# input. Cast to string before collecting so the expression does not rely on implicit
# array casting, which older Spark releases are reported to reject — the resulting
# text is the same.
# NOTE(review): collect_list gives no ordering guarantee after a shuffle, so the order
# of items within the string is not stable across runs.
booked_items = collect_list(col('item').cast('string'))
history_ddf = (
    train_ddf
    .groupBy('user')
    .agg(concat_ws(',', booked_items).alias('historical_destinations'))
)

In [33]:
# Preview the per-user booking history without truncating long cell values.
history_ddf.show(truncate = False)

In [34]:
# Write the per-user training history as a tab-separated file with a header, replacing any previous output.
history_path = '%s/train_user_history_%s' % (features_dir, feature_set_name)
(
    history_ddf
    .coalesce(1)
    .write
    .mode('overwrite')
    .csv(history_path, sep='\t', header='true')
)