In [None]:
!pip install nvtabular
!pip install merlin.models

## Importing libraries

In [2]:
import os

import nvtabular as nvt
from nvtabular.ops import *
from merlin.models.utils.example_utils import workflow_fit_transform

from merlin.schema.tags import Tags

import merlin.models.tf as mm
from merlin.io.dataset import Dataset
from merlin.models.utils.dataset import unique_rows_by_features

import tensorflow as tf

  warn(f"Triton dtype mappings did not load successfully due to an error: {exc.msg}")


In [3]:
# Suppress every log record at WARNING severity or lower
# (WARNING, INFO and DEBUG) across all loggers.
import logging

logging.disable(level=logging.WARNING)

## Synthetic data generation:

In [4]:
import ast

from merlin.datasets.synthetic import generate_data

# Root folder for input data; later cells also write artefacts beneath it.
DATA_FOLDER = os.environ.get("DATA_FOLDER", "/workspace/data/")
# Environment variables are strings, so normalise to int once here.
NUM_ROWS = int(os.environ.get("NUM_ROWS", 1000000))
# literal_eval only parses Python literals ("True", "False", "0", "1", ...),
# so text from the environment is never executed as code (unlike eval()).
SYNTHETIC_DATA = bool(ast.literal_eval(os.environ.get("SYNTHETIC_DATA", "True")))

if SYNTHETIC_DATA:
    # Generate synthetic "aliccp-raw" data, split 70/30 into train and valid.
    train, valid = generate_data("aliccp-raw", NUM_ROWS, set_sizes=(0.7, 0.3))
else:
    # os.path.join avoids the doubled separator that string concatenation
    # produces when DATA_FOLDER already ends with "/".
    train = nvt.Dataset(os.path.join(DATA_FOLDER, "train", "*.parquet"))
    valid = nvt.Dataset(os.path.join(DATA_FOLDER, "valid", "*.parquet"))



In [5]:
# Materialise both splits as in-memory dataframes (train first, then valid).
train, valid = (split.to_ddf().compute() for split in (train, valid))

### Filtering the positive interactions, as they are more relevant

In [6]:
# List the column names of the materialised training frame.
train.columns

Index(['user_id', 'user_shops', 'user_profile', 'user_group', 'user_gender',
       'user_age', 'user_consumption_1', 'user_consumption_2',
       'user_is_occupied', 'user_geography', 'user_intentions', 'user_brands',
       'user_categories', 'item_id', 'item_category', 'item_shop',
       'item_brand', 'item_intention', 'user_item_categories',
       'user_item_shops', 'user_item_brands', 'user_item_intentions',
       'position', 'click', 'conversion'],
      dtype='object')

In [7]:
# Keep only the rows whose 'click' label is 1 and renumber the index from zero.
train = train[train["click"] == 1].reset_index(drop=True)
valid = valid[valid["click"] == 1].reset_index(drop=True)

In [8]:
# Wrap the filtered dataframes back into merlin Dataset objects.
train, valid = Dataset(train), Dataset(valid)



#### Storing processed data as parquet files

In [9]:
# Directory for the transformed datasets: a "processed" subfolder of DATA_FOLDER.
output_path = os.path.join(DATA_FOLDER, "processed")

#### Defining NVTabular workflow for creating tags and features for user and products

In [10]:
category_temp_directory = os.path.join(DATA_FOLDER, "categories")


def _categorify():
    """Return a fresh Categorify op that writes to the shared category directory."""
    return Categorify(out_path=category_temp_directory)


# Identifier columns: categorify, then tag as user / item ids
user_id = ["user_id"] >> _categorify() >> TagAsUserID()
item_id = ["item_id"] >> _categorify() >> TagAsItemID()

# Item-side features
item_features = ["item_category", "item_shop", "item_brand"] >> _categorify() >> TagAsItemFeatures()

# User-side features
user_features = [
    "user_shops",
    "user_profile",
    "user_group",
    "user_gender",
    "user_age",
    "user_consumption_2",
    "user_is_occupied",
    "user_geography",
    "user_intentions",
    "user_brands",
    "user_categories",
] >> _categorify() >> TagAsUserFeatures()

# 'click' only receives metadata tags marking it as a binary-classification target
targets = ["click"] >> AddMetadata(tags=[Tags.BINARY_CLASSIFICATION, "target"])

# Combine every branch, then apply Dropna to the combined output
outputs = (user_id + item_id + item_features + user_features + targets) >> Dropna()

#### Applying the above workflow to process the synthetic data

In [11]:
from merlin.datasets.ecommerce import transform_aliccp

# Apply the NVTabular graph in `outputs` to both splits and write the result under
# `output_path`; the next cell reads it back from the "train" and "valid" subfolders.
# NOTE(review): presumably the workflow statistics are fitted on `train` only —
# confirm against the transform_aliccp implementation.
transform_aliccp((train, valid), output_path, nvt_workflow=outputs)



In [13]:
# Re-open the processed parquet files of each split as Dataset objects.
train, valid = (
    Dataset(os.path.join(output_path, split, "*.parquet"))
    for split in ("train", "valid")
)



#### Creating schema for model

In [14]:
# Keep only user/item id and feature columns, and explicitly exclude 'click'.
schema = train.schema.select_by_tag(
    [Tags.ITEM_ID, Tags.USER_ID, Tags.ITEM, Tags.USER]
).without(["click"])
# Attach the reduced schema to both splits.
train.schema = valid.schema = schema

In [15]:
# Display the reduced schema now attached to both the train and valid datasets.
schema

Unnamed: 0,name,tags,dtype,is_list,is_ragged,properties.num_buckets,properties.embedding_sizes.dimension,properties.embedding_sizes.cardinality,properties.cat_path,properties.max_size,properties.freq_threshold,properties.domain.min,properties.domain.max,properties.domain.name
0,user_id,"(Tags.USER, Tags.ID, Tags.CATEGORICAL)","DType(name='int64', element_type=<ElementType....",False,False,,61.0,668.0,/workspace/data/categories/categories/unique.u...,0.0,0.0,0,667,user_id
1,item_id,"(Tags.ITEM, Tags.ID, Tags.CATEGORICAL)","DType(name='int64', element_type=<ElementType....",False,False,,60.0,648.0,/workspace/data/categories/categories/unique.i...,0.0,0.0,0,647,item_id
2,item_category,"(Tags.ITEM, Tags.CATEGORICAL)","DType(name='int64', element_type=<ElementType....",False,False,,60.0,648.0,/workspace/data/categories/categories/unique.i...,0.0,0.0,0,647,item_category
3,item_shop,"(Tags.ITEM, Tags.CATEGORICAL)","DType(name='int64', element_type=<ElementType....",False,False,,60.0,648.0,/workspace/data/categories/categories/unique.i...,0.0,0.0,0,647,item_shop
4,item_brand,"(Tags.ITEM, Tags.CATEGORICAL)","DType(name='int64', element_type=<ElementType....",False,False,,60.0,648.0,/workspace/data/categories/categories/unique.i...,0.0,0.0,0,647,item_brand
5,user_shops,"(Tags.USER, Tags.CATEGORICAL)","DType(name='int64', element_type=<ElementType....",False,False,,61.0,668.0,/workspace/data/categories/categories/unique.u...,0.0,0.0,0,667,user_shops
6,user_profile,"(Tags.USER, Tags.CATEGORICAL)","DType(name='int64', element_type=<ElementType....",False,False,,16.0,47.0,/workspace/data/categories/categories/unique.u...,0.0,0.0,0,46,user_profile
7,user_group,"(Tags.USER, Tags.CATEGORICAL)","DType(name='int64', element_type=<ElementType....",False,False,,16.0,11.0,/workspace/data/categories/categories/unique.u...,0.0,0.0,0,10,user_group
8,user_gender,"(Tags.USER, Tags.CATEGORICAL)","DType(name='int64', element_type=<ElementType....",False,False,,16.0,5.0,/workspace/data/categories/categories/unique.u...,0.0,0.0,0,4,user_gender
9,user_age,"(Tags.USER, Tags.CATEGORICAL)","DType(name='int64', element_type=<ElementType....",False,False,,16.0,8.0,/workspace/data/categories/categories/unique.u...,0.0,0.0,0,7,user_age


In [16]:
# Sanity check: 'click' was removed from `schema` above, so selecting by
# Tags.TARGET should return no columns (an empty list is the expected display).
label_names = schema.select_by_tag(Tags.TARGET).column_names
label_names

[]

### Creating the two towers — user (query) and item (candidate) — for the two-tower model. Each tower produces an embedding, and the model scores a user–item pair with the dot product of the two embeddings.

In [17]:
# Output width shared by both towers (needed for the dot product between them).
tower_dim = 64

# --- Query (user) tower -------------------------------------------------
# Schema restricted to USER-tagged columns -> input block -> MLP encoder.
user_schema = schema.select_by_tag(Tags.USER)
user_inputs = mm.InputBlockV2(user_schema)
query = mm.Encoder(
    user_inputs,
    mm.MLPBlock([128, tower_dim], no_activation_last_layer=True),
)

# --- Candidate (item) tower ---------------------------------------------
# Schema restricted to ITEM-tagged columns -> input block -> MLP encoder.
item_schema = schema.select_by_tag(Tags.ITEM)
item_inputs = mm.InputBlockV2(item_schema)
candidate = mm.Encoder(
    item_inputs,
    mm.MLPBlock([128, tower_dim], no_activation_last_layer=True),
)

#### Because the number of non-interacted items is very large, we keep only the positive interactions and introduce in-batch negative sampling to supply randomly drawn negatives during training

In [31]:
# Combine the query and candidate encoders into a two-tower retrieval model.
# NOTE(review): no negative sampler is passed explicitly, so in-batch negative
# sampling is presumably the TwoTowerModelV2 default — confirm against the docs.
model = mm.TwoTowerModelV2(query, candidate)

### Training Two tower model

In [32]:
# Compile in graph mode with a categorical cross-entropy loss and
# top-10 ranking metrics, then train for two epochs.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=[mm.RecallAt(10), mm.NDCGAt(10)],
    run_eagerly=False,
)
model.fit(train, validation_data=valid, epochs=2, batch_size=4096)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7c85baaa9d50>

In [20]:
# Open the processed splits again for the DLRM model, in 500MB partitions.
train_dl, valid_dl = (
    Dataset(os.path.join(output_path, split, "*.parquet"), part_size="500MB")
    for split in ("train", "valid")
)

# Full schema as stored with the processed data (still contains the target column).
schema_dl = train_dl.schema



In [21]:
# Name of the first column tagged as a target in the full schema;
# it is passed to the DLRM binary-classification task below.
target_column = schema_dl.select_by_tag(Tags.TARGET).column_names[0]
target_column

'click'

### Training the DLRM model

In [22]:
model = mm.DLRMModel(
    schema_dl,
    embedding_dim=64,
    bottom_block=mm.MLPBlock([128, 64]),
    top_block=mm.MLPBlock([128, 64, 32]),
    prediction_tasks=mm.BinaryClassificationTask(target_column),
)


In [23]:
model.compile(optimizer="adam", run_eagerly=False, metrics=[tf.keras.metrics.AUC()])
model.fit(train_dl, validation_data=valid_dl, batch_size=16 * 1024)



<keras.callbacks.History at 0x7c858960fb80>

### Evaluate the model accuracy

In [26]:
# Top-K evaluation
candidate_features = unique_rows_by_features(train, Tags.ITEM, Tags.ITEM_ID)
candidate_features.head()



Unnamed: 0,item_id,item_category,item_shop,item_brand
0,12,12,12,12
1,11,11,11,11
2,78,78,78,78
3,6,6,6,6
4,30,30,30,30


In [33]:
# Number of recommendations to return per user.
topk = 20
# Turn the retrieval model into a top-k encoder over `candidate_features`.
# NOTE(review): `model` is expected to be the trained two-tower retrieval model
# here — verify no cell in between rebinds `model` on a top-to-bottom run.
topk_model = model.to_top_k_encoder(candidate_features, k=topk, batch_size=128)
topk_model.compile(run_eagerly=False)



In [34]:
# Batch the validation split and map each batch through ToTarget(schema, "item_id").
eval_loader = mm.Loader(valid, batch_size=1024)
eval_loader = eval_loader.map(mm.ToTarget(schema, "item_id"))

# Evaluate the top-k model; the metrics come back as a name -> value dict.
metrics = topk_model.evaluate(eval_loader, return_dict=True)
metrics



{'loss': 0.14823666214942932,
 'recall_at_10': 0.02634178288280964,
 'mrr_at_10': 0.007390325888991356,
 'ndcg_at_10': 0.011713020503520966,
 'map_at_10': 0.007390325888991356,
 'precision_at_10': 0.0026341788470745087,
 'regularization_loss': 0.0,
 'loss_batch': 0.07162051647901535}

### Generate top-K recommendations

In [35]:
# Small unshuffled loader; take just its first batch of 8 rows.
eval_loader = mm.Loader(valid, shuffle=False, batch_size=8)
batch = next(iter(eval_loader))

User IDs in the current batch

In [36]:
# batch[0] is indexed by feature name (presumably the batch's input-feature dict);
# show the user_id values of the 8 rows in this batch.
batch[0]['user_id']

<tf.Tensor: shape=(8,), dtype=int64, numpy=array([ 8, 12, 42, 16, 26, 11,  3, 48])>

### Top 20 recommendations (product IDs) for the above batch of users

In [41]:
# Run the top-k model on the batch features and keep element [1] of its output,
# shown below as an (8, 20) int32 tensor.
# NOTE(review): presumably [1] holds the recommended item ids and [0] their
# scores — confirm against the top-k encoder's return type.
topk_model(batch[0])[1]

<tf.Tensor: shape=(8, 20), dtype=int32, numpy=
array([[167, 103, 359, 222,  24, 517, 558,  18, 201,  36, 109, 272,  39,
        414, 444, 337,   7, 294, 398, 384],
       [206, 191, 256,  31, 223, 117, 167, 275, 348, 315, 445, 460, 139,
        317, 171, 358, 510,  79,  33, 536],
       [171, 510,  13, 160, 164, 220, 468,  64, 348,  26, 217,  31,  35,
        223, 610, 113,  98, 253,  32, 266],
       [ 87,  44, 264,  13, 295, 410, 253, 383, 526,  56, 340, 565, 369,
        366, 300, 583, 215, 105, 454, 159],
       [477, 581, 197, 274, 483, 256,  62, 475, 187,  84,  43, 225, 403,
        194, 402,   3, 275, 392,  90, 364],
       [103, 284, 316,  71,  24, 331, 550, 462, 607, 265, 167, 144, 447,
        363, 161, 113, 287, 557, 407,   6],
       [285, 224, 455,  79, 402, 177, 260, 183, 552, 434, 477, 481, 121,
        116, 549,  82, 518,  16, 129, 223],
       [ 67, 165, 179, 222,   7, 234, 271, 265,  12, 181,  16, 399,  37,
        440,  94, 480, 332, 577, 216, 347]], dtype=int32)>