In [None]:
!pip install tensorflow_recommenders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow_recommenders
  Downloading tensorflow_recommenders-0.6.0-py3-none-any.whl (85 kB)
[K     |████████████████████████████████| 85 kB 2.7 MB/s 
Collecting tf-estimator-nightly==2.8.0.dev2021122109
  Downloading tf_estimator_nightly-2.8.0.dev2021122109-py2.py3-none-any.whl (462 kB)
[K     |████████████████████████████████| 462 kB 39.4 MB/s 
Installing collected packages: tf-estimator-nightly, tensorflow-recommenders
Successfully installed tensorflow-recommenders-0.6.0 tf-estimator-nightly-2.8.0.dev2021122109


In [None]:
# Essential imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle
from tqdm import tqdm_notebook as tqdm
from typing import Dict, Text
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
import tensorflow_datasets as tfds
import tempfile

In [None]:
# Mounting google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Loading the orders data.
# The 'Unnamed: 0' column (presumably a row index written out with the CSV) is dropped on load.
ords = pd.read_csv('drive/My Drive/Flipkart Grid/orders.csv').drop(columns = ['Unnamed: 0'])
ords.head()

FileNotFoundError: ignored

In [None]:
ords.shape

In [None]:
# The ID columns are cast to strings, which is the format our model will require
for id_column in ('Order ID', 'User', 'Item ID'):
  ords[id_column] = ords[id_column].astype(str)

In [None]:
print(ords.dtypes)

In [None]:
# Loading the products table
products = pd.read_csv('drive/My Drive/Flipkart Grid/products.csv').drop(columns = ['Unnamed: 0'])
products.head()

In [None]:
# Converting the Item Id to string format here as well
products['Item ID'] = products['Item ID'].astype(str)
print(products.dtypes)

In [None]:
# Turning the orders table into a TF dataset that can be passed to our model.
# Each element is a tuple built, in order, from the 'Item ID', 'User' and
# 'Day of Month' columns.
orders = tf.data.Dataset.from_tensor_slices(
    tuple(
        tf.cast(ords[column].values, dtype)
        for column, dtype in (
            ('Item ID', tf.string),
            ('User', tf.string),
            ('Day of Month', tf.int16),
        )
    )
)

In [None]:
# Turning the products table into a TF dataset of item IDs for our model
product_id_tensor = tf.cast(products['Item ID'].values, tf.string)
prods = tf.data.Dataset.from_tensor_slices(product_id_tensor)

In [None]:
type(orders)

In [None]:
# Shuffling once with a fixed seed (the order is not reshuffled between
# iterations), then taking the first 15000 elements as the train set and the
# following 3000 as the test set
tf.random.set_seed(42)
shuffled = orders.shuffle(buffer_size=18000, seed=42, reshuffle_each_iteration=False)

train, test = shuffled.take(count=15000), shuffled.skip(count=15000).take(count=3000)

In [None]:
print(len(train))
print(len(test))

In [None]:
# Building the vocabularies: the distinct product IDs and the distinct user IDs
product_names = prods.batch(300)
# Each order element is a 3-tuple; only its second component (the user) is kept
user_ids = orders.batch(18000).map(lambda item_id, user_id, day: user_id)

unique_products = np.unique(np.concatenate([batch for batch in product_names]))
unique_user_ids = np.unique(np.concatenate([batch for batch in user_ids]))

for vocabulary in (unique_products, unique_user_ids):
  print(vocabulary[:10])

In [None]:
# Defining the embedding dimensions
embedding_dimension = 64

In [None]:
# Creating the Query tower
# The StringLookup layer builds a vocabulary of user IDs
# (a provision to handle new IDs is also present)
user_id_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=unique_user_ids, mask_token=None)
# The embedding layer learns a unique representation for each user;
# one additional embedding row is added to account for unknown tokens.
user_embedding = tf.keras.layers.Embedding(
    input_dim=len(unique_user_ids) + 1, output_dim=embedding_dimension)
user_model = tf.keras.Sequential([user_id_lookup, user_embedding])

In [None]:
# Creating the Candidate tower
# The StringLookup layer builds a vocabulary of item IDs
# (a provision to handle new IDs is also present)
product_id_lookup = tf.keras.layers.experimental.preprocessing.StringLookup(
    vocabulary=unique_products, mask_token=None)
# The embedding layer learns a unique representation for each item
product_embedding = tf.keras.layers.Embedding(
    input_dim=len(unique_products) + 1, output_dim=embedding_dimension)
product_model = tf.keras.Sequential([product_id_lookup, product_embedding])

In [None]:
# Defining the metric
metrics = tfrs.metrics.FactorizedTopK(
  candidates=prods.batch(128).map(product_model)
)

In [None]:
# Defining the loss
task = tfrs.tasks.Retrieval(
  metrics=metrics
)

In [None]:
# Creating a model that combines both the user and product models
class productlensModel(tfrs.Model):
  """Two-tower retrieval model combining the user and product towers.

  Args:
    user_model: model applied to the user component of a batch.
    product_model: model applied to the product component of a batch.
    retrieval_task: optional layer that turns (user embeddings, product
      embeddings) into a loss. When omitted, the notebook-level `task`
      object is used, which is what the original two-argument constructor did.
  """

  def __init__(self, user_model, product_model, retrieval_task=None):
    super().__init__()
    self.product_model: tf.keras.Model = product_model
    self.user_model: tf.keras.Model = user_model
    # The global `task` is only looked up when no task is passed explicitly,
    # so the class no longer depends on hidden notebook state in that case.
    self.task: tf.keras.layers.Layer = task if retrieval_task is None else retrieval_task

  def compute_loss(self, features: tuple, training=False) -> tf.Tensor:
    """Compute the retrieval loss for one batch.

    `features` is indexed positionally: element 0 is fed to the product model
    and element 1 to the user model. `training` is accepted but not used here.
    """
    # We pick out the user features and pass them into the user model.
    user_embeddings = self.user_model(features[1])
    # And pick out the product features and pass them into the product model,
    # getting embeddings back.
    positive_product_embeddings = self.product_model(features[0])

    # The task computes the loss and the metrics.
    return self.task(user_embeddings, positive_product_embeddings)

In [None]:
# Defining the final model that implements a custom training loop
# This is an alternative to the one defined above
# It has been provided to better explain what the training looks like
# However, we have used the previous model finally
class NoBaseClassproductlensModel(tf.keras.Model):
  """Same two-tower model, with the train/test steps written out by hand.

  Args:
    user_model: model applied to the user component of a batch.
    product_model: model applied to the product component of a batch.
    retrieval_task: optional layer that turns (user embeddings, product
      embeddings) into a loss. When omitted, the notebook-level `task`
      object is used, which is what the original two-argument constructor did.
  """

  def __init__(self, user_model, product_model, retrieval_task=None):
    super().__init__()
    self.product_model: tf.keras.Model = product_model
    self.user_model: tf.keras.Model = user_model
    # The global `task` is only looked up when no task is passed explicitly.
    self.task: tf.keras.layers.Layer = task if retrieval_task is None else retrieval_task

  def _retrieval_losses(self, features: tuple):
    """Return (loss, regularization_loss, total_loss) for one batch.

    `features` is indexed positionally: element 0 goes to the product model
    and element 1 to the user model.
    """
    user_embeddings = self.user_model(features[1])
    positive_product_embeddings = self.product_model(features[0])
    loss = self.task(user_embeddings, positive_product_embeddings)

    # Handle regularization losses as well.
    regularization_loss = sum(self.losses)

    return loss, regularization_loss, loss + regularization_loss

  def _step_report(self, loss, regularization_loss, total_loss) -> Dict[Text, tf.Tensor]:
    """Return the current metric results plus the three loss values, keyed by name."""
    metrics = {metric.name: metric.result() for metric in self.metrics}
    metrics["loss"] = loss
    metrics["regularization_loss"] = regularization_loss
    metrics["total_loss"] = total_loss
    return metrics

  def train_step(self, features: tuple) -> Dict[Text, tf.Tensor]:
    """Run one optimization step on a batch and return the logged values."""
    # The losses must be computed inside the tape so gradients are recorded.
    with tf.GradientTape() as tape:
      loss, regularization_loss, total_loss = self._retrieval_losses(features)

    gradients = tape.gradient(total_loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))

    return self._step_report(loss, regularization_loss, total_loss)

  def test_step(self, features: tuple) -> Dict[Text, tf.Tensor]:
    """Evaluate one batch (no gradient update) and return the logged values."""
    loss, regularization_loss, total_loss = self._retrieval_losses(features)
    return self._step_report(loss, regularization_loss, total_loss)

In [None]:
tf.keras.backend.clear_session()

In [None]:
# Defining the following callbacks for training
# ModelCheckpoint - Will save the best model seen up till any epoch
# ReduceLROnPlateau - Will reduce the learning rate by a factor of 10 if the loss hasn't reduced for 4 epochs
# EarlyStppoing - Stops training if the loss doesn't improve for 6 epochs
# Tensorboard - Stores the training logs to be plotted later
!rm -rf ./logs/
from datetime import datetime
logdir="logs/fit/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=logdir, histogram_freq=0)
callbacks = [
    tf.keras.callbacks.ModelCheckpoint('./best_model', save_weights_only=True, save_best_only=True, mode='min', monitor='total_loss', save_format = 'tf'),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='total_loss', min_lr=0.000001,patience=4),
    tf.keras.callbacks.EarlyStopping(monitor='total_loss', patience=6, mode='min', baseline=None, restore_best_weights=False),
    tensorboard_callback
]

In [None]:
# Compiling the model
model = productlensModel(user_model, product_model)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

In [None]:
# Creating data generators for the train and test data
cached_train = train.shuffle(15000).batch(64).cache()
cached_test = test.batch(64).cache()

In [None]:
# Fitting the model
model.fit(cached_train, epochs=30, callbacks = callbacks)

In [None]:
# Plotting the tensorboard
%load_ext tensorboard
%tensorboard --logdir logs/fit

All the metrics are very promising. Let us check the performance on the test set.

In [None]:
model.evaluate(cached_test, return_dict=True)

The test metrics are understandably much lower, but this doesn't mean the performance is bad. We can verify this from our observations below.

In [None]:
# Creating a brute-force retrieval layer on top of the trained user tower;
# it takes raw query features (a user ID) as input
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
# Indexing the whole products dataset: batches of product IDs are zipped with
# the embeddings the product tower produces for them
product_id_batches = prods.batch(100)
product_embedding_batches = prods.batch(100).map(model.product_model)
index.index_from_dataset(
  tf.data.Dataset.zip((product_id_batches, product_embedding_batches))
)
user_number = "777"
# Getting recommendations (the scores are discarded, only the IDs are kept)
_, titles = index(tf.constant([user_number]))
print(f"Recommendations for user {user_number}: {titles[0, :10]}")

In [None]:
# Printing the product type and name for each of the model's predictions
# (positions 3 and 4 of a products row, per the comment above)
rec_types, rec_names = [], []
for rec in titles[0]:
  # Look the recommended item up once and reuse the row for all three uses
  rec_row = products[products['Item ID'] == np.array(rec).astype(str)].values[0]
  print(rec_row[3:5])
  rec_types.append(rec_row[3])
  rec_names.append(rec_row[4])

In [None]:
# Printing the types and names of all ordered products for the concerned user
# (positions 3 and 4 of a products row, per the comment above)
ord_types, ord_names = [], []
# Each orders row is unpacked as four values; the third one is the item ID
for _, _, item, _ in ords[ords['User'] == user_number].values:
  # Look the ordered item up once and reuse the row for all three uses
  ordered_row = products[products['Item ID'] == item].values[0]
  print(ordered_row[3:5])
  ord_types.append(ordered_row[3])
  ord_names.append(ordered_row[4])

Excellent! All products that have been ordered by this user are being recommended!

In [None]:
# Getting Top 3 Recommendations
# Slicing (rather than indexing 0..2) keeps the cell from raising an
# IndexError if fewer than three recommendations are available.
for rec_name in rec_names[:3]:
  print("ID: ", products[products.Name == rec_name].values[0][0], " | Name: ", rec_name)

In [None]:
# Most frequently ordered products
user_data = ords[ords.User == user_number]
# value_counts() is computed once; it is sorted by count in descending order
purchase_counts = user_data['Item ID'].value_counts()
frequent = purchase_counts.index.tolist()[:2]
values = list(purchase_counts)
# zip() stops at the shorter list, so a user with a single distinct item
# no longer triggers an IndexError
for item_id, n_purchases in zip(frequent, values):
  print("ID: ", item_id, " | Name: ", products[products['Item ID'] == item_id].values[0][4], " | No. of purchases: ", n_purchases)

In [None]:
# Bigger/Discounted/Sponsored products
# For each of the user's three most frequently ordered product types, suggest a
# discounted product, a "bigger" product and, when one exists, a sponsored one.
# Rows are read positionally, following the labels printed below:
# [0] ID, [4] name, [-3] discount, [-2] quantity.
user_data = ords[ords.User == user_number]
user_data = user_data.merge(products, on = 'Item ID')
freq_types = user_data['Type'].value_counts().index.tolist()[:3]
discounted = []
bigger = []
sponsored = []
for ptype in freq_types:
  print('Category = ', ptype)
  subset = products[products['Type'] == ptype]
  # NOTE(review): value_counts().index[0] is the MOST COMMON Discount/Quantity
  # value in the category, not the largest one -- confirm this is intended.
  disc = subset[subset['Discount'] == subset['Discount'].value_counts().index.tolist()[0]].values[0]
  biggest = subset[subset['Quantity'] == subset['Quantity'].value_counts().index.tolist()[0]].values[0]
  spons = subset[subset['Sponsored'] == True].values
  print("Discounted product -> ", end = " ")
  if disc[-3]:
    print("ID: ", disc[0], " | Name: ", disc[4], " | Discount: ", disc[-3], "%")
    discounted.append(disc[0])
  else:
    print('No discount for type ', ptype)
  print("Largest Product in Category -> ", end = " ")
  # Report the quantity of the product being shown (previously the discounted
  # product's quantity was printed here)
  print("ID: ", biggest[0], " | Name: ", biggest[4], " | Quantity: ", biggest[-2])
  # Store the ID only, consistently with `discounted`
  bigger.append(biggest[0])
  if len(spons) > 0:
    print("Sponsored product -> ", end = " ")
    # Report the sponsored product's own discount (previously the discounted
    # product's value was printed here)
    print("ID: ", spons[0][0], " | Name: ", spons[0][4], " | Discount: ", spons[0][-3], "%")
    sponsored.append(spons[0][0])
  print()

In [None]:
# Exporting the query model.
# The SavedModel is written directly to Google Drive, so no temporary
# directory is involved.
path = 'drive/My Drive/Flipkart Grid/TFRS_model'

# Saving the index.
tf.saved_model.save(index, path)

# Loading it back; can also be done in TensorFlow Serving.
loaded = tf.saved_model.load(path)

# Passing a user id in, getting the scores and the top predicted product IDs back.
scores, titles = loaded([user_number])

print(f"Recommendations: {titles[0][:10]}")

In [None]:
# Querying the index with a user ID and reporting whether that ID appears in the orders table
new_user_number = "1050"
is_known_user = len(ords[ords['User'] == new_user_number]) > 0
print("The user {} is in our records: {}".format(new_user_number, is_known_user))
_, titles = index(tf.constant([new_user_number]))
print(f"Recommendations for new user: {titles[0, :10]}")

In [None]:
# Printing positions 3 and 4 of the products row for every item recommended to the new user
for rec in titles[0]:
  rec_id = np.array(rec).astype(str)
  matching_rows = products[products['Item ID'] == rec_id]
  print(matching_rows.values[0][3:5])

In [None]:
# Getting the most purchased products by all users
ords[['Order ID', 'Item ID']].groupby(by = 'Item ID').count().sort_values(by = 'Order ID', ascending = False)

In [None]:
# Printing the top 10 most frequently purchased products along with their frequencies
pop = (
    ords[['Order ID', 'Item ID']]
    .groupby(by = 'Item ID')
    .count()
    .sort_values(by = 'Order ID', ascending = False)
)
popular = pop.index.tolist()[:10]
values = pop.values[:10]
# `popular` and `values` come from the same sorted frame, so they line up one-to-one
for item, counts in zip(popular, values):
  print("ID: ", item, " | Name: ", products[products['Item ID'] == item].values[0][4], " | No. of purchases: ", counts[0])

In [None]:
# Prediction function
def SmartBagRecommendations(user_number, loaded_index=None,
                            model_path='drive/My Drive/Flipkart Grid/TFRS_model'):
  """Return labeled product recommendations for a user.

  Args:
    user_number: user ID as a string (the 'User' column of `ords` holds strings).
    loaded_index: optional, already loaded retrieval model. When omitted the
      SavedModel at `model_path` is loaded on every call, as before; pass a
      preloaded model to avoid that repeated disk read.
    model_path: location of the exported SavedModel.

  Returns:
    For a user present in `ords`, a dict with keys 'top' (model
    recommendations), 'most_bought' (up to 3 most frequently ordered item
    IDs), 'discounted', 'bigger' and 'sponsored' (one candidate per frequent
    product type, when available).
    For a user absent from `ords`, a dict with keys 'model_based' (model
    recommendations) and 'data_based' (up to 10 most ordered item IDs overall).
  """
  if loaded_index is None:
    # Load it back; can also be done in TensorFlow Serving.
    loaded_index = tf.saved_model.load(model_path)

  # Pass a user id in, get the top predicted product IDs back (scores unused).
  _, titles = loaded_index([user_number])

  # Map every recommended item to the first field of its products row.
  rec_ids = [
      products[products['Item ID'] == np.array(rec).astype(str)].values[0][0]
      for rec in titles[0]
  ]

  user_data = ords[ords['User'] == user_number]

  if len(user_data) == 0:
    # Unknown user: fall back to the globally most ordered items.
    pop = ords[['Order ID', 'Item ID']].groupby(by = 'Item ID').count().sort_values(by = 'Order ID', ascending = False).index.tolist()[:10]
    return {'model_based': rec_ids, 'data_based': pop}

  recommendations = {
      # The model's predictions
      'top': rec_ids,
      # The (up to) 3 most frequently ordered products
      'most_bought': user_data['Item ID'].value_counts().index.tolist()[:3],
      # Discounted / larger / sponsored suggestions, filled in below
      'discounted': [],
      'bigger': [],
      'sponsored': [],
  }

  freq_types = user_data.merge(products, on = 'Item ID')['Type'].value_counts().index.tolist()[:3]
  for ptype in freq_types:
    subset = products[products['Type'] == ptype]
    # NOTE(review): value_counts().index[0] is the MOST COMMON Discount/Quantity
    # value in the category, not the largest one -- confirm this is intended.
    disc = subset[subset['Discount'] == subset['Discount'].value_counts().index.tolist()[0]].values[0]
    biggest = subset[subset['Quantity'] == subset['Quantity'].value_counts().index.tolist()[0]].values[0]
    spons = subset[subset['Sponsored'] == True].values
    # disc[-3] is treated as the discount value; falsy means no discount
    if disc[-3]:
      recommendations['discounted'].append(disc[0])
    recommendations['bigger'].append(biggest[0])
    if len(spons) > 0:
      recommendations['sponsored'].append(spons[0][0])

  return recommendations

In [None]:
# Getting recommendations for old users
%%time
SmartBagRecommendations("777")

Please note that the exact recommended values differ because a few lines of code were changed in the prediction notebook after the model was trained and stored. We felt it was better to leave this function as it is, rather than risk introducing bugs by editing it without running the notebook. Running the notebook would mean changing the model.

In [None]:
# Getting recommendations for new users
%%time
SmartBagRecommendations("2000")