# Trails
This code is to trail the models in an easy design place

In [2]:
pip install -q tensorflow-recommenders

[?25l[K     |███▊                            | 10 kB 25.1 MB/s eta 0:00:01[K     |███████▍                        | 20 kB 30.6 MB/s eta 0:00:01[K     |███████████                     | 30 kB 37.8 MB/s eta 0:00:01[K     |██████████████▊                 | 40 kB 41.5 MB/s eta 0:00:01[K     |██████████████████▍             | 51 kB 27.0 MB/s eta 0:00:01[K     |██████████████████████          | 61 kB 30.3 MB/s eta 0:00:01[K     |█████████████████████████▊      | 71 kB 27.1 MB/s eta 0:00:01[K     |█████████████████████████████▍  | 81 kB 28.3 MB/s eta 0:00:01[K     |████████████████████████████████| 89 kB 7.2 MB/s 
[?25h

In [3]:
import subprocess
import sys
from typing import Dict, Text
import os
import tempfile
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
import pandas as pd  
import time
start_time = time.time

In [10]:
df = pd.read_parquet("sample.parquet") 

In [11]:
df.head()

Unnamed: 0,CUSTOMER_ACCOUNT_ID,DATE_PURCHASE_REQUEST,LINE_NUMBER,Credit,GENDER_CODE,LINE_DESC,int
22505,M4369584,20220228,RA4YJ,Y,F,50 LITRE RECTANGULAR SENSOR BI,P
32525,49845713,20210903,T9URN,Y,F,ACCESSORIZE 2 X EMBELLISHED FL,P
31292,89275233,20220111,TLR7R,Y,F,"META QUEST 2 128GB, ALL-IN-ONE",P
21817,82643296,20220122,TEALG,Y,F,MONSOON GEETA ANIMAL LUREX PRI,P
3773,W9019882,20211208,RQ670,Y,F,DEEP SLEEP 15.0 TOG DUVET COVE,P


In [12]:
ds = tf.data.Dataset.from_tensor_slices(dict(df[['CUSTOMER_ACCOUNT_ID','DATE_PURCHASE_REQUEST','GENDER_CODE','Credit','LINE_DESC','int']]))
product = tf.data.Dataset.from_tensor_slices(dict(df[['LINE_NUMBER','LINE_DESC']].drop_duplicates(subset=['LINE_NUMBER'], keep='first')))


In [13]:
ratings = ds.map(lambda x: {
    "LINE_DESC": x["LINE_DESC"],
    "CUSTOMER_ACCOUNT_ID": x["CUSTOMER_ACCOUNT_ID"],
    "DATE_PURCHASE_REQUEST": x["DATE_PURCHASE_REQUEST"],
    "GENDER_CODE": x["GENDER_CODE"],
    "Credit": x["Credit"],
    "int": x["int"],
},num_parallel_calls=tf.data.AUTOTUNE)
    
products = product.map(lambda x: x["LINE_DESC"],num_parallel_calls=tf.data.AUTOTUNE)
timestamps = np.concatenate(list(ratings.map(lambda x: x["DATE_PURCHASE_REQUEST"]).batch(100)))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=2000,
)

print("Time Stamps made")

unique_product_titles = df['LINE_NUMBER'].unique()#np.unique(np.concatenate(list(ratings.batch(1_000).map(lambda x: x["LINE_NUMBER"],num_parallel_calls=tf.data.AUTOTUNE)))
unique_user_ids = df['CUSTOMER_ACCOUNT_ID'].unique()
unique_gender_ids = df['GENDER_CODE'].unique()
unique_credit_ids = df['Credit'].unique()
unique_int_ids = df['int'].unique()

print("Number of customers ",len(unique_user_ids))


print("List of products and users")
layers=32

length=df.shape[0]
frac = int(length*0.8)
small_frac = length-frac
print("Found fractions")
train = ratings.take(frac)
test = ratings.skip(frac).take(small_frac)

print("Test train split complete")

cached_train = train.batch(128).cache()
cached_test = test.batch(128).cache()



Time Stamps made
Number of customers  117307
List of products and users
Found fractions
Test train split complete


In [14]:
class UserModel(tf.keras.Model):

  def __init__(self):
    super().__init__()

    self.user_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_user_ids, mask_token=None),
        tf.keras.layers.Embedding(len(unique_user_ids) + 1, layers),
    ])
    
    self.gender_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(vocabulary=unique_gender_ids, mask_token=None),
        tf.keras.layers.Embedding(len(unique_gender_ids) + 1, layers),
    ])
    
    self.credit_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(vocabulary=unique_credit_ids, mask_token=None),
        tf.keras.layers.Embedding(len(unique_credit_ids) + 1, layers),
    ])

    self.timestamp_embedding = tf.keras.Sequential([
        tf.keras.layers.Discretization(timestamp_buckets.tolist()),
        tf.keras.layers.Embedding(len(timestamp_buckets) + 1, layers),
    ])
    self.normalized_timestamp = tf.keras.layers.Normalization(
        axis=None
    )

    self.normalized_timestamp.adapt(timestamps)

  def call(self, inputs):
    return tf.concat([
        self.user_embedding(inputs["CUSTOMER_ACCOUNT_ID"]),
        self.gender_embedding(inputs["GENDER_CODE"]),
        self.credit_embedding(inputs["Credit"]),
        self.timestamp_embedding(inputs["DATE_PURCHASE_REQUEST"]),
        tf.reshape(self.normalized_timestamp(inputs["DATE_PURCHASE_REQUEST"]), (-1, 1)),
    ], axis=1)


class ProductModel(tf.keras.Model):
  
  def __init__(self):
    super().__init__()

    max_tokens = 10_000
    num_hashing_bins = 200_000
        
    self.title_embedding = tf.keras.Sequential([
      tf.keras.layers.StringLookup(vocabulary=unique_product_titles, mask_token=None),
      #tf.keras.layers.Hashing(num_bins=num_hashing_bins), 
      tf.keras.layers.Embedding(len(unique_product_titles) + 1, 32)
    ])

    self.title_vectorizer = tf.keras.layers.TextVectorization(
        max_tokens=max_tokens)

    self.title_text_embedding = tf.keras.Sequential([
      self.title_vectorizer,
      tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
      tf.keras.layers.GlobalAveragePooling1D(),
    ])

    self.title_vectorizer.adapt(products)
    
    self.int_embedding = tf.keras.Sequential([
        tf.keras.layers.StringLookup(
            vocabulary=unique_int_ids, mask_token=None),
        tf.keras.layers.Embedding(len(unique_int_ids) + 1, layers),
    ])
    
  def call(self, inputs):
    return tf.concat([
        self.title_embedding(inputs["titles"]),
        self.title_text_embedding(inputs["titles"]),
        self.int_embedding(inputs["int"])
    ], axis=1)



class RecsModel(tfrs.models.Model):

  def __init__(self):
    super().__init__()
    self.query_model = tf.keras.Sequential([
      UserModel(),
      tf.keras.layers.Dense(layers)
    ])
    self.candidate_model = tf.keras.Sequential([
      ProductModel(),
      tf.keras.layers.Dense(layers)
    ])
    self.task = tfrs.tasks.Retrieval(
        metrics=tfrs.metrics.FactorizedTopK(
            candidates=products.apply(tf.data.experimental.dense_to_ragged_batch(128)).map(self.candidate_model), 
        ),
    )

    
  def compute_loss(self, features, training=False):
    # We only pass the user id and timestamp features into the query model. This
    # is to ensure that the training inputs would have the same keys as the
    # query inputs. Otherwise the discrepancy in input structure would cause an
    # error when loading the query model after saving it.
    query_embeddings = self.query_model({
        "CUSTOMER_ACCOUNT_ID": features["CUSTOMER_ACCOUNT_ID"],
        "GENDER_CODE": features["GENDER_CODE"],
        "Credit": features["Credit"],
        "DATE_PURCHASE_REQUEST": features["DATE_PURCHASE_REQUEST"],
    })
    product_embeddings = self.candidate_model({
        "title":features["LINE_DESC"],
        "int":features["int"],
    })

    return self.task(query_embeddings, product_embeddings)

In [15]:
model = RecsModel()
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.1))

TypeError: ignored

In [None]:
temp=model.fit(cached_train, epochs=2,validation_data=cached_test,validation_freq=1)

Epoch 1/2
Epoch 2/2


In [None]:
temp.history

{'factorized_top_k/top_1_categorical_accuracy': [1.0, 1.0],
 'factorized_top_k/top_5_categorical_accuracy': [1.0, 1.0],
 'factorized_top_k/top_10_categorical_accuracy': [1.0, 1.0],
 'factorized_top_k/top_50_categorical_accuracy': [1.0, 1.0],
 'factorized_top_k/top_100_categorical_accuracy': [1.0, 1.0],
 'loss': [3483.13720703125, 3714.119384765625],
 'regularization_loss': [0, 0],
 'total_loss': [3483.13720703125, 3714.119384765625],
 'val_factorized_top_k/top_1_categorical_accuracy': [1.0, 1.0],
 'val_factorized_top_k/top_5_categorical_accuracy': [1.0, 1.0],
 'val_factorized_top_k/top_10_categorical_accuracy': [1.0, 1.0],
 'val_factorized_top_k/top_50_categorical_accuracy': [1.0, 1.0],
 'val_factorized_top_k/top_100_categorical_accuracy': [1.0, 1.0],
 'val_loss': [144.16986083984375, 398.935546875],
 'val_regularization_loss': [0, 0],
 'val_total_loss': [144.16986083984375, 398.935546875]}

In [None]:
temp.history["val_factorized_top_k/top_100_categorical_accuracy"]

[1.0, 1.0]

In [None]:
num_validation_runs = len(temp.history["val_factorized_top_k/top_100_categorical_accuracy"])
epochs = [(x + 1)* 5 for x in range(num_validation_runs)]
plt.plot(epochs, two_layer_history.history["val_factorized_top_k/top_100_categorical_accuracy"], label="2 layers")
plt.title("Accuracy vs epoch")
plt.xlabel("epoch")
plt.ylabel("Top-100 accuracy");