# Feature preprocessing

In [1]:
import pprint

import tensorflow_datasets as tfds

# Pull the training split of the MovieLens 100k ratings dataset.
ratings = tfds.load(
    "movielens/100k-ratings",
    split="train",
)

# Show a single raw example so the available features are visible.
for example in ratings.take(1).as_numpy_iterator():
    pprint.pprint(example)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7], dtype=int64),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}


### Categorical features into embeddings

In [2]:
import numpy as np
import tensorflow as tf

# String-to-index layer for movie titles; its vocabulary is learned later via `adapt`.
movie_title_lookup = tf.keras.layers.StringLookup()

In [3]:
# Build the vocabulary from every movie title that appears in the ratings.
movie_title_lookup.adapt(
    ratings.map(lambda rating: rating["movie_title"])
)

print(f"Vocabulary: {movie_title_lookup.get_vocabulary()[:3]}")

Vocabulary: ['[UNK]', 'Star Wars (1977)', 'Contact (1997)']


In [4]:
# Translate raw titles into their integer vocabulary indices.
movie_title_lookup([
    "Star Wars (1977)",
    "One Flew Over the Cuckoo's Nest (1975)",
])

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([ 1, 58], dtype=int64)>

In [5]:
# A generous bin count keeps the chance of two titles sharing a hash bucket low.
num_hashing_bins = 200_000

movie_title_hashing = tf.keras.layers.Hashing(num_bins=num_hashing_bins)

In [6]:
# Hash raw titles directly into bin ids; no vocabulary is needed for this.
movie_title_hashing([
    "Star Wars (1977)",
    "One Flew Over the Cuckoo's Nest (1975)",
])

<tf.Tensor: shape=(2,), dtype=int64, numpy=array([101016,  96565], dtype=int64)>

#### Embeddings definition

In [7]:
movie_title_embedding = tf.keras.layers.Embedding(
    # Size the table from the explicit vocabulary lookup. `vocabulary_size()`
    # includes the out-of-vocabulary ('[UNK]') slot and replaces the
    # deprecated `vocab_size()` accessor, which later Keras releases dropped.
    input_dim=movie_title_lookup.vocabulary_size(),
    output_dim=32,
)





In [8]:
# Chain lookup and embedding so raw title strings map straight to vectors.
movie_title_model = tf.keras.Sequential([
    movie_title_lookup,
    movie_title_embedding,
])

In [9]:
# Passing one raw title yields a (1, 32) float32 embedding tensor.
movie_title_model(["Star Wars (1977)"])





<tf.Tensor: shape=(1, 32), dtype=float32, numpy=
array([[ 0.00610381,  0.0199898 , -0.00529752,  0.00561236,  0.00504239,
         0.01730869, -0.02592282,  0.00428874, -0.01569873, -0.02609544,
         0.03421838,  0.00506601,  0.02429708, -0.00970216,  0.02343206,
        -0.01074767,  0.02097772, -0.0414677 ,  0.01972109,  0.00086432,
         0.03630659,  0.01607773, -0.0374319 ,  0.03371208,  0.01802197,
        -0.04982349, -0.03293213, -0.01756442,  0.02507091, -0.04020079,
        -0.02304267, -0.03547617]], dtype=float32)>

In [10]:
# Same recipe for user ids: learn the vocabulary, then size the embedding from it.
user_id_lookup = tf.keras.layers.StringLookup()
user_id_lookup.adapt(ratings.map(lambda x: x["user_id"]))

# `vocabulary_size()` supersedes the deprecated `vocab_size()` accessor.
user_id_embedding = tf.keras.layers.Embedding(
    input_dim=user_id_lookup.vocabulary_size(),
    output_dim=32,
)

user_id_model = tf.keras.Sequential([user_id_lookup, user_id_embedding])





### Continuous features normalization

In [11]:
# Print a few raw timestamps; they are large integers that need rescaling.
first_three_ratings = ratings.take(3)
for x in first_three_ratings.as_numpy_iterator():
    print(f"Timestamp: {x['timestamp']}.")

Timestamp: 879024327.
Timestamp: 875654590.
Timestamp: 882075110.


#### Standardization

In [13]:
# axis=None makes the layer learn one scalar mean/variance for the feature.
timestamp_normalization = tf.keras.layers.Normalization(axis=None)

# Fit the statistics on every timestamp, streamed in batches of 1024.
timestamp_normalization.adapt(
    ratings.map(lambda rating: rating["timestamp"]).batch(1024)
)

for x in ratings.take(3).as_numpy_iterator():
    print(f"Normalized timestamp: {timestamp_normalization(x['timestamp'])}.")

Normalized timestamp: [-0.8429372].
Normalized timestamp: [-1.4735202].
Normalized timestamp: [-0.27203265].


#### Discretization
Another common transformation is to turn a continuous feature into a number of categorical features. This makes good sense if we have reasons to suspect that a feature's effect is non-continuous.

To do this, we first need to establish the boundaries of the buckets we will use for discretization. The easiest way is to identify the minimum and maximum value of the feature, and divide the resulting interval equally:

In [14]:
# Fold over the timestamps once per extreme. Each reduction is seeded with the
# opposite end of the int64 range, so the first real timestamp always replaces
# the seed and the result never depends on a hard-coded bound such as 1e9
# (which would silently cap the minimum for data with larger timestamps).
timestamp_values = ratings.map(lambda x: x["timestamp"])

max_timestamp = timestamp_values.reduce(
    tf.constant(tf.int64.min, dtype=tf.int64), tf.maximum).numpy()
min_timestamp = timestamp_values.reduce(
    tf.constant(tf.int64.max, dtype=tf.int64), tf.minimum).numpy()

# Equally spaced bucket boundaries spanning the observed range.
timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000)

print(f"Buckets: {timestamp_buckets[:3]}")

Buckets: [8.74724710e+08 8.74743291e+08 8.74761871e+08]


In [15]:
# N bucket boundaries produce bucket ids 0..N, hence N + 1 embedding rows.
timestamp_embedding_model = tf.keras.Sequential([
    tf.keras.layers.Discretization(timestamp_buckets.tolist()),
    tf.keras.layers.Embedding(
        input_dim=len(timestamp_buckets) + 1,
        output_dim=32,
    ),
])

# Push a single-element batch of one timestamp through the model.
single_timestamp_batches = (
    ratings.take(1)
    .map(lambda x: x["timestamp"])
    .batch(1)
)
for timestamp in single_timestamp_batches.as_numpy_iterator():
    print(f"Timestamp embedding: {timestamp_embedding_model(timestamp)}.")

Timestamp embedding: [[-0.00444943 -0.03609366  0.00537453  0.02719212 -0.04922748 -0.04301534
  -0.00922342 -0.00604397 -0.04302615  0.03931583 -0.0448388   0.00739118
  -0.02025586 -0.03494602 -0.02490181 -0.0369978  -0.00393345 -0.03461321
  -0.01413335 -0.01352978  0.03437234 -0.03381085 -0.03317273 -0.04448729
  -0.02640278 -0.02385683  0.03025926  0.01867988  0.0499028   0.03835675
   0.04041982  0.00054783]].


### Processing text features

In [16]:
# Tokenizing layer for titles; the token vocabulary is learned from the data.
title_text = tf.keras.layers.TextVectorization()
title_text.adapt(
    ratings.map(lambda rating: rating["movie_title"])
)

In [17]:
# Vectorize one single-title batch to see the token ids it maps to.
single_title_batches = ratings.batch(1).map(lambda x: x["movie_title"])
for title_batch in single_title_batches.take(1):
    print(title_text(title_batch))

tf.Tensor([[ 32 266 162   2 267 265  53]], shape=(1, 7), dtype=int64)


In [18]:
# Peek at a slice of the learned token vocabulary.
title_text.get_vocabulary()[40:45]

['first', '1998', '1977', '1971', 'monty']

### Entity models

In [19]:
class UserModel(tf.keras.Model):
    """User tower: a user-id embedding plus two views of the rating timestamp.

    Relies on the notebook-level `user_id_lookup` (already adapted) and
    `timestamp_buckets`. The `normalized_timestamp` layer is created without
    statistics; call `adapt` on it before invoking the model.
    """

    def __init__(self):
        super().__init__()
        self.user_embedding = tf.keras.Sequential([
            user_id_lookup,
            # `vocabulary_size()` replaces the deprecated `vocab_size()`.
            tf.keras.layers.Embedding(user_id_lookup.vocabulary_size(), 32),
        ])
        self.timestamp_embedding = tf.keras.Sequential([
            tf.keras.layers.Discretization(timestamp_buckets.tolist()),
            # N boundaries produce bucket ids 0..N, so N + 1 rows are enough.
            tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
        ])
        self.normalized_timestamp = tf.keras.layers.Normalization(
            axis=None
        )

    def call(self, inputs):
        """Featurize a batch of examples.

        Args:
            inputs: Dictionary with "user_id" and "timestamp" entries.

        Returns:
            The user-id embedding, the bucketized-timestamp embedding and the
            normalized timestamp (reshaped to one column), concatenated along
            axis 1.
        """
        return tf.concat([
            self.user_embedding(inputs["user_id"]),
            self.timestamp_embedding(inputs["timestamp"]),
            tf.reshape(self.normalized_timestamp(inputs["timestamp"]), (-1, 1))
        ], axis=1)

In [20]:
user_model = UserModel()

# The model's normalization layer starts without statistics; fit it on all
# timestamps before calling the model.
timestamp_batches_for_adapt = ratings.map(
    lambda rating: rating["timestamp"]
).batch(128)
user_model.normalized_timestamp.adapt(timestamp_batches_for_adapt)

for row in ratings.batch(1).take(1):
    print(f"Computed representations: {user_model(row)[0, :3]}")





Computed representations: [-0.01049465  0.02520282 -0.03168577]


In [21]:
class MovieModel(tf.keras.Model):
    """Movie tower: a whole-title embedding plus an averaged word embedding.

    Relies on the notebook-level `movie_title_lookup` (already adapted). The
    `TextVectorization` layer inside `title_text_embedding` is created without
    a vocabulary; call `adapt` on it before invoking the model.
    """

    def __init__(self):
        super().__init__()
        max_tokens = 10_000
        self.title_embedding = tf.keras.Sequential([
            movie_title_lookup,
            # `vocabulary_size()` replaces the deprecated `vocab_size()`.
            tf.keras.layers.Embedding(movie_title_lookup.vocabulary_size(), 32),
        ])
        self.title_text_embedding = tf.keras.Sequential([
            tf.keras.layers.TextVectorization(max_tokens=max_tokens),
            tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
            # We average the embedding of individual words to get one embedding
            # vector per title.
            tf.keras.layers.GlobalAveragePooling1D(),
        ])

    def call(self, inputs):
        """Featurize a batch of examples.

        Args:
            inputs: Dictionary with a "movie_title" entry.

        Returns:
            The whole-title embedding and the averaged word embedding,
            concatenated along axis 1.
        """
        return tf.concat([
            self.title_embedding(inputs["movie_title"]),
            self.title_text_embedding(inputs["movie_title"]),
        ], axis=1)

In [22]:
movie_model = MovieModel()

# The first layer of the text branch is the TextVectorization layer; learn its
# token vocabulary from every title before calling the model.
title_vectorizer = movie_model.title_text_embedding.layers[0]
title_vectorizer.adapt(ratings.map(lambda rating: rating["movie_title"]))

for row in ratings.batch(1).take(1):
    print(f"Computed representations: {movie_model(row)[0, :3]}")





Computed representations: [-0.0181139   0.01279012 -0.03667275]


# Leverage Context Features

In [24]:
import os
import tempfile

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import tensorflow_recommenders as tfrs

In [25]:
# Load the ratings and the movie catalogue for this section.
ratings = tfds.load("movielens/100k-ratings", split="train")
movies = tfds.load("movielens/100k-movies", split="train")

# Keep only the features used in this section.
ratings = ratings.map(lambda x: {
    feature: x[feature]
    for feature in ("movie_title", "user_id", "timestamp")
})
# The movie catalogue is reduced to bare titles.
movies = movies.map(lambda x: x["movie_title"])

In [26]:
# Materialize every timestamp as one NumPy array to find the observed range.
timestamp_batches = ratings.map(lambda rating: rating["timestamp"]).batch(100)
timestamps = np.concatenate(list(timestamp_batches))

max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

# Equally spaced bucket boundaries spanning the observed range.
timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000)

# Distinct titles and user ids, collected batch by batch.
movie_title_batches = movies.batch(1000)
unique_movie_titles = np.unique(np.concatenate(list(movie_title_batches)))

user_id_batches = ratings.batch(1_000).map(lambda batch: batch["user_id"])
unique_user_ids = np.unique(np.concatenate(list(user_id_batches)))

### Model Definition