In [17]:
from typing import Dict, Text

import os
import numpy as np
import pandas as pd
import tensorflow as tf
import pprint

import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

In [47]:
ratings = tfds.load("movielens/100k-ratings", split="train")

for x in ratings.take(1).as_numpy_iterator():
    pprint.pprint(x)

print(ratings)

{'bucketized_user_age': 45.0,
 'movie_genres': array([7]),
 'movie_id': b'357',
 'movie_title': b"One Flew Over the Cuckoo's Nest (1975)",
 'raw_user_age': 46.0,
 'timestamp': 879024327,
 'user_gender': True,
 'user_id': b'138',
 'user_occupation_label': 4,
 'user_occupation_text': b'doctor',
 'user_rating': 4.0,
 'user_zip_code': b'53211'}
<_PrefetchDataset element_spec={'bucketized_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'movie_genres': TensorSpec(shape=(None,), dtype=tf.int64, name=None), 'movie_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'movie_title': TensorSpec(shape=(), dtype=tf.string, name=None), 'raw_user_age': TensorSpec(shape=(), dtype=tf.float32, name=None), 'timestamp': TensorSpec(shape=(), dtype=tf.int64, name=None), 'user_gender': TensorSpec(shape=(), dtype=tf.bool, name=None), 'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_occupation_label': TensorSpec(shape=(), dtype=tf.int64, name=None), 'user_occupation_text': Ten

2024-02-04 13:42:50.560822: W tensorflow/core/kernels/data/cache_dataset_ops.cc:858] The calling iterator did not fully read the dataset being cached. In order to avoid unexpected truncation of the dataset, the partially cached contents of the dataset  will be discarded. This can happen if you have an input pipeline similar to `dataset.cache().take(k).repeat()`. You should use `dataset.take(k).cache().repeat()` instead.


In [48]:
df = pd.read_csv(os.path.join('data','all_races_cleaned.csv'), index_col=0)

ratings = pd.DataFrame()
ratings['movie_title'] = df['horse_colour'].astype(str)
ratings['user_id'] = df['race_class'].astype(str)
ratings['timestamp'] = df['horse_weight'].astype('int64')
ratings['user_rating'] = df['place'].astype('float32')
ratings = tf.data.Dataset.from_tensor_slices(dict(ratings))
# ratings = tf.cast(ratings, tf.string)

# movies = pd.DataFrame()
# movies['movie_title'] = df['horse_colour'].astype(str)
# movies = tf.data.Dataset.from_tensor_slices(dict(movies))
# movies = tf.cast(movies, tf.string)

ratings

<_TensorSliceDataset element_spec={'movie_title': TensorSpec(shape=(), dtype=tf.string, name=None), 'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'timestamp': TensorSpec(shape=(), dtype=tf.int64, name=None), 'user_rating': TensorSpec(shape=(), dtype=tf.float32, name=None)}>

In [49]:
movie_title_lookup = tf.keras.layers.StringLookup()
movie_title_lookup.adapt(ratings.map(lambda x: x["movie_title"]))

print(f"Vocabulary: {movie_title_lookup.get_vocabulary()[:3]}")

print(movie_title_lookup(["Star Wars (1977)", "One Flew Over the Cuckoo's Nest (1975)"]))

movie_title_embedding = tf.keras.layers.Embedding(
    # Let's use the explicit vocabulary lookup.
    input_dim=movie_title_lookup.vocabulary_size(),
    output_dim=32
)

movie_title_model = tf.keras.Sequential([movie_title_lookup, movie_title_embedding])

print(movie_title_model(["Star Wars (1977)"]))

Vocabulary: ['[UNK]', 'bay', 'chestnut']
tf.Tensor([0 0], shape=(2,), dtype=int64)
tf.Tensor(
[[ 0.01168508 -0.02628584 -0.03255516  0.00796981  0.04420919 -0.00262135
  -0.00652572  0.02385812 -0.01603056 -0.02144766 -0.03840078  0.00241581
  -0.03845628 -0.01541411  0.02311036 -0.00094662  0.0314948  -0.00097756
   0.02480873 -0.00167195 -0.0293787   0.03293855 -0.01756399  0.03900624
   0.03613484  0.01605323 -0.04101385  0.02197155  0.01800734  0.00361491
  -0.00247269  0.00419196]], shape=(1, 32), dtype=float32)


In [50]:
# We set up a large number of bins to reduce the chance of hash collisions.
num_hashing_bins = 200_000

movie_title_hashing = tf.keras.layers.Hashing(
    num_bins=num_hashing_bins
)

print(movie_title_hashing(["Star Wars (1977)", "One Flew Over the Cuckoo's Nest (1975)"]))

tf.Tensor([101016  96565], shape=(2,), dtype=int64)


In [51]:
user_id_lookup = tf.keras.layers.StringLookup()
user_id_lookup.adapt(ratings.map(lambda x: x["user_id"]))

user_id_embedding = tf.keras.layers.Embedding(user_id_lookup.vocabulary_size(), 32)

user_id_model = tf.keras.Sequential([user_id_lookup, user_id_embedding])

In [52]:
for x in ratings.take(3).as_numpy_iterator():
    print(f"Timestamp: {x['timestamp']}.")

Timestamp: 1153.
Timestamp: 1076.
Timestamp: 1142.


In [53]:
timestamp_normalization = tf.keras.layers.Normalization(
    axis=None
)
timestamp_normalization.adapt(ratings.map(lambda x: x["timestamp"]).batch(1024))

for x in ratings.take(3).as_numpy_iterator():
    print(f"Normalized timestamp: {timestamp_normalization(x['timestamp'])}.")

Normalized timestamp: [0.39381862].
Normalized timestamp: [-0.7245862].
Normalized timestamp: [0.2340465].


In [54]:
max_timestamp = ratings.map(lambda x: x["timestamp"]).reduce(
    tf.cast(0, tf.int64), tf.maximum).numpy().max()
min_timestamp = ratings.map(lambda x: x["timestamp"]).reduce(
    np.int64(1e9), tf.minimum).numpy().min()

timestamp_buckets = np.linspace(
    min_timestamp, max_timestamp, num=1000)

print(f"Buckets: {timestamp_buckets[:3]}")

timestamp_embedding_model = tf.keras.Sequential([
    tf.keras.layers.Discretization(timestamp_buckets.tolist()),
    tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32)
])

for timestamp in ratings.take(1).map(lambda x: x["timestamp"]).batch(1).as_numpy_iterator():
    print(f"Timestamp embedding: {timestamp_embedding_model(timestamp)}.")

Buckets: [930.         930.57357357 931.14714715]
Timestamp embedding: [[-0.01126814  0.01927097  0.04903176  0.03438972 -0.04880816  0.00473027
   0.00607485 -0.03134649  0.00976001  0.02342154  0.00758326 -0.00995881
  -0.03468177  0.00965523 -0.01875994  0.02685263 -0.02455081  0.03758484
   0.04908016 -0.01390896  0.040326   -0.01622738 -0.02180339 -0.00511694
  -0.03684079  0.04526445  0.02674117  0.04600663  0.03111876  0.02676419
   0.0324604   0.03121363]].


In [60]:
title_text = tf.keras.layers.TextVectorization()
title_text.adapt(ratings.map(lambda x: x["movie_title"]))

for row in ratings.batch(1).map(lambda x: x["movie_title"]).take(1):
    print(title_text(row))

print(title_text.get_vocabulary()[:5])

tf.Tensor([[2]], shape=(1, 1), dtype=int64)
['', '[UNK]', 'bay', 'chestnut', 'brown']


In [61]:
class UserModel(tf.keras.Model):

  def __init__(self):
    super().__init__()

    self.user_embedding = tf.keras.Sequential([
        user_id_lookup,
        tf.keras.layers.Embedding(user_id_lookup.vocabulary_size(), 32),
    ])
    self.timestamp_embedding = tf.keras.Sequential([
      tf.keras.layers.Discretization(timestamp_buckets.tolist()),
      tf.keras.layers.Embedding(len(timestamp_buckets) + 2, 32)
    ])
    self.normalized_timestamp = tf.keras.layers.Normalization(
        axis=None
    )

  def call(self, inputs):

    # Take the input dictionary, pass it through each input layer,
    # and concatenate the result.
    return tf.concat([
        self.user_embedding(inputs["user_id"]),
        self.timestamp_embedding(inputs["timestamp"]),
        tf.reshape(self.normalized_timestamp(inputs["timestamp"]), (-1, 1))
    ], axis=1)

In [62]:
user_model = UserModel()

user_model.normalized_timestamp.adapt(
    ratings.map(lambda x: x["timestamp"]).batch(128))

for row in ratings.batch(1).take(1):
    print(f"Computed representations: {user_model(row)[0, :3]}")

Computed representations: [-0.04870258 -0.01183021 -0.02222172]


In [63]:
class MovieModel(tf.keras.Model):

  def __init__(self):
    super().__init__()

    max_tokens = 10_000

    self.title_embedding = tf.keras.Sequential([
      movie_title_lookup,
      tf.keras.layers.Embedding(movie_title_lookup.vocabulary_size(), 32)
    ])
    self.title_text_embedding = tf.keras.Sequential([
      tf.keras.layers.TextVectorization(max_tokens=max_tokens),
      tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
      # We average the embedding of individual words to get one embedding vector
      # per title.
      tf.keras.layers.GlobalAveragePooling1D(),
    ])

  def call(self, inputs):
    return tf.concat([
        self.title_embedding(inputs["movie_title"]),
        self.title_text_embedding(inputs["movie_title"]),
    ], axis=1)

In [64]:
movie_model = MovieModel()

movie_model.title_text_embedding.layers[0].adapt(
    ratings.map(lambda x: x["movie_title"]))

for row in ratings.batch(1).take(1):
  print(f"Computed representations: {movie_model(row)[0, :3]}")

Computed representations: [-0.03881745 -0.02605334 -0.0085703 ]
