## Import the important libraries

In [1]:
import tensorflow_recommenders as tfrs
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Read the dataset 

In [65]:
# All three BX CSV files are ';'-separated and read as Latin-1.
csv_options = dict(delimiter=';', encoding='latin-1')

user_data = pd.read_csv('data/BX-CSV/BX-Users.csv', **csv_options)
# Rows of the books file that cannot be parsed are skipped instead of raising.
book_data = pd.read_csv(
    'data/BX-CSV/BX-Books.csv',
    on_bad_lines='skip',
    low_memory=False,
    **csv_options,
)
rating_data = pd.read_csv('data/BX-CSV/BX-Book-Ratings.csv', **csv_options)

## Preparing dataset

In [3]:
# Preview the first rows of the ratings table.
rating_data.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [4]:
# Preview the first rows of the books table.
book_data.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [5]:
# List the columns available in the books table.
book_data.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [67]:
# Attach book metadata to every rating through the shared ISBN column
# (pandas' default inner join). All columns are kept here; the cells that
# build the tf.data pipelines select the ones they need.
rated_data = pd.merge(rating_data, book_data, on='ISBN')

In [7]:
##len(rating['Book-Title'].unique())

## Preprocess the data for model building

In [68]:
# Store user IDs as strings: they later feed the StringLookup vocabulary of the user tower.
rated_data['User-ID'] = rated_data['User-ID'].astype(str)

In [9]:
# List the columns of the merged ratings/books table.
rated_data.columns

Index(['User-ID', 'ISBN', 'Book-Rating', 'Book-Title', 'Book-Author',
       'Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-M',
       'Image-URL-L'],
      dtype='object')

In [10]:
# Wrap the title and user columns of the merged frame in a tf.data pipeline
# so they can be transformed with Dataset operations.
retrieval_columns = rated_data[['Book-Title', 'User-ID']]
rated_data1 = tf.data.Dataset.from_tensor_slices(dict(retrieval_columns))

In [11]:
# Candidate pipeline: one dataset element per row of the books table, holding only the title.
# NOTE: this rebinds `book_data` from a DataFrame to a tf.data.Dataset, so the
# cell cannot be re-run without first reloading the CSV.
title_column = book_data[['Book-Title']]
book_data = tf.data.Dataset.from_tensor_slices(dict(title_column))

In [12]:
# Keep only the features the retrieval model consumes.
def _select_retrieval_features(row):
    """Return the title / user pair of one interaction as a feature dict."""
    return {
        "Book-Title": row['Book-Title'],
        "User-ID": row['User-ID'],
    }


ratings = rated_data1.map(_select_retrieval_features)

# The candidate dataset yields bare title strings.
books = book_data.map(lambda row: row['Book-Title'])

In [13]:
# Batch the datasets so the raw strings can be collected as NumPy arrays.
book_titles = books.batch(1000)
user_ids = ratings.batch(1_000_000).map(lambda batch: batch['User-ID'])

# Deduplicated vocabularies of book titles and user IDs.
unique_book_titles = np.unique(np.concatenate([titles for titles in book_titles]))
unique_users = np.unique(np.concatenate([ids for ids in user_ids]))

In [14]:
# Split the interactions into training and test sets.
# tf.random.set_seed is a function and has to be *called*; assigning to it
# would replace the function with an int and leave the global seed unset.
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

## Model building : Retrieval model

#### Building embedding layers

In [172]:
# Width of the embedding vectors produced by the user and book towers below.
embedding_dim = 32

In [16]:
# User tower: map a user-ID string to an integer index, then to an embedding.
# The embedding table has one extra row because the lookup reserves an index
# for IDs outside the vocabulary.
user_lookup = tf.keras.layers.StringLookup(vocabulary=unique_users, mask_token=None)
user_table = tf.keras.layers.Embedding(len(unique_users) + 1, embedding_dim)
user_model = tf.keras.Sequential([user_lookup, user_table])

In [17]:
# Book tower: map a title string to an integer index, then to an embedding.
# The embedding table has one extra row because the lookup reserves an index
# for titles outside the vocabulary.
title_lookup = tf.keras.layers.StringLookup(vocabulary=unique_book_titles, mask_token=None)
title_table = tf.keras.layers.Embedding(len(unique_book_titles) + 1, embedding_dim)
book_model = tf.keras.Sequential([title_lookup, title_table])

In [25]:
# Top-K metric computed against the book-tower embeddings of every candidate title.
candidate_embeddings = books.batch(128).map(book_model)
metrics = tfrs.metrics.FactorizedTopK(candidates=candidate_embeddings)

In [26]:
# Retrieval task configured with the FactorizedTopK metrics defined above.
task = tfrs.tasks.Retrieval(metrics=metrics)

#### Model development

In [22]:
class BookModel(tfrs.Model):
    """Two-tower retrieval model pairing a user tower with a book tower.

    Args:
        book_model: Callable mapping book titles to embeddings.
        user_model: Callable mapping user IDs to embeddings.
        retrieval_task: Optional task object that turns the two embeddings
            into a loss. When omitted, the notebook-level ``task`` is used,
            so existing ``BookModel(book_model, user_model)`` calls keep
            working.
    """

    def __init__(self, book_model, user_model, retrieval_task=None):
        # super().__init__ already receives the instance; passing `self`
        # again would hand it over as a stray positional argument.
        super().__init__()
        self.book_model = book_model
        self.user_model = user_model
        # The global is only looked up when no task was supplied.
        self.task = task if retrieval_task is None else retrieval_task

    def compute_loss(self, features, training=False):
        """Return the task loss for one batch.

        Args:
            features: Mapping with 'User-ID' and 'Book-Title' entries.
            training: Accepted for interface compatibility; not used here.
        """
        user_embedding = self.user_model(features['User-ID'])
        book_embedding = self.book_model(features['Book-Title'])

        return self.task(user_embedding, book_embedding)

In [27]:
# Instantiate the retrieval model and attach an Adagrad optimizer.
model = BookModel(book_model, user_model)
retrieval_optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model.compile(optimizer=retrieval_optimizer)

In [28]:
# Shuffle and batch the training split, then cache the resulting batches.
train_batches = train.shuffle(100_000).batch(8192)
cached_train = train_batches.cache()

# The test split is only batched and cached.
test_batches = test.batch(4096)
cached_test = test_batches.cache()

In [29]:
# Train the retrieval model for three epochs over the cached training batches.
model.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a5b6a227d0>

In [30]:
# Evaluate on the held-out batches; return_dict=True yields a metric-name -> value mapping.
model.evaluate(cached_test, return_dict=True)



{'factorized_top_k/top_1_categorical_accuracy': 0.0009500000160187483,
 'factorized_top_k/top_5_categorical_accuracy': 0.005849999841302633,
 'factorized_top_k/top_10_categorical_accuracy': 0.01075000036507845,
 'factorized_top_k/top_50_categorical_accuracy': 0.03750000149011612,
 'factorized_top_k/top_100_categorical_accuracy': 0.06159999966621399,
 'loss': 29540.271484375,
 'regularization_loss': 0,
 'total_loss': 29540.271484375}

In [62]:
def predict(user, k=10):
    """Print the top-``k`` book titles retrieved for ``user``.

    A brute-force index over all candidate titles is built on each call from
    the current weights of ``model``.

    Args:
        user: User identifier; converted with ``str`` before the lookup.
        k: Number of titles to retrieve. Defaults to 10, matching the
            previous behaviour.
    """
    index = tfrs.layers.factorized_top_k.BruteForce(model.user_model, k=k)
    # Pair each batch of titles with its embeddings so the index reports
    # titles (rather than row numbers) as candidate identifiers.
    title_batches = books.batch(100)
    index.index_from_dataset(
        tf.data.Dataset.zip((title_batches, title_batches.map(model.book_model)))
    )

    _, titles = index(tf.constant([str(user)]))

    print(titles)

In [61]:
# Show the titles retrieved for user 276726.
predict(276726)

tf.Tensor(
[[b"Forever Isn'T Long Enough (Family Ties) (Harlequin Romance, No 3377)"
  b'The Berenstain Bears: No Girls Allowed (First Time Books)'
  b"Groom'S Revenge (Harlequin Presents, 2035)"
  b'My Best Picture Word Book'
  b'Puppies need someone to love (A Golden look-look book)'
  b'Wild Side (Harlequin Romance, No 2979)' b"Mooly's Slow Teeth"
  b'Dark Enigma (Harlequin Romance)'
  b"Sax Rohmer's the Trail of Fu Manchu"
  b"Buchanan'S Bride (Buckles &amp; Broncos) (Silhouette Special Edition, No 1012)"]], shape=(1, 10), dtype=string)


## Model building : Ranking model

In [69]:
# The ranking pipeline additionally carries the explicit rating as its label.
ranking_columns = rated_data[['Book-Title', 'User-ID', 'Book-Rating']]
rank_rated_data = tf.data.Dataset.from_tensor_slices(dict(ranking_columns))

In [123]:
# Same features as the retrieval pipeline, plus the rating to be predicted.
def _select_ranking_features(row):
    """Return user, title and rating of one interaction as a feature dict."""
    return {
        "User-ID": row['User-ID'],
        "Book-Title": row['Book-Title'],
        "Book-Rating": row['Book-Rating'],
    }


rating_rank = rank_rated_data.map(_select_ranking_features)

In [124]:
# Split the rated interactions into training and test sets.
# A fixed shuffle seed keeps the split identical across runs. The former
# `tf.random.set_seed = 42` assignment was dropped: it replaced the function
# with an int instead of seeding anything.
shuffled = rating_rank.shuffle(100_000, seed=42, reshuffle_each_iteration=False)
train = shuffled.take(80_000)
test = shuffled.skip(80_000).take(20_000)

In [125]:
# Pull each feature column out of the ranking dataset in large batches.
user_ids = rating_rank.batch(1_000_000).map(lambda batch: batch['User-ID'])
book_titles = rating_rank.batch(1_000_000).map(lambda batch: batch['Book-Title'])

# Vocabularies for the StringLookup layers of the models defined below.
unique_users = np.unique(np.concatenate([ids for ids in user_ids]))
unique_book_titles = np.unique(np.concatenate([titles for titles in book_titles]))

#### Model development

In [126]:
class RankingModel(tf.keras.Model):
    """Predicts a rating from a (user ID, book title) pair.

    Both inputs are embedded, the embeddings are concatenated, and a small
    dense network maps the result to a single value per example.
    """

    def __init__(self):
        super().__init__()
        embedding_dim = 32

        # Book embedding: string -> index -> vector (one extra row for unknown titles).
        title_lookup = tf.keras.layers.StringLookup(vocabulary=unique_book_titles, mask_token=None)
        title_table = tf.keras.layers.Embedding(len(unique_book_titles) + 1, embedding_dim)
        self.book_embedding = tf.keras.Sequential([title_lookup, title_table])

        # User embedding: string -> index -> vector (one extra row for unknown users).
        user_lookup = tf.keras.layers.StringLookup(vocabulary=unique_users, mask_token=None)
        user_table = tf.keras.layers.Embedding(len(unique_users) + 1, embedding_dim)
        self.user_embedding = tf.keras.Sequential([user_lookup, user_table])

        # Dense head producing one rating value per example.
        rating_layers = [
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1),
        ]
        self.ratings = tf.keras.Sequential(rating_layers)

    def call(self, inputs):
        """Return predicted ratings for ``inputs = (user_ids, book_titles)``."""
        user_id, book_title = inputs

        book_vectors = self.book_embedding(book_title)
        user_vectors = self.user_embedding(user_id)

        joined = tf.concat([user_vectors, book_vectors], axis=1)
        return self.ratings(joined)

In [109]:
# Smoke check: call a freshly constructed (untrained) RankingModel on one (user ID, title) pair.
RankingModel()((["10001"], ["The Notebook"]))



<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.00699688]], dtype=float32)>

In [80]:
# Stand-alone ranking task (MSE loss, RMSE metric). It is bound to its own
# name so the retrieval `task` that BookModel reads when it is constructed
# is not overwritten.
ranking_task = tfrs.tasks.Ranking(
    loss=tf.keras.losses.MeanSquaredError(),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

In [152]:
class BookrankModel(tfrs.models.Model):
    """Ranking model trained to predict 'Book-Rating' with a mean-squared-error loss."""

    def __init__(self):
        super().__init__()
        self.ranking_model = RankingModel()
        self.task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()])

    def call(self, features):
        """Return predicted ratings for the 'User-ID' / 'Book-Title' entries of ``features``."""
        return self.ranking_model(
            (features["User-ID"], features["Book-Title"])
        )

    def compute_loss(self, features, training=False):
        """Return the ranking loss for one batch.

        Args:
            features: Mapping with 'User-ID', 'Book-Title' and 'Book-Rating'
                entries; it is left unmodified.
            training: Accepted for interface compatibility; not used here.
        """
        # Pop the label from a shallow copy so the caller's dict keeps it.
        inputs = dict(features)
        labels = inputs.pop("Book-Rating")
        rating_predictions = self(inputs)

        return self.task(labels=labels, predictions=rating_predictions)

In [153]:
# Instantiate the ranking model and attach an Adagrad optimizer.
model_rank = BookrankModel()
ranking_optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
model_rank.compile(optimizer=ranking_optimizer)

In [154]:
# Shuffle and batch the ranking training split, then cache the batches.
train_batches = train.shuffle(100_000).batch(8192)
cached_train = train_batches.cache()

# The test split is only batched and cached.
test_batches = test.batch(4096)
cached_test = test_batches.cache()

In [155]:
# Train the ranking model for three epochs over the cached training batches.
model_rank.fit(cached_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1a655fdc610>

In [157]:
# Evaluate on the held-out batches; return_dict=True yields a metric-name -> value mapping.
model_rank.evaluate(cached_test, return_dict=True)



{'root_mean_squared_error': 3.864675521850586,
 'loss': 14.96182918548584,
 'regularization_loss': 0,
 'total_loss': 14.96182918548584}

In [165]:
# Score a handful of titles for one user with the trained ranking model.
test_ranking = {}
test_titles = ['Nemesis', 'I Is for Innocent', 'Out of Africa ; and, Shadows on the grass']

# Use a dedicated loop variable so the `book_titles` dataset defined in an
# earlier cell is not overwritten by a plain string.
for candidate_title in test_titles:
    test_ranking[candidate_title] = model_rank({
        "User-ID": np.array(["100011"]),
        "Book-Title": np.array([candidate_title]),
    })

In [166]:
# Raw predictions per title; each value is the tensor returned by model_rank.
print(test_ranking)

{'Nemesis': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[2.9469976]], dtype=float32)>, 'I Is for Innocent': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[2.9540777]], dtype=float32)>, 'Out of Africa ; and, Shadows on the grass': <tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[3.0142553]], dtype=float32)>}


In [168]:
# List the scored titles from highest to lowest predicted rating.
ranked_titles = sorted(test_ranking.items(), key=lambda item: item[1], reverse=True)
for title, score in ranked_titles:
    print(f"{title}: {score}")

Out of Africa ; and, Shadows on the grass: [[3.0142553]]
I Is for Innocent: [[2.9540777]]
Nemesis: [[2.9469976]]


## Multi-task recommendation model

In [174]:
class BooktaskModel(tfrs.models.Model):
    """Multi-task model sharing user/book embeddings between rating prediction and retrieval.

    Args:
        rating_weight: Multiplier applied to the rating (MSE) loss.
        retrieval_weight: Multiplier applied to the retrieval loss.
    """

    def __init__(self, rating_weight, retrieval_weight):
        super().__init__()
        embedding_dim = 32

        # Shared towers: string -> index -> embedding, with one extra
        # embedding row for out-of-vocabulary inputs.
        self.book_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_book_titles, mask_token=None),
            tf.keras.layers.Embedding(len(unique_book_titles)+1, embedding_dim)
        ])

        self.user_model = tf.keras.Sequential([
            tf.keras.layers.StringLookup(vocabulary=unique_users, mask_token=None),
            tf.keras.layers.Embedding(len(unique_users)+1, embedding_dim)
        ])

        # Dense head that turns the concatenated embeddings into one rating value.
        self.ratings = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(1)
        ])

        self.rating_task = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()])

        # Retrieval metrics are computed against this model's own book tower.
        self.retrieval_task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(candidates=books.batch(128).map(self.book_model))
        )

        self.rating_weight = rating_weight
        self.retrieval_weight = retrieval_weight

    def call(self, features):
        """Return ``(user_embedding, book_embedding, predicted_rating)`` for a batch.

        Args:
            features: Mapping with 'User-ID' and 'Book-Title' entries.
        """
        user_embedding = self.user_model(features['User-ID'])
        book_embedding = self.book_model(features['Book-Title'])

        predicted_rating = self.ratings(tf.concat([user_embedding, book_embedding], axis=1))
        return (user_embedding, book_embedding, predicted_rating)

    def compute_loss(self, features, training=False):
        """Return the weighted sum of the rating loss and the retrieval loss.

        Args:
            features: Mapping with 'User-ID', 'Book-Title' and 'Book-Rating'
                entries; it is left unmodified.
            training: Accepted for interface compatibility; not used here.
        """
        # Pop the label from a shallow copy so the caller's dict keeps it.
        inputs = dict(features)
        labels = inputs.pop("Book-Rating")
        user_embedding, book_embedding, rating_predictions = self(inputs)

        rating_loss = self.rating_task(labels=labels, predictions=rating_predictions)
        retrieval_loss = self.retrieval_task(user_embedding, book_embedding)

        return self.rating_weight * rating_loss + self.retrieval_weight * retrieval_loss

In [175]:
# Weight the rating and retrieval losses equally.
multi_model = BooktaskModel(1, 1)
multi_task_optimizer = tf.keras.optimizers.Adagrad(learning_rate=0.1)
multi_model.compile(optimizer=multi_task_optimizer)

In [176]:
# Train the multi-task model, then evaluate it on the held-out batches.
# NOTE: `metrics` is rebound here to the evaluation result dict; earlier in the
# notebook the same name referred to the FactorizedTopK metric object.
multi_model.fit(cached_train, epochs=3)
metrics = multi_model.evaluate(cached_test, return_dict=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [177]:
# Report one headline number per task from the evaluation result dict.
print(f"Retrieval top-100 accuracy: {metrics['factorized_top_k/top_100_categorical_accuracy']:.3f}.")
print(f"Ranking RMSE: {metrics['root_mean_squared_error']:.3f}.")

Retrieval top-100 accuracy: 0.125.
Ranking RMSE: 3.878.


In [179]:
# multi_model returns (user embedding, book embedding, predicted rating) in
# that order, so the unpacked names follow the same order.
trained_user_embeddings, trained_book_embeddings, predicted_rating = multi_model({
      "User-ID": np.array(["100042"]),
      "Book-Title": np.array(["I Is for Innocent"])
  })
print("Predicted rating:")
print(predicted_rating)


Predicted rating:
tf.Tensor([[4.261733]], shape=(1, 1), dtype=float32)
