In [None]:
#RETRIEVAL MODEL CELL

from typing import Dict, Text

class UsersBooksModel(tfrs.Model):
  """Two-tower retrieval model matching user ids with book titles.

  Each tower is a StringLookup followed by an Embedding. When ``layers`` is
  given, a single stack of Dense layers (shared by both towers) is applied on
  top of the embeddings, both for the loss and for the top-K metric candidates.

  Args:
    embedding_dimension: Output size of both Embedding layers.
    layers: Optional sequence of Dense layer sizes. Every layer except the
      last uses a ReLU activation, the last one has no activation. ``None``
      (the default) disables the dense stack.
  """

  def __init__(self, embedding_dimension, layers = None):
    super().__init__()
    self.embedding_dimension = embedding_dimension
    self.layers_size = layers

    # User tower. The "+ 1" leaves room for the out-of-vocabulary index that
    # StringLookup adds by default.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=users_vocabulary_2, mask_token=None),
      tf.keras.layers.Embedding(len(users_vocabulary_2) + 1, self.embedding_dimension),
    ])

    # Book tower, built the same way from the title vocabulary.
    self.book_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=titles_vocabulary_2, mask_token=None),
      tf.keras.layers.Embedding(len(titles_vocabulary_2) + 1, self.embedding_dimension),
    ])

    # The dense stack is created before the task so that the metric
    # candidates below can be projected through it.
    if layers is not None:
        self.dense_layers = tf.keras.Sequential()

        for layer_size in layers[:-1]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size, activation="relu"))

        # No activation for the last layer.
        for layer_size in layers[-1:]:
            self.dense_layers.add(tf.keras.layers.Dense(layer_size))

    # Candidates go through the same transformation as the book embeddings
    # in compute_loss; otherwise the metric would score queries against
    # candidates living in a different (possibly differently sized) space.
    self.task = tfrs.tasks.Retrieval(
      metrics=tfrs.metrics.FactorizedTopK(
        candidates=
            TFdata_books_titles.batch(128).cache().map(
                lambda titles: self._apply_dense(self.book_embeddings(titles)))
      )
    )

  def _apply_dense(self, embeddings):
    """Run embeddings through the dense stack, or return them unchanged if there is none."""
    if self.layers_size is None:
        return embeddings
    return self.dense_layers(embeddings)

  def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
    """Compute the retrieval loss for one batch.

    Args:
      features: Batch dictionary; the 'person_id' and 'title' entries are read.
      training: When True the (expensive) top-K metrics are not computed.

    Returns:
      The value returned by the retrieval task for this batch.
    """
    user_embeddings = self._apply_dense(self.user_embeddings(features['person_id']))
    book_embeddings = self._apply_dense(self.book_embeddings(features['title']))

    return self.task(user_embeddings, book_embeddings, compute_metrics = not training)

In [None]:
model = UsersBooksModel(128, None)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))

def scheduler(epoch, lr):
  """Learning-rate schedule: 0.1 for epoch indices below 4, 0.001 afterwards.

  Args:
    epoch: Epoch index supplied by the LearningRateScheduler callback.
    lr: Current learning rate (unused, the schedule is absolute).

  Returns:
    The learning rate to use for that epoch.
  """
  return 0.1 if epoch < 4 else 0.001

callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

# The callback was previously created but never handed to fit(). With
# epochs=4 every epoch index is below 4, so the rate stays at 0.1; the lower
# rate only takes effect if the number of epochs is increased.
history = model.fit(
    cached_train,
    validation_data = cached_val,
    epochs=4,
    callbacks=[callback],
    verbose=1,
)

In [None]:
#CUSTOM SERVING: ONE RETRIEVAL INDEX PER GENRE

#FOR CUSTOM SERVING
# Genre names; the position of a genre in this list is the index of its
# retrieval layer in `retrieving_layers`.
mapping = ['Comics&GraphicNovels', #0
              'Family-Sex&Relationships',#1
              'Humor',#2
              'History',#3
              'ScienceFiction&Fantasy',#4
              'Romance',#5
              'Travel',#6
              'Mystery&Thrillers',#7
              'FreeTime',#8
              'Non-fiction',#9
              'Biography',#10
              'SocialScience',#11
              'Political',#12
              'Crime',#13
              'Children&Teens',#14
              'Philosophy',#15
              'Horror',#16
              'Health-Mind&Body',#17
              'Professional&Technical',#18
              'Science&Nature']#19
              # 'Fiction&Literature' is not given an index of its own.

# One catalogue slice per genre (in `mapping` order), then the full catalogue.
# regex=False matches the genre name literally; na=False leaves rows with a
# missing genre_string out of the slice instead of producing a NaN mask that
# cannot be used for boolean indexing.
sub_df_books = [
    df_books[df_books['genre_string'].str.contains(genre, regex=False, na=False)]
    for genre in mapping
]
sub_df_books.append(df_books) #ALL GENRES

sub_tensors = [
    tf.data.Dataset.from_tensor_slices(dict(dataframe)) for dataframe in sub_df_books
]
sub_titles = [tensor.map(lambda x: x["title"]) for tensor in sub_tensors]

# retrieving_layers[i] serves genre mapping[i]; the last entry, at index
# len(mapping), serves the whole catalogue.
retrieving_layers = []

for GENRE, genre_titles in enumerate(sub_titles):
    # The all-genres layer keeps being indexed from TFdata_books_titles
    # rather than from the titles of df_books.
    candidate_titles = genre_titles if GENRE < len(mapping) else TFdata_books_titles

    retrieving = tfrs.layers.factorized_top_k.BruteForce(model.user_embeddings)
    retrieving.index_from_dataset(
        candidate_titles.batch(8192).map(lambda title: (title, model.book_embeddings(title)))
    )
    retrieving_layers.append(retrieving)

#EVALUATION LOSS PLOT

# Hard-coded loss values, one list per embedding dimension; position i of
# each list belongs to epoch i + 1.
epochs = list(range(1, 11))
emb_loss_128 = [
    69484.21875, 66804.921875, 64446.19921875, 64028.34375, 64608.6640625,
    65709.34375, 66901.25, 67988.7109375, 69031.4140625, 69966.8515625,
]
emb_loss_64 = [
    69572.15625, 67922.671875, 65706.921875, 64586.23046875, 64244.6328125,
    64476.9296875, 64981.9765625, 65678.109375, 66410.140625, 67183.578125,
]
emb_loss_32 = [
    69612.9453125, 68702.1171875, 67113.0703125, 65801.546875, 65081.4375,
    64724.9765625, 64697.08984375, 64843.30078125, 65141.2578125, 65502.36328125,
]
emb_loss_16 = [
    69630.8515625, 69143.359375, 68086.0859375, 67019.71875, 66199.34375,
    65673.71875, 65350.4296875, 65205.1484375, 65166.53125, 65222.36328125,
]
emb_loss_8 = [
    69641.9140625, 69399.5625, 68796.65625, 68078.546875, 67451.28125,
    66971.3671875, 66613.1328125, 66364.5625, 66187.71875, 66077.7265625,
]
emb_loss_4 = [
    69649.796875, 69517.9375, 69161.84375, 68712.828125, 68285.4296875,
    67933.5, 67644.59375, 67427.9765625, 67249.625, 67120.34375,
]

# (legend label, series) pairs in the order the curves are drawn.
loss_curves = [
    ("embedding dim. 128", emb_loss_128),
    ("embedding dim. 64", emb_loss_64),
    ("embedding dim. 32", emb_loss_32),
    ("embedding dim. 16", emb_loss_16),
    ("embedding dim. 8", emb_loss_8),
    ("embedding dim. 4", emb_loss_4),
]

plt.figure()
plt.xlabel("Epoch")
plt.ylabel("Validation log loss")
for curve_label, curve_values in loss_curves:
    plt.plot(epochs, curve_values, label = curve_label)
plt.legend()
plt.show()

In [None]:
#CREATION OF USER FEATURE (FAVOURITE GENRE OF USER)
def most_common(lst):
    """Return the element that occurs most often in ``lst``.

    Counting is done in a single pass. When several elements share the highest
    count, the one that appears first in ``lst`` is returned, so the result
    does not depend on set iteration order.

    Args:
        lst: Sequence of hashable elements.

    Returns:
        The most frequent element.

    Raises:
        ValueError: If ``lst`` is empty.
    """
    from collections import Counter

    if len(lst) == 0:
        raise ValueError("most_common() arg is an empty sequence")
    return Counter(lst).most_common(1)[0][0]

interactions_train = df_train.to_records(index=False)

# Build a dictionary with the training interactions of each user. Here the
# genres are needed as well: field 0 of a record is used as the user key and
# field 13 is evaluated to a dict whose keys are the genres of that interaction.
# NOTE(review): eval() executes whatever text is stored in that field; it is
# only acceptable while df_train comes from a trusted source.
interactions_per_user_train = {}
for interaction in interactions_train:
    user_genres = interactions_per_user_train.setdefault(interaction[0], [])
    user_genres.extend(eval(interaction[13]).keys())

# Replace each user's genre list with that user's single most frequent genre.
for user, genres in interactions_per_user_train.items():
    interactions_per_user_train[user] = most_common(genres)

In [None]:
list_users = df_train['person_id'].unique().tolist()
retrieving = tfrs.layers.factorized_top_k.BruteForce(model.user_embeddings, k=20)
retrieving.index_from_dataset(
        TFdata_books_titles.batch(8192).map(lambda title: (title, model.book_embeddings(title)))
    )
users = []              # per evaluated user: 1 if at least one recommendation was hit, else 0
recomms = []            # per evaluated user: number of distinct recommended titles that were hit
user_genre_counter = {} # genre -> number of hit books carrying that genre

# Genre name -> index of the matching layer in `retrieving_layers`.
mapping = {'Comics&GraphicNovels': 0,
              'Family-Sex&Relationships': 1,
              'Humor': 2,
              'History': 3,
              'ScienceFiction&Fantasy': 4,
              'Romance': 5,
              'Travel': 6,
              'Mystery&Thrillers': 7,
              'FreeTime': 8,
              'Non-fiction': 9,
              'Biography': 10,
              'SocialScience': 11,
              'Political': 12,
              'Crime': 13,
              'Children&Teens': 14,
              'Philosophy': 15,
              'Horror': 16,
              'Health-Mind&Body': 17,
              'Professional&Technical': 18,
              'Science&Nature': 19}

# False: plain top-20 retrieval. True: top-10 of the user's favourite genre
# followed by the not-yet-listed titles of the overall top-20.
# (Previously the custom branch sat under `if False:` and its result was
# overwritten by the plain retrieval even when enabled.)
USE_CUSTOM_SERVING = False

# Lookups built once instead of re-filtering df_test for every user / hit.
test_titles_per_user = df_test.groupby('person_id')['title'].agg(list).to_dict()
# First genre value found for each title, as the per-hit filter used to return.
genre_per_title = df_test.drop_duplicates(subset='title').set_index('title')['genre'].to_dict()

#RETRIEVAL AVG HIT USERS AND RECOMM. (CUSTOM OR NORMAL SERVING)
for index, user in enumerate(list_users):
    if index % 10000 == 0:
        print(index)

    # Users without test interactions are not evaluated; checking this first
    # avoids a retrieval call whose result would be thrown away.
    user_read_books = np.asarray(test_titles_per_user.get(user, []))
    if user_read_books.size == 0:
        continue

    if USE_CUSTOM_SERVING:
        #Favourite user genre
        # NOTE(review): a favourite genre that is not a key of `mapping`
        # raises KeyError here - confirm every genre in the data is listed.
        favourite_genre = interactions_per_user_train[user]

        _, titles_1 = retrieving_layers[mapping[favourite_genre]](tf.constant([user]), k=10)
        _, titles_2 = retrieving_layers[len(mapping)](tf.constant([user]), k=20)

        titles_decoded = [item.decode() for item in titles_1[0].numpy()]
        for title in titles_2[0].numpy():
            title = title.decode('utf-8')
            if title not in titles_decoded:
                titles_decoded.append(title)
        titles_decoded = np.asarray(titles_decoded)
    else:
        _, titles = retrieving(tf.constant([user]), k=20)
        titles_decoded = np.asarray([item.decode() for item in titles[0].numpy()])

    # Distinct titles that were both recommended and read in the test set.
    intersection = np.intersect1d(titles_decoded, user_read_books)

    #Genres
    for intersect in intersection:
        genre_dict = genre_per_title[intersect]
        # NOTE(review): eval() executes the stored text; acceptable only
        # while df_test comes from a trusted source.
        for genre in eval(genre_dict).keys():
            user_genre_counter[genre] = user_genre_counter.get(genre, 0) + 1

    users.append(1 if intersection.size != 0 else 0)
    recomms.append(intersection.size)

# Share of evaluated users with at least one hit, and mean number of hits per evaluated user.
print(np.mean(np.asarray(users)), np.mean(np.asarray(recomms)))

In [None]:
#AVG RANK RETRIEVAL
# For every user with test interactions, record the best (lowest, 0-based)
# position at which one of their test titles appears in the ranking, then
# print the mean of those positions.

# False: rank the whole catalogue with `retrieving`. True: use the custom
# serving list (favourite-genre top-10 plus the remaining overall top-20).
# (Previously the custom branch sat under `if False:` and its result was
# overwritten by the plain retrieval even when enabled.)
USE_CUSTOM_SERVING = False

# Built once instead of re-filtering df_test for every user.
test_titles_by_user = df_test.groupby('person_id')['title'].agg(list).to_dict()

avg_rank = []
unranked_users = 0  # users whose test titles never appear in the ranking
for index, user in enumerate(list_users):
    if index % 10000 == 0:
        print(index)

    # Skip users without test interactions before paying for a retrieval call.
    user_read_books = np.asarray(test_titles_by_user.get(user, []))
    if user_read_books.size == 0:
        continue

    if USE_CUSTOM_SERVING:
        #Favourite user genre
        # NOTE(review): a favourite genre that is not a key of `mapping`
        # raises KeyError here - confirm every genre in the data is listed.
        favourite_genre = interactions_per_user_train[user]

        _, titles_1 = retrieving_layers[mapping[favourite_genre]](tf.constant([user]), k=10)
        _, titles_2 = retrieving_layers[len(mapping)](tf.constant([user]), k=20)

        titles_decoded = [item.decode() for item in titles_1[0].numpy()]
        for title in titles_2[0].numpy():
            title = title.decode('utf-8')
            if title not in titles_decoded:
                titles_decoded.append(title)
        titles_decoded = np.asarray(titles_decoded)
    else:
        _, titles = retrieving(tf.constant([user]), k=df_books.shape[0])
        titles_decoded = np.asarray([item.decode() for item in titles[0].numpy()])

    # Positions in the ranking occupied by titles the user read in the test set.
    ranks = np.flatnonzero(np.isin(titles_decoded, user_read_books))
    if ranks.size == 0:
        # Taking the minimum of an empty array would raise; count the user
        # and leave them out of the average.
        unranked_users += 1
        continue
    avg_rank.append(ranks.min())

if unranked_users:
    print("users without any ranked test title:", unranked_users)
print(np.mean(np.asarray(avg_rank)))

In [None]:
#HIT GENRES

# Hard-coded number of hit books per genre, drawn as a bar chart in this order.
dict_plot = {
    'Mystery&Thrillers': 7387,
    'Children&Teens': 1831,
    'ScienceFiction&Fantasy': 5401,
    'Humor': 1868,
    'History': 2579,
    'Crime': 3218,
    'FreeTime': 2402,
    'Family-Sex&Relationships': 4751,
    'Romance': 3430,
    'Non-fiction': 1093,
    'Professional&Technical': 750,
    'Philosophy': 1001,
    'Political': 520,
    'Science&Nature': 349,
    'Biography': 746,
    'Travel': 1703,
    'Horror': 1052,
    'SocialScience': 419,
    'Health-Mind&Body': 777,
    'Comics&GraphicNovels': 519,
}

names, values = dict_plot.keys(), dict_plot.values()

plt.figure()
plt.bar(names, values)
# Genre names are long, so the tick labels are drawn vertically.
plt.xticks(rotation='vertical')
plt.ylabel('Number of hit books')
plt.show()
plt.close()