In [12]:
import os
import pprint
from getpass import getpass

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
import keras
import mysql.connector
import pandas as pd
from mysql.connector import connect, Error

# Connect to database
# The password is read from the MYSQL_PASSWORD environment variable, falling
# back to an interactive prompt, so no credential is stored in the notebook.
# Host, user and database can be overridden the same way and otherwise keep
# their previous values.
try:
    # The connection is deliberately NOT opened in a `with` block: the query
    # helpers in later cells reuse the module-level `connection`, and a
    # context manager would close it as soon as this cell finished.
    connection = connect(
        host=os.environ.get("MYSQL_HOST", "localhost"),
        user=os.environ.get("MYSQL_USER", "root"),
        password=os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: "),
        database=os.environ.get("MYSQL_DATABASE", "Sprint1BasicEComDb"),
    )
    print(connection)
except Error as e:
    # Report the connection failure; `connection` stays undefined in that
    # case, so the cells below cannot run until this cell succeeds.
    print(e)

<mysql.connector.connection_cext.CMySQLConnection object at 0x000001068A5CAD50>


In [13]:
def get_user_ratings(): #TODO: like get All Books, add attributes to the query
    """Fetch the userId, bookId and rating columns of every UserBookRatings row.

    Uses the module-level ``connection``, which is re-established before the
    query (up to 3 attempts, 5 seconds apart).

    Returns:
        The rows exactly as returned by ``cursor.fetchall()``.
    """
    query = "SELECT userId,bookId,rating FROM UserBookRatings"
    # in case the connection is lost, reconnect
    connection.reconnect(attempts=3, delay=5)
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the query raises.
        cursor.close()


def get_all_books_metadata():
    """Run the books metadata query and return all rows it yields.

    Uses the module-level ``connection``, which is re-established before the
    query (up to 3 attempts, 5 seconds apart).

    Returns:
        The rows exactly as returned by ``cursor.fetchall()``.
    """
    # TODO: the SELECT statement for the books metadata has not been written;
    # the query text is still blank (same value as before, spelled with escapes).
    # NOTE(review): executing a blank statement presumably fails - confirm and
    # fill in the real query.
    query = " \n    "
    # in case the connection is lost, reconnect
    connection.reconnect(attempts=3, delay=5)
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the query raises.
        cursor.close()


def get_all_users_metadata():
    """Fetch the id, birthDate and sex columns of every Users row.

    Uses the module-level ``connection``, which is re-established before the
    query (up to 3 attempts, 5 seconds apart).

    Returns:
        The rows exactly as returned by ``cursor.fetchall()``.
    """
    query = "SELECT id,birthDate,sex FROM Users"
    # in case the connection is lost, reconnect
    connection.reconnect(attempts=3, delay=5)
    cursor = connection.cursor()
    try:
        cursor.execute(query)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the query raises.
        cursor.close()


def save_to_csv(data, filename, header):
    """Write ``data`` to ``filename`` as CSV with ``header`` as the column names.

    Args:
        data: Anything ``pd.DataFrame`` accepts, e.g. the rows from ``fetchall()``.
        filename: Destination path handed to ``DataFrame.to_csv``.
        header: Column labels, one per column of ``data``.

    Raises:
        ValueError: If non-empty ``data`` has a different number of columns
            than ``header``.
    """
    frame = pd.DataFrame(data)
    if frame.shape == (0, 0):
        # Nothing was fetched, so there are no columns to relabel and
        # set_axis would fail on the length mismatch. Emit a header-only file.
        frame = pd.DataFrame(columns=header)
    else:
        frame = frame.set_axis(header, axis=1)
    # index=False keeps the positional row index out of the file.
    frame.to_csv(filename, index=False)
    
#save_to_csv(get_user_ratings(), 'user_ratings.csv', ['user_id', 'movie_title', 'rating'])
#save_to_csv(get_all_books_metadata(), 'all_books.csv', ['book_id', 'book_title', 'description', 'num_pages', 'rating', 'num_of_voters','genres','formats','authors'])
#save_to_csv(get_all_users_metadata(), 'all_users.csv', ['user_id', 'birth_date', 'sex'])



In [14]:
# Raw rating rows from the database (a list of rows, not a DataFrame).
df = get_user_ratings()

# Label the three columns and cast both id columns to str in one step.
# Note the bookId values live under the "movie_title" column name.
user_ratings_ids = pd.DataFrame(
    df, columns=["user_id", "movie_title", "user_rating"]
).astype({"user_id": str, "movie_title": str})

# Independent copies: the full rating triples and the book ids on their own.
rating_rank = user_ratings_ids[['user_id', 'movie_title', 'user_rating']].copy()
book_rank = user_ratings_ids[['movie_title']].copy()

# Drop the reference to the intermediate frame.
user_ratings_ids=[]


In [15]:
# Create tf.data.Datasets from the dataframes.

books = tf.data.Dataset.from_tensor_slices(dict(book_rank)) # book_rank only
books = books.map(lambda x: x["movie_title"])

# `rating_rank` is rebound from a DataFrame to a Dataset here. Converting only
# while it is still a DataFrame keeps the cell safe to re-run; previously a
# second run failed because dict() was applied to the Dataset.
if isinstance(rating_rank, pd.DataFrame):
    rating_rank = tf.data.Dataset.from_tensor_slices(dict(rating_rank))
    rating_rank = rating_rank.map(lambda x: {
        'user_id': x['user_id'],
        'movie_title': x['movie_title'],
        'user_rating': x['user_rating'],
    })

# Vocabularies for the lookup layers: distinct book ids and distinct user ids,
# gathered batch by batch.
unique_books = np.unique(np.concatenate(list(books.batch(1000))))

unique_user_ids = np.unique(np.concatenate(list(rating_rank.batch(1_000).map(lambda x: x['user_id']))))

# decode from bytes to string
unique_books = [book.decode('utf-8') for book in unique_books]
unique_user_ids = [user_id.decode('utf-8') for user_id in unique_user_ids]


In [16]:
tf.random.set_seed(42)

# NOTE(review): the shuffle buffer (100) and the 70/30 split are hard-coded
# and look sized for roughly 100 ratings - TODO confirm against the real
# number of rows and derive them from the dataset size if it differs.
shuffled = rating_rank.shuffle(
    buffer_size=100, seed=42, reshuffle_each_iteration=False)

# Training data: the first 70 shuffled ratings, turned into 50 lists per user
# with 5 of the books that user rated in each list.
train = tfrs.examples.movielens.sample_listwise(
    shuffled.take(70),
    num_list_per_user=50,
    num_examples_per_list=5,
    seed=42,
)

# Test data: the next 30 shuffled ratings, one list of 5 books per user.
# NOTE(review): an earlier note flagged that this call could leave `test`
# empty - TODO confirm.
test = tfrs.examples.movielens.sample_listwise(
    shuffled.skip(70).take(30),
    num_list_per_user=1,
    num_examples_per_list=5,
    seed=42,
)
print(train)

<_TensorSliceDataset element_spec={'user_id': TensorSpec(shape=(), dtype=tf.string, name=None), 'movie_title': TensorSpec(shape=(5,), dtype=tf.string, name=None), 'user_rating': TensorSpec(shape=(5,), dtype=tf.float64, name=None)}>


In [17]:
class RankingModel(tfrs.Model):
  """Listwise ranking model that scores a list of books for one user.

  User ids and book ids are embedded separately, concatenated per list item
  and passed through a small dense network that emits one score per item.
  The vocabularies come from the notebook-level ``unique_user_ids`` and
  ``unique_books`` lists, which must exist before the model is constructed.
  """

  def __init__(self, loss):
    """Build the embedding towers, the scoring network and the ranking task.

    Args:
      loss: Loss object handed to ``tfrs.tasks.Ranking``.
    """
    super().__init__()
    embedding_dimension = 32

    # Compute embeddings for users.
    # The embedding table has two more rows than the vocabulary has entries;
    # presumably headroom for the lookup layer's out-of-vocabulary index.
    self.user_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_user_ids),
      tf.keras.layers.Embedding(len(unique_user_ids) + 2, embedding_dimension)
    ])

    # Compute embeddings for books (stored under the "movie_title" feature).
    self.movie_embeddings = tf.keras.Sequential([
      tf.keras.layers.StringLookup(
        vocabulary=unique_books),
      tf.keras.layers.Embedding(len(unique_books) + 2, embedding_dimension)
    ])

    # Compute predictions.
    self.score_model = tf.keras.Sequential([
      # Learn multiple dense layers.
      tf.keras.layers.Dense(256, activation="relu"),
      tf.keras.layers.Dense(64, activation="relu"),
      # Make rating predictions in the final layer.
      tf.keras.layers.Dense(1)
    ])

    self.task = tfrs.tasks.Ranking(
      loss=loss,
      metrics=[
        tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
        tf.keras.metrics.RootMeanSquaredError()
      ]
    )

  def call(self, features):
    """Score every book in each list for the corresponding user.

    Args:
      features: Mapping with a "user_id" entry (one id per example) and a
        "movie_title" entry (one list of book ids per example).

    Returns:
      The score network's output, with a trailing dimension of size 1.
    """
    # We first convert the id features into embeddings.
    # User embeddings are a [batch_size, embedding_dim] tensor.
    user_embeddings = self.user_embeddings(features["user_id"])

    # Book embeddings are a [batch_size, num_books_in_list, embedding_dim]
    # tensor.
    movie_embeddings = self.movie_embeddings(features["movie_title"])

    # We want to concatenate user embeddings with book embeddings to pass
    # them into the ranking model. To do so, we need to reshape the user
    # embeddings to match the shape of the book embeddings.
    # The list length is read from the static shape, so it must be known.
    list_length = features["movie_title"].shape[1]
    user_embedding_repeated = tf.repeat(
        tf.expand_dims(user_embeddings, 1), [list_length], axis=1)

    # Once reshaped, we concatenate and pass into the dense layers to generate
    # predictions.
    concatenated_embeddings = tf.concat(
        [user_embedding_repeated, movie_embeddings], 2)

    return self.score_model(concatenated_embeddings)

  def compute_loss(self, features, training=False):
    """Compute the ranking loss for one batch.

    Args:
      features: Mapping holding the model inputs plus the "user_rating"
        labels. It is left unmodified.
      training: Accepted for interface compatibility; not used here.

    Returns:
      The value returned by the ranking task for this batch.
    """
    # Read the labels without popping them, so the caller's dict keeps its
    # "user_rating" entry and can be reused after this call.
    labels = features["user_rating"]
    model_inputs = {
        name: tensor for name, tensor in features.items()
        if name != "user_rating"
    }

    scores = self(model_inputs)

    return self.task(
        labels=labels,
        # Drop the trailing size-1 dimension so scores line up with labels.
        predictions=tf.squeeze(scores, axis=-1),
    )


In [18]:
# Training lists are shuffled, batched and cached; test lists are only
# batched and cached.
cached_train = train.shuffle(100).batch(8192).cache()
cached_test = test.batch(4096).cache()

# Length of each batched dataset, training first.
print(len(cached_train))
print(len(cached_test))

1
1


In [19]:
# Train the ranking model with the ListMLE listwise loss for 5 epochs.
listwise_model = RankingModel(loss=tfr.keras.losses.ListMLELoss())
listwise_model.compile(
    optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1)
)
listwise_model.fit(cached_train, epochs=5, verbose=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x106bec98210>

In [20]:
# Evaluate on the held-out lists; return_dict=True gives metrics keyed by name.
listwise_model_result = listwise_model.evaluate(cached_test, return_dict=True)
print(f"NDCG of the ListMLE model: {listwise_model_result['ndcg_metric']:.4f}")

# Persist the trained model to the "easy_listwise_model_saved" directory.
listwise_model.save("easy_listwise_model_saved")


NDCG of the ListMLE model: 0.6607








INFO:tensorflow:Assets written to: easy_listwise_model_saved\assets


INFO:tensorflow:Assets written to: easy_listwise_model_saved\assets










In [21]:
# Score five book ids for user "42" with the in-memory model.
predictions = listwise_model(
    dict(
        user_id=tf.constant(["42"]),
        movie_title=tf.constant([["1", "2", "3", "4", "5"]]),
    )
)

# Reload the model saved in the previous cell.
# NOTE(review): `loaded_model` is not used by the visible cells, which keep
# predicting with `listwise_model` - TODO confirm which one is intended.
loaded_model = keras.models.load_model('easy_listwise_model_saved')


def make_predictions(model, user_id, books_lists): # model, user_id:str[], books_lists:str[[]], has to be equal lengths
    """Score each book list for the user id at the same position.

    Args:
        model: Callable taking a dict with "user_id" and "movie_title" entries.
        user_id: Sequence of user ids, one per entry of ``books_lists``.
        books_lists: Sequence of book-id lists to score.

    Returns:
        A list with one model output per entry of ``books_lists``, in order.

    Raises:
        IndexError: If ``user_id`` is shorter than ``books_lists``.
    """
    def _score(position):
        # Each call is a batch of one: a single user id and a single book list.
        return model({
            "user_id": tf.constant([user_id[position]]),
            "movie_title": tf.constant([books_lists[position]])
        })

    return [_score(position) for position in range(len(books_lists))]

# Score the same five book ids for users "1" and "2" with the trained model.
predictions = make_predictions(
    listwise_model,
    ["1", "2"],
    [
        ["1", "2", "3", "4", "5"],
        ["1", "2", "3", "4", "5"],
    ],
)

print(predictions)

[<tf.Tensor: shape=(1, 5, 1), dtype=float32, numpy=
array([[[0.43344492],
        [0.12169501],
        [0.46525118],
        [0.31846273],
        [0.16784719]]], dtype=float32)>, <tf.Tensor: shape=(1, 5, 1), dtype=float32, numpy=
array([[[0.42098093],
        [0.08873773],
        [0.43597856],
        [0.28630453],
        [0.1273804 ]]], dtype=float32)>]
