In [1]:
import pprint
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_ranking as tfr
import tensorflow_recommenders as tfrs
import mysql.connector
import pandas as pd
from getpass import getpass
from mysql.connector import connect, Error
import collections
from typing import Optional, List
# Connect to database
try:
    with connect(
        host="localhost",
        user="root",
        password="mysql",
        database="Sprint1BasicEComDb"
    ) as connection:
        print(connection)
except Error as e:
    print(e)
    
def save_to_csv(data, filename, header):
    df = pd.DataFrame(data).set_axis(header, axis=1)
    df.to_csv(filename, index=False, )




  from .autonotebook import tqdm as notebook_tqdm



<mysql.connector.connection_cext.CMySQLConnection object at 0x00000204677D9B50>


# Prep-Data

In [2]:
def _sample_list(feature_lists, num_examples_per_list, book_features, random_state):
    indices = random_state.choice(
        range(len(feature_lists['book_id'])),
        size=num_examples_per_list,
        replace=False
    )
    return {feature: [feature_lists[feature][i] for i in indices] for feature in book_features}, [feature_lists["user_rating"][i] for i in indices]


def sample_listwise(
    rating_dataset: tf.data.Dataset,
    user_features: List[str],
    book_features: List[str],
    num_list_per_user: int = 50,
    num_examples_per_list: int = 5,
    seed: Optional[int] = None,
) -> tf.data.Dataset:
    """Function for converting a dataset to a listwise dataset with user and book features.
    Args:
        ... (arguments remain the same)
    Returns:
        A tf.data.Dataset containing list examples with additional user and book features.
    """
    random_state = np.random.RandomState(seed)
    example_lists_by_user = collections.defaultdict(
        lambda: collections.defaultdict(list))

    for example in rating_dataset.as_numpy_iterator():
        user_id = example["user_id"]
        for feature in user_features + book_features + ["user_rating"]:
            example_lists_by_user[user_id][feature].append(example[feature])

    # Initialize tensor_slices with empty lists for each feature
    tensor_slices = {feature: []
                     for feature in user_features + book_features + ["user_rating"]}

    for user_id, feature_lists in example_lists_by_user.items():
        for _ in range(num_list_per_user):
            if len(feature_lists["book_id"]) < num_examples_per_list:
                continue

            sampled_indices = random_state.choice(
                len(feature_lists["book_id"]),
                size=num_examples_per_list,
                replace=False
            )

            for feature in book_features:
                tensor_slices[feature].extend(
                    [feature_lists[feature][i] for i in sampled_indices])

            tensor_slices["user_rating"].extend(
                [feature_lists["user_rating"][i] for i in sampled_indices])

            for feature in user_features:
                tensor_slices[feature].extend(
                    [feature_lists[feature][0]] * num_examples_per_list)

    # Convert lists to numpy arrays or Tensors
    tensor_slices = {k: tf.convert_to_tensor(
        np.array(v)) for k, v in tensor_slices.items()}

    # Finally, return a tf.data.Dataset object
    return tf.data.Dataset.from_tensor_slices(tensor_slices)


def sample_listwise_2(
    rating_dataset: tf.data.Dataset,
    user_features: List[str],
    book_features: List[str],
    num_list_per_user: int = 50,
    num_examples_per_list: int = 5,
    seed: Optional[int] = None
) -> tf.data.Dataset:
    """Convert a dataset to a listwise dataset with user and book features."""
    random_state = np.random.RandomState(seed)
    example_lists_by_user = collections.defaultdict(
        lambda: collections.defaultdict(list))

    # Collect features for each user
    for example in rating_dataset.as_numpy_iterator():
        user_id = example["user_id"]
        for feature in user_features + book_features + ["user_rating"]:
            example_lists_by_user[user_id][feature].append(example[feature])

    structured_lists = []
    # Build structured lists for each user
    for user_id, feature_lists in example_lists_by_user.items():
        for _ in range(num_list_per_user):
            if len(feature_lists["book_id"]) < num_examples_per_list:
                continue  # Skip users with fewer books than required for a full list

            sampled_indices = random_state.choice(len(feature_lists["book_id"]),
                                                  size=num_examples_per_list,
                                                  replace=False)

            list_entry = {feature: []
                          for feature in user_features + book_features + ["user_rating"]}
            for feature in book_features + ["user_rating"]:
                for i in sampled_indices:
                    list_entry[feature].append(feature_lists[feature][i])

            # Add user features (assumed to be the same for all books in the list)
            for feature in user_features:
                list_entry[feature] = feature_lists[feature][0]

            structured_lists.append(list_entry)

    # Convert structured lists to a format suitable for tf.data.Dataset
    

    def generator():
        for entry in structured_lists:
            # Convert lists for listwise features to the required fixed length
            # Ensure single-value features are correctly formatted
            # This assumes `entry` is already structured to match these expectations
            yield entry
    
    output_types = {
        'book_id': tf.int64,
        'book_title': tf.string,
        'user_rating': tf.float32,
        'user_id': tf.int64,
        'sex': tf.int64  # Assuming 'sex' is an example of a single-value user feature
    }

    output_shapes = {
        'book_id': (num_examples_per_list,),
        'book_title': (num_examples_per_list,),
        'user_rating': (num_examples_per_list,),
        'user_id': (),
        'sex': ()  # Ensure shapes are specified for all features
    }

    # Create the dataset with known types and shapes
    return tf.data.Dataset.from_generator(generator, output_types=output_types, output_shapes=output_shapes)

# Assuming the rating_dataset is preloaded and structured correctly.


# Here's how you would call the function with the specific features:

In [3]:
def get_all_books_user_and_ratings(): #TODO: like get All Books, add attributes to the query
    query = """SELECT 
    ubr.userid, 
    u.birthdate, 
    u.sex, 
    u.genrePreference, 
    u.authorPreference,
    ubr.bookid,
    b.book,
    b.description,
    b.num_page,
    b.pre_rating,
    b.publication,
    b.authorid,
    b.genreid,
    ubr.rating AS user_rating
FROM userbookratings ubr
JOIN (
    -- User Details and Preferences Subquery
    SELECT 
        u.id AS userid, 
        u.birthdate, 
        u.sex, 
        bp.genrePreference, 
        bp.authorPreference
    FROM users u
    JOIN bookpreferences bp ON u.id = bp.userID
) u ON ubr.userid = u.userid
JOIN (
    -- Book Details Subquery
    SELECT 
        bk.id AS bookid, 
        bk.book,
        bk.description,
        bk.numPages AS num_page,
        bk.rating AS pre_rating,
        bk.publication,
        GROUP_CONCAT(DISTINCT ba.authorId ORDER BY ba.authorId ASC) AS authorid,
        GROUP_CONCAT(DISTINCT bg.genreId ORDER BY bg.genreId ASC) AS genreid
    FROM bookitems bk
    LEFT JOIN bookauthors ba ON ba.bookId = bk.id
    LEFT JOIN bookgenres bg ON bg.bookId = bk.id
    GROUP BY bk.id
) b ON ubr.bookid = b.bookid; """
    # incase connection is lost, reconnect
    connection.reconnect(attempts=3, delay=5)
    mydb = connection.cursor()
    mydb.execute(query)
    user_ratings = mydb.fetchall()
    return user_ratings


print()
save_to_csv(get_all_books_user_and_ratings(), '../User_book_Ratings.csv', ['user_id','birth_date','sex','genre_preference','author_preference','book_id','book_title','description','num_pages','pre_rating','publication','author_id','genre_id','user_rating'])

rating_rank = pd.read_csv('../User_book_Ratings.csv')
rating_rank = rating_rank.dropna()






In [4]:
# all the ones I want to use
user_features = ['userid', 'birthdate', 'sex',
                 'genrePreference', 'authorPreference']
book_features = ['book_id', 'book_title', 'description',
                 'num_page', 'pre_rating', 'publication', 'authorids', 'genreids']
# the ones I'm ac
user_features = ['user_id', 'sex']
book_features = ['book_id', 'book_title']



# convert dataframe to tf.data.Dataset
rating_rank = tf.data.Dataset.from_tensor_slices(dict(rating_rank))# LINE CAN ONLY BE USED 1NCE

# TODO: Add more features to the dataset
rating_rank = rating_rank.map(lambda x: {
    "user_id": x["user_id"],
    "sex": x['sex'],
    "book_id": x['book_id'],
    "book_title": x['book_title'],
    "user_rating": x["user_rating"],
    }
)

book_ids = rating_rank.batch(1_000_000).map(lambda x: x["book_id"])
unique_book_ids = np.unique(np.concatenate(list(book_ids)))

book_titles = rating_rank.batch(1_000_000).map(lambda x: x["book_title"])
unique_book_titles = np.unique(np.concatenate(list(book_titles)))

#num_pages = rating_rank.batch(1_000_000).map(lambda x: x["num_pages"])
#unique_num_pages = np.unique(np.concatenate(list(num_pages)))


user_ids = rating_rank.batch(1_000_000).map(lambda x: x["user_id"])
unique_user_ids = np.unique(np.concatenate(list(user_ids)))


tf.random.set_seed(42)
#######################################
# NUMBER TAKINGS NEEDS TO BE ADJUSTED #
#######################################
print(len(rating_rank))
shuffled = rating_rank.shuffle(40000, seed=42, reshuffle_each_iteration=False)
train = shuffled.take(30000)
test = shuffled.skip(30000).take(10000)


#################################################################
# Tensor Shape doesn't look right book != TensorSpec(shape=(5,) #
#################################################################
train = sample_listwise_2(train, user_features, book_features,
                              num_list_per_user=50, num_examples_per_list=5, seed=42)

test = sample_listwise_2(test, user_features, book_features,
                             num_list_per_user=1, num_examples_per_list=5, seed=42)

cached_train = train.shuffle(100).batch(32, drop_remainder=True).cache()

cached_test = test.batch(32).cache()



49000


# Pre-Process Data

In [5]:
# for now the book features we are doing are book_id, book_title, num_pages 



#book_id
book_id_lookup = tf.keras.layers.IntegerLookup()
book_id_lookup.adapt(unique_book_ids) # list of all book ids, the strings

# Book title 
book_title_lookup = tf.keras.layers.StringLookup()
book_title_lookup.adapt(unique_book_titles)# should be unique book titles, the strings


#num pages # not sure honestly
#num_pages_lookup = tf.keras.layers.Normalization(axis=None)
#num_pages_lookup.adapt(unique_num_pages.astype(np.float32)) # list of all num pages








In [6]:
class BookModel(tf.keras.Model):

    def __init__(self):
        super().__init__()

        # book_id
        # self.book_id_embedding = tf.keras.Sequential([
        #     book_id_lookup,
        #     tf.keras.layers.Embedding(book_id_lookup.vocabulary_size(), 32)
        # ])

        self.book_id_embedding = tf.keras.Sequential([
            tf.keras.layers.IntegerLookup(
                vocabulary=unique_book_ids, mask_token=None),
            tf.keras.layers.Embedding(len(unique_book_ids) + 1, 32)
        ])

        self.book_title_embedding = tf.keras.Sequential([
            tf.keras.layers.StringLookup(
                vocabulary=unique_book_titles, mask_token=None),
            tf.keras.layers.Embedding(len(unique_book_titles) + 1, 32)
        ])

        # self.book_title_embedding = tf.keras.Sequential([
        #     book_title_lookup,
        #     tf.keras.layers.Embedding(book_title_lookup.vocabulary_size(), 32)
        # ])

        # self.num_pages_embedding = tf.keras.Sequential([
        #    num_pages_lookup,
        #    tf.keras.layers.Embedding(num_pages_lookup.vocabulary_size(), 32)
        # ])

    def call(self, inputs):
        print("Model input keys:", list(inputs.keys()))
        for key, value in inputs.items():
            print(f"Shape of {key}:", value.shape)
        return tf.concat([
            self.book_id_embedding(inputs["book_id"]),
            self.book_title_embedding(inputs["book_title"]),
            # self.num_pages_embedding(inputs["num_pages"]),
        ], axis=-1)

    def get_config(self):
        config = super(BookModel, self).get_config()
        config.update({
            "unique_book_ids": self.book_id_embedding.layers[0].get_vocabulary(),
            "unique_book_titles": self.book_title_embedding.layers[0].get_vocabulary()
        })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [7]:
#  For now features are user_id, sex;
# User ID 
user_id_lookup = tf.keras.layers.IntegerLookup()
user_id_lookup.adapt(unique_user_ids)


In [8]:
class UserModel(tf.keras.Model):

    # don't these need to be passed in?
    def __init__(self):
        super().__init__()

        # User embedding based on user_id
        self.user_id_embedding = tf.keras.Sequential([
            user_id_lookup,
            tf.keras.layers.Embedding(user_id_lookup.vocabulary_size(), 32),
        ])

        # Sex as a boolean feature, could be treated directly as an input or embedded
        self.sex_embedding = tf.keras.layers.Embedding(
            2, 32)  # Assuming sex is represented as 0 or 1

    def call(self, inputs):
        print("Model input keys:", list(inputs.keys()))
        for key, value in inputs.items():
            print(f"Shape of {key}:", value.shape)

        user_id_feature = self.user_id_embedding(inputs["user_id"])
        sex_feature = self.sex_embedding(
            tf.cast(inputs["sex"], tf.int32))  # Corrected attribute name

        return tf.concat([
            user_id_feature,
            sex_feature,
        ], axis=1)

    def get_config(self):
        config = super(UserModel, self).get_config()
        config.update({
            "user_ids_vocabulary": self.user_id_embedding.layers[0].get_vocabulary(),
            "sex_embedding_dim": self.sex_embedding.output_dim
        })
        return config

    @classmethod
    def from_config(cls, config):
        return cls(**config)

# Ranking Model

In [9]:
class RankingModel(tfrs.Model):

    def __init__(self, loss):
        super().__init__()
        embedding_dimension = 32

        # User embeddings
        self.user_embeddings = tf.keras.Sequential([
            UserModel()
            ])
        # Restaurand embeddings
        self.book_embeddings = tf.keras.Sequential([
            BookModel(),
            
        ])
        # Compute predictions
        self.score_model = tf.keras.Sequential([
            # Learn multiple dense layers.
            tf.keras.layers.Dense(128, activation="relu"),
            tf.keras.layers.Dense(64, activation="relu"),
            # Make rating predictions in the final layer.
            tf.keras.layers.Dense(1)

        ])

        self.task = tfrs.tasks.Ranking(
            loss=loss,
            metrics=[
                tfr.keras.metrics.NDCGMetric(name="ndcg_metric"),
                tf.keras.metrics.RootMeanSquaredError()
            ]
        )

    def call(self, features):
        # Extract user embeddings [batch_size, embedding_dim].
        user_embeddings = self.user_embeddings({
            'user_id': features['user_id'],
            'sex': features['sex'],
        })

        # Extract book embeddings [batch_size, num_books, embedding_dim].
        book_embeddings = self.book_embeddings({
            'book_id': features['book_id'],
            'book_title': features['book_title'],
            # Potentially include other book features here
        })

        # Determine the number of books in the list for each user.
        list_length = features['book_id'].shape[1]

        # Repeat the user embeddings to match the shape of book embeddings.
        # New shape: [batch_size, num_books, embedding_dim].
        user_embedding_repeated = tf.repeat(
            tf.expand_dims(user_embeddings, 1), [list_length], axis=1)

        print("User Embedding Repeated Shape:", user_embedding_repeated.shape)
        print("Book Embeddings Shape:", book_embeddings.shape)

        # Concatenate user and book embeddings along the last dimension.
        combined_embeddings = tf.concat(
            [user_embedding_repeated, book_embeddings], axis=2)

        combined_embeddings_flat = tf.reshape(combined_embeddings, [tf.shape(combined_embeddings)[0] * list_length, -1])

        # Passing the flattened embeddings to the scoring model.
        scores_flat = self.score_model(combined_embeddings_flat)

        # Reshaping scores to match the labels shape (10, 5)
        scores = tf.reshape(scores_flat, [tf.shape(features['book_id'])[0], list_length])

        return scores


    def compute_loss(self, features, training=False):
        labels = features.pop("user_rating")
        scores = self(features)
        return self.task(
            labels=labels,
            predictions=scores,
        )

# Train Model

In [10]:
listwise_model = RankingModel(tfr.keras.losses.ListMLELoss())
listwise_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
listwise_model.fit(cached_train, epochs=50, verbose=False)

Model input keys: ['user_id', 'sex']
Shape of user_id: (32,)
Shape of sex: (32,)
Model input keys: ['book_id', 'book_title']
Shape of book_id: (32, 5)
Shape of book_title: (32, 5)
User Embedding Repeated Shape: (32, 5, 64)
Book Embeddings Shape: (32, 5, 64)

Model input keys: ['user_id', 'sex']
Shape of user_id: (32,)
Shape of sex: (32,)
Model input keys: ['book_id', 'book_title']
Shape of book_id: (32, 5)
Shape of book_title: (32, 5)
User Embedding Repeated Shape: (32, 5, 64)
Book Embeddings Shape: (32, 5, 64)


<keras.src.callbacks.History at 0x204705d0450>

In [11]:
#test
listwise_model.evaluate(cached_test, return_dict=True)

Model input keys: ['user_id', 'sex']
Shape of user_id: (None,)
Shape of sex: (None,)
Model input keys: ['book_id', 'book_title']
Shape of book_id: (None, 5)
Shape of book_title: (None, 5)
User Embedding Repeated Shape: (None, 5, 64)
Book Embeddings Shape: (None, 5, 64)


{'ndcg_metric': 0.8532437086105347,
 'root_mean_squared_error': 11.451894760131836,
 'loss': 25.12437629699707,
 'regularization_loss': 0,
 'total_loss': 25.12437629699707}

In [12]:
tf.saved_model.save(listwise_model, "export")

Model input keys: ['user_id', 'sex']
Shape of user_id: (None,)
Shape of sex: (None,)
Model input keys: ['book_id', 'book_title']
Shape of book_id: (None, 5)
Shape of book_title: (None, 5)
User Embedding Repeated Shape: (None, 5, 64)
Book Embeddings Shape: (None, 5, 64)
Model input keys: ['user_id', 'sex']
Shape of user_id: (None,)
Shape of sex: (None,)
Model input keys: ['user_id', 'sex']
Shape of user_id: (None,)
Shape of sex: (None,)
Model input keys: ['book_id', 'book_title']
Shape of book_id: (None, 5)
Shape of book_title: (None, 5)
Model input keys: ['book_id', 'book_title']
Shape of book_id: (None, 5)
Shape of book_title: (None, 5)
User Embedding Repeated Shape: (None, 5, 64)
Book Embeddings Shape: (None, 5, 64)
User Embedding Repeated Shape: (None, 5, 64)
Book Embeddings Shape: (None, 5, 64)
INFO:tensorflow:Assets written to: export\assets


INFO:tensorflow:Assets written to: export\assets


In [15]:

loaded = tf.saved_model.load("export")
loaded({
    "user_id": tf.constant([1]),  # Two lists for the same user
    "sex": tf.constant([0]),
    "book_id": tf.constant([[101,101,101,101,101]]),  # Each user has lists of books
    "book_title": tf.constant([["Book A", "Book B", "Book C", "Book D", "Book E"]]),
}).numpy()

ValueError: Could not find matching concrete function to call loaded from the SavedModel. Got:
  Positional arguments (1 total):
    * {'book_id': <tf.Tensor 'features_2:0' shape=(1, 5) dtype=int32>,
 'book_title': <tf.Tensor 'features_3:0' shape=(1, 5) dtype=string>,
 'sex': <tf.Tensor 'features_1:0' shape=(1,) dtype=int32>,
 'user_id': <tf.Tensor 'features:0' shape=(1,) dtype=int32>}
  Keyword arguments: {'training': False}

 Expected these arguments to match one of the following 2 option(s):

Option 1:
  Positional arguments (1 total):
    * {'book_id': TensorSpec(shape=(None, 5), dtype=tf.int64, name='book_id'),
 'book_title': TensorSpec(shape=(None, 5), dtype=tf.string, name='book_title'),
 'sex': TensorSpec(shape=(None,), dtype=tf.int64, name='sex'),
 'user_id': TensorSpec(shape=(None,), dtype=tf.int64, name='user_id')}
  Keyword arguments: {'training': True}

Option 2:
  Positional arguments (1 total):
    * {'book_id': TensorSpec(shape=(None, 5), dtype=tf.int64, name='book_id'),
 'book_title': TensorSpec(shape=(None, 5), dtype=tf.string, name='book_title'),
 'sex': TensorSpec(shape=(None,), dtype=tf.int64, name='sex'),
 'user_id': TensorSpec(shape=(None,), dtype=tf.int64, name='user_id')}
  Keyword arguments: {'training': False}