In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import re
import tensorflow as tf
import numpy as np
import datetime
import os
#from faker import Faker
import pickle as pickle



# Preparing Dataset

Using Amazon Books Reviews Dataset - [link](https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews)

In [3]:
reviews_file_path = '../Data/Books_rating.csv'
books_details_file_path = '../Data/books_data.csv'

# Load the reviews file
reviews_df = pd.read_csv(reviews_file_path)

# Load the Books Details file
books_details_df = pd.read_csv(books_details_file_path)

In [4]:
# Explore the reviews dataset
print("Reviews Dataset:")
print(reviews_df.info())


# Explore the Books Details dataset
print("\nBooks Details Dataset:")
print(books_details_df.info())

Reviews Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000000 entries, 0 to 2999999
Data columns (total 10 columns):
 #   Column              Dtype  
---  ------              -----  
 0   Id                  object 
 1   Title               object 
 2   Price               float64
 3   User_id             object 
 4   profileName         object 
 5   review/helpfulness  object 
 6   review/score        float64
 7   review/time         int64  
 8   review/summary      object 
 9   review/text         object 
dtypes: float64(2), int64(1), object(7)
memory usage: 228.9+ MB
None

Books Details Dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212404 entries, 0 to 212403
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Title          212403 non-null  object 
 1   description    143962 non-null  object 
 2   authors        180991 non-null  object 
 3   image          160329 non-null  object 


In [5]:
# Check for missing values in Reviews dataset
reviews_missing = reviews_df.isnull().sum()

# Check for missing values in Books Details dataset
books_missing = books_details_df.isnull().sum()

# Display missing values
print("Reviews Missing Values:")
print(reviews_missing)

print("\nBooks Details Missing Values:")
print(books_missing)


Reviews Missing Values:
Id                          0
Title                     208
Price                 2518829
User_id                561787
profileName            561905
review/helpfulness          0
review/score                0
review/time                 0
review/summary            407
review/text                 8
dtype: int64

Books Details Missing Values:
Title                 1
description       68442
authors           31413
image             52075
previewLink       23836
publisher         75886
publishedDate     25305
infoLink          23836
categories        41199
ratingsCount     162652
dtype: int64


## Dealing with Missing Values

In [6]:
# Drop rows with missing 'Title' and 'User_id'
reviews_df = reviews_df.dropna(subset=['Title', 'User_id'])

# Drop 'profileName' column, we won't be using it
reviews_df = reviews_df.drop(columns=['profileName'])

# Drop 'Price' column, we won't be using it
reviews_df = reviews_df.drop(columns=['Price'])

# Fill missing values in 'review/summary' and 'review/text' with empty strings
reviews_df['review/summary'] = reviews_df['review/summary'].fillna('')
reviews_df['review/text'] = reviews_df['review/text'].fillna('')

# Display updated information about missing values
reviews_missing_values = reviews_df.isnull().sum()
print("Reviews Missing Values After Handling:")
print(reviews_missing_values)


Reviews Missing Values After Handling:
Id                    0
Title                 0
User_id               0
review/helpfulness    0
review/score          0
review/time           0
review/summary        0
review/text           0
dtype: int64


In [7]:
# Drop rows with missing 'Title'
books_details_df = books_details_df.dropna(subset=['Title'])

# Impute missing values in 'ratingsCount' with the median
books_details_df['ratingsCount'] = books_details_df['ratingsCount'].fillna(books_details_df['ratingsCount'].median())

# Fill missing values in textual columns with empty strings
textual_columns = ['description', 'authors', 'publisher', 'publishedDate', 'categories']
books_details_df[textual_columns] = books_details_df[textual_columns].fillna('')

# Dropping columns we are not going to use
columns_to_drop = ['image', 'previewLink', 'infoLink']
books_details_df = books_details_df.drop(columns=columns_to_drop)

# Display updated information about missing values
books_details_missing_values = books_details_df.isnull().sum()
print("Books Details Missing Values After Handling:")
print(books_details_missing_values)


Books Details Missing Values After Handling:
Title            0
description      0
authors          0
publisher        0
publishedDate    0
categories       0
ratingsCount     0
dtype: int64


## Merging Datasets

In [8]:

#Merge DataFrame or named Series objects with a database-style join.
#here i want to get all user id from left dataframe so, using left join
merged_df = pd.merge(reviews_df, books_details_df, on='Title', how='left')

In [9]:
# Pickle the merged dataframe
with open('merged_dataframe.pkl', 'wb') as f:
    pickle.dump(merged_df, f)

In [10]:
#analyzing merged dataframe
merged_df_missing_values = merged_df.isnull().sum()
print("Merged Dataframe Missing Values:")
print(merged_df_missing_values)

Merged Dataframe Missing Values:
Id                    0
Title                 0
User_id               0
review/helpfulness    0
review/score          0
review/time           0
review/summary        0
review/text           0
description           0
authors               0
publisher             0
publishedDate         0
categories            0
ratingsCount          0
dtype: int64


In [11]:
merged_df.head()

Unnamed: 0,Id,Title,User_id,review/helpfulness,review/score,review/time,review/summary,review/text,description,authors,publisher,publishedDate,categories,ratingsCount
0,1882931173,Its Only Art If Its Well Hung!,AVCGYZL8FQQTD,7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...,,['Julie Strain'],,1996,['Comics & Graphic Novels'],2.0
1,826414346,Dr. Seuss: American Icon,A30TK6U7DNS82R,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...,Philip Nel takes a fascinating look into the k...,['Philip Nel'],A&C Black,2005-01-01,['Biography & Autobiography'],2.0
2,826414346,Dr. Seuss: American Icon,A3UH4UZ4RSVO82,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t...",Philip Nel takes a fascinating look into the k...,['Philip Nel'],A&C Black,2005-01-01,['Biography & Autobiography'],2.0
3,826414346,Dr. Seuss: American Icon,A2MVUWT453QH61,7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D...",Philip Nel takes a fascinating look into the k...,['Philip Nel'],A&C Black,2005-01-01,['Biography & Autobiography'],2.0
4,826414346,Dr. Seuss: American Icon,A22X4XUPKF66MR,3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...,Philip Nel takes a fascinating look into the k...,['Philip Nel'],A&C Black,2005-01-01,['Biography & Autobiography'],2.0


In [12]:
display(merged_df["User_id"].nunique())
display(merged_df["Id"].nunique())
display(merged_df["User_id"].nunique() * merged_df["Id"].nunique())

1008961

216014

217949701454

In [13]:
# Initialize Faker and set the seed for reproducibility
#fake = Faker()
#Faker.seed(0)

# Generate random names using Faker
#num_names = merged_df["User_id"].nunique()
#random_names = [fake.name() for _ in range(num_names)]

# Create a DataFrame with the random names
#unique_user_ids = merged_df['User_id'].unique()

# Create a mapping between user IDs and fake names
#id_to_name_mapping = dict(zip(unique_user_ids, random_names))

#merged_df['Full Name'] = merged_df['User_id'].map(id_to_name_mapping)

# Print the first few rows to check
#print(merged_df.head(5))


NameError: name 'Faker' is not defined

In [14]:
interaction_matrix = merged_df[["Id", "User_id", "review/score"]]
interaction_matrix.shape

(2438018, 3)

In [16]:
train_matrix = interaction_matrix.sample(n=10000, random_state=69)
train_matrix.shape
train_matrix = train_matrix.rename(columns={
    'Id': 'item_id_unenc',
    'User_id': 'user_id_unenc',
    'review/score': 'interaction'
})


In [17]:
from sklearn.preprocessing import LabelEncoder

# Initialize the Label Encoders for user_id and item_id
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

# Fit and transform the user_id and item_id columns with label encoding
train_matrix['user_id'] = user_encoder.fit_transform(train_matrix['user_id_unenc'])
train_matrix['item_id'] = item_encoder.fit_transform(train_matrix['item_id_unenc'])

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(-1, 1))
train_matrix['interaction'] = scaler.fit_transform(train_matrix[['interaction']])

train_matrix.head()

Unnamed: 0,item_id_unenc,user_id_unenc,interaction,user_id,item_id
1197273,1891100599,A8XXX3QWYUZCJ,-1.0,7468,3757
138854,1593553978,A2AAFNQQIFZXT5,-1.0,3092,3548
1592055,684830337,AQ96MYTH8FEIE,-0.5,8749,1375
491462,192853961,AH6G5NF264QSH,1.0,8073,259
2121273,373272839,A16TVRF32P50VD,1.0,431,532


In [18]:
with open('item_encoder.pkl', 'wb') as f:
    pickle.dump(item_encoder, f)

In [19]:
display(train_matrix["item_id_unenc"].nunique())
display(train_matrix["user_id_unenc"].nunique())

7327

9391

In [20]:
import tensorflow.keras as keras
from tensorflow.keras.layers import (
    Concatenate,  # Layer that concatenates a list of inputs
    Dense,        # Regular densely-connected NN layer
    Embedding,    # Turns positive integers (indexes) into dense vectors of fixed size
    Flatten,      # Flattens the input
    Input,        # Used to instantiate a Keras tensor
    Multiply,     # Layer that multiplies two inputs
)
from tensorflow.keras.models import Model  # The Model class from Keras
from tensorflow.keras.regularizers import l2  # Regularizer for L2 regularization
from typing import List  # For type annotations

def create_ncf(
    number_of_users: int,  # Total number of unique users
    number_of_items: int,  # Total number of unique items
    latent_dim_mf: int = 4,  # Dimensionality of the MF (Matrix Factorization) embedding space
    latent_dim_mlp: int = 32,  # Dimensionality of the MLP (Multi-Layer Perceptron) embedding space
    reg_mf: int = 0,  # L2 regularization factor for MF embeddings
    reg_mlp: int = 0.01,  # L2 regularization factor for MLP embeddings
    dense_layers: List[int] = [16, 8],  # List of units in each dense layer in the MLP
    reg_layers: List[int] = [0.01, 0.01],  # L2 regularization factors for each dense layer in the MLP
    activation_dense: str = "relu",  # Activation function for dense layers in the MLP
) -> keras.Model:

    # Input layers for user and item IDs
    user = Input(shape=(), dtype="int32", name="user_id")
    item = Input(shape=(), dtype="int32", name="item_id")

    # Embedding layers for MF (Matrix Factorization)
    mf_user_embedding = Embedding(
        input_dim=number_of_users,  # Number of unique users
        output_dim=latent_dim_mf,   # Dimensionality of user embeddings
        name="mf_user_embedding",
        embeddings_initializer="RandomNormal",  # Initialize embeddings with a random normal distribution
        embeddings_regularizer=l2(reg_mf),  # Apply L2 regularization
        input_length=1,  # Length of input sequences (1 for single user ID)
    )
    mf_item_embedding = Embedding(
        input_dim=number_of_items,  # Number of unique items
        output_dim=latent_dim_mf,   # Dimensionality of item embeddings
        name="mf_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mf),
        input_length=1,  # Length of input sequences (1 for single item ID)
    )

    # Embedding layers for MLP (Multi-Layer Perceptron)
    mlp_user_embedding = Embedding(
        input_dim=number_of_users,  # Number of unique users
        output_dim=latent_dim_mlp,  # Dimensionality of user embeddings
        name="mlp_user_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),  # Apply L2 regularization
        input_length=1,  # Length of input sequences (1 for single user ID)
    )
    mlp_item_embedding = Embedding(
        input_dim=number_of_items,  # Number of unique items
        output_dim=latent_dim_mlp,  # Dimensionality of item embeddings
        name="mlp_item_embedding",
        embeddings_initializer="RandomNormal",
        embeddings_regularizer=l2(reg_mlp),
        input_length=1,  # Length of input sequences (1 for single item ID)
    )

    # MF (Matrix Factorization) latent vectors
    mf_user_latent = Flatten()(mf_user_embedding(user))  # Flatten user embedding to create latent vector
    mf_item_latent = Flatten()(mf_item_embedding(item))  # Flatten item embedding to create latent vector
    mf_cat_latent = Multiply()([mf_user_latent, mf_item_latent])  # Element-wise multiplication of user and item vectors

    # MLP (Multi-Layer Perceptron) latent vectors
    mlp_user_latent = Flatten()(mlp_user_embedding(user))  # Flatten user embedding to create latent vector
    mlp_item_latent = Flatten()(mlp_item_embedding(item))  # Flatten item embedding to create latent vector
    mlp_cat_latent = Concatenate()([mlp_user_latent, mlp_item_latent])  # Concatenate user and item vectors

    mlp_vector = mlp_cat_latent  # Initial input to MLP

    # Build dense layers for the MLP
    for i in range(len(dense_layers)):
        layer = Dense(
            dense_layers[i],  # Number of units in this layer
            activity_regularizer=l2(reg_layers[i]),  # Apply L2 regularization
            activation=activation_dense,  # Activation function
            name="layer%d" % i,  # Name of the layer
        )
        mlp_vector = layer(mlp_vector)  # Apply the layer to the MLP vector

    # Concatenate MF and MLP vectors
    predict_layer = Concatenate()([mf_cat_latent, mlp_vector])

    # Output layer
    result = Dense(
        1,  # Single output unit
        activation="sigmoid",  # Sigmoid activation for binary classification
        kernel_initializer="lecun_uniform",  # Initializer for kernel weights
        name="interaction"  # Name of the output layer
    )

    output = result(predict_layer)  # Apply output layer to the concatenated vector

    # Define the model with user and item inputs and the output
    model = Model(
        inputs=[user, item],  # Model inputs
        outputs=[output],     # Model output
    )

    return model  # Return the constructed model


In [21]:
from tensorflow.keras.optimizers import Adam

n_users = train_matrix["user_id"].nunique()
n_items = train_matrix["item_id"].nunique()
print(f"n_users {n_users}, n_items {n_items}")


n_users 9391, n_items 7327


In [22]:
ncf_model = create_ncf(n_users, n_items)

ncf_model.compile(
    optimizer=Adam(),
    loss="binary_crossentropy",
    metrics=[
        tf.keras.metrics.TruePositives(name="tp"),
        tf.keras.metrics.FalsePositives(name="fp"),
        tf.keras.metrics.TrueNegatives(name="tn"),
        tf.keras.metrics.FalseNegatives(name="fn"),
        tf.keras.metrics.BinaryAccuracy(name="accuracy"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
        tf.keras.metrics.AUC(name="auc"),
    ],
)
ncf_model._name = "neural_collaborative_filtering"
ncf_model.summary()

Metal device set to: Apple M2 Pro


2024-05-21 21:24:22.866189: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-05-21 21:24:22.866438: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "neural_collaborative_filtering"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_id (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 item_id (InputLayer)           [(None,)]            0           []                               
                                                                                                  
 mlp_user_embedding (Embedding)  (None, 32)          300512      ['user_id[0][0]']                
                                                                                                  
 mlp_item_embedding (Embedding)  (None, 32)          234464      ['item_id[0][0]']                
                                                                     

In [23]:
def make_tf_dataset(
    df: pd.DataFrame,
    targets: List[str],
    val_split: float = 0.1,
    batch_size: int = 512,
    seed=42,
):
    """Make TensorFlow dataset from Pandas DataFrame.
    :param df: input DataFrame - only contains features and target(s)
    :param targets: list of columns names corresponding to targets
    :param val_split: fraction of the data that should be used for validation
    :param batch_size: batch size for training
    :param seed: random seed for shuffling data - `None` won't shuffle the data"""

    n_val = round(df.shape[0] * val_split)
    if seed:
        # shuffle all the rows
        x = df.sample(frac=1, random_state=seed).to_dict("series")
    else:
        x = df.to_dict("series")
    y = dict()
    for t in targets:
        y[t] = x.pop(t)
    ds = tf.data.Dataset.from_tensor_slices((x, y))

    ds_val = ds.take(n_val).batch(batch_size)
    ds_train = ds.skip(n_val).batch(batch_size)
    return ds_train, ds_val

In [24]:
def make_tf_dataset(
    df: pd.DataFrame, 
    targets: List[str], 
    val_split: float = 0.1, 
    batch_size: int = 512, 
    seed=42
):
    """
    Make TensorFlow dataset from Pandas DataFrame.

    :param df: input DataFrame - only contains features and target(s)
    :param targets: list of columns names corresponding to targets
    :param val_split: fraction of the data that should be used for validation
    :param batch_size: batch size for training
    :param seed: random seed for shuffling data - `None` won't shuffle the data
    """
    
    # Check and ensure data types
    print(df.dtypes)
    df = df.apply(pd.to_numeric, errors='coerce')
    if df.isnull().values.any():
        raise ValueError("DataFrame contains NaN values, please handle them before creating TensorFlow dataset.")
    
    # Compute validation split size
    n_val = round(df.shape[0] * val_split)
    
    # Shuffle and convert data to dictionary of series
    if seed:
        x = df.sample(frac=1, random_state=seed).to_dict("series")
    else:
        x = df.to_dict("series")

    # Separate features and target(s)
    y = {t: x.pop(t) for t in targets}
    
    # Convert to TensorFlow dataset and enforce data types
    def to_tf_tensor(x_dict, y_dict):
        x_tensors = {k: tf.convert_to_tensor(v, dtype=tf.float32 if v.dtype == float else tf.int32) for k, v in x_dict.items()}
        y_tensors = {k: tf.convert_to_tensor(v, dtype=tf.float32) for k, v in y_dict.items()}
        return x_tensors, y_tensors
    
        # Convert to TensorFlow dataset
    ds_raw = tf.data.Dataset.from_tensor_slices((dict(x), dict(y)))
    
    # Ensure correct data types
    def enforce_data_types(x_dict, y_dict):
        x_tensors = {k: tf.cast(v, tf.int32) if 'user_id' in k or 'item_id' in k else tf.cast(v, tf.float32) for k, v in x_dict.items()}
        y_tensors = {k: tf.cast(v, tf.float32) for k, v in y_dict.items()}
        return x_tensors, y_tensors
    
    ds_raw = ds_raw.map(enforce_data_types)
    ds_raw = ds_raw.map(lambda x_dict, y_dict: to_tf_tensor(x_dict, y_dict))
    
    # Create training and validation datasets
    ds_val = ds_raw.take(n_val).batch(batch_size)
    ds_train = ds_raw.skip(n_val).batch(batch_size)
    
    return ds_train, ds_val

In [25]:
X = train_matrix[["item_id","user_id"]]
y = train_matrix["interaction"]

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)


In [27]:
# create train and validation datasets

ds_train, ds_val = make_tf_dataset(train_matrix.drop(columns=['item_id_unenc', 'user_id_unenc']), targets=['interaction'], val_split=0.2, batch_size=2)


interaction    float64
user_id          int64
item_id          int64
dtype: object


In [28]:
TOP_K = 5
N_EPOCHS = 10

train_hist = ncf_model.fit(
    ds_train,
    validation_data=ds_val,
    epochs=N_EPOCHS,
    verbose=1,
)



Epoch 1/10


2024-05-21 21:25:12.865822: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2024-05-21 21:25:13.492793: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
loc("mps_select"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/4e1473ee-9f66-11ee-8daf-cedaeb4cabe2/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":294:0)): error: 'anec.gain_offset_control' op result #0 must be 4D/5D memref of 16-bit float or 8-bit signed integer or 8-bit unsigned integer values, but got 'memref<1x2x1x1xi1>'
loc("mps_select"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/4e1473ee-9f66-11ee-8daf-cedaeb4cabe2/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":294:0)): error: 'anec.gain_offset_control' op result #0 must be 



2024-05-21 21:26:39.802838: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
loc("mps_select"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/4e1473ee-9f66-11ee-8daf-cedaeb4cabe2/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":294:0)): error: 'anec.gain_offset_control' op result #0 must be 4D/5D memref of 16-bit float or 8-bit signed integer or 8-bit unsigned integer values, but got 'memref<1x2x1x1xi1>'
loc("mps_select"("(mpsFileLoc): /AppleInternal/Library/BuildRoots/4e1473ee-9f66-11ee-8daf-cedaeb4cabe2/Library/Caches/com.apple.xbs/Sources/MetalPerformanceShadersGraph/mpsgraph/MetalPerformanceShadersGraph/Core/Files/MPSGraphUtilities.mm":294:0)): error: 'anec.gain_offset_control' op result #0 must be 4D/5D memref of 16-bit float or 8-bit signed integer or 8-bit unsigned integer values, but got 'memref<1x2x1x1xi1>'


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [165]:
all_items = train_matrix['item_id'].unique()

def get_top_n_recommendations(model, user_id, item_id, all_items, top_n=10):
    # Generate an array where the user_id and item_id are repeated for each item
    user_array = np.array([user_id] * len(all_items))
    items_array = np.array(all_items)   

    # Predict the interaction scores for these user-item pairs
    predictions = model.predict([user_array, items_array], batch_size=128, verbose=1)
   
    # Flatten the predictions to get a 1D array
    predictions = predictions.flatten()

    # Get top `n` items with highest prediction scores, except the input item_id
    sorted_indices = predictions.argsort()[::-1]
    top_n_items = [all_items[i] for i in sorted_indices if all_items[i] != item_id][:top_n]
    return top_n_items

test = get_top_n_recommendations(ncf_model, 7468, 3757, all_items)
display(test)



[5597, 5162, 4577, 3184, 6686, 4176, 7177, 1616, 3068, 2393]

In [180]:
def get_titles_from_ids(id_arr):
    original_book_ids = item_encoder.inverse_transform(id_arr)
    book_titles = merged_df[merged_df['Id'].isin(original_book_ids)]['Title'].unique()
    return book_titles

In [182]:
display(get_titles_from_ids([3757]))
display(get_titles_from_ids(test))

array(["It's Potty Time for Girls (It's Time to... Board Book Series)"],
      dtype=object)

array(['Dune',
       'The Bureau and the Mole: The Unmasking of Robert Philip Hanssen, the Most Dangerous Double Agent in FBI History',
       'The Hobbit', 'Jonathan Strange & Mr. Norrell', 'The Giver',
       'The Catcher in the Rye [Audiobook] [Cd] [Unabridged] (Audio CD)',
       'Pride and Prejudice',
       'A Christmas Carol (Enriched Classics (Pocket))',
       'Blink: The Power of Thinking Without Thinking',
       'Rich Dad, Poor Dad'], dtype=object)



['Dune'
 'The Bureau and the Mole: The Unmasking of Robert Philip Hanssen, the Most Dangerous Double Agent in FBI History'
 'The Hobbit' 'Jonathan Strange & Mr. Norrell' 'The Giver'
 'The Catcher in the Rye [Audiobook] [Cd] [Unabridged] (Audio CD)'
 'Pride and Prejudice' 'A Christmas Carol (Enriched Classics (Pocket))'
 'Blink: The Power of Thinking Without Thinking' 'Rich Dad, Poor Dad']

In [157]:
first_batch = ds_train.take(1)
first_item = next(iter(first_batch))  # Get the first batch from the iterator
first_item_value = first_item[0]["user_id"]

print("First item from the BatchDataset:", first_item_value)

First item from the BatchDataset: tf.Tensor([1395 1751], shape=(2,), dtype=int32)
