In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

ratings_df = pd.read_csv('../../data/Ratings_Time.csv', delimiter=';', dtype={'User-ID': np.int32, 'ISBN': str, 'Rating': np.int8, 'timestamp': str})
print(ratings_df.head())

ratings_df = ratings_df.sort_values(by=['User-ID', 'timestamp'])

# Encode User-IDs and ISBNs to numerical values
user_encoder = LabelEncoder()
book_encoder = LabelEncoder()

ratings_df['User-ID'] = user_encoder.fit_transform(ratings_df['User-ID'])
ratings_df['ISBN'] = book_encoder.fit_transform(ratings_df['ISBN'])

# Create sequences of interactions per user
user_grouped = ratings_df.groupby('User-ID').apply(lambda x: x.sort_values(by='timestamp'))

# Generate input sequences and labels
sequences = []
next_books = []

for user_id, user_data in user_grouped.groupby(level=0):
    user_books = user_data['ISBN'].tolist()
    for i in range(1, len(user_books)):
        sequences.append(user_books[:i])
        next_books.append(user_books[i])

# Padding sequences to the same length
max_sequence_len = max([len(seq) for seq in sequences])
sequences_padded = pad_sequences(sequences, maxlen=max_sequence_len)

# Convert labels to categorical
next_books = np.array(next_books)
num_books = len(book_encoder.classes_)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(sequences_padded, next_books, test_size=0.2, random_state=42)

# Define the model
embedding_dim = 50

input_layer = Input(shape=(max_sequence_len,))
embedding_layer = Embedding(input_dim=num_books, output_dim=embedding_dim, input_length=max_sequence_len)(input_layer)
lstm_layer = LSTM(128, return_sequences=False)(embedding_layer)
output_layer = Dense(num_books, activation='softmax')(lstm_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64)

# Predict the next book for a given user sequence
def predict_next_book(user_sequence):
    user_sequence_padded = pad_sequences([user_sequence], maxlen=max_sequence_len)
    prediction = model.predict(user_sequence_padded)
    next_ISBN = np.argmax(prediction)
    next_book = book_encoder.inverse_transform([next_ISBN])
    return next_book[0]

# Example of predicting the next book for a given sequence
example_sequence = [book_encoder.transform(['ISBN1', 'ISBN2', 'ISBN3'])]
predicted_book = predict_next_book(example_sequence)
print(f"Predicted next book: {predicted_book}")

2024-08-10 00:10:39.166073: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


   User-ID        ISBN  Rating            timestamp
0   276725  034545104X       0  2007-05-26 07:29:30
1   276726  0155061224       5  2002-07-09 02:39:20
2   276727  0446520802       0  2006-09-28 23:13:25
3   276729  052165615X       3  2003-10-03 13:12:44
4   276729  0521795028       6  2010-04-29 18:50:50


  user_grouped = ratings_df.groupby('User-ID').apply(lambda x: x.sort_values(by='timestamp'))


: 