# Sequential recommender
The idea of this task is to use a sequential model for recommending. For this task we only use the order of items users recommended. Order is defined by the augmented timestamps was also tested

In [5]:
import numpy as np
import pandas as pd

ratings_df = pd.read_csv('../../data/Ratings_merged_emb_time.csv')
print("Total ratings:", ratings_df.shape[0])

ratings_df = ratings_df.dropna()
ratings_df = ratings_df.drop_duplicates()
print("Total ratings (dropna, duplicates):", ratings_df.shape[0])

ratings_df = ratings_df[ratings_df['label'] != 0]
print("Total ratings (drop zeroes):", ratings_df.shape[0])

ratings_df = ratings_df.sort_values(by=['user', 'time'])

rating_count=pd.DataFrame(ratings_df["user"].value_counts())
u_threshold=rating_count[rating_count["count"]<5].index
ratings_df=ratings_df[~ratings_df["user"].isin(u_threshold)]

print('Total ratings (users rated <5 books excluded):', ratings_df.shape[0])

Total ratings: 10244
Total ratings (dropna, duplicates): 10244
Total ratings (drop zeroes): 10244
Total ratings (users rated <5 books excluded): 9509


In [6]:
ratings_df.head()

Unnamed: 0,user,item,label,Age,pca_dim_1,pca_dim_2,pca_dim_3,pca_dim_4,pca_dim_5,pca_dim_6,pca_dim_7,pca_dim_8,pca_dim_9,pca_dim_10,Year,time
1565,1435.0,394742591,7.0,36.0,0.034801,0.056853,0.057859,-0.075493,-0.051257,-0.101462,-0.039232,0.108641,0.065861,0.026866,1980.0,1085904000.0
1568,1435.0,802713815,5.0,36.0,0.1244,0.048072,-0.067927,-0.057565,-0.017031,-0.043261,-0.068278,0.005158,-0.013976,-0.007283,2001.0,1136215000.0
1566,1435.0,618127453,9.0,36.0,-0.020441,0.135455,0.150682,-0.013349,-0.049744,0.002055,-0.056898,0.036038,0.010957,-0.058842,2001.0,1289050000.0
1564,1435.0,60977477,5.0,36.0,0.161237,-0.018448,-0.002502,0.02007,0.078334,-0.037889,-0.037328,-0.101203,-0.066508,0.112463,1999.0,1302138000.0
1567,1435.0,812590236,4.0,36.0,0.027918,-0.035166,0.072684,0.113108,-0.078089,-0.05903,-0.003526,0.100786,0.102507,-0.052539,2000.0,1541256000.0


In [7]:
from sklearn.preprocessing import LabelEncoder

book_encoder = LabelEncoder()
ratings_df['item'] = book_encoder.fit_transform(ratings_df['item'])

In [8]:
from sklearn.model_selection import train_test_split
train_df, eval_df = train_test_split(ratings_df, test_size=0.2, random_state=0)

In [9]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
user_grouped = train_df.groupby('user').apply(lambda x: x.sort_values(by='time'))

sequences = []
next_books = []

for user_id, user_data in user_grouped.groupby(level=0):
    user_books = user_data['item'].tolist()
    for i in range(1, len(user_books)):
        sequences.append(user_books[:i])
        next_books.append(user_books[i])

print(sequences[:10])

2024-08-11 23:02:02.428840: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


[[2593], [2593, 6256], [2593, 6256, 4815], [2593, 6256, 4815, 310], [1712], [1712, 5135], [1712, 5135, 4077], [1712, 5135, 4077, 3367], [1712, 5135, 4077, 3367, 1793], [1712, 5135, 4077, 3367, 1793, 2053]]


  user_grouped = train_df.groupby('user').apply(lambda x: x.sort_values(by='time'))


In [10]:
# Padding sequences to the same length
max_sequence_len = max([len(seq) for seq in sequences])
sequences_padded = pad_sequences(sequences, maxlen=max_sequence_len)

# Convert labels to categorical
next_books = np.array(next_books)
num_books = len(book_encoder.classes_)

# Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(sequences_padded, next_books, test_size=0.2, random_state=42)

# Define the model
embedding_dim = 50

input_layer = Input(shape=(max_sequence_len,))
embedding_layer = Embedding(input_dim=num_books, output_dim=embedding_dim, input_length=max_sequence_len)(input_layer)
lstm_layer = LSTM(128, return_sequences=False)(embedding_layer)
output_layer = Dense(num_books, activation='softmax')(lstm_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10, batch_size=64)

2024-08-11 23:02:06.422764: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fa4053d5c70>

In [17]:
from sklearn.metrics import precision_score, recall_score

top_k = 10
eval_df = eval_df.sort_values(by=['user', 'time'])

# Create sequences of interactions per user
user_grouped = eval_df.groupby('user').apply(lambda x: x.sort_values(by='time'))

# Generate input sequences and labels for evaluation
eval_sequences = []
true_labels = []

for user_id, user_data in user_grouped.groupby(level=0):
    user_books = user_data['item'].tolist()
    for i in range(1, len(user_books)):
        eval_sequences.append(user_books[:i])
        true_labels.append(user_books[i])

# Padding sequences to the same length
eval_sequences_padded = pad_sequences(eval_sequences, maxlen=max_sequence_len)

# Predict the next book
predictions = model.predict(eval_sequences_padded, verbose=1)

# Calculate precision, recall, and NDCG
precisions = []
recalls = []
ndcgs = []

for i in range(len(predictions)):
    top_k_predictions = np.argsort(predictions[i])[-top_k:][::-1]  # Get top k predictions
    true_label = true_labels[i]

    # Precision@k: Did the model recommend the true book within the top k?
    precision = 1 if true_label in top_k_predictions else 0
    precisions.append(precision)

    # Recall@k: Since there's only one true book, recall is the same as precision here.
    recalls.append(precision)

    # NDCG@k: Evaluates the ranking of the true book in the top k predictions
    rank = np.where(top_k_predictions == true_label)[0][0] + 1 if true_label in top_k_predictions else top_k + 1
    dcg = 1 / np.log2(rank + 1)
    idcg = 1.0  # Ideal DCG is 1 when the true book is ranked first
    ndcg = dcg / idcg
    ndcgs.append(ndcg)

# Calculate average metrics
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
avg_ndcg = np.mean(ndcgs)

print(f"Precision@{top_k}: {avg_precision:.4f}")
print(f"Recall@{top_k}: {avg_recall:.4f}")
print(f"NDCG@{top_k}: {avg_ndcg:.4f}")


  user_grouped = eval_df.groupby('user').apply(lambda x: x.sort_values(by='time'))


Precision@10: 0.0044
Recall@10: 0.0044
NDCG@10: 0.2802
