# LightGCN Book-Crossing implementation

In [54]:
import os
from os.path import join as jp

import pandas as pd
import torch
import tqdm
from sklearn.model_selection import train_test_split
from torch import optim

from model import LightGCN
from utils import bpr_loss

In [1]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Display the installed PyTorch version so the run environment is recorded.
torch.__version__

'1.13.1'

## Downloading the data

- From Kaggle: https://www.kaggle.com/datasets/somnambwl/bookcrossing-dataset/
- Destination path: /Users/davidamat/Documents/david/learning/graph/data

In [2]:
# Directory holding the Book-Crossing CSV files (Ratings.csv, Users.csv, Books.csv).
# Set the BOOK_CROSSING_DIR environment variable to point at your own copy of the
# dataset; when it is unset, the original local download location is used.
path_data = os.environ.get(
    "BOOK_CROSSING_DIR",
    "/Users/davidamat/Documents/david/learning/graph/data/book-crossing/",
)

In [3]:
# Resolve the location of each table inside the dataset directory.
path_ratings, path_users, path_books = (
    jp(path_data, name) for name in ('Ratings.csv', 'Users.csv', 'Books.csv')
)

# All three files are ';'-separated and latin-1 encoded.
ratings = pd.read_csv(
    path_ratings,
    encoding='latin-1',
    sep=';',
)
users = pd.read_csv(
    path_users,
    encoding='latin-1',
    sep=';',
)
# on_bad_lines="skip" drops rows the parser cannot split instead of raising.
books = pd.read_csv(
    path_books,
    encoding='latin-1',
    sep=';',
    on_bad_lines="skip",
)

  users = pd.read_csv(path_users, sep=';', encoding='latin-1')


## Preprocessing the Book-Crossing dataset


In [4]:
# Unique identifiers present in the books / users master tables
books_ids = books['ISBN'].unique()
user_ids = users['User-ID'].unique()

# Work on a copy so the raw ratings frame stays untouched
df = ratings.copy()

# Boolean masks: the rating refers to a known book / a known user
mask_books_ids = df['ISBN'].isin(books_ids)
mask_users_ids = df['User-ID'].isin(user_ids)
df = df.loc[mask_books_ids & mask_users_ids]

# Keep the first 100k interactions rated 8 or higher
# (rows stay in file order; they are NOT sorted by rating)
df = df.loc[df['Rating'] >= 8].head(100000)

# Map raw identifiers to contiguous 0-based indices, in order of first appearance
user_mapping = {raw_id: idx for idx, raw_id in enumerate(df['User-ID'].unique())}
item_mapping = {raw_isbn: idx for idx, raw_isbn in enumerate(df['ISBN'].unique())}

# Number of distinct users / items that survived the filtering
num_users, num_items = len(user_mapping), len(item_mapping)
num_total = num_users + num_items

# Copy of the filtered ratings with the internal indices attached as extra columns
df_ids = df.assign(
    u_id=df["User-ID"].map(user_mapping),
    b_id=df["ISBN"].map(item_mapping),
)

print("Users:", num_users)
print("Items:", num_items)
print("Total Users and Items:", num_total)

Users: 19557
Items: 56913
Total Users and Items: 76470


In [5]:
# Preview the first filtered ratings (columns: User-ID, ISBN, Rating).
df.head()

Unnamed: 0,User-ID,ISBN,Rating
9586,12,1879384493,10
9591,16,345402871,9
9607,26,446310786,10
9608,26,449005615,9
9609,32,60168013,8


## Edge Indices

In [24]:
# Build the user-item edge list from the filtered ratings.

# 1) Internal index of the user behind every rating row
user_ids = torch.tensor(
    [user_mapping[raw_user] for raw_user in df['User-ID']],
    dtype=torch.long,
)

# 2) Internal index of the item behind every rating row
item_ids = torch.tensor(
    [item_mapping[raw_isbn] for raw_isbn in df['ISBN']],
    dtype=torch.long,
)

# Distinct users / items actually present in the edge list
num_users = user_ids.unique().numel()
num_items = item_ids.unique().numel()
num_nodes = num_users + num_items

# 3) One column per rating: row 0 holds the user index, row 1 the item index
edge_index = torch.stack([user_ids, item_ids], dim=0)

print("Num users:", num_users)
print("Num items:", num_items)
print("Num nodes:", num_nodes)


Num users: 19557
Num items: 56913
Num nodes: 76470


## Splitting dataset

In [12]:
# Split the rating positions into train / validation / test index lists.
# Step 1: hold out 20% of the positions; the remaining 80% is the training set.
train_index, test_index = train_test_split(
    range(len(df)),
    random_state=0,
    test_size=0.2,
)
# Step 2: cut the held-out positions in half -> validation and test.
val_index, test_index = train_test_split(
    test_index,
    random_state=0,
    test_size=0.5,
)

In [13]:
# Show the 20 smallest positions assigned to each split as a quick sanity check.
for label, positions in (
    ("Train:", train_index),
    ("Test:", test_index),
    ("Valid:", val_index),
):
    print(label, sorted(positions)[:20])

Train: [0, 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 15, 16, 18, 19, 20, 21, 22, 24, 27]
Test: [3, 14, 17, 23, 34, 48, 52, 56, 60, 63, 65, 68, 79, 109, 110, 117, 157, 165, 187, 196]
Valid: [6, 9, 25, 26, 36, 41, 51, 54, 69, 72, 90, 105, 119, 121, 125, 128, 133, 151, 156, 166]


In [25]:
# Select the edge columns that belong to each split
train_edge_index, val_edge_index, test_edge_index = (
    edge_index[:, positions]
    for positions in (train_index, val_index, test_index)
)

# One unit weight per edge, with the same dtype and device as the edge tensor
train_edge_values, valid_edge_values, test_edge_values = (
    torch.ones_like(edges[0])
    for edges in (train_edge_index, val_edge_index, test_edge_index)
)

## LightGCN

In [52]:
# Evaluation cut-offs
K = 20
K_LIST = [1, 5, 10, 15]

# Regularisation weight passed to the BPR loss
LAMBDA = 1e-6

# Training schedule and model size
BATCH_SIZE = 32
NUM_LAYERS = 4
DIM_EMBEDDING = 64
EPOCHS = 31

# Derived value: full batches that fit in the training split (remainder dropped)
n_batch = len(train_index) // BATCH_SIZE
print("Number of batches per epoch:", n_batch)

Number of batches per epoch: 2500


In [30]:
%%time
# Instantiate LightGCN with the training edges (used for propagation) and the
# validation edges (handed over for later metric computation).
# NOTE(review): construction took ~3 minutes in the recorded run; presumably the
# constructor builds the normalised adjacency up front -- confirm in model.py.
model = LightGCN(
    num_users=num_users, 
    num_items=num_items, 
    edge_index=train_edge_index,
    edge_values=train_edge_values,
    edge_index_val=val_edge_index,
    edge_values_val=valid_edge_values,
    num_layers=NUM_LAYERS,  # number of propagation layers
    dim_h=DIM_EMBEDDING  # embedding dimension
)

  d_inv = np.power(rowsum, -0.5).flatten()


CPU times: user 2min 53s, sys: 21.9 s, total: 3min 15s
Wall time: 3min 17s


## To Device

In [44]:
# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model and the edge tensors onto the selected device.
# NOTE(review): the model was built from the CPU edge tensors earlier; whether
# model.to(device) also relocates its internal copies depends on model.py.
model = model.to(device)
edge_index, train_edge_index, val_edge_index = (
    tensor.to(device)
    for tensor in (edge_index, train_edge_index, val_edge_index)
)

# Adam over every trainable parameter of the model
optimizer = optim.Adam(params=model.parameters(), lr=0.001)

## Training Loop

In [58]:
# Validation metrics collected during training (one entry per evaluated epoch)
l_metrics = []

# Cut-off whose precision/recall is printed while training; must be one of K_LIST
k_print = 10
idx_k = K_LIST.index(k_print)

for epoch in tqdm.tqdm(range(EPOCHS)):
    model.train()

    for _ in tqdm.tqdm(range(n_batch)):
        # Reset gradients from the previous step: PyTorch accumulates gradients
        # across backward() calls, so without this every update would use the
        # sum of all gradients seen so far instead of the current batch's.
        optimizer.zero_grad()

        # Forward pass: final (propagated) and initial (layer-0) embeddings
        embf_users, emb0_users, embf_items, emb0_items = model.forward()

        # Getting sample indices (user, positive item, negative item)
        user_indices, pos_item_indices, neg_item_indices = model.sample_mini_batch()

        # Applying sample indices
        s_embf_users, s_emb0_users = embf_users[user_indices], emb0_users[user_indices]
        s_embf_items_pos, s_emb0_items_pos = embf_items[pos_item_indices], emb0_items[pos_item_indices]
        s_embf_items_neg, s_emb0_items_neg = embf_items[neg_item_indices], emb0_items[neg_item_indices]

        # Loss computation (BPR loss with LAMBDA-weighted regularisation)
        train_loss = bpr_loss(
            s_embf_users, s_emb0_users,
            s_embf_items_pos, s_emb0_items_pos,
            s_embf_items_neg, s_emb0_items_neg,
            LAMBDA=LAMBDA
        )

        # Backward pass and parameter update
        train_loss.backward()
        optimizer.step()

    # Evaluate on the validation edges every 5th epoch (0, 5, 10, ...)
    if epoch % 5 == 0:

        # Precision and recall on validation (generate all items recs)
        l_epoch_metrics = model.get_val_metrics(
            epoch=epoch,
            topk_recs=model.num_items,
            k_list=K_LIST
        )
        l_metrics.append(l_epoch_metrics)

        # Pick the metrics row that corresponds to K == k_print.
        # NOTE(review): assumes the last two fields of each row are precision and
        # recall, in that order -- confirm against model.get_val_metrics.
        prec = l_epoch_metrics[idx_k][-2]
        rec = l_epoch_metrics[idx_k][-1]

        print(f"Epoch - {epoch}", f"Precision@{k_print} - {prec}", f"Recall@{k_print} - {rec}")
        

  0%|                                                                                                                                                    | 0/31 [00:00<?, ?it/s]
  0%|                                                                                                                                                  | 0/2500 [00:00<?, ?it/s][A
  0%|                                                                                                                                          | 1/2500 [00:00<09:41,  4.30it/s][A
  0%|                                                                                                                                          | 2/2500 [00:00<09:53,  4.21it/s][A
  0%|▏                                                                                                                                         | 3/2500 [00:00<10:17,  4.04it/s][A
  0%|▏                                                                                                 

KeyboardInterrupt: 

In [59]:
# Precision and recall on validation (generate all items recs).
# Uses the `epoch` value left over from the training loop.
l_epoch_metrics = model.get_val_metrics(
    epoch=epoch,
    topk_recs=model.num_items,
    k_list=K_LIST
)
l_metrics.append(l_epoch_metrics)


# Pick the metrics row that corresponds to K == k_print (k_print must be in K_LIST).
# list.index replaces the numpy lookup, so this cell no longer needs `np`,
# which the notebook never imports.
k_print = 10
idx_k = K_LIST.index(k_print)
# NOTE(review): assumes the last two fields of each row are precision and
# recall, in that order -- confirm against model.get_val_metrics.
prec = l_epoch_metrics[idx_k][-2]
rec = l_epoch_metrics[idx_k][-1]

print(f"Epoch - {epoch}", f"Precision@{k_print} - {prec}", f"Recall@{k_print} - {rec}")

KeyboardInterrupt: 

In [None]:
#df_metrics_epoch = pd.DataFrame(l_metrics, columns=["epoch", "K", "TP", "FP", "P", "precision", "recall", "ndcg"])

In [51]:
# Display the number of items the model holds (passed as topk_recs during validation).
model.num_items

56913