In [1]:
import torch
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
import pandas as pd
from os.path import join as jp
import os
import matplotlib.pyplot as plt 
import seaborn as sns 
import matplotlib

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
import numpy as np
from sklearn.model_selection import train_test_split

import torch
import torch.nn.functional as F
from torch import nn, optim, Tensor

from torch_geometric.utils import structured_negative_sampling
from torch_geometric.nn.conv.gcn_conv import gcn_norm
from torch_geometric.nn import LGConv
import scipy.sparse as sp

In [4]:
import warnings
warnings.filterwarnings("ignore")

# Downloading the data

- From Kaggle: https://www.kaggle.com/datasets/somnambwl/bookcrossing-dataset/
- Destination path: /Users/davidamat/Documents/david/learning/graph/data

In [5]:
path_data = "/Users/davidamat/Documents/david/learning/graph/data/book-crossing/"

In [6]:
path_ratings = jp(path_data, 'Ratings.csv')
path_users = jp(path_data, 'Users.csv')
path_books = jp(path_data, 'Books.csv')


ratings = pd.read_csv(path_ratings, sep=';', encoding='latin-1')
users = pd.read_csv(path_users, sep=';', encoding='latin-1')
books = pd.read_csv(path_books, sep=';', encoding='latin-1', on_bad_lines="skip")

# Preprocessing the Book-Crossing dataset


In [8]:
# Identifiers
books_ids = books['ISBN'].unique()
user_ids = users['User-ID'].unique()

# Ratings as df
df = ratings.copy()

# Mask only ratings of books and users that appear on the master tables of each one
mask_books_ids = df['ISBN'].isin(books_ids)
mask_users_ids = df['User-ID'].isin(user_ids)
df = df.loc[mask_books_ids & mask_users_ids]

# Keep the 100k highest ratings
df = df[df['Rating'] >= 8].iloc[:100000]

In [9]:
# Create mappings
user_mapping = {userid: i for i, userid in enumerate(df['User-ID'].unique())}
item_mapping = {isbn: i for i, isbn in enumerate(df['ISBN'].unique())}

# Count users and items
num_users = len(user_mapping)
num_items = len(item_mapping)
num_total = num_users + num_items

In [10]:
# Construct the IDS columns
df_ids = df.copy()
df_ids["u_id"] = df_ids["User-ID"].map(user_mapping)
df_ids["b_id"] = df_ids["ISBN"].map(item_mapping)

In [11]:
print("Users", num_users)
print("Items", num_items)
print("Total Users and Items", num_total)

Users 19557
Items 56913
Total Users and Items 76470


In [12]:
df

Unnamed: 0,User-ID,ISBN,Rating
9586,12,1879384493,10
9591,16,0345402871,9
9607,26,0446310786,10
9608,26,0449005615,9
9609,32,0060168013,8
...,...,...,...
480298,114601,0743419049,10
480303,114604,0618002219,9
480314,114611,0445409134,8
480315,114611,0446313033,8


## Edge Indices

In [13]:
# Build the adjacency matrix based on user ratings:

# 1) Take the column of users and convert their ID into the internal ID
user_ids = torch.LongTensor([user_mapping[i] for i in df['User-ID']])

# 2) Take the column of items and convert their ID into the internal ID
item_ids = torch.LongTensor([item_mapping[i] for i in df['ISBN']])

# 3) Create the edge tensor as the relationship between 1) and 2) (they come from ratings matrix)
edge_index = torch.stack((user_ids, item_ids))

In [16]:
user_ids

tensor([    0,     1,     2,  ..., 19556, 19556, 19556])

In [14]:
edge_index

tensor([[    0,     1,     2,  ..., 19556, 19556, 19556],
        [    0,     1,     2,  ..., 56910, 56911, 56912]])

# Splitting dataset

In [17]:
# Create training, validation, and test adjacency matrices
train_index, test_index = train_test_split(range(len(df)), test_size=0.2, random_state=0)
val_index, test_index = train_test_split(test_index, test_size=0.5, random_state=0)

In [18]:
print("Train:", sorted(train_index)[:20])
print("Test:", sorted(test_index)[:20])
print("Valid:", sorted(val_index)[:20])

Train: [0, 1, 2, 4, 5, 7, 8, 10, 11, 12, 13, 15, 16, 18, 19, 20, 21, 22, 24, 27]
Test: [3, 14, 17, 23, 34, 48, 52, 56, 60, 63, 65, 68, 79, 109, 110, 117, 157, 165, 187, 196]
Valid: [6, 9, 25, 26, 36, 41, 51, 54, 69, 72, 90, 105, 119, 121, 125, 128, 133, 151, 156, 166]


In [19]:
# Slice train/test/valid
train_edge_index = edge_index[:, train_index]
val_edge_index = edge_index[:, val_index]
test_edge_index = edge_index[:, test_index]

In [20]:
# Interaction values (assume all 1s)
train_edge_values = torch.ones_like(train_edge_index[0,:])


In [23]:
train_edge_index.shape

torch.Size([2, 80000])

In [22]:
train_edge_values.shape

torch.Size([80000])

## Adjacency Matrix

In [33]:
# Interaction matrix
R = sp.coo_matrix((train_edge_values, (train_edge_index[0], train_edge_index[1])), shape=(num_users, num_items))
R = R.tolil()

In [39]:
# Adjacency matrix
MN = num_users + num_items
adj_mat = sp.dok_matrix((MN, MN), dtype=np.float32)
adj_mat = adj_mat.tolil()

In [40]:
adj_mat

<76470x76470 sparse matrix of type '<class 'numpy.float32'>'
	with 0 stored elements in List of Lists format>

In [41]:
%%time
# Fill adjacency matrix
adj_mat[:num_users, num_users:] = R
adj_mat[num_users:, :num_users] = R.T

CPU times: user 1min 26s, sys: 9.05 s, total: 1min 36s
Wall time: 1min 37s


In [44]:
# Rowsum
rowsum = np.array(adj_mat.sum(1))
degrees = rowsum.ravel()

In [52]:
#dd = pd.Series(degrees).value_counts()
# dd[:15].plot(kind='bar', title="Degree distrib", xlabel="Degree of node")

In [53]:
# Inverse of the Degree matrix
d_inv = np.power(rowsum, -0.5).flatten()
d_inv[np.isinf(d_inv)] = 0.
d_mat = sp.diags(d_inv)

In [54]:
# Normalized Adjacency Matrix
norm_adj = d_mat.dot(adj_mat)
norm_adj = norm_adj.dot(d_mat)
# norm_adj = norm_adj.tocsr()

In [55]:
norm_adj

<76470x76470 sparse matrix of type '<class 'numpy.float32'>'
	with 160000 stored elements in Compressed Sparse Row format>

## LightGCN

In [117]:
class LightGCN(nn.Module):
    def __init__(self, num_users, num_items, edge_index, edge_values, num_layers=4, dim_h=64):
        super().__init__()
        
        self.num_users = num_users
        self.num_items = num_items
        self.num_layers = num_layers
        self.emb_users = nn.Embedding(num_embeddings=self.num_users, embedding_dim=dim_h)
        self.emb_items = nn.Embedding(num_embeddings=self.num_items, embedding_dim=dim_h)
        self.edge_index = edge_index
        self.edge_values = edge_values
        self.adj_mat = self.compute_norm_adj_matrix(edge_index, edge_values)
        self.sp_adj_mat = self._convert_sp_mat_to_sp_tensor(self.adj_mat)
        self.alpha = 1/(self.num_layers+1)

        # self.convs = nn.ModuleList(LGConv() for _ in range(num_layers))

        nn.init.normal_(self.emb_users.weight, std=0.01)
        nn.init.normal_(self.emb_items.weight, std=0.01)
        
    @staticmethod
    def _convert_sp_mat_to_sp_tensor(X):
        coo = X.tocoo().astype(np.float32)
        row = torch.Tensor(coo.row).long()
        col = torch.Tensor(coo.col).long()
        index = torch.stack([row, col])
        data = torch.FloatTensor(coo.data)
        return torch.sparse.FloatTensor(index, data, torch.Size(coo.shape))
    
    def compute_norm_adj_matrix(self, edge_index, edge_values):
        num_users = self.num_users
        num_items = self.num_items
        # Interaction matrix
        R = sp.coo_matrix((
            edge_values, 
            (edge_index[0], edge_index[1])),
            shape=(num_users, num_items))
        R = R.tolil()
        
        # Adjacency matrix
        MN = self.num_users + self.num_items
        adj_mat = sp.dok_matrix((MN, MN), dtype=np.float32)
        adj_mat = adj_mat.tolil()
        
        # Fill adjacency matrix
        adj_mat[:num_users, num_users:] = R
        adj_mat[num_users:, :num_users] = R.T
        
        # Degrees
        rowsum = np.array(adj_mat.sum(1))
        
        # Inverse of the Degree matrix
        d_inv = np.power(rowsum, -0.5).flatten()
        d_inv[np.isinf(d_inv)] = 0.
        d_mat = sp.diags(d_inv)
        
        # Normalized Adjacency Matrix
        norm_adj = d_mat.dot(adj_mat)
        norm_adj = norm_adj.dot(d_mat)
        return norm_adj
        

    def forward(self):
        # Keep track of starting embeddings for feeding into the BPR Loss 
        # for regularizing the learned embedding params
        emb0_users = self.emb_users.weight
        emb0_items = self.emb_items.weight
                
        # Embedding is dimension M + N
        emb = torch.cat([emb0_users, emb0_items])
        embs = [emb]

        # For each layer
        for layer_i in range(self.num_layers):
            emb = torch.sparse.mm(self.sp_adj_mat, emb)
            embs.append(emb)

        emb_final = self.alpha * torch.mean(torch.stack(embs, dim=1), dim=1)

        embf_users, embf_items = torch.split(emb_final, [self.num_users, self.num_items])

        return embf_users, emb0_users, embf_items, emb0_items

### TOY example

In [118]:
toy_edge_index = torch.tensor(
    [[0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4],
     [0, 1, 2, 0, 2, 1, 2, 1, 2, 0, 2]])

toy_edge_index = torch.LongTensor(toy_edge_index) 
toy_edge_values = torch.ones_like(toy_edge_index[0])
nu = 5
ni = 3
embdi = 3
layers = 1 

model = LightGCN(
    num_users=nu, 
    num_items=ni, 
    edge_index=toy_edge_index,
    edge_values=toy_edge_values,
    num_layers=1, 
    dim_h=embdi
)

In [119]:
model.adj_mat.todense().round(2)

array([[0.  , 0.  , 0.  , 0.  , 0.  , 0.33, 0.33, 0.26],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.41, 0.  , 0.32],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.41, 0.32],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.41, 0.32],
       [0.  , 0.  , 0.  , 0.  , 0.  , 0.41, 0.  , 0.32],
       [0.33, 0.41, 0.  , 0.  , 0.41, 0.  , 0.  , 0.  ],
       [0.33, 0.  , 0.41, 0.41, 0.  , 0.  , 0.  , 0.  ],
       [0.26, 0.32, 0.32, 0.32, 0.32, 0.  , 0.  , 0.  ]], dtype=float32)

In [120]:
emb0 = torch.cat([model.emb_users.weight, model.emb_items.weight])
emb0

tensor([[-0.0015,  0.0058, -0.0111],
        [-0.0116,  0.0086, -0.0100],
        [ 0.0023,  0.0084,  0.0090],
        [-0.0008,  0.0137, -0.0070],
        [-0.0033,  0.0143,  0.0039],
        [-0.0034, -0.0103,  0.0048],
        [ 0.0164, -0.0081,  0.0149],
        [ 0.0108,  0.0147,  0.0012]], grad_fn=<CatBackward0>)

In [121]:
# Forward
embf_users, emb0_user, embf_items, emb0_items = model.forward()

In [122]:
# Embedding is dimension M + N
emb = torch.cat([emb0_user, emb0_items])
embs = [emb]

# For each layer
for layer_i in range(1):
    emb = torch.sparse.mm(model.sp_adj_mat, emb)
    embs.append(emb)

In [132]:

m1 = model.sp_adj_mat.to_dense()
m2 = emb0

In [135]:
m2

tensor([[-0.0015,  0.0058, -0.0111],
        [-0.0116,  0.0086, -0.0100],
        [ 0.0023,  0.0084,  0.0090],
        [-0.0008,  0.0137, -0.0070],
        [-0.0033,  0.0143,  0.0039],
        [-0.0034, -0.0103,  0.0048],
        [ 0.0164, -0.0081,  0.0149],
        [ 0.0108,  0.0147,  0.0012]], grad_fn=<CatBackward0>)

In [124]:
emb_final = model.alpha * torch.mean(torch.stack(embs, dim=1), dim=1)
# embf_users, embf_items = torch.split(emb_final, [self.num_users, self.num_items])

In [126]:
emb_initial = torch.cat([emb0_user, emb0_items])

In [127]:
emb_initial

tensor([[-0.0015,  0.0058, -0.0111],
        [-0.0116,  0.0086, -0.0100],
        [ 0.0023,  0.0084,  0.0090],
        [-0.0008,  0.0137, -0.0070],
        [-0.0033,  0.0143,  0.0039],
        [-0.0034, -0.0103,  0.0048],
        [ 0.0164, -0.0081,  0.0149],
        [ 0.0108,  0.0147,  0.0012]], grad_fn=<CatBackward0>)

In [125]:
emb_final

tensor([[ 0.0014,  0.0009, -0.0010],
        [-0.0024,  0.0023, -0.0019],
        [ 0.0031,  0.0024,  0.0039],
        [ 0.0023,  0.0038, -0.0001],
        [-0.0003,  0.0037,  0.0016],
        [-0.0025,  0.0002, -0.0004],
        [ 0.0041,  0.0007,  0.0030],
        [ 0.0015,  0.0076, -0.0007]], grad_fn=<MulBackward0>)

In [110]:
AdjSparse = _convert_sp_mat_to_sp_tensor(model.adj_mat)

In [112]:
AdjSparse.coalesce()

tensor(indices=tensor([[0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7,
                        7, 7, 7],
                       [5, 6, 7, 5, 7, 6, 7, 6, 7, 5, 7, 0, 1, 4, 0, 2, 3, 0, 1,
                        2, 3, 4]]),
       values=tensor([0.3333, 0.3333, 0.2582, 0.4082, 0.3162, 0.4082, 0.3162,
                      0.4082, 0.3162, 0.4082, 0.3162, 0.3333, 0.4082, 0.4082,
                      0.3333, 0.4082, 0.4082, 0.2582, 0.3162, 0.3162, 0.3162,
                      0.3162]),
       size=(8, 8), nnz=22, layout=torch.sparse_coo)

In [113]:
torch.sparse.mm(AdjSparse, emb)

tensor([[-5.2439e-03,  2.9526e-03, -3.0117e-05],
        [ 2.1050e-03,  4.6518e-03, -1.8666e-03],
        [-9.0366e-03,  4.2872e-03,  2.1864e-03],
        [-9.0366e-03,  4.2872e-03,  2.1864e-03],
        [ 2.1050e-03,  4.6518e-03, -1.8666e-03],
        [-2.0340e-03, -1.2537e-02,  2.4030e-04],
        [-2.9509e-03, -1.3933e-02, -2.8841e-03],
        [-4.1417e-03, -1.7116e-02, -9.8257e-04]],
       grad_fn=<SparseAddmmBackward0>)

In [102]:
model.adj_mat

<8x8 sparse matrix of type '<class 'numpy.float32'>'
	with 22 stored elements in Compressed Sparse Row format>

In [104]:
emb.shape

torch.Size([8, 3])