# Graph Convolutional Matrix Completion (GC-MC)

In this notebook we implement the GC-MC model proposed in the paper [Graph Convolutional Matrix Completion](https://arxiv.org/abs/1706.02263).

#### Architecture
1. Encoder:
    - create 5 bipartite graphs (one for each rating level)
    - for each node, message pass the embeddings of its neighbors
    - concatenate the embeddings of the different rating levels
    - pass the concatenated embeddings through a fully connected layer (decide whether to use act-MLP or just MLP)
2. Decoder:
    - compute probability of each rating level
    - compute final rating as expectation of the rating levels

#### Possible loss functions
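- cross entropy: $-\sum_{(i,j) \in \Omega} \sum_{r \in \mathcal{R}} \mathbb{I}\{M_{(i,j)} = r\} \log P(M_{(i,j)} = r)$, where $\Omega$ is the set of observed (user, item) entries and $\mathcal{R}$ is the set of rating levels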
- RMSE: $\sqrt{\frac{1}{|\Omega|} \sum_{(i,j) \in \Omega} (M_{(i,j)} - \mathbb{E}[M_{(i,j)}])^2}$

In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from torch import nn

## 1. Load and Preprocess Data

In [2]:
from config import N_u, N_v, VAL_SIZE, DEVICE
from load import load_train_data
from preprocess import extract_users_items_ratings, create_bipartite_graph, create_degree_matrix, create_inverse_sqrt_degree_matrix

# Load data
train_df = load_train_data()

# Extract adjacency lists: observed values edge index (src, tgt) and ratings (values)
all_users, all_items, all_ratings = extract_users_items_ratings(train_df)

# Create rating matrix from the triplets (unobserved entries stay 0)
all_ratings_matrix = np.zeros((N_u, N_v))
all_ratings_matrix[all_users, all_items] = all_ratings

# Split the data into train and val sets.
# The split is seeded so that "Restart Kernel -> Run All" reproduces the same
# partition and therefore comparable validation scores between runs.
SPLIT_SEED = 42
train_users, val_users, train_items, val_items, train_ratings, val_ratings = \
    train_test_split(all_users, all_items, all_ratings, test_size=VAL_SIZE, random_state=SPLIT_SEED)

# Convert the split arrays to torch tensors created directly on DEVICE:
# indices as int64 (usable for indexing), ratings as float32.
train_users = torch.tensor(train_users, dtype=torch.long, device=DEVICE)
val_users = torch.tensor(val_users, dtype=torch.long, device=DEVICE)
train_items = torch.tensor(train_items, dtype=torch.long, device=DEVICE)
val_items = torch.tensor(val_items, dtype=torch.long, device=DEVICE)
train_ratings = torch.tensor(train_ratings, dtype=torch.float, device=DEVICE)
val_ratings = torch.tensor(val_ratings, dtype=torch.float, device=DEVICE)

# Total number of observed ratings across both splits
print(len(train_users) + len(val_users))

def _edges_with_rating(users, items, ratings, r):
    """Return (users, items, ones) restricted to the triplets whose rating equals r."""
    has_rating = ratings == r
    # each kept edge gets weight 1 rather than its rating value r
    return users[has_rating], items[has_rating], torch.ones_like(ratings[has_rating])


# One edge list per rating level 1..5, for the train and the validation split
u_v_r_train = []
u_v_r_val = []
total = 0
for r in range(1, 6):
    train_edges = _edges_with_rating(train_users, train_items, train_ratings, r)
    val_edges = _edges_with_rating(val_users, val_items, val_ratings, r)
    u_v_r_train.append(train_edges)
    u_v_r_val.append(val_edges)
    # running count of edges over all rating levels and both splits
    total += len(train_edges[0]) + len(val_edges[0])
print(total)

# Turn every edge list into a bipartite graph
graphs_r_train = [create_bipartite_graph(src, tgt, weights) for src, tgt, weights in u_v_r_train]
graphs_r_val = [create_bipartite_graph(src, tgt, weights) for src, tgt, weights in u_v_r_val]

# Sanity check: count the entries equal to 1 over all train and validation
# rating graphs. The count is halved before printing; presumably every rating
# appears twice in a graph (user->item and item->user), so the printed value
# should match the number of observed ratings -- verify against
# create_bipartite_graph.
total_entries = sum((graph == 1).sum() for graph in graphs_r_train + graphs_r_val)
print(total_entries / 2)

# Degree matrix of every training rating graph
degree_matrices_r = [create_degree_matrix(g) for g in graphs_r_train]
print(np.unique(degree_matrices_r[0].cpu().numpy()))

# Inverse square root of every degree matrix
degree_norms_r = [create_inverse_sqrt_degree_matrix(deg) for deg in degree_matrices_r]
print(np.unique(degree_norms_r[0].cpu().numpy()))

# Entries that came out as +inf are zeroed in place
for inv_sqrt_deg in degree_norms_r:
    inv_sqrt_deg.masked_fill_(inv_sqrt_deg == float('inf'), 0)
print(np.unique(degree_norms_r[0].cpu().numpy()))

# Symmetry check of the first degree matrix
first_degree = degree_matrices_r[0]
print(torch.all(first_degree == first_degree.T))

# Symmetry check of the first normalization matrix
first_norm = degree_norms_r[0]
print(torch.all(first_norm == first_norm.T))

# Normalized adjacency per rating: D^{-1/2} A D^{-1/2}
norm_adj_r = [
    inv_sqrt_deg.matmul(g).matmul(inv_sqrt_deg)
    for inv_sqrt_deg, g in zip(degree_norms_r, graphs_r_train)
]
print(np.unique(norm_adj_r[0].cpu().numpy()))

# Make sure the normalized adjacency matrices live on DEVICE
norm_adj_r = [normalized.to(DEVICE) for normalized in norm_adj_r]

# All checks above printed the expected values

1176952
1176952
tensor(1176952., device='mps:0')
[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  77  78  79  80  81  82  83  84  85  86  87  88  89  90
  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107 108
 109 111 112 113 116 119 121 123 124 125 126 127 128 131 133 134 135 138
 142 143 144 146 151 152 166 167 172 174 184 189 192 193 197 199 210 218
 245 253 267 289 291 311 319 359]
[0.         0.05277798 0.05598925 0.0567048  0.05862104 0.05882353
 0.061199   0.06286946 0.06388766 0.06772855 0.06900655 0.07088812
 0.07124705 0.07198158 0.07216878 0.0727393  0.07372098 0.07580981
 0.07624929 0.07738233 0.07761505 0.08111071 0.08137884 0.08276059
 0.08333334 0.0836242  0.08391814 0.08512565 0.0860663  0.0863868

In [14]:
from models import save_model_inputs

class BaseLightGCN(nn.Module):
    """Base class of the graph-convolutional rating model used in this notebook.

    The model keeps one normalized adjacency matrix per rating level. User and
    item embeddings are propagated through each rating graph, the per-rating
    results are combined, and a decoder maps the selected user/item embeddings
    to a rating. Subclasses supply the MLP used by ``forward1`` by implementing
    ``create_mlp``.
    """

    def __init__(self, norm_adj_r, act_fn, embedding_dim, n_layers, init_emb_std, dropout_rate):
        """
        Args:
            norm_adj_r: indexable collection of normalized adjacency matrices,
                one per rating level (indices 0..4 are used), each of shape
                (N_u + N_v) x (N_u + N_v). It is stored as a plain attribute,
                so ``model.to(...)`` does not move these tensors.
            act_fn: activation module made available to ``create_mlp``.
            embedding_dim: embedding size K.
            n_layers: number of propagation steps L used by ``message_passing_r``.
            init_emb_std: standard deviation of the normal embedding initialization.
            dropout_rate: forwarded to ``create_mlp``.
        """
        super(BaseLightGCN, self).__init__()

        self.norm_adj_r = norm_adj_r  # bipartite graphs (one for each rating r)
        self.K = embedding_dim
        self.L = n_layers
        self.act_fn = act_fn

        # Initialize embeddings
        self.E_u = nn.Embedding(num_embeddings=N_u, embedding_dim=self.K)
        self.E_v = nn.Embedding(num_embeddings=N_v, embedding_dim=self.K)
        nn.init.normal_(self.E_u.weight, std=init_emb_std)
        nn.init.normal_(self.E_v.weight, std=init_emb_std)

        # Projection to output space after message passing, aggregation, and selection
        self.mlp = self.create_mlp(dropout_rate)

        # Learnable K x K matrices Q_r, one per rating level, for the bilinear decoder
        self.Q_r = nn.ParameterList([nn.Parameter(torch.randn(self.K, self.K)) for _ in range(5)])

    def create_mlp(self, dropout_rate):
        """Build the projection MLP used by ``forward1``; must be overridden."""
        raise NotImplementedError("Derived classes must implement this method")

    def message_passing_r(self, r) -> list[torch.Tensor]:
        """Propagate the embeddings L times through the graph of rating index r.

        Returns:
            A list of L + 1 tensors of shape (N_u + N_v) x K: the initial
            embeddings followed by the result of each propagation step.
        """
        E_0 = torch.cat([self.E_u.weight, self.E_v.weight], dim=0)  # size (N_u + N_v) x K
        E_layers = [E_0]
        E_l = E_0

        for _ in range(self.L):
            E_l = torch.mm(self.norm_adj_r[r], E_l)  # shape (N_u + N_v) x K
            E_layers.append(E_l)
        return E_layers

    def message_pass_r(self, r) -> torch.Tensor:
        """Propagate the embeddings once through the graph of rating index r.

        Returns:
            A single tensor of shape (N_u + N_v) x K.
        """
        E_u_v = torch.cat([self.E_u.weight, self.E_v.weight], dim=0)
        E_r = torch.mm(self.norm_adj_r[r], E_u_v)
        return E_r

    def aggregate(self, embs: list) -> torch.Tensor:
        """
        Aggregate a list of embedding tensors by concatenating them along the
        feature dimension (dim=1).
        """
        E_agg = torch.cat(embs, dim=1)
        return E_agg

    def select_embeddings(self, users, items, E_agg):
        """Split E_agg into its user and item rows and gather the requested ones.

        Args:
            users: index tensor of user ids.
            items: index tensor of item ids.
            E_agg: tensor whose first dimension holds all user rows followed
                by all item rows.

        Returns:
            Tuple (E_u, E_v) with one row per entry of ``users`` / ``items``.
        """
        # Use the sizes of the model's own embedding tables so the split always
        # matches how the rows of E_agg were laid out.
        n_users = self.E_u.num_embeddings
        n_items = self.E_v.num_embeddings
        E_u, E_v = torch.split(E_agg, [n_users, n_items], dim=0)
        # Select embeddings of users and items from the adjacency lists
        E_u = E_u[users]
        E_v = E_v[items]
        return E_u, E_v

    def forward1(self, users, items):
        """Alternative forward pass: concatenate the per-rating embeddings and decode with the MLP.

        Returns:
            Output of ``self.mlp`` with its last (size-1) dimension removed.
        """

        # TODO: try average aggregation, or project to each embedding to dim K from 5K, and then use bilinear decoder

        E_r = [self.message_pass_r(r) for r in range(5)]
        E_agg = self.aggregate(E_r)  # shape (N_u + N_v) x 5K
        E_u_sel, E_v_sel = self.select_embeddings(users, items, E_agg)

        # Project to output space
        concat_users_items = torch.cat([E_u_sel, E_v_sel], dim=1)  # shape (batch_size, 10K)
        # Squeeze only the last dimension so that a batch of one keeps its batch axis
        out = self.mlp(concat_users_items).squeeze(-1)
        return out

    def forward(self, users, items):
        """Predict ratings for the given (user, item) pairs.

        The per-rating propagated embeddings are averaged, a bilinear score
        with Q_r is computed for every rating level, and the prediction is the
        expected rating under the softmax over those scores.

        Returns:
            Tensor of shape (batch_size,) with values in [1, 5].
        """
        # TODO: use lightGCN message passing to aggregate information over hops
        E_r = [self.message_pass_r(r) for r in range(5)]
        # TODO: project down to K using MLP instead of aggregation
        E_agg = torch.mean(torch.stack(E_r), dim=0)
        E_u_sel, E_v_sel = self.select_embeddings(users, items, E_agg)

        # Bilinear logit u^T Q_r v for each rating level
        logits = torch.stack(
            [torch.einsum('ij,jk,ik->i', E_u_sel, Q, E_v_sel) for Q in self.Q_r],
            dim=1,
        )  # Shape: (batch_size, 5)

        # Compute the softmax probabilities
        softmax_probs = torch.softmax(logits, dim=1)  # Shape: (batch_size, 5)

        # Rating values 1..5, created on the same device/dtype as the logits so
        # the model works wherever its parameters and graphs live.
        ratings = torch.arange(1, 6, device=logits.device, dtype=logits.dtype)  # Shape: (5,)
        # Final prediction: expected rating under the softmax distribution
        preds = torch.sum(softmax_probs * ratings, dim=1)  # Shape: (batch_size,)

        return preds

    def get_ratings(self, users, items):
        """Convenience alias for ``forward``."""
        return self.forward(users, items)

class LightGCN(BaseLightGCN):
    """Concrete model whose projection MLP is described by ``projections``.

    ``projections`` is an iterable of multipliers; each one adds a hidden
    block of width ``K * multiplier`` to the MLP.
    """

    def __init__(self, norm_adj_r, act_fn, embedding_dim, n_layers, init_emb_std, dropout_rate, projections):
        # Has to be assigned before the base constructor runs, because that
        # constructor calls create_mlp, which reads self.projections.
        self.projections = projections
        super().__init__(norm_adj_r, act_fn, embedding_dim, n_layers, init_emb_std, dropout_rate)

        # For reproducibility after training
        # save_model_inputs(norm_adj_r, act_fn, embedding_dim, n_layers, init_emb_std, dropout_rate, projections)

    def create_mlp(self, dropout_rate):
        """Return the MLP: (Linear -> activation -> Dropout) per projection, then Linear to 1."""
        hidden_widths = [self.K * multiplier for multiplier in self.projections]

        modules = []
        # input is the user and item embeddings (2x) of 5 concatenated rating levels
        in_features = self.K * 5 * 2
        for out_features in hidden_widths:
            modules += [
                nn.Linear(in_features, out_features),
                self.act_fn,
                nn.Dropout(dropout_rate),
            ]
            in_features = out_features
        modules.append(nn.Linear(in_features, 1))
        return nn.Sequential(*modules)

In [15]:
from train import train_model
from config import DEVICE
from train import train_model
from postprocess import report_training_results

# Model and optimizer hyperparameters
L=4
K=28
INIT_EMBS_STD=0.075
LR=0.1
WEIGHT_DECAY=0.00005
DROPOUT=0.5
PROJECTIONS = (5,)
ACT_FN = nn.GELU()

# Train loop hyperparameters
EPOCHS=2000
STOP_THRESHOLD=1e-06

# Grid of values explored below
ks = [28, 30, 32]
layers = [3, 4]
projections = [(4,), (2,), (1,)]  # (8,), 

# to not change train loop (should actually separate concerns better and work with the reversing in the postprocessing)
# zeros/ones are passed as means/stds; the ratings themselves are given to train_model unchanged
means = np.zeros(N_v)
stds = np.ones(N_v)

# Grid search. The loop variables are deliberately distinct from the constants
# K and L above, so running this cell does not silently overwrite them.
for emb_dim in ks:
    for n_layers in layers:
        for proj in projections:
            print(f"Training LightGCN with K={emb_dim}, L={n_layers}, C={proj}")
            # fresh model, optimizer and loss for every configuration
            model = LightGCN(norm_adj_r, ACT_FN, emb_dim, n_layers, INIT_EMBS_STD, DROPOUT, proj).to(DEVICE)
            optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
            loss_fn = nn.MSELoss()
            # val_ratings is passed twice: as the standardized and as the original validation targets
            train_rmse, val_rmse_std, val_rmse_orig = train_model(model, optimizer, loss_fn, train_users, train_items, train_ratings, val_users, val_items, val_ratings, val_ratings, means, stds, EPOCHS, STOP_THRESHOLD, False, verbosity=1)
            report_training_results(train_rmse, val_rmse_std, val_rmse_orig)

Training LightGCN with K=28, L=3, C=(4,)
Epoch 0 - Train loss: 1.9872 - Val loss: 1.8727 - Val loss original: 1.8727
Epoch 1 - Train loss: 1.8735 - Val loss: 1.3769 - Val loss original: 1.3769
Epoch 2 - Train loss: 1.3639 - Val loss: 1.4100 - Val loss original: 1.4100
Epoch 3 - Train loss: 1.4051 - Val loss: 1.2589 - Val loss original: 1.2589
Epoch 4 - Train loss: 1.2393 - Val loss: 1.3009 - Val loss original: 1.3009
Epoch 5 - Train loss: 1.2759 - Val loss: 1.3018 - Val loss original: 1.3018
Epoch 6 - Train loss: 1.2762 - Val loss: 1.2976 - Val loss original: 1.2976
Epoch 7 - Train loss: 1.2723 - Val loss: 1.2938 - Val loss original: 1.2938
Epoch 8 - Train loss: 1.2689 - Val loss: 1.2896 - Val loss original: 1.2896
Epoch 9 - Train loss: 1.2651 - Val loss: 1.2832 - Val loss original: 1.2832
Epoch 10 - Train loss: 1.2594 - Val loss: 1.2707 - Val loss original: 1.2707
Epoch 11 - Train loss: 1.2482 - Val loss: 1.2427 - Val loss original: 1.2427
Epoch 12 - Train loss: 1.2230 - Val loss: 1.1

KeyboardInterrupt: 