<center>
<img src="https://upload.wikimedia.org/wikipedia/fr/thumb/1/1d/Logo_T%C3%A9l%C3%A9com_SudParis.svg/1014px-Logo_T%C3%A9l%C3%A9com_SudParis.svg.png" width="10%" />
</center>

<center> <h2> NET 4103/7431 Complex Network </h2> </center>

<center> <h3> Vincent Gauthier (vincent.gauthier@telecom-sudparis.eu) </h3> </center>

### Note
Avant de commencer les exercices, assurez-vous que tout fonctionne comme prévu. Tout d'abord, redémarrez le kernel **(dans la barre de menus, sélectionnez Kernel $\rightarrow$ Restart)**.

Assurez-vous de remplir les cellules aux endroits marqués «YOUR CODE HERE». 

Veuillez supprimer les lignes «raise NotImplementedError()» dans toutes les cellules auxquelles vous avez répondu, et renseigner votre nom et prénom ci-dessous :

In [None]:
# Student identification: replace the "XXX" placeholders with your
# last name (NOM) and first name (PRENOM).
NOM = "XXX"
PRENOM = "XXX"

---

<h1 align="center">Lab #6: Building a Recommender System (RecSys) With LightGCN</h1> 
<br />
<br />
<br />
<img src="../../images/network.png" style="display:block;margin-left:auto;margin-right:auto;width:80%;"></img>

In [None]:
import os.path as osp
import wandb
import torch

from tqdm import tqdm
from dataset import MovieLensSmall
from torch_geometric.utils import degree
from torch_geometric.transforms import RandomLinkSplit

# Style pour le Notebook
from IPython.core.display import HTML

def css_styling(path="../../styles/custom.css"):
    """Load the notebook stylesheet and wrap it for rich display.

    Args:
        path: Location of the style snippet to inject. Defaults to the
            course stylesheet, so a bare ``css_styling()`` call behaves as
            before.

    Returns:
        An ``HTML`` display object built from the file content.

    Raises:
        OSError: If the stylesheet cannot be opened.
    """
    # The context manager closes the handle deterministically; the previous
    # bare ``open(...).read()`` left that to the garbage collector.
    with open(path, "r") as stylesheet:
        return HTML(stylesheet.read())
css_styling()

In [None]:
import networkx as nx
from packaging import version
import sys 
import torch
import torch_geometric

# Report the interpreter and library versions actually loaded by the kernel.
print("Python version:", sys.version)
print("networkx version:", nx.__version__)
print("torch versions:", torch.__version__)
# This line used to be labelled "torch versions:" as well, which made the
# output ambiguous.
print("torch_geometric version:", torch_geometric.__version__)

# Fail fast, with an explicit message, when the environment is older than
# what the lab expects.
assert version.parse(nx.__version__) >= version.parse("3.0"), \
    "networkx >= 3.0 is required"
assert version.parse(torch.__version__) >= version.parse("2.0"), \
    "torch >= 2.0 is required"
assert version.parse(torch_geometric.__version__) >= version.parse("2.5"), \
    "torch_geometric >= 2.5 is required"
# Python 3 with a minor version of at least 9.
assert sys.version_info[0] == 3 and sys.version_info[1] >= 9, \
    "Python >= 3.9 is required"

# If working in colab mount the drive filesystem
if 'google.colab' in str(get_ipython()):
    print('Working in colab')

    from google.colab import drive
    drive.mount('/content/drive')
else:
    print("working locally")

## LightGCN model 

In [None]:
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import degree

class LGConv(MessagePassing):
    """Weight-free graph convolution with symmetric degree normalisation.

    Each edge message is the neighbour feature scaled by
    ``deg^-1/2[row] * deg^-1/2[col]``; messages are summed per node.
    """

    def __init__(self):
        # 'add' aggregation: incoming messages are summed.
        super().__init__(aggr='add')

    def forward(self, x, edge_index):
        """Propagate ``x`` once along ``edge_index``.

        Args:
            x: Node feature matrix; ``x.size(0)`` is used as the node count.
            edge_index: Pair of index rows ``(row, col)`` describing the edges.

        Returns:
            The result of ``propagate`` with the per-edge normalisation applied.
        """
        src, dst = edge_index
        # Degrees are counted on the second row of ``edge_index``.
        node_degree = degree(dst, x.size(0), dtype=x.dtype)
        inv_sqrt_degree = node_degree.pow(-0.5)
        # pow(-0.5) yields +inf for degree-0 nodes; zero those entries so
        # they contribute nothing.
        inv_sqrt_degree = inv_sqrt_degree.masked_fill(
            inv_sqrt_degree == float('inf'), 0
        )
        edge_weight = inv_sqrt_degree[src] * inv_sqrt_degree[dst]
        return self.propagate(edge_index, x=x, norm=edge_weight)

    def message(self, x_j, norm):
        """Scale each per-edge feature row ``x_j`` by its edge weight ``norm``."""
        # Reshape to a column so the weight broadcasts over the feature axis.
        return x_j * norm.view(-1, 1)

In [None]:
import torch.nn as nn

class LightGCN(nn.Module):
    """LightGCN recommender skeleton (https://arxiv.org/pdf/2002.02126).

    Holds one learnable embedding per node and ``num_layers`` parameter-free
    ``LGConv`` layers. ``get_embedding`` and the scoring part of ``forward``
    are exercises left to the student.

    Args:
        num_nodes: Total number of nodes (rows of the embedding table).
        embedding_dim: Size of each node embedding.
        num_layers: Number of ``LGConv`` propagation layers (``K``).
    """

    def __init__(self, num_nodes, embedding_dim, num_layers=4):
        super().__init__()
        self.n_node = num_nodes
        self.embedding_dim = embedding_dim
        self.K = num_layers
        # Uniform weight 1 / (K + 1); presumably the per-layer coefficient
        # of Eq.8, to be used in get_embedding.
        self.alpha = torch.tensor(1. / (self.K + 1))
        self.node_emb = nn.Embedding(num_embeddings=self.n_node,  embedding_dim=self.embedding_dim)
        self.convs = nn.ModuleList([LGConv() for _ in range(self.K)])
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialise the embedding table from N(0, 0.1^2)."""
        nn.init.normal_(self.node_emb.weight, std=0.1)

    def get_embedding(self, edge_index):
        """Return the final node embeddings after propagation (exercise).

        Args:
            edge_index: Edges used for message passing.

        Returns:
            The combined node embeddings ``out`` (to be implemented).
        """
        # Eq.8 in the LightGCN paper https://arxiv.org/pdf/2002.02126
        x = self.node_emb.weight
        # YOUR CODE HERE
        raise NotImplementedError()
        return out

    def forward(self, edge_index, edge_label_index=None):
        """Score candidate edges (exercise).

        Args:
            edge_index: Edges used for message passing.
            edge_label_index: Pairs ``(src, dst)`` to score. When omitted,
                the message-passing edges themselves are scored.
        """
        out = self.get_embedding(edge_index)

        # The signature advertises None as valid, but indexing None below
        # would raise TypeError; fall back to the propagation edges.
        if edge_label_index is None:
            edge_label_index = edge_index

        out_src = out[edge_label_index[0]]
        out_dst = out[edge_label_index[1]]
        #Eq.5 in the LightGCN paper https://arxiv.org/pdf/2002.02126
        # YOUR CODE HERE
        raise NotImplementedError()

    def __repr__(self) -> str:
        # The attribute is ``n_node``; the former ``self.n_nodes`` raised
        # AttributeError whenever the model was printed.
        return (f'{self.__class__.__name__}({self.n_node}, '
                f'{self.embedding_dim}, num_layers={self.K})')

    def recommendation_loss(
        self,
        pos_edge_rank,
        neg_edge_rank,
        node_id,
        lambda_reg = 1e-4,
    ):
        r"""The Bayesian Personalized Ranking (BPR) loss.

        The BPR loss is a pairwise loss that encourages the prediction of an
        observed entry to be higher than its unobserved counterparts
        (see `here <https://arxiv.org/abs/2002.02126>`__).

        Args:
            pos_edge_rank: Scores of the observed (positive) edges.
            neg_edge_rank: Scores of the sampled (negative) edges, aligned
                element-wise with ``pos_edge_rank``.
            node_id: Indices of the embedding rows to regularise.
            lambda_reg: L2 regularisation strength.

        Returns:
            Scalar tensor: negative mean log-sigmoid of the score difference
            plus the L2 penalty divided by the number of positive edges.
        """
        import torch.nn.functional as F
        log_prob = F.logsigmoid(pos_edge_rank - neg_edge_rank).mean()
        emb = self.node_emb.weight[node_id]
        regularization = lambda_reg * emb.norm(p=2).pow(2)
        regularization = regularization / pos_edge_rank.size(0)
        return -log_prob + regularization

## Training 

In [None]:
def get_user_positive_items(edge_index):
    """Group the items of an edge list by user.

    Args:
        edge_index (torch.Tensor): 2 by N tensor; row 0 holds the user of
            each edge and row 1 the matching item.

    Returns:
        dict: maps each user to the list of its items, in edge order.
    """
    users = edge_index[0].tolist()
    items = edge_index[1].tolist()

    positives = {}
    for user, item in zip(users, items):
        # Create the bucket on first sight of a user, then extend it.
        positives.setdefault(user, []).append(item)
    return positives

## Question: Explain what NDCG is and how it compares with Recall@K

In [None]:
def NDCGAtK(groundTruth, pred, k):
    """Normalised Discounted Cumulative Gain at rank ``k`` with binary relevance.

    https://towardsdatascience.com/demystifying-ndcg-bee3be58cfe0

    Args:
        groundTruth: Collection of relevant item ids.
        pred: Ranked item ids, best first (tensor or sequence). Only the
            first ``k`` entries are considered; fewer than ``k`` is allowed.
        k: Cut-off rank.

    Returns:
        torch.Tensor: 0-d float tensor ``DCG@k / IDCG@k``. It is 0 when there
        is no relevant item or ``k <= 0`` (the ratio was previously 0/0 = NaN).
    """
    relevant = set(groundTruth)
    if k <= 0 or not relevant:
        return torch.tensor(0.)

    # Plain Python ids make membership tests O(1) against the set.
    ranked = pred.tolist() if torch.is_tensor(pred) else list(pred)
    ranked = ranked[:k]

    # Position discount 1 / log2(rank + 1) for ranks 1..k.
    discounts = 1. / torch.log2(torch.arange(2, k + 2))

    # Ideal ranking: every relevant item (at most k) sits at the top.
    idcg = discounts[:min(len(relevant), k)].sum()

    gains = torch.tensor([1. if item in relevant else 0. for item in ranked])
    dcg = (gains * discounts[:len(ranked)]).sum()
    return dcg / idcg

In [None]:
def train(optimizer, model, train_edge_index, train_edge_label_index, num_user, num_movie, batch_size):
    """Run one training epoch with the BPR loss and random negatives.

    Args:
        optimizer: Optimiser updating ``model``'s parameters.
        model: Model exposing ``__call__(edge_index, edge_label_index)`` and
            ``recommendation_loss``.
        train_edge_index: Edges used for message passing.
        train_edge_label_index: 2 by N supervision (positive) edges.
        num_user: Number of users; negative ids start at this offset.
        num_movie: Number of movies; negative ids are drawn from
            ``[num_user, num_user + num_movie)``.
        batch_size: Number of positive edges per optimisation step.

    Returns:
        float: Loss averaged over all positive edges of the epoch, or NaN
        when there was nothing to train on.
    """
    total_loss = total_examples = 0
    # Sample negatives on the same device as the positives; a hard-coded CPU
    # device made torch.stack fail once the edges live on another device.
    device = train_edge_label_index.device
    train_loader = torch.utils.data.DataLoader(
        range(train_edge_label_index.size(1)),
        shuffle=True,
        batch_size=batch_size,
    )

    for index in tqdm(train_loader):
        pos_edge_label_index = train_edge_label_index[:, index]

        # Keep the source of each positive edge and pair it with a uniformly
        # drawn id from the movie range (not filtered against true positives).
        neg_edge_label_index = torch.stack([
                pos_edge_label_index[0],
                torch.randint(num_user, num_user + num_movie,
                              (index.numel(), ), device=device)
            ], dim=0)

        # Score positives and negatives in one forward pass.
        edge_label_index = torch.cat([
                pos_edge_label_index,
                neg_edge_label_index,
            ], dim=1)

        optimizer.zero_grad()
        # The first half of the scores belongs to the positives.
        pos_rank, neg_rank = model(train_edge_index, edge_label_index).chunk(2)
        loss = model.recommendation_loss(
                pos_rank,
                neg_rank,
                node_id=edge_label_index.unique(),
            )
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pos_rank.numel()
        total_examples += pos_rank.numel()

    if total_examples == 0:
        # No supervision edges: the mean loss is undefined; avoid a
        # ZeroDivisionError.
        return float('nan')
    return total_loss / total_examples

## Tests 

In [None]:
@torch.no_grad()
def test(model, train_data_homo_edge_index, train_edge_label_index, test_edge_label_index, num_user, num_movie, k=20):
    """Evaluate top-``k`` recommendations on the held-out edges.

    Args:
        model: Model exposing ``get_embedding(edge_index)``.
        train_data_homo_edge_index: Message-passing edges of the homogeneous
            training graph.
        train_edge_label_index: 2 by N (user, movie) training pairs, excluded
            from the ranking; indexed like the ``num_user x num_movie`` score
            matrix.
        test_edge_label_index: 2 by M (user, movie) held-out pairs.
        num_user: Number of users (first ``num_user`` embedding rows).
        num_movie: Number of movies.
        k: Length of the recommendation list.

    Returns:
        tuple[float, float, float]: mean Recall@k, Precision@k and NDCG@k over
        the users that have at least one held-out item (zeros if none).
    """
    # fetch the embedding with the full train graph on homogeneous graph
    emb = model.get_embedding(train_data_homo_edge_index)
    user_emb, movie_emb = emb[:num_user], emb[num_user:]
    logits = user_emb @ movie_emb.t()

    # Exclude items already seen in training. Direct index assignment avoids
    # the dense mask built from a Python list of N ones on the CPU.
    logits[train_edge_label_index[0], train_edge_label_index[1]] = float('-inf')

    pred = logits.topk(k=k).indices
    groundTruth = get_user_positive_items(test_edge_label_index)
    if not groundTruth:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0, 0.0, 0.0

    ndcg = precision = recall = 0.0
    for user, items in groundTruth.items():
        ranked = pred[user].tolist()
        hits = len(set(items).intersection(ranked))
        # float() keeps the three metrics the same type (NDCGAtK returns a
        # 0-d tensor).
        ndcg += float(NDCGAtK(items, ranked, k))
        recall += hits / len(items)
        precision += hits / k

    total_examples = len(groundTruth)
    return recall/total_examples, precision/total_examples, ndcg/total_examples

## Main

In [None]:
def main():
    """Train and evaluate LightGCN on MovieLensSmall, logging to wandb.

    Reads the module-level ``config`` dict (keys: ``test_split``,
    ``latent_dim``, ``num_layers``, ``lr``, ``num_epoch``, ``batch_size``;
    optional ``seed``). Per-epoch loss, Recall@20, Precision@20 and NDCG are
    printed and sent to wandb.
    """
    # Optional reproducibility: the split and the negative sampling both draw
    # from torch's RNG. Without a "seed" key behaviour is unchanged.
    seed = config.get("seed")
    if seed is not None:
        torch.manual_seed(seed)

    run = wandb.init(
        # Set the project where this run will be logged
        project="LightGCN",
        config=config
    )
    try:
        # Hold out a fraction of the (user, rates, movie) edges for testing;
        # no validation split and no pre-sampled negatives.
        transform = RandomLinkSplit(
            num_val=0.0,
            num_test=config["test_split"],
            neg_sampling_ratio=0.0,
            is_undirected=True,
            edge_types=[('user', 'rates', 'movie')],
            rev_edge_types=[('movie', 'rev_rates', 'user')],
        )

        device = torch.device('cpu')
        dataset = MovieLensSmall('./data')
        full_data = dataset[0]
        train_data, _, test_data = transform(full_data)
        # Homogeneous view: users and movies share one index space, which is
        # what the single embedding table of the model expects.
        train_data_homo = train_data.to_homogeneous()
        num_user, num_movie = full_data['user'].num_nodes, full_data['movie'].num_nodes
        train_edge_label_index = train_data['user', 'rates', 'movie'].edge_label_index
        test_edge_label_index = test_data['user', 'rates', 'movie'].edge_label_index
        train_data_homo_edge_index = train_data_homo.edge_index
        train_data_homo_edge_label_index = train_data_homo.edge_label_index

        model = LightGCN(
            num_nodes=num_user+num_movie,
            embedding_dim=config["latent_dim"],
            num_layers=config["num_layers"],
        ).to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"])

        for epoch in range(config["num_epoch"]):
            loss = train(optimizer, model, train_data_homo_edge_index, train_data_homo_edge_label_index, num_user, num_movie, config["batch_size"])
            recall, precision, ndcg = test(model, train_data_homo_edge_index, train_edge_label_index, test_edge_label_index, num_user, num_movie, k=20)
            print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Recall@20: {recall:.4f}, Precision@20: {precision:.4f}, Ndcg@K: {ndcg:.4f}')
            wandb.log({"Loss": loss, "Recall@20": recall,  "Precision@20":precision, "Ndcg@K":ndcg})
    finally:
        # Close the wandb run even if training raises; in a notebook an
        # unfinished run otherwise stays open until the kernel stops.
        run.finish()

In [None]:
# Hyperparameters read by main() and recorded with the wandb run
config = {
    "batch_size": 1024,  # positive edges per optimisation step (train DataLoader)
    "num_epoch": 50,  # number of train/test iterations in main()
    "lr": 0.005,  # learning rate of the Adam optimiser
    "latent_dim": 128,  # embedding size passed to LightGCN
    "num_layers": 3,  # number of LightGCN propagation layers
    "test_split": 0.3,  # share of edges held out by RandomLinkSplit (num_test)
}

# Launch training and per-epoch evaluation with the settings above.
main()