# Link prediction - (Neighborsampler. wrong one) 
- Based on this questions
    - https://github.com/pyg-team/pytorch_geometric/discussions/6098
- and this tutorial:
    - https://colab.research.google.com/drive/1xpzn1Nvai1ygd_P5Yambc_oe4VBPK_ZT?usp=sharing
- and this runthrough
    - https://medium.com/@pytorch_geometric/link-prediction-on-heterogeneous-graphs-with-pyg-6d5c29677c70]
----    
-- Actually 2.2.0 tagged version https://github.com/pyg-team/pytorch_geometric/tree/2.2.0

In [2]:
import torch
from torch import Tensor
print(torch.__version__)

import pandas as pd
from torch_geometric.data import download_url, extract_zip

import pandas as pd

from torch_geometric.data import HeteroData
import torch_geometric.transforms as T

# note it uses the wrong linkNeighborLoader (there are two). one for sampler and one for loader.
from torch_geometric.loader import LinkNeighborLoader


  from .autonotebook import tqdm as notebook_tqdm


1.13.1


## Prepare Feature Store

In [3]:
from torch_geometric.data import Data, HeteroData
from torch_geometric.loader import NeighborLoader
from torch_sparse import SparseTensor

feature_store = HeteroData()
graph_store = HeteroData()
data = HeteroData()

## Load data

In [4]:

url = 'https://files.grouplens.org/datasets/movielens/ml-latest-small.zip'
extract_zip(download_url(url, '.'), '.')

movies_path = './ml-latest-small/movies.csv'
ratings_path = './ml-latest-small/ratings.csv'

Using existing file ml-latest-small.zip
Extracting ./ml-latest-small.zip


In [5]:
print('movies.csv:')
print('===========')
print(pd.read_csv(movies_path)[["movieId", "genres"]].head())
print()
print('ratings.csv:')
print('============')
print(pd.read_csv(ratings_path)[["userId", "movieId"]].head())

movies.csv:
   movieId                                       genres
0        1  Adventure|Animation|Children|Comedy|Fantasy
1        2                   Adventure|Children|Fantasy
2        3                               Comedy|Romance
3        4                         Comedy|Drama|Romance
4        5                                       Comedy

ratings.csv:
   userId  movieId
0       1        1
1       1        3
2       1        6
3       1       47
4       1       50


In [6]:
# Load the entire movie data frame into memory:
movies_df = pd.read_csv(movies_path, index_col='movieId')

# Split genres and convert into indicator variables:
genres = movies_df['genres'].str.get_dummies('|')
print(genres[["Action", "Adventure", "Drama", "Horror"]].head())

# Use genres as movie input features:
movie_feat = torch.from_numpy(genres.values).to(torch.float)
assert movie_feat.size() == (9742, 20)  # 20 genres in total.

         Action  Adventure  Drama  Horror
movieId                                  
1             0          1      0       0
2             0          1      0       0
3             0          0      0       0
4             0          0      1       0
5             0          0      0       0


In [7]:
# Load the entire ratings data frame into memory:
ratings_df = pd.read_csv(ratings_path)

# Create a mapping from unique user indices to range [0, num_user_nodes):
unique_user_id = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={
    'userId': unique_user_id,
    'mappedID': pd.RangeIndex(len(unique_user_id)),
})
print("Mapping of user IDs to consecutive values:")
print("==========================================")
print(unique_user_id.head())
print()
# Create a mapping from unique movie indices to range [0, num_movie_nodes):
unique_movie_id = ratings_df['movieId'].unique()
unique_movie_id = pd.DataFrame(data={
    'movieId': unique_movie_id,
    'mappedID': pd.RangeIndex(len(unique_movie_id)),
})
print("Mapping of movie IDs to consecutive values:")
print("===========================================")
print(unique_movie_id.head())

# Perform merge to obtain the edges from users and movies:
ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id,
                            left_on='userId', right_on='userId', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)
ratings_movie_id = pd.merge(ratings_df['movieId'], unique_movie_id,
                            left_on='movieId', right_on='movieId', how='left')
ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values)

# With this, we are ready to construct our `edge_index` in COO format
# following PyG semantics:
edge_index_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)
assert edge_index_user_to_movie.size() == (2, 100836)

print()
print("Final edge indices pointing from users to movies:")
print("=================================================")
print(edge_index_user_to_movie)


Mapping of user IDs to consecutive values:
   userId  mappedID
0       1         0
1       2         1
2       3         2
3       4         3
4       5         4

Mapping of movie IDs to consecutive values:
   movieId  mappedID
0        1         0
1        3         1
2        6         2
3       47         3
4       50         4

Final edge indices pointing from users to movies:
tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    1,    2,  ..., 3121, 1392, 2873]])


In [83]:

data = HeteroData()

## Keep data object to only transform it to the graph store after transformation
## but omit adding the feature vectors of movies.
# Save node indices:
data["user"].node_id = torch.arange(len(unique_user_id))
data["movie"].node_id = torch.arange(len(movies_df))

# Add the node features and edge indices:
# data["movie"].x = movie_feat # TODO
data["user", "rates", "movie"].edge_index = edge_index_user_to_movie  # TODO

# We also need to make sure to add the reverse edges from movies to users
# in order to let a GNN be able to pass messages in both directions.
# We can leverage the `T.ToUndirected()` transform for this from PyG:

# TODO:
# FROM HERE https://pytorch-geometric.readthedocs.io/en/latest/modules/transforms.html
transform = T.Compose([T.ToUndirected()])
data = transform(data)          

feature_store = HeteroData()
graph_store = HeteroData()

# Add the data to the feature store
feature_store.put_tensor(movie_feat, group_name='movie', attr_name='x', index=torch.arange(len(movies_df)))
# TODO: Is there a way to add a feature with only id to the feature store? 
feature_store.put_tensor(torch.arange(len(unique_user_id)),group_name='user', attr_name='x', index=torch.arange(len(unique_user_id)))


## Delay adding graph to graph store after linksplits 
## Add graph to graph store

# coo = (edge_index_user_to_movie[0], edge_index_user_to_movie[1])

# graph_store.put_edge_index(edge_index=coo,
#                            edge_type=('user', 'rates', 'movie'),
#                            layout='coo', size=(len(coo[0]),len(coo[1])) )

# # put reverse index
# graph_store.put_edge_index(edge_index=coo[::-1],
#                            edge_type=('movie', 'rev_rates', 'user'),
#                            layout='coo', size=(len(coo[1]),len(coo[0])) )


print(data)

assert data.node_types == ["user", "movie"]
assert data.edge_types == [("user", "rates", "movie"),
                           ("movie", "rev_rates", "user")]
assert data["user"].num_nodes == 610
assert data["user"].num_features == 0
assert data["movie"].num_nodes == 9742
#assert data["movie"].num_features == 20
assert data["user", "rates", "movie"].num_edges == 100836
assert data["movie", "rev_rates", "user"].num_edges == 100836

HeteroData(
  [1muser[0m={ node_id=[610] },
  [1mmovie[0m={ node_id=[9742] },
  [1m(user, rates, movie)[0m={ edge_index=[2, 100836] },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 100836] }
)


In [84]:
# For this, we first split the set of edges into
# training (80%), validation (10%), and testing edges (10%).
# Across the training edges, we use 70% of edges for message passing,
# and 30% of edges for supervision.
# We further want to generate fixed negative edges for evaluation with a ratio of 2:1.
# Negative edges during training will be generated on-the-fly.
# We can leverage the `RandomLinkSplit()` transform for this from PyG:
transform = T.RandomLinkSplit(
    num_val=0.1, 
    num_test=0.1,  
    disjoint_train_ratio=.3,  
    neg_sampling_ratio=2,  
    add_negative_train_samples=False,  
    edge_types=("user", "rates", "movie"),
    rev_edge_types=("movie", "rev_rates", "user"), 
)

train_data, val_data, test_data = transform(data)
print("Training data:")
print("==============")
print(train_data)
print()
print("Validation data:")
print("================")
print(val_data)

# coo = (edge_index_user_to_movie[0], edge_index_user_to_movie[1])

edge_index = train_data[("user", "rates", "movie")].edge_index
coo = (edge_index[0], edge_index[1])
graph_store.put_edge_index(edge_index = coo ,
                            edge_type = ('user', 'rates', 'movie'),
                            layout = 'coo',
                            size = (edge_index[0].size(dim=0),edge_index[1].size(dim=0)) )

# # put reverse index
coo_reverse=(edge_index[1], edge_index[0])
graph_store.put_edge_index(edge_index = coo_reverse,
                            edge_type = ('movie', 'rev_rates', 'user'),
                            layout = 'coo',
                            size = (edge_index[1].size(dim=0),edge_index[0].size(dim=0)) )



assert train_data["user", "rates", "movie"].num_edges == 56469
assert train_data["user", "rates", "movie"].edge_label_index.size(1) == 24201
assert train_data["movie", "rev_rates", "user"].num_edges == 56469
# No negative edges added:
assert train_data["user", "rates", "movie"].edge_label.min() == 1
assert train_data["user", "rates", "movie"].edge_label.max() == 1

assert val_data["user", "rates", "movie"].num_edges == 80670
assert val_data["user", "rates", "movie"].edge_label_index.size(1) == 30249
assert val_data["movie", "rev_rates", "user"].num_edges == 80670
# Negative edges with ratio 2:1:
assert val_data["user", "rates", "movie"].edge_label.long().bincount().tolist() == [20166, 10083]

Training data:
HeteroData(
  [1muser[0m={ node_id=[610] },
  [1mmovie[0m={ node_id=[9742] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 56469],
    edge_label=[24201],
    edge_label_index=[2, 24201]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 56469] }
)

Validation data:
HeteroData(
  [1muser[0m={ node_id=[610] },
  [1mmovie[0m={ node_id=[9742] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 80670],
    edge_label=[30249],
    edge_label_index=[2, 30249]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 80670] }
)


In [85]:
## Disregard - for now we assume the graph will fit in memory. 
# And we can keep a skeleton dataset to 


# Ideally we need to implement transform RandomLinkSplit
# However it has not been thought through with feature store... 
# the problem is we now have 3 objects with similar names (our feature store does not understand dataset types). 
# So either we would need to Create 3 instances of features store (easiest, but wasteful)
# or we need to add some sort of filter object or similar to the feature store / do a lot of re-engineering of all functions. 
# (right thing to do, but feels almost as a task / questions for PyG?)

In [96]:
# Define seed edges:
edge_label_index = train_data[("user", "rates", "movie")].edge_label_index
edge_label = train_data[("user", "rates", "movie")].edge_label


In [97]:
feature_store

HeteroData(
  [1mmovie[0m={ x=[9742, 20] },
  [1muser[0m={ x=[610] }
)

In [98]:
graph_store

HeteroData(
  [1muser[0m={ num_nodes=56469 },
  [1mmovie[0m={ num_nodes=56469 },
  [1m(user, rates, movie)[0m={ edge_index=[2, 56469] },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 56469] }
)

In [99]:
train_data[("user", "rates", "movie")].edge_label_index

tensor([[ 165,    8,  181,  ...,  104,  293,  469],
        [ 203,  834, 1042,  ..., 4344, 2588,  689]])

In [100]:
edge_label_index[0].max()

tensor(609)

In [101]:
edge_label_index[1].max()

tensor(9721)

In [103]:
train_loader = LinkNeighborLoader(
    data=(feature_store,graph_store),  # <--- Changed to feature store, graph store
    num_neighbors=[20,10],  # Original [20,10]
    neg_sampling_ratio=2,  
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=128, # Original 128
    shuffle=True,
)


In [104]:
# Inspect a sample:
sampled_data = next(iter(train_loader))

print("Sampled mini-batch:")
print("===================")
print(sampled_data)

assert sampled_data["user", "rates", "movie"].edge_label_index.size(1) == 3 * 128
assert sampled_data["user", "rates", "movie"].edge_label.min() == 0
assert sampled_data["user", "rates", "movie"].edge_label.max() == 1

IndexError: index 40827 is out of bounds for dimension 0 with size 9742

In [None]:
from torch_geometric.nn import SAGEConv, to_hetero
import torch_geometric
import torch.nn.functional as F


class GNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()

        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
         # Define a 2-layer GNN computation graph.
        # Use a *single* `ReLU` non-linearity in-between.
        # TODO:
        # https://pytorch-geometric.readthedocs.io/en/latest/modules/nn.html
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        return x
                   

# Our final classifier applies the dot-product between source and destination
# node embeddings to derive edge-level predictions:
class Classifier(torch.nn.Module):
    def forward(self, x_user: Tensor, x_movie: Tensor, edge_label_index: Tensor) -> Tensor:
        # Convert node embeddings to edge-level representations:
        edge_feat_user = x_user[edge_label_index[0]]
        edge_feat_movie = x_movie[edge_label_index[1]]

        # Apply dot-product to get a prediction per supervision edge:
        return (edge_feat_user * edge_feat_movie).sum(dim=-1)


class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        # Since the dataset does not come with rich features, we also learn two
        # embedding matrices for users and movies:
        self.movie_lin = torch.nn.Linear(20, hidden_channels)
        self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(data["movie"].num_nodes, hidden_channels)

        # Instantiate homogeneous GNN:
        self.gnn = GNN(hidden_channels)

        # Convert GNN model into a heterogeneous variant:
        self.gnn = to_hetero(self.gnn, metadata=data.metadata())

        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        x_dict = {
          "user": self.user_emb(data["user"].node_id),
          "movie": self.movie_lin(data["movie"].x) + self.movie_emb(data["movie"].node_id),
        } 

        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        x_dict = self.gnn(x_dict, data.edge_index_dict)

        pred = self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user", "rates", "movie"].edge_label_index,
        )

        return pred

        
model = Model(hidden_channels=64)

print(model)

In [None]:
import tqdm
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, 6):
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()

        # TODO: Move `sampled_data` to the respective `device`
        # TODO: Run `forward` pass of the model
        # TODO: Apply binary cross entropy via
        # `F.binary_cross_entropy_with_logits(pred, ground_truth)`
        sampled_data.to(device)
        pred = model(sampled_data)
        ground_truth = sampled_data["user", "rates", "movie"].edge_label
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
                        
        loss.backward()
        optimizer.step()
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

In [None]:
# Define the validation seed edges:
edge_label_index = val_data["user", "rates", "movie"].edge_label_index
edge_label = val_data["user", "rates", "movie"].edge_label

val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=3 * 128,
    shuffle=False,
)

sampled_data = next(iter(val_loader))

print("Sampled mini-batch:")
print("===================")
print(sampled_data)

assert sampled_data["user", "rates", "movie"].edge_label_index.size(1) == 3 * 128
assert sampled_data["user", "rates", "movie"].edge_label.min() >= 0
assert sampled_data["user", "rates", "movie"].edge_label.max() <= 1

In [None]:
from sklearn.metrics import roc_auc_score

preds = []
ground_truths = []
for sampled_data in tqdm.tqdm(val_loader):
    with torch.no_grad():
        # TODO: Collect predictions and ground-truths and write them into
        # `preds` and `ground_truths`.
        sampled_data.to(device)
        preds.append(model(sampled_data))
        ground_truths.append(sampled_data["user", "rates", "movie"].edge_label)

pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
auc = roc_auc_score(ground_truth, pred)
print()
print(f"Validation AUC: {auc:.4f}")