# Link Prediction on the MovieLens Dataset

# Import libraries

In [1]:
import torch
from torch import Tensor
import torch_geometric
import os
import pandas as pd


from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import SAGEConv, to_hetero
import tqdm
from torch.nn import  ReLU
import torch.nn.functional as F


In [None]:
# Download additional libraries
%pip install torch-scatter -f https://data.pyg.org/whl/torch-${torch.__version__}.html
%pip install torch-sparse -f https://data.pyg.org/whl/torch-${torch.__version__}.html
%pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-${torch.__version__}.html

In [2]:
from torch_geometric.data import download_url, extract_zip

# Fetch the MovieLens archive into the working directory, then unpack it in place.
url = "https://files.grouplens.org/datasets/movielens/ml-latest.zip"
archive_path = download_url(url, ".")
extract_zip(archive_path, ".")



Using existing file ml-latest.zip
Extracting ./ml-latest.zip


In [2]:

# Locations of the extracted MovieLens CSV files read by the loading cells below.
movies_path = './ml-latest/movies.csv'
ratings_path = './ml-latest/ratings.csv'

# Check and preprocess movies_df

In [3]:
# Load movie data, using the `movieId` column as the DataFrame index
movies_df = pd.read_csv(movies_path, index_col="movieId")

movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [4]:
# Split the pipe-separated `genres` string into one 0/1 indicator column per
# genre (create dummy variables); rows stay aligned with `movies_df`
genres = movies_df['genres'].str.get_dummies("|")
genres.head()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Use the genre indicator columns as movie input features (node features),
# converted to a float tensor with one row per movie
movie_features = torch.from_numpy(genres.values).to(torch.float)
assert movie_features.size() == (86537, 20) # 20 genre columns in total, including "(no genres listed)"

In [6]:
# Create a mapping from raw movieId values to consecutive node indices
# 0 .. num_movie_nodes - 1. The indices follow the row order of `movies_df`,
# which is also the row order of `movie_features`.
unique_movie_id = pd.DataFrame(data={
    'movieId': movies_df.index,
    "mappedID": pd.RangeIndex(len(movies_df)),
})
unique_movie_id.head()

Unnamed: 0,movieId,mappedID
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


# Check and preprocess ratings_df

In [7]:
# Load ratings data: one row per (userId, movieId) rating with its timestamp
ratings_df = pd.read_csv(ratings_path)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [8]:
# Map every distinct raw userId (in order of first appearance in the ratings)
# to a consecutive node index 0 .. num_user_nodes - 1
distinct_user_ids = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={
    'userId': distinct_user_ids,
    "mappedID": pd.RangeIndex(len(distinct_user_ids)),
})
unique_user_id.head()

Unnamed: 0,userId,mappedID
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


# Creating `edge_index` in COO Format for User-Movie Relationships 

In [9]:
# Translate the raw userId / movieId of every rating into its consecutive node
# index by left-joining the rating columns against the two lookup tables.
user_lookup = pd.merge(ratings_df['userId'], unique_user_id,
                       on='userId', how='left')
ratings_user_id = torch.from_numpy(user_lookup['mappedID'].values)

movie_lookup = pd.merge(ratings_df['movieId'], unique_movie_id,
                        on='movieId', how='left')
ratings_movie_id = torch.from_numpy(movie_lookup['mappedID'].values)

# Assemble `edge_index` in COO format following PyG semantics:
# row 0 holds the source (user) index, row 1 the destination (movie) index.
edge_index_user_to_movie = torch.stack((ratings_user_id, ratings_movie_id), dim=0)
assert edge_index_user_to_movie.size() == (2, 33832162)

# Data view

In [10]:
# Preview the first rows of the userId -> mappedID lookup table.
print("Mapping of user IDs to consecutive values:")
print("==========================================")
print(unique_user_id.head())

Mapping of user IDs to consecutive values:
   userId  mappedID
0       1         0
1       2         1
2       3         2
3       4         3
4       5         4


In [11]:
# Preview the first rows of the movieId -> mappedID lookup table.
print("Mapping of movie IDs to consecutive values:")
print("===========================================")
print(unique_movie_id.head())

Mapping of movie IDs to consecutive values:
   movieId  mappedID
0        1         0
1        2         1
2        3         2
3        4         3
4        5         4


In [12]:
# Show the (2, num_ratings) edge tensor: row 0 = user index, row 1 = movie index.
print("Final edge indices pointing from users to movies:")
print("=================================================")
print(edge_index_user_to_movie)

Final edge indices pointing from users to movies:
tensor([[     0,      0,      0,  ..., 330974, 330974, 330974],
        [     0,    108,    156,  ...,   7911,   7954,   8071]])


# Creating a graph-based HeteroData object (where nodes belong to different types)

In [13]:
# Assemble the heterogeneous graph. The 'user' store is touched before the
# 'movie' store, which is the node-type order the checks below expect.
data = HeteroData()

# Save node indices (consecutive mapped IDs) for both node types
data['user'].node_id = torch.arange(len(unique_user_id))
data['movie'].node_id = torch.arange(len(movies_df))

# Add the node features and edge indices; only movies carry input features
data['movie'].x = movie_features

data['user', 'rates', 'movie'].edge_index = edge_index_user_to_movie

# We also need to add the reverse edges from movies to users so that a GNN
# model can pass messages in both directions.
# The `ToUndirected()` transform does this.

data = T.ToUndirected()(data)

In [14]:
# Check data: sanity-check the assembled graph before splitting

# Node and edge types; the `rev_rates` relation was contributed by ToUndirected()
assert data.node_types == ['user', 'movie']
assert data.edge_types == [("user", "rates", "movie"),
                           ("movie", "rev_rates", "user")]
# Users only have a `node_id` (no `x`), hence zero input features
assert data['user'].num_nodes == 330975
assert data['user'].num_features == 0
assert data['movie'].num_nodes == 86537
assert data['movie'].num_features == 20
# Every rating appears once as a forward edge and once as a reverse edge
assert data['user', 'rates', 'movie'].num_edges == 33832162
assert data['movie','rev_rates', 'user'].num_edges == 33832162


# Defining Edge-level Training Splits

In [15]:
# Edge-level split of the ('user', 'rates', 'movie') relation into
# train / validation / test graphs. The expected sizes are pinned by the
# assertions in the next cell.
transform = T.RandomLinkSplit(
    # 10% of the edges are held out for validation and 10% for testing
    num_val=0.1,
    num_test=0.1,
    # 30% of the training edges are used for supervision only; the other 70%
    # remain in the graph for message passing
    disjoint_train_ratio = 0.3,
    # validation/test get two negative edges per positive edge
    neg_sampling_ratio = 2.0,
    # the training split keeps positive labels only; negatives are drawn
    # later by the training loader
    add_negative_train_samples=False,
    edge_types = ('user', 'rates', 'movie'),
    rev_edge_types = ('movie', 'rev_rates', 'user'),
)

train_data, val_data, test_data = transform(data)

In [16]:
# Check splits
# Training graph: message-passing edges vs. disjoint supervision edges
# (18946011 + 8119719 = 27065730 training edges in total)
assert train_data['user', 'rates', 'movie'].num_edges == 18946011
assert train_data['user', 'rates', 'movie'].edge_label_index.size(1) == 8119719
assert train_data['movie', 'rev_rates', 'user'].num_edges == 18946011


# No negative edges added:
assert train_data["user", "rates", "movie"].edge_label.min() == 1
assert train_data["user", "rates", "movie"].edge_label.max() == 1

# Validation graph: message passing uses all 27065730 training edges
assert val_data["user", "rates", "movie"].num_edges == 27065730
assert val_data["user", "rates", "movie"].edge_label_index.size(1) == 10149648
assert val_data["movie", "rev_rates", "user"].num_edges == 27065730
# Negative edges with ratio 2:1:
assert val_data["user", "rates", "movie"].edge_label.long().bincount().tolist() == [6766432, 3383216]

In [30]:
# Define seed edges: the training supervision edges and their (all-positive) labels
edge_label_index = train_data["user", "rates", "movie"].edge_label_index
edge_label = train_data["user", "rates", "movie"].edge_label

train_loader = LinkNeighborLoader(
    data=train_data,  # graph to sample neighborhoods from
    num_neighbors=[20, 10],  # per-hop neighbor sample sizes (two hops)
    neg_sampling_ratio=2.0,  # two negative edges are added per positive seed edge
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=524288, # positive seed edges per mini-batch (before negatives are added)
    shuffle=True,
)

# Inspect a sample:
sampled_data = next(iter(train_loader))

print("Sampled mini-batch:")
print("===================")
print(sampled_data)

# One batch holds the positive seeds plus twice as many negatives, so labels span {0, 1}
assert sampled_data["user", "rates", "movie"].edge_label_index.size(1) == 3 * 524288
assert sampled_data["user", "rates", "movie"].edge_label.min() == 0
assert sampled_data["user", "rates", "movie"].edge_label.max() == 1

Sampled mini-batch:
HeteroData(
  user={
    node_id=[324236],
    n_id=[324236],
    num_sampled_nodes=[3],
  },
  movie={
    node_id=[86536],
    x=[86536, 20],
    n_id=[86536],
    num_sampled_nodes=[3],
  },
  (user, rates, movie)={
    edge_index=[2, 592385],
    edge_label=[1572864],
    edge_label_index=[2, 1572864],
    e_id=[592385],
    num_sampled_edges=[2],
    input_id=[524288],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 4499094],
    e_id=[4499094],
    num_sampled_edges=[2],
  }
)


# Creating a Heterogeneous Link-level GNN

In [68]:
from torch_geometric.nn import SAGEConv, to_hetero
from torch.nn import  ReLU
import torch.nn.functional as F


class GNN(torch.nn.Module):
    """Two-layer GraphSAGE encoder that keeps the feature width constant."""

    def __init__(self, hidden_channels):
        """Create two `SAGEConv` layers mapping `hidden_channels` -> `hidden_channels`."""
        super().__init__()

        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        """Apply conv1 -> ReLU -> conv2 over `edge_index` and return the node embeddings."""
        # A single non-linearity sits between the two message-passing layers.
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

# The final classifier scores every supervision edge with the dot-product of
# its source (user) and destination (movie) node embeddings:
class Classifier(torch.nn.Module):
    """Dot-product edge decoder; it defines no parameters of its own."""

    def forward(self, x_user: Tensor, x_movie: Tensor, edge_label_index: Tensor) -> Tensor:
        """Return one score per column of `edge_label_index`.

        Row 0 of `edge_label_index` indexes into `x_user`, row 1 into `x_movie`.
        """
        user_rows = edge_label_index[0]
        movie_rows = edge_label_index[1]

        # Gather the two endpoint embeddings per edge, multiply element-wise,
        # and reduce over the feature dimension.
        per_edge_product = x_user[user_rows] * x_movie[movie_rows]
        return per_edge_product.sum(dim=-1)


class Model(torch.nn.Module):
    """Heterogeneous link predictor: input encoders -> hetero GNN -> dot-product classifier."""

    def __init__(self, hidden_channels):
        """Build all sub-modules.

        Note: node counts and graph metadata are read from the notebook-level
        `data` object, which must exist before the model is instantiated.
        """
        super().__init__()
        # The dataset has no rich features, so learnable embedding tables are
        # used for users and movies; movies additionally get a linear
        # projection of their 20 genre indicators.
        self.movie_lin = torch.nn.Linear(20, hidden_channels)
        self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(data["movie"].num_nodes, hidden_channels)

        # Build the homogeneous GNN and convert it into its heterogeneous variant.
        self.gnn = to_hetero(GNN(hidden_channels), metadata=data.metadata())

        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        """Return one prediction per entry of the batch's ('user', 'rates', 'movie') `edge_label_index`."""
        movie_store = data["movie"]

        # Initial node representations, looked up through each store's `node_id`.
        user_x = self.user_emb(data["user"].node_id)
        movie_x = self.movie_lin(movie_store.x) + self.movie_emb(movie_store.node_id)

        # The GNN takes feature matrices per node type and edge indices per
        # edge type, and returns updated embeddings per node type.
        x_dict = self.gnn({"user": user_x, "movie": movie_x}, data.edge_index_dict)

        return self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user", "rates", "movie"].edge_label_index,
        )

# Instantiate the model with 64-dimensional hidden representations and print
# its module tree.
model = Model(hidden_channels=64)

print(model)

Model(
  (movie_lin): Linear(in_features=20, out_features=64, bias=True)
  (user_emb): Embedding(330975, 64)
  (movie_emb): Embedding(86537, 64)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv(64, 64, aggr=mean)
      (movie__rev_rates__user): SAGEConv(64, 64, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv(64, 64, aggr=mean)
      (movie__rev_rates__user): SAGEConv(64, 64, aggr=mean)
    )
  )
  (classifier): Classifier()
)


# Training a Heterogeneous Link-Level GNN

In [70]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for epoch in range(1, 2):
    # The evaluation cells switch the model to eval mode; make sure training
    # always runs in train mode even when this cell is re-run afterwards.
    model.train()
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()

        sampled_data = sampled_data.to(device)

        # One raw score (logit) per seed edge of the mini-batch.
        pred = model(sampled_data)

        # The labels were moved to `device` together with the batch above.
        ground_truth = sampled_data['user', 'rates', 'movie'].edge_label

        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)

        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size so that the reported
        # value is an average over all seed edges of the epoch.
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")


100%|██████████| 16/16 [00:08<00:00,  1.78it/s]

Epoch: 001, Loss: 0.5549





# Evaluating

In [71]:
# Seed edges for evaluation: the validation supervision edges and their labels.
# The validation split already contains its negative edges, so the loader does
# no negative sampling of its own.
val_edge_store = val_data['user', 'rates', 'movie']
edge_label_index = val_edge_store.edge_label_index
edge_label = val_edge_store.edge_label

val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 10],
    edge_label_index=(('user', 'rates', 'movie'), edge_label_index),
    edge_label=edge_label,
    batch_size=524288,
    shuffle=False,
)

# Inspect the first validation mini-batch.
sampled_data = next(iter(val_loader))
print("Sampled mini-batch:", "===================", sampled_data, sep="\n")

Sampled mini-batch:
HeteroData(
  user={
    node_id=[173060],
    n_id=[173060],
    num_sampled_nodes=[3],
  },
  movie={
    node_id=[32738],
    x=[32738, 20],
    n_id=[32738],
    num_sampled_nodes=[3],
  },
  (user, rates, movie)={
    edge_index=[2, 446058],
    edge_label=[524288],
    edge_label_index=[2, 524288],
    e_id=[446058],
    num_sampled_edges=[2],
    input_id=[524288],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 3058504],
    e_id=[3058504],
    num_sampled_edges=[2],
  }
)


In [72]:
from sklearn.metrics import roc_auc_score
preds = []
ground_truths = []

# Set the model to evaluation mode
model.eval()

# Inference only: without `no_grad` every batch would keep its autograd graph
# alive until the final concatenation.
with torch.no_grad():
    for sampled_data in tqdm.tqdm(val_loader):
        sampled_data = sampled_data.to(device)
        pred = model(sampled_data)
        ground_truth = sampled_data['user', 'rates', 'movie'].edge_label

        # Move results off the device right away so that device memory does
        # not grow with the number of validation batches.
        preds.append(pred.cpu())
        ground_truths.append(ground_truth.cpu())

# Concatenate predictions and ground truths
pred = torch.cat(preds, dim=0).numpy()
ground_truth = torch.cat(ground_truths, dim=0).numpy()

# Calculate the AUC score (raw scores are fine: AUC only depends on ranking)
auc = roc_auc_score(ground_truth, pred)

print(f"Validation AUC: {auc:.4f}")




100%|██████████| 20/20 [00:06<00:00,  3.06it/s]


Validation AUC: 0.8532


In [73]:
from collections import defaultdict

# Requires `model`, `val_loader` and `device` from the cells above.
model.eval()  # Ensure the model is in evaluation mode

# mapped user index -> list of (mapped movie index, predicted score).
# Both are the consecutive "mappedID" values; use `unique_user_id` /
# `unique_movie_id` to translate back to the raw MovieLens ids.
recommendations = defaultdict(list)

with torch.no_grad():  # Disable gradient calculation for inference
    for sampled_data in tqdm.tqdm(val_loader):
        # Move data to the device the model is on (e.g., GPU)
        sampled_data = sampled_data.to(device)

        # One raw score per seed edge of this mini-batch
        pred = model(sampled_data)

        # `edge_label_index` holds positions *within this mini-batch* (it is
        # what the classifier uses to index the batch's embeddings). Translate
        # them through `node_id` so that entries from different batches refer
        # to the same global user / movie.
        seed_index = sampled_data['user', 'rates', 'movie'].edge_label_index
        user_ids = sampled_data['user'].node_id[seed_index[0]]
        movie_ids = sampled_data['movie'].node_id[seed_index[1]]

        # Convert each tensor to a Python list once instead of calling
        # `.item()` per element, which forces a device sync per value.
        for user_id, movie_id, score in zip(user_ids.tolist(),
                                            movie_ids.tolist(),
                                            pred.tolist()):
            recommendations[user_id].append((movie_id, score))

# NOTE: the validation seed edges include sampled negative pairs, so the
# candidates per user are a mix of rated and non-rated movies.

# Sort each user's candidates by score (descending) and show the top N movies
top_n = 10  # Set how many top movies to recommend for each user
max_users_shown = 5  # Printing every user would flood the notebook output
for shown, (user_id, movie_scores) in enumerate(recommendations.items()):
    if shown >= max_users_shown:
        break

    # Sort the list of (movie_id, score) by score in descending order
    top_movies = sorted(movie_scores, key=lambda x: x[1], reverse=True)[:top_n]

    print(f"\nTop {top_n} recommendations for user {user_id}:")
    for movie_id, score in top_movies:
        print(f"Movie ID: {movie_id}, Predicted Score: {score:.4f}")


 95%|█████████▌| 19/20 [05:11<00:16, 16.40s/it]

In [66]:
# Number of labels currently bound to `ground_truth`.
# NOTE(review): the stored output (766197) differs from the 10149648 validation
# labels asserted earlier and the cell number precedes the evaluation cell's,
# so it looks stale; re-run after evaluation.
len(ground_truth)

766197

In [65]:
# Number of predictions currently bound to `pred`; should equal
# len(ground_truth) when run right after the AUC evaluation cell.
# NOTE(review): the recommendation cell rebinds `pred` to a per-batch tensor,
# so the result depends on execution order.
len(pred)

766197

In [62]:
# Number of user nodes in the validation graph; the stored output matches the
# user count asserted for the full graph.
val_data['user'].num_nodes

330975