# Link Prediction using Movielens dataset

# Import libraries

In [1]:
import torch
from torch import Tensor
import torch_geometric
import os
import pandas as pd


from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.nn import SAGEConv, to_hetero
import tqdm
from torch.nn import  ReLU
import torch.nn.functional as F


In [None]:
# Download additional libraries
%pip install torch-scatter -f https://data.pyg.org/whl/torch-${torch.__version__}.html
%pip install torch-sparse -f https://data.pyg.org/whl/torch-${torch.__version__}.html
%pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-${torch.__version__}.html

In [None]:
from torch_geometric.data import download_url, extract_zip

# Download the *small* MovieLens archive: it unpacks into ./ml-latest-small/,
# which is the directory the CSV paths in this notebook read from.
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
extract_zip(download_url(url, "."), ".")



In [2]:

# Locations of the two MovieLens CSV files used throughout the notebook.
movies_path, ratings_path = (
    './ml-latest-small/movies.csv',
    './ml-latest-small/ratings.csv',
)

# Check and preprocess movies_df

In [3]:
# Read the movie table and key it by the MovieLens movie id
movies_df = pd.read_csv(movies_path).set_index("movieId")

movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [4]:
# One-hot encode the pipe-separated genre string: one 0/1 column per genre
genre_strings = movies_df['genres']
genres = genre_strings.str.get_dummies(sep="|")
genres.head()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Use the genre indicator matrix as the movie input features (node features)
movie_features = torch.from_numpy(genres.to_numpy()).float()

In [6]:
# Pair every movie id with a consecutive index 0 .. num_movie_nodes - 1,
# following the row order of `movies_df`
unique_movie_id = (
    movies_df.index
    .to_frame(index=False)
    .assign(mappedID=lambda frame: pd.RangeIndex(len(frame)))
)
unique_movie_id.head()

Unnamed: 0,movieId,mappedID
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


# Check and preprocess ratings_df

In [9]:
# Read the ratings and keep only the rows rated 4.0, 4.5 or 5.0
ratings_df = (
    pd.read_csv(ratings_path)
    .loc[lambda frame: frame['rating'].isin([4.0, 4.5, 5.0])]
    .copy()
)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [10]:
# Show how many of the retained ratings fall on each rating value
ratings_df['rating'].value_counts()

rating
4.0    26818
5.0    13211
4.5     8551
Name: count, dtype: int64

In [11]:
# Pair every distinct user id (in order of first appearance in the ratings)
# with a consecutive index 0 .. num_user_nodes - 1
unique_user_id = (
    ratings_df['userId']
    .drop_duplicates()
    .reset_index(drop=True)
    .to_frame()
    .assign(mappedID=lambda frame: pd.RangeIndex(len(frame)))
)
unique_user_id.head()

Unnamed: 0,userId,mappedID
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


# Creating `edge_index` in COO Format for User-Movie Relationships 

In [12]:
# Translate the raw user id of every rating into its consecutive node index
ratings_user_id = torch.from_numpy(
    ratings_df[['userId']]
    .merge(unique_user_id, on='userId', how='left')['mappedID']
    .values
)

# Same translation for the movie side of every rating
ratings_movie_id = torch.from_numpy(
    ratings_df[['movieId']]
    .merge(unique_movie_id, on='movieId', how='left')['mappedID']
    .values
)

# COO edge index in PyG layout: row 0 holds source (user) indices,
# row 1 holds destination (movie) indices
edge_index_user_to_movie = torch.stack((ratings_user_id, ratings_movie_id))

# Data view

In [13]:
# Preview the user id -> node index table
print(
    "Mapping of user IDs to consecutive values:",
    "==========================================",
    unique_user_id.head(),
    sep="\n",
)

Mapping of user IDs to consecutive values:
   userId  mappedID
0       1         0
1       2         1
2       3         2
3       4         3
4       5         4


In [14]:
# Preview the movie id -> node index table
print(
    "Mapping of movie IDs to consecutive values:",
    "===========================================",
    unique_movie_id.head(),
    sep="\n",
)

Mapping of movie IDs to consecutive values:
   movieId  mappedID
0        1         0
1        2         1
2        3         2
3        4         3
4        5         4


In [15]:
# Show the assembled user -> movie edge index tensor
print(
    "Final edge indices pointing from users to movies:",
    "=================================================",
    edge_index_user_to_movie,
    sep="\n",
)

Final edge indices pointing from users to movies:
tensor([[   0,    0,    0,  ...,  608,  608,  608],
        [   0,    2,    5,  ..., 9461, 9462, 9463]])


# Creating graph based HeteroData (where nodes have different origins)

In [16]:
# Container for the heterogeneous graph with 'user' and 'movie' node types
data = HeteroData()

# Save node indices: one consecutive id per user / movie. The model later
# uses `node_id` to look up the learned embedding of each node.
data['user'].node_id = torch.arange(len(unique_user_id))
data['movie'].node_id = torch.arange(len(movies_df))

# Add the node features and edge indices.
# Only movies carry input features (the genre indicators); users have none.
data['movie'].x = movie_features

data['user', 'rates', 'movie'].edge_index = edge_index_user_to_movie

# We also need to add the reverse edges from movies to users
# so that a GNN model can pass messages in both directions.
# The `ToUndirected()` transform does this; it must run after the
# forward edges have been stored on `data`.

data = T.ToUndirected()(data)

# Defining Edge-level Training Splits

In [17]:
# Edge-level split of the ('user', 'rates', 'movie') relation:
# 10% of the edges go to validation and 10% to test. The reverse relation is
# named so the transform can keep it in sync with the forward one.
transform = T.RandomLinkSplit(
    edge_types=('user', 'rates', 'movie'),
    rev_edge_types=('movie', 'rev_rates', 'user'),
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio=0.3,
    neg_sampling_ratio=2.0,
    # No negatives are stored for the training split here; the training
    # loader below is configured with its own `neg_sampling_ratio`.
    add_negative_train_samples=False,
)

split_graphs = transform(data)
train_data, val_data, test_data = split_graphs

In [18]:
# Seed (supervision) edges of the training split and their labels
train_supervision = train_data["user", "rates", "movie"]
edge_label_index = train_supervision.edge_label_index
edge_label = train_supervision.edge_label

# Mini-batch loader over the training seed edges.
# Neighbors are sampled with fan-out 20 then 10, and the loader is asked for
# negative edges at a 2.0 ratio. The batch size is large enough that each
# epoch runs as a single batch (see the 1/1 progress bars in the training output).
train_loader = LinkNeighborLoader(
    train_data,
    num_neighbors=[20, 10],
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    neg_sampling_ratio=2.0,
    batch_size=524288,
    shuffle=True,
)


# Evaluation split

In [19]:
# Seed edges and labels of the validation split
val_supervision = val_data['user', 'rates', 'movie']
edge_label_index = val_supervision.edge_label_index
edge_label = val_supervision.edge_label

# Validation loader: same neighbor fan-out as training, fixed order, and no
# negative sampling configured on the loader itself.
val_loader = LinkNeighborLoader(
    val_data,
    num_neighbors=[20, 10],
    edge_label_index=(('user', 'rates', 'movie'), edge_label_index),
    edge_label=edge_label,
    batch_size=524288,
    shuffle=False,
)



## Metrics for evaluation

In [28]:
import numpy as np
from sklearn.metrics import precision_score, recall_score
def precision_at_k(y_true, y_pred, k):
    """Fraction of the ``k`` highest-scored items that are relevant.

    Args:
        y_true: Array-like of 0/1 relevance labels, aligned with ``y_pred``.
        y_pred: Array-like of predicted scores (higher means more relevant).
        k: Number of top-ranked items to inspect; must be positive.

    Returns:
        Sum of the labels among the top ``k`` predictions divided by ``k``.
        If fewer than ``k`` items are supplied, the divisor is still ``k``.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    # Accept plain sequences as well as arrays; fancy indexing below needs arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Indices ordered from highest to lowest predicted score
    ranking = np.argsort(y_pred)[::-1]
    top_k_labels = y_true[ranking][:k]

    return np.sum(top_k_labels) / k

def normalized_recall_at_k(y_true,
                         y_pred,
                         k: int) -> float:
    """Recall among the top ``k`` predictions, normalised by its best achievable value.

    The number of relevant items found in the top ``k`` is divided by
    ``min(k, total_relevant)``, so a perfect ranking scores 1.0 even when
    there are more relevant items than ``k``.

    Args:
        y_true: Array-like of 0/1 relevance labels, aligned with ``y_pred``.
        y_pred: Array-like of predicted scores (higher means more relevant).
        k: Number of top-ranked items to inspect; must be positive.

    Returns:
        The normalised recall, or ``0.0`` when ``y_true`` has no relevant item.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    # Accept plain sequences as well as arrays; fancy indexing below needs arrays.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    total_relevant = np.sum(y_true)
    if total_relevant == 0:
        return 0.0

    # Indices ordered from highest to lowest predicted score
    ranking = np.argsort(y_pred)[::-1]
    top_k_labels = y_true[ranking][:k]

    # At most min(k, total_relevant) relevant items can appear in the top k
    return np.sum(top_k_labels) / min(k, total_relevant)


# Creating a Heterogeneous Link-level GraphSAGE GNN

In [21]:
from torch_geometric.nn import SAGEConv, to_hetero
from torch.nn import  ReLU
import torch.nn.functional as F


class GNN(torch.nn.Module):
    """Two stacked ``SAGEConv`` layers of equal width with one ReLU in between."""

    def __init__(self, hidden_channels):
        super().__init__()

        # Both layers map hidden_channels -> hidden_channels
        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        """Run both convolutions over ``edge_index``; no activation after the second."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

# The final classifier scores each supervision edge with the dot product of
# its source (user) and destination (movie) node embeddings.
class Classifier(torch.nn.Module):
    def forward(self, x_user: Tensor, x_movie: Tensor, edge_label_index: Tensor) -> Tensor:
        """Return one score per column of ``edge_label_index``.

        Row 0 of ``edge_label_index`` indexes into ``x_user`` and row 1 into
        ``x_movie``; the score is the sum over the last dimension of the
        element-wise product of the two gathered embeddings.
        """
        src, dst = edge_label_index[0], edge_label_index[1]
        return torch.sum(x_user[src] * x_movie[dst], dim=-1)


class Model(torch.nn.Module):
    """Heterogeneous link-prediction model: embeddings -> GNN -> dot-product scorer.

    The constructor reads the notebook-level graph ``data`` to size the
    embedding tables and the movie feature projection, and to obtain the
    metadata needed by ``to_hetero``.

    Args:
        hidden_channels: Width of the embeddings and of both GNN layers.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        # Since the dataset does not come with rich features, we also learn two
        # embedding matrices for users and movies.
        # The input width of the movie projection is taken from the stored
        # feature matrix rather than hard-coded, so it always matches the
        # number of genre columns.
        num_movie_features = data["movie"].x.size(-1)
        self.movie_lin = torch.nn.Linear(num_movie_features, hidden_channels)
        self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(data["movie"].num_nodes, hidden_channels)

        # Instantiate homogeneous GNN:
        self.gnn = GNN(hidden_channels)

        # Convert GNN model into a heterogeneous variant:
        self.gnn = to_hetero(self.gnn, metadata=data.metadata())

        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        """Score the ('user', 'rates', 'movie') supervision edges of ``data``.

        Args:
            data: Graph (typically a sampled mini-batch) providing ``node_id``
                for both node types, ``x`` for movies, ``edge_index_dict`` and
                the ``edge_label_index`` of the 'rates' relation. This
                parameter shadows the notebook-level ``data`` inside the method.

        Returns:
            One raw score per supervision edge.
        """
        x_dict = {
          # Users have no input features: use the learned embedding only
          "user": self.user_emb(data["user"].node_id),
          # Movies: projected genre features plus a learned embedding
          "movie": self.movie_lin(data["movie"].x) + self.movie_emb(data["movie"].node_id),
        }

        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        x_dict = self.gnn(x_dict, data.edge_index_dict)

        pred = self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user", "rates", "movie"].edge_label_index,
        )

        return pred

# Build the model; 256 is the width used for the embeddings and GNN layers
model = Model(hidden_channels=256)

# Print the module tree to inspect the layers that were created
print(model)

Model(
  (movie_lin): Linear(in_features=20, out_features=256, bias=True)
  (user_emb): Embedding(609, 256)
  (movie_emb): Embedding(9742, 256)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv(256, 256, aggr=mean)
      (movie__rev_rates__user): SAGEConv(256, 256, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv(256, 256, aggr=mean)
      (movie__rev_rates__user): SAGEConv(256, 256, aggr=mean)
    )
  )
  (classifier): Classifier()
)


# Training a Heterogeneous Link-Level GNN

In [22]:
def bpr_loss(pos_pred, neg_pred, neg_sampling_ratio):
    """Bayesian Personalized Ranking loss over positive/negative score pairs.

    Each positive score is compared against ``neg_sampling_ratio`` negative
    scores, taken from ``neg_pred`` in consecutive groups, and the loss is the
    mean of ``-log(sigmoid(pos - neg))`` over all pairs.

    Args:
        pos_pred: 1-D tensor of scores for positive edges.
        neg_pred: 1-D tensor with ``len(pos_pred) * neg_sampling_ratio`` scores
            for negative edges.
        neg_sampling_ratio: Number of negatives per positive; integral floats
            such as ``2.0`` are accepted.

    Returns:
        Scalar loss tensor.
    """
    # One row of negatives per positive
    neg_pred = neg_pred.reshape(pos_pred.size(0), int(neg_sampling_ratio))
    # Column vector so it broadcasts across each row of negatives
    pos_pred = pos_pred.unsqueeze(1)
    # logsigmoid stays finite where log(sigmoid(x)) underflows to -inf
    # for strongly negative margins
    return -torch.nn.functional.logsigmoid(pos_pred - neg_pred).mean()

    

In [23]:
# Train on the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


for epoch in range(1, 31):
    # Make sure the model is in training mode even if this cell is re-run
    # after the evaluation cell has switched it to eval mode
    model.train()
    total_loss = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()

        # Moving the batch is enough: predictions and labels then already
        # live on `device`
        sampled_data = sampled_data.to(device)
        pred = model(sampled_data)

        edge_label = sampled_data['user', 'rates', 'movie'].edge_label

        # Separate the scores of positive and negative supervision edges
        pos_pred = pred[edge_label == 1]
        neg_pred = pred[edge_label == 0]

        # The ratio must match the `neg_sampling_ratio` of `train_loader`
        loss = bpr_loss(pos_pred, neg_pred, neg_sampling_ratio=2)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    # `total_loss` is the sum of the batch losses of this epoch
    print(f"Epoch: {epoch:03d}, Loss: {total_loss:.4f}")


100%|██████████| 1/1 [00:00<00:00,  2.74it/s]


Epoch: 001, Loss: 1.0365


100%|██████████| 1/1 [00:00<00:00,  2.01it/s]


Epoch: 002, Loss: 0.6212


100%|██████████| 1/1 [00:00<00:00,  3.39it/s]


Epoch: 003, Loss: 0.5042


100%|██████████| 1/1 [00:00<00:00,  4.07it/s]


Epoch: 004, Loss: 0.3829


100%|██████████| 1/1 [00:00<00:00,  4.39it/s]


Epoch: 005, Loss: 0.3248


100%|██████████| 1/1 [00:00<00:00,  4.37it/s]


Epoch: 006, Loss: 0.3561


100%|██████████| 1/1 [00:00<00:00,  4.80it/s]


Epoch: 007, Loss: 0.2691


100%|██████████| 1/1 [00:00<00:00,  4.59it/s]


Epoch: 008, Loss: 0.2209


100%|██████████| 1/1 [00:00<00:00,  4.49it/s]


Epoch: 009, Loss: 0.2219


100%|██████████| 1/1 [00:00<00:00,  4.80it/s]


Epoch: 010, Loss: 0.2191


100%|██████████| 1/1 [00:00<00:00,  4.65it/s]


Epoch: 011, Loss: 0.2011


100%|██████████| 1/1 [00:00<00:00,  4.46it/s]


Epoch: 012, Loss: 0.1840


100%|██████████| 1/1 [00:00<00:00,  4.48it/s]


Epoch: 013, Loss: 0.1660


100%|██████████| 1/1 [00:00<00:00,  4.81it/s]


Epoch: 014, Loss: 0.1549


100%|██████████| 1/1 [00:00<00:00,  4.94it/s]


Epoch: 015, Loss: 0.1527


100%|██████████| 1/1 [00:00<00:00,  4.86it/s]


Epoch: 016, Loss: 0.1473


100%|██████████| 1/1 [00:00<00:00,  4.50it/s]


Epoch: 017, Loss: 0.1454


100%|██████████| 1/1 [00:00<00:00,  4.57it/s]


Epoch: 018, Loss: 0.1324


100%|██████████| 1/1 [00:00<00:00,  4.86it/s]


Epoch: 019, Loss: 0.1281


100%|██████████| 1/1 [00:00<00:00,  4.66it/s]


Epoch: 020, Loss: 0.1257


100%|██████████| 1/1 [00:00<00:00,  4.39it/s]


Epoch: 021, Loss: 0.1156


100%|██████████| 1/1 [00:00<00:00,  4.07it/s]


Epoch: 022, Loss: 0.1180


100%|██████████| 1/1 [00:00<00:00,  2.77it/s]


Epoch: 023, Loss: 0.1098


100%|██████████| 1/1 [00:00<00:00,  3.21it/s]


Epoch: 024, Loss: 0.1101


100%|██████████| 1/1 [00:00<00:00,  4.58it/s]


Epoch: 025, Loss: 0.1071


100%|██████████| 1/1 [00:00<00:00,  4.37it/s]


Epoch: 026, Loss: 0.1026


100%|██████████| 1/1 [00:00<00:00,  4.19it/s]


Epoch: 027, Loss: 0.1021


100%|██████████| 1/1 [00:00<00:00,  5.01it/s]


Epoch: 028, Loss: 0.0969


100%|██████████| 1/1 [00:00<00:00,  5.05it/s]


Epoch: 029, Loss: 0.0923


100%|██████████| 1/1 [00:00<00:00,  4.88it/s]

Epoch: 030, Loss: 0.0941





# Evaluating

# Metrics

In [29]:
preds = []
ground_truths = []

# Set the model to evaluation mode
model.eval()

# Inference only: skip autograd bookkeeping so no graph is kept per batch
with torch.no_grad():
    for sampled_data in tqdm.tqdm(val_loader):
        sampled_data = sampled_data.to(device)
        pred = model(sampled_data)
        ground_truth = sampled_data['user', 'rates', 'movie'].edge_label

        preds.append(pred)
        ground_truths.append(ground_truth)

# Concatenate predictions and ground truths
pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()

# Define your value for k
k = 10  # You can modify this value as per your requirement

# Calculate precision@k and recall@k.
# Note: both metrics rank *all* validation edges together, so they describe
# the k highest-scored edges overall rather than a per-user top-k list.
precision_k = precision_at_k(ground_truth, pred, k)
recall_k = normalized_recall_at_k(ground_truth, pred, k)

print(f"Precision@{k}: {precision_k:.4f}")
print(f"Recall@{k}: {recall_k:.4f}")


100%|██████████| 1/1 [00:00<00:00, 10.80it/s]

Precision@10: 1.0000
Recall@10: 1.0000





In [None]:
from collections import defaultdict

# Assuming val_loader is already defined and contains the validation data
model.eval()  # Ensure the model is in evaluation mode

# mapped user index -> list of (mapped movie index, predicted score)
recommendations = defaultdict(list)

for sampled_data in tqdm.tqdm(val_loader):
    # Move data to the device the model is on (e.g., GPU)
    sampled_data = sampled_data.to(device)

    # Get the predictions from the model
    with torch.no_grad():  # Disable gradient calculation for inference
        pred = model(sampled_data)

    # The seed edges of a sampled batch index into the batch's own node lists,
    # not into the full graph. Translate them through the per-batch `node_id`
    # tensors (the same ones the model uses for its embedding lookup) to
    # recover the graph-wide mapped indices.
    seed_edges = sampled_data['user', 'rates', 'movie'].edge_label_index
    user_ids = sampled_data['user'].node_id[seed_edges[0]].tolist()  # Users
    movie_ids = sampled_data['movie'].node_id[seed_edges[1]].tolist()  # Movies
    scores = pred.tolist()

    # For each user, store the predicted score and corresponding movie
    for user_id, movie_id, score in zip(user_ids, movie_ids, scores):
        recommendations[user_id].append((movie_id, score))


# Convert them to dictionaries for quick lookup
user_id_map = unique_user_id.set_index('mappedID')['userId'].to_dict()
movie_id_map = unique_movie_id.set_index('mappedID')['movieId'].to_dict()

# Create a list to store the top recommendations in a structured format
top_n = 10  # Set how many top movies to recommend for each user
recommendation_data = []

for mapped_user_id, movie_scores in recommendations.items():
    # Keep the top_n highest-scored movies of this user
    top_movies = sorted(movie_scores, key=lambda pair: pair[1], reverse=True)[:top_n]

    # Get the real user ID using the mapping
    real_user_id = user_id_map.get(mapped_user_id)

    # Append the real IDs and score to the list
    recommendation_data.extend(
        {
            'user_id': real_user_id,
            'movie_id': movie_id_map.get(mapped_movie_id),
            'predicted_score': score,
        }
        for mapped_movie_id, score in top_movies
    )

# Convert the list of recommendations to a pandas DataFrame; naming the
# columns keeps the schema stable even if no recommendation was produced
recommendations_df = pd.DataFrame(
    recommendation_data,
    columns=['user_id', 'movie_id', 'predicted_score'],
)


In [None]:
# Attach title and genres to every recommended movie. `movieId` is the index
# of `movies_df`, which the merge matches against the `movie_id` column.
recommendations_df = recommendations_df.merge(
    movies_df,
    how='left',
    left_on='movie_id',
    right_on='movieId',
)