In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.nn import GATConv, to_hetero, HeteroConv
from torch_geometric.data import Data, HeteroData
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader
import tqdm
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch.nn.functional as F

In [None]:
# Download additional libraries
%pip install torch-scatter -f https://data.pyg.org/whl/torch-${torch.__version__}.html
%pip install torch-sparse -f https://data.pyg.org/whl/torch-${torch.__version__}.html
%pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-${torch.__version__}.html

In [None]:
from torch_geometric.data import download_url, extract_zip

url = "https://files.grouplens.org/datasets/movielens/ml-latest.zip"
extract_zip(download_url(url, "."), ".")


In [2]:

movies_path = './ml-latest/movies.csv'
ratings_path = './ml-latest/ratings.csv'

# Check and prerocess movies_df

In [3]:
# Load movie data
movies_df = pd.read_csv(movies_path, index_col="movieId")

movies_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [4]:
# Split genres and convert into indicator variables (create dummy variables)
genres = movies_df['genres'].str.get_dummies("|")
genres.head()

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# User genres as movie input features (node features)
movie_features = torch.from_numpy(genres.values).to(torch.float)
assert movie_features.size() == (86537, 20) # 20 genres in total

In [6]:
# Create a mapping from unique movie indices to range[0, num_movie_nodes]
unique_movie_id = pd.DataFrame(data={
    'movieId': movies_df.index,
    "mappedID": pd.RangeIndex(len(movies_df)),
})
unique_movie_id.head()

Unnamed: 0,movieId,mappedID
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


# Check and preprocess ratings_df

In [7]:
# Load ratings data
ratings_df = pd.read_csv(ratings_path)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,1225734739
1,1,110,4.0,1225865086
2,1,158,4.0,1225733503
3,1,260,4.5,1225735204
4,1,356,5.0,1225735119


In [8]:
# Create a mapping from unique user indices to range [0, num_user_nodes]
unique_user_id = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={
    'userId': unique_user_id,
    "mappedID": pd.RangeIndex(len(unique_user_id)),
})
unique_user_id.head()

Unnamed: 0,userId,mappedID
0,1,0
1,2,1
2,3,2
3,4,3
4,5,4


# Creating `edge_index` in COO Format for User-Movie Relationships 

In [9]:
# Perform merge to obtain the edges from users and movies
ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id,
                            on='userId', how='left')
ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)

ratings_movie_id = pd.merge(ratings_df['movieId'], unique_movie_id,
                            on='movieId', how='left')
ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values)

# Create `edge_index` in COO format following PyG semantics
edge_index_user_to_movie = torch.stack([ratings_user_id, ratings_movie_id], dim=0)
assert edge_index_user_to_movie.size() == (2, 33832162)

# Data view

In [10]:
print("Mapping of user IDs to consecutive values:")
print("==========================================")
print(unique_user_id.head())

Mapping of user IDs to consecutive values:
   userId  mappedID
0       1         0
1       2         1
2       3         2
3       4         3
4       5         4


In [11]:
print("Mapping of movie IDs to consecutive values:")
print("===========================================")
print(unique_movie_id.head())

Mapping of movie IDs to consecutive values:
   movieId  mappedID
0        1         0
1        2         1
2        3         2
3        4         3
4        5         4


In [12]:
print("Final edge indices pointing from users to movies:")
print("=================================================")
print(edge_index_user_to_movie)

Final edge indices pointing from users to movies:
tensor([[     0,      0,      0,  ..., 330974, 330974, 330974],
        [     0,    108,    156,  ...,   7911,   7954,   8071]])


# Creating graph based HeteroData (where nodes have different origins)

In [10]:
data = HeteroData()

# Save node indices
data['user'].node_id = torch.arange(len(unique_user_id))
data['movie'].node_id = torch.arange(len(movies_df))

# Add the node features and edge indices
data['movie'].x = movie_features

data['user', 'rates', 'movie'].edge_index = edge_index_user_to_movie

# Also need to make sure add the reverse adges from movies to users
# in order to let a GNN model be able to pass messages in both directions.
# For this use `ToUndirected()` transform 

data = T.ToUndirected()(data)

In [14]:
# Check data

assert data.node_types == ['user', 'movie']
assert data.edge_types == [("user", "rates", "movie"),
                           ("movie", "rev_rates", "user")]
assert data['user'].num_nodes == 330975
assert data['user'].num_features == 0
assert data['movie'].num_nodes == 86537
assert data['movie'].num_features == 20
assert data['user', 'rates', 'movie'].num_edges == 33832162
assert data['movie','rev_rates', 'user'].num_edges == 33832162


# Defining Edge-level Training Splits

In [11]:
transform = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    disjoint_train_ratio = 0.3,
    neg_sampling_ratio = 2.0,
    add_negative_train_samples=False,
    edge_types = ('user', 'rates', 'movie'),
    rev_edge_types = ('movie', 'rev_rates', 'user'),
)

train_data, val_data, test_data = transform(data)

In [16]:
# Check splits
assert train_data['user', 'rates', 'movie'].num_edges == 18946011
assert train_data['user', 'rates', 'movie'].edge_label_index.size(1) == 8119719
assert train_data['movie', 'rev_rates', 'user'].num_edges == 18946011


# No negative edges added:
assert train_data["user", "rates", "movie"].edge_label.min() == 1
assert train_data["user", "rates", "movie"].edge_label.max() == 1

assert val_data["user", "rates", "movie"].num_edges == 27065730
assert val_data["user", "rates", "movie"].edge_label_index.size(1) == 10149648
assert val_data["movie", "rev_rates", "user"].num_edges == 27065730
# Negative edges with ratio 2:1:
assert val_data["user", "rates", "movie"].edge_label.long().bincount().tolist() == [6766432, 3383216]

In [12]:
# Define seed edges:
edge_label_index = train_data["user", "rates", "movie"].edge_label_index
edge_label = train_data["user", "rates", "movie"].edge_label

train_loader = LinkNeighborLoader(
    data=train_data,  
    num_neighbors=[20, 10],  
    neg_sampling_ratio=2.0, 
    edge_label_index=(("user", "rates", "movie"), edge_label_index),
    edge_label=edge_label,
    batch_size=262144, # OPtional default= 128
    shuffle=True,
)

# Inspect a sample:
sampled_data = next(iter(train_loader))

print("Sampled mini-batch:")
print("===================")
print(sampled_data)

assert sampled_data["user", "rates", "movie"].edge_label_index.size(1) == 3 * 262144
assert sampled_data["user", "rates", "movie"].edge_label.min() == 0
assert sampled_data["user", "rates", "movie"].edge_label.max() == 1

Sampled mini-batch:
HeteroData(
  user={
    node_id=[292025],
    n_id=[292025],
    num_sampled_nodes=[3],
  },
  movie={
    node_id=[86422],
    x=[86422, 20],
    n_id=[86422],
    num_sampled_nodes=[3],
  },
  (user, rates, movie)={
    edge_index=[2, 592251],
    edge_label=[786432],
    edge_label_index=[2, 786432],
    e_id=[592251],
    num_sampled_edges=[2],
    input_id=[262144],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 4107837],
    e_id=[4107837],
    num_sampled_edges=[2],
  }
)


# Creating a Heterogenious Link-level Graph Attention GNN

In [13]:
class GATModel(nn.Module):
    def __init__(self, hidden_channels, num_user_nodes):
        super(GATModel, self).__init__()
        # Embedding layer for 'user' nodes (since they have no features)
        self.user_embedding = nn.Embedding(num_user_nodes, hidden_channels)
        # Linear transformation for 'movie' node features to match hidden_channel
        self.movie_lin = nn.Linear(20, hidden_channels)
        # Define two GATConv layers for each edge type in the heterogeneous graph
        self.conv1 = HeteroConv({
            ('user', 'rates', 'movie'): GATConv((-1, -1), hidden_channels, add_self_loops=False),
            ('movie', 'rev_rates', 'user'): GATConv((-1, -1), hidden_channels, add_self_loops=False),

        }, aggr='mean')

        self.conv2 = HeteroConv({
            ('user', 'rates', 'movie'): GATConv((-1, -1), hidden_channels, add_self_loops=False),
            ('movie', 'rev_rates', 'user'): GATConv((-1, -1), hidden_channels, add_self_loops=False),

        }, aggr='mean')
        # Edge scoring MLP to compute predictions from node embeddings
        self.edge_mlp = nn.Linear(hidden_channels * 2, 1)
    
    def forward(self, data):
        # Retrieve global node IDs for 'user' and 'movie' nodes
        user_ids = data['user'].n_id
        movie_ids = data['movie'].n_id
        # Initialize node features for 'user' and 'movie' nodes
        x_dict = {}
        x_dict['user'] = self.user_embedding(user_ids)
        x_dict['movie'] = self.movie_lin(data['movie'].x)
        # Perform GNN message passing through two GATConv layers
        x_dict = self.conv1(x_dict, data.edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}
        x_dict = self.conv2(x_dict, data.edge_index_dict)
        x_dict = {key: F.relu(x) for key, x in x_dict.items()}  
        # Extract node embeddings for 'user' and 'movie' nodes
        user_emb = x_dict['user']
        movie_emb = x_dict['movie']
        # Get edge indices for which to compute predictions
        edge_label_index = data['user', 'rates', 'movie'].edge_label_index
        # Get embeddings for source ('user') and target ('movie') nodes
        src = user_emb[edge_label_index[0]]
        dst = movie_emb[edge_label_index[1]]

        # Compute edge scores using an MLP
        edge_features = torch.cat([src, dst], dim=-1)
        pred = self.edge_mlp(edge_features).squeeze(-1)

        return pred

num_user_nodes = len(unique_user_id)
hidden_channels = 64

model = GATModel(hidden_channels=hidden_channels, num_user_nodes=num_user_nodes)

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
model = model.to(device)

In [15]:
def bpr_loss(pos_pred, neg_pred, neg_sampling_ratio):
    # Reshape negative predictions
    neg_pred = neg_pred.view(pos_pred.size(0), neg_sampling_ratio)
    # Expand positive predictions
    pos_pred = pos_pred.unsqueeze(1)
    # Compute BPR loss
    return -torch.log(torch.sigmoid(pos_pred - neg_pred)).mean()


In [16]:
for epoch in range(1, 2):
    total_loss = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()

        sampled_data = sampled_data.to(device)
        pred = model(sampled_data).to(device)

        edge_label = sampled_data['user', 'rates', 'movie'].edge_label.to(device)

        pos_pred = pred[edge_label == 1]
        neg_pred = pred[edge_label == 0]


        loss = bpr_loss(pos_pred, neg_pred, neg_sampling_ratio=2)


        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss:.4f}")


100%|██████████| 31/31 [00:15<00:00,  2.00it/s]

Epoch: 001, Loss: 18.3401





# Evaluating

In [17]:
edge_label_index = val_data['user', 'rates', 'movie'].edge_label_index
edge_label = val_data['user', 'rates', 'movie'].edge_label

val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors = [20, 10],
    edge_label_index = (('user', 'rates', 'movie'), edge_label_index),
    edge_label=edge_label,
    batch_size=262144,
    shuffle=False
)

sampled_data = next(iter(val_loader))
print("Sampled mini-batch:")
print("===================")
print(sampled_data)

Sampled mini-batch:
HeteroData(
  user={
    node_id=[138352],
    n_id=[138352],
    num_sampled_nodes=[3],
  },
  movie={
    node_id=[29445],
    x=[29445, 20],
    n_id=[29445],
    num_sampled_nodes=[3],
  },
  (user, rates, movie)={
    edge_index=[2, 391970],
    edge_label=[262144],
    edge_label_index=[2, 262144],
    e_id=[391970],
    num_sampled_edges=[2],
    input_id=[262144],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 2353189],
    e_id=[2353189],
    num_sampled_edges=[2],
  }
)


In [18]:
from sklearn.metrics import roc_auc_score
preds = []
ground_truths = []

# Set the model to evaluation mode
model.eval()

with torch.no_grad():
    for sampled_data in tqdm.tqdm(val_loader):
        sampled_data = sampled_data.to(device)
        pred = model(sampled_data)
        ground_truth = sampled_data['user', 'rates', 'movie'].edge_label.to(device)

        # Apply sigmoid to convert logits to probabilities
        pred = torch.sigmoid(pred)

        preds.append(pred.cpu())
        ground_truths.append(ground_truth.cpu())

# Concatenate predictions and ground truths
pred = torch.cat(preds, dim=0).numpy()
ground_truth = torch.cat(ground_truths, dim=0).numpy()

# Calculate the AUC score
auc = roc_auc_score(ground_truth, pred)

print(f"Validation AUC: {auc:.4f}")


100%|██████████| 39/39 [00:10<00:00,  3.81it/s]


Validation AUC: 0.8665


In [19]:
from collections import defaultdict
import torch
import pandas as pd
from tqdm import tqdm

# Ensure the model is in evaluation mode
model.eval()

# Create a dictionary to store recommendations for each user
recommendations = defaultdict(list)

# Loop over the validation data loader
with torch.no_grad():
    for sampled_data in tqdm(val_loader):
        sampled_data = sampled_data.to(device)
        
        # Get model predictions
        pred = model(sampled_data)
        
        # Apply sigmoid to convert logits to probabilities if needed
        pred = torch.sigmoid(pred).cpu()
        
        # Extract user and movie IDs from the edge label index
        user_ids = sampled_data['user', 'rates', 'movie'].edge_label_index[0].cpu()
        movie_ids = sampled_data['user', 'rates', 'movie'].edge_label_index[1].cpu()
        
        # Loop over the batch and collect predictions
        for user_id, movie_id, score in zip(user_ids, movie_ids, pred):
            # Convert tensors to scalars
            user_id = user_id.item()
            movie_id = movie_id.item()
            score = score.item()
            
            # Append the movie and its score for each user
            recommendations[user_id].append((movie_id, score))

# Map internal IDs back to original user IDs and movie IDs
user_id_map = unique_user_id.set_index('mappedID')['userId'].to_dict()
movie_id_map = unique_movie_id.set_index('mappedID')['movieId'].to_dict()

# Prepare a list to store the top-N recommendations
top_n = 10  # Adjust as needed
recommendation_data = []

for mapped_user_id, movie_scores in recommendations.items():
    # Sort the movies for this user by predicted score in descending order
    sorted_recommendations = sorted(movie_scores, key=lambda x: x[1], reverse=True)
    
    # Get the real user ID
    real_user_id = user_id_map.get(mapped_user_id)
    
    # Get the top-N movies
    top_movies = sorted_recommendations[:top_n]
    
    for mapped_movie_id, score in top_movies:
        # Get the real movie ID
        real_movie_id = movie_id_map.get(mapped_movie_id)
        
        # Append the recommendation to the list
        recommendation_data.append({
            'user_id': real_user_id,
            'movie_id': real_movie_id,
            'predicted_score': score
        })

# Create a DataFrame from the recommendation data
recommendations_df = pd.DataFrame(recommendation_data)


100%|██████████| 39/39 [01:41<00:00,  2.60s/it]


In [21]:
recommendations_df = pd.merge(recommendations_df, movies_df, 
                                left_on='movie_id', right_on='movieId',
                                how='left')

In [22]:
recommendations_df.head()

Unnamed: 0,user_id,movie_id,predicted_score,title,genres
0,16978,137,0.953852,Man of the Year (1995),Documentary
1,16978,2078,0.953028,"Jungle Book, The (1967)",Animation|Children|Comedy|Musical
2,16978,199043,0.951652,Black & White: The Dawn of Justice (2014),Action
3,16978,54116,0.947968,First Snow (2006),Drama|Thriller
4,16978,274,0.945266,Man of the House (1995),Comedy


In [24]:
recommendations_df[recommendations_df['user_id'].isin([16978])]

Unnamed: 0,user_id,movie_id,predicted_score,title,genres
0,16978,137,0.953852,Man of the Year (1995),Documentary
1,16978,2078,0.953028,"Jungle Book, The (1967)",Animation|Children|Comedy|Musical
2,16978,199043,0.951652,Black & White: The Dawn of Justice (2014),Action
3,16978,54116,0.947968,First Snow (2006),Drama|Thriller
4,16978,274,0.945266,Man of the House (1995),Comedy
5,16978,168096,0.944958,The Rider Named Death (2004),Drama|Thriller
6,16978,2616,0.943591,Dick Tracy (1990),Action|Crime
7,16978,546,0.942411,Super Mario Bros. (1993),Action|Adventure|Children|Comedy|Fantasy|Sci-Fi
8,16978,5219,0.941679,Resident Evil (2002),Action|Horror|Sci-Fi|Thriller
9,16978,992,0.941496,"Rich Man's Wife, The (1996)",Thriller
