### Hypothesis 

Users are rating movies, can we 
1. create a bipartite graph of user and movies, with edges as interaction between them 
    1. edges are 0/1 binary 
    2. edges are weighted by the ratings
2. Evaluate we then infer the affinity of users towards a perticular movie by predicting 
    1. if there exists an edge between the movie and the user
    2. edge weight prediction to predict the ratings for the user and movie

### Import the packages

In [97]:
import os
import re
import yaml 
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm


from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import remove_stopword_tokens

import torch
from torch_geometric.data import HeteroData
from torch_geometric.loader import LinkLoader, LinkNeighborLoader, NeighborLoader
import torch_geometric.transforms as T

In [98]:
from torch_geometric.nn import SAGEConv, to_hetero
from torch import Tensor

In [99]:
# raw_data_path
raw_data_root_path = 'data/movie-lens/ml-1m/'
movie_file_name = 'movies.dat'
users_file_name = 'users.dat'
ratings_file_name = 'ratings.dat'
movies_data_path = os.path.join(raw_data_root_path, movie_file_name)
users_data_path = os.path.join(raw_data_root_path, users_file_name)
ratings_data_path = os.path.join(raw_data_root_path, ratings_file_name)

# read data to df 
# Read users
users = pd.read_csv(users_data_path, 
                    sep="::", 
                    engine="python", 
                    encoding="ISO-8859-1", 
                    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"])
print(users.head())
# Read movies
movies = pd.read_csv(movies_data_path, 
                        sep="::", 
                        engine="python", 
                        encoding="ISO-8859-1", 
                        names=["MovieID", "Title", "Genres"])
print(movies.head())
# Read ratings
ratings = pd.read_csv(ratings_data_path, 
                        sep="::", 
                        engine="python", 
                        encoding="ISO-8859-1", 
                        names=["UserID", "MovieID", "Rating", "Timestamp"])
print(ratings.head())

   UserID Gender  Age  Occupation Zip-code
0       1      F    1          10    48067
1       2      M   56          16    70072
2       3      M   25          15    55117
3       4      M   45           7    02460
4       5      M   25          20    55455
   MovieID                               Title                        Genres
0        1                    Toy Story (1995)   Animation|Children's|Comedy
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy
2        3             Grumpier Old Men (1995)                Comedy|Romance
3        4            Waiting to Exhale (1995)                  Comedy|Drama
4        5  Father of the Bride Part II (1995)                        Comedy
   UserID  MovieID  Rating  Timestamp
0       1     1193       5  978300760
1       1      661       3  978302109
2       1      914       3  978301968
3       1     3408       4  978300275
4       1     2355       5  978824291


In [100]:
print(f'{users.shape = }')
print(f'{movies.shape = }')
print(f'{ratings.shape = }')
print(f'Number of users in user df = {users['UserID'].nunique()}')
print(f'Number of movies in movie df = {movies['MovieID'].nunique()}')
print(f'Number of users in ratings df = {ratings['UserID'].nunique()}')
print(f'Number of movies in ratings df = {ratings['MovieID'].nunique()}')

users.shape = (6040, 5)
movies.shape = (3883, 3)
ratings.shape = (1000209, 4)
Number of users in user df = 6040
Number of movies in movie df = 3883
Number of users in ratings df = 6040
Number of movies in ratings df = 3706


### Split the data temporally

We'll split the data temporally to mimick the deployement time scenario as closely as possible.
The split would on ratings df 
- train set - 90% 
- test set - 10% 

train set would further be devided into another train and val set. We can use inbuilt Random Splitter for latter task

For users and movies - the hypothesis is that we're inly aware about the user, movie that exist in the system before the thrshhold time of split. So we'll have the train and test users/movies as well 

In [101]:
# split the data temporally
train_prop  = 0.9
test_prop = 0.1 

ratings.sort_values(by = 'Timestamp',
                    ascending=True, 
                    inplace=True)

ratings.reset_index(drop = True,
                    inplace=True)

start_time = ratings['Timestamp'].loc[0]
end_time = ratings['Timestamp'].loc[len(ratings) - 1]

train_start_time = start_time 
train_end_time = ratings['Timestamp'].loc[int(len(ratings)*train_prop)]
test_start_time = ratings['Timestamp'].loc[int(len(ratings)*train_prop) + 1]
test_end_time = end_time 

train_ratings = ratings[(ratings['Timestamp'] <= train_end_time) & (ratings['Timestamp'] >= train_start_time)]
test_ratings = ratings[(ratings['Timestamp'] <= test_end_time) & (ratings['Timestamp'] >= test_start_time)]

print(f'{train_ratings.shape = }')
print(f'{test_ratings.shape = }')

train_user_id = train_ratings['UserID'].unique().tolist()
train_movie_id = train_ratings['MovieID'].unique().tolist()
test_user_id = test_ratings['UserID'].unique().tolist()
test_movie_id = test_ratings['MovieID'].unique().tolist()

train_users = users[users['UserID'].isin(train_user_id)]
test_users = users[users['UserID'].isin(test_user_id)]
train_movies = movies[movies['MovieID'].isin(train_movie_id)]
test_movies = movies[movies['MovieID'].isin(test_movie_id)]

train_ratings.shape = (900189, 4)
test_ratings.shape = (100020, 4)


### Feature Engineering

Create a feature engineering routine. 
Takes in the raw user, movie and ratings table, performes the following tasks - 

In [102]:
users 

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [103]:
w2v_model = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300_2.bin', binary=True)

In [8]:
possible_genres = ['action',
                    'adventure',
                    'animation',
                    "children's",
                    'comedy',
                    'crime',
                    'documentary',
                    'drama',
                    'fantasy',
                    'film-noir',
                    'horror',
                    'musical',
                    'mystery',
                    'romance',
                    'sci-fi',
                    'thriller',
                    'war',
                    'western']

In [104]:
movies

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [105]:
def preprocess_genre(genre):
    genre = genre.lower()
    genre = genre.split('|')
    return genre
    
def preprocess_title(title):
    title = title.lower()
    title = title.split(' ')
    title = remove_stopword_tokens(title)
    title  = [re.sub(r"[^ a-zA-Z0-9]+",'',word) for word in title]
    title = [word.strip() for word in title]
    title = [word for word in title if len(word)]
    return title

def compute_average_embedding(genres, w2v_model):
    embeddings = [w2v_model.get_vector(genre) for genre in genres if genre in w2v_model.index_to_key]
    if embeddings:
        return np.mean(embeddings, axis=0).tolist()
    else:
        # Return a zero vector if no genres are found in the model
        return np.zeros(w2v_model.vector_size).tolist()

def create_features(users, movies, ratings):
    """
    - Encode the userid, movieid
    - Map the genfer to F/M = 0/1
    - create word2vec average embeddings for the Title
    - create multihot encoding for genre
    - create encoding for Year
    """
    user_encoder = LabelEncoder()
    movie_encoder = LabelEncoder()
    users['UserID'] = user_encoder.fit_transform(users['UserID'])
    movies['MovieID'] = movie_encoder.fit_transform(movies['MovieID'])

    ratings['UserID'] = user_encoder.transform(ratings['UserID'])
    ratings['MovieID'] = movie_encoder.transform(ratings['MovieID'])

    users ['Gender'] = users['Gender'].replace({'F' : 0, 'M': 1})

    movies[['Title', 'Year']] = movies['Title'].str.extract(r'^(.*?)(?: \((\d{4})\))?$')
    movies['Year'] = movies['Year'].astype(int)
    movies['Year'].fillna(0, inplace = True)

    movies['Genre_List'] = movies['Genres'].apply(preprocess_genre)

    for genre in tqdm(possible_genres):
        movies[genre] = movies['Genre_List'].apply(lambda x: 1 if genre in x else 0)

    movies['Title_List'] = movies['Title'].apply(preprocess_title)

    movies['Title_Embedding'] = movies['Title_List'].apply(lambda x: compute_average_embedding(x, w2v_model))

    return users, movies, ratings

In [106]:
train_users_transformed, train_movies_transformed, train_ratings_transformed = create_features(train_users.copy(),
                                                                                               train_movies.copy(),
                                                                                               train_ratings.copy())

  users ['Gender'] = users['Gender'].replace({'F' : 0, 'M': 1})
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies['Year'].fillna(0, inplace = True)
100%|██████████| 18/18 [00:00<00:00, 1566.27it/s]


In [107]:
test_users_transformed, test_movies_transformed, test_ratings_transformed = create_features(test_users.copy(),
                                                                                            test_movies.copy(),
                                                                                            test_ratings.copy())

  users ['Gender'] = users['Gender'].replace({'F' : 0, 'M': 1})
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  movies['Year'].fillna(0, inplace = True)
100%|██████████| 18/18 [00:00<00:00, 1075.39it/s]


In [13]:
test_users_transformed

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,0,0,1,10,48067
1,1,1,56,16,70072
2,2,1,25,15,55117
3,3,1,45,7,02460
4,4,1,25,20,55455
...,...,...,...,...,...
6000,1204,0,25,7,94117
6001,1205,1,50,0,43231
6015,1206,1,45,1,37209
6027,1207,1,18,4,94133


In [122]:
train_users_transformed

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
23,0,0,25,7,10023
24,1,1,18,4,01609
25,2,1,25,7,23112
26,3,1,25,11,19130
27,4,0,25,1,14607
...,...,...,...,...,...
6035,6006,0,25,15,32603
6036,6007,0,45,1,76006
6037,6008,0,56,1,14706
6038,6009,0,45,0,01060


In [123]:
train_movies_transformed

Unnamed: 0,MovieID,Title,Genres,Year,Genre_List,action,adventure,animation,children's,comedy,...,horror,musical,mystery,romance,sci-fi,thriller,war,western,Title_List,Title_Embedding
0,0,Toy Story,Animation|Children's|Comedy,1995,"[animation, children's, comedy]",0,0,1,1,1,...,0,0,0,0,0,0,0,0,"[toy, story]","[0.135498046875, 0.09771728515625, -0.06188964..."
1,1,Jumanji,Adventure|Children's|Fantasy,1995,"[adventure, children's, fantasy]",0,1,0,1,0,...,0,0,0,0,0,0,0,0,[jumanji],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,2,Grumpier Old Men,Comedy|Romance,1995,"[comedy, romance]",0,0,0,0,1,...,0,0,0,1,0,0,0,0,"[grumpier, old, men]","[0.1028645858168602, 0.1243489608168602, 0.065..."
3,3,Waiting to Exhale,Comedy|Drama,1995,"[comedy, drama]",0,0,0,0,1,...,0,0,0,0,0,0,0,0,"[waiting, exhale]","[0.12060546875, 0.0087890625, 0.29052734375, 0..."
4,4,Father of the Bride Part II,Comedy,1995,[comedy],0,0,0,0,1,...,0,0,0,0,0,0,0,0,"[father, bride, ii]","[-0.0677083358168602, -0.0944010391831398, 0.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3673,Meet the Parents,Comedy,2000,[comedy],0,0,0,0,1,...,0,0,0,0,0,0,0,0,"[meet, parents]","[-0.194091796875, -0.03369140625, 0.0590820312..."
3879,3674,Requiem for a Dream,Drama,2000,[drama],0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[requiem, dream]","[0.048095703125, -0.06427001953125, 0.12304687..."
3880,3675,Tigerland,Drama,2000,[drama],0,0,0,0,0,...,0,0,0,0,0,0,0,0,[tigerland],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3881,3676,Two Family House,Drama,2000,[drama],0,0,0,0,0,...,0,0,0,0,0,0,0,0,"[family, house]","[0.0692138671875, -0.094482421875, -0.04479980..."


### Create Graph

Create graph from the training data and split into train and val graph 
- train_prop - 0.9
- val_prop - 0.1

In [124]:
def create_graph(users, movies, ratings):
    # create users features
    user_feat = torch.tensor(users[['Gender', 'Age', 'Occupation']].values, dtype=torch.float)
    movie_feat = torch.tensor(torch.hstack([torch.tensor(movies[['Year'] + [genre for genre in possible_genres]].values), 
                               torch.tensor(movies['Title_Embedding'].values.tolist())]), dtype=torch.float)
    edge_index_user_to_movie = torch.tensor(ratings[['UserID', 'MovieID']].values.T)
    edge_label_user_to_movie = torch.tensor(ratings['Rating'].values - 1, dtype=torch.long)
    print(edge_label_user_to_movie.unique())
    data = HeteroData()
    data["user"].node_id = torch.arange(users['UserID'].nunique())
    data["movie"].node_id = torch.arange(movies['MovieID'].nunique())
    data['user'].x = user_feat
    data['movie'].x = movie_feat
    data['user', 'rates', 'movie'].edge_index = edge_index_user_to_movie
    data['user', 'rates', 'movie'].edge_label = edge_label_user_to_movie
    data["movie", "rev_rates", "user"].edge_index = edge_index_user_to_movie.flip(0)
    data['movie', 'rev_rates', 'user'].edge_label = edge_label_user_to_movie
    return data

In [125]:
train_graph = create_graph(train_users_transformed,
                           train_movies_transformed,
                           train_ratings_transformed)

train_graph

tensor([0, 1, 2, 3, 4])


  movie_feat = torch.tensor(torch.hstack([torch.tensor(movies[['Year'] + [genre for genre in possible_genres]].values),


HeteroData(
  user={
    node_id=[6011],
    x=[6011, 3],
  },
  movie={
    node_id=[3678],
    x=[3678, 319],
  },
  (user, rates, movie)={
    edge_index=[2, 900189],
    edge_label=[900189],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 900189],
    edge_label=[900189],
  }
)

In [126]:
train_graph[('user', 'rates', 'movie')].edge_index

tensor([[6010, 6010, 6010,  ...,    0,    0,    0],
        [ 794, 2175,  576,  ...,  568, 2463,  452]])

In [117]:
train_graph[('user', 'rates', 'movie')].edge_label

tensor([3, 3, 4,  ..., 3, 3, 3])

In [112]:
train_graph[('movie', 'rev_rates', 'user')].edge_index

tensor([[ 794, 2175,  576,  ...,  568, 2463,  452],
        [6010, 6010, 6010,  ...,    0,    0,    0]])

In [115]:
train_ratings_transformed[(train_ratings_transformed['UserID'] == 6010) & (train_ratings_transformed['MovieID'] == 794)]


Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,6010,794,4,956703932


In [145]:
transform = T.RandomLinkSplit(
    num_val=0.1,  # 10% validation edges
    num_test = 0,
    disjoint_train_ratio=0.3,  # 30% of training edges used for supervision
    edge_types=("user", "rates", "movie"),  # Edge type
    rev_edge_types=("movie", "rev_rates", "user"),  # Reverse edge type
    add_negative_train_samples = False # no negative samples as edges are determined solely on the ratings
)

train_data, val_data, test_data = transform(train_graph)

In [146]:
train_data

HeteroData(
  user={
    node_id=[6011],
    x=[6011, 3],
  },
  movie={
    node_id=[3678],
    x=[3678, 319],
  },
  (user, rates, movie)={
    edge_index=[2, 567120],
    edge_label=[243051],
    edge_label_index=[2, 243051],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 567120],
    edge_label=[567120],
  }
)

In [147]:
train_data['movie'].node_id

tensor([   0,    1,    2,  ..., 3675, 3676, 3677])

In [148]:
train_data[('user', 'rates', 'movie')].edge_index

tensor([[2882,  386, 4972,  ..., 4139, 1276,   24],
        [2309, 3650, 3164,  ..., 1753, 1489,  752]])

In [149]:
train_data[('user', 'rates', 'movie')].edge_label_index

tensor([[1753, 2441, 1671,  ..., 5506,  326, 2235],
        [2586,  575, 2742,  ..., 1790, 2583, 2067]])

In [154]:
train_data[('user', 'rates', 'movie')].edge_label

tensor([1, 3, 3,  ..., 2, 3, 3])

In [153]:
train_ratings_transformed[(train_ratings_transformed['UserID'] == 1753) 
                          & (train_ratings_transformed['MovieID'] == 2586)] 

Unnamed: 0,UserID,MovieID,Rating,Timestamp
619277,1753,2586,2,974705975


In [155]:
unique_labels = train_data[("user", "rates", "movie")].edge_label.unique()
print("Unique edge labels:", unique_labels)

Unique edge labels: tensor([0, 1, 2, 3, 4])


In [156]:
print("Reverse edge labels:")
print(train_data[("movie", "rev_rates", "user")].edge_label.unique())

Reverse edge labels:
tensor([0, 1, 2, 3, 4])


In [157]:
print("Original edge labels in train_graph:")
print(train_graph[("user", "rates", "movie")].edge_label.unique())

Original edge labels in train_graph:
tensor([0, 1, 2, 3, 4])


In [158]:
print("Edge labels in training set:")
print(train_data[("user", "rates", "movie")].edge_label)

print("Edge labels in validation set:")
print(val_data[("user", "rates", "movie")].edge_label)

Edge labels in training set:
tensor([1, 3, 3,  ..., 2, 3, 3])
Edge labels in validation set:
tensor([5, 3, 3,  ..., 0, 0, 0])


In [159]:
val_data[('user', 'rates', 'movie')]

{'edge_index': tensor([[1753, 2441, 1671,  ..., 4139, 1276,   24],
        [2586,  575, 2742,  ..., 1753, 1489,  752]]), 'edge_label': tensor([5, 3, 3,  ..., 0, 0, 0]), 'edge_label_index': tensor([[2545,  692, 4183,  ..., 1265, 1652, 3066],
        [1122, 3299, 1438,  ..., 3533, 1666, 1889]])}

### Create Mini Batches
- use linkneighbourloader 

In [160]:
# Define seed edges:
edge_label_index = train_data["user", "rates", "movie"].edge_label_index
edge_label = train_data["user", "rates", "movie"].edge_label

# Define the LinkNeighborLoader
train_loader = LinkNeighborLoader(
    data=train_data,  # Use the training data
    num_neighbors=[20, 10],  # 20 neighbors in the first hop, 10 in the second hop
    edge_label_index=(("user", "rates", "movie"), edge_label_index),  # Edge type and indices
    edge_label=edge_label,  # Labels for the edges
    batch_size=128,  # Batch size
    shuffle=True,  # Shuffle the data during training
)

# Inspect a sample:
sampled_data = next(iter(train_loader))

print("Sampled mini-batch:")
print("===================")
print(sampled_data)
# print(f"{sampled_data[('user', 'rates', 'movie')].edge_label}")

Sampled mini-batch:
HeteroData(
  user={
    node_id=[4062],
    x=[4062, 3],
    n_id=[4062],
    num_sampled_nodes=[3],
  },
  movie={
    node_id=[2544],
    x=[2544, 319],
    n_id=[2544],
    num_sampled_nodes=[3],
  },
  (user, rates, movie)={
    edge_index=[2, 14337],
    edge_label=[128],
    edge_label_index=[2, 128],
    e_id=[14337],
    num_sampled_edges=[2],
    input_id=[128],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 18510],
    edge_label=[18510],
    e_id=[18510],
    num_sampled_edges=[2],
  }
)


In [161]:
sampled_data['user'].x.shape 

torch.Size([4062, 3])

In [162]:
sampled_data['user'].node_id

tensor([  29,   42,   93,  ..., 5316, 4215, 2536])

In [163]:
edge_index_sampled = sampled_data[('user', 'rates', 'movie')]['edge_index']
edge_index_sampled

tensor([[ 125,  126,  127,  ..., 1071, 2222, 1902],
        [   0,    0,    0,  ..., 1320, 1320, 1320]])

In [164]:
e_id_sampled = sampled_data[('user', 'rates', 'movie')]['e_id']
e_id_sampled

tensor([263888, 173080, 514548,  ..., 453852, 362991, 180020])

In [167]:
edge_label_sampled = sampled_data[('user', 'rates', 'movie')]['edge_label']
edge_label_sampled

tensor([2, 3, 4, 2, 4, 4, 2, 2, 4, 2, 3, 2, 3, 3, 2, 2, 2, 4, 0, 4, 3, 4, 3, 2,
        3, 4, 2, 1, 3, 3, 2, 2, 3, 3, 3, 2, 2, 4, 1, 2, 2, 2, 3, 4, 0, 3, 4, 3,
        2, 3, 2, 4, 2, 2, 3, 3, 4, 2, 2, 4, 3, 3, 3, 4, 1, 0, 2, 4, 3, 4, 4, 4,
        3, 2, 3, 3, 3, 3, 4, 2, 3, 1, 4, 1, 0, 4, 3, 4, 3, 0, 3, 2, 2, 3, 4, 3,
        3, 1, 3, 4, 3, 4, 2, 4, 2, 3, 3, 3, 0, 4, 2, 3, 3, 4, 2, 2, 3, 4, 4, 3,
        2, 1, 0, 1, 4, 1, 3, 3])

### Heterogenous GNN Model

Creating a heterogenous gnn model for the bipartite graph created.
- Create embeddings for the bith node types 
- The embeddings are trained for the downstream task of predicting the ratings for the movies by a user

In [199]:
class GNN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()

        self.conv1 = SAGEConv(hidden_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

# Our final classifier applies the dot-product between source and destination
# node embeddings to derive edge-level predictions:
class Classifier(torch.nn.Module):
    def forward(self, x_user: Tensor, x_movie: Tensor, edge_label_index: Tensor) -> Tensor:
        # Convert node embeddings to edge-level representations:
        edge_feat_user = x_user[edge_label_index[0]]
        edge_feat_movie = x_movie[edge_label_index[1]]
        # Apply dot-product to get a prediction per supervision edge:
        return (edge_feat_user * edge_feat_movie).sum(dim=-1)

class Model(torch.nn.Module):
    def __init__(self, num_user_feat, num_movie_feat, hidden_channels):
        super().__init__()
        # Instantiate homogeneous GNN:
        self.gnn = GNN(hidden_channels)
        self.user_batch_norm = torch.nn.BatchNorm1d(num_user_feat)
        self.movie_batch_norm = torch.nn.BatchNorm1d(num_movie_feat)
        self.movie_lin = torch.nn.Linear(num_movie_feat, hidden_channels)
        self.user_lin = torch.nn.Linear(num_user_feat, hidden_channels)
        # Convert GNN model into a heterogeneous variant:
        self.gnn = to_hetero(self.gnn, metadata=train_graph.metadata())

        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        x_dict = {
          "user": self.user_lin(self.user_batch_norm(data['user'].x)),
          "movie": self.movie_lin(self.movie_batch_norm(data["movie"].x)),
        }

        # `x_dict` holds feature matrices of all node types
        # `edge_index_dict` holds all edge indices of all edge types
        x_dict = self.gnn(x_dict, data.edge_index_dict)

        pred = self.classifier(
            x_dict["user"],
            x_dict["movie"],
            data["user", "rates", "movie"].edge_label_index,
        )

        return pred
    
model = Model(num_user_feat=train_data['user'].x.shape[1], 
              num_movie_feat=train_data['movie'].x.shape[1],
              hidden_channels=64)

print(model)

Model(
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__movie): SAGEConv(64, 64, aggr=mean)
      (movie__rev_rates__user): SAGEConv(64, 64, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__movie): SAGEConv(64, 64, aggr=mean)
      (movie__rev_rates__user): SAGEConv(64, 64, aggr=mean)
    )
  )
  (user_batch_norm): BatchNorm1d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (movie_batch_norm): BatchNorm1d(319, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (movie_lin): Linear(in_features=319, out_features=64, bias=True)
  (user_lin): Linear(in_features=3, out_features=64, bias=True)
  (classifier): Classifier()
)


### Training a Hetrogenous Link-level GNN

In [200]:
import tqdm
import torch.nn.functional as F

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")

model = model.to(device)  # Move the model to the device
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Define the optimizer

for epoch in range(1, 6):  # Training for 5 epochs
    total_loss = total_examples = 0  # Initialize loss and examples
    model.train()  # Set the model to training mode
    
    for i, sampled_data in tqdm.tqdm(enumerate(train_loader)):  # Iterate over the training loader
        optimizer.zero_grad()  # Zero the gradients
        
        # Move `sampled_data` to the device
        sampled_data = sampled_data.to(device)
        
        # Run the `forward` pass of the model
        pred = model(
            sampled_data
        )
        # Extract ground truth labels
        ground_truth = sampled_data["user", "rates", "movie"].edge_label.to(torch.float)
        # Apply binary cross-entropy loss
        loss = F.cross_entropy(pred, ground_truth)
        
        # Backward pass and optimization step
        loss.backward()
        optimizer.step()
        
        # Accumulate total loss and total examples
        total_loss += float(loss) * pred.numel()  # Scale loss by number of predictions
        total_examples += pred.numel()  # Count the number of predictions
        if i%100 == 0 and i > 0:
            with torch.no_grad():
                val_pred = model(val_data)
                val_ground_truth = val_data["user", "rates", "movie"].edge_label.to(torch.float)
                val_loss = F.cross_entropy(val_pred, val_ground_truth, reduce=True, reduction='mean')
                print(f"{epoch = }, {i = }, {val_loss = }")
    
    
    # Print epoch loss
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

Device: 'cpu'


104it [00:05, 11.12it/s]

epoch = 1, i = 100, val_loss = tensor(3910763.7500)


204it [00:11, 14.19it/s]

epoch = 1, i = 200, val_loss = tensor(3906294.)


304it [00:16, 10.75it/s]

epoch = 1, i = 300, val_loss = tensor(3903304.)


405it [00:21, 14.35it/s]

epoch = 1, i = 400, val_loss = tensor(3904933.7500)


504it [00:27, 10.27it/s]

epoch = 1, i = 500, val_loss = tensor(3902139.7500)


605it [00:32, 15.28it/s]

epoch = 1, i = 600, val_loss = tensor(3900745.5000)


705it [00:37, 12.51it/s]

epoch = 1, i = 700, val_loss = tensor(3903539.5000)


802it [00:42, 12.71it/s]

epoch = 1, i = 800, val_loss = tensor(3903153.5000)


904it [00:47, 12.77it/s]

epoch = 1, i = 900, val_loss = tensor(3901981.2500)


1003it [00:53, 13.81it/s]

epoch = 1, i = 1000, val_loss = tensor(3899750.2500)


1103it [00:59,  9.54it/s]

epoch = 1, i = 1100, val_loss = tensor(3900696.)


1204it [01:05, 10.60it/s]

epoch = 1, i = 1200, val_loss = tensor(3902696.5000)


1303it [01:12,  9.26it/s]

epoch = 1, i = 1300, val_loss = tensor(3899784.5000)


1403it [01:18,  8.87it/s]

epoch = 1, i = 1400, val_loss = tensor(3903937.5000)


1502it [01:25,  8.49it/s]

epoch = 1, i = 1500, val_loss = tensor(3902360.5000)


1605it [01:32, 11.40it/s]

epoch = 1, i = 1600, val_loss = tensor(3901127.7500)


1703it [01:38, 13.02it/s]

epoch = 1, i = 1700, val_loss = tensor(3900033.2500)


1803it [01:43, 11.25it/s]

epoch = 1, i = 1800, val_loss = tensor(3901574.2500)


1899it [01:48, 17.51it/s]


Epoch: 001, Loss: 1604.7130


103it [00:06, 11.26it/s]

epoch = 2, i = 100, val_loss = tensor(3900907.5000)


203it [00:11, 13.06it/s]

epoch = 2, i = 200, val_loss = tensor(3899245.5000)


304it [00:17, 10.01it/s]

epoch = 2, i = 300, val_loss = tensor(3900417.5000)


404it [00:22, 12.57it/s]

epoch = 2, i = 400, val_loss = tensor(3900210.5000)


503it [00:27, 12.21it/s]

epoch = 2, i = 500, val_loss = tensor(3900647.5000)


605it [00:32, 16.18it/s]

epoch = 2, i = 600, val_loss = tensor(3901194.)


704it [00:37, 13.81it/s]

epoch = 2, i = 700, val_loss = tensor(3902026.)


804it [00:42, 12.08it/s]

epoch = 2, i = 800, val_loss = tensor(3898953.7500)


903it [00:48,  9.48it/s]

epoch = 2, i = 900, val_loss = tensor(3900795.7500)


1004it [00:54, 11.87it/s]

epoch = 2, i = 1000, val_loss = tensor(3899897.2500)


1104it [01:00, 13.07it/s]

epoch = 2, i = 1100, val_loss = tensor(3900871.2500)


1202it [01:06,  8.61it/s]

epoch = 2, i = 1200, val_loss = tensor(3899073.)


1304it [01:12, 13.54it/s]

epoch = 2, i = 1300, val_loss = tensor(3899361.)


1403it [01:18,  8.98it/s]

epoch = 2, i = 1400, val_loss = tensor(3899483.7500)


1502it [01:24, 10.06it/s]

epoch = 2, i = 1500, val_loss = tensor(3901598.)


1603it [01:30, 11.25it/s]

epoch = 2, i = 1600, val_loss = tensor(3901918.)


1703it [01:35, 10.94it/s]

epoch = 2, i = 1700, val_loss = tensor(3900184.)


1802it [01:42,  7.09it/s]

epoch = 2, i = 1800, val_loss = tensor(3897528.)


1899it [01:47, 17.71it/s]


Epoch: 002, Loss: 1603.5531


104it [00:05, 11.42it/s]

epoch = 3, i = 100, val_loss = tensor(3899773.5000)


204it [00:11, 10.79it/s]

epoch = 3, i = 200, val_loss = tensor(3900155.)


302it [00:17, 10.25it/s]

epoch = 3, i = 300, val_loss = tensor(3900253.)


404it [00:23,  9.71it/s]

epoch = 3, i = 400, val_loss = tensor(3898236.2500)


503it [00:28, 13.51it/s]

epoch = 3, i = 500, val_loss = tensor(3898723.)


604it [00:33, 12.44it/s]

epoch = 3, i = 600, val_loss = tensor(3899614.7500)


705it [00:39, 12.99it/s]

epoch = 3, i = 700, val_loss = tensor(3898620.5000)


803it [00:45, 11.40it/s]

epoch = 3, i = 800, val_loss = tensor(3899383.5000)


905it [00:50, 14.27it/s]

epoch = 3, i = 900, val_loss = tensor(3898613.2500)


1002it [00:56,  6.28it/s]

epoch = 3, i = 1000, val_loss = tensor(3900621.5000)


1104it [01:03,  8.84it/s]

epoch = 3, i = 1100, val_loss = tensor(3899568.5000)


1204it [01:09, 12.74it/s]

epoch = 3, i = 1200, val_loss = tensor(3900248.5000)


1305it [01:15, 11.21it/s]

epoch = 3, i = 1300, val_loss = tensor(3898050.7500)


1404it [01:21, 13.87it/s]

epoch = 3, i = 1400, val_loss = tensor(3897917.5000)


1503it [01:26, 13.22it/s]

epoch = 3, i = 1500, val_loss = tensor(3899649.5000)


1604it [01:32, 11.23it/s]

epoch = 3, i = 1600, val_loss = tensor(3899571.)


1703it [01:40,  6.91it/s]

epoch = 3, i = 1700, val_loss = tensor(3898685.5000)


1804it [01:45, 13.03it/s]

epoch = 3, i = 1800, val_loss = tensor(3896280.)


1899it [01:52, 16.86it/s]


Epoch: 003, Loss: 1603.1834


102it [00:05, 10.39it/s]

epoch = 4, i = 100, val_loss = tensor(3897488.)


202it [00:11,  9.55it/s]

epoch = 4, i = 200, val_loss = tensor(3895840.)


302it [00:18,  9.17it/s]

epoch = 4, i = 300, val_loss = tensor(3899042.)


404it [00:24, 13.53it/s]

epoch = 4, i = 400, val_loss = tensor(3898210.)


503it [00:30,  9.24it/s]

epoch = 4, i = 500, val_loss = tensor(3896995.5000)


602it [00:36,  9.24it/s]

epoch = 4, i = 600, val_loss = tensor(3896649.7500)


703it [00:43,  9.87it/s]

epoch = 4, i = 700, val_loss = tensor(3898321.)


804it [00:49,  9.58it/s]

epoch = 4, i = 800, val_loss = tensor(3898278.7500)


903it [00:56, 11.44it/s]

epoch = 4, i = 900, val_loss = tensor(3898610.5000)


1003it [01:02,  9.35it/s]

epoch = 4, i = 1000, val_loss = tensor(3898332.5000)


1103it [01:08, 11.49it/s]

epoch = 4, i = 1100, val_loss = tensor(3898530.5000)


1202it [01:14, 10.31it/s]

epoch = 4, i = 1200, val_loss = tensor(3898699.7500)


1302it [01:19, 11.65it/s]

epoch = 4, i = 1300, val_loss = tensor(3897389.)


1404it [01:25, 11.26it/s]

epoch = 4, i = 1400, val_loss = tensor(3898890.)


1501it [01:31, 10.08it/s]

epoch = 4, i = 1500, val_loss = tensor(3896753.5000)


1604it [01:37, 13.92it/s]

epoch = 4, i = 1600, val_loss = tensor(3899454.)


1705it [01:42, 13.52it/s]

epoch = 4, i = 1700, val_loss = tensor(3899234.2500)


1805it [01:48, 11.68it/s]

epoch = 4, i = 1800, val_loss = tensor(3897988.)


1899it [01:53, 16.79it/s]


Epoch: 004, Loss: 1602.8644


102it [00:05,  9.98it/s]

epoch = 5, i = 100, val_loss = tensor(3896500.5000)


204it [00:11, 13.41it/s]

epoch = 5, i = 200, val_loss = tensor(3901232.)


304it [00:18, 10.40it/s]

epoch = 5, i = 300, val_loss = tensor(3898653.)


403it [00:23, 10.88it/s]

epoch = 5, i = 400, val_loss = tensor(3898625.)


503it [00:28,  9.94it/s]

epoch = 5, i = 500, val_loss = tensor(3898980.)


602it [00:35,  8.39it/s]

epoch = 5, i = 600, val_loss = tensor(3895722.)


703it [00:41, 12.31it/s]

epoch = 5, i = 700, val_loss = tensor(3897207.7500)


803it [00:47,  9.14it/s]

epoch = 5, i = 800, val_loss = tensor(3897960.)


904it [00:52, 14.81it/s]

epoch = 5, i = 900, val_loss = tensor(3897624.)


1002it [00:58,  8.01it/s]

epoch = 5, i = 1000, val_loss = tensor(3897032.5000)


1103it [01:03, 12.52it/s]

epoch = 5, i = 1100, val_loss = tensor(3898913.)


1204it [01:09, 11.17it/s]

epoch = 5, i = 1200, val_loss = tensor(3897545.5000)


1303it [01:14, 10.67it/s]

epoch = 5, i = 1300, val_loss = tensor(3898348.5000)


1403it [01:20, 11.95it/s]

epoch = 5, i = 1400, val_loss = tensor(3896458.2500)


1503it [01:25, 12.50it/s]

epoch = 5, i = 1500, val_loss = tensor(3897897.5000)


1603it [01:31, 12.41it/s]

epoch = 5, i = 1600, val_loss = tensor(3898014.5000)


1704it [01:37, 12.91it/s]

epoch = 5, i = 1700, val_loss = tensor(3897237.)


1805it [01:43, 13.81it/s]

epoch = 5, i = 1800, val_loss = tensor(3895078.)


1899it [01:48, 17.56it/s]

Epoch: 005, Loss: 1602.6951





### Evaluation

For a given node (user) predict the the other nodes (movies) it will show affinity to (rating)

Given a truly unseen data, splitted by timestamp

Following strategy will be used -
1. get all the user and movie features
2. for a given user find the affinity to all the movies present in the test dataset
3. rank them by affinities
4. evaluate the true ratings vs the predicted ratings

In [201]:
train_ratings_transformed

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,6010,794,4,956703932
1,6010,2175,4,956703954
2,6010,576,5,956703954
3,6010,1769,4,956703977
4,6010,1827,5,956703977
...,...,...,...,...
900184,0,980,4,978133305
900185,0,2215,3,978133334
900186,0,568,4,978133348
900187,0,2463,4,978133367


torch.Size([180036])

In [185]:
# Sampled mini-batch:
# ===================
# HeteroData(
#   user={
#     node_id=[4062],
#     x=[4062, 3],
#     n_id=[4062],
#     num_sampled_nodes=[3],
#   },
#   movie={
#     node_id=[2544],
#     x=[2544, 319],
#     n_id=[2544],
#     num_sampled_nodes=[3],
#   },
#   (user, rates, movie)={
#     edge_index=[2, 14337],
#     edge_label=[128],
#     edge_label_index=[2, 128],
#     e_id=[14337],
#     num_sampled_edges=[2],
#     input_id=[128],
#   },
#   (movie, rev_rates, user)={
#     edge_index=[2, 18510],
#     edge_label=[18510],
#     e_id=[18510],
#     num_sampled_edges=[2],
#   }
# )



HeteroData(
  user={
    node_id=[6011],
    x=[6011, 3],
  },
  movie={
    node_id=[3678],
    x=[3678, 319],
  },
  (user, rates, movie)={
    edge_index=[2, 810171],
    edge_label=[180036],
    edge_label_index=[2, 180036],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 810171],
    edge_label=[810171],
  }
)