In [18]:
import os
import torch
import pandas as pd
import numpy as np
from torch_geometric.data import download_url, extract_zip, HeteroData
import torch_geometric.transforms as T
from torch_geometric.loader import LinkNeighborLoader, NeighborLoader
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F
import tqdm
from sklearn.metrics import roc_auc_score, f1_score
import matplotlib.pyplot as plt
import networkx as nx

In [21]:
# CSV locations, resolved relative to the notebook's working directory.
movies_path, ratings_path = 'movies.csv', 'ratings.csv'

# Read the movie table, using the `movieId` column as the DataFrame index.
movies_df = pd.read_csv(filepath_or_buffer=movies_path, index_col='movieId')

In [20]:
# Preview only the first rows instead of rendering the whole movie table.
movies_df.head()

In [22]:
# Expand the pipe-separated `genres` strings into one 0/1 indicator column per genre.
genres = movies_df['genres'].str.get_dummies(sep='|')

# The indicator matrix, cast to float, is the input feature tensor for movie nodes
# (one row per row of `movies_df`, in the same order).
movie_feat = torch.from_numpy(genres.to_numpy()).float()

# Read all rating rows into memory at once.
ratings_df = pd.read_csv(filepath_or_buffer=ratings_path)

In [23]:
# Create a mapping from unique USER ids to the contiguous range [0, num_user_nodes).
# Users are numbered in order of first appearance in `ratings_df`.
user_ids = ratings_df['userId'].unique()
unique_user_id = pd.DataFrame(data={
    'userId': user_ids,
    'mappedID': pd.RangeIndex(len(user_ids)),
})

# Create a mapping from MOVIE ids to the contiguous range [0, num_movie_nodes).
# The mapping is built from `movies_df.index` rather than from the movies that
# appear in `ratings_df`, so `mappedID` is the row position of the movie in
# `movies_df`.
unique_movie_id = pd.DataFrame(data={
    'movieId': movies_df.index,
    'mappedID': pd.RangeIndex(len(movies_df)),
})

In [24]:
# Translate the raw `userId` of every rating into its contiguous node index.
# The left merge keeps one output row per rating, in the original rating order.
ratings_user_id = pd.merge(ratings_df['userId'], unique_user_id,
                           on='userId', how='left')
print(ratings_user_id.head())
ratings_user_id = torch.from_numpy(ratings_user_id['mappedID'].values)

# Same translation for the raw `movieId` of every rating.
ratings_movie_id = pd.merge(ratings_df['movieId'], unique_movie_id,
                            on='movieId', how='left')
print(ratings_movie_id.head())

# A left merge fills `mappedID` with NaN for any rated movie that is absent from
# the movie mapping; that would silently turn the index column into floats.
# Fail loudly instead of building a corrupt edge index.
unmapped_movies = ratings_movie_id['mappedID'].isna()
if unmapped_movies.any():
    raise ValueError(
        f"{int(unmapped_movies.sum())} rating(s) reference a movieId that has no "
        "entry in unique_movie_id"
    )

ratings_movie_id = torch.from_numpy(ratings_movie_id['mappedID'].values)

In [25]:
# Stack the two index vectors into a 2 x num_ratings tensor:
# row 0 = mapped user index, row 1 = mapped movie index of each rating.
edge_index_user_to_movie = torch.stack((ratings_user_id, ratings_movie_id))

# Start from an empty heterogeneous graph container and display it.
data = HeteroData()
data

In [26]:
# Every node stores its own contiguous index as `node_id`.
num_users = len(unique_user_id)
num_movies = len(movies_df)
data["user"].node_id = torch.arange(num_users)
data["movie"].node_id = torch.arange(num_movies)

# Movies carry the genre indicator features; users have no input features here.
data["movie"].x = movie_feat

# Rating edges point from users to movies.
data["user", "rates", "movie"].edge_index = edge_index_user_to_movie

# Add the reverse (movie -> user) edges.
add_reverse_edges = T.ToUndirected()
data = add_reverse_edges(data)

In [27]:
# Options for the train/validation/test edge split of the ("user", "rates", "movie")
# relation. See the RandomLinkSplit documentation for the exact meaning of each field.
link_split_options = dict(
    num_val=0.1,                        # share of edges held out for validation
    num_test=0.1,                       # share of edges held out for testing
    disjoint_train_ratio=0.3,           # share of training edges kept for supervision
    neg_sampling_ratio=2.0,             # negative edges generated per positive edge
    add_negative_train_samples=False,   # no pre-generated negatives for the training split
    edge_types=("user", "rates", "movie"),
    rev_edge_types=("movie", "rev_rates", "user"),
)
transform = T.RandomLinkSplit(**link_split_options)

In [28]:
# Apply the link split to the full graph; it yields the three graphs in the
# order (train, validation, test).
splits = transform(data)
train_data, val_data, test_data = splits

In [29]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.loader import LinkNeighborLoader
from sklearn.metrics import roc_auc_score
import tqdm

In [30]:
class GNN(torch.nn.Module):
    """Two-layer GraphSAGE encoder with an optional linear output head.

    Args:
        hidden_channels: Output width of both `SAGEConv` layers.
        num_classes: When given, a `Linear(hidden_channels, num_classes)` head is
            applied to the output of the second convolution. When `None`
            (default), the raw output of the second convolution is returned.
    """

    def __init__(self, hidden_channels, num_classes=None):
        super().__init__()
        # (-1, -1) leaves the input sizes to be inferred lazily on first use.
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), hidden_channels)
        self.classifier = None
        if num_classes is not None:
            # For Node Classification
            self.classifier = torch.nn.Linear(hidden_channels, num_classes)

    def forward(self, x, edge_index):
        """Encode node features `x` over the graph described by `edge_index`.

        Returns the output of `conv2(relu(conv1(x)))`, passed through the linear
        head when one was configured.
        """
        x = F.relu(self.conv1(x, edge_index))
        x = self.conv2(x, edge_index)
        # Explicit None check: do not rely on the truthiness of a module object,
        # which depends on whether the module type happens to define __len__.
        if self.classifier is not None:
            x = self.classifier(x)
        return x

In [31]:
class Classifier(torch.nn.Module):
    """Score candidate user-movie edges by the dot product of their embeddings."""

    def forward(self, x_user, x_movie, edge_label_index):
        """Return one score per column of `edge_label_index`.

        Row 0 of `edge_label_index` indexes into `x_user`, row 1 into `x_movie`;
        the score of an edge is the sum over the last dimension of the
        element-wise product of the two selected embedding rows.
        """
        user_rows = edge_label_index[0]
        movie_rows = edge_label_index[1]
        pairwise = x_user[user_rows] * x_movie[movie_rows]
        return torch.sum(pairwise, dim=-1)

In [32]:
class LinkAndNodePredictionModel(torch.nn.Module):
    """Shared heterogeneous GNN used for link prediction and user-node classification.

    The constructor reads the notebook-level `data` graph to size the embedding
    tables, the movie feature projection, and the `to_hetero` conversion.

    Args:
        hidden_channels: Width of the input embeddings and of the GNN layers.
        num_classes: Forwarded to `GNN`; when not None the GNN ends in a linear
            head of this width, and that head's output is what both tasks consume.
    """

    # Values accepted for the `task` argument of `forward`.
    _TASKS = ('link_prediction', 'node_classification')

    def __init__(self, hidden_channels, num_classes):
        super().__init__()
        # Take the movie feature width from the graph instead of hard-coding it,
        # so the model keeps working if the number of genre columns changes.
        movie_feature_dim = data["movie"].x.size(-1)
        self.movie_lin = torch.nn.Linear(movie_feature_dim, hidden_channels)
        # One learnable embedding per user node and per movie node.
        self.user_emb = torch.nn.Embedding(data["user"].num_nodes, hidden_channels)
        self.movie_emb = torch.nn.Embedding(data["movie"].num_nodes, hidden_channels)
        self.gnn = GNN(hidden_channels, num_classes)
        # Convert the homogeneous GNN into one that operates on the node/edge
        # types listed in the graph's metadata.
        self.gnn = to_hetero(self.gnn, metadata=data.metadata())
        self.classifier = Classifier()

    def forward(self, data, task='link_prediction'):
        """Run the GNN on `data` and return the prediction for `task`.

        Args:
            data: Graph (or sampled sub-graph) providing `node_id` for both node
                types, `x` for movies, and `edge_index_dict`. For link prediction
                it must also provide `edge_label_index` on ("user", "rates", "movie").
            task: 'link_prediction' returns one score per labelled edge;
                'node_classification' returns the GNN output for the user nodes.

        Raises:
            ValueError: If `task` is not one of the supported task names.
        """
        # Reject unknown tasks up front; otherwise `pred` would never be bound
        # and the function would fail with an UnboundLocalError after the GNN pass.
        if task not in self._TASKS:
            raise ValueError(
                f"Unknown task {task!r}; expected one of {self._TASKS}"
            )

        x_dict = {
            "user": self.user_emb(data["user"].node_id),
            # Movies combine projected genre features with a learnable embedding.
            "movie": self.movie_lin(data["movie"].x) + self.movie_emb(data["movie"].node_id),
        }
        x_dict = self.gnn(x_dict, data.edge_index_dict)

        if task == 'link_prediction':
            return self.classifier(
                x_dict["user"],
                x_dict["movie"],
                data["user", "rates", "movie"].edge_label_index,
            )
        # task == 'node_classification': return the GNN output for user nodes.
        return x_dict["user"]

In [33]:
def train_node_classification():
    """Run one full-batch optimisation step of user-node classification.

    Uses the notebook-level `model`, `optimizer`, `train_data` and `device`.
    `train_data["user"].y` must hold the class labels of the user nodes.

    Returns:
        The cross-entropy loss of this step as a Python float.

    Raises:
        ValueError: If `train_data["user"]` has no `y` attribute.
    """
    import copy  # local import: only this function needs it

    # Fail with an actionable message instead of an AttributeError deep inside
    # the loss computation when no labels were attached to the user nodes.
    if getattr(train_data["user"], "y", None) is None:
        raise ValueError(
            "train_data['user'].y is not set; assign class labels to the user "
            "nodes before calling train_node_classification()."
        )

    model.train()
    optimizer.zero_grad()
    # The model lives on `device`, so its inputs must too. Move a shallow copy so
    # the `train_data` object that the loaders were built from is left where it is.
    # NOTE(review): relies on HeteroData.to() only rebinding tensors on the copy's
    # own stores — confirm against the installed torch_geometric version.
    full_graph = copy.copy(train_data).to(device)
    pred = model(full_graph, task='node_classification')
    loss = F.cross_entropy(pred, full_graph["user"].y)
    loss.backward()
    optimizer.step()
    return loss.item()

In [34]:
# Mini-batch loader over the supervision edges of the training graph.
rates_edge_type = ("user", "rates", "movie")
train_supervision = train_data[rates_edge_type]
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 10],      # neighbour sample sizes for the two hops
    neg_sampling_ratio=2.0,      # negatives sampled per positive edge
    edge_label_index=(rates_edge_type, train_supervision.edge_label_index),
    edge_label=train_supervision.edge_label,
    batch_size=128,
    shuffle=True,
)

In [35]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Assuming 10 classes for Node Classification
model = LinkAndNodePredictionModel(hidden_channels=64, num_classes=10)
model = model.to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=0.001)

In [None]:
for epoch in range(1, 10):
    # Link Prediction: one mini-batch pass over the training loader.
    total_loss = 0
    total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()
        sampled_data.to(device)
        pred = model(sampled_data, task='link_prediction')
        ground_truth = sampled_data["user", "rates", "movie"].edge_label
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)
        loss.backward()
        optimizer.step()
        # Weight each batch loss by its number of scored edges so the epoch
        # figure is a per-edge average.
        batch_edges = pred.numel()
        total_loss += loss.item() * batch_edges
        total_examples += batch_edges
    print(f"Epoch: {epoch:03d}, Link Prediction Loss: {total_loss / total_examples:.4f}")

    # Node Classification: a single full-batch step per epoch.
    node_class_loss = train_node_classification()
    print(f"Epoch: {epoch:03d}, Node Classification Loss: {node_class_loss:.4f}")

In [None]:
# Loader over the labelled edges of the validation graph, used to evaluate
# link prediction. Order is fixed (no shuffling).
rates_edge_type = ("user", "rates", "movie")
val_supervision = val_data[rates_edge_type]
val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 10],
    edge_label_index=(rates_edge_type, val_supervision.edge_label_index),
    edge_label=val_supervision.edge_label,
    batch_size=3 * 128,
    shuffle=False,
)

In [None]:
# Collect link-prediction scores and labels over the whole validation loader.
preds = []
ground_truths = []

# Evaluate in eval mode and restore whatever mode the model was in afterwards,
# so re-running the training cell is unaffected by this cell.
was_training = model.training
model.eval()
try:
    with torch.no_grad():  # no gradients are needed for evaluation
        for sampled_data in tqdm.tqdm(val_loader):
            sampled_data.to(device)
            preds.append(model(sampled_data, task='link_prediction'))
            ground_truths.append(sampled_data["user", "rates", "movie"].edge_label)
finally:
    model.train(was_training)

# Concatenate the per-batch results and move them to NumPy for scikit-learn.
pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
auc = roc_auc_score(ground_truth, pred)
print(f"\nValidation AUC: {auc:.4f}")