In [1]:

import os.path as osp

import torch
import torch.nn.functional as F
from torch.nn import Linear

import torch_geometric.transforms as T
from torch_geometric.datasets import MovieLens
from torch_geometric.nn import SAGEConv, to_hetero
from torch_geometric.data import HeteroData

import pandas as pd
from sklearn.metrics import log_loss
import warnings
warnings.filterwarnings('ignore')


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# device = torch.device('cpu')


In [11]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        # z = torch.cat([z_dict['user'][row], z_dict['movie'][col]], dim=-1)
        z = torch.cat([z_dict['u'][row], z_dict['v'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)


class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

In [3]:
df = pd.read_csv(r'./ml-latest-small/ratings.csv')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
df.shape

(100836, 4)

In [7]:
df['if'] = df['rating'].apply(lambda x: 1 if x>3.5 else 0)

In [8]:
df['if'].value_counts()

0    52256
1    48580
Name: if, dtype: int64

In [9]:
def load_node(col):
    # x = torch.rand(len(col.unique()), 100, device=device)
    mapping = {id_: i for i, id_ in enumerate(col.unique())}
    x = torch.eye(len(col.unique()))
    return x, mapping

def load_edge(src, dst, u_mapping, v_mapping):
    src_mapping = [u_mapping[u] for u in src.tolist()]
    dst_mapping = [v_mapping[v] for v in dst.tolist()]

    return torch.tensor([src_mapping, dst_mapping], device=device)

In [10]:
data = HeteroData()
ux, u_mapping = load_node(df['userId'])
vx, v_mapping = load_node(df['movieId'])

data['u'].x = ux
data['v'].x = vx
data['u', 'r', 'v'].edge_index = load_edge(df['userId'], df['movieId'], u_mapping, v_mapping)
data['u', 'r', 'v'].edge_label = torch.tensor(df['if'].tolist(), device=device)

data = T.ToUndirected()(data)
del data['v', 'rev_r', 'u'].edge_label
data

HeteroData(
  [1mu[0m={ x=[610, 610] },
  [1mv[0m={ x=[9724, 9724] },
  [1m(u, r, v)[0m={
    edge_index=[2, 100836],
    edge_label=[100836]
  },
  [1m(v, rev_r, u)[0m={ edge_index=[2, 100836] }
)

In [205]:
# HeteroData(
#   movie={ x=[9742, 404] },
#   user={ x=[610, 610] },
#   (user, rates, movie)={
#     edge_index=[2, 100836],
#     edge_label=[100836]
#   },
#   (movie, rev_rates, user)={ edge_index=[2, 100836] }
# )

In [9]:

data = T.AddSelfLoops()(data)
data = T.NormalizeFeatures()(data)

In [12]:
data = data.to(device=device)

train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('u', 'r', 'v')],
    rev_edge_types=[('v', 'rev_r', 'u')],
)(data)

# use_weighted_loss = True
# # We have an unbalanced dataset with many labels for rating 3 and 4, and very
# # few for 0 and 1. Therefore we use a weighted MSE loss.
# if use_weighted_loss:
#     weight = torch.bincount(train_data['u', 'v'].edge_label)
#     weight = weight.max() / weight
# else:
#     weight = None

In [16]:
def train():
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['u', 'v'].edge_label_index)
    target = train_data['u', 'v'].edge_label
    loss = criterion(pred, target.float())
    # loss = weighted_mse_loss(pred, target, weight)
    loss.backward()
    optimizer.step()
    return float(loss)


@torch.no_grad()
def test(data):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['u', 'v'].edge_label_index)
    pred = pred.clamp(min=0, max=1)
    pred = torch.sigmoid(pred)
    target = data['u', 'v'].edge_label
    log_loss_ = log_loss(target.cpu().numpy().astype(int), pred.cpu().numpy())
    
    return float(log_loss_)

In [15]:
# sag = SAGEConv((-1, -1), 32)
# sag_ = to_hetero(sag, data.metadata(), aggr='sum')

In [16]:
# sag(train_data.x_dict['u'], train_data.edge_index_dict[('u', 'r', 'v')])

In [13]:
from torch import nn

In [14]:
model = Model(hidden_channels=32).to(device)
criterion = nn.BCEWithLogitsLoss()

# Due to lazy initialization, we need to run one model step so the number
# of parameters can be inferred:
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [17]:
for epoch in range(1, 10):
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    test_rmse = test(test_data)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

Epoch: 001, Loss: 0.6930, Train: 0.6925, Val: 0.6925, Test: 0.6926
Epoch: 002, Loss: 0.6924, Train: 0.6931, Val: 0.6931, Test: 0.6931
Epoch: 003, Loss: 0.6897, Train: 0.6931, Val: 0.6931, Test: 0.6931
Epoch: 004, Loss: 0.6855, Train: 0.6911, Val: 0.6915, Test: 0.6916
Epoch: 005, Loss: 0.6764, Train: 0.6814, Val: 0.6830, Test: 0.6839
Epoch: 006, Loss: 0.6614, Train: 0.6667, Val: 0.6700, Test: 0.6727
Epoch: 007, Loss: 0.6416, Train: 0.6531, Val: 0.6584, Test: 0.6634
Epoch: 008, Loss: 0.6185, Train: 0.6447, Val: 0.6519, Test: 0.6589
Epoch: 009, Loss: 0.5940, Train: 0.6416, Val: 0.6509, Test: 0.6577


# Movie base

In [21]:
path = r'./'
dataset = MovieLens(path, model_name='all-MiniLM-L6-v2')
data = dataset[0].to(device)

# Add user node features for message passing:
data['user'].x = torch.eye(data['user'].num_nodes, device=device)
del data['user'].num_nodes

# Add a reverse ('movie', 'rev_rates', 'user') relation for message passing:
data = T.ToUndirected()(data)
del data['movie', 'rev_rates', 'user'].edge_label  # Remove "reverse" label.

In [22]:
data

HeteroData(
  [1mmovie[0m={ x=[9742, 404] },
  [1muser[0m={ x=[610, 610] },
  [1m(user, rates, movie)[0m={
    edge_index=[2, 100836],
    edge_label=[100836]
  },
  [1m(movie, rev_rates, user)[0m={ edge_index=[2, 100836] }
)

In [23]:
data['user', 'rates', 'movie'].edge_index.max()

tensor(9741, device='cuda:0')

In [25]:

# Perform a link-level split into training, validation, and test edges:
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'movie')],
    rev_edge_types=[('movie', 'rev_rates', 'user')],
)(data)

use_weighted_loss = True
# We have an unbalanced dataset with many labels for rating 3 and 4, and very
# few for 0 and 1. Therefore we use a weighted MSE loss.
if use_weighted_loss:
    weight = torch.bincount(train_data['user', 'movie'].edge_label)
    weight = weight.max() / weight
else:
    weight = None


def weighted_mse_loss(pred, target, weight=None):
    weight = 1. if weight is None else weight[target].to(pred.dtype)
    return (weight * (pred - target.to(pred.dtype)).pow(2)).mean()


In [26]:
def train():
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['user', 'movie'].edge_label_index)
    target = train_data['user', 'movie'].edge_label
    loss = weighted_mse_loss(pred, target, weight)
    loss.backward()
    optimizer.step()
    return float(loss)


@torch.no_grad()
def test(data):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['user', 'movie'].edge_label_index)
    pred = pred.clamp(min=0, max=5)
    target = data['user', 'movie'].edge_label.float()
    rmse = F.mse_loss(pred, target).sqrt()
    return float(rmse)

In [30]:
class GNNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = SAGEConv((-1, -1), hidden_channels)
        self.conv2 = SAGEConv((-1, -1), out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x


class EdgeDecoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.lin1 = Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, 1)

    def forward(self, z_dict, edge_label_index):
        row, col = edge_label_index
        z = torch.cat([z_dict['user'][row], z_dict['movie'][col]], dim=-1)

        z = self.lin1(z).relu()
        z = self.lin2(z)
        return z.view(-1)


class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.encoder = GNNEncoder(hidden_channels, hidden_channels)
        self.encoder = to_hetero(self.encoder, data.metadata(), aggr='sum')
        self.decoder = EdgeDecoder(hidden_channels)

    def forward(self, x_dict, edge_index_dict, edge_label_index):
        z_dict = self.encoder(x_dict, edge_index_dict)
        return self.decoder(z_dict, edge_label_index)

In [31]:
def train():
    model.train()
    optimizer.zero_grad()
    pred = model(train_data.x_dict, train_data.edge_index_dict,
                 train_data['u', 'v'].edge_label_index)
    target = train_data['u', 'v'].edge_label
    loss = criterion(pred, target.float())
    # loss = weighted_mse_loss(pred, target, weight)
    loss.backward()
    optimizer.step()
    return float(loss)


@torch.no_grad()
def test(data):
    model.eval()
    pred = model(data.x_dict, data.edge_index_dict,
                 data['user', 'movie'].edge_label_index)
    pred = pred.clamp(min=0, max=1)
    pred = torch.sigmoid(pred)
    target = data['user', 'movie'].edge_label
    log_loss_ = log_loss(target.cpu().numpy().astype(int), pred.cpu().numpy())
    
    return float(log_loss_)

In [32]:
model = Model(hidden_channels=32).to(device)

# Due to lazy initialization, we need to run one model step so the number
# of parameters can be inferred:
with torch.no_grad():
    model.encoder(train_data.x_dict, train_data.edge_index_dict)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


In [33]:
for epoch in range(1, 301):
    loss = train()
    train_rmse = test(train_data)
    val_rmse = test(val_data)
    test_rmse = test(test_data)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Train: {train_rmse:.4f}, '
          f'Val: {val_rmse:.4f}, Test: {test_rmse:.4f}')

AttributeError: 'EdgeStorage' object has no attribute 'edge_label_index'