Credits: [Pytorch Geometric Tutorials](https://colab.research.google.com/drive/1xpzn1Nvai1ygd_P5Yambc_oe4VBPK_ZT?usp=sharing)

In [3]:
import torch
from torch import Tensor
# Show the installed PyTorch build; the install cell below exports this value as TORCH
# to select the matching wheel index.
print(torch.__version__)

2.1.0+cu118


In [4]:
# Install required packages.
import os
# Export the PyTorch version so the shell commands below can expand ${TORCH}.
os.environ['TORCH'] = torch.__version__

# `-f` points pip at the wheel index that matches this exact torch build.
!pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-${TORCH}.html
# NOTE(review): this installs PyTorch Geometric from the unpinned head of the repository, so the
# installed code depends on whichever commit is current at install time. Consider pinning a
# release or commit for reproducibility.
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

Looking in links: https://data.pyg.org/whl/torch-2.1.0+cu118.html
Looking in links: https://data.pyg.org/whl/torch-2.1.0+cu118.html
Looking in links: https://data.pyg.org/whl/nightly/torch-2.1.0+cu118.html
Collecting git+https://github.com/pyg-team/pytorch_geometric.git
  Cloning https://github.com/pyg-team/pytorch_geometric.git to /tmp/pip-req-build-6vehwxae
  Running command git clone --filter=blob:none --quiet https://github.com/pyg-team/pytorch_geometric.git /tmp/pip-req-build-6vehwxae
  Resolved https://github.com/pyg-team/pytorch_geometric.git to commit 9adb8d082cd1ea45394e137085404a9aacf4e261
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


### Loading datasets <br>
They are already split into train, test, validation sets during preprocessing.

In [None]:
# Upload data folder from github to ./data/spotify_small/

In [30]:
# loading dataset
import torch
from torch_geometric.data import HeteroData
from torch import Tensor

# NOTE(review): torch.load unpickles the file, which can execute arbitrary code; only load
# files from a trusted source.
# NOTE(review): `data` is rebound from the AmazonBook dataset in a later cell, so the object
# loaded here looks unused by the rest of the notebook. Confirm this cell is still needed.
data = torch.load('./data/data_object.pt')
data

Data(edge_index=[2, 1318992], num_nodes=14992)

In [38]:
from torch_geometric.datasets import AmazonBook

# Instantiate the AmazonBook dataset under `root` and print a short summary of it.
dataset = AmazonBook(root='../data/amazonbook/')

summary_lines = (
    f'Dataset: {dataset}:',
    '======================',
    f'Number of graphs: {len(dataset)}',
    f'Number of features: {dataset.num_features}',
)
for summary_line in summary_lines:
    print(summary_line)

Downloading https://raw.githubusercontent.com/gusye1234/LightGCN-PyTorch/master/data/amazon-book/user_list.txt
Downloading https://raw.githubusercontent.com/gusye1234/LightGCN-PyTorch/master/data/amazon-book/item_list.txt
Downloading https://raw.githubusercontent.com/gusye1234/LightGCN-PyTorch/master/data/amazon-book/train.txt
Downloading https://raw.githubusercontent.com/gusye1234/LightGCN-PyTorch/master/data/amazon-book/test.txt
Processing...


Dataset: AmazonBook():
Number of graphs: 1
Number of features: {'user': 0, 'book': 0}


Done!


In [41]:
# The dataset holds a single graph; take it out of the dataset wrapper.
# NOTE: this rebinds `data`, replacing the object loaded from disk in the earlier cell.
data = dataset[0]
data

HeteroData(
  user={ num_nodes=52643 },
  book={ num_nodes=91599 },
  (user, rates, book)={
    edge_index=[2, 2380730],
    edge_label_index=[2, 603378],
  },
  (book, rated_by, user)={ edge_index=[2, 2380730] }
)

In [43]:
# `T` was used below without ever being imported, which raises NameError on a fresh kernel.
import torch_geometric.transforms as T

# Split the (user, rates, book) edges into train / validation / test sets for link prediction.
# The reverse edge type is passed as well so the transform can keep it in sync with the
# forward edges.
transform = T.RandomLinkSplit(
    num_val=0.1,  # Validation set 10%
    num_test=0.1,  # Test set 10%
    disjoint_train_ratio=0.3,  # 30% of training edges are held out for supervision only
    neg_sampling_ratio=2.0,  # negative samples to positive samples ratio
    # No negatives are added to the training split here; the training loader is
    # configured with its own neg_sampling_ratio and samples them per batch.
    add_negative_train_samples=False,
    edge_types=("user", "rates", "book"),
    rev_edge_types=("book", "rated_by", "user"),
)

# Each split is a HeteroData object: `edge_index` holds the message-passing edges,
# `edge_label_index` / `edge_label` hold the supervision edges and their labels.
train_data, val_data, test_data = transform(data)
train_data, val_data, test_data


(HeteroData(
   user={ num_nodes=52643 },
   book={ num_nodes=91599 },
   (user, rates, book)={
     edge_index=[2, 1333209],
     edge_label_index=[2, 571375],
     edge_label=[571375],
   },
   (book, rated_by, user)={ edge_index=[2, 1333209] }
 ),
 HeteroData(
   user={ num_nodes=52643 },
   book={ num_nodes=91599 },
   (user, rates, book)={
     edge_index=[2, 1904584],
     edge_label_index=[2, 714219],
     edge_label=[714219],
   },
   (book, rated_by, user)={ edge_index=[2, 1904584] }
 ),
 HeteroData(
   user={ num_nodes=52643 },
   book={ num_nodes=91599 },
   (user, rates, book)={
     edge_index=[2, 2142657],
     edge_label_index=[2, 714219],
     edge_label=[714219],
   },
   (book, rated_by, user)={ edge_index=[2, 2142657] }
 ))

## Batching
Large graphs are difficult to hold in memory, so we train on sampled subgraphs instead. We can use `LinkNeighborLoader` to sample these subgraphs, and we can specify the number of hops 'n' and, at each hop, the number of neighbors 'i' to sample.

In [44]:
from torch_geometric.loader import LinkNeighborLoader

# Seed edges: the supervision edges that mini-batches are drawn from.
# For training these are the (user, rates, book) supervision edges of `train_data`.
train_edge_store = train_data["user", "rates", "book"]
edge_label_index = train_edge_store.edge_label_index
edge_label = train_edge_store.edge_label

train_loader = LinkNeighborLoader(
    data=train_data,
    edge_label_index=(("user", "rates", "book"), edge_label_index),
    edge_label=edge_label,
    # One entry per hop: sample 30 neighbors at the 1st hop and 20 at the 2nd hop.
    num_neighbors=[30, 20],
    # Negative edges are sampled by the loader for every batch.
    neg_sampling_ratio=2.0,
    batch_size=128,
    shuffle=True,
)


## Model Definition

Let's define a graph neural network. There are multiple possible layer types, but we will use `nn.SAGEConv` initially. As there are two types of nodes in the graph, we also need to convert the model to a heterogeneous one.

In [47]:
from torch_geometric.nn import SAGEConv, to_hetero
import torch.nn.functional as F

class GNN(torch.nn.Module):
    """Two stacked SAGEConv layers that keep the feature width at ``hidden_channels``.

    Written for a single ``x`` / ``edge_index`` pair (homogeneous graph).
    """

    def __init__(self, hidden_channels):
        """Create both convolution layers, each mapping hidden_channels -> hidden_channels."""
        super().__init__()

        width = hidden_channels
        self.conv1 = SAGEConv(width, width)
        self.conv2 = SAGEConv(width, width)

    def forward(self, x: Tensor, edge_index: Tensor) -> Tensor:
        """Run conv1, a ReLU, then conv2; no activation is applied after conv2."""
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        return self.conv2(hidden, edge_index)


# Edge-level scoring head: one dot-product score per supervision edge.
class Classifier(torch.nn.Module):
    """Score each supervision edge by the dot product of its endpoint embeddings."""

    def forward(self, user: Tensor, book: Tensor, edge_label_index: Tensor) -> Tensor:
        """Return one score per column of ``edge_label_index``.

        Row 0 of ``edge_label_index`` indexes into ``user`` and row 1 into ``book``.
        """
        user_rows = edge_label_index[0]
        book_rows = edge_label_index[1]

        # Gather the embedding of each edge's endpoints, multiply element-wise and
        # reduce over the last (feature) dimension.
        pairwise_product = user[user_rows] * book[book_rows]
        return pairwise_product.sum(dim=-1)


class Model(torch.nn.Module):
    """Link-prediction model: learned node embeddings -> heterogeneous GNN -> dot-product scores.

    The embedding table sizes and the graph metadata are read from the notebook-level
    ``train_data`` object, so that object must exist before the model is constructed.
    """

    def __init__(self, hidden_channels):
        """Build the embedding tables, the heterogeneous GNN and the classifier head."""
        super().__init__()
        # One learnable [hidden_channels] vector per user and per book.
        num_users = train_data["user"].num_nodes
        num_books = train_data["book"].num_nodes
        self.user_emb = torch.nn.Embedding(num_users, hidden_channels)
        self.book_emb = torch.nn.Embedding(num_books, hidden_channels)

        # GNN is defined for a homogeneous graph; to_hetero converts it using the
        # node and edge types listed in the training graph's metadata.
        self.gnn = to_hetero(GNN(hidden_channels), metadata=train_data.metadata())

        self.classifier = Classifier()

    def forward(self, data: HeteroData) -> Tensor:
        """Return one prediction per (user, rates, book) supervision edge in ``data``."""
        # Look up embeddings by each node's `n_id`
        # (presumably the node's index in the full graph; confirm against the loader).
        user_ids = data["user"].n_id
        book_ids = data["book"].n_id
        x_dict = {
            "user": self.user_emb(user_ids),
            "book": self.book_emb(book_ids),
        }

        # `x_dict` holds the feature matrix of every node type,
        # `edge_index_dict` the edge indices of every edge type.
        x_dict = self.gnn(x_dict, data.edge_index_dict)

        supervision_edges = data["user", "rates", "book"].edge_label_index
        return self.classifier(x_dict["user"], x_dict["book"], supervision_edges)


# Build the model with 64-dimensional embeddings / hidden features and print its module tree.
model = Model(hidden_channels=64)
print(model)

Model(
  (user_emb): Embedding(52643, 64)
  (book_emb): Embedding(91599, 64)
  (gnn): GraphModule(
    (conv1): ModuleDict(
      (user__rates__book): SAGEConv(64, 64, aggr=mean)
      (book__rated_by__user): SAGEConv(64, 64, aggr=mean)
    )
    (conv2): ModuleDict(
      (user__rates__book): SAGEConv(64, 64, aggr=mean)
      (book__rated_by__user): SAGEConv(64, 64, aggr=mean)
    )
  )
  (classifier): Classifier()
)


## Training

In [49]:
import tqdm # For progress bar
import torch.nn.functional as F
# NOTE(review): torch_sparse is not referenced in this cell; kept in case it is imported
# for its side effects. Confirm whether it is still needed.
import torch_sparse

# Number of passes over the training loader.
NUM_EPOCHS = 4

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: '{device}'")

model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Make sure the model is in training mode, even if an evaluation cell ran before this one.
model.train()
for epoch in range(1, NUM_EPOCHS + 1):
    total_loss = total_examples = 0
    for sampled_data in tqdm.tqdm(train_loader):
        optimizer.zero_grad()

        # Move the sampled mini-batch to the same device as the model.
        sampled_data = sampled_data.to(device)
        # Call the module itself rather than `.forward` so that registered hooks run.
        pred = model(sampled_data)
        # Binary cross entropy on raw scores (logits) against the 0/1 edge labels.
        ground_truth = sampled_data["user", "rates", "book"].edge_label
        loss = F.binary_cross_entropy_with_logits(pred, ground_truth)

        loss.backward()
        optimizer.step()
        # Weight the batch loss by its number of predictions so the epoch figure
        # is a per-example average.
        total_loss += float(loss) * pred.numel()
        total_examples += pred.numel()
    print(f"Epoch: {epoch:03d}, Loss: {total_loss / total_examples:.4f}")

Device: 'cpu'


100%|██████████| 4464/4464 [22:40<00:00,  3.28it/s]


Epoch: 001, Loss: 0.4436


100%|██████████| 4464/4464 [22:25<00:00,  3.32it/s]


Epoch: 002, Loss: 0.3171


100%|██████████| 4464/4464 [23:17<00:00,  3.19it/s]


Epoch: 003, Loss: 0.2792


100%|██████████| 4464/4464 [23:24<00:00,  3.18it/s]

Epoch: 004, Loss: 0.2535





## Evaluation


In [50]:
# Validation seed edges: the labelled (user, rates, book) edges of the validation split.
val_edge_store = val_data["user", "rates", "book"]
edge_label_index = val_edge_store.edge_label_index
edge_label = val_edge_store.edge_label

val_loader = LinkNeighborLoader(
    data=val_data,
    edge_label_index=(("user", "rates", "book"), edge_label_index),
    edge_label=edge_label,
    # Sample 20 neighbors at the 1st hop and 10 at the 2nd hop.
    num_neighbors=[20, 10],
    batch_size=3 * 128,
    # Keep the seed edges in their stored order.
    shuffle=False,
)

In [51]:
from sklearn.metrics import roc_auc_score, precision_score, recall_score

# Cut-off applied to the raw model scores (logits) to obtain hard 0/1 predictions.
# NOTE(review): -0.5 is the value the original analysis used; a probability of 0.5
# corresponds to a logit of 0. Confirm this threshold is intentional.
DECISION_THRESHOLD = -0.5

# Switch to inference mode before scoring the validation edges.
model.eval()

preds = []
ground_truths = []
for sampled_data in tqdm.tqdm(val_loader):
    with torch.no_grad():
        # The batch must live on the same device as the model (set up in the training cell).
        sampled_data = sampled_data.to(device)
        preds.append(model(sampled_data))
        ground_truths.append(sampled_data["user", "rates", "book"].edge_label)

# Concatenate the per-batch results into flat NumPy arrays.
pred = torch.cat(preds, dim=0).cpu().numpy()
ground_truth = torch.cat(ground_truths, dim=0).cpu().numpy()
print(pred)
auc = roc_auc_score(ground_truth, pred)
print()
print(f"Validation AUC: {auc:.4f}")

# Threshold once (vectorized) and reuse the labels for both metrics.
pred_label = (pred > DECISION_THRESHOLD).astype(int)

prec = precision_score(y_true=ground_truth, y_pred=pred_label)
print()
print(f"Validation Precision: {prec:.4f}")

rec = recall_score(y_true=ground_truth, y_pred=pred_label)
print()
print(f"Validation Recall: {rec:.4f}")




100%|██████████| 1860/1860 [02:35<00:00, 11.93it/s]


[ 0.34676635  1.140393   -0.7163358  ... -6.524456   -7.8290877
 -7.784643  ]

Validation AUC: 0.9407

Validation Precision: 0.7751

Validation Recall: 0.8587


In [52]:
ground_truth, pred

(array([1., 1., 1., ..., 0., 0., 0.], dtype=float32),
 array([ 0.34676635,  1.140393  , -0.7163358 , ..., -6.524456  ,
        -7.8290877 , -7.784643  ], dtype=float32))

In [53]:
import numpy as np

# Sweep 100 evenly spaced logit thresholds in [-1, 1] and score the resulting hard labels.
# NOTE: roc_auc_score on hard 0/1 labels describes a single operating point (it equals
# balanced accuracy), so these values are not comparable to the ranking AUC computed
# from the raw scores.
# Thresholding is vectorized with NumPy instead of a Python loop over every prediction.
auc_lst = [
    roc_auc_score(ground_truth, (pred > threshold).astype(int))
    for threshold in np.linspace(-1, 1, 100)
]

In [54]:
# Best score obtained over the swept thresholds.
max(auc_lst)

0.8694654580737842

In [56]:
import pandas as pd
import numpy as np

# Supervision edges of the validation split and their 0/1 labels.
val_edge_store = val_data[('user', 'rates', 'book')]
user_books = val_edge_store['edge_label_index'].cpu().numpy()
actual_edge_exists = val_edge_store['edge_label'].cpu().numpy()

# `pred` was collected from a loader built over these same edges with shuffle=False,
# so row i of `pred` is expected to belong to column i of `user_books`.
# Guard at least against a length mismatch.
assert user_books.shape[1] == len(actual_edge_exists) == len(pred), \
    "pred does not line up with the validation supervision edges"

# Build the frame column by column so the user/book ids keep their integer dtype
# (stacking everything into one array would promote the ids to float).
val_pred = pd.DataFrame({
    'user': user_books[0],
    'book': user_books[1],
    'edge_exists': actual_edge_exists,
    'sim_score': pred,
})
val_pred.head()

Unnamed: 0,user,book,edge_exists,sim_score
0,10183.0,25721.0,1.0,0.346766
1,82.0,684.0,1.0,1.140393
2,35061.0,60284.0,1.0,-0.716336
3,27295.0,7055.0,1.0,1.01674
4,39948.0,28797.0,1.0,1.255421


In [58]:
from sklearn.metrics import precision_score, recall_score

# Number of top-ranked books per user that count as a positive prediction.
k = 20

# Dense rank of every candidate book within its user, highest score first (rank 1 = best).
val_pred['rank'] = val_pred.groupby('user')['sim_score'].rank(method='dense', ascending=False)
# Vectorized top-k flag, replacing a row-wise DataFrame.apply over every row.
# NOTE: the column name hard-codes 20; it is kept because later cells read it.
val_pred['pred_top20'] = (val_pred['rank'] <= k).astype(int)

recall_score(val_pred['edge_exists'],val_pred['pred_top20']), precision_score(val_pred['edge_exists'],val_pred['pred_top20'])


(0.9135853288697164, 0.3243905931202348)

In [59]:
# NOTE: both arguments are hard 0/1 labels (the top-k flag, not the raw score), so this
# describes a single operating point rather than a ranking AUC over `sim_score`.
roc_auc_score(val_pred['edge_exists'],val_pred['pred_top20'])

0.48111083575205926

In [60]:
# Inspect the training split (message-passing edges and supervision edges).
train_data

HeteroData(
  user={ num_nodes=52643 },
  book={ num_nodes=91599 },
  (user, rates, book)={
    edge_index=[2, 1333209],
    edge_label_index=[2, 571375],
    edge_label=[571375],
  },
  (book, rated_by, user)={ edge_index=[2, 1333209] }
)

In [61]:
# Inspect the validation split for comparison with the training split.
val_data

HeteroData(
  user={ num_nodes=52643 },
  book={ num_nodes=91599 },
  (user, rates, book)={
    edge_index=[2, 1904584],
    edge_label_index=[2, 714219],
    edge_label=[714219],
  },
  (book, rated_by, user)={ edge_index=[2, 1904584] }
)

In [64]:
# Show the validation labels next to the (user, book) index pairs they belong to.
val_rates_store = val_data[('user', 'rates', 'book')]
val_rates_store['edge_label'], val_rates_store['edge_label_index']

(tensor([1., 1., 1.,  ..., 0., 0., 0.]),
 tensor([[10183,    82, 35061,  ..., 42522, 18647, 43440],
         [25721,   684, 60284,  ..., 56825, 14807, 14095]]))