In [None]:
!pip install -q torch-scatter -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q torch-sparse -f https://data.pyg.org/whl/torch-1.10.0+cu113.html
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git

In [None]:
import os.path as osp
import numpy as np
import torch
from torch_geometric.datasets import AMiner
from torch_geometric.nn import MetaPath2Vec

# MetaPath2Vec

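[Paper: *metapath2vec: Scalable Representation Learning for Heterogeneous Networks* (Dong, Chawla, Swami, KDD 2017)](https://ericdongyx.github.io/papers/KDD17-dong-chawla-swami-metapath2vec.pdf)  
[Code: PyTorch Geometric `metapath2vec.py` example](https://github.com/rusty1s/pytorch_geometric/blob/master/examples/metapath2vec.py)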

In [None]:
# Load the AMiner dataset from ../data/AMiner (path is relative to the working
# directory) and keep its first entry as the graph used by the rest of the notebook.
path = osp.join('..', 'data', 'AMiner')
dataset = AMiner(path)
data = dataset[0]


In [None]:
# Print the loaded graph object to see what it contains.
print(data)

In [None]:
# Inspect the edge-index mapping; its keys are 3-tuples such as
# ('paper', 'written by', 'author').
edges_by_type = data.edge_index_dict
print(type(edges_by_type))
print(edges_by_type['paper', 'written by', 'author'])

In [None]:
# Inspect the per-type node counts stored on the graph.
node_counts = data.num_nodes_dict
print(type(node_counts))
print(node_counts)

In [None]:
# Inspect the label mapping and show the entry stored under "venue".
labels_by_type = data.y_dict
print(type(labels_by_type))
print(labels_by_type["venue"])

In [None]:
# Inspect the label-index mapping and show the entry stored under "venue".
label_index_by_type = data.y_index_dict
print(type(label_index_by_type))
print(label_index_by_type["venue"])

In [None]:
# Select the compute device: CUDA is detected when available, but the second
# assignment below overrides that choice, so `device` always ends up as "cpu".
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = "cpu"

In [None]:
# Meta-path followed by the random walks, one (source, relation, target) hop
# per entry: author -> paper -> venue -> paper -> author.
# NOTE(review): these relation names must match the keys of
# data.edge_index_dict for the installed PyG version -- confirm.
metapath = [
    ('author', 'wrote', 'paper'),
    ('paper', 'published in', 'venue'),
    ('venue', 'published', 'paper'),
    ('paper', 'written by', 'author'),
]

# Build the MetaPath2Vec model on the graph's edges, then place it on `device`.
model = MetaPath2Vec(
    data.edge_index_dict,
    embedding_dim=128,
    metapath=metapath,
    walk_length=5,
    context_size=3,
    walks_per_node=3,
    num_negative_samples=1,
    sparse=True,
)
model = model.to(device)


In [None]:
# Ask the model for its data loader; each batch it yields unpacks into a
# (pos_rw, neg_rw) pair, as the loop in the next cell shows.
loader = model.loader(
    batch_size=128,
    shuffle=True,
    num_workers=3,
)

In [None]:
# Report the shapes of the first ten batches. The loop stops once the batch
# with index 10 has been fetched, so pos_rw / neg_rw keep that batch afterwards.
for idx, (pos_rw, neg_rw) in enumerate(loader):
    if idx == 10:
        break
    print(idx, pos_rw.shape, neg_rw.shape)

In [None]:
# Show the first row of the last fetched positive and negative batches.
first_pos, first_neg = pos_rw[0], neg_rw[0]
print(first_pos, first_neg)

In [None]:
# Initialize the optimizer over all model parameters (the model was built
# with sparse=True, hence the sparse variant of Adam).
trainable_params = list(model.parameters())
optimizer = torch.optim.SparseAdam(trainable_params, lr=0.01)

In [None]:
def train(epoch, log_steps=500, eval_steps=1000):
    """Run one pass over `loader`, optimizing the global `model`.

    Args:
        epoch: Epoch number, used only in the progress messages.
        log_steps: Print the mean loss of the last `log_steps` batches every
            `log_steps` batches.
        eval_steps: Evaluate with `test()` and print the accuracy every
            `eval_steps` batches.
    """
    model.train()

    total_loss = 0
    for step, (pos_rw, neg_rw) in enumerate(loader, start=1):
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if step % log_steps == 0:
            print((f'Epoch: {epoch}, Step: {step:05d}/{len(loader)}, '
                   f'Loss: {total_loss / log_steps:.4f}'))
            # Restart the running sum so each report covers only its own window.
            total_loss = 0

        if step % eval_steps == 0:
            acc = test()
            # test() puts the model in eval mode; switch back before the
            # remaining batches of this epoch are trained.
            model.train()
            print((f'Epoch: {epoch}, Step: {step:05d}/{len(loader)}, '
                   f'Acc: {acc:.4f}'))

@torch.no_grad()
def test(train_ratio=0.1):
    """Score the author embeddings of the global `model` via `model.test`.

    The labelled authors are shuffled and split: the first `train_ratio`
    fraction is passed to `model.test` as the training part and the remainder
    as the test part.

    Args:
        train_ratio: Fraction of labelled authors used as the training part.

    Returns:
        Whatever `model.test` returns (printed as an accuracy by the callers).
    """
    model.eval()

    # The index tensor must sit on the same device as the model's embeddings.
    author_idx = data.y_index_dict['author'].to(device)
    z = model('author', batch=author_idx)
    y = data.y_dict['author']

    # The permutation stays on the CPU because it also indexes `y`.
    perm = torch.randperm(z.size(0))
    n_train = int(z.size(0) * train_ratio)
    train_perm = perm[:n_train]
    test_perm = perm[n_train:]

    return model.test(z[train_perm], y[train_perm], z[test_perm],
                      y[test_perm], max_iter=150)


In [None]:
# Train for a single epoch and report the accuracy reached at its end.
for epoch in range(1, 2):
    train(epoch)
    acc = test()
    print(f'Epoch: {epoch}, Accuracy: {acc:.4f}')

# Persist the learned parameters: the cells further down read them back from
# the file "mymodel", which nothing else in this notebook creates.
torch.save(model.state_dict(), "mymodel")

# load the model

In [None]:
# Second, untrained model with the same constructor arguments as `model`;
# it serves as the container the saved weights are loaded into.
loaded_model = MetaPath2Vec(
    data.edge_index_dict,
    embedding_dim=128,
    metapath=metapath,
    walk_length=5,
    context_size=3,
    walks_per_node=3,
    num_negative_samples=1,
    sparse=True,
)
loaded_model = loaded_model.to(device)

In [None]:
# Peek at the first five values of embedding row 1 of the freshly constructed
# model, as a reference point before any saved weights are loaded.
print(loaded_model.embedding.weight[1][:5])

In [None]:
# load the model
# torch.load returns the saved state dict (it is handed to load_state_dict),
# which has no .detach()/.cpu() methods; map_location places its tensors on
# the target device instead.
loaded_model.load_state_dict(torch.load("mymodel", map_location=device))

In [None]:
# Read the checkpoint while leaving every storage where deserialization put
# it (the hook returns the storage unchanged), then copy the weights in.
def _keep_storage(storage, location):
    return storage

checkpoint = torch.load('mymodel', map_location=_keep_storage)
loaded_model.load_state_dict(checkpoint)

In [None]:
# Same slice as printed before loading; shows the values held after load_state_dict.
print(loaded_model.embedding.weight[1][:5])

In [None]:
# Embeddings of the labelled venues and authors as NumPy arrays.
# The index tensors are moved to the model's device and the results brought
# back with .cpu() before .numpy(), so the cell also works when `device` is a GPU.
venue_idx = data.y_index_dict['venue'].to(device)
author_idx = data.y_index_dict['author'].to(device)
z_venue = loaded_model('venue', batch=venue_idx).detach().cpu().numpy()
z_auth = loaded_model('author', batch=author_idx).detach().cpu().numpy()

In [None]:
# Keep at most the first 100 rows of each embedding matrix for plotting.
z_venue, z_auth = z_venue[:100], z_auth[:100]

In [None]:
import umap

# Project venues and authors with a single UMAP fit so both point sets share
# one 2-D coordinate system; separately fitted projections are not comparable
# when drawn on the same axes.
# (The former `umap.UMAP().fit(data, y)` line was removed: `y` is not defined
# at notebook level and `data` is the graph object, not a feature matrix.)
n_venue = len(z_venue)
z_joint_2d = umap.UMAP().fit_transform(np.concatenate([z_venue, z_auth], axis=0))
z_venue_2d = z_joint_2d[:n_venue]
z_auth_2d = z_joint_2d[n_venue:]

In [None]:
import matplotlib.pyplot as plt

# Overlay the author and venue 2-D points on one square figure.
fig, ax = plt.subplots(figsize=(6, 6))
for points, colour, name in (
    (z_auth_2d, "red", "author"),
    (z_venue_2d, "blue", "venue"),
):
    ax.scatter(points[:, 0], points[:, 1], color=colour, alpha=0.5, label=name)
ax.legend()
ax.set_title("2D embedding")
plt.show()