In [1]:
from torch_geometric.datasets import MovieLens

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Load the MovieLens dataset stored under ../data/movielens and take its
# element at index 0, which the cell output shows is a HeteroData graph with
# `movie` and `user` nodes connected by (user, rates, movie) edges.
dataset = MovieLens(root='../data/movielens')
data = dataset[0]
# Bare last expression so the notebook renders the graph summary.
data

HeteroData(
  movie={ x=[9742, 404] },
  user={ num_nodes=610 },
  (user, rates, movie)={
    edge_index=[2, 100836],
    edge_label=[100836],
  }
)

In [22]:
# Shape of the movie node-feature matrix: one row per movie, one column per feature.
data['movie'].x.shape

torch.Size([9742, 404])

In [24]:
# Edge storage for the user -> movie relation, addressed by its full
# (source, relation, destination) key rather than the two-element shorthand.
data['user', 'rates', 'movie']

{'edge_index': tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    2,    5,  ..., 9462, 9463, 9503]]), 'edge_label': tensor([4, 4, 4,  ..., 5, 5, 3])}

In [27]:
# Connectivity of the rating edges: a 2 x num_edges tensor of node indices.
data['user', 'rates', 'movie'].edge_index

tensor([[   0,    0,    0,  ...,  609,  609,  609],
        [   0,    2,    5,  ..., 9462, 9463, 9503]])

In [30]:
# Largest index found in each row of edge_index. The maxima (609 and 9741)
# line up with the 610 users and 9742 movies, so row 0 indexes users and
# row 1 indexes movies.
tuple(row.max() for row in data['user', 'rates', 'movie'].edge_index)

(tensor(609), tensor(9741))

In [32]:
# edge_label holds the rating a user gave to a movie; inspect its value range.
(
    data['user', 'rates', 'movie'].edge_label.min(),
    data['user', 'rates', 'movie'].edge_label.max(),
)

(tensor(0), tensor(5))

In [33]:
# The graph has 100836 edges, all of type (user, rates, movie), so reverse
# (movie -> user) edges have not been added yet.
import torch_geometric.transforms as T
from torch_geometric.data import HeteroData

# PyG's `T.ToUndirected()` transform adds them; the output shows a new
# (movie, rev_rates, user) edge type of the same size.
# NOTE(review): `data` is rebound here, so re-running this cell applies the
# transform to data that has already been transformed.
data = T.ToUndirected()(data)
data

HeteroData(
  movie={ x=[9742, 404] },
  user={ num_nodes=610 },
  (user, rates, movie)={
    edge_index=[2, 100836],
    edge_label=[100836],
  },
  (movie, rev_rates, user)={
    edge_index=[2, 100836],
    edge_label=[100836],
  }
)

In [34]:
# Ratings are only needed on the forward (user, rates, movie) edges, so remove
# the copy of the labels that was attached to the reverse edges.
del data['movie', 'rev_rates', 'user']['edge_label']
data

HeteroData(
  movie={ x=[9742, 404] },
  user={ num_nodes=610 },
  (user, rates, movie)={
    edge_index=[2, 100836],
    edge_label=[100836],
  },
  (movie, rev_rates, user)={ edge_index=[2, 100836] }
)

In [36]:
import torch

# RandomLinkSplit picks the held-out edges at random. Seed torch's global RNG
# first so that a fresh "Restart & Run All" reproduces the same split (the
# split is later written to disk, so it must be repeatable). 42 is arbitrary.
torch.manual_seed(42)

# Split the rating edges into train / validation / test graphs:
#   - num_val / num_test: hold out 10% of the edges for each of the two sets.
#   - neg_sampling_ratio=0.0: sample no negative edges; every supervision
#     label is an observed rating.
#   - edge_types: the edge type being split and supervised.
#   - rev_edge_types: its reverse edge type, split consistently with it (the
#     output shows both types with equal edge counts in every split).
train_data, val_data, test_data = T.RandomLinkSplit(
    num_val=0.1,
    num_test=0.1,
    neg_sampling_ratio=0.0,
    edge_types=[('user', 'rates', 'movie')],
    rev_edge_types=[('movie', 'rev_rates', 'user')],
)(data)
train_data, val_data, test_data

(HeteroData(
   movie={ x=[9742, 404] },
   user={ num_nodes=610 },
   (user, rates, movie)={
     edge_index=[2, 80670],
     edge_label=[80670],
     edge_label_index=[2, 80670],
   },
   (movie, rev_rates, user)={ edge_index=[2, 80670] }
 ),
 HeteroData(
   movie={ x=[9742, 404] },
   user={ num_nodes=610 },
   (user, rates, movie)={
     edge_index=[2, 80670],
     edge_label=[10083],
     edge_label_index=[2, 10083],
   },
   (movie, rev_rates, user)={ edge_index=[2, 80670] }
 ),
 HeteroData(
   movie={ x=[9742, 404] },
   user={ num_nodes=610 },
   (user, rates, movie)={
     edge_index=[2, 90753],
     edge_label=[10083],
     edge_label_index=[2, 10083],
   },
   (movie, rev_rates, user)={ edge_index=[2, 90753] }
 ))

In [40]:
from torch_geometric.utils import to_networkx
import networkx as nx

# Convert the test split to a NetworkX graph and write it out as a GEXF file
# alongside the dataset files.
G = to_networkx(test_data)
nx.write_gexf(G, '../data/movielens/movielens_test.gexf')

In [41]:
import torch

# Persist each split to the file named after it, so that anything loading
# 'val.pt' or 'test.pt' gets the validation or test split respectively.
torch.save(train_data, '../data/movielens/train.pt')
torch.save(val_data, '../data/movielens/val.pt')
torch.save(test_data, '../data/movielens/test.pt')
