In [3]:
import os
import numpy as np
import pandas as pd
from ast import literal_eval
import matplotlib.pyplot as plt
import tqdm
import pickle

In [4]:
import torch
import torch.nn as nn
import torch.functional as F
from torch_geometric.data import Data, InMemoryDataset, download_url, Dataset

In [5]:
from torch_geometric.loader import DataLoader

In [33]:
edge_index = torch.tensor([[0, 1, 1, 2],
                           [1, 0, 2, 1]], dtype=torch.long) # graph connectivity in COO format
x = torch.tensor([[-1], [0], [1]], dtype=torch.float) # Node feature matrix (scalers in this case)

In [34]:
data = Data(x = x, edge_index=edge_index)

In [35]:
data  # output describes the shape

Data(x=[3, 1], edge_index=[2, 4])

## Creating custom dataset object
### Smaller DataSet with single graph
*https://pytorch-geometric.readthedocs.io/en/latest/tutorial/create_dataset.html*

Two abstrct dataset classes are provided in PyG - 

    - `torch_geometric.data.Dataset`
    - `torch_geometric.data.InMemeoryDataset`
    

Example 1: Planetoid  Cora Dataset 

In [39]:
nodes = pd.read_csv('data/cora/nodes.csv', index_col=0)
edges = pd.read_csv('data/cora/edges.csv', index_col=0)

In [42]:
nodes

Unnamed: 0,subject,features,id
31336,Neural_Networks,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",31336
1061127,Rule_Learning,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",1061127
1106406,Reinforcement_Learning,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1106406
13195,Reinforcement_Learning,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",13195
37879,Probabilistic_Methods,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",37879
...,...,...,...
1128975,Genetic_Algorithms,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1128975
1128977,Genetic_Algorithms,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1128977
1128978,Genetic_Algorithms,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1128978
117328,Case_Based,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",117328


In [48]:
nodes.id.max()

1155073

In [50]:
edges

Unnamed: 0,target,source,label
0,35,1033,cites
1,35,103482,cites
2,35,103515,cites
3,35,1050679,cites
4,35,1103960,cites
...,...,...,...
5424,853116,19621,cites
5425,853116,853155,cites
5426,853118,1140289,cites
5427,853155,853118,cites


In [45]:
print(f'No. of nodes - {nodes.shape[0]}')
print(f'No. of edges - {edges.shape[0]}')

No. of nodes - 2708
No. of edges - 5429


In [128]:
from typing import Callable, List, Optional, Tuple, Union


class Cora(InMemoryDataset):
    def __init__(self, root, name, transform = None, pre_transform = None):
        self.root = root
        self.name = name
        self.nodes = pd.read_csv(os.path.join(self.raw_dir, 'nodes.csv'), index_col=0, converters={'features': literal_eval})
        self.edges = pd.read_csv(os.path.join(self.raw_dir, 'edges.csv'), index_col=0)
        self.node2id, self.id2node = self.get_node_id()
        self.label2id, self.id2label = self.get_label_id()
        self.transform_data()
        super().__init__(root, transform, pre_transform)
        self.data = torch.load(self.processed_paths[0])
        
        
    @property
    def raw_dir(self):
        return os.path.join(self.root, self.name, 'raw')
    
    @property
    def processed_dir(self):
        return os.path.join(self.root, self.name, 'processed')
    
    @property
    def raw_file_names(self):
        return ['nodes.csv', 'edges.csv']

    @property
    def processed_file_names(self):
        return ['data_1.pt']

    def get_node_id(self):
        node2id = dict()
        id2node = dict()
        nid = 0
        for i, node in enumerate(self.nodes['id']):
            if not (node in node2id):
                node2id[node] = nid
                id2node[nid] = node
                nid += 1
        return node2id, id2node
    
    def get_label_id(self):
        label2id = dict()
        id2label = dict()
        lid = 0

        for i, label in enumerate(self.nodes['subject'].unique()):
            if not (label in label2id):
                label2id[label] = lid
                id2label[lid] = label
                lid += 1

        return label2id, id2label
    
    def transform_data(self):
        self.nodes['id'] = self.nodes['id'].map(self.node2id)
        self.nodes['subject'] = self.nodes['subject'].map(self.label2id)
        self.edges['source'] = self.edges['source'].map(self.node2id)
        self.edges['target'] = self.edges['target'].map(self.node2id)
        self.nodes.sort_values(by = 'id', ascending=True, inplace=True)

    def process(self):
        source = self.edges['source'].values
        target = self.edges['target'].values
        edge_index = torch.tensor(np.stack((source, target)), dtype=torch.long)
        x = self.nodes['features'].values.tolist()
        x = torch.tensor(x)
        y = self.nodes['subject'].values
        y = torch.FloatTensor(y)
        data = Data(x = x, edge_index=edge_index, y = y)
        torch.save(data, self.processed_paths[0])


In [129]:
cora = Cora(root = 'data', name='cora')

Processing...
Done!


In [136]:
cora.x

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])

In [137]:
cora.y

tensor([0., 1., 2.,  ..., 5., 6., 0.])

In [188]:
cora.y

tensor([0., 1., 2.,  ..., 5., 6., 0.])

In [189]:
data_loader = DataLoader(cora, batch_size=12)

In [196]:
len(data_loader)

1

In [193]:
for i, batch in enumerate(data_loader):
    print(batch)
    print(type(batch))
    sample_batch = batch

DataBatch(x=[2708, 1433], edge_index=[2, 5429], y=[2708], batch=[2708], ptr=[2])
<class 'torch_geometric.data.batch.DataBatch'>


In [161]:
sample_batch.batch

tensor([0, 0, 0,  ..., 0, 0, 0])

### Dataset with multiple graphs

Example : ModelNeT

In [6]:
from torch_geometric.datasets import ModelNet, Reddit2
from torch_geometric.loader import NeighborLoader

In [9]:
reddit2 = Reddit2(root='data/Reddit2')

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [11]:
device

device(type='cuda')

In [12]:
data = reddit2[0].to(device, 'x', 'y')

In [13]:
# 
kwargs = {'batch_size': 1024, 'num_workers': 6, 'persistent_workers': True}
train_loader = NeighborLoader(data, input_nodes=data.train_mask,
                              num_neighbors=[25, 10], shuffle=True, **kwargs)

In [14]:
for batch in train_loader:
    sample_batch = batch
    break

In [15]:
sample_batch

Data(x=[111776, 602], edge_index=[2, 226656], y=[111776], train_mask=[111776], val_mask=[111776], test_mask=[111776], n_id=[111776], e_id=[226656], num_sampled_nodes=[3], num_sampled_edges=[2], input_id=[1024], batch_size=1024)

In [18]:
23213838/226656

102.41881088521812