# NDSSL Link Classification

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
import itertools
from tqdm.auto import tqdm, trange

from torch_geometric.data import Data, InMemoryDataset, DataLoader
from torch_geometric.nn import GCNConv, SAGEConv, InnerProductDecoder
from torch_geometric.data import GraphSAINTRandomWalkSampler
from torch_geometric.utils import get_laplacian, degree, remove_self_loops, add_self_loops, negative_sampling

%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.style as style 
style.use('seaborn-paper')

fontsize = 12
plt.rcParams.update({
    'font.size': fontsize, 
    'axes.labelsize': fontsize, 
    'legend.fontsize': fontsize,
    'xtick.labelsize': fontsize,
    'ytick.labelsize': fontsize,
    'axes.titlesize': fontsize
                    })

In [2]:
#from imports import *
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


## Data processing

Convert the node attribute data into an (x,y) format

In [3]:
node_attributes = pd.read_csv('../../data/NDSSL data/raw/node_attributes.csv')

## one-hot encode gender
gender_index = torch.LongTensor(node_attributes['gender'].values - 1).type(torch.int64).reshape((len(node_attributes), 1))
gender_onehot = torch.LongTensor(len(node_attributes), 2)
gender_onehot.zero_()
gender_onehot = gender_onehot.scatter_(1, gender_index, 1).type(torch.float32);

## one-hot encode worker
worker_index = torch.LongTensor(node_attributes['worker'].values - 1).type(torch.int64).reshape((len(node_attributes), 1))
worker_onehot = torch.LongTensor(len(node_attributes), 2)
worker_onehot.zero_()
worker_onehot = worker_onehot.scatter_(1, worker_index, 1).type(torch.float32);

## map the 117 distinct zipcodes to the integers 0, ..., 116
zipcode_original = node_attributes['zipcode'].values
zipcode_dict = {i: j for j, i in enumerate(set(zipcode_original))} 
zipcode_index = torch.LongTensor(np.asarray([zipcode_dict[i] for i in zipcode_original])).type(torch.int64).reshape((len(node_attributes), 1))

## one-hot encode zipcode
zipcode_onehot = torch.LongTensor(len(node_attributes), len(zipcode_dict))
zipcode_onehot.zero_()
zipcode_onehot = zipcode_onehot.scatter_(1, zipcode_index, 1).type(torch.float32);

## one-hot encode household income
household_income_index = torch.LongTensor(node_attributes['household_income'].values - 1).type(torch.int64).reshape((len(node_attributes), 1))
household_income_onehot = torch.LongTensor(len(node_attributes), 14)
household_income_onehot.zero_()
household_income_onehot = household_income_onehot.scatter_(1, household_income_index, 1).type(torch.float32);

## one-hot encode relationship
relationship_index = torch.LongTensor(node_attributes['relationship'].values - 1).type(torch.int64).reshape((len(node_attributes), 1))
relationship_onehot = torch.LongTensor(len(node_attributes), 4)
relationship_onehot.zero_()
relationship_onehot = relationship_onehot.scatter_(1, relationship_index, 1).type(torch.float32);

age = torch.FloatTensor(node_attributes['age'].values).reshape(len(node_attributes), 1).type(torch.float32)
household_size = torch.FloatTensor(node_attributes['household_size'].values).reshape(len(node_attributes), 1).type(torch.float32)
household_workers = torch.FloatTensor(node_attributes['household_workers'].values).reshape(len(node_attributes), 1).type(torch.float32)
household_vehicles = torch.FloatTensor(node_attributes['household_vehicles'].values).reshape(len(node_attributes), 1).type(torch.float32)

In [4]:
x = torch.cat((gender_onehot, worker_onehot, age, relationship_onehot, zipcode_onehot, \
               household_income_onehot, household_size, household_workers, household_vehicles), dim=1)
print(x.shape, x.dtype)
torch.save(x, '../../data/NDSSL data/raw/x.pt')

torch.Size([1601330, 143]) torch.float32


In [5]:
class NDSSLDataset(InMemoryDataset):
    def __init__(self, root, transform=None, pre_transform=None):
        super(NDSSLDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])
        
    @property
    def raw_file_names(self):
        return ['edge_list.csv', 'x.pt', 'edge_attributes.csv']

    @property
    def processed_file_names(self):
        return ['NDSSL_graph_full.pt']

    def process(self):
        data_list = []
        
        ## load the edge list
        edge_list = pd.read_csv(self.raw_paths[0], dtype=int) - 2000000 #the node id's start at 2000000, shift these to start at 0         
        
        ## format the edge list
        target_nodes = edge_list.iloc[:,0].values
        source_nodes = edge_list.iloc[:,1].values
        edge_index = torch.tensor([source_nodes, target_nodes], dtype=torch.int64)

        ## load the (x,y) formatted data
        x = torch.load(self.raw_paths[1], map_location=torch.device('cpu'))
        #y = torch.load(self.raw_paths[2], map_location=torch.device('cpu'))
        #train_mask = torch.load(self.raw_paths[3], map_location=torch.device('cpu')) == 1 
        #test_mask = torch.load(self.raw_paths[4], map_location=torch.device('cpu')) == 1 

        ## set the edge weights to be the duration (in hours)
        edge_attributes = pd.read_csv(self.raw_paths[2])['duration'].values/3600
        duration =  torch.FloatTensor(edge_attributes)
        ## previous approaches used the degree:
        #row, col = data.edge_index
        #data.edge_attr = (1. / degree(col, data.num_nodes)[col]).double()
        
        ## build the data
        data = Data(edge_index=edge_index, x=x)
        data.edge_weight = duration
        #data.train_mask = train_mask
        #data.test_mask = test_mask
        #data.train_mask = torch.cat((torch.ones(n_train, dtype=torch.bool), torch.zeros(n_val, dtype=torch.bool), torch.zeros(n_test, dtype=torch.bool)), dim=0)
        #data.test_mask = torch.cat((torch.zeros(n_train, dtype=torch.bool), torch.zeros(n_val, dtype=torch.bool), torch.ones(n_test, dtype=torch.bool)), dim=0)

        print(data.__dict__)
        data, slices = self.collate([data])
        torch.save((data, slices), self.processed_paths[0])

In [6]:
## remove old processed files
import shutil
shutil.rmtree('../../data/NDSSL data/processed')

## shuffle the masks
dataset = NDSSLDataset('../../data/NDSSL data/')
dataset.process()
data = dataset[0]

Processing...
{'x': tensor([[1., 0., 1.,  ..., 3., 2., 3.],
        [0., 1., 1.,  ..., 3., 2., 3.],
        [1., 0., 0.,  ..., 3., 2., 3.],
        ...,
        [1., 0., 0.,  ..., 8., 1., 2.],
        [0., 1., 1.,  ..., 1., 1., 1.],
        [0., 1., 0.,  ..., 1., 0., 1.]]), 'edge_index': tensor([[      0,       0,       1,  ..., 1486224, 1378614, 1556530],
        [      1,       2,       2,  ..., 1601329, 1601329, 1601329]]), 'edge_attr': None, 'y': None, 'pos': None, 'norm': None, 'face': None, 'edge_weight': tensor([10.9161, 12.7494, 12.5828,  ...,  0.0497,  0.1667,  0.1667])}
Done!
{'x': tensor([[1., 0., 1.,  ..., 3., 2., 3.],
        [0., 1., 1.,  ..., 3., 2., 3.],
        [1., 0., 0.,  ..., 3., 2., 3.],
        ...,
        [1., 0., 0.,  ..., 8., 1., 2.],
        [0., 1., 1.,  ..., 1., 1., 1.],
        [0., 1., 0.,  ..., 1., 0., 1.]]), 'edge_index': tensor([[      0,       0,       1,  ..., 1486224, 1378614, 1556530],
        [      1,       2,       2,  ..., 1601329, 1601329, 16

In [7]:
data.__dict__

{'x': tensor([[1., 0., 1.,  ..., 3., 2., 3.],
         [0., 1., 1.,  ..., 3., 2., 3.],
         [1., 0., 0.,  ..., 3., 2., 3.],
         ...,
         [1., 0., 0.,  ..., 8., 1., 2.],
         [0., 1., 1.,  ..., 1., 1., 1.],
         [0., 1., 0.,  ..., 1., 0., 1.]]),
 'edge_index': tensor([[      0,       0,       1,  ..., 1486224, 1378614, 1556530],
         [      1,       2,       2,  ..., 1601329, 1601329, 1601329]]),
 'edge_attr': None,
 'y': None,
 'pos': None,
 'norm': None,
 'face': None,
 'edge_weight': tensor([10.9161, 12.7494, 12.5828,  ...,  0.0497,  0.1667,  0.1667])}

In [8]:
import math
from torch_geometric.utils import to_undirected

def train_test_split_big(data, val_ratio=0.05, test_ratio=0.1):
    row, col = data.edge_index
    # data.edge_index = None

    # Return upper triangular portion.
    mask = row < col
    row, col = row[mask], col[mask]

    n_v = int(math.floor(val_ratio * row.size(0)))
    n_t = int(math.floor(test_ratio * row.size(0)))

    # Positive edges.
    perm = torch.randperm(row.size(0))
    row, col = row[perm], col[perm]

    r, c = row[:n_v], col[:n_v]
    data.val_pos_edge_index = torch.stack([r, c], dim=0)
    r, c = row[n_v:n_v + n_t], col[n_v:n_v + n_t]
    data.test_pos_edge_index = torch.stack([r, c], dim=0)

    r, c = row[n_v + n_t:], col[n_v + n_t:]
    data.train_pos_edge_index = torch.stack([r, c], dim=0)
    data.train_pos_edge_index = to_undirected(data.train_pos_edge_index)
    return data

In [9]:
data = train_test_split_big(data, test_ratio=0.1)
train_data = Data(x=data.x, edge_index=data.train_pos_edge_index)
test_data = Data(x=data.x, edge_index=data.test_pos_edge_index)

train_loader = GraphSAINTRandomWalkSampler(train_data, batch_size=6000, walk_length=2,
                                     num_steps=5, sample_coverage=10,
                                     save_dir=None)#dataset.processed_dir)

test_loader = GraphSAINTRandomWalkSampler(test_data, batch_size=6000, walk_length=2,
                                     num_steps=5, sample_coverage=10,
                                     save_dir=None)#dataset.processed_dir)

Compute GraphSAINT normalization: : 17297351it [00:16, 1065732.59it/s]                            
Compute GraphSAINT normalization: : 17680708it [00:09, 1898026.94it/s]                            


In [10]:
## WHAT IS EPS?
EPS = 1e-4

In [11]:
class Net(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(Net, self).__init__()
        self.device = torch.device(device)
        in_channels = dataset.num_node_features
        out_channels = 64
        self.decoder = InnerProductDecoder().to(self.device)
        self.conv1 = SAGEConv(in_channels, hidden_channels)
        self.conv2 = SAGEConv(hidden_channels, hidden_channels)
        self.conv3 = SAGEConv(hidden_channels, hidden_channels)
        self.lin = torch.nn.Linear(3 * hidden_channels, out_channels)

    def set_aggr(self, aggr):
        self.conv1.aggr = aggr
        self.conv2.aggr = aggr
        self.conv3.aggr = aggr

    def forward(self, x0, edge_index, edge_weight=None):
        x1 = F.relu(self.conv1(x0, edge_index, edge_weight))
        x1 = F.dropout(x1, p=0.2, training=self.training)
        x2 = F.relu(self.conv2(x1, edge_index, edge_weight))
        x2 = F.dropout(x2, p=0.2, training=self.training)
        x3 = F.relu(self.conv3(x2, edge_index, edge_weight))
        x3 = F.dropout(x3, p=0.2, training=self.training)
        x = torch.cat([x1, x2, x3], dim=-1)
        x = self.lin(x)
        return x
        #return x.log_softmax(dim=-1)
    
    def recon_loss(self, z, pos_edge_index):
        r"""Given latent variables :obj:`z`, computes the binary cross
        entropy loss for positive edges :obj:`pos_edge_index` and negative
        sampled edges.

        Args:
            z (Tensor): The latent space :math:`\mathbf{Z}`.
            pos_edge_index (LongTensor): The positive edges to train against.
        """
        #set_trace()
        EPS = 1e-4
        pos_loss = -torch.log(
            self.decoder(z, pos_edge_index, sigmoid=True) + EPS).sum()

        # Do not include self-loops in negative samples
        pos_edge_index, _ = remove_self_loops(pos_edge_index)
        pos_edge_index, _ = add_self_loops(pos_edge_index)

        neg_edge_index = negative_sampling(pos_edge_index, z.size(0))
        neg_loss = -torch.log(1 -
                              self.decoder(z, neg_edge_index, sigmoid=True) +
                              EPS).sum()

        return pos_loss + neg_loss
    
    def test(self, z, pos_edge_index, neg_edge_index):
        r"""Given latent variables :obj:`z`, positive edges
        :obj:`pos_edge_index` and negative edges :obj:`neg_edge_index`,
        computes area under the ROC curve (AUC) and average precision (AP)
        scores.

        Args:
            z (Tensor): The latent space :math:`\mathbf{Z}`.
            pos_edge_index (LongTensor): The positive edges to evaluate
                against.
            neg_edge_index (LongTensor): The negative edges to evaluate
                against.
        """
        pos_y = z.new_ones(pos_edge_index.size(1))
        neg_y = z.new_zeros(neg_edge_index.size(1))
        y = torch.cat([pos_y, neg_y], dim=0)

        pos_pred = self.decoder(z, pos_edge_index, sigmoid=True)
        neg_pred = self.decoder(z, neg_edge_index, sigmoid=True)
        pred = torch.cat([pos_pred, neg_pred], dim=0)

        y, pred = y.detach().cpu().numpy(), pred.detach().cpu().numpy()

        return roc_auc_score(y, pred), average_precision_score(y, pred)

In [12]:
model = Net(hidden_channels=256).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [13]:
def train():
    model.train()
    model.set_aggr('add')

    total_loss = total_examples = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data.x, data.edge_index) #, data.edge_norm * data.edge_attr)
        loss = model.recon_loss(out, data.edge_index)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * data.num_nodes
        total_examples += data.num_nodes
    return total_loss / total_examples

@torch.no_grad()
def test():
    model.eval()
    model.set_aggr('mean')

    out = model(data.x.to(device), data.test_pos_edge_index.to(device))
    auc, ap = model.test(out, data.test_pos_edge_index, data.test_neg_edge_index)

    return auc, ap


for epoch in range(1, 51):
    loss = train()
    accs = test()
    print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}, AUC: {accs[0]:.4f}, '
          f'AP: {accs[1]:.4f}')

RuntimeError: CUDA out of memory. Tried to allocate 1.53 GiB (GPU 0; 10.91 GiB total capacity; 8.53 GiB already allocated; 1.16 GiB free; 8.88 GiB reserved in total by PyTorch)