In [1]:
import copy
import json
import os
import pickle
from typing import Tuple, Union, List

import networkx as nx
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from tqdm.notebook import tqdm as tqdm

# import node2vec

import torch
import torch.nn as nn

import torch_geometric as pyg
from torch_geometric.utils.convert import from_networkx

from utils import *
from dataset import *
from train import train, test, device

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from IPython.display import clear_output

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Load the pickled NetworkX graph. `nx.read_gpickle` was removed in NetworkX 3.0,
# so the file is read with the standard `pickle` module instead.
# NOTE(security): unpickling can execute arbitrary code - only load trusted files.
with open('data/network.gpickle', 'rb') as graph_file:
    G = pickle.load(graph_file)

In [3]:
# Convert the NetworkX graph into a PyTorch Geometric `Data` object and move it to
# the compute device. At the moment it only serves as storage for the node
# embeddings; the plan is to also use it for computing node embeddings later on.
pyg_graph = from_networkx(G).to(device)
pyg_graph

  data[key] = torch.tensor(value)


Data(x=[60789, 128], edge_index=[2, 151294], lng=[60789], lat=[60789], id=[60789], dist=[151294])

$$T(a) = \sum_{b \neq a,\; b \in \mathrm{Train}} T(b)\, w(a, b), \quad \textrm{where the sum runs over the } \mathbf{k} \textrm{ nearest neighbors of } a.$$

$$ w(a,b) = \frac{e^{u(a,b)}}{\sum_{b' \neq a} e^{u(a, b')}} ;$$

$u(a, b) = f(x_a, x_b, dist_{a,b}) = MLP(concat[hadamard(x_a, x_b), dist_{a, b}]);$

$hadamard(x_a, x_b)$ is the element-wise (per-coordinate) product of $x_a$ and $x_b$;

$x_a$ is an embedding for a node **a**;

So, for this model, the parameters being optimized are those of the $MLP$.

In [20]:
class MLP(nn.Module):
    """Feed-forward scorer mapping ``input_dim`` features to a single value.

    Architecture: two hidden blocks of ``Linear -> ReLU -> BatchNorm1d -> Dropout(0.2)``
    with widths 128 and 64, followed by a final ``Linear(64, 1)``.
    """

    def __init__(self, input_dim):
        super().__init__()

        # Consecutive pairs of widths define the hidden blocks.
        widths = (input_dim, 128, 64)

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [
                nn.Linear(fan_in, fan_out),
                nn.ReLU(),
                nn.BatchNorm1d(fan_out),
                nn.Dropout(0.2),
            ]
        # Output head producing one score per input row.
        stack.append(nn.Linear(widths[-1], 1))

        self.layers = nn.Sequential(*stack)

    def forward(self, X):
        """Run ``X`` through the layer stack and return the result."""
        return self.layers(X)



class Estimator(nn.Module):
    """Attention-weighted interpolation over the nearest observed nodes.

    For every query node the model finds its nearest observed nodes (by haversine
    distance on ``lat``/``lng``), scores each (query, neighbor) pair with an MLP
    applied to ``concat[x_query * x_neighbor, dist]``, normalizes the scores with
    a softmax and returns the weighted sum of the neighbors' targets.

    Args:
        pyg_graph: graph object providing the per-node tensors ``id``, ``lat``,
            ``lng`` and ``x`` (node embeddings).
        observations: pair ``(nodes, targets)`` of observed node ids and their
            target values. Both are indexed with integer arrays, so they must
            support that (the notebook passes torch tensors).
        neighbors_num: number of nearest observed nodes queried per node.

    Raises:
        ValueError: if ``neighbors_num`` is smaller than 2 (training mode drops
            the closest neighbor, which would leave nothing to interpolate from).
    """

    def __init__(self, pyg_graph: pyg.data.Data, observations: Tuple[List, List],
                 neighbors_num: int = 15) -> None:
        super().__init__()

        if neighbors_num < 2:
            raise ValueError(f'neighbors_num must be at least 2, got {neighbors_num}')

        self.g = pyg_graph
        self.obs_nodes = observations[0]
        self.obs_targets = observations[1]

        # MLP input = element-wise product of two embeddings plus one distance feature,
        # so its width follows the embedding size instead of a hard-coded 128.
        self.mlp = MLP(self.g.x.shape[-1] + 1)

        self.NEIGHBORS_NUM = neighbors_num

        # Vectorized lookup: node id -> row index in the graph tensors.
        self.node_to_gidx = np.vectorize(dict(zip(self.g.id.detach().cpu().numpy(), range(len(self.g.id)))).get)

        # Nearest-neighbor index built over the observed nodes only.
        # NOTE(review): sklearn's 'haversine' metric expects [lat, lng] in radians -
        # confirm that `g.lat` / `g.lng` are stored in radians rather than degrees.
        self.neighbors = NearestNeighbors(n_neighbors=self.NEIGHBORS_NUM, metric='haversine')
        obs_nodes_indices = self.node_to_gidx(self.obs_nodes)
        self.neighbors.fit(self._coords(obs_nodes_indices))

        # These parameters are registered but not read in forward(); they are kept so
        # that the module's parameter set and state_dict keys stay unchanged.
        self.k = nn.Parameter(torch.rand(1))
        self.lambda_1 = nn.Parameter(torch.rand(1))
        self.lambda_2 = nn.Parameter(torch.rand(1))

    def _coords(self, graph_indices):
        """Return a CPU tensor with one ``[lat, lng]`` row per given graph index."""
        return torch.vstack([self.g.lat[graph_indices], self.g.lng[graph_indices]]).T.detach().cpu()

    def forward(self, X):
        """Predict a target for every node id in ``X``.

        Args:
            X: tensor of node ids (ids must be present in ``g.id``).

        Returns:
            Tensor with one interpolated value per entry of ``X``.
        """
        # getting nearest observed nodes
        X_indices = self.node_to_gidx(X.detach().cpu())
        dists, indices = self.neighbors.kneighbors(self._coords(X_indices))
        # converting dists to km (haversine output scaled by the Earth radius)
        dists = dists * 6371

        # In training mode the closest neighbor is dropped so a node is not
        # interpolated from itself. This assumes training queries are part of the
        # observed set; in eval mode all neighbors are kept.
        if self.training:
            dists, indices = dists[:, 1:], indices[:, 1:]

        flat_indices = indices.reshape(-1)
        observations = self.obs_targets[flat_indices].reshape(*indices.shape)

        dists, observations = torch.as_tensor(dists).to(device), torch.as_tensor(observations).to(device)

        # finding corresponding node embeddings of the neighbors
        neighbors_indices = self.node_to_gidx(self.obs_nodes[flat_indices])
        neighbors_embeds = self.g.x[neighbors_indices].reshape(*indices.shape, -1)

        # computing similarities between each node and its neighbors
        X_embeds = self.g.x[X_indices]

        pair_features = torch.cat([torch.mul(X_embeds[:, None].expand(-1, neighbors_embeds.shape[1], -1),
                                             neighbors_embeds),
                                   dists[:, :, None]
                                   ], dim=2)

        # The MLP works on 2-D input: flatten (node, neighbor) pairs, then restore.
        scores = self.mlp(pair_features.reshape(-1, pair_features.shape[-1]).float())
        scores = scores.reshape(-1, neighbors_embeds.shape[1])

        # normalization: softmax over each node's neighbors
        att_weights = nn.functional.softmax(scores, dim=-1)

        # interpolation: attention-weighted sum of the neighbors' targets
        result = torch.sum(att_weights.mul(observations), dim=-1)

        return result

In [21]:
# Per-day test scores, keyed by 'day_<n>'.
results = {}

In [22]:
# Train and evaluate one Estimator per daily dataset and store its test score
# in `results` under the key 'day_<n>'.
loss_fn = nn.HuberLoss(delta=20).to(device)

for day in tqdm(range(1, 21)):
    path = f'datasets/day_{day}.dat'

    # Days whose dataset cannot be loaded are skipped, but the reason is reported
    # so that a systematic failure does not silently yield empty results.
    # `except Exception` (rather than a bare `except`) lets KeyboardInterrupt through.
    # NOTE(security): torch.load unpickles the file - only load trusted datasets.
    try:
        ds = torch.load(path)
    except Exception as err:
        print(f'Skipping day {day}: could not load {path} ({err!r})')
        continue

    train_loader, val_loader, test_loader = make_data_loaders_from_dataset(ds, train_batch_size=64)

    # The training samples double as the observed set the Estimator interpolates from.
    train_batches = list(train_loader)
    train_nodes = torch.cat([batch[0] for batch in train_batches])
    train_targets = torch.cat([batch[1] for batch in train_batches])

    model = Estimator(pyg_graph, (train_nodes, train_targets)).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    best_model = train(model, train_loader, val_loader, loss_fn, optimizer, num_epochs=30, plotting=False)
    test_loss, test_score = test(best_model, test_loader, loss_fn)

    results[f'day_{day}'] = test_score
    print(f'Day {day}, Test loss: {test_loss:.4f}, test score: {test_score:.4f}')

  0%|          | 0/20 [00:00<?, ?it/s]

Day 1, Test loss: 41.6902, test score: 0.9322
Day 2, Test loss: 43.1477, test score: 0.8884
Day 3, Test loss: 36.8675, test score: 0.9031
Day 4, Test loss: 38.9109, test score: 0.8825
Day 5, Test loss: 44.6251, test score: 0.9180
Day 6, Test loss: 43.7092, test score: 0.8600
Day 7, Test loss: 42.5549, test score: 0.9178
Day 8, Test loss: 46.4163, test score: 0.9267
Day 9, Test loss: 57.7076, test score: 0.9236
Day 10, Test loss: 49.4633, test score: 0.9149
Day 11, Test loss: 42.1883, test score: 0.9280
Day 12, Test loss: 49.6712, test score: 0.9011
Day 13, Test loss: 43.9908, test score: 0.9571
Day 14, Test loss: 47.2227, test score: 0.9414
Day 15, Test loss: 42.9286, test score: 0.9393
Day 16, Test loss: 57.7090, test score: 0.9298
Day 17, Test loss: 57.1782, test score: 0.9279
Day 18, Test loss: 38.4185, test score: 0.9398
Day 19, Test loss: 43.2382, test score: 0.9364
Day 20, Test loss: 44.6762, test score: 0.8789


In [23]:
# Persist the per-day test scores as JSON. The output directory is created first
# so that a missing folder cannot discard the results of the training runs.
results_path = 'results/gnn1.json'
os.makedirs(os.path.dirname(results_path), exist_ok=True)

with open(results_path, mode='w') as f:
    json.dump(results, f, indent=4)