In [7]:
import sys
sys.path.append('../')

import glob

from typing import Tuple, Union, List
import pandas as pd
import numpy as np
import networkx as nx
from tqdm.notebook import tqdm as tqdm

import torch
import torch.nn as nn

import torch_geometric as pyg
from torch_geometric.utils.convert import from_networkx

from src.utils import *
from dataset import *
from src.train import train, test
from src.dataloaders import make_dataloaders_from_dataset
from src.model import KnnEstimator

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from IPython.display import clear_output

%matplotlib inline
%load_ext autoreload
%autoreload 2

device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
G = nx.readwrite.read_gpickle('data/network.gpickle')

In [9]:
# Pytorch geometric Data object. For now used only for storing node embedding. 
# Supposed to be used in the future for obtaining node embeddings.
pyg_graph = from_networkx(G)
pyg_graph.to(device)

Data(edge_index=[2, 972], id=[373], lat=[373], lon=[373], labor_force_rate=[373], housing_sales_num_Total:=[373], housing_sales_num_Total:!!Less than $10,000=[373], housing_sales_num_Total:!!$10,000 to $14,999=[373], housing_sales_num_Total:!!$15,000 to $19,999=[373], housing_sales_num_Total:!!$20,000 to $24,999=[373], housing_sales_num_Total:!!$25,000 to $29,999=[373], housing_sales_num_Total:!!$30,000 to $34,999=[373], housing_sales_num_Total:!!$35,000 to $39,999=[373], housing_sales_num_Total:!!$40,000 to $49,999=[373], housing_sales_num_Total:!!$50,000 to $59,999=[373], housing_sales_num_Total:!!$60,000 to $69,999=[373], housing_sales_num_Total:!!$70,000 to $79,999=[373], housing_sales_num_Total:!!$80,000 to $89,999=[373], housing_sales_num_Total:!!$90,000 to $99,999=[373], housing_sales_num_Total:!!$100,000 to $124,999=[373], housing_sales_num_Total:!!$125,000 to $149,999=[373], housing_sales_num_Total:!!$150,000 to $174,999=[373], housing_sales_num_Total:!!$175,000 to $199,999=[3

$$T(a) = \sum_{b \neq a,\space b\subset Train } T(b)w(a, b), \textrm{where summation is calculated for the {\bf k} nearest neighbors.}$$

$$ w(a,b)= \frac{u(a,b)}{\sum_{b \neq a} u(a, b)};$$

$u(a, b) = exp(-\lambda_1 d(a, b));$

So for this model $\lambda_1$ is optimized.

In [10]:
def weight_fn(dists, lamb):
    return torch.exp(-lamb * dists)


class Estimator(KnnEstimator):
    def __init__(self, pyg_graph: pyg.data.Data, obs_nodes, obs_targets) -> None:
        super().__init__(pyg_graph, obs_nodes, obs_targets)

        # self.k = torch.tensor([1.0]).to(device)
        self.lambda_1 = nn.Parameter(torch.rand(1))
        self.lambda_2 = nn.Parameter(torch.rand(1))

    def forward(self, X):
        # getting nearest observed nodes
        X_indices = torch.as_tensor(self.node_to_idx(X))
        dists, indices = self.get_kneighbors(X_indices)
        
        dists = dists.to(device)
        indices = indices.to(device)

        dist_weights = weight_fn(dists, self.lambda_1)

        # sum normalizization
        dist_weights = nn.functional.normalize(dist_weights, p=1)

        att_weights = dist_weights
        targets = self.obs_targets[indices].to(device)

        # interpolation 
        result = torch.sum(att_weights.mul(targets), dim=-1)

        return result

In [11]:
results = dict()

In [13]:
loss_fn = nn.HuberLoss(delta=20).to(device)
model = None

for path in tqdm(glob.glob('datasets/*')[:20]):
    day = path.split('/')[1].split('.')[0]

    try:
        ds = torch.load(path)
    except:
        continue

    train_loader, val_loader, test_loader = make_dataloaders_from_dataset(ds, batch_size=16)
    train_batches = [batch for batch in train_loader]
    train_nodes = [n for batch in train_batches for n in batch[0]]
    train_targets = torch.cat([batch[1] for batch in train_batches])


    if model is None:
        model = Estimator(pyg_graph, train_nodes, train_targets).to(device)
    else:
        model.set_observations(train_nodes, train_targets)
        
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 20, gamma=0.9)

    best_model = train(model, train_loader, val_loader, loss_fn, optimizer, device, num_epochs=10, plotting=False)
    test_loss, test_score = test(best_model, test_loader, loss_fn, device)

    # for name, param in best_model.named_parameters():
        # print(name, param)

    results[f'{day}'] = test_score 
    print(f'{day}, Test loss: {test_loss:.4f}, test score: {test_score:.4f}')

  0%|          | 0/20 [00:00<?, ?it/s]

2021-01-03, Test loss: 233.1693, test score: -1.7048
2021-01-15, Test loss: 810.1702, test score: -0.3000
2021-01-04, Test loss: 603.0308, test score: -2.3722
2021-01-24, Test loss: 253.3502, test score: -1.4160
2020-12-28, Test loss: 531.8792, test score: -2.2469
2021-01-06, Test loss: 583.0620, test score: -2.1916
2021-01-13, Test loss: 568.6688, test score: -2.0083
2021-01-09, Test loss: 370.2094, test score: -1.1074
2021-01-20, Test loss: 360.3887, test score: -0.5112
2021-01-18, Test loss: 215.8270, test score: 0.1417
2021-01-17, Test loss: 165.5557, test score: -0.0859
2021-01-12, Test loss: 268.3489, test score: 0.0378
2020-12-29, Test loss: 280.6115, test score: -0.0101
2021-01-07, Test loss: 283.6414, test score: -0.0244
2020-12-30, Test loss: 275.1746, test score: 0.1108
2021-01-19, Test loss: 326.5425, test score: -0.2836
mean, Test loss: 332.8162, test score: -0.0803
2021-01-02, Test loss: 261.2631, test score: 0.0421
2021-01-05, Test loss: 268.4191, test score: 0.1091
2020

In [154]:
import json

with open('results/baseline.json', mode='w') as f:
    json.dump(results, f, indent=4)