In [2]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split
from node2vec import Node2Vec
import torch
import torch.nn as nn

In [3]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    # lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r

In [4]:
nodes_df = pd.read_csv('data/road_intersection_nodes.csv')
nodes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236258 entries, 0 to 236257
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   lng     236258 non-null  float64
 1   lat     236258 non-null  float64
 2   id      236258 non-null  int64  
dtypes: float64(2), int64(1)
memory usage: 5.4 MB


In [5]:
nodes_df[['lng', 'lat']] = nodes_df[['lng', 'lat']].apply(np.vectorize(radians))
nodes_df

Unnamed: 0,lng,lat,id
0,-1.291857,0.710457,0
1,-1.291856,0.710460,1
2,-1.291854,0.710463,2
3,-1.291852,0.710466,3
4,-1.291850,0.710468,4
...,...,...,...
236253,-1.289822,0.709714,2414372
236254,-1.290694,0.711094,2414375
236255,-1.290695,0.711093,2414376
236256,-1.290695,0.711091,2414377


In [6]:
lat_center, lng_center = nodes_df.lat.mean(), nodes_df.lng.mean()

In [7]:
G = nx.Graph()
G.graph['Name'] = 'TLC'

RADIUS = 4
node_to_idx = {}

for idx, (_, row) in enumerate(nodes_df.iterrows()):
    if haversine(lng_center, lat_center, row.lng, row.lat) > RADIUS:
       continue
    G.add_node(int(row.id), id=int(row.id) ,lng=row.lng, lat=row.lat)
    node_to_idx[int(row.id)] = idx

G.number_of_nodes(), G.number_of_edges()

(11704, 0)

In [8]:
observed_nodes = list(G.nodes())
obs_nodes_set = set(observed_nodes)

In [9]:
pickups_df = pd.read_csv('data/TLC_daily.csv')
pickups_df = pickups_df[pickups_df['id'].isin(obs_nodes_set)]
pickups_df

Unnamed: 0,day,id,pickups
5902,1.0,19416,3.0
5903,1.0,19417,2.0
5904,1.0,19418,2.0
5905,1.0,19420,2.0
5906,1.0,19422,4.0
...,...,...,...
35910369,152.0,401970,0.0
35910370,152.0,401973,0.0
35910371,152.0,401974,0.0
35910491,152.0,872198,0.0


In [10]:
edges_df = pd.read_csv('data/road_intersection_edges.csv')
edges_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282983 entries, 0 to 282982
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   olng    282983 non-null  float64
 1   olat    282983 non-null  float64
 2   dlng    282983 non-null  float64
 3   dlat    282983 non-null  float64
 4   oid     282983 non-null  int64  
 5   did     282983 non-null  int64  
dtypes: float64(4), int64(2)
memory usage: 13.0 MB


In [11]:
for _, row in edges_df.iterrows():
    if row.oid not in obs_nodes_set or row.did not in obs_nodes_set:
        continue

    dist = haversine(row.olng, row.olat, row.dlng, row.dlat)

    G.add_edge(int(row.oid), int(row.did), weight=dist)
    
G.number_of_nodes(), G.number_of_edges()

(11704, 15021)

In [12]:
embedder = Node2Vec(G, dimensions=128, walk_length=30, workers=4)

Computing transition probabilities: 100%|██████████| 11704/11704 [00:00<00:00, 23325.06it/s]
Generating walks (CPU: 1): 100%|██████████| 3/3 [00:03<00:00,  1.19s/it]
Generating walks (CPU: 3): 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]
Generating walks (CPU: 2): 100%|██████████| 3/3 [00:03<00:00,  1.22s/it]
Generating walks (CPU: 4): 100%|██████████| 2/2 [00:02<00:00,  1.22s/it]


In [13]:
model = embedder.fit(window=5, min_count=1)

In [14]:
for idx, e in enumerate(G.edges()):
    if idx == 5:
        break
    print(e)

(19416, 19417)
(19416, 195466)
(19416, 195451)
(19416, 388769)
(19417, 19418)


In [15]:
# model.wv.most_similar('19459')

In [16]:
# embedding = nn.Embedding(G.number_of_nodes(), 128)
# embedding.weight.requires_grad = False

for idx, node in enumerate(G.nodes()):
    G.add_node(int(node), x=model.wv[str(node)].copy())

In [135]:
from torch.utils.data import Dataset


class DayObservationsDataset(Dataset):
    def __init__(self, df: pd.DataFrame) -> None:
        super().__init__()
        self.data = df['id'].to_numpy()
        self.targets = df['pickups'].to_numpy()
        self.observed_nodes = set(np.unique(self.data))

        self.node_to_target = df.set_index('id')['pickups'].to_dict()
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # returns node, and its observation
        return self.data[idx], self.targets[idx]

    def get_observation_by_node(self, node):
        return self.node_to_target[node]

    def get_observed_nodes(self):
        return self.observed_nodes


def get_dataset_by_day(pickups_df, day):
    df = pickups_df[pickups_df['day'].astype('int') == day].copy()
    df.drop('day', axis=1, inplace=True)
    return DayObservationsDataset(df)


ds = get_dataset_by_day(pickups_df, 12)
len(ds)

11704

In [136]:
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler, Sampler

np.random.seed(228)

indices = list(range(len(ds)))

train_indices, test_indices = train_test_split(indices, test_size=0.3)
train_indices, val_indices = train_test_split(train_indices, test_size=0.3)
print(len(train_indices), len(val_indices), len(test_indices))

train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)
test_sampler = SubsetRandomSampler(test_indices)


train_loader = DataLoader(ds, batch_size=128, sampler=train_sampler)
val_loader = DataLoader(ds, batch_size=len(val_sampler.indices), sampler=val_sampler)
test_loader = DataLoader(ds, batch_size=len(test_sampler.indices), sampler=test_sampler)

5734 2458 3512


In [137]:
from torch_geometric.utils.convert import from_networkx

pyg_graph = from_networkx(G)
pyg_graph

Data(x=[11704, 128], edge_index=[2, 30042], id=[11704], lng=[11704], lat=[11704], weight=[30042])

In [140]:
from sklearn.neighbors import NearestNeighbors

def weight_fn(dists, lamb):
    return torch.exp(-lamb * dists)


class Predictor(nn.Module):
    def __init__(self, pyg_graph, nodes_df, observs) -> None:
        super().__init__()

        self.g = pyg_graph
        self.nodes_df = nodes_df
        self.obs_nodes = observs[0]
        self.obs_targets = observs[1]
        self.node_to_gidx = dict(zip(self.g.id.numpy(), range(len(self.g.id))))

        self.neighbors = NearestNeighbors(n_neighbors=15, metric='haversine')
        self.obs_nodes_locs = self.nodes_df.iloc[np.vectorize(node_to_idx.get)(self.obs_nodes)]
        self.neighbors.fit(self.obs_nodes_locs[['lat', 'lng']].values)

        self.k = nn.Parameter(torch.rand(1))
        # self.k = torch.tensor([1.0])
        self.lambda_1 = nn.Parameter(torch.rand(1))
        self.lambda_2 = nn.Parameter(torch.rand(1))

    def forward(self, X):
        # getting nearest observed nodes
        dists, indices = self.neighbors.kneighbors(nodes_df.iloc[np.vectorize(node_to_idx.get)(X)][['lat', 'lng']].values)
        dists = dists * 6371 * 1000

        # so for now dist in meters
        if self.training:
            dists, indices = dists[:, 1:], indices[:, 1:]
        # get_observations = np.vectorize(self.obs_ds.get_observation_by_node)
        # observations = get_observations(self.obs_nodes_locs['id'].values[indices])
        observations = self.obs_targets[indices]

        dists, observations = torch.as_tensor(dists), torch.as_tensor(observations)

        neighbors_indices = np.vectorize(self.node_to_gidx.get)(self.obs_nodes[indices])
        neighbors_embeds = self.g.x[neighbors_indices.reshape(-1)].reshape(*neighbors_indices.shape, -1)

        X_embeds = self.g.x[np.vectorize(self.node_to_gidx.get)(X)]
        similarities = nn.functional.cosine_similarity(X_embeds[:, None], neighbors_embeds, dim=2)
        # similarities = torch.sum(X_embeds[:, None] * neighbors_embeds, dim=-1)
        # print(similarities)

        dist_weights = weight_fn(dists, self.lambda_1)
        simi_weights = weight_fn(similarities, self.lambda_2)
        # sum normalizization
        dist_weights = nn.functional.normalize(dist_weights, p=1)
        simi_weights = nn.functional.normalize(simi_weights, p=1)

        f = self.k * dist_weights + (1 - self.k) * simi_weights

        result = torch.sum(f.mul(observations), dim=-1)

        return result

predictor = Predictor(pyg_graph, nodes_df, ds[train_indices])
kek = next(iter(train_loader))
predictor(kek[0])[:20], kek[1][:20] 

(tensor([0.0000e+00, 7.0563e-02, 1.0056e+00, 4.1295e-02, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 3.9692e-02, 1.4919e+01, 1.3713e-02, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 3.7000e-01, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         9.8177e-02, 0.0000e+00], dtype=torch.float64, grad_fn=<SliceBackward0>),
 tensor([ 0.,  0.,  2.,  0.,  0.,  0.,  0.,  0., 13.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  0.,  0.,  0.], dtype=torch.float64))

In [141]:
from sklearn.metrics import r2_score

loss_fn = nn.HuberLoss(delta=100)
optimizer = torch.optim.Adam(predictor.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 10, gamma=0.7)


def calc_score(pred, actual):
    return r2_score(actual, pred)

def test(model, loader):
    model.eval()

    scores = []
    total_loss = 0

    with torch.no_grad():
        for (X, y) in loader:
            out = model(X)
            scores.append(calc_score(out, y))
            loss = loss_fn(out, y)
            total_loss += loss.item()
    
    return total_loss / len(loader), np.mean(scores)


def train(model, train_loader, val_loader, loss_fn, optimizer, scheduler=None, num_epochs=100):
    losses = []
    test_scores = []
    

    for epoch in range(num_epochs + 1):
        model.train()
        total_loss = 0

        for i_step, (X, y) in enumerate(train_loader):
            optimizer.zero_grad()
            out = model(X)
            loss = loss_fn(out, y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        total_loss /= len(train_loader)
        losses.append(total_loss)

        if scheduler is not None:
            scheduler.step()

        if epoch % 5 == 0:
            val_loss, score = test(model, val_loader)
            test_scores.append(score)
            print(f'Epoch {epoch}, Loss: {losses[-1]:.4f}, Val loss: {val_loss:.4f}, Val R2: {test_scores[-1]:.4f}')

train(predictor, train_loader, val_loader, loss_fn, optimizer, scheduler)
for name, param in predictor.named_parameters():
    print(name, param)

print(test(predictor, test_loader))

Epoch 0, Loss: 1.4960, Val loss: 0.9012, Val R2: 0.8655
Epoch 5, Loss: 1.0314, Val loss: 1.0785, Val R2: 0.8391
Epoch 10, Loss: 1.0354, Val loss: 1.0779, Val R2: 0.8392
Epoch 15, Loss: 1.0267, Val loss: 1.0794, Val R2: 0.8389
Epoch 20, Loss: 1.0253, Val loss: 1.0768, Val R2: 0.8393
Epoch 25, Loss: 1.0249, Val loss: 1.0701, Val R2: 0.8403
Epoch 30, Loss: 1.0115, Val loss: 1.0663, Val R2: 0.8409
Epoch 35, Loss: 1.0155, Val loss: 1.0647, Val R2: 0.8411
Epoch 40, Loss: 1.0185, Val loss: 1.0638, Val R2: 0.8413
Epoch 45, Loss: 1.0154, Val loss: 1.0635, Val R2: 0.8413
Epoch 50, Loss: 1.0086, Val loss: 1.0633, Val R2: 0.8413
Epoch 55, Loss: 1.0246, Val loss: 1.0635, Val R2: 0.8413
Epoch 60, Loss: 1.0064, Val loss: 1.0629, Val R2: 0.8414
Epoch 65, Loss: 1.0140, Val loss: 1.0624, Val R2: 0.8415
Epoch 70, Loss: 1.0130, Val loss: 1.0627, Val R2: 0.8414
Epoch 75, Loss: 1.0052, Val loss: 1.0625, Val R2: 0.8415
Epoch 80, Loss: 1.0198, Val loss: 1.0621, Val R2: 0.8415
Epoch 85, Loss: 1.0042, Val loss: