In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split
from node2vec import Node2Vec
import torch
import torch.nn as nn

In [2]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    # lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r

In [3]:
nodes_df = pd.read_csv('data/road_intersection_nodes.csv')
nodes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236258 entries, 0 to 236257
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   lng     236258 non-null  float64
 1   lat     236258 non-null  float64
 2   id      236258 non-null  int64  
dtypes: float64(2), int64(1)
memory usage: 5.4 MB


In [4]:
nodes_df[['lng', 'lat']] = nodes_df[['lng', 'lat']].apply(np.vectorize(radians))
nodes_df

Unnamed: 0,lng,lat,id
0,-1.291857,0.710457,0
1,-1.291856,0.710460,1
2,-1.291854,0.710463,2
3,-1.291852,0.710466,3
4,-1.291850,0.710468,4
...,...,...,...
236253,-1.289822,0.709714,2414372
236254,-1.290694,0.711094,2414375
236255,-1.290695,0.711093,2414376
236256,-1.290695,0.711091,2414377


In [5]:
lat_center, lng_center = nodes_df.lat.mean(), nodes_df.lng.mean()

In [6]:
G = nx.Graph()
G.graph['Name'] = 'TLC'

RADIUS = 4
node_to_idx = {}

for idx, (_, row) in enumerate(nodes_df.iterrows()):
    if haversine(lng_center, lat_center, row.lng, row.lat) > RADIUS:
       continue
    G.add_node(int(row.id), lng=row.lng, lat=row.lat)
    node_to_idx[int(row.id)] = idx

G.number_of_nodes(), G.number_of_edges()

(11704, 0)

In [7]:
observed_nodes = list(G.nodes())
obs_nodes_set = set(observed_nodes)

In [8]:
pickups_df = pd.read_csv('data/TLC_daily.csv')
pickups_df = pickups_df[pickups_df['id'].isin(obs_nodes_set)]
pickups_df

Unnamed: 0,day,id,pickups
5902,1.0,19416,3.0
5903,1.0,19417,2.0
5904,1.0,19418,2.0
5905,1.0,19420,2.0
5906,1.0,19422,4.0
...,...,...,...
35910369,152.0,401970,0.0
35910370,152.0,401973,0.0
35910371,152.0,401974,0.0
35910491,152.0,872198,0.0


In [9]:
edges_df = pd.read_csv('data/road_intersection_edges.csv')
edges_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282983 entries, 0 to 282982
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   olng    282983 non-null  float64
 1   olat    282983 non-null  float64
 2   dlng    282983 non-null  float64
 3   dlat    282983 non-null  float64
 4   oid     282983 non-null  int64  
 5   did     282983 non-null  int64  
dtypes: float64(4), int64(2)
memory usage: 13.0 MB


In [10]:
for _, row in edges_df.iterrows():
    if row.oid not in obs_nodes_set or row.did not in obs_nodes_set:
        continue

    dist = haversine(row.olng, row.olat, row.dlng, row.dlat)

    G.add_edge(int(row.oid), int(row.did), weight=dist)
    
G.number_of_nodes(), G.number_of_edges()

In [11]:
embedder = Node2Vec(G, dimensions=128, walk_length=30, workers=4)

Computing transition probabilities: 100%|██████████| 11704/11704 [00:00<00:00, 24554.48it/s]
Generating walks (CPU: 1): 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]
Generating walks (CPU: 3): 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]
Generating walks (CPU: 2): 100%|██████████| 3/3 [00:03<00:00,  1.20s/it]
Generating walks (CPU: 4): 100%|██████████| 2/2 [00:02<00:00,  1.24s/it]


In [12]:
model = embedder.fit(window=5, min_count=1)

In [13]:
for idx, e in enumerate(G.edges()):
    if idx == 5:
        break
    print(e)

(19416, 19417)
(19416, 195466)
(19416, 195451)
(19416, 388769)
(19417, 19418)


In [None]:
# model.wv.most_similar('19459')

In [None]:
# embedding = nn.Embedding(G.number_of_nodes(), 128)
# embedding.weight.requires_grad = False

for idx, node in enumerate(G.nodes()):
    G.add_node(int(node), x=model.wv[str(node)].copy())

In [80]:
from torch.utils.data import Dataset


class DayObservationsDataset(Dataset):
    def __init__(self, df: pd.DataFrame) -> None:
        super().__init__()
        self.data = df['id'].to_numpy()
        self.targets = df['pickups'].to_numpy()
        self.observed_nodes = set(np.unique(self.data))

        self.node_to_target = df.set_index('id')['pickups'].to_dict()
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # returns node, and its observation
        return self.data[idx], self.targets[idx]

    def get_observation_by_node(self, node):
        return self.node_to_target[node]

    def get_observed_nodes(self):
        return self.observed_nodes


def get_dataset_by_day(pickups_df, day):
    df = pickups_df[pickups_df['day'].astype('int') == day].copy()
    df.drop('day', axis=1, inplace=True)
    return DayObservationsDataset(df)


ds = get_dataset_by_day(pickups_df, 16)
next(iter(ds))

(19422, 3.0)

In [81]:
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler, Sampler

np.random.seed(228)

indices = list(range(len(ds)))

train_indices, test_indices = train_test_split(indices, test_size=0.15)
train_indices, val_indices = train_test_split(train_indices, test_size=0.15)
print(len(train_indices), len(val_indices), len(test_indices))

train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)
test_sampler = SubsetRandomSampler(test_indices)


train_loader = DataLoader(ds, batch_size=128, sampler=train_sampler)
val_loader = DataLoader(ds, batch_size=len(val_sampler.indices), sampler=val_sampler)
test_loader = DataLoader(ds, batch_size=len(test_sampler.indices), sampler=test_sampler)

8455 1493 1756


In [82]:
next(iter(train_loader))

[tensor([230298, 267138, 177663, 317350, 116113, 117764, 225551, 210600, 117121,
         246571, 128300, 117979, 191861, 285700, 286930,  67883, 234779,  67209,
          30528, 114183,  73297, 317347,  80475, 284193, 234476,  84171, 392680,
          30347,  19443, 172562, 120175,  73275, 215398,  27219,  63138, 277655,
         267322,  22964, 321757,  20802,  30531, 369655, 112939,  98501, 223953,
         163294, 173486, 119091,  73290,  79762,  32910, 288279, 116502, 370877,
         378871, 115907,  37899,  24399, 284142, 298992, 209166, 279975, 288802,
         109950, 298981, 174319, 321847,  26910, 319063,  80401, 304914, 397221,
         267329, 319404,  84234, 116551,  37898, 117742, 277287, 373420, 188952,
         117731, 277670, 298978, 378915, 191558, 223425,  67881, 371019,  30555,
          26478,  80353, 285800,  26730,  67754, 113520,  23317, 286900, 267372,
         370856, 109778, 200288,  70447, 387925, 191854, 286859,  67561,  82104,
         111206, 320341, 266

In [83]:
from torch_geometric.utils.convert import from_networkx

pyg_graph = from_networkx(G)
pyg_graph

Data(edge_index=[2, 30042], lng=[11704], lat=[11704], weight=[30042], num_nodes=11704)

In [84]:
nodes_df.iloc[np.vectorize(node_to_idx.get)(ds.data)]

Unnamed: 0,lng,lat,id
12256,-1.290688,0.711084,19422
12257,-1.290683,0.711083,19424
12258,-1.290667,0.711080,19426
12259,-1.290711,0.711088,19427
12260,-1.290702,0.711087,19428
...,...,...,...
235074,-1.289575,0.710608,401970
235075,-1.289582,0.710618,401973
235076,-1.289581,0.710617,401974
235231,-1.289707,0.710787,872198


In [85]:
nodes_df.iloc[np.vectorize(node_to_idx.get)(ds.data)]

Unnamed: 0,lng,lat,id
12256,-1.290688,0.711084,19422
12257,-1.290683,0.711083,19424
12258,-1.290667,0.711080,19426
12259,-1.290711,0.711088,19427
12260,-1.290702,0.711087,19428
...,...,...,...
235074,-1.289575,0.710608,401970
235075,-1.289582,0.710618,401973
235076,-1.289581,0.710617,401974
235231,-1.289707,0.710787,872198


In [86]:
ds[train_indices]

(array([ 26915, 322526, 378895, ..., 237212, 278082, 168995]),
 array([1., 0., 0., ..., 0., 0., 1.]))

In [87]:
from sklearn.neighbors import NearestNeighbors

def weight_fn(dists, lamb):
    return torch.exp(-lamb * dists)


class Predictor(nn.Module):
    def __init__(self, pyg_graph, nodes_df, observs) -> None:
        super().__init__()

        self.g = pyg_graph
        self.nodes_df = nodes_df
        self.obs_nodes = observs[0]
        self.obs_targets = observs[1]

        self.neighbors = NearestNeighbors(n_neighbors=10, metric='haversine')
        self.obs_nodes_locs = self.nodes_df.iloc[np.vectorize(node_to_idx.get)(self.obs_nodes)]
        self.neighbors.fit(self.obs_nodes_locs[['lat', 'lng']].values)

        self.k = nn.Parameter(torch.rand(1))
        self.lambda_1 = nn.Parameter(torch.rand(1))
        self.lambda_2 = nn.Parameter(torch.rand(1))
    
    def get_nearest_observations(self, X):
        # print(X)
        dists, indices = self.neighbors.kneighbors(nodes_df.iloc[np.vectorize(node_to_idx.get)(X)][['lat', 'lng']].values)
        dists = dists * 6371 * 1000
        # so for now dist in meters
        if self.training:
            dists, indices = dists[:, 1:], indices[:, 1:]
        # get_observations = np.vectorize(self.obs_ds.get_observation_by_node)
        # observations = get_observations(self.obs_nodes_locs['id'].values[indices])
        observations = self.obs_targets[indices]
        
        return torch.as_tensor(dists), torch.as_tensor(observations)

    def forward(self, X):
        dists, observations = self.get_nearest_observations(X)

        weights = weight_fn(dists, self.lambda_1)
        # sum normalizization
        weights = nn.functional.normalize(weights, p=1)
        result = torch.sum(weights.mul(observations), dim=-1)

        return result

model = Predictor(pyg_graph, nodes_df, ds[train_indices])
kek = next(iter(test_loader))
model(kek[0])[:20], kek[1][:20] 

(tensor([0.0000e+00, 0.0000e+00, 0.0000e+00, 3.8782e+01, 0.0000e+00, 3.0000e+00,
         0.0000e+00, 0.0000e+00, 9.9103e-01, 0.0000e+00, 8.7961e-06, 2.6969e+01,
         1.8170e-11, 0.0000e+00, 2.5100e+00, 0.0000e+00, 1.8546e-04, 8.0076e-13,
         2.0000e+00, 1.0072e-06], dtype=torch.float64, grad_fn=<SliceBackward0>),
 tensor([ 0.,  0.,  0., 43.,  0.,  2.,  0.,  0.,  1.,  0.,  0., 28.,  0.,  0.,
         13.,  0.,  0.,  0.,  6.,  0.], dtype=torch.float64))

In [88]:
from sklearn.metrics import r2_score

loss_fn = nn.HuberLoss(delta=100)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 20, gamma=0.5)


print(list(model.parameters()))

def calc_score(pred, actual):
    return r2_score(actual, pred)

def test(model, loader):
    model.eval()

    scores = []
    total_loss = 0

    with torch.no_grad():
        for (X, y) in loader:
            out = model(X)
            scores.append(calc_score(out, y))
            loss = loss_fn(out, y)
            total_loss += loss.item()
    
    return total_loss / len(loader), np.mean(scores)


        

def train(model, train_loader, val_loader, loss_fn, optimizer, scheduler=None, num_epochs=200):
    
    losses = []
    test_scores = []
    

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for i_step, (X, y) in enumerate(train_loader):
            optimizer.zero_grad()
            out = model(X)
            loss = loss_fn(out, y)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        total_loss /= len(train_loader)
        losses.append(total_loss)

        if scheduler is not None:
            scheduler.step()

        if epoch % 10 == 0:
            val_loss, score = test(model, val_loader)
            test_scores.append(score)
            print(f'Epoch {epoch}, Loss: {losses[-1]:.4f}, Val loss: {val_loss:.4f}, Val R2: {test_scores[-1]:.4f}')
            print(model.lambda_1)

train(model, train_loader, val_loader, loss_fn, optimizer, scheduler)
print(list(model.parameters()))
print(test(model, test_loader))

[Parameter containing:
tensor([0.3132], requires_grad=True)]
Epoch 0, Loss: 2.9754, Val loss: 4.8837, Val R2: 0.7859
Parameter containing:
tensor([0.2339], requires_grad=True)
Epoch 10, Loss: 2.7967, Val loss: 4.2428, Val R2: 0.8140
Parameter containing:
tensor([0.0666], requires_grad=True)
Epoch 20, Loss: 2.8086, Val loss: 4.3267, Val R2: 0.8103
Parameter containing:
tensor([0.1019], requires_grad=True)
Epoch 30, Loss: 2.8600, Val loss: 4.2431, Val R2: 0.8140
Parameter containing:
tensor([0.0661], requires_grad=True)
Epoch 40, Loss: 2.7965, Val loss: 4.3020, Val R2: 0.8114
Parameter containing:
tensor([0.0953], requires_grad=True)
Epoch 50, Loss: 2.7817, Val loss: 4.2490, Val R2: 0.8137
Parameter containing:
tensor([0.0761], requires_grad=True)
Epoch 60, Loss: 2.7651, Val loss: 4.2648, Val R2: 0.8130
Parameter containing:
tensor([0.0572], requires_grad=True)
Epoch 70, Loss: 2.7616, Val loss: 4.3336, Val R2: 0.8100
Parameter containing:
tensor([0.0485], requires_grad=True)
Epoch 80, Lo

In [None]:
import torch
import torch.nn as nn

In [None]:
np.dot(embedding[0], embedding[1])