In [1]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split
from node2vec import Node2Vec

In [2]:
from math import radians, cos, sin, asin, sqrt

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a))
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r

In [3]:
nodes_df = pd.read_csv('data/road_intersection_nodes.csv')
nodes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 236258 entries, 0 to 236257
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   lng     236258 non-null  float64
 1   lat     236258 non-null  float64
 2   id      236258 non-null  int64  
dtypes: float64(2), int64(1)
memory usage: 5.4 MB


In [4]:
lat_center, lng_center = nodes_df.lat.mean(), nodes_df.lng.mean()

In [6]:
G = nx.DiGraph()
G.graph['Name'] = 'TLC'

RADIUS = 5

for _, row in nodes_df.iterrows():
    if haversine(lng_center, lat_center, row.lng, row.lat) > RADIUS:
       continue
    G.add_node(int(row.id), lng=row.lng, lat=row.lat)

G.number_of_nodes()

21009

In [7]:
observed_nodes = list(G.nodes())
obs_nodes_set = set(observed_nodes)

In [8]:
edges_df = pd.read_csv('data/road_intersection_edges.csv')
edges_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282983 entries, 0 to 282982
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   olng    282983 non-null  float64
 1   olat    282983 non-null  float64
 2   dlng    282983 non-null  float64
 3   dlat    282983 non-null  float64
 4   oid     282983 non-null  int64  
 5   did     282983 non-null  int64  
dtypes: float64(4), int64(2)
memory usage: 13.0 MB


In [9]:
for _, row in edges_df.iterrows():
    if row.oid not in obs_nodes_set or row.did not in obs_nodes_set:
        continue
    dist = haversine(row.olng, row.olat, row.dlng, row.dlat)
    G.add_edge(int(row.oid), int(row.did), weight=dist)

In [10]:
G.number_of_nodes(), G.number_of_edges()

(21009, 26289)

In [11]:
nx.readwrite.gpickle.write_gpickle(G, 'data/graph.data')

In [None]:
embedder = Node2Vec(G, dimensions=64, num_walks=50)

Computing transition probabilities:   0%|          | 0/21009 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|                                                                 | 0/50 [00:00<?, ?it/s][A
Generating walks (CPU: 1):   4%|██▎                                                      | 2/50 [00:03<01:33,  1.94s/it][A
Generating walks (CPU: 1):   6%|███▍                                                     | 3/50 [00:07<02:01,  2.58s/it][AERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



In [None]:
model = embedder.fit(window=10, min_count=1)

In [148]:
for idx, e in enumerate(G.edges()):
    if idx == 5:
        break
    print(e)

(3355, 3356.0)
(3356, 288609.0)
(3356, 325958.0)
(3359, 3360.0)
(3360, 3361.0)


In [151]:
model.wv.most_similar(3356)

[('157871.0', 0.9605061411857605),
 ('109756.0', 0.9509170055389404),
 ('109752.0', 0.9340439438819885),
 ('109758.0', 0.9101048111915588),
 ('128273.0', 0.9081839919090271),
 ('117287.0', 0.9080386757850647),
 ('109751.0', 0.8969016075134277),
 ('157870.0', 0.8786019682884216),
 ('128273', 0.865290641784668),
 ('109760.0', 0.8585328459739685)]

In [52]:
np.savetxt('data/embedding.csv', embedding, delimiter=',')

In [56]:
# train-val-test split of intersection nodes

np.random.seed(2002)

nodes_train, nodes_test = train_test_split(nodes_df.id.unique(), test_size=0.3)

nodes_train, nodes_val = train_test_split(nodes_train, test_size=0.3)
print(len(nodes_train), len(nodes_val), len(nodes_test))

115766 49614 70878


In [57]:
import torch
import torch.nn as nn

In [64]:
np.dot(embedding[0], embedding[1])

-1.2539645986258102e-37