In [1]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import networkx as nx

import torch

from src.utils import haversine, radians
from dataset import DayObservationsDataset

%load_ext autoreload
%autoreload 2

In [2]:
nodes_df = pd.read_csv('data/road_intersection_nodes.csv')
nodes_df = nodes_df.rename(columns={'lng': 'lon'})

In [3]:
# convert decimals to radiands
nodes_df[['lon', 'lat']] = nodes_df[['lon', 'lat']].apply(radians)
nodes_df.head()

Unnamed: 0,lon,lat,id
0,-1.291857,0.710457,0
1,-1.291856,0.71046,1
2,-1.291854,0.710463,2
3,-1.291852,0.710466,3
4,-1.29185,0.710468,4


In [4]:
# computing center of region
lat_center, lon_center = nodes_df.lat.mean(), nodes_df.lon.mean()
lat_center, lon_center

(0.7106174200098879, -1.290224335366838)

Due computational complexity, I consider only region with a radius of RADIUS around (lat_center, lng_center).

In [5]:
RADIUS = 8

considered_nodes_df = nodes_df[haversine(lon_center, lat_center, nodes_df.lon, nodes_df.lat) <= RADIUS]
considered_nodes_set = set(considered_nodes_df.id.unique())

In [6]:
edges_df = pd.read_csv('data/road_intersection_edges.csv')
edges_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282983 entries, 0 to 282982
Data columns (total 6 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   olng    282983 non-null  float64
 1   olat    282983 non-null  float64
 2   dlng    282983 non-null  float64
 3   dlat    282983 non-null  float64
 4   oid     282983 non-null  int64  
 5   did     282983 non-null  int64  
dtypes: float64(4), int64(2)
memory usage: 13.0 MB


In [7]:
considered_edges_df = edges_df[edges_df.oid.isin(considered_nodes_set) & edges_df.did.isin(considered_nodes_set)]
# computing edges weights in km
considered_edges_df['dist'] = considered_edges_df.apply(lambda x: haversine(x['olng'], x['olat'], x['dlng'], x['dlat']), axis=1).astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  considered_edges_df['dist'] = considered_edges_df.apply(lambda x: haversine(x['olng'], x['olat'], x['dlng'], x['dlat']), axis=1).astype(float)


In [8]:
def make_graph_from_df(nodes_df, edges_df, name='TLC', directed=False):
    G = nx.Graph(directed=directed)
    G.graph['Name'] = name

    G.add_nodes_from(nodes_df.set_index('id').to_dict('index').items())
    G.add_nodes_from((n, {'id': str(n)}) for n in G.nodes())

    G.add_edges_from(nx.from_pandas_edgelist(edges_df, 'oid', 'did', ['dist']).edges(data=True))

    return G

In [9]:
G = make_graph_from_df(considered_nodes_df, considered_edges_df)
G.number_of_nodes(), G.number_of_edges()

(60789, 75647)

In [10]:
nx.readwrite.write_gpickle(G, 'data/network.gpickle')

In [11]:
pickups_df = pd.read_csv('data/TLC_daily.csv')
pickups_df['day'] = pickups_df['day'].astype(int)
pickups_df = pickups_df[pickups_df['id'].isin(considered_nodes_set)]
pickups_df

Unnamed: 0,day,id,pickups
0,1,0,19.0
1,1,1,18.0
2,1,2,18.0
3,1,3,17.0
4,1,4,13.0
...,...,...,...
35911210,152,2414253,0.0
35911211,152,2414254,0.0
35911212,152,2414267,0.0
35911213,152,2414268,0.0


In [12]:
for day in pickups_df.day.unique()[:30]:
    ds = DayObservationsDataset.from_dataframe_by_day(pickups_df, day)
    torch.save(ds, f'datasets/day_{day}.dat')