In [95]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import networkx as nx

import torch

from src.utils import haversine, radians
from dataset import DayObservationsDataset

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [96]:
legend_df = pd.read_csv('data/subway_locations_connections.csv')

def parse_string_array_frame(df, column):
    df[column] = df[column].apply(lambda x : [str(i.strip().upper()[1:-1]) for i in x[1:-1].split(',')])

def parse_float_array_frame(df, column):
    df[column] = df[column].apply(lambda x : [float(i.strip()) for i in x[1:-1].split(',')])

parse_string_array_frame(legend_df, 'dest_name')
parse_string_array_frame(legend_df, 'dest_id')
parse_float_array_frame(legend_df, 'dest_lat')
parse_float_array_frame(legend_df, 'dest_long')

origin_cols = [x for x in legend_df.columns if x.startswith('origin')]
dest_cols = [x for x in legend_df.columns if x.startswith('dest')]

legend_df[['origin_lat', 'origin_long', 'dest_lat', 'dest_long']] = \
    legend_df[['origin_lat', 'origin_long', 'dest_lat', 'dest_long']].applymap(lambda x : radians(x))

In [97]:
nodes_df = legend_df[origin_cols]
# nodes_df = nodes_df.rename(columns={'origin_lat': 'lat', 'origin_long': 'lon'})
nodes_set = set(nodes_df['origin_name'].unique())

edges = []
for i, row in legend_df.iterrows():
    for name, id, lat, lon in zip(*(row[col] for col in dest_cols)):
        if name not in nodes_set:
            nodes_df = nodes_df.append(pd.DataFrame([[name, id, lat, lon]], columns=origin_cols), ignore_index=True)
            nodes_set.add(name)

        olon, olat = row['origin_long'], row['origin_lat']
        dist = float(haversine(olon, olat, lon, lat))

        edges.append((row['origin_name'], name, {'dist' : dist}))

edges[:5]
        

[('1 AV', '3 AV', {'dist': 0.4333996170612458}),
 ('1 AV', 'BEDFORD AV', {'dist': 2.579846524429354}),
 ('103 ST', '96 ST', {'dist': 0.6985162154962519}),
 ('103 ST', 'CATHEDRAL PKWY', {'dist': 0.5189875230607629}),
 ('103 ST - CORONA PLAZA', 'JUNCTION BLVD', {'dist': 0.5806388684783266})]

In [98]:
nodes_df = nodes_df.rename(columns=lambda x : x.removeprefix('origin_'))
nodes_df = nodes_df.rename(columns={'long':'lon'})
nodes_df['id'] = nodes_df['name']
nodes_df = nodes_df.set_index('name')
nodes_df

Unnamed: 0_level_0,id,lat,lon
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1 AV,1 AV,0.710889,-1.291223
103 ST,103 ST,0.712085,-1.290992
103 ST - CORONA PLAZA,103 ST - CORONA PLAZA,0.711219,-1.289147
104 ST,104 ST,0.710030,-1.288711
110 ST,110 ST,0.712007,-1.290571
...,...,...,...
WOODSIDE - 61 ST,WOODSIDE - 61 ST,0.711145,-1.289850
WORLD TRADE CENTER,WORLD TRADE CENTER,0.710569,-1.291714
YORK ST,YORK ST,0.710373,-1.291312
ZEREGA AV,ZEREGA AV,0.712731,-1.288874


In [99]:
def make_graph_from_df(nodes_df, edges, name='TLC', directed=True):
    G = nx.Graph(directed=directed)
    G.graph['Name'] = name

    G.add_nodes_from(nodes_df.to_dict('index').items())
    G.add_edges_from(edges)

    return G

In [100]:
G = make_graph_from_df(nodes_df, edges)
G.number_of_nodes(), G.number_of_edges()

(374, 503)

In [101]:
nx.readwrite.write_gpickle(G, 'data/network.gpickle')

In [104]:
mta = pd.read_csv('data/subway_2021_ridership.csv')
mta = mta[mta['flag'] == True]
mta['DATE_time'] = pd.to_datetime(mta['DATE_time'])
mta['DATE'] = mta['DATE_time'].dt.date
mta = mta.groupby(['DATE', 'STATION'], as_index=False)['ridership'].sum()
mta = mta[mta['STATION'].isin(nodes_set)]
mta

Unnamed: 0,DATE,STATION,ridership
0,2020-12-26,1 AV,191.0
1,2020-12-26,103 ST,317.0
3,2020-12-26,104 ST,20.0
4,2020-12-26,110 ST,239.0
5,2020-12-26,111 ST,70.0
...,...,...,...
102330,2021-12-24,WINTHROP ST,390.0
102331,2021-12-24,WOODHAVEN BLVD,1546.0
102332,2021-12-24,WOODLAWN,488.0
102334,2021-12-24,YORK ST,989.0


In [105]:
for day in mta['DATE'].unique()[:30]:
    ds = DayObservationsDataset.from_dataframe_by_day(mta, day)
    torch.save(ds, f'datasets/{day}.dat')