# JSON to Temporal Graph

In [1]:
import json
import pickle
from datetime import datetime, timedelta

import numpy as np
import torch

In [2]:
path = "../data/train_delay.json"
with open(path) as file:
    data = json.load(file)

path_info = "../data/train_info.json"
with open(path_info) as file:
    info = json.load(file)

## Static graph temporal data
Reference used: [WikiMathsDatasetLoader](https://pytorch-geometric-temporal.readthedocs.io/en/latest/_modules/torch_geometric_temporal/dataset/wikimath.html#WikiMathsDatasetLoader)

### Map stations to ids

In [3]:
trains = []
station_ids = {}
# set id to 0
id = 0
# iterate over the trains
for train, train_data in data.items():
    trains.append(train)
    # take the first date entry. The stations are the entry's keys
    stations = list(train_data.values())[0].keys()
    # iterate over the stations
    for station in stations:
        # if station isn't in station id keys, save it with value as current id
        if station not in station_ids:
            station_ids[station] = id
            # increment the id
            id += 1

In [4]:
print(f"#trains: {len(trains)}")
print(f"#stations: {len(station_ids)}")

#trains: 89
#stations: 984


### Create the static edge index and compute the edge weights

In [5]:
# create a list to hold the edges
edge_list = []
num_trains = []
# distance_list = []
# iterate over the trains
for train, train_data in data.items():
    # take the first date entry.
    stations = list(list(train_data.values())[0].keys())
    # iterate upto the second last station
    for i in range(len(stations) - 1):
        # add (station, next station) to the edge index
        src_code = stations[i]
        dst_code = stations[i + 1]

        src = station_ids[stations[i]]
        dst = station_ids[stations[i + 1]]

        # src_dist = int(info[train]["stationDict"][src_code]["distance"])
        # dst_dist = int(info[train]["stationDict"][dst_code]["distance"])
        # distance_list.append(dst_dist - src_dist)

        try:
            idx = edge_list.index((src, dst))
            num_trains[idx] += 1
        except ValueError:
            edge_list.append((src, dst))
            num_trains.append(1)

# Convert it to an array of shape (2, #edges)
edge_index = np.array(edge_list).T
num_trains_array = np.array(num_trains)

del edge_list, num_trains

In [6]:
"""
# maintain a list for [(src, dst, dist), ...]
distance_dict = {}
# itearte over the trains in info
for train in info:
    # if we don't have the delay data for this train, pass.
    if train["trainNumber"] not in trains:
        continue
    # iterate over the stations in the station list
    # starting from the second entry upto the last
    for i in range(1, len(train["stationList"])):
        print(train["trainNumber"], train[""])
        prev_station = station_ids[train["stationList"][i - 1]["stationCode"]]
        curr_station = station_ids[train["stationList"][i]["stationCode"]]
        prev_dist = int(train["stationList"][i - 1]["distance"])
        curr_dist = int(train["stationList"][i]["distance"])
        # Add current_distance - prev_distance for edge
        # (prev station, current station) to the dictionary.
        # if the edge already exists, take the min of the two.
        dist = curr_dist - prev_dist
        try:
            distance_dict[(prev_station, curr_station)] = min(dist, distance_dict[(prev_station, curr_station)])
        except KeyError:
            distance_dict[(prev_station, curr_station)] = dist

# convert the list to a numpy array of shape (#edges, dist) sorted by src node.
distance_list = [(k[0], k[1], d) for k, d in distance_dict.items()]
distance_list = sorted(distance_list, key=lambda t: t[0])
distance_array = np.array([i[2] for i in distance_list])

del distance_dict
"""

'\n# maintain a list for [(src, dst, dist), ...]\ndistance_dict = {}\n# itearte over the trains in info\nfor train in info:\n    # if we don\'t have the delay data for this train, pass.\n    if train["trainNumber"] not in trains:\n        continue\n    # iterate over the stations in the station list\n    # starting from the second entry upto the last\n    for i in range(1, len(train["stationList"])):\n        print(train["trainNumber"], train[""])\n        prev_station = station_ids[train["stationList"][i - 1]["stationCode"]]\n        curr_station = station_ids[train["stationList"][i]["stationCode"]]\n        prev_dist = int(train["stationList"][i - 1]["distance"])\n        curr_dist = int(train["stationList"][i]["distance"])\n        # Add current_distance - prev_distance for edge\n        # (prev station, current station) to the dictionary.\n        # if the edge already exists, take the min of the two.\n        dist = curr_dist - prev_dist\n        try:\n            distance_dict[(p

In [7]:
edge_weights = num_trains_array
# edge_weights = distance_array
# edge_weights = np.stack((distance_array, num_trains_array)).T

In [8]:
print(edge_index.shape)
print(edge_weights.shape)

(2, 1470)
(1470,)


### Create the temporal snapshots

In [9]:
str_to_date = lambda str_: datetime.strptime(str_, '%d-%m-%Y').date()
date_to_str = lambda date_: date_.strftime("%d-%m-%Y")

def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

In [10]:
s = ""
s.split()

[]

In [11]:
s = "01 Hr 40 Mins"
s = "40 Mins"
s = ""

def read_delay(delay: str) -> int:
    split = delay.split()
    if len(split) == 0:
        return 0
    elif len(split) == 2:
        return int(split[0])
    else:
        return 60 * int(split[0]) + int(split[2])

In [32]:
num_nodes = len(np.unique(np.concatenate((edge_index[0], edge_index[1]))))
num_feats = 1

# decide the interval
start = str_to_date("01-05-2023")
end = str_to_date("01-08-2023")
days = (end - start).days

# create a list for storing the temporal data
node_feats_list = []

# iterate over the days
for day in daterange(start, end):
    station_seen_count = {i:0 for i in station_ids.values()}
    # define a matrix for that day.
    # create a numpy array of size (#nodes,) filled with zeros.
    node_feats = np.zeros((num_nodes, 1))
    # iterate over the trains
    for train, train_data in data.items():
        key = date_to_str(day)
        try:
            day_data = train_data[key]
        except KeyError:
            # print(train)
            continue
        # iterate over the stations
        for station, delay in day_data.items():
            # if there is a delay, add 1 to the correponding feature of node_feats
            delay = read_delay(delay)
            node_feats[station_ids[station]] += delay
            station_seen_count[station_ids[station]] += 1
    # average each station's delay
    for i in range(node_feats.shape[0]):
        node_feats[station_ids[station]] /= station_seen_count[station_ids[station]]
    node_feats_list.append(node_feats)

# convert the list to a numpy array
stacked_target = np.stack(node_feats_list)
# re-order the axes to (#trains, #features, time)
stacked_target = np.transpose(stacked_target,(1, 2, 0))

In [33]:
node_feats_list[0].shape, stacked_target.shape

((984, 1), (984, 1, 92))

In [34]:
stacked_target[0]

array([[ 16.,   0.,   1., 110., 234.,   5.,   5.,   9.,   6.,  66.,  51.,
        151.,  41.,  74., 293.,   5.,   5.,  12.,   3.,   3.,   1.,   4.,
          5.,   4.,  26., 112.,  80., 109.,  70.,  10.,  45.,  45.,  55.,
          1.,  22.,  80.,   7.,  14.,   4.,   4.,   3.,   3.,   5.,  14.,
         14.,  13.,  27., 191.,   1.,   7.,  16.,  14.,   8.,   7.,  19.,
          9.,  10.,  12.,  15.,   9.,  93., 304.,  21., 128.,  23.,  24.,
        246., 316., 170., 341., 341.,  87., 126.,  48., 277.,  47.,  37.,
        303., 122.,  66., 159.,  50.,  52.,  25.,  44.,  69.,  57.,   2.,
          6.,  18.,   7., 193.]])

### Create pairs of features and targets

In [12]:
"""
lags = 7

# Results in nan values.
# standardized_target = (
#     stacked_target - np.mean(stacked_target, axis=0)
# ) / np.std(stacked_target, axis=0)

# training days
features = [
    stacked_target[i : i + lags, :].T
    for i in range(len(node_feats_list) - lags)
]
# predict next day
targets = [
    stacked_target[i + lags, :].T
    for i in range(len(node_feats_list) - lags)
]
"""

'\nlags = 7\n\n# Results in nan values.\n# standardized_target = (\n#     stacked_target - np.mean(stacked_target, axis=0)\n# ) / np.std(stacked_target, axis=0)\n\n# training days\nfeatures = [\n    stacked_target[i : i + lags, :].T\n    for i in range(len(node_feats_list) - lags)\n]\n# predict next day\ntargets = [\n    stacked_target[i + lags, :].T\n    for i in range(len(node_feats_list) - lags)\n]\n'

In [35]:
lags = 7

# training days
features = [
    stacked_target[:, :, i : i + lags]
    for i in range(len(node_feats_list) - lags)
]
# predict next day
targets = [
    stacked_target[:, :, i + lags]
    for i in range(len(node_feats_list) - lags)
]

In [36]:
print(len(features), features[0].shape)
print(len(targets), targets[0].shape)

85 (984, 1, 7)
85 (984, 1)


### Create the torch geometric temporal dataset
Reference: [Docs](https://pytorch-geometric-temporal.readthedocs.io/en/latest/_modules/torch_geometric_temporal/signal/static_graph_temporal_signal.html)

In [37]:
from torch_geometric_temporal.signal import StaticGraphTemporalSignal

dataset = StaticGraphTemporalSignal(
    edge_index, edge_weights, features, targets
)

In [38]:
torch.save(dataset, "../data/trains_time.pt")

In [39]:
print(dataset[0])

Data(x=[984, 1, 7], edge_index=[2, 1470], edge_attr=[1470], y=[984, 1])


In [40]:
dataset[0].edge_index

tensor([[  0,   1,   2,  ..., 122, 983, 123],
        [  1,   2,   3,  ..., 983, 123, 364]])

In [41]:
dataset[0].edge_attr

tensor([2., 2., 3.,  ..., 1., 1., 1.])

In [42]:
dataset[0].y

tensor([[  9.],
        [ 27.],
        [ 30.],
        [ 38.],
        [ 18.],
        [ 20.],
        [ 67.],
        [ 76.],
        [ 18.],
        [ 41.],
        [ 37.],
        [ 65.],
        [ 39.],
        [ 33.],
        [ 59.],
        [ 36.],
        [ 67.],
        [ 37.],
        [  6.],
        [ 75.],
        [112.],
        [151.],
        [152.],
        [173.],
        [  1.],
        [  0.],
        [ 20.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  9.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  1.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        [  0.],
        

In [43]:
dataset[0].x.shape, dataset[0].x.sum()

(torch.Size([984, 1, 7]), tensor(79528.))

# Rough