In [11]:
import analysis as an
import importlib
importlib.reload(an)

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import tqdm

# visualization
import matplotlib.pyplot as plt

data = an.BikeShareData.load_from_pickle(name = '2024-5')

avg_lat = np.mean(data.stations['lat'])
avg_lon = np.mean(data.stations['lon'])

In [2]:
import pyproj

__geod = pyproj.Geod(ellps='WGS84')
def dst(lat1, lon1, lat2, lon2):
    """
    Calculate the distance between two points on the Earth's surface in meters.
    """
    angle1,angle2,distance = __geod.inv(lon1, lat1, lon2, lat2)
    return distance

In [14]:
import math
# calculate adjacency matrix for the station distance graph
# the station graph 
max_dst = 500 # meters
min_stations_connected = 2 # should always be connected to at least min_stations_connected other stations

# remove stations with missing lat/lon
# after having eliminated ghost stations (missing lat/lon), we have a new sequential index
stations = data.stations[~(data.stations['lat'].isna() | data.stations['lon'].isna())]
# thus we have new_index < old_index
new2old_idx = np.array([old_idx for old_idx in stations.index])
stations = stations.reset_index(drop=True)

# connect stations that are within max_dst of each other; or connect the closest stations until min_stations_connected is reached ; additionally add self-loops

adj = np.zeros((len(stations), len(stations)))
lats, lons = np.array(stations['lat']), np.array(stations['lon'])
dst_matrix = np.zeros_like(adj, dtype=np.float32)
N_stations = len(stations)
for i in range(len(stations)):
    dst_matrix[i, :] = dst(np.tile(lats[i], N_stations), np.tile(lons[i], N_stations), lats, lons)

# adj = adj + adj.T + np.eye(len(stations)) # add self-loops
adj[dst_matrix < max_dst] = 1
for i in range(len(stations)):
    if np.sum(adj[i, :]) < min_stations_connected * 2 + 1: # node degree is (sum of row i - 1) // 2 (we exclude the self-loop)
        # connect the closest stations
        closest = np.argsort(dst_matrix[i, :])
        for j in range(2 * min_stations_connected + 1):
            adj[i, closest[j]] = 1

# test that at least min_stations are connected
assert np.all(np.sum(adj, axis=1) >= min_stations_connected)
# calculate node degree params
node_degrees = (np.sum(adj, axis=1) - 1) // 2
print(f"Node degree: min {np.min(node_degrees)}, max {np.max(node_degrees)}, mean {np.mean(node_degrees)}, std {np.std(node_degrees)}")

print("Dst shape:", dst_matrix.shape)

Node degree: min 2.0, max 16.0, mean 3.8776470588235292, std 2.9931834900059653
Dst shape: (850, 850)


In [12]:
import numpy as np
mat = np.random.randint(0, 2, (5, 5))
mat = (mat + mat.T + np.eye(5)) // 2
print(mat)
inds_i, inds_j = mat.nonzero()
list(zip(inds_i, inds_j))

adj = mat.copy()
adj[np.tril_indices(adj.shape[0], k= -1)] = 0 # zero out the strict lower triangle (starting at diagonal -1 to the right) to not count edges twice
inds_i, inds_j = np.nonzero(adj)
num_edges = len(inds_i)
np.stack([inds_i, inds_j], axis = 0)

[[1. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0.]
 [0. 1. 0. 1. 0.]
 [0. 0. 0. 0. 0.]]


array([[0, 1, 3],
       [0, 3, 3]])

In [None]:
# !pip install scikit-learn
import sklearn.utils
#td: select train, eval and test days randomly from the month
N_days = 30
train_days = range(0,24)
eval_days = range(24, 28)
test_days = range(28, 30)


ImportError: cannot import name 'train_test_split' from 'sklearn.utils' (/mnt/vol2/BikeSharePrediction/.venv/lib/python3.12/site-packages/sklearn/utils/__init__.py)

In [None]:
import torch_geometric.data as geomdata

cfg = {'N_pred': 12, 'N_hist' : 12} 
n_window = cfg['N_hist'] + cfg['N_pred']
sequences = []

graphdata = geomdata.Data()
# data.x : source ; dimensions N × F
# data.y : pred ; dimensions N × Predicted ; here, we do N × History × FeatureLen (FeatureLen = 2 (In rates, out rates))
in_rates = data.in_rates[new2old_idx, 0::5] # only retrieve the real station data , and only take it all five minutes
# iterate the window over the whole data

print(geomdata.Data.__doc__)
in_rates.shape

A data object describing a homogeneous graph.
    The data object can hold node-level, link-level and graph-level attributes.
    In general, :class:`~torch_geometric.data.Data` tries to mimic the
    behavior of a regular :python:`Python` dictionary.
    In addition, it provides useful functionality for analyzing graph
    structures, and provides basic PyTorch tensor functionalities.
    See `here <https://pytorch-geometric.readthedocs.io/en/latest/get_started/
    introduction.html#data-handling-of-graphs>`__ for the accompanying
    tutorial.

    .. code-block:: python

        from torch_geometric.data import Data

        data = Data(x=x, edge_index=edge_index, ...)

        # Add additional arguments to `data`:
        data.train_idx = torch.tensor([...], dtype=torch.long)
        data.test_mask = torch.tensor([...], dtype=torch.bool)

        # Analyzing the graph structure:
        data.num_nodes
        >>> 23

        data.is_directed()
        >>> False

        # PyTorch 

(850, 83083)

[1, 2, 1, 2, 3]

In [21]:
import os
from pathlib import Path

cwd = Path(os.getcwd())
cwd.parents[0], cwd.parents[1]
str(cwd)


'/mnt/vol2/BikeSharePrediction/src'

In [1]:
print(f"{2.058e-10:.2e}")

2.06e-10


In [None]:
import importlib
import dataset
from run_training import overfit_config
import numpy as np
import matplotlib.pyplot as plt

cfg = overfit_config()
data = dataset.BikeGraphDataset(cfg)
model = 

# plot the first sample
sample = data[0]
print("Shapes:" ,sample['x'].shape, sample['y'].shape)
y = sample['y'].view(cfg['N_stations'], cfg['N_predictions'], 2)
station = 0
# interpolate the y data with cubic splines


plt.plot(y[station, :, 0].numpy(), label='In Rates')
plt.plot(y[station, :, 1].numpy(), label='Out Rates')
# set axis to 5 minutes at each sample


plt.xticks(np.arange(0, cfg['N_predictions']), np.arange(0, cfg['N_predictions'] * cfg['subsample_minutes'], cfg['subsample_minutes']))
plt.gca().set(xlabel='Time (minutes)', ylabel='Rate (bikes/min)', title='In Rates')


ModuleNotFoundError: spec not found for the module 'dataset'

In [None]:
cfg

3