In [1]:
import psycopg as pg
import json
import pandas as pd
import ast
import datetime
import numpy as np
import torch

from torch_geometric.data import Data
from datetime import timedelta
from zoneinfo import ZoneInfo

from parameters import GraphAEParameters, GATAEParameters, RSTAEParameters
from models import GraphAE, GATAE, RelationalSTAE 
from training import load_model
from datautils import load_best_parameters, generate_edges, generate_relational_edges
from dbhelpers import make_ts_db_connection, insert_predictions

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
all_milemarkers = np.array([53.3, 53.6, 53.9, 54.1, 54.6, 55. , 55.3, 55.5, 56. , 56.3, 56.7,
       57.3, 57.7, 58.1, 58.3, 58.6, 58.8, 59. , 59.3, 59.7, 60. , 60.4,
       60.5, 61. , 61.5, 62.1, 62.5, 63.1, 63.6, 64. , 64.3, 64.5, 64.8,
       65. , 65.1, 65.6, 65.9, 66.3, 66.5, 66.7, 66.9, 67.3, 67.8, 68.2,
       68.5, 68.8, 69.3, 69.8, 70.1])

In [3]:
def get_query(query_name):
    if query_name == "last_hour":
        return """SELECT * 
        FROM raw_messages.raw_links
        WHERE link_update_time >= NOW() - INTERVAL '1 hour' """
    elif query_name == "10_mins":
        return """SELECT * 
        FROM raw_messages.raw_links
        WHERE link_update_time >= NOW() - INTERVAL '10 minutes' """
    else:
        raise NotImplementedError("Query not yet implemented. Check spelling.")

In [4]:
conn = make_ts_db_connection()

print(conn)
with conn.cursor() as cur:
    query = get_query('10_mins')
    cur.execute(query)
    data = cur.fetchall()

    if data:
        df = pd.DataFrame(data)
        print(df.head())

conn.close()

<connection object at 0x7f0d4165a200; dsn: 'user=austin password=xxx dbname=aidss-prod host=10.80.3.22 port=5432', closed: 0>
                                0              1    2                      3   \
0 2025-02-04 13:59:27.656868-06:00  1738699167656  656  R3G-00I24-64.8W (547)   
1 2025-02-04 13:59:27.818783-06:00  1738699167818  650  R3G-00I24-69.4E (519)   
2 2025-02-04 13:59:27.870160-06:00  1738699167870  643  R3G-00I24-67.8W (513)   
3 2025-02-04 13:59:27.711336-06:00  1738699167711  580  R3G-00I24-54.4E (231)   
4 2025-02-04 13:59:27.765663-06:00  1738699167765  657  R3G-00I24-65.3E (548)   

                                4    5              6     7          8     9   \
0 2025-02-04 13:59:27.197766-06:00  590  Interstate 24  64.8  Westbound  76.0   
1 2025-02-04 13:59:27.197766-06:00  584  Interstate 24  69.4  Eastbound  78.0   
2 2025-02-04 13:59:27.197766-06:00  577  Interstate 24  67.8  Westbound  70.0   
3 2025-02-04 13:59:27.196772-06:00  541  Interstate 24  54.4  E

In [5]:
# 8 is direction
# 14 is lane data
# 4 is link update time
# 7 is milemarker

In [6]:
def extract_lane_data(row):
    row = ast.literal_eval(row)
    data = []
    lanes = []

    for key in row.keys():

        lane_info = row[key]

        lane = int(lane_info[0][-1])
        lanes.append(lane)

        if len(lane_info) < 4:
            speed = None
            volume = None
            occ = None
        else: 
            speed = lane_info[1]
            volume = lane_info[2] 
            occ = lane_info[3]

        data.append([lane, occ, speed, volume])

    # Ensure lane 1, then 2, then 3, then 4
    sorted_data = [x for _, x in sorted(zip(lanes, data), key=lambda pair: pair[0])]
    
    return data

In [7]:
def process_milemarker(row):
    lane_data = extract_lane_data(row[14])
    milemarker = row[7]
    time = row[4]

    new_rows = []
    for data in lane_data:
        new_rows.append([time, milemarker, *data])

    return new_rows

In [8]:
def round_time(time):
    seconds = time.second
    microseconds = time.microsecond
    total_seconds = seconds + microseconds / 1_000_000.0

    if total_seconds % 60 < 15:
        new_second = 0
    elif 15 <= total_seconds % 60 < 45:
        new_second = 30
    else:
        new_second = 0
        time = time + timedelta(minutes=1)

    return time.replace(second=new_second, microsecond=0)

In [9]:
def round_times(data):
    data[4] = data[4].apply(round_time)
    return data

In [10]:
def fill_milemarkers(data):
    result_np = np.array(data)
    for milemarker in all_milemarkers:
        if milemarker not in result_np[:,1]:
            data.append([result_np[0][0], milemarker, 1, None, None, None])
            data.append([result_np[0][0], milemarker, 2, None, None, None])
            data.append([result_np[0][0], milemarker, 3, None, None, None])
            data.append([result_np[0][0], milemarker, 4, None, None, None])

    return data

In [11]:
def process_milemarkers(data):
    result = []
    for i, row in data.iterrows():
        if len(ast.literal_eval(row[14]).keys()) == 4:
            data = process_milemarker(row)
            for d in data:
                result.append(d)
                
    result = fill_milemarkers(result)

    return result

In [12]:
def process_times(data):
    data = data[(data[7] >= 53.3) & (data[7] <= 70.1)]
    data = data[data[8] == 'Westbound']
    data = round_times(data)
    result = []

    for time in np.unique(data[4]):
        curr_time = data[data[4] == time]
        processed_data = process_milemarkers(curr_time)
        for p in processed_data:
            result.append(p)

    return result

In [13]:
def process_as_df(data):
    processed_data = process_times(data)
    processed_data = np.array(processed_data)
    p_df = pd.DataFrame(processed_data, columns=['Time', 'Milemarker', 'Lane', 'Occupancy', 'Speed', 'Volume'])
    p_df = p_df.groupby(['Time', 'Milemarker', 'Lane'], as_index=False).mean()
    p_df = p_df.loc[p_df['Milemarker'].isin(all_milemarkers)]

    return p_df

In [14]:
def normalize_data(data):
    data['Occupancy'] /= 100
    data['Speed'] /= 80
    data['Volume'] /= 25

    return data 

In [15]:
def load_gcn():
    optimal_hyperparams = load_best_parameters('gcn_v2')
    hyperparams = GraphAEParameters(
        num_features=3,
        latent_dim=optimal_hyperparams['latent_dim'],
        gcn_hidden_dim=optimal_hyperparams['gcn_hidden_dim'],
        dropout=optimal_hyperparams['dropout'],
        num_gcn=optimal_hyperparams['num_gcn']
    )

    ae = load_model(GraphAE, hyperparams, 'gcn')

    return ae

In [16]:
def gcn_input_process(data):
    data = data.apply(lambda x: x.fillna(x.mean()), axis=0)
    normalized_data = normalize_data(data)
    time_pred = normalized_data.iloc[-1]['Time']
    time_pred = time_pred - pd.Timedelta(minutes=1) # 1 minute before so data has time to come in

    static_edges = generate_edges(milemarkers=list(range(49)))

    gcn_input = normalized_data[normalized_data['Time'] == time_pred].to_numpy(dtype=np.float32)[:,3:6]
    curr_graph = Data(x=torch.tensor(gcn_input, dtype=torch.float32), edge_index=static_edges)

    return curr_graph, time_pred

In [17]:
gcn_ae = load_gcn()

In [18]:
processed_data = process_as_df(df)

In [19]:
gcn_input, time_pred = gcn_input_process(processed_data)

In [20]:
processed_data

Unnamed: 0,Time,Milemarker,Lane,Occupancy,Speed,Volume
0,2025-02-04 13:49:30-06:00,53.3,1,4.0,75.0,7.0
1,2025-02-04 13:49:30-06:00,53.3,2,8.0,68.0,8.0
2,2025-02-04 13:49:30-06:00,53.3,3,5.0,64.0,7.0
3,2025-02-04 13:49:30-06:00,53.3,4,7.0,57.0,6.0
4,2025-02-04 13:49:30-06:00,53.6,1,7.0,81.0,11.0
...,...,...,...,...,...,...
4111,2025-02-04 13:59:30-06:00,69.8,4,,,
4112,2025-02-04 13:59:30-06:00,70.1,1,,,
4113,2025-02-04 13:59:30-06:00,70.1,2,,,
4114,2025-02-04 13:59:30-06:00,70.1,3,,,


In [21]:
gcn_recons = gcn_ae(gcn_input).detach().numpy()

In [22]:
def load_stg_gat():
    optimal_hyperparams = load_best_parameters('gat')

    hyperparams = GATAEParameters(
        num_features=3,
        latent_dim=optimal_hyperparams['latent_dim'],
        gcn_hidden_dim=optimal_hyperparams['gcn_hidden_dim'],
        dropout=optimal_hyperparams['dropout'],
        num_layers=optimal_hyperparams['num_layers'],
        num_heads=optimal_hyperparams['num_heads']
    )

    ae = load_model(GATAE, hyperparams, 'gat')

    return ae, optimal_hyperparams['timesteps']

In [23]:
def gat_input_process(data, timestep):
    data = data.apply(lambda x: x.fillna(x.mean()), axis=0)
    normalized_data = normalize_data(data)
    thirty_seconds_delay = 2 # how many 30-second windows to delay predictions by while waiting for more data
    time_preds = []
    for i in range(timestep-1, -1, -1):
        time_preds.append(np.unique(normalized_data['Time'])[-1-thirty_seconds_delay-i])

    # print(normalized_data.iloc[-2]['Time'])
    print(time_preds)

    relational_edges, relations = generate_relational_edges(milemarkers=list(range(49)), timesteps=timestep)
    inputs = []
    for time_pred in time_preds:
        gat_input = normalized_data[normalized_data['Time'] == time_pred].to_numpy(dtype=np.float32)[:,3:6]
        print(gat_input.shape)
        inputs.append(gat_input)
    
    inputs = np.concatenate(inputs)
    print(inputs.shape)

    pyg_data = Data(x=torch.tensor(inputs, dtype=torch.float32), edge_index=relational_edges, edge_attr=torch.tensor(relations, dtype=torch.long))

    return pyg_data

In [24]:
gat_ae, gat_timestep = load_stg_gat()

In [25]:
processed_data = process_as_df(df)

In [26]:
gat_data = gat_input_process(processed_data, gat_timestep)

[Timestamp('2025-02-04 13:58:00-0600', tz='UTC-06:00'), Timestamp('2025-02-04 13:58:30-0600', tz='UTC-06:00')]
(196, 3)
(196, 3)
(392, 3)


In [27]:
gat_recons = gat_ae(gat_data).detach().numpy()

In [28]:
def load_rgcn_ae():
    optimal_hyperparams = load_best_parameters('rstae')

    hyperparams = RSTAEParameters(
        num_features=3,
        latent_dim=optimal_hyperparams['latent_dim'],
        gcn_hidden_dim=optimal_hyperparams['gcn_hidden_dim'],
        dropout=optimal_hyperparams['dropout'],
        num_gcn=optimal_hyperparams['num_gcn']
    )

    ae = load_model(RelationalSTAE, hyperparams, 'rstae')

    return ae, optimal_hyperparams['timesteps']

In [29]:
rstae, rstae_timestep = load_rgcn_ae()

In [30]:
def rstae_input_process(data, timestep):
    data = data.apply(lambda x: x.fillna(x.mean()), axis=0)
    normalized_data = normalize_data(data)
    thirty_seconds_delay = 2 # how many 30-second windows to delay predictions by while waiting for more data
    time_preds = []
    for i in range(timestep-1, -1, -1):
        time_preds.append(np.unique(normalized_data['Time'])[-1-thirty_seconds_delay-i])

    print(time_preds)

    relational_edges, relations = generate_relational_edges(milemarkers=list(range(49)), timesteps=timestep)
    inputs = []
    for time_pred in time_preds:
        rstae_input = normalized_data[normalized_data['Time'] == time_pred].to_numpy(dtype=np.float32)[:,3:6]
        inputs.append(rstae_input)
    
    inputs = np.concatenate(inputs)

    pyg_data = Data(x=torch.tensor(inputs, dtype=torch.float32), edge_index=relational_edges, edge_attr=torch.tensor(relations, dtype=torch.long))

    return pyg_data

In [31]:
rstae_data = rstae_input_process(processed_data, rstae_timestep)

[Timestamp('2025-02-04 13:55:00-0600', tz='UTC-06:00'), Timestamp('2025-02-04 13:55:30-0600', tz='UTC-06:00'), Timestamp('2025-02-04 13:56:00-0600', tz='UTC-06:00'), Timestamp('2025-02-04 13:56:30-0600', tz='UTC-06:00'), Timestamp('2025-02-04 13:57:00-0600', tz='UTC-06:00'), Timestamp('2025-02-04 13:57:30-0600', tz='UTC-06:00'), Timestamp('2025-02-04 13:58:00-0600', tz='UTC-06:00'), Timestamp('2025-02-04 13:58:30-0600', tz='UTC-06:00')]


In [32]:
rstae_recons = rstae(rstae_data).detach().numpy()

In [33]:
def load_thresholds():
    gcn_thresh = np.load('saved_models/gcn_thresh.npy')
    gat_thresh = np.load('saved_models/gat_thresh.npy')
    rstae_thresh = np.load('saved_models/rstae_thresh.npy')

    return gcn_thresh, gat_thresh, rstae_thresh

In [34]:
def compute_error(predictions, data):
    error = (predictions - data)**2
    error = np.mean(error, axis=1)
    return error

In [35]:
gcn_thresh, gat_thresh, rstae_thresh = load_thresholds()

In [36]:
processed_data =  processed_data.apply(lambda x: x.fillna(x.mean()), axis=0)
current_data = normalize_data(processed_data[processed_data['Time']==time_pred]).to_numpy()[:,3:6]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Occupancy'] /= 100
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Speed'] /= 80
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Volume'] /= 25


In [37]:
gcn_error = compute_error(gcn_recons, current_data)
gat_error = compute_error(gat_recons, current_data)
rstae_error = compute_error(rstae_recons, current_data)

In [38]:
gcn_error > gcn_thresh

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [39]:
def format_predictions(time, gcn_error, gcn_thresh, gat_error, gat_thresh, rstae_error, rstae_thresh):
    result = []
    milemarker_list = all_milemarkers.repeat(4)
    lane_list = np.tile(np.arange(1, 5), 49)

    for mile, lane, gcn_e, gcn_t, gat_e, gat_t, rstae_e, rstae_t in zip(milemarker_list, lane_list, gcn_error, gcn_thresh, gat_error, gat_thresh, rstae_error, rstae_thresh):
        r3_tz = ZoneInfo('US/Central')
        localized_time = time.astimezone(r3_tz)

        rds_update_time = localized_time
        result.append({'rds_update_time': rds_update_time, 
                       'milemarker': float(mile), 
                       'lane_id': int(lane),
                       'direction': "Westbound", 
                       'reconstruction_error_gcn': float(gcn_e), 
                       'reconstruction_error_gat': float(gat_e),
                       'reconstruction_error_rgcn': float(rstae_e), 
                       'threshold_gcn': float(gcn_t),
                       'threshold_gat': float(gat_t),
                       'threshold_rgcn': float(rstae_t)})

    return result

In [40]:
predictions = format_predictions(time_pred, gcn_error, gcn_thresh, gat_error, gat_thresh, rstae_error, rstae_thresh)

In [41]:
predictions

[{'rds_update_time': Timestamp('2025-02-04 13:58:30-0600', tz='US/Central'),
  'milemarker': 53.3,
  'lane_id': 1,
  'direction': 'Westbound',
  'reconstruction_error_gcn': 0.017162913187029862,
  'reconstruction_error_gat': 0.011781368144908398,
  'reconstruction_error_rgcn': 0.012357416677603131,
  'threshold_gcn': 0.10256054,
  'threshold_gat': 0.120722376,
  'threshold_rgcn': 0.11990687},
 {'rds_update_time': Timestamp('2025-02-04 13:58:30-0600', tz='US/Central'),
  'milemarker': 53.3,
  'lane_id': 2,
  'direction': 'Westbound',
  'reconstruction_error_gcn': 0.0010735633497415441,
  'reconstruction_error_gat': 0.0017944302667926872,
  'reconstruction_error_rgcn': 0.005052087613730713,
  'threshold_gcn': 0.3654051,
  'threshold_gat': 0.35742438,
  'threshold_rgcn': 0.36751482},
 {'rds_update_time': Timestamp('2025-02-04 13:58:30-0600', tz='US/Central'),
  'milemarker': 53.3,
  'lane_id': 3,
  'direction': 'Westbound',
  'reconstruction_error_gcn': 0.002867770470526113,
  'reconstruc

In [42]:
insert_predictions(predictions)

ProgrammingError: can't adapt type 'numpy.float32'