# Processing

Create the functions that produce the feature matrix X and the adjacency matrix A needed by the model.

In [1]:
import json
import numpy as np
from copy import deepcopy
import math

In [2]:
# The higher-level processing function would loop through all of the dataset file paths.
# For this notebook, a single file path is used as the worked example; it is the
# argument passed to get_adjacency() and get_features() in the cells below.
# The path is relative, so it is resolved against the notebook's working directory.
data_file_path = 'truncated_data/trunc/Wed_Mar_10_2021/15_40_09.txt'

## Adjacency Matrix
Note: The adjacency matrix only needs to be pre-generated once on one of the datasets and saved as a .npy file, and subsequently loaded if needed.

In [None]:
# Load the example dataset into the notebook namespace.
# NOTE(review): get_adjacency() and get_features() each re-read the file from its
# path, so this `traffic_records` is not an input to either of them.
with open(data_file_path, 'r') as traffic_data_file: # Dataset is a list of objects
    traffic_records = json.load(traffic_data_file)

In [14]:
def get_adjacency(file_path):
    '''
    Generates a directed Adjacency matrix of the road links in a dataset
    -----------------------------
    Every record (road link) of the dataset becomes one node. Node i is the
    i-th record once the records are sorted by integer LinkID.
    There is a directed edge from Node A to Node B if the end position of
    Node A is the start position of Node B (compared as "lat long" strings).
    -----------------------------
    :param str file_path: the file path of the dataset, a JSON list of
        records that each hold a 'LinkID' and a whitespace separated
        'Location' ("start_lat start_long end_lat end_long")
    -----------------------------
    :returns:
        npy: Adjacency matrix of shape (nodes, nodes) and integer dtype,
             where A[i, j] == 1 marks a directed edge from Node i to Node j
    '''
    with open(file_path, 'r') as traffic_data_file:
        traffic_records = json.load(traffic_data_file)

    # Node index == position in ascending integer LinkID order.
    # sorted() is stable, so records sharing a LinkID keep their file order.
    ordered_records = sorted(traffic_records, key=lambda x: int(x.get('LinkID')))

    # Only the two end points of every link are needed to find the edges
    start_positions = []
    end_positions = []
    for record in ordered_records:
        lat_long_positions = record['Location'].split()
        start_positions.append(' '.join(lat_long_positions[0:2]))
        end_positions.append(' '.join(lat_long_positions[2:4]))

    # Index the nodes by start position so that the successors of a node can
    # be looked up directly instead of comparing every pair of nodes
    nodes_by_start = {}
    for j, start_pos in enumerate(start_positions):
        nodes_by_start.setdefault(start_pos, []).append(j)

    # Generating a Directed Adjacency matrix
    nodes_count = len(ordered_records)
    A = np.zeros((nodes_count, nodes_count), dtype=int)
    for i, end_pos in enumerate(end_positions):
        for j in nodes_by_start.get(end_pos, ()):
            A[i, j] = 1

    return A

In [15]:
# Build the directed adjacency matrix for the example dataset
A = get_adjacency(data_file_path)

In [16]:
# Inspect the adjacency matrix and its (nodes, nodes) shape
print(A)
print(A.shape)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 1 0 0]]
(53, 53)


## Feature Matrix
Note: Feature Matrix, X, would contain the output speedband as well.
> Actually get_adjacency() & get_features() can be combined together.
> It's okay to leave them separate for now.

In [54]:
def link_length(start_pos, end_pos, radius=6371):
    """
    Calculation of distance between two lat-long geo positions, using Haversine distance
    ------------------------------------
    :param string start_pos: lat & long (decimal degrees) separated with a space
    :param string end_pos: lat & long (decimal degrees) separated with a space
    :param float radius: radius of the sphere the positions lie on; the result
        is expressed in the same unit. Defaults to 6371, the mean radius of
        the Earth in kilometres
    ------------------------------------
    :returns:
        float: total length of the link, in the unit of `radius`
               (kilometres by default); returned as a NumPy float
    :raises ValueError: if a position does not hold exactly two numeric values
    """
    lat1, lon1 = [float(pos) for pos in start_pos.split()]
    lat2, lon2 = [float(pos) for pos in end_pos.split()]

    sin_half_dlat = math.sin(math.radians(lat2 - lat1) / 2)
    sin_half_dlon = math.sin(math.radians(lon2 - lon1) / 2)
    # Haversine term: the squared half-chord length between the two points
    a = (sin_half_dlat * sin_half_dlat +
         math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) *
         sin_half_dlon * sin_half_dlon)

    # Rounding can push `a` marginally outside [0, 1] (e.g. for antipodal
    # points), which would make the square roots below return NaN.
    # Plain comparisons are used so that a NaN input still propagates.
    if a > 1.0:
        a = 1.0
    elif a < 0.0:
        a = 0.0

    d = radius * (2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a)))
    return d


def get_features(file_path):
    '''
    Generates a Feature matrix
    Note: Feature Matrix, X, would contain the output speedband as well.
    -----------------------------
    Every record (road link) of the dataset becomes one node / one row of X.
    Node i is the i-th record once the records are sorted by integer LinkID,
    which is the node ordering get_adjacency() uses, so that row i of X
    describes the same link as row/column i of the adjacency matrix.

    Positions of Features (columns of X):
        0. SpeedBand
        1. RoadCategory (as the index given by the returned mapping)
        2. Length of Link (as computed by link_length())
    -----------------------------
    :param str file_path: the file path of the dataset, a JSON list of
        records that each hold 'LinkID', 'Location', 'SpeedBand' and
        'RoadCategory'
    -----------------------------
    :returns:
        npy: Feature matrix of shape (nodes, 3) and float dtype
        dict: Metadata of parameters; maps the node index to its record,
              extended with 'start_pos', 'end_pos' and 'length'
        dict: Mapping of each Road Category to its index (categories are
              indexed in sorted order)
    '''
    with open(file_path, 'r') as traffic_data_file:
        traffic_records = json.load(traffic_data_file)

    # Find out all Road Categories and give each one an index
    roadcategory_list = sorted({record['RoadCategory'] for record in traffic_records})
    RoadCat2Index = {cat: i for i, cat in enumerate(roadcategory_list)}

    # Same node ordering as the adjacency matrix: ascending integer LinkID.
    # sorted() is stable, so records sharing a LinkID keep their file order.
    ordered_records = sorted(traffic_records, key=lambda x: int(x.get('LinkID')))

    nodes_params_dict = {}
    X = []
    for i, record in enumerate(ordered_records):
        lat_long_positions = record['Location'].split()
        start_pos = ' '.join(lat_long_positions[0:2])
        end_pos = ' '.join(lat_long_positions[2:4])
        # Copy of the record extended with the derived fields
        node = dict(record,
                    start_pos=start_pos,
                    end_pos=end_pos,
                    length=link_length(start_pos, end_pos))
        nodes_params_dict[i] = node
        X.append([float(node['SpeedBand']),
                  RoadCat2Index[node['RoadCategory']],
                  node['length']])

    # reshape keeps the (nodes, 3) layout even when the dataset is empty
    return np.array(X, dtype=float).reshape(-1, 3), nodes_params_dict, RoadCat2Index

In [57]:
# Build the feature matrix (columns: SpeedBand, RoadCategory index, link length),
# the per-node records and the RoadCategory -> index mapping for the example dataset,
# then print all three for inspection.
X,nodes_params_dict,RoadCat2Index = get_features(data_file_path)
print(X)
print(nodes_params_dict)
print(RoadCat2Index)

[[3.         1.         0.08698627]
 [3.         1.         0.04609998]
 [4.         1.         0.1214745 ]
 [4.         1.         0.11285454]
 [4.         1.         0.01773246]
 [3.         1.         0.04363278]
 [4.         1.         0.03261392]
 [3.         1.         0.1313504 ]
 [5.         1.         0.04696537]
 [3.         1.         0.05844392]
 [4.         1.         0.09699181]
 [3.         1.         0.09690966]
 [4.         1.         0.05646912]
 [4.         1.         0.0756916 ]
 [4.         1.         0.0756916 ]
 [4.         1.         0.01459941]
 [4.         1.         0.01459941]
 [3.         1.         0.11190635]
 [3.         1.         0.05695016]
 [5.         0.         0.0220174 ]
 [3.         1.         0.10814834]
 [3.         1.         0.10814834]
 [3.         1.         0.03501077]
 [4.         1.         0.01996475]
 [4.         1.         0.02847569]
 [4.         1.         0.02155576]
 [4.         0.         0.00992923]
 [4.         0.         0.04