In [1]:

# Keras imports
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Feature imports
from features.feature_factory import FeatureFactory
from features.features import RiskFeaturesEnum

# Other imports
from remote_requests import get_date_range, save_predictive_model
import numpy as np
from collections import deque
from typing import Deque
import pickle as pkl
import os.path

# When True, the dataset-building cells are skipped and the previously pickled
# arrays are loaded from disk instead.
LOAD_DATASET = True
# Allows the "Save dataset" cell to replace pickle files that already exist.
# NOTE(review): this flag is reassigned to False in the "Dataset backup" cell
# before it is ever read, so the value set here currently has no effect.
OVERWRITE_DATASET = True

In [None]:
# physical_devices = tf.config.experimental.list_physical_devices('GPU')
# assert len(physical_devices) > 0, "Not enough GPU hardware devices available"
# config = tf.config.experimental.set_memory_growth(physical_devices[0], True)

### The Model

#### Model interface

The LSTM model takes as input, for each step, an `N`x`F` matrix containing the last `N` days of `F` features for a particular region, and predicts the risk index of that region for the present day.
```
  x  x--------------x                        x--------------x
  |  |..............|             +\         |              |
  |  |..............|      +------|  \       |   Risk       |
  N  |..............|      |          ]      |              |
  |  |..............|      +----- |  /       |     Index    |
  |  |..............|             +/         |              |
  x  x--------------x                        x--------------x
     x------F-------x
``` 

In [2]:
# Hyper-parameters of the network.
LSTM_layers = 8                # number of units in the (single) LSTM layer
intermediate_input_layers = 1  # NOTE(review): not used anywhere in this cell
dense_layers = 7               # number of units in the hidden dense layer
N = 7                          # days of history fed to the LSTM per sample
F = 9                          # features per day
G = 2                          # width of the second (non-sequential) input

# Two inputs: the N x F feature history and a flat vector of G extra values.
lstm_input = layers.Input(shape=(N, F))
# `shape` must be a tuple; a bare int only works on some older Keras versions.
dense_input = layers.Input(shape=(G,))

# The LSTM summarises the history into a single vector of LSTM_layers values.
lstm = layers.LSTM(LSTM_layers)(lstm_input)

# The summary is joined with the second input before the dense head.
concatenated = layers.concatenate([lstm, dense_input])

dense = layers.Dense(dense_layers, activation="relu")(concatenated)
# Single sigmoid unit: the prediction is bounded to the (0, 1) interval.
dense = layers.Dense(1, activation="sigmoid")(dense)
final_model = keras.Model([lstm_input, dense_input], dense)

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit an extra "None" line).
final_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 7, 9)]       0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 8)            576         input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 2)]          0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, 10)           0           lstm[0][0]                       
                                                                 input_2[0][0]                

2022-08-24 08:35:45.848822: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX AVX2 FMA
2022-08-24 08:35:45.872026: I tensorflow/core/platform/profile_utils/cpu_utils.cc:102] CPU Frequency: 2899885000 Hz
2022-08-24 08:35:45.872604: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x560bfb88c7f0 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2022-08-24 08:35:45.872632: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2022-08-24 08:35:45.873417: I tensorflow/core/common_runtime/process_util.cc:147] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


### Dataset preparation

In [None]:
if not LOAD_DATASET:
    # Feature factory
    feature_factory = FeatureFactory()

    # Lat/long for each region, (width x height x 2 matrix)
    region_centers, region_bounds = feature_factory.get_geo_features()

    # Dates (ISO)
    date_range = get_date_range()

    # One sample per (date, region) pair; the first N dates are excluded
    # because they are only used as history for later dates.
    dataset_size = (len(date_range) - N) * feature_factory.get_number_of_regions()

    # First model input: N days x F features per sample
    first_input_features = np.zeros(shape=(dataset_size, N, F), dtype=float)

    # Second model input: region location. The width is G so that it always
    # matches the shape declared for the model's second Input layer.
    second_input_features = np.zeros(shape=(dataset_size, G), dtype=float)

    # Expected output: one value per sample
    outputs = np.zeros(shape=(dataset_size,), dtype=float)


#### INPUT: Geographical space

The entire region shall be encased in a bounding box, split into cells of `w` km x `h` km dimension
- latitude = y axis
- longitude = x axis
```
  x NW-------------NE     x  +--------------+
  |  |.............|      |  |              |
  |  |.............|      |  |              |
 lat |.............|      h  | Region (X,Y) |
  |  |.............|      |  |              |
  |  |.............|      |  |              |
  x SW-------------SE     x  +--------------+
     x-----lon-----x         x-------w------x
```

In [None]:
if not LOAD_DATASET:
    # Human-readable summary of the grid that the feature factory works on.
    bounding_box = feature_factory.region_bounding_box

    print("\t\t\tAREA COVERED")
    print("=" * 61)
    print(f'Regions are {feature_factory.REGION_WIDTH_KM} km wide x {feature_factory.REGION_HEIGTH_KM} km tall')
    print(f'NW = {bounding_box.north_west.to_array()} \tNE = {bounding_box.north_east.to_array()}')
    print(f'SW = {bounding_box.south_west.to_array()} \tSE = {bounding_box.south_east.to_array()}')
    print(f'{bounding_box.width} km\twide\tx {bounding_box.height} km \ttall')
    # Width cells go with "wide" and height cells with "tall" (the two values
    # were previously printed the other way round).
    print(f'{feature_factory.region_width_cells} cells \t\twide\tx {feature_factory.region_height_cells} cells\t\ttall')

Each region has the following relevant features, derived from the accidents that occurred in that region on that day, together with the region's geographical information:
```
     0         1         2         3         4         5      |      0         1
+---------+---------+---------+---------+---------+---------+ | +---------+---------+
|  Total  |  Total  |  Total  |  Total  | Avg.    | Avg.    | | |         |         |
|number of|number of|number of|number of|  risk   |  time   | | |   lat   |   lon   |
|accidents|deaths   |sers. inj|light inj|   index |   slots | | |         |         |
+---------+---------+---------+---------+---------+---------+ | +---------+---------+
```
Risk index will be a number calculated from the accident information that reflects "how dangerous" the region was that day.
```
             3 * number of deaths + 2 * number of serious injuries + number of light injuries 
Risk index = --------------------------------------------------------------------------------
                                   number of involved vehicles
```
#### INPUT: Weather data
Weather data is the same for every region, since only one measuring point is available. It is described separately from the region features, as the following feature set:
```
     0         1         2
+---------+---------+---------+
|Avg.     |Avg.     |Avg.     |
| visib.  | precipit| air     |
|  prcnt. |  millim.|  humid. |
+---------+---------+---------+
```

In [None]:

if not LOAD_DATASET:
    # Feature history of every region: one row of F values per date, filled in
    # chronologically as the dates are processed.
    region_queues = np.zeros(shape=(
        feature_factory.region_width_cells,
        feature_factory.region_height_cells,
        len(date_range),
        F
    ))

    # Position of the next sample to write in the dataset arrays.
    global_index = 0

    for date_index, date in enumerate(date_range):
        # The first N dates only seed the history; samples are produced from
        # date N onwards, once N previous days are available.
        emits_samples = date_index >= N
        if emits_samples:
            print(f'Processing date {date}', end='\r')

        weather_features = feature_factory.get_weather_features_for_date(date)
        risk_features = feature_factory.get_risk_features_for_date(date)
        weather_count = len(weather_features)

        for xi in range(feature_factory.region_width_cells):
            for yi in range(feature_factory.region_height_cells):
                region_features = risk_features[xi][yi]
                risk_count = len(region_features)

                if emits_samples:
                    # Input window: the N days immediately BEFORE this date.
                    # (Slicing with [-N:] would take the last N rows of the
                    # whole date axis, which are still zero at this point.)
                    first_input_features[global_index] = region_queues[xi, yi, date_index - N:date_index]
                    second_input_features[global_index] = region_centers[xi][yi]
                    # Target: this date's risk level for the region.
                    outputs[global_index] = region_features[RiskFeaturesEnum.AVG_RISK_LEVEL.value]
                    global_index += 1

                # Record this date's row: risk features first, weather after.
                region_queues[xi, yi, date_index, :risk_count] = region_features
                region_queues[xi, yi, date_index, risk_count:risk_count + weather_count] = weather_features

### Dataset backup

In [3]:
# Pickle files (relative to the working directory) holding the dataset arrays:
# the first model input, the second model input and the expected outputs.
INPUTS_1_FILE = "inputs_1.pkl"
INPUTS_2_FILE = "inputs_2.pkl"
OUTPUTS_FILE = "outputs.pkl"
# With False, the save cell only writes files that do not exist yet.
# NOTE(review): this overrides the OVERWRITE_DATASET = True set in the first
# cell; keep a single definition once the intended default is confirmed.
OVERWRITE_DATASET = False

#### Save dataset

In [None]:
if not LOAD_DATASET:
    # Each array goes to its own pickle file. An existing file is left alone
    # unless OVERWRITE_DATASET is set.
    for target_path, array in (
        (INPUTS_1_FILE, first_input_features),
        (INPUTS_2_FILE, second_input_features),
        (OUTPUTS_FILE, outputs),
    ):
        if OVERWRITE_DATASET or not os.path.exists(target_path):
            with open(target_path, 'wb') as handle:
                pkl.dump(array, handle)


#### Load dataset

In [4]:
if LOAD_DATASET:
    # Fail early with a clear message: skipping a missing file silently would
    # only surface later as a NameError on the array it should have defined.
    missing_files = [
        path
        for path in (INPUTS_1_FILE, INPUTS_2_FILE, OUTPUTS_FILE)
        if not os.path.exists(path)
    ]
    if missing_files:
        raise FileNotFoundError(
            "Dataset file(s) not found: " + ", ".join(missing_files)
            + ". Set LOAD_DATASET = False to rebuild the dataset."
        )

    # NOTE: pickle can execute arbitrary code while loading; only load files
    # that were produced by this notebook or another trusted source.
    with open(INPUTS_1_FILE, 'rb') as h:
        first_input_features = pkl.load(h)

    with open(INPUTS_2_FILE, 'rb') as h:
        second_input_features = pkl.load(h)

    with open(OUTPUTS_FILE, 'rb') as h:
        outputs = pkl.load(h)


### Training

In [None]:
# Split training / testing data: the leading share of the samples is used for
# training and the remainder for testing (no shuffling happens here).
PERCENT_USED_FOR_TRAINING = 0.8

training_total = round(PERCENT_USED_FOR_TRAINING * outputs.size)
training_part = slice(None, training_total)
testing_part = slice(training_total, None)

training_x1, testing_x1 = first_input_features[training_part], first_input_features[testing_part]
training_x2, testing_x2 = second_input_features[training_part], second_input_features[testing_part]
training_y, testing_y = outputs[training_part], outputs[testing_part]


In [None]:
# Compile the model: SGD optimizer minimising the mean absolute error, with
# several additional regression metrics reported alongside the loss.
regression_metrics = [
    tf.keras.metrics.MeanSquaredError(name="Mean Squared Error"),
    tf.keras.metrics.MeanAbsoluteError(name="Mean Absolute Error"),
    tf.keras.metrics.LogCoshError(name="Log Cosh"),
    tf.keras.metrics.RootMeanSquaredError(name="Root Mean Squared Error"),
]

final_model.compile(
    loss='mean_absolute_error',
    optimizer=keras.optimizers.SGD(),
    metrics=regression_metrics,
)

In [None]:
# Train
EPOCHS = 5
# Fraction of the training samples that Keras holds out for validation.
PERCENT_USED_OR_VALIDATION = 0.15
BATCH_SIZE = 64

# Only the arguments that differ from the Keras defaults (or that document the
# intent) are passed. The former explicit defaults included max_queue_size,
# workers and use_multiprocessing, which newer Keras releases no longer accept.
fit_history = final_model.fit(
    x=[training_x1, training_x2],
    y=training_y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_split=PERCENT_USED_OR_VALIDATION,
    # NOTE(review): 'batch' shuffles in batch-sized chunks and is documented
    # as a special option for HDF5 data; confirm it is wanted for in-memory
    # arrays rather than shuffle=True.
    shuffle='batch',
)

In [None]:
# Test: score the model on the held-out samples.
print(final_model.metrics)
# Only non-default arguments are passed. The former explicit defaults included
# max_queue_size, workers and use_multiprocessing, which newer Keras releases
# no longer accept. The returned loss/metric values are shown as cell output.
final_model.evaluate(
    x=[testing_x1, testing_x2],
    y=testing_y,
    batch_size=BATCH_SIZE,
    verbose=1,
)

### Persistence

In [5]:
import requests
from features.feature_factory import FeatureFactory

#model_json = final_model.to_json()

# The model version name doubles as the base name of the HDF5 file.
VERSION_NAME = "nobers"
FILENAME = f"{VERSION_NAME}.h5"
# NOTE(review): saving is disabled, so FILENAME must already exist on disk
# for the upload cell further down to work.
# final_model.save(FILENAME)

# Only the second value returned by get_geo_features() is kept here.
fac = FeatureFactory()
_centers, bounds = fac.get_geo_features()

# with open(FILENAME, 'rb') as f:
#     requests.post(
#         url='http://localhost:8080/prediction/model/persist',
#         data={"domain": bounds},
#         files={"predictiveModel": f}
#     )

In [6]:
import json
import requests
from features.feature_factory import FeatureFactory
feature_factory = FeatureFactory()
print (feature_factory.region_width_cells)
print (feature_factory.region_height_cells)

# One descriptor per grid cell, in column-major (x, then y) order.
zz_regions = []
for x in range(feature_factory.region_width_cells):
    for y in range(feature_factory.region_height_cells):
        center = feature_factory.get_region_center(x, y)
        lat = center[0]
        lon = center[1]
        # The same half-extent is applied to both latitude and longitude.
        half_extent = feature_factory.width_height_lat_diff / 2.0

        # NOTE(review): the corners are listed as (+,+), (+,-), (-,+), (-,-),
        # which is not a perimeter walk; confirm the receiving service does
        # not expect a ring-ordered polygon.
        corners = [
            [lat + half_extent, lon + half_extent],
            [lat + half_extent, lon - half_extent],
            [lat - half_extent, lon + half_extent],
            [lat - half_extent, lon - half_extent],
        ]

        region_entry = {
            "regionId": "",
            "predictor": "",
            "center": {"latitude": lat, "longitude": lon},
            "risk": -1,
            "bounds": {"coordinates": corners, "type": "Polygon"},
        }
        zz_regions.append(region_entry)

43
66


In [None]:
import base64
VERSION_NAME = "nobers"
FILENAME = VERSION_NAME + '.h5'
# Upper bound (seconds) for the upload, so a stalled server cannot hang the
# notebook forever.
UPLOAD_TIMEOUT_SECONDS = 60

# Upload the saved model file together with the JSON-encoded region list.
# NOTE(review): nothing in this notebook writes FILENAME (the save call is
# commented out), so the file must already exist on disk.
with open(FILENAME, 'rb') as f:
    response = requests.post(
        url='http://localhost:8080/prediction/model/persist',
        data={"bounds": json.dumps(zz_regions)},
        files=dict(predictiveModel=f),
        timeout=UPLOAD_TIMEOUT_SECONDS,
    )

# Turn an HTTP error status into an exception instead of ignoring it.
response.raise_for_status()
