In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd

import sys, os
import sklearn
import datetime

import importlib
from tqdm import tqdm

In [4]:
import st_toolkit as geohl
importlib.reload(geohl)

import cri_calc as cri
importlib.reload(cri)

import cri_helper as helper
importlib.reload(helper)

In [7]:
def calculate_cri(rec_own, rec_target):
    own = rec_own._asdict()
    target = rec_target._asdict()
    
    ves_dcpa, ves_tcpa, hr, rel_movement_angle, dist_euclid, speed_r = cri.colregs_alarms(own=own, target=target)
    ves_cri = cri.calculate_cri(own, target, ves_dcpa, ves_tcpa, hr, rel_movement_angle, dist_euclid, speed_r)
    
    return ves_dcpa, ves_tcpa, hr, rel_movement_angle, dist_euclid, speed_r, ves_cri

In [8]:
def calculate_cri_timeslice(df):
    timeslice_result = []
    
    for row_i in df.itertuples():
        for row_j in df.itertuples():
            if row_i.Index == row_j.Index:
                continue
                
            timeslice_result.append([row_i.Index, row_i.mmsi, row_i.geom, row_i.speed, row_i.course, 
                                     row_j.Index, row_j.mmsi, row_j.geom, row_j.speed, row_j.course, *calculate_cri(row_i, row_j)])
            
#     return pd.DataFrame(timeslice_result, columns=['own', 'target', 'dcpa', 'tcpa', 'hr', 'rel_movement_angle', 'dist_euclid', 'speed_r', 'cri'])
    return pd.DataFrame(timeslice_result, columns=['own_Index', 'own_mmsi', 'own_geom', 'own_speed', 'own_course',
                                                   'target_Index', 'target_mmsi', 'target_geom', 'target_speed', 'target_course', 
                                                   'dcpa', 'tcpa', 'hr', 'rel_movement_angle', 'dist_euclid', 'speed_r', 'cri'])

# Load Data

In [9]:
df = pd.read_csv('./data/unipi_ais_dynamic_jul2018_1w_algn_linear_v2_w_lens.csv', parse_dates=['datetime'])
gdf = geohl.getGeoDataFrame_v2(df, crs='epsg:4326')

  arr = construct_1d_object_array_from_listlike(values)


In [10]:
gdf2 = gdf.loc[gdf.datetime.dt.date.between(datetime.date(2018, 7, 3), datetime.date(2018, 7, 3), inclusive='both')].copy()

In [11]:
gdf_sub_moving = gdf2.loc[gdf2.speed.between(1, 50, inclusive='neither')].copy()

# Creating Training Dataset

In [None]:
# tqdm.pandas()
# gdf_vcra = gdf_sub_moving.groupby(['datetime']).progress_apply(lambda l: calculate_cri_timeslice(l.copy()))

In [None]:
from joblib import Parallel, delayed
import multiprocessing


def applyParallel(dfGrouped, func, n_jobs=-1):
    '''
    Forked from: https://stackoverflow.com/a/27027632
    '''
    n_jobs = multiprocessing.cpu_count() if n_jobs == -1 else n_jobs
    print(f'Scaling {func} to {n_jobs} CPUs')
    
    retLst = Parallel(n_jobs=n_jobs)(delayed(func)(group) for name, group in tqdm(dfGrouped))
    return pd.concat(retLst)


gdf_vcra = applyParallel(gdf_sub_moving.groupby(['datetime']), lambda l: calculate_cri_timeslice(l.copy()), 7)

In [None]:
gdf_vcra_mi = gdf_vcra.copy()
gdf_vcra_mi = pd.merge(gdf_vcra_mi, gdf_sub_moving.datetime, left_on='own_Index', right_index=True)
gdf_vcra_mi.set_index(['datetime'], inplace=True)

In [None]:
gdf_vcra_mi.to_pickle('./data/unipi_ais_dynamic_jul2018_1w_vcra_dataset_v3.pickle')

# Training an MLP (via sklearn) -- Prototype

In [12]:
gdf_vcra = pd.read_pickle('./data/unipi_ais_dynamic_jul2018_1w_vcra_dataset_v3.pickle')

In [13]:
tqdm.pandas(desc='Adding Vessels\' Length...')

# gdf_vcra.loc[:, 'own_length'] = gdf_vcra.own_Index.apply(lambda l: gdf_sub_moving[l].length)
mlp_input = gdf_vcra.loc[gdf_vcra.own_Index.isin(gdf_sub_moving.index.values)].copy()
mlp_input.loc[:, 'own_length'] = mlp_input.own_Index.progress_apply(lambda l: gdf_sub_moving.loc[l].length)

Adding Vessels' Length...: 100%|██████| 960268/960268 [01:44<00:00, 9166.56it/s]


In [14]:
X = mlp_input[['dist_euclid', 'own_speed', 'target_speed', 'own_course', 'target_course', 'own_length']].values
y = mlp_input[['cri']].values.ravel()

In [15]:
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# regr = MLPRegressor(random_state=10, max_iter=500, hidden_layer_sizes=(128,64), 
#                     verbose=True, early_stopping=True, n_iter_no_change=10).fit(X_train, y_train)

scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)

In [43]:
regr = MLPRegressor(random_state=10, max_iter=30, hidden_layer_sizes=(256, 32), 
                    verbose=True, early_stopping=True, n_iter_no_change=10).fit(X_train_norm, y_train)

regr.score(scaler.transform(X_test), y_test)

Iteration 1, loss = 0.00385015
Validation score: 0.644050
Iteration 2, loss = 0.00259088
Validation score: 0.725855
Iteration 3, loss = 0.00230187
Validation score: 0.744237
Iteration 4, loss = 0.00216573
Validation score: 0.778982
Iteration 5, loss = 0.00204967
Validation score: 0.780121
Iteration 6, loss = 0.00198630
Validation score: 0.758965
Iteration 7, loss = 0.00190444
Validation score: 0.784247
Iteration 8, loss = 0.00183911
Validation score: 0.790634
Iteration 9, loss = 0.00179812
Validation score: 0.808164
Iteration 10, loss = 0.00176251
Validation score: 0.805912
Iteration 11, loss = 0.00172066
Validation score: 0.831105
Iteration 12, loss = 0.00168713
Validation score: 0.824960
Iteration 13, loss = 0.00164822
Validation score: 0.827729
Iteration 14, loss = 0.00162294
Validation score: 0.831060
Iteration 15, loss = 0.00158726
Validation score: 0.835460
Iteration 16, loss = 0.00156929
Validation score: 0.838889
Iteration 17, loss = 0.00154823
Validation score: 0.834683
Iterat



0.868270934890964

In [44]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

cri_pred = pd.Series(regr.predict(scaler.transform(X_test))).clip(0,1).values
print(f'MAE: {mean_absolute_error(y_test, cri_pred)}')
print(f'RMSE: {mean_squared_error(y_test, cri_pred, squared=False)}')

MAE: 0.01787829212857179
RMSE: 0.04847896187973707


In [None]:
from joblib import dump, load
dump(regr, './data/vcra-1w-mlp-hidden_256_32.joblib') 

# Evaluating ML model timeliness

In [46]:
def ml_calc_cri_timeslice(df, **kwargs):
    timeslice_result = []
    
    for row_i in df.itertuples():
        for row_j in df.itertuples():
            if row_i.Index == row_j.Index:
                continue
                
            timeslice_result.append([row_i.Index, row_i.mmsi, row_i.geom, row_i.speed, row_i.course, 
                                     row_j.Index, row_j.mmsi, row_j.geom, row_j.speed, row_j.course, *ml_calc_cri(row_i, row_j, **kwargs)])
            
#     return pd.DataFrame(timeslice_result, columns=['own', 'target', 'dcpa', 'tcpa', 'hr', 'rel_movement_angle', 'dist_euclid', 'speed_r', 'cri'])
    return pd.DataFrame(timeslice_result, columns=['own_Index', 'own_mmsi', 'own_geom', 'own_speed', 'own_course',
                                                   'target_Index', 'target_mmsi', 'target_geom', 'target_speed', 'target_course', 
                                                   'dist_euclid', 'cri'])

In [47]:
def ml_calc_cri(rec_own, rec_target, model=None, model_fun=calculate_cri, model_norm=None):
    own = rec_own
    target = rec_target
    
    if model is None:
        _, _, _, _, dist_euclid, _, ves_cri = model_fun(own, target)
    else:
        dist_euclid, model_input = model_fun(own, target)
        ves_cri = model.predict(model_norm.transform(np.array(model_input).reshape(1, -1)))
    
    return dist_euclid, min(max(ves_cri[0], 0), 1)

In [48]:
def ml_calc_cri_ours(rec_own, rec_target):
    own = rec_own._asdict()
    target = rec_target._asdict()
    
    own_geom_nm, target_geom_nm = map(helper.angular_to_nautical_miles, [own['geom'], target['geom']])
    xr, yr = helper.calculate_delta(own_geom_nm.x, target_geom_nm.x), helper.calculate_delta(own_geom_nm.y, target_geom_nm.y)
    hr = helper.calculate_delta(own['course'], target['course'])
    
    # Get vessels' Euclidean Distance -- NAUTICAL MILES
    dist_euclid = np.sqrt(xr**2 + yr**2)
    
    return dist_euclid, [dist_euclid, own['speed'], target['speed'], own['course'], target['course'], own['length']]

In [49]:
from joblib import dump, load
regr_ours = load('./data/vcra-1w-mlp-hidden_256_32.joblib') 

In [50]:
grouped = gdf_sub_moving.groupby(['datetime'])
l = grouped.get_group((list(grouped.groups)[0]))

In [52]:
%%timeit 
ml_calc_cri_timeslice(l.copy(), model=regr_ours, model_fun=ml_calc_cri_ours, model_norm=scaler);

311 ms ± 1.05 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
