In [16]:
import pandas as pd
import numpy as np
from numpy.random import RandomState
from scipy.stats import norm
import geopandas as gpd
import copy
import sys
sys.path.insert(0, 'D:/modules/thesis/resources/gp_pref_elicit/gp_utilities/utils_data')


In [17]:
# original dataset
sidewalk = gpd.read_file('D:/modules/thesis/data/Sidewalk_width_crossings_small.geojson')
sidewalk

Unnamed: 0,id,0.9-1.8m,1.8-2.9m,<0.9m,>2.9m,crossing,length,obstacle_free_width,unknown,geometry
0,0,0,0,0,1,0,9.99,>2.9m,0,"LINESTRING (120548.61203 486088.19578, 120548...."
1,1,0,0,0,1,0,3.64,>2.9m,0,"LINESTRING (120558.58273 486088.59136, 120558...."
2,2,1,0,0,0,0,4.30,0.9-1.8m,0,"LINESTRING (120554.77791 486105.08163, 120555...."
3,3,1,0,0,0,0,3.20,0.9-1.8m,0,"LINESTRING (120561.12010 486102.03679, 120561...."
4,4,0,0,0,0,0,9.99,unknown,1,"LINESTRING (120549.11715 486040.41439, 120549...."
...,...,...,...,...,...,...,...,...,...,...
1134,1134,0,0,0,1,1,8.41,>2.9m,0,"LINESTRING (120971.41298 485981.36204, 120971...."
1135,1135,0,0,0,1,1,25.81,>2.9m,0,"LINESTRING (120916.43844 486032.00808, 120890...."
1136,1136,0,0,0,1,1,9.47,>2.9m,0,"LINESTRING (120940.10969 486013.85196, 120940...."
1137,1137,0,0,0,1,1,8.41,>2.9m,0,"LINESTRING (120971.64289 485989.76706, 120971...."


In [18]:
# dataset with only columns that are needed 
dataset1 = sidewalk[['0.9-1.8m', '1.8-2.9m', '<0.9m', '>2.9m', 'crossing', 'length', 'obstacle_free_width', 'unknown']].copy()
dataset1

Unnamed: 0,0.9-1.8m,1.8-2.9m,<0.9m,>2.9m,crossing,length,obstacle_free_width,unknown
0,0,0,0,1,0,9.99,>2.9m,0
1,0,0,0,1,0,3.64,>2.9m,0
2,1,0,0,0,0,4.30,0.9-1.8m,0
3,1,0,0,0,0,3.20,0.9-1.8m,0
4,0,0,0,0,0,9.99,unknown,1
...,...,...,...,...,...,...,...,...
1134,0,0,0,1,1,8.41,>2.9m,0
1135,0,0,0,1,1,25.81,>2.9m,0
1136,0,0,0,1,1,9.47,>2.9m,0
1137,0,0,0,1,1,8.41,>2.9m,0


In [19]:
# multiplying all columns with the value in the length column as we are going towards six objectives
# columns '0.9-1.8m', '1.8-2.9m', '<0.9m', '>2.9m' and 'unknown' are path lengths of respective widths
boolean_columns = ['0.9-1.8m', '1.8-2.9m', '<0.9m', '>2.9m', 'unknown']
dataset1[boolean_columns] = dataset1[boolean_columns].mul(dataset1['length'], axis = 0)
dataset1

Unnamed: 0,0.9-1.8m,1.8-2.9m,<0.9m,>2.9m,crossing,length,obstacle_free_width,unknown
0,0.0,0.0,0.0,9.99,0,9.99,>2.9m,0.00
1,0.0,0.0,0.0,3.64,0,3.64,>2.9m,0.00
2,4.3,0.0,0.0,0.00,0,4.30,0.9-1.8m,0.00
3,3.2,0.0,0.0,0.00,0,3.20,0.9-1.8m,0.00
4,0.0,0.0,0.0,0.00,0,9.99,unknown,9.99
...,...,...,...,...,...,...,...,...
1134,0.0,0.0,0.0,8.41,1,8.41,>2.9m,0.00
1135,0.0,0.0,0.0,25.81,1,25.81,>2.9m,0.00
1136,0.0,0.0,0.0,9.47,1,9.47,>2.9m,0.00
1137,0.0,0.0,0.0,8.41,1,8.41,>2.9m,0.00


In [20]:
# wherever the crossing column has a boolean 1, all the other widths have a value 0 as widths don't define the crossings
crossing_col = dataset1['crossing'] == 1
dataset1.loc[crossing_col, dataset1.columns != 'crossing'] = 0
dataset1

Unnamed: 0,0.9-1.8m,1.8-2.9m,<0.9m,>2.9m,crossing,length,obstacle_free_width,unknown
0,0.0,0.0,0.0,9.99,0,9.99,>2.9m,0.00
1,0.0,0.0,0.0,3.64,0,3.64,>2.9m,0.00
2,4.3,0.0,0.0,0.00,0,4.30,0.9-1.8m,0.00
3,3.2,0.0,0.0,0.00,0,3.20,0.9-1.8m,0.00
4,0.0,0.0,0.0,0.00,0,9.99,unknown,9.99
...,...,...,...,...,...,...,...,...
1134,0.0,0.0,0.0,0.00,1,0.00,0,0.00
1135,0.0,0.0,0.0,0.00,1,0.00,0,0.00
1136,0.0,0.0,0.0,0.00,1,0.00,0,0.00
1137,0.0,0.0,0.0,0.00,1,0.00,0,0.00


In [21]:
# dropping the length column as we have its values multiplied with other columns, so we have 6 objectives instead of 7
new_dataset = dataset1.drop('length', axis=1)
new_dataset

Unnamed: 0,0.9-1.8m,1.8-2.9m,<0.9m,>2.9m,crossing,obstacle_free_width,unknown
0,0.0,0.0,0.0,9.99,0,>2.9m,0.00
1,0.0,0.0,0.0,3.64,0,>2.9m,0.00
2,4.3,0.0,0.0,0.00,0,0.9-1.8m,0.00
3,3.2,0.0,0.0,0.00,0,0.9-1.8m,0.00
4,0.0,0.0,0.0,0.00,0,unknown,9.99
...,...,...,...,...,...,...,...
1134,0.0,0.0,0.0,0.00,1,0,0.00
1135,0.0,0.0,0.0,0.00,1,0,0.00
1136,0.0,0.0,0.0,0.00,1,0,0.00
1137,0.0,0.0,0.0,0.00,1,0,0.00


In [22]:
new_dataset['obstacle_free_width'].unique()


array(['>2.9m', '0.9-1.8m', 'unknown', '1.8-2.9m', '<0.9m', 0],
      dtype=object)

In [23]:
new_dataset['obstacle_free_width'].value_counts()

>2.9m       324
1.8-2.9m    277
unknown     180
0           174
0.9-1.8m    141
<0.9m        43
Name: obstacle_free_width, dtype: int64

In [24]:
categorize_obstacles = {"obstacle_free_width": {'0':0, '0.9-1.8m':1, '1.8-2.9m':2, '<0.9m':3,'>2.9m':4,'unknown':5}}
new_dataset = new_dataset.replace(categorize_obstacles)
new_dataset

Unnamed: 0,0.9-1.8m,1.8-2.9m,<0.9m,>2.9m,crossing,obstacle_free_width,unknown
0,0.0,0.0,0.0,9.99,0,4,0.00
1,0.0,0.0,0.0,3.64,0,4,0.00
2,4.3,0.0,0.0,0.00,0,1,0.00
3,3.2,0.0,0.0,0.00,0,1,0.00
4,0.0,0.0,0.0,0.00,0,5,9.99
...,...,...,...,...,...,...,...
1134,0.0,0.0,0.0,0.00,1,0,0.00
1135,0.0,0.0,0.0,0.00,1,0,0.00
1136,0.0,0.0,0.0,0.00,1,0,0.00
1137,0.0,0.0,0.0,0.00,1,0,0.00


The original dataset (shown above) can not be used for the Gaussian Process as it is not in the form of value vectors. To bring it into the form of value vectors, the dataset needs to be passed into a Multi-Objective Shortest Path Planning Solver which will output value vectors. These value vectors can then be an input for the GP. Thus, for now, I am using a synthetic Pareto Coverage Set (PCS) with 2 objectives and 20 datapoints which gives us synthetic value vectors for input to the GP.

In [25]:
# importing methods from Luisa's code for the GP
from gp_pref_elicit_luisa.gp_utilities import utils_ccs as gp_utils_ccs
from gp_pref_elicit_luisa.gp_utilities import utils_data as  gp_utils_data
from gp_pref_elicit_luisa import gaussian_process as GP
from gp_pref_elicit_luisa import dataset as data 
from gp_pref_elicit_luisa.gp_utilities import utils_user as gp_utils_users
from gp_pref_elicit_luisa import acquisition_function as acquisition_function

In [26]:
# initializing GP and Dataset 
# num_objectives = 2
GP = GP.GPPairwise(2)
utils_comparisons = data.DatasetPairwise(2)

In [27]:
# gp_utils_ccs.get_ccs(2, 20)
# outputs a synthetic Pareto Coverage Set of value vectors with 2 objectives and 20 datapoints

In [28]:
synthetic_pcs = np.array([[0.14370116, 0.99159928],
       [0.9797389 , 0.2242916 ],
       [0.        , 1.        ],
       [0.91055917, 0.45020785],
       [0.59678925, 0.81854996],
       [1.        , 0.        ],
       [0.94198057, 0.352479  ],
       [0.81501748, 0.65358114],
       [0.99814566, 0.05429028],
       [0.33305315, 0.94955291],
       [0.28860669, 0.96123215],
       [0.99999796, 0.02591092],
       [0.98910769, 0.14092867],
       [0.18584726, 0.98334638],
       [0.05210043, 0.99838732],
       [0.87761802, 0.52114756],
       [0.74002719, 0.7144284 ],
       [0.21487083, 0.97724899],
       [0.43622937, 0.90230767],
       [0.99525346, 0.08213535]])
synthetic_pcs 

array([[0.14370116, 0.99159928],
       [0.9797389 , 0.2242916 ],
       [0.        , 1.        ],
       [0.91055917, 0.45020785],
       [0.59678925, 0.81854996],
       [1.        , 0.        ],
       [0.94198057, 0.352479  ],
       [0.81501748, 0.65358114],
       [0.99814566, 0.05429028],
       [0.33305315, 0.94955291],
       [0.28860669, 0.96123215],
       [0.99999796, 0.02591092],
       [0.98910769, 0.14092867],
       [0.18584726, 0.98334638],
       [0.05210043, 0.99838732],
       [0.87761802, 0.52114756],
       [0.74002719, 0.7144284 ],
       [0.21487083, 0.97724899],
       [0.43622937, 0.90230767],
       [0.99525346, 0.08213535]])

In [29]:
user_pref = gp_utils_users.UserPreference(2, std_noise=0.1)

In [30]:
ground_truth_utility_func = user_pref.get_preference(synthetic_pcs, add_noise=True)
ground_truth_utility_func

array([0.42407981, 0.69018523, 0.60117433, 0.35877183, 0.37156636,
       0.71637472, 0.46770729, 0.65324535, 0.60519265, 0.52140989,
       0.3809021 , 0.53708077, 0.59779812, 0.3579109 , 0.37993633,
       0.6556254 , 0.47249808, 0.45551176, 0.24102916, 0.38216443])

In [31]:
highest_utility = max(ground_truth_utility_func)
lowest_utility = min(ground_truth_utility_func)
print('Highest utility value for the user: ', highest_utility, '\n'
      'Lowest utility value for the user: ', lowest_utility)

Highest utility value for the user:  0.7163747163816129 
Lowest utility value for the user:  0.2410291614412829


In [32]:
# comparing highest and lowest utility values generated from the sampling 
# and adding these to the GP
utils_comparisons.add_single_comparison(highest_utility, lowest_utility)
utils_comparisons_datapoints = utils_comparisons.datapoints
utils_comparisons_datapoints 
# I use utils_comparisons_datapoints as input for GP methods, I am not sure if this is correct. 

array([[0.71637472, 0.71637472],
       [0.24102916, 0.24102916]])

In [33]:
# updating the GP based on the comparisons added to the dataset
GP.update(utils_comparisons)

In [34]:
# getting the predictive parameters, mean and variance, of the GP
GP.get_predictive_params(utils_comparisons_datapoints, pointwise=True)


(array([ 0.02560518, -0.02560518]), array([0.34479326, 0.34479326]))

In [35]:
# I am not sure if we are allowed sample on the datapoints
sampled_points = GP.sample(utils_comparisons_datapoints)
sampled_points

array([0.67564282, 0.24842949])

In [36]:
prior_mean_GP = GP.prior_mean(utils_comparisons_datapoints)
prior_mean_GP

array([0., 0.])

In [37]:
x1 = utils_comparisons_datapoints[0, 0]
x2 = utils_comparisons_datapoints[1, 0]
kernel_GP = GP._kernel(x1=x1, x2=x2)
kernel_GP

array([2.7850482e-05])

In [38]:
# acquisition function's input domain is the utils_comparisons_datapoints I am not sure if this is correct
acquisition_function_DA = acquisition_function.DiscreteAcquirer(input_domain= utils_comparisons_datapoints, query_type='pairwise', seed=None, acquisition_type='expected improvement')

In [39]:
# similarly for this, I am not sure if the input and output are correct
acquisition_function.get_expected_improvement(datapoints=utils_comparisons_datapoints, gaussian_process=GP, datapoints_hist=acquisition_function_DA.history, xi=0.01)

array([0.14549606, 0.12048278])

In [40]:
start_points = acquisition_function_DA.get_start_points(gaussian_process=GP)
# next_points = acquisition_function_DA.get_next_point(gaussian_process=GP, dataset=utils_comparisons_datapoints)
print('Starting points are: ', start_points, '\n')
      # 'Next points are: ', next_points)

Starting points are:  (array([0.71637472, 0.71637472]), array([0.24102916, 0.24102916])) 



In [41]:
next_points = acquisition_function_DA.get_next_point(gaussian_process=GP, dataset=utils_comparisons_datapoints)

AttributeError: 'numpy.ndarray' object has no attribute 'datapoints'

In [None]:
# I haven't updated this for loop according to the code above. Once I know the code above is correct, I will put everything here
data3 = data.DatasetPairwise(2)
num_iter = 2
for i in range(num_iter):
    # sampling on synthetic pcs according to gaussian process
    sampled_points_GP = GP.sample(synthetic_pcs)
    # output of this sampling is the utility values

    # Getting highest and lowest utility values from the sampled points
    highest_util = max(sampled_points_GP)
    lowest_util = min(sampled_points_GP)

    # performing single comparisons on the highest and lowest utility values
    d3 = data3.add_single_comparison(highest_util, lowest_util)
print('Sampled points from the GP are: \n', sampled_points_GP, '\n' 
    'Highest utility value is: ', highest_util, '\n'
    'Lowest utility value is: ', lowest_util, '\n')

Sampled points from the GP are: 
 [-0.85631043  0.68897133 -1.39333581  0.7565941  -0.46916365 -0.2471268
  1.3878009  -0.67903024 -0.34530695  1.23896378  0.92864363 -0.31009009
 -0.06570927 -0.34421415 -1.37013616 -0.01943969 -0.57645162  0.04894532
  0.87531646 -0.32554939] 
Highest utility value is:  1.3878008979745038 
Lowest utility value is:  -1.3933358073028823 

