In [4]:
import pandas as pd
import numpy as np
from numpy.random import RandomState
from scipy.stats import norm
import geopandas as gpd
import copy
import sys
# Prepend the project's `utils_data` directory to the module search path so that
# modules located there can be imported by later cells.
# NOTE(review): this is an absolute, machine-specific path — presumably it has to
# be adjusted on every other machine; consider deriving it from a configurable root.
sys.path.insert(0, 'D:/modules/thesis/resources/gp_pref_elicit/gp_utilities/utils_data')


In [5]:
# Load the shortened sidewalk dataset into a GeoDataFrame.
# To use the actual (large) dataset instead, uncomment the line below:
# sidewalk = gpd.read_file('data/Sidewalk_width_crossings.geojson')
#
# Forward slashes are used on purpose: in a normal string literal '\S' is an
# invalid escape sequence (Python warns about it), and a backslash is not a
# path separator outside Windows, so the old spelling only worked there.
sidewalk = gpd.read_file('data/Sidewalk_width_crossings_small.geojson')
sidewalk

Unnamed: 0,id,0.9-1.8m,1.8-2.9m,<0.9m,>2.9m,crossing,length,obstacle_free_width,unknown,geometry
0,0,0,0,0,1,0,9.99,>2.9m,0,"LINESTRING (120548.61203 486088.19578, 120548...."
1,1,0,0,0,1,0,3.64,>2.9m,0,"LINESTRING (120558.58273 486088.59136, 120558...."
2,2,1,0,0,0,0,4.30,0.9-1.8m,0,"LINESTRING (120554.77791 486105.08163, 120555...."
3,3,1,0,0,0,0,3.20,0.9-1.8m,0,"LINESTRING (120561.12010 486102.03679, 120561...."
4,4,0,0,0,0,0,9.99,unknown,1,"LINESTRING (120549.11715 486040.41439, 120549...."
...,...,...,...,...,...,...,...,...,...,...
1134,1134,0,0,0,1,1,8.41,>2.9m,0,"LINESTRING (120971.41298 485981.36204, 120971...."
1135,1135,0,0,0,1,1,25.81,>2.9m,0,"LINESTRING (120916.43844 486032.00808, 120890...."
1136,1136,0,0,0,1,1,9.47,>2.9m,0,"LINESTRING (120940.10969 486013.85196, 120940...."
1137,1137,0,0,0,1,1,8.41,>2.9m,0,"LINESTRING (120971.64289 485989.76706, 120971...."


In [6]:
# Working table restricted to the columns that are needed: the four width
# indicators, the crossing indicator, the segment length, the width label and the
# 'unknown' indicator. The copy keeps later edits away from `sidewalk`.
dataset1 = sidewalk[[
    '0.9-1.8m', '1.8-2.9m', '<0.9m', '>2.9m',
    'crossing', 'length', 'obstacle_free_width', 'unknown',
]].copy(deep=True)
dataset1

Unnamed: 0,0.9-1.8m,1.8-2.9m,<0.9m,>2.9m,crossing,length,obstacle_free_width,unknown
0,0,0,0,1,0,9.99,>2.9m,0
1,0,0,0,1,0,3.64,>2.9m,0
2,1,0,0,0,0,4.30,0.9-1.8m,0
3,1,0,0,0,0,3.20,0.9-1.8m,0
4,0,0,0,0,0,9.99,unknown,1
...,...,...,...,...,...,...,...,...
1134,0,0,0,1,1,8.41,>2.9m,0
1135,0,0,0,1,1,25.81,>2.9m,0
1136,0,0,0,1,1,9.47,>2.9m,0
1137,0,0,0,1,1,8.41,>2.9m,0


In [7]:
# Multiply every 0/1 width indicator column by the segment's length, as we are
# going towards six objectives: afterwards '0.9-1.8m', '1.8-2.9m', '<0.9m',
# '>2.9m' and 'unknown' hold the path length of the respective width.
#
# The product is computed from the untouched `sidewalk` frame (which `dataset1`
# was copied from) instead of from `dataset1` itself, so re-running this cell
# does not multiply the columns by `length` a second time.
boolean_columns = ['0.9-1.8m', '1.8-2.9m', '<0.9m', '>2.9m', 'unknown']
dataset1[boolean_columns] = sidewalk[boolean_columns].mul(sidewalk['length'], axis=0)
dataset1

Unnamed: 0,0.9-1.8m,1.8-2.9m,<0.9m,>2.9m,crossing,length,obstacle_free_width,unknown
0,0.0,0.0,0.0,9.99,0,9.99,>2.9m,0.00
1,0.0,0.0,0.0,3.64,0,3.64,>2.9m,0.00
2,4.3,0.0,0.0,0.00,0,4.30,0.9-1.8m,0.00
3,3.2,0.0,0.0,0.00,0,3.20,0.9-1.8m,0.00
4,0.0,0.0,0.0,0.00,0,9.99,unknown,9.99
...,...,...,...,...,...,...,...,...
1134,0.0,0.0,0.0,8.41,1,8.41,>2.9m,0.00
1135,0.0,0.0,0.0,25.81,1,25.81,>2.9m,0.00
1136,0.0,0.0,0.0,9.47,1,9.47,>2.9m,0.00
1137,0.0,0.0,0.0,8.41,1,8.41,>2.9m,0.00


In [8]:
# Rows flagged as a crossing (crossing == 1) get 0 in every other column, because
# widths don't define the crossings.
# Note that this also writes the integer 0 into the string column
# 'obstacle_free_width' and into 'length'.
crossing_col = dataset1['crossing'].eq(1)
dataset1.loc[crossing_col, ~dataset1.columns.isin(['crossing'])] = 0
dataset1

Unnamed: 0,0.9-1.8m,1.8-2.9m,<0.9m,>2.9m,crossing,length,obstacle_free_width,unknown
0,0.0,0.0,0.0,9.99,0,9.99,>2.9m,0.00
1,0.0,0.0,0.0,3.64,0,3.64,>2.9m,0.00
2,4.3,0.0,0.0,0.00,0,4.30,0.9-1.8m,0.00
3,3.2,0.0,0.0,0.00,0,3.20,0.9-1.8m,0.00
4,0.0,0.0,0.0,0.00,0,9.99,unknown,9.99
...,...,...,...,...,...,...,...,...
1134,0.0,0.0,0.0,0.00,1,0.00,0,0.00
1135,0.0,0.0,0.0,0.00,1,0.00,0,0.00
1136,0.0,0.0,0.0,0.00,1,0.00,0,0.00
1137,0.0,0.0,0.0,0.00,1,0.00,0,0.00


In [9]:
# 'length' has already been multiplied into the width columns, so it is removed
# here; this leaves 6 objectives instead of 7.
new_dataset = dataset1.drop(columns='length')
new_dataset

Unnamed: 0,0.9-1.8m,1.8-2.9m,<0.9m,>2.9m,crossing,obstacle_free_width,unknown
0,0.0,0.0,0.0,9.99,0,>2.9m,0.00
1,0.0,0.0,0.0,3.64,0,>2.9m,0.00
2,4.3,0.0,0.0,0.00,0,0.9-1.8m,0.00
3,3.2,0.0,0.0,0.00,0,0.9-1.8m,0.00
4,0.0,0.0,0.0,0.00,0,unknown,9.99
...,...,...,...,...,...,...,...
1134,0.0,0.0,0.0,0.00,1,0,0.00
1135,0.0,0.0,0.0,0.00,1,0,0.00
1136,0.0,0.0,0.0,0.00,1,0,0.00
1137,0.0,0.0,0.0,0.00,1,0,0.00


In [10]:
# Distinct values present in the width label column.
new_dataset.loc[:, 'obstacle_free_width'].unique()

array(['>2.9m', '0.9-1.8m', 'unknown', '1.8-2.9m', '<0.9m', 0],
      dtype=object)

In [11]:
# Number of rows per width label.
new_dataset.loc[:, 'obstacle_free_width'].value_counts()

>2.9m       324
1.8-2.9m    277
unknown     180
0           174
0.9-1.8m    141
<0.9m        43
Name: obstacle_free_width, dtype: int64

In [12]:
# Encode the labels in 'obstacle_free_width' as integer category codes.
# Crossing rows already hold the integer 0 (not the string '0'), so they match
# none of the string keys below and pass through unchanged; the '0' key only
# takes effect if that zero is stored as text.
categorize_obstacles = {"obstacle_free_width": {'0':0, '0.9-1.8m':1, '1.8-2.9m':2, '<0.9m':3,'>2.9m':4,'unknown':5}}
new_dataset = (
    new_dataset
    .replace(categorize_obstacles)
    # Cast explicitly instead of relying on replace() to downcast the object
    # column (pandas has deprecated that implicit downcast); the cast also
    # raises if a non-numeric label was left unmapped.
    .astype({'obstacle_free_width': 'int64'})
)
new_dataset

Unnamed: 0,0.9-1.8m,1.8-2.9m,<0.9m,>2.9m,crossing,obstacle_free_width,unknown
0,0.0,0.0,0.0,9.99,0,4,0.00
1,0.0,0.0,0.0,3.64,0,4,0.00
2,4.3,0.0,0.0,0.00,0,1,0.00
3,3.2,0.0,0.0,0.00,0,1,0.00
4,0.0,0.0,0.0,0.00,0,5,9.99
...,...,...,...,...,...,...,...
1134,0.0,0.0,0.0,0.00,1,0,0.00
1135,0.0,0.0,0.0,0.00,1,0,0.00
1136,0.0,0.0,0.0,0.00,1,0,0.00
1137,0.0,0.0,0.0,0.00,1,0,0.00
