In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler

In [2]:
current_dir = os.getcwd()

# Construct the path to the 'data' directory relative to the current directory
data_dir = os.path.join(current_dir, '..', 'data')

# Access a specific file
df = pd.read_csv(os.path.join(data_dir, 'Bikeroad_Location.csv'), low_memory = False)
df.head()

Unnamed: 0,highway,lanes,lit,maxspeed,streetname,oneway,ref,smoothness,geometry,id,link,district,type,name,value,function,location
0,residential,,yes,,AEG-Siedlung Heimat,,,,"LINESTRING (13.3478331 52.6098004, 13.347076 5...",,,,,,,,
1,residential,,yes,30.0,AEG-Siedlung Heimat,,,,"LINESTRING (13.347866 52.608813, 13.347076 52....",,,,,,,,
2,residential,,yes,30.0,Aachener Straße,,,,"LINESTRING (13.3189287 52.4813095, 13.318873 5...",,,,,,,,
3,residential,,yes,30.0,Aachener Straße,,,,"LINESTRING (13.3183653 52.4789371, 13.3181786 ...",,,,,,,,
4,residential,,yes,30.0,Aachener Straße,,,,"LINESTRING (13.3155418 52.4796441, 13.3155083 ...",,,,,,,,


To get our routeplanner to sensibly favour streets with infrastructure for bikes, we need to develop a weight for each segment so that it works. For routeplanning, at the base is the distance. We want to get to our destination as fast as possible, so less distance is better. However, since security is also important, we want streets without biking infrastructure penalized. If a route without biking infrastructure is still much faster, we want to take it. But if taking a safer route only takes a small detour, it should change. Since we can't really test on a concrete routing planner, this can only provide a first prototype. <br>
Furthermore, there are multiple infos other than if there's biking infrastructure we can use. Streets with a higher speedlimit are unsafer, as braking distance is longer for cars (especially because there is no spacial division between car and bike traffic). Streets in poor condition (smoothness) might also be unsafer for bikes and also worse to ride on. Unlit treets are harder to navigate. Streets with a lot of lanes make traffic often faster. So all these factors will be considered in weighting.

In [3]:
# goal: normalize the columns in the dataframe that are interesting for weighting
weight_df = df.loc[:,["lanes", "maxspeed", "smoothness", "function"]].copy()
weight_df

Unnamed: 0,lanes,maxspeed,smoothness,function
0,,,,
1,,30,,
2,,30,,
3,,30,,
4,,30,,
...,...,...,...,...
63975,,30,,
63976,,30,,
63977,,30,,
63978,,30,bad,


For the first step, we will impute values so that we can calculate a mean for every street segment, even if we lack info.

In [4]:
weight_df.dtypes

lanes         float64
maxspeed       object
smoothness     object
function       object
dtype: object

In [5]:
# transform everything to numbers so we can compute it

# for smoothness, it's an ordinal scale, so we can transform it to school grades
print(weight_df["smoothness"].unique())

# source: https://wiki.openstreetmap.org/wiki/Key:smoothness
grade_map = {"excellent": 1, "very_good": 2, "good": 3, "intermediate": 4, "medium": 4, "bad": 5, "very_bad": 6, "horrible": 7, "very_horrible": 8}

weight_df['smoothness'] = weight_df['smoothness'].replace(grade_map)
weight_df[weight_df["smoothness"].notnull()]

[nan 'good' 'bad' 'intermediate' 'excellent' 'very_bad' 'very_horrible'
 'medium' 'horrible' 'very_good']


  weight_df['smoothness'] = weight_df['smoothness'].replace(grade_map)


Unnamed: 0,lanes,maxspeed,smoothness,function
17,,,3.0,
20,,,3.0,
21,,30,5.0,
22,,30,5.0,
23,,30,3.0,
...,...,...,...,...
63972,,30,4.0,
63973,,30,4.0,
63974,,,4.0,
63978,,30,5.0,


In [6]:
# now for function, the only value here is either road or NaN
print(weight_df["function"].unique())

# so we rename it to "bikeroad" and transform it into a bool
weight_df.rename(columns={'function': 'bikeroad'}, inplace=True)
weight_df['bikeroad'] = weight_df['bikeroad'].eq('road')
weight_df.head()

[nan 'road']


Unnamed: 0,lanes,maxspeed,smoothness,bikeroad
0,,,,False
1,,30.0,,False
2,,30.0,,False
3,,30.0,,False
4,,30.0,,False


In [7]:
# fix maxspeed
print(f"Before: {weight_df["maxspeed"].unique()}")

# replace "walk" with 10
# source: https://www.adac.de/verkehr/recht/verkehrsvorschriften-deutschland/schrittgeschwindigkeit/
weight_df["maxspeed"] = weight_df["maxspeed"].replace("walk", 10.0)
print(f"After: {weight_df["maxspeed"].unique()}")

Before: [nan '30' '50' '10' '60.0' '50.0' '70.0' '100.0' '30.0' '80' '60' '5' '20'
 '70' '100' '80.0' '10.0' 'walk' '15' '7']
After: [nan '30' '50' '10' '60.0' '50.0' '70.0' '100.0' '30.0' '80' '60' '5' '20'
 '70' '100' '80.0' '10.0' 10.0 '15' '7']


In [8]:
# impute missing values with mice algorithm with random forest classifier
rf = RandomForestRegressor(n_estimators=100)
imp = IterativeImputer(estimator=rf, verbose=2, max_iter=5, tol=1e-10, imputation_order='roman')
ImputedData = imp.fit_transform(weight_df)
Imputed_data = pd.DataFrame(ImputedData)
Imputed_data.columns = weight_df.columns
Imputed_data.head()

[IterativeImputer] Completing matrix with shape (63980, 4)
[IterativeImputer] Ending imputation round 1/5, elapsed time 2.34
[IterativeImputer] Change: 33.18689228180007, scaled tolerance: 1e-08 
[IterativeImputer] Ending imputation round 2/5, elapsed time 4.67
[IterativeImputer] Change: 41.27508632230343, scaled tolerance: 1e-08 
[IterativeImputer] Ending imputation round 3/5, elapsed time 6.97
[IterativeImputer] Change: 30.218626588828936, scaled tolerance: 1e-08 
[IterativeImputer] Ending imputation round 4/5, elapsed time 9.25
[IterativeImputer] Change: 17.014254840723, scaled tolerance: 1e-08 
[IterativeImputer] Ending imputation round 5/5, elapsed time 11.67
[IterativeImputer] Change: 2.406527777777777, scaled tolerance: 1e-08 




Unnamed: 0,lanes,maxspeed,smoothness,bikeroad
0,1.0,20.539951,4.719952,0.0
1,4.0,30.0,2.485503,0.0
2,4.0,30.0,2.485503,0.0
3,4.0,30.0,2.485503,0.0
4,4.0,30.0,2.485503,0.0


In [9]:
print(f"Before: {Imputed_data["lanes"].unique()}")
# round to full number
Imputed_data['lanes'] = Imputed_data['lanes'].round()
print(f"After: {Imputed_data["lanes"].unique()}")

Before: [1.         4.         2.18604027 1.60866566 1.91200099 1.5611918
 2.3334539  2.         3.         1.99109591 2.36950755 5.
 2.35899513 2.30668536 2.24959489 1.25698355 1.99644905 1.18541164
 1.60065318 2.01600733 2.10597399 2.19863231 2.18194343 6.
 1.24643777 2.65823547 3.0628945  2.52618031 2.01710206 1.95666667
 1.91412421 1.92854516 1.74713353 1.98       7.         2.00618687
 2.64243301 2.81950685 2.6131514  2.11148094]
After: [1. 4. 2. 3. 5. 6. 7.]


In [10]:
print(f"Before: {Imputed_data["smoothness"].unique()}")
# round to full number
Imputed_data['smoothness'] = Imputed_data['smoothness'].round()
print(f"After: {Imputed_data["smoothness"].unique()}")

Before: [4.71995234 2.48550332 3.         5.         4.         2.60156797
 1.         2.5544449  2.6823649  2.43657909 2.52669564 2.40049797
 2.13061599 3.14321246 2.90432819 2.06994875 3.01733929 3.2763872
 6.         2.95595458 2.58084389 2.61123333 2.72514884 2.9990677
 2.98418327 2.5320724  2.43272661 3.02442321 2.92484171 8.
 2.4845164  2.06354875 2.61920213 2.56518666 2.69753368 3.18186111
 2.99375    2.08731746 7.         2.41861703 2.61648493 2.75734453
 2.08315079 4.05401473 2.         2.49595952 2.71260168 2.58128529
 2.87502875 2.0659129  3.10498641]
After: [5. 2. 3. 4. 1. 6. 8. 7.]


In [11]:
print(f"Before: {Imputed_data["maxspeed"].unique()}")
# round to the next 10-value
Imputed_data['maxspeed'] = (Imputed_data['maxspeed'] + 9) // 10 * 10
print(f"After: {Imputed_data["maxspeed"].unique()}")

Before: [ 20.53995094  30.          70.          50.          10.
  60.          48.41708514  45.73364103  48.29758793  32.55246713
 100.          80.           5.          20.          46.50513175
  48.17294545  39.4565129   48.27831105  28.0523214   48.84030727
  46.90953266  30.37149949  41.17825661  31.11917239  46.65356201
  42.20851479  48.21442403  27.60549416  15.          46.5254282
  47.98842767  45.12663997  42.59435623   7.        ]
After: [ 20.  30.  70.  50.  10.  60.  40. 100.  80.]


In [12]:
weight_df = Imputed_data
weight_df.head()

Unnamed: 0,lanes,maxspeed,smoothness,bikeroad
0,1.0,20.0,5.0,0.0
1,4.0,30.0,2.0,0.0
2,4.0,30.0,2.0,0.0
3,4.0,30.0,2.0,0.0
4,4.0,30.0,2.0,0.0


In [13]:
# goal: achieve a weight that is a multiplier to the length of the street segment

# step 1, normalize values
scaler = MinMaxScaler()
df_scaled = scaler.fit_transform(weight_df)
df_scaled = pd.DataFrame(df_scaled, columns=weight_df.columns)
df_scaled.head()

Unnamed: 0,lanes,maxspeed,smoothness,bikeroad
0,0.0,0.111111,0.571429,0.0
1,0.5,0.222222,0.142857,0.0
2,0.5,0.222222,0.142857,0.0
3,0.5,0.222222,0.142857,0.0
4,0.5,0.222222,0.142857,0.0


In [14]:
# step 2: reverse mapping of bikeroad, because having a bikeroad is better than having none
df_scaled["bikeroad"] = df_scaled["bikeroad"].replace({0.0: 1.0, 1.0: 0.0})
df_scaled.head()

Unnamed: 0,lanes,maxspeed,smoothness,bikeroad
0,0.0,0.111111,0.571429,1.0
1,0.5,0.222222,0.142857,1.0
2,0.5,0.222222,0.142857,1.0
3,0.5,0.222222,0.142857,1.0
4,0.5,0.222222,0.142857,1.0


In [18]:
# step 3: create weight by adding the values + 0.1 (so we don't get a 0 weight)
df_scaled["weight"] = df_scaled["lanes"] + df_scaled["maxspeed"] + df_scaled["smoothness"] + df_scaled["bikeroad"] + 0.1
df_scaled.head()

Unnamed: 0,lanes,maxspeed,smoothness,bikeroad,weight
0,0.0,0.111111,0.571429,1.0,1.78254
1,0.5,0.222222,0.142857,1.0,1.965079
2,0.5,0.222222,0.142857,1.0,1.965079
3,0.5,0.222222,0.142857,1.0,1.965079
4,0.5,0.222222,0.142857,1.0,1.965079


In [27]:
weight_df["weight"] = df_scaled["lanes"] + df_scaled["maxspeed"] + df_scaled["smoothness"] + df_scaled["bikeroad"] + 0.1
weight_df[weight_df["bikeroad"] == 1].head(20)

Unnamed: 0,lanes,maxspeed,smoothness,bikeroad,weight
141,2.0,60.0,3.0,1.0,1.107937
142,3.0,50.0,3.0,1.0,1.163492
143,3.0,50.0,3.0,1.0,1.163492
144,4.0,50.0,3.0,1.0,1.330159
145,1.0,50.0,2.0,1.0,0.687302
146,2.0,50.0,4.0,1.0,1.139683
147,1.0,50.0,2.0,1.0,0.687302
148,3.0,50.0,3.0,1.0,1.163492
149,2.0,50.0,3.0,1.0,0.996825
150,3.0,50.0,3.0,1.0,1.163492


In [16]:
df["weight"] = df_scaled["weight"].copy()
df.head()

Unnamed: 0,highway,lanes,lit,maxspeed,streetname,oneway,ref,smoothness,geometry,id,link,district,type,name,value,function,location,weight
0,residential,,yes,,AEG-Siedlung Heimat,,,,"LINESTRING (13.3478331 52.6098004, 13.347076 5...",,,,,,,,,1.78254
1,residential,,yes,30.0,AEG-Siedlung Heimat,,,,"LINESTRING (13.347866 52.608813, 13.347076 52....",,,,,,,,,1.965079
2,residential,,yes,30.0,Aachener Straße,,,,"LINESTRING (13.3189287 52.4813095, 13.318873 5...",,,,,,,,,1.965079
3,residential,,yes,30.0,Aachener Straße,,,,"LINESTRING (13.3183653 52.4789371, 13.3181786 ...",,,,,,,,,1.965079
4,residential,,yes,30.0,Aachener Straße,,,,"LINESTRING (13.3155418 52.4796441, 13.3155083 ...",,,,,,,,,1.965079


In [17]:
#df.to_csv("Bikeroad_Location+weight.csv", index = False)