In [0]:
from pyspark.sql.functions import *
import pyspark.sql.types as T
from pyspark.sql import Window

In [0]:
def get_ways_with_keyvalues(df_way,keyvalues):
  """Return one row per way tag whose key and value match any requested pair.

  Args:
    df_way: DataFrame of ways exposing `id`, `nodes` and `tags` columns.
      `tags` is exploded, and each element is read through its "key" and
      "value" fields, which are cast to string.
    keyvalues: iterable of dicts with "key" and "value" entries. A tag row is
      kept when its key contains the pair's "key" substring AND its value
      contains the pair's "value" substring, for at least one pair
      (for example {"key": "cycleway", "value": ""}).

  Returns:
    DataFrame with columns `id`, `nodes`, `key`, `value`. Each matching tag
    row appears once, even when it satisfies several pairs. When `keyvalues`
    is empty, an empty DataFrame with the same columns is returned.
  """
  # One row per tag.
  exploded = df_way.select(df_way.id, df_way.nodes, explode(df_way.tags).alias("tags"))

  # Pull the key and the value out of each tag so we get id, nodes, key, value.
  tag_rows = exploded.select(
      exploded.id,
      exploded.nodes,
      exploded.tags["key"].cast("string").alias("key"),
      exploded.tags["value"].cast("string").alias("value"),
  )

  # Combine every (key, value) criterion into a single OR-ed predicate so the
  # data is filtered in one pass instead of one filtered scan per pair that is
  # then unioned (which also duplicated rows matching more than one pair).
  predicate = None
  for pair in keyvalues:
    pair_match = tag_rows.key.contains(pair["key"]) & tag_rows.value.contains(pair["value"])
    predicate = pair_match if predicate is None else (predicate | pair_match)

  # No criteria at all: nothing can match, so return an empty frame rather
  # than failing on a missing first element.
  if predicate is None:
    return tag_rows.limit(0)

  return tag_rows.filter(predicate)

In [0]:
def get_ways(year,spark):
  """Read the France OSM `way` parquet file for the given year.

  The file read is `/mnt/externaldata/france-<YY>0101.osm.pbf.way.parquet`,
  where `<YY>` is everything after the first two characters of `year`
  (presumably the 1st-of-January snapshot of that year; confirm against the
  data source).

  Args:
    year: four-digit year, as a string ("2022") or an integer (2022).
    spark: object exposing `read.parquet(path)`, e.g. a SparkSession.

  Returns:
    Whatever `spark.read.parquet` returns for that path.
  """
  # str() lets callers pass an int year; slicing an int would raise TypeError.
  year_suffix = str(year)[2:]
  return spark.read.parquet(f"/mnt/externaldata/france-{year_suffix}0101.osm.pbf.way.parquet")
# Load the ways for year "2022" using the notebook's Spark session.
df_ways = get_ways(year="2022", spark=spark)

In [0]:
# A single pair of empty substrings: every non-null key/value contains "",
# so this selection keeps the tags of every way instead of narrowing them.
kv_ = [{"key": "", "value": ""}]

# One row per (way, tag) with columns id, nodes, key, value.
df_type_way = get_ways_with_keyvalues(df_way=df_ways, keyvalues=kv_)

df_type_way.display()

id,nodes,key,value
2569,"List(List(0, 382017), List(1, 3745434701), List(2, 1836953770), List(3, 3745434708), List(4, 505820718))",cycleway:right,no
2569,"List(List(0, 382017), List(1, 3745434701), List(2, 1836953770), List(3, 3745434708), List(4, 505820718))",foot,no
2569,"List(List(0, 382017), List(1, 3745434701), List(2, 1836953770), List(3, 3745434708), List(4, 505820718))",highway,tertiary
2569,"List(List(0, 382017), List(1, 3745434701), List(2, 1836953770), List(3, 3745434708), List(4, 505820718))",lane_markings,no
2569,"List(List(0, 382017), List(1, 3745434701), List(2, 1836953770), List(3, 3745434708), List(4, 505820718))",lit,yes
2569,"List(List(0, 382017), List(1, 3745434701), List(2, 1836953770), List(3, 3745434708), List(4, 505820718))",maxspeed,30
2569,"List(List(0, 382017), List(1, 3745434701), List(2, 1836953770), List(3, 3745434708), List(4, 505820718))",name,Rue Nationale
2569,"List(List(0, 382017), List(1, 3745434701), List(2, 1836953770), List(3, 3745434708), List(4, 505820718))",oneway,yes
2569,"List(List(0, 382017), List(1, 3745434701), List(2, 1836953770), List(3, 3745434708), List(4, 505820718))",sidewalk,none
2569,"List(List(0, 382017), List(1, 3745434701), List(2, 1836953770), List(3, 3745434708), List(4, 505820718))",surface,asphalt


In [0]:
# Keep only the tag rows whose key is one of the attributes of interest.
df_filter = df_type_way.where(
    col("key").isin(
        "highway", "maxspeed", "tunnel", "tunnel:name", "ref",
        "oneway", "lanes", "name", "surface", "lit",
        "sidewalk", "bridge", "bridge:name", "cycleway", "parking:lane",
    )
)

In [0]:
# Wide table: one row per way id, one column per retained tag key.
data_pivot = (
    df_filter
    # Collapse repeated (id, key, value) rows. distinct() states the intent
    # directly instead of a groupby().count() whose count column is never used.
    .select("id", "key", "value")
    .distinct()
    .groupBy("id")
    .pivot("key")
    # first() keeps one value per (id, key); if a way carried several
    # different values for the same key, which one is kept is not guaranteed.
    .agg(first("value"))
)

In [0]:
# Render the pivoted table: one row per way id, one column per retained tag key.
data_pivot.display()

id,bridge,bridge:name,cycleway,highway,lanes,lit,maxspeed,name,oneway,parking:lane,ref,sidewalk,surface,tunnel,tunnel:name
2569,,,,tertiary,,yes,30.0,Rue Nationale,yes,,,none,asphalt,,
2570,,,,secondary,2.0,yes,30.0,Avenue des Gobelins,yes,,,both,paving_stones,,
2573,,,,residential,,yes,30.0,Place des Alpes,yes,,,,asphalt,,
2574,,,,tertiary,3.0,yes,30.0,Rue Taine,,,,,asphalt,,
80708,,,,trunk_link,1.0,,,,yes,,,,,,
80709,,,,trunk_link,1.0,,,,yes,,,,,,
80710,,,,trunk_link,1.0,,,,yes,,,,,,
80711,,,,trunk_link,1.0,,,,yes,,,,,,
84073,,,,trunk_link,1.0,,90.0,Rocade Sud,yes,,N 87,,,,
84080,,,,trunk,2.0,,70.0,,yes,,,,,,


In [0]:
# Number of rows in the pivoted table, i.e. the number of distinct way ids it holds.
data_pivot.count()

Out[9]: 10150460

In [0]:
# Number of null cells in each column of the pivoted table.
# when(...) without otherwise() yields null for non-matching rows, and count()
# ignores nulls, so each aggregate counts exactly the rows where the column is null.
data_pivot.agg(
    *(
        count(when(col(name).isNull(), name)).alias(name)
        for name in data_pivot.columns
    )
).display()

id,bridge,bridge:name,cycleway,highway,lanes,lit,maxspeed,name,oneway,parking:lane,ref,sidewalk,surface,tunnel,tunnel:name
0,9908597,10149517,10055524,1261655,9725743,9717654,9136876,6334894,9063373,10150061,9140710,10010083,8319401,9926190,10149854


In [0]:
# Percentage of null cells in each column of the pivoted table.
# The denominator count(lit(1)) is the total row count computed in the same
# aggregation, so the result stays correct when the input data or the key
# filter changes (a hard-coded row count would silently go stale).
data_pivot.select([
    (count(when(col(c).isNull(), c)) / count(lit(1)) * 100).alias(c)
    for c in data_pivot.columns
]).display()

id,bridge,bridge:name,cycleway,highway,lanes,lit,maxspeed,name,oneway,parking:lane,ref,sidewalk,surface,tunnel,tunnel:name
0.0,97.61722128849333,99.99070978064049,99.06471233816004,12.429535213182456,95.81578568853035,95.73609471886004,90.01440328812684,62.40992033858564,89.29026861836803,99.99606914366444,90.05217497532132,98.6170380455664,81.96082739107389,97.79054348275842,99.99402982721963


In [0]:
# Percentage of non-null (filled) cells in each column of the pivoted table.
# The denominator count(lit(1)) is the total row count computed in the same
# aggregation, so the result stays correct when the input data or the key
# filter changes (a hard-coded row count would silently go stale).
data_pivot.select([
    (count(when(col(c).isNotNull(), c)) / count(lit(1)) * 100).alias(c)
    for c in data_pivot.columns
]).display()

id,bridge,bridge:name,cycleway,highway,lanes,lit,maxspeed,name,oneway,parking:lane,ref,sidewalk,surface,tunnel,tunnel:name
100.0,2.3827787115066705,0.0092902193595167,0.935287661839956,87.57046478681755,4.184214311469628,4.263905281139968,9.985596711873155,37.59007966141436,10.709731381631965,0.0039308563355749,9.947825024678684,1.382961954433592,18.039172608926098,2.209456517241583,0.0059701727803469


In [0]:
# How many ways carry each `cycleway` value (null = no `cycleway` tag on the way).
(
    data_pivot
    .groupBy("cycleway")
    .count()
    .display()
)

cycleway,count
on_street,366
share_busway,1633
designated,60
opposite_track,94
opposite,17463
none,163
,10055524
use_sidepath,18
sidewalk,1018
left,14
