In [0]:
from pyspark.sql.functions import *
from pyspark.sql import Window
import pyspark.sql.functions as F
import pyspark.sql.types as T
import json

# Common Python libraries
import numpy as np
import pandas as pd
import datetime as dt
import unittest

#Shapely : for geometry manipulation
from shapely import geometry
from shapely.geometry import Polygon
import shapely
import shapely.wkt

# Libraries and dependencies for Apache Sedona, which will help us perform spatial joins (as in PostgreSQL) on distributed data
#import sedona
from pyspark.sql import SparkSession
#from sedona.utils import SedonaKryoRegistrator, KryoSerializer
#from sedona.register import SedonaRegistrator
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType, ArrayType


In [0]:
# Install the "libify" package through dbutils' library utility.
dbutils.library.installPyPI("libify")
# Install "cryptography" with pip through a shell escape.
# NOTE(review): presumably required by libify — confirm.
!pip install cryptography
# The import has to come after the install above.
import libify


In [0]:
# Call libify.importer with this notebook's globals and the path "/Dev/APR_DATA_FUNCTIONS".
# The returned object is used further down as a namespace of helpers
# (apr_data_func.get_releves_intermediares).
# NOTE(review): presumably this imports the notebook at that path — confirm against libify's docs.
apr_data_func = libify.importer(globals(),"/Dev/APR_DATA_FUNCTIONS")

In [0]:
def node_selection_f(node_list):
  """Select the second node of ``node_list`` (the most common choice).

  Args:
    node_list: Sequence of node IDs, or ``None`` when the source value is null.

  Returns:
    The second element when at least two nodes are present, the only element
    when there is exactly one, and ``None`` when ``node_list`` is ``None`` or
    empty (so that the UDF built on this function yields a null instead of
    failing the whole Spark job).
  """
  # A null column value reaches the function as None; an empty array as [].
  if not node_list:
    return None
  if len(node_list) < 2:
    return node_list[0]
  return node_list[1]

# Spark UDF wrapping node_selection_f so it can be applied to a PySpark DataFrame column.
# The declared return type is a long.
node_selection = F.udf(
    node_selection_f,
    returnType=T.LongType(),
)
def parse_embedding_from_string(x):
    """Decode a JSON-encoded string into the corresponding Python object.

    Args:
        x: JSON text, or ``None`` when the source value is null.

    Returns:
        The decoded value (a list when ``x`` encodes a JSON array), or
        ``None`` when ``x`` is ``None``.

    Raises:
        json.JSONDecodeError: If ``x`` is not valid JSON.
    """
    # json.loads(None) raises TypeError; pass nulls through so a single
    # missing value does not fail the UDF built on this function.
    if x is None:
        return None
    return json.loads(x)

# Spark UDF wrapping parse_embedding_from_string; it yields an array of longs.
# The element type is LongType, not IntegerType.
retrieve_embedding = F.udf(
    parse_embedding_from_string,
    returnType=T.ArrayType(T.LongType()),
)

In [0]:
# Fetch the data for February 2020.
year, month = "2020", "02"
df = apr_data_func.get_releves_intermediares(year, month)

In [0]:
# Read the CSV result set; the first row is the header and column types are inferred.
view_result = (
    spark.read
    .option("inferSchema", True)
    .option("header", True)
    .csv("/mnt/datalake/tmp/guillaume/temporary/full_eclaireur/result_join_poly")
)

In [0]:
# Keep only the nodes that are defined, i.e. drop the rows whose "inside" flag is "false".
# The comparison is parenthesised on purpose: in Python the unary `~` binds tighter than `==`,
# so `~F.col("inside") == "false"` is evaluated as `(~F.col("inside")) == "false"`.
landuse = view_result.filter(~(F.col("inside") == "false"))

In [0]:
# "allnodes" is a string column; convert it to an array with the retrieve_embedding UDF.
df = df.withColumn(
    colName="allnodes",
    col=retrieve_embedding(F.col("allnodes")),
)


In [0]:
# Derive the node ID from the "allnodes" column (the second node).
df = df.withColumn(
    colName="node_id",
    col=node_selection(F.col("allnodes")),
)

In [0]:
# Left join on the node ID ("node_id" on the df side, "ID" on the landuse side).
df_join = df.join(
    landuse,
    on=df["node_id"] == landuse["ID"],
    how="left",
)

In [0]:
# Keep only the columns needed downstream and replace nulls with "inconnue".
# Selecting first leaves "ID" and "inside" out without dropping them explicitly.
# NOTE(review): the fill applies to every string column kept here, not only "landuse" —
# confirm that this is intended.
df_join = (
    df_join
    .select("rideId", "dateentry", "node_id", "landuse", "deviceid")
    .na.fill("inconnue")
)

In [0]:
# Use a window per (rideId, landuse), ordered by date, to attach the earliest and latest
# "dateentry" of that pair to every row.
windows = Window.partitionBy("rideId","landuse").orderBy(F.col("dateentry").asc())

# An ordered window without an explicit frame ends at the current row, which turns max() into a
# running maximum. Span the whole partition so "datemax" really is the latest date of the pair.
windows_full = windows.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

# F.max / F.min are used explicitly: the bare names only resolve to the Spark functions
# because of the star import, which shadows Python's built-in max / min.
df_groupby = (
    df_join
    .withColumn('datemax', F.max('dateentry').over(windows_full))
    .withColumn('datemin', F.min('dateentry').over(windows_full))
)



In [0]:
# For each rideId: the start time (datemin), the end time and the associated landuse.
# NOTE(review): rows are grouped by (rideId, datemin) without "landuse"; if two land uses of a
# ride share the same datemin, F.first keeps only one of them — confirm this cannot happen.
data_final1 = (
    df_groupby
    .groupBy("rideId", "datemin")
    .agg(
        F.max("datemax").alias("end_time"),
        F.first("landuse").alias("landuse"),
    )
)

In [0]:
# Count the rows of the aggregated result; as the last expression of the cell, the count is displayed.
data_final1.count()

In [0]:
# Rename "datemin" to "start_time" for the final output.
data_final = data_final1.withColumnRenamed(
    existing="datemin",
    new="start_time",
)

In [0]:
# Write the result as CSV with a header row, overwriting any previous output.
# The folder name is built from the `month` / `year` variables used to load the data, so the
# output location follows the processed period (for month="02", year="2020" this is the same
# "02_2020_ride" folder as before).
output_path = '/mnt/datalake/tmp/amani/landuse_process/{}_{}_ride'.format(month, year)
data_final.write.option('header',True).mode('overwrite').csv(output_path)