# MobilityPySpark UDTs

This notebook is a basic example of how MobilityPySpark handles user-defined types (UDTs).

In [18]:
from pymeos import *

from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

from pysparkmeos.UDT.MeosDatatype import *
from pysparkmeos.utils.udt_appender import udt_append
from pysparkmeos.utils.utils import *
from pysparkmeos.UDF.udf import *
from pysparkmeos.partitions.grid_partitioner import GridPartition
from pysparkmeos.UDTF.base_partition_udtf import BasePartitionUDTF

from typing import *
import os, sys

## Initialize PySpark and PyMEOS

In [2]:
# Start PyMEOS, passing "UTC" as its time zone argument
pymeos_initialize("UTC")

# Point both the PySpark driver and the workers at the interpreter running this kernel
os.environ.update(
    {
        "PYSPARK_DRIVER_PYTHON_OPTS": "notebook",
        "PYSPARK_DRIVER_PYTHON": sys.executable,
        "PYSPARK_PYTHON": sys.executable,
    }
)

# Build (or reuse) a local Spark session with three worker threads
spark = (
    SparkSession.builder
    .appName("PySpark UDT Example with PyMEOS")
    .master("local[3]")
    .config("spark.default.parallelism", 3)
    .config("spark.executor.memory", "3g")
    .config("spark.executor.cores", 1)
    .config("spark.driver.memory", "2g")
    .config("spark.driver.maxResultSize", 0)
    # NOTE(review): presumably required by the TABLE(...) arguments used in the UDTF query later on - confirm
    .config("spark.sql.allowMultipleTableArguments.enabled", True)
    .getOrCreate()
)

# For verbose Spark logs, uncomment:
# spark.sparkContext.setLogLevel("DEBUG")

# Attach the UDT mapping to the PyMEOS classes
udt_append()

# Make the MobilityPySpark UDFs callable from Spark SQL
register_udfs_under_spark_sql(spark)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/07/23 13:49:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/07/23 13:49:18 WARN SimpleFunctionRegistry: The function length replaced a previously registered function.
24/07/23 13:49:18 WARN SimpleFunctionRegistry: The function nearest_approach_distance replaced a previously registered function.


We have an example dataset prepared; let's explore it first.

In [3]:
# Location of the preprocessed CSV, relative to this notebook
data_path = "../datasets/preproc.csv"

In [4]:
!head -n 2 $data_path

icao24,pointStr
34718e,POINT(1.9229736328125 40.87294006347656)@2022-06-27 00:00:00+00


## Read UDTs

A preprocessed set of points is already available, and MobilityPySpark can read it directly by defining a schema that uses TGeomPointInstUDT.

In [5]:
# Two columns: the aircraft identifier as a plain string, and the point column
# typed with the MobilityPySpark UDT so it is loaded as a temporal point instant.
schema = StructType(
    [
        StructField("icao24", StringType()),
        StructField("PointStr", TGeomPointInstUDT()),
    ]
)

# Load the CSV with the explicit schema; the first line of the file is a header
df = spark.read.csv(data_path, schema=schema, header=True, mode='PERMISSIVE')
df.printSchema()

# Give the parsed column a more accurate name, then derive an STBox column from it
df = df.withColumnRenamed("PointStr", "PointInst")
df = df.withColumn("STBox", point_to_stbox("PointInst"))

df.show()
df.head()

root
 |-- icao24: string (nullable = true)
 |-- PointStr: pythonuserdefined (nullable = true)



                                                                                

+------+--------------------+--------------------+
|icao24|           PointInst|               STBox|
+------+--------------------+--------------------+
|34718e|POINT(1.922973632...|STBOX XT(((1.9229...|
|ac6364|POINT(-85.5262662...|STBOX XT(((-85.52...|
|406471|POINT(1.838302612...|STBOX XT(((1.8383...|
|a04417|POINT(-83.4583702...|STBOX XT(((-83.45...|
|c04aa1|POINT(-79.3079393...|STBOX XT(((-79.30...|
|4d21ea|POINT(8.005793644...|STBOX XT(((8.0057...|
|4ca9cc|POINT(8.388679504...|STBOX XT(((8.3886...|
|a20f1a|POINT(-77.7057878...|STBOX XT(((-77.70...|
|152019|POINT(52.04384408...|STBOX XT(((52.043...|
|a895b4|POINT(-80.0497055...|STBOX XT(((-80.04...|
|845d1c|POINT(134.1781997...|STBOX XT(((134.17...|
|aa84f8|POINT(-112.043282...|STBOX XT(((-112.0...|
|a0a8df|POINT(-96.9305003...|STBOX XT(((-96.93...|
|7c7a4d|POINT(153.0453608...|STBOX XT(((153.04...|
|750503|POINT(101.9908194...|STBOX XT(((101.99...|
|a721d5|POINT(-120.366270...|STBOX XT(((-120.3...|
|a888aa|POINT(-96.4073364...|ST

                                                                                

Row(icao24='34718e', PointInst=TGeomPointInstWrap(POINT(1.9229736328125 40.87294006347656)@2022-06-27 00:00:00+00), STBox=STBoxWrap(STBOX XT(((1.9229736328125,40.87294006347656),(1.9229736328125,40.87294006347656)),[2022-06-27 00:00:00+00, 2022-06-27 00:00:00+00])))

### Parse as Sequences

In [6]:
# Per aircraft: drop rows containing nulls, gather the instants into a list,
# then convert that list into a single PointSeq value.
dfseq = (
    df.dropna()
    .groupBy("icao24")
    .agg(F.collect_list(F.col("PointInst")).alias("PointSeq"))
    .withColumn("PointSeq", tgeompointseq_from_instant_list("PointSeq"))
)
dfseq.show()

24/07/23 13:49:29 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
[Stage 2:>                                                          (0 + 1) / 1]

+------+--------------------+
|icao24|            PointSeq|
+------+--------------------+
|4bb186|[POINT(27.8621292...|
|4074e2|[POINT(-75.162197...|
|ac0d09|[POINT(-72.187853...|
|c08106|[POINT(-79.407135...|
|a1286c|[POINT(-69.845206...|
|010207|[POINT(26.7178734...|
|4d0113|[POINT(-100.32571...|
|a046eb|[POINT(-95.526492...|
|aa909a|[POINT(-75.856049...|
|0d0d40|[POINT(-116.00980...|
|ac6343|[POINT(-116.00452...|
|ab128d|[POINT(-121.75960...|
|4d21ea|[POINT(8.00579364...|
|a86b76|[POINT(-118.19572...|
|ac3b8e|[POINT(-86.820799...|
|406471|[POINT(1.83830261...|
|a393e6|[POINT(-101.17403...|
|a18581|[POINT(-110.19602...|
|50840d|[POINT(7.81187315...|
|a64a3b|[POINT(-70.762065...|
+------+--------------------+
only showing top 20 rows



                                                                                

## Define a Simple Grid and Partition

In [7]:
# Map every RDD partition through bounds_calculate_map on the PointInst column,
# then combine the per-partition results with bounds_calculate_reduce.
bounds = (
    df.rdd
    .mapPartitions(lambda rows: bounds_calculate_map(rows, colname='PointInst'))
    .reduce(bounds_calculate_reduce)
)
bounds

                                                                                

STBoxWrap(STBOX XT(((-121.75960459607712,-32.06538391113281),(153.0453608586238,53.242767333984375)),[2022-06-27 00:00:00+00, 2022-06-27 00:00:10+00]))

In [20]:
# Number of cells per side requested from the grid partitioner
CELLS_PER_SIDE = 2

# Build the grid over the bounding box held in `bounds`
grid = GridPartition(bounds=bounds, cells_per_side=CELLS_PER_SIDE)

# Publish the tiles as the "grid" temp view so SQL queries can reference them
griddf = grid.as_spark_table()
griddf.createOrReplaceTempView("grid")
griddf.show()

+------+--------------------+
|tileid|                tile|
+------+--------------------+
|     0|STBOX XT(((-121.7...|
|     1|STBOX XT(((-121.7...|
|     2|STBOX XT(((-121.7...|
|     3|STBOX XT(((-121.7...|
|     4|STBOX XT(((15.642...|
|     5|STBOX XT(((15.642...|
|     6|STBOX XT(((15.642...|
|     7|STBOX XT(((15.642...|
+------+--------------------+



In [22]:
# Shape of the rows returned by the partitioning table function:
# an object identifier, the tile it was assigned to, and the moving object itself.
partition_schema = (
    StructType()
    .add("movingobjectid", StringType())
    .add("tileid", IntegerType())
    .add("movingobject", TGeomPointSeqSetUDT())
)
@F.udtf(returnType=partition_schema)
class DemoUDTF(BasePartitionUDTF):
    """Demo table function built on BasePartitionUDTF.

    Declared to return rows shaped like ``partition_schema``; all of the
    actual work is delegated to the base class.
    """

    def __init__(self):
        # Configure the base class: no extra response columns, no check
        # function, and return_full_traj switched off.
        super().__init__(
            return_full_traj=False,
            check_function=None,
            response_extra_cols=[],
        )

    def eval(self, row: Row):
        """Pass one input row to the base class and relay every value it yields."""
        yield from super().eval_wrap(row)

# Expose the sequences as a temp view and register the table function under the
# names the SQL text below refers to.
dfseq.createOrReplaceTempView("dfseq")
spark.udtf.register("DemoUDTF", DemoUDTF)

# Each row handed to DemoUDTF carries a generated trajectory id, the sequence,
# and the complete lists of tiles and tile ids collected from the "grid" view.
query = """
            SELECT *
            FROM DemoUDTF(
                TABLE(
                        SELECT
                            monotonically_increasing_id() AS trajectory_id,
                            PointSeq AS movingobject,
                            (SELECT collect_list(tile) FROM grid) AS tiles,
                            (SELECT collect_list(tileid) FROM grid) AS tileids
                        FROM dfseq
                )
            )
"""

partitioned = spark.sql(query)
partitioned.show()

24/07/23 14:00:53 WARN SimpleTableFunctionRegistry: The function demoudtf replaced a previously registered function.
[Stage 28:>                                                         (0 + 1) / 1]

+--------------+------+--------------------+
|movingobjectid|tileid|        movingobject|
+--------------+------+--------------------+
|             0|     7|{[POINT(27.862129...|
|             1|     3|{[POINT(-75.16219...|
|             2|     3|{[POINT(-72.18785...|
|             3|     3|{[POINT(-79.40713...|
|             4|     3|{[POINT(-69.84520...|
|             5|     7|{[POINT(26.717873...|
|             6|     3|{[POINT(-100.3257...|
|             7|     3|{[POINT(-95.52649...|
|             8|     1|{[POINT(-75.85604...|
|             9|     3|{[POINT(-116.0098...|
|            10|     3|{[POINT(-116.0045...|
|            11|     3|{[POINT(-121.7596...|
|            12|     2|{[POINT(8.0057936...|
|            13|     3|{[POINT(-118.1957...|
|            14|     3|{[POINT(-86.82079...|
|            15|     2|{[POINT(1.8383026...|
|            16|     3|{[POINT(-101.1740...|
|            17|     3|{[POINT(-110.1960...|
|            18|     3|{[POINT(7.8118731...|
|         

                                                                                

## Write UDTs

Now we save the partitioned DataFrame back to disk, as both CSV and Parquet, partitioned by `tileid`.

In [46]:
# Persist the result twice, replacing any earlier output and splitting the
# files into one sub-directory per tileid value.
partitioned.write.mode("overwrite").partitionBy("tileid").csv("../datasets/out.csv", header=True)
partitioned.write.mode("overwrite").partitionBy("tileid").parquet("../datasets/out.parquet")

                                                                                

In [47]:
# List the contents of the CSV output directory and of each entry inside it
!ls ../datasets/out.csv/*

 ../datasets/out.csv/_SUCCESS

'../datasets/out.csv/tileid=1':
part-00000-cd8e5db9-7546-4b93-8860-04c0cff51e9a.c000.csv

'../datasets/out.csv/tileid=2':
part-00000-cd8e5db9-7546-4b93-8860-04c0cff51e9a.c000.csv

'../datasets/out.csv/tileid=3':
part-00000-cd8e5db9-7546-4b93-8860-04c0cff51e9a.c000.csv

'../datasets/out.csv/tileid=5':
part-00000-cd8e5db9-7546-4b93-8860-04c0cff51e9a.c000.csv

'../datasets/out.csv/tileid=7':
part-00000-cd8e5db9-7546-4b93-8860-04c0cff51e9a.c000.csv


In [48]:
# List the top level of the Parquet output directory
!ls ../datasets/out.parquet

 _SUCCESS  'tileid=1'  'tileid=2'  'tileid=3'  'tileid=5'  'tileid=7'


This is a very simple notebook that shows how using UDTs allows for basic read/write operations in MobilityPySpark.