# Pymeos and Pyspark Partitioning Demo

First we perform the corresponding imports for the libraries to use.

In [1]:
from pymeos import *
from pysparkmeos.UDT.MeosDatatype import *

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.serializers import PickleSerializer
import pyspark.sql.functions as F

from pyspark.storagelevel import StorageLevel

from pysparkmeos.partitions.grid.grid_partitioner import GridPartition

from pysparkmeos.utils.udt_appender import udt_append
from pysparkmeos.UDF.udf import *
from pysparkmeos.partitions.mobilityrdd import MobilityRDD

import random, datetime

from datetime import timedelta
from functools import partial
from datetime import datetime, timezone
import contextily as cx
import distinctipy
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import shapely.geometry as shp
from pymeos.plotters import (
    TemporalPointSequenceSetPlotter,
    TemporalPointSequencePlotter,
)

import matplotlib.pyplot as plt
import numpy as np
import os, sys
from shapely import wkb, box
import pandas as pd
from functools import reduce

## Initialize Pymeos and setup Pyspark

In [2]:
# Initialize PyMEOS on the driver, using UTC as its time zone
pymeos_initialize("UTC")

# Point the PySpark driver and the Python workers at the interpreter that runs this notebook
os.environ['PYSPARK_DRIVER_PYTHON_OPTS']= "notebook"
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_PYTHON'] = sys.executable

# Disabled option kept for reference (Kryo serializer):
#.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \

# Initialize a Spark session: local master with 3 threads and a matching default parallelism.
# spark.sql.allowMultipleTableArguments.enabled is switched on here; a later cell calls a
# Python UDTF with a TABLE(...) argument.
spark = SparkSession.builder \
    .appName("PySpark UDF Example with PyMEOS") \
    .master("local[3]") \
    .config("spark.default.parallelism", 3) \
    .config("spark.executor.memory", "1536m") \
    .config("spark.driver.memory", "1g") \
    .config("spark.sql.allowMultipleTableArguments.enabled", True) \
    .getOrCreate()

# Uncomment for verbose Spark logging while debugging:
#spark.sparkContext.setLogLevel("DEBUG")

# Append the UDT mapping to the PyMEOS classes
udt_append()

# Read the settings back from the running context and print them as a sanity check
default_parallelism = spark.sparkContext.getConf().get("spark.default.parallelism")
print(f"spark.default.parallelism: {default_parallelism}")
print(f"spark.sql.allowMultipleTableArguments.enabled: {spark.sparkContext.getConf().get('spark.sql.allowMultipleTableArguments.enabled')}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/15 16:11:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


spark.default.parallelism: 3
spark.sql.allowMultipleTableArguments.enabled: true


## Read the DataFrame and create the columns of Pymeos data objects

First, we are going to read a dataset from OpenSky. This dataset contains the trajectory information of multiple flights within a timeframe.

Using the latitude, longitude and timestamp we can create a PyMEOS TGeogPointInst, that in PySpark will be wrapped into a TGeogPointInstUDT object.

In [3]:
# Read data from a CSV file, keeping only the four columns used below
data_path = "../../small_mid_states_2022-06-27-00.csv"  # Update this with your CSV file path
df = spark.read.csv(data_path, header=True, inferSchema=True, mode='PERMISSIVE').select("icao24", "time", "lat", "lon")

# Drop rows with a null in any of the four columns; without them Points can't be created
df = df.dropna(subset=["lat", "lon", "time", "icao24"])

# Convert the 'time' column to a 'yyyy-MM-dd HH:mm:ss' string, build the temporal Point from
# lat/lon/time, then extract x, y and t back out of the Point and attach a row id.
# In the recorded output below, x carries the longitude and y the latitude.
# NOTE(review): from_unixtime renders in the Spark session time zone — confirm it is UTC so it
# agrees with pymeos_initialize("UTC").
# Earlier string-based alternative, kept for reference:
#   .withColumn("Point", F.concat(F.lit("Point("), F.col("lat"), F.lit(" "), F.col("lon"), F.lit(")@"), F.col("time")))
df = df \
    .withColumn("time", F.from_unixtime(F.col("time"), "yyyy-MM-dd' 'HH:mm:ss")) \
    .withColumn("Point", create_point_udf("lat", "lon", "time")) \
    .withColumn("x", get_point_x("Point")) \
    .withColumn("y", get_point_y("Point")) \
    .withColumn("t", get_point_timestamp("Point")) \
    .withColumn("id", F.monotonically_increasing_id())

# Optional: expose the raw points to Spark SQL
# df.createOrReplaceTempView("rawPoints")

# Display the last five rows as a quick check that the Point column was built
df.tail(5)

# Optional: stream every row to the driver for inspection
#for row in df.toLocalIterator():
#    print(row)

24/05/15 16:11:42 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

[Row(icao24='ad05ce', time='2022-06-27 00:00:50', lat=32.31065135891154, lon=-97.41846201371173, Point=TGeogPointInstWrap(POINT(-97.41846201371173 32.31065135891154)@2022-06-27 00:00:50+00), x=-97.41846466064453, y=32.31064987182617, t=datetime.datetime(2022, 6, 27, 0, 0, 50), id=8589935540),
 Row(icao24='c8806a', time='2022-06-27 00:00:50', lat=-26.871002197265625, lon=154.14751160819577, Point=TGeogPointInstWrap(POINT(154.14751160819577 -26.871002197265625)@2022-06-27 00:00:50+00), x=154.1475067138672, y=-26.871002197265625, t=datetime.datetime(2022, 6, 27, 0, 0, 50), id=8589935541),
 Row(icao24='a726fb', time='2022-06-27 00:00:50', lat=61.546417236328125, lon=-151.04876926967074, Point=TGeogPointInstWrap(POINT(-151.04876926967074 61.546417236328125)@2022-06-27 00:00:50+00), x=-151.04876708984375, y=61.546417236328125, t=datetime.datetime(2022, 6, 27, 0, 0, 50), id=8589935542),
 Row(icao24='ad225b', time='2022-06-27 00:00:50', lat=37.73808288574219, lon=-121.37812188331117, Point=TGe

In [4]:
df.rdd.getNumPartitions()

2

Handling a dataset like this gives us granularity over the datapoints, but it would be better to group the points of each flight together. Let's create a UDF that takes the list of Points collected for each icao24 and returns a TGeogPointSeq object representing that trajectory, producing a new table with icao24 and PointSeq.

In [5]:
# Shape of the trajectory table: aircraft id plus its point sequence.
# NOTE(review): nothing visible in the notebook reads this value before `schema` is
# reassigned in a later cell — confirm whether it is still needed.
schema = StructType([
    StructField("icao24", StringType()),
    StructField("PointSeq", TGeogPointSeqUDT())
])


@F.udf(returnType=TGeogPointSeqUDT())
def pointSeqFromPoints(pointgroup):
    """Turn the list of instants collected for one aircraft into a TGeogPointSeq.

    Args:
        pointgroup: list of temporal point instants (the output of ``collect_list``).

    Returns:
        ``None`` when the list is missing or empty; otherwise a ``TGeogPointSeq``.
        A single instant is wrapped in a ``[...]`` sequence literal and parsed;
        several instants are sorted and passed as ``instant_list``.
    """
    # Missing or empty group: emit a null cell.
    if not pointgroup:
        return None

    # MEOS must be initialized in the process that executes the UDF.
    pymeos_initialize()

    if len(pointgroup) > 1:
        ordered_instants = sorted(pointgroup)
        return TGeogPointSeq(instant_list=ordered_instants)

    # Exactly one instant: go through the textual sequence form.
    sole_instant = pointgroup[0]
    literal = f'[{sole_instant.__str__()}]'
    return TGeogPointSeq(literal)

# Number of individual points before grouping
print(df.count())

#df.select("Point").tail(5)
# Points per aircraft, busiest first
df.groupBy("icao24").agg(F.count("Point").alias("cnt")).orderBy("cnt",ascending=False).show()

# One row per aircraft: collect its Points into a list, then let the UDF replace
# that list with a single sequence in the same "PointSeq" column.
trajectories = df.groupBy("icao24").agg(
    F.collect_list(F.col("Point")).alias("PointSeq")
).select("icao24", "PointSeq").withColumn("PointSeq", pointSeqFromPoints("PointSeq"))

# Number of trajectories after grouping (one per icao24)
print(trajectories.count())

trajectories.show(5)

# Optional diagnostics / checkpointing, disabled:
#trajectories.rdd.getNumPartitions()

#spark.sparkContext.setCheckpointDir("./checkpoint")
#trajectories = trajectories.checkpoint()
#trajectories.tail(1).limit(1).collect()[0].PointSeq

                                                                                

23288


                                                                                

+------+---+
|icao24|cnt|
+------+---+
|406471|  6|
|34718e|  6|
|a04417|  6|
|ac6364|  6|
|a054e1|  5|
|a51b96|  5|
|a6cf94|  5|
|7c7aac|  5|
|a95c2f|  5|
|c05ee5|  5|
|3455d9|  5|
|ac7ac9|  5|
|ad60b7|  5|
|4952a5|  5|
|a77ae0|  5|
|a0c4a1|  5|
|8013d8|  5|
|a03cb6|  5|
|700024|  5|
|740736|  5|
+------+---+
only showing top 20 rows



                                                                                

5187




+------+--------------------+
|icao24|            PointSeq|
+------+--------------------+
|0100f6|[POINT(24.7634696...|
|010109|[POINT(51.2653681...|
|01013d|[POINT(10.5608825...|
|0101a7|[POINT(12.1170895...|
|0101bb|[POINT(51.6333065...|
+------+--------------------+
only showing top 5 rows



                                                                                

This has reduced the table from 23,288 Points to 5,187 PointSeq rows (one per aircraft) in this sample!

In [6]:
# trajectories.createOrReplaceTempView("trajectories")

Now we need to calculate the boundaries of the whole space of trajectories.

In [7]:
from pymeos import TPoint

@F.udf(returnType=STBoxUDT())
def point_to_stbox(tpoint: TPoint) -> STBox:
    """Return the bounding box of a temporal point.

    Args:
        tpoint: the temporal point whose ``bounding_box()`` is requested.

    Returns:
        Whatever ``tpoint.bounding_box()`` produces, stored through ``STBoxUDT``.
    """
    # MEOS must be initialized in the process that executes the UDF.
    pymeos_initialize()
    bbox = tpoint.bounding_box()
    return bbox

# Attach each trajectory's bounding box and a generated sequence id, then expose the
# result to Spark SQL as the "trajectories" view (queried by the partitioning cell).
trajectories = trajectories.withColumn("STBox", point_to_stbox("PointSeq")).withColumn("seqId", F.monotonically_increasing_id())
trajectories.createOrReplaceTempView("trajectories")
trajectories.show()
# NOTE(review): caching is disabled, so each action below presumably re-runs the Python UDFs — confirm.
#trajectories.cache()
print(trajectories.count())
trajectories.rdd.getNumPartitions()

                                                                                

+------+--------------------+--------------------+-----+
|icao24|            PointSeq|               STBox|seqId|
+------+--------------------+--------------------+-----+
|0100f6|[POINT(24.7634696...|SRID=4326;GEODSTB...|    0|
|010109|[POINT(51.2653681...|SRID=4326;GEODSTB...|    1|
|01013d|[POINT(10.5608825...|SRID=4326;GEODSTB...|    2|
|0101a7|[POINT(12.1170895...|SRID=4326;GEODSTB...|    3|
|0101bb|[POINT(51.6333065...|SRID=4326;GEODSTB...|    4|
|0101bd|[POINT(46.7433022...|SRID=4326;GEODSTB...|    5|
|0101dd|[POINT(55.5086263...|SRID=4326;GEODSTB...|    6|
|010205|[POINT(13.5366892...|SRID=4326;GEODSTB...|    7|
|010207|[POINT(26.7178734...|SRID=4326;GEODSTB...|    8|
|01020b|[POINT(30.5806831...|SRID=4326;GEODSTB...|    9|
|01022e|[POINT(19.7302592...|SRID=4326;GEODSTB...|   10|
|017073|[POINT(120.854524...|SRID=4326;GEODSTB...|   11|
|01802e|[POINT(29.6798639...|SRID=4326;GEODSTB...|   12|
|02006f|[POINT(-1.7363175...|SRID=4326;GEODSTB...|   13|
|02009f|[POINT(-7.1850585...|SR

[Stage 27:>                                                         (0 + 2) / 2]

1

In [8]:
# df.select("Point").write.mode("overwrite").csv("../../small_states_2022-06-27-00_only_points")

# df.show(3, truncate=False)

@F.udf(returnType=FloatType())
def get_box_dim(box: STBox, dim: str, category: str):
    """Return one spatial bound of an STBox.

    Args:
        box: the box to read from.
        dim: ``'x'``, ``'y'`` or ``'z'``; any other value yields ``None``.
        category: ``'max'`` selects the upper bound; anything else selects the lower bound.

    Returns:
        The value of the matching ``box.<dim>max()`` / ``box.<dim>min()`` accessor,
        or ``None`` when ``dim`` is not recognised or the accessor raises
        (for example because ``box`` is ``None``).
    """
    pymeos_initialize()
    if dim not in ('x', 'y', 'z'):
        return None
    bound = 'max' if category == 'max' else 'min'
    try:
        # Resolves to box.xmax(), box.ymin(), ... depending on the arguments.
        return getattr(box, dim + bound)()
    except Exception:
        # Best effort: a failing accessor becomes a null cell. `Exception` rather than a
        # bare `except` so KeyboardInterrupt / SystemExit are not swallowed.
        return None

@F.udf(returnType=TimestampType())
def get_box_time(box: STBox, category: str):
    """Return one temporal bound of an STBox.

    Args:
        box: the box to read from.
        category: ``'max'`` selects ``box.tmax()``; anything else selects ``box.tmin()``.

    Returns:
        The selected bound, or ``None`` when the accessor raises
        (for example because ``box`` is ``None``).
    """
    pymeos_initialize()
    try:
        return box.tmax() if category == 'max' else box.tmin()
    except Exception:
        # Best effort: a failing accessor becomes a null cell. `Exception` rather than a
        # bare `except` so KeyboardInterrupt / SystemExit are not swallowed.
        return None


#boundsdf = trajectories.agg(
#        F.max(get_box_dim("STBox", F.lit("x"), F.lit("max"))).alias("xmax"),
#        F.min(get_box_dim("STBox", F.lit("x"), F.lit("min"))).alias("xmin"),
#        F.max(get_box_dim("STBox", F.lit("y"), F.lit("max"))).alias("ymax"),
#        F.min(get_box_dim("STBox", F.lit("y"), F.lit("min"))).alias("ymin"),
#        F.max(get_box_time("STBox", F.lit("max"))).alias("tmax"),
#        F.min(get_box_time("STBox", F.lit("min"))).alias("tmin"),
#    ).select(bounds_as_box("xmin", "xmax", "ymin", "ymax", "tmin", "tmax").alias("bounds"))


# Global extent of the raw points: min/max of x, y and t over the whole frame, packed into
# a single "bounds" column by bounds_as_box.
# NOTE(review): boundsdf is never collected in the visible notebook — the next cell
# hard-codes the box instead. Confirm whether this aggregation is still needed.
boundsdf = df.agg(
    F.max(F.col("x")).alias("max_x"),
    F.min(F.col("x")).alias("min_x"),
    F.max(F.col("y")).alias("max_y"),
    F.min(F.col("y")).alias("min_y"),
    F.max(F.col("t").cast("timestamp")).alias("max_t"),
    F.min(F.col("t").cast("timestamp")).alias("min_t")
).select(bounds_as_box("min_x", "max_x", "min_y", "max_y", "min_t", "max_t").alias("bounds"))

#boundsdf.show(truncate=False)
# bounds = STBoxWrap(boundsdf.collect()[0].bounds.__str__(), geodetic=True)
#bounds = STBoxWrap('STBOX XT(((-177.02969360351562,-46.421356201171875),(177.816650390625,70.29727935791016)),[2022-06-27 00:00:00+00, 2022-06-27 00:15:00+00])')
#boundsdf.unpersist()
#df.unpersist()
# bounds

In [9]:
# Spatio-temporal extent used to build the grid, written out as a literal.
# NOTE(review): presumably copied from an earlier run of the bounds aggregation;
# it must be updated by hand if the input file changes — confirm.
bounds = STBoxWrap(
        "STBOX XT(((-177.02969360351562,-46.421356201171875),(177.816650390625,70.29727935791016)),[2022-06-27 00:00:00+00, 2022-06-27 00:15:00+00])",
        geodetic=True)
bounds

STBoxWrap(STBOX XT(((-177.02969360351562,-46.421356201171875),(177.816650390625,70.29727935791016)),[2022-06-27 00:00:00+00, 2022-06-27 00:15:00+00]))

## Generate the partitioning scheme and repartition data

In [10]:
# Now we calculate the grid and partition accordingly:
# build a GridPartition over `bounds` with 3 cells per side and materialise it as a
# Spark table (columns tileid / tile in the output below).
gp = GridPartition(cells_per_side=3, bounds=bounds)
grid = gp.as_spark_table()
#grid.cache()
grid.show()
# Register the tiles as the "grid" view so the SQL in the next cell can read them
grid.createOrReplaceTempView("grid")
grid.rdd.getNumPartitions()

+------+--------------------+
|tileid|                tile|
+------+--------------------+
|     0|SRID=4326;GEODSTB...|
|     1|SRID=4326;GEODSTB...|
|     2|SRID=4326;GEODSTB...|
|     3|SRID=4326;GEODSTB...|
|     4|SRID=4326;GEODSTB...|
|     5|SRID=4326;GEODSTB...|
|     6|SRID=4326;GEODSTB...|
|     7|SRID=4326;GEODSTB...|
|     8|SRID=4326;GEODSTB...|
|     9|SRID=4326;GEODSTB...|
|    10|SRID=4326;GEODSTB...|
|    11|SRID=4326;GEODSTB...|
|    12|SRID=4326;GEODSTB...|
|    13|SRID=4326;GEODSTB...|
|    14|SRID=4326;GEODSTB...|
|    15|SRID=4326;GEODSTB...|
|    16|SRID=4326;GEODSTB...|
|    17|SRID=4326;GEODSTB...|
|    18|SRID=4326;GEODSTB...|
|    19|SRID=4326;GEODSTB...|
+------+--------------------+
only showing top 20 rows



24/05/15 16:12:22 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


1

In [11]:
from pyspark.sql.types import Row
from typing import Iterator

# Output schema of the RegularPartition UDTF: one (trajectoryId, partitionKey) row per hit.
schema = StructType([
    # trajectoryId carries seqId, which is produced by F.monotonically_increasing_id();
    # those ids are 64-bit, so a 32-bit IntegerType could not hold them once the data
    # spans more than one partition.
    StructField("trajectoryId", LongType()),
    StructField("partitionKey", IntegerType()),
    #StructField("trajectory", StringType())
])

@F.udtf(returnType=schema)
class RegularPartition:
    """UDTF that maps a trajectory onto the grid tiles it touches.

    For each input row it restricts the trajectory to every tile with
    ``trajectory.at(tile)`` and emits one ``(seqId, tile index)`` tuple per
    element of ``segments()`` of the restricted trajectory.
    """

    def eval(self, row: Row):
        """Yield ``(trajectoryId, partitionKey)`` tuples for one input row.

        Args:
            row: a row exposing ``seqId``, ``trajectory`` and ``grid``
                (the list of tiles; a tile's position in the list is its key).

        Yields:
            ``(row.seqId, tile_index)`` once per segment of the trajectory
            restricted to that tile. Nothing is yielded when the trajectory is
            null, the grid is empty, or the trajectory misses every tile.
        """
        # MEOS must be initialized in the process that executes the UDTF.
        pymeos_initialize()
        sequence_id = row.seqId
        trajectory = row.trajectory
        grid = row.grid

        # A null trajectory or an empty grid cannot produce any mapping.
        if trajectory is None or not grid:
            return

        # NOTE(review): the recorded run of this cell ended with "Python worker exited
        # unexpectedly (crashed)"; the cause is not visible here — confirm whether
        # `at()` / `segments()` is responsible.
        for partition_key, tile in enumerate(grid):
            partition_traj = trajectory.at(tile)
            if partition_traj is None:
                continue
            # One output row per segment; results are streamed rather than buffered.
            for _ in partition_traj.segments():
                yield sequence_id, partition_key

# Make the UDTF callable from Spark SQL under the name "regularPartition"
spark.udtf.register("regularPartition", RegularPartition)

# spark.sql("SELECT collect_list(tile) FROM grid").show()

# Feed every trajectory to the UDTF as a TABLE argument. Each input row carries the
# trajectory's seqId, the trajectory itself and the full list of grid tiles
# (the scalar subquery collects all tiles of the "grid" view).
trajectoriesPartMap = spark.sql("""
    SELECT * 
    FROM regularPartition(
        TABLE(
                SELECT seqId, PointSeq AS trajectory, (SELECT collect_list(tile) FROM grid) AS grid
                FROM trajectories
        )
    )
""")
# NOTE(review): in the recorded run this show() is where the Python worker crashed.
trajectoriesPartMap.show()


# Optional diagnostics, disabled:
#spark.sparkContext.setLogLevel("WARN")

#print(trajectoriesPartMap.tail(1))
#print(trajectoriesPartMap.count())
#trajectoriesPartMap.createOrReplaceTempView("trajectoriesPartMap")
#print(trajectoriesPartMap.rdd.getNumPartitions())
# Number of partitions reported by the grid partitioner; used for partitionBy further down
num_partitions = gp.num_partitions()
#trajectoriesPartMap = trajectoriesPartMap.withColumn("trajectory", F.col("trajectory").cast("string"))
trajectoriesPartMap.printSchema()

def partitionMapper(partitionIterator):
    """Key every row of a partition by its ``partitionKey``.

    Args:
        partitionIterator: iterable of rows that support ``row['partitionKey']``.

    Yields:
        ``(row['partitionKey'], row)`` for each row, in input order.
    """
    for row in partitionIterator:
        yield (row['partitionKey'], row)

# Key every mapping row by its partitionKey and shuffle the pair RDD into
# `num_partitions` partitions.
trajectoriesPartMapRdd = trajectoriesPartMap.rdd.map(lambda row: (row['partitionKey'], row)).partitionBy(num_partitions)
for row in trajectoriesPartMapRdd.take(1):
    print(row)

trajectoriesPartMap.printSchema()
# DataFrames are immutable: orderBy returns a new frame, so the sorted result has to be
# shown directly (calling orderBy on its own line has no effect on what show() prints).
trajectoriesPartMap.orderBy("trajectoryId").show()

#RegularPartition(trajectories, trajectories.select("PointSeq"), F.lit(gp.gridstr)).show()


#spark.udtf.register("regularPartition", RegularPartition)


#print(gp.num_partitions())
#num_partitions = gp.num_partitions()
#gridstr = gp.gridstr

# Register the udf
#get_partition_key_udf = udf(gp.get_partition, IntegerType())

#dfpoint = df.select("id", "Point", "PointStr").withColumn("partitionKey", get_partition_key_udf(col("Point"), lit(gridstr)))
#dfpoint.show()

# dfpoint.select("_metadata").show(truncate=False)

24/05/15 16:12:37 ERROR Executor: Exception in task 0.0 in stage 34.0 (TID 31)1]
org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:99)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.It

Py4JJavaError: An error occurred while calling o388.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 34.0 failed 1 times, most recent failure: Lost task 0.0 in stage 34.0 (TID 31) (d57adcaf5d97 executor driver): org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:99)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: java.io.EOFException
	at java.base/java.io.DataInputStream.readFully(DataInputStream.java:210)
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:385)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:83)
	... 26 more

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2398)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2419)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2438)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:530)
	at org.apache.spark.sql.execution.SparkPlan.executeTake(SparkPlan.scala:483)
	at org.apache.spark.sql.execution.CollectLimitExec.executeCollect(limit.scala:61)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.$anonfun$executeCollect$1(AdaptiveSparkPlanExec.scala:390)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.withFinalPlanUpdate(AdaptiveSparkPlanExec.scala:418)
	at org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec.executeCollect(AdaptiveSparkPlanExec.scala:390)
	at org.apache.spark.sql.Dataset.collectFromPlan(Dataset.scala:4332)
	at org.apache.spark.sql.Dataset.$anonfun$head$1(Dataset.scala:3314)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$2(Dataset.scala:4322)
	at org.apache.spark.sql.execution.QueryExecution$.withInternalError(QueryExecution.scala:546)
	at org.apache.spark.sql.Dataset.$anonfun$withAction$1(Dataset.scala:4320)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:125)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:201)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:108)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:900)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:66)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:4320)
	at org.apache.spark.sql.Dataset.head(Dataset.scala:3314)
	at org.apache.spark.sql.Dataset.take(Dataset.scala:3537)
	at org.apache.spark.sql.Dataset.getRows(Dataset.scala:280)
	at org.apache.spark.sql.Dataset.showString(Dataset.scala:315)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.SparkException: Python worker exited unexpectedly (crashed)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:612)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator$$anonfun$1.applyOrElse(PythonRunner.scala:594)
	at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:38)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:99)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:75)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:491)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$getByteArrayRdd$1(SparkPlan.scala:388)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2(RDD.scala:893)
	at org.apache.spark.rdd.RDD.$anonfun$mapPartitionsInternal$2$adapted(RDD.scala:893)
	at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:52)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:367)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:331)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more
Caused by: java.io.EOFException
	at java.base/java.io.DataInputStream.readFully(DataInputStream.java:210)
	at java.base/java.io.DataInputStream.readInt(DataInputStream.java:385)
	at org.apache.spark.sql.execution.python.BasePythonUDFRunner$$anon$1.read(PythonUDFRunner.scala:83)
	... 26 more


In [None]:
from pyspark.sql import SQLContext

# NOTE(review): no visible cell registers a table or view named "trajectoriesPart", so on
# a fresh kernel this lookup presumably fails — confirm where it is meant to come from.
trajectoriesPart = spark.table("trajectoriesPart")

#print(trajectoriesPart.tail(5))
print(trajectoriesPart.count())
print(trajectoriesPart.rdd.getNumPartitions())


# This operation can be costly, for testing do with few data points (<= 1000).
num_partitions = gp.num_partitions()
#trajectoriesPart = trajectoriesPart.repartition(num_partitions, "partitionKey")
#print(trajectoriesPart.rdd.getNumPartitions())
# Key each row by partitionKey and shuffle into `num_partitions` partitions ...
trajectoriesPartRdd = trajectoriesPart.rdd.map(lambda x: (x['partitionKey'], x)).partitionBy(num_partitions)
# ... then go back to a DataFrame and flatten the nested row into trajectory / trajectoryId columns.
trajectoriesPart = trajectoriesPartRdd.toDF(["partitionKey", "trajectorydata"]).withColumn("trajectory", F.col("trajectorydata.trajectory")).withColumn("trajectoryId", F.col("trajectorydata.trajectoryId")).drop("trajectorydata")
#trajectoriesPart.groupBy("trajectoryId").agg(F.count(F.col("trajectory")).alias("cnt")).orderBy("cnt", ascending=False).show()
trajectoriesPart.orderBy("trajectoryId").show()

## Show the partition distribution

In [None]:
# Logical distribution: number of rows per partitionKey
trajectoriesPart.groupBy("partitionKey").count().show()

# Leftover snippet; `datardd` is not defined in any visible cell.
#for idx, item in datardd.take(5):
#    print(idx, item)

# Physical distribution: look at the RDD behind the DataFrame
trajectoriesPartRdd = trajectoriesPart.rdd
print(trajectoriesPartRdd.getNumPartitions())

# Function to count rows per partition
def count_in_partition(idx, iterator):
    """Count the elements of one partition.

    Args:
        idx: the partition index, passed through unchanged.
        iterator: the partition's elements; it is consumed completely.

    Returns:
        A one-element list ``[(idx, number_of_elements)]``.
    """
    total = sum(1 for _ in iterator)
    return [(idx, total)]

# Using mapPartitionsWithIndex to count rows per partition; each partition contributes
# a single (index, count) pair, so the collected list stays small.
partition_counts = trajectoriesPartRdd.mapPartitionsWithIndex(count_in_partition).collect()

# Print the results
for partition_id, cnt in partition_counts:
    print(f"Partition {partition_id} has {cnt} rows")

## Plot the Projection of the grid and the distribution of data

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(15, 10))

trajectoriesPart.printSchema()

# World basemap underneath the grid and the trajectories.
# NOTE(review): geopandas.datasets was deprecated in GeoPandas 0.14 and removed in 1.0;
# on newer versions this line needs a local copy of the Natural Earth data — confirm the
# installed version.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
world.plot(ax=ax, alpha=0.3)

# Textual form of every tile, in the order returned by grid.collect()
gridstr = [row.tile.__str__() for row in grid.collect()]

# One colour per tile; the alpha channel is added when creating the color map
colors = plt.cm.Accent(np.linspace(0, 1, len(gridstr)))
alpha_value = 1.0
color_map = {i: colors[i][:3].tolist() + [alpha_value] for i in range(len(gridstr))}

# Outline every tile in black (XY projection)
for tilestr in gridstr:
    tile: STBox = STBoxWrap(tilestr).set_srid(0)
    tile.plot_xy(axes=ax, color="black", draw_filling=False)

# Plot at most the first `max_rows` rows. The cap must test this loop's own counter:
# testing the tile loop's leftover index would compare a constant and never stop the loop.
max_rows = 100
num_tiles = gp.num_partitions()
for idx, row in enumerate(trajectoriesPart.toLocalIterator()):
    if idx == max_rows:
        break
    # Trajectories with a single instant are skipped
    if row.trajectory.num_instants() <= 1:
        continue
    traj = row.trajectory
    # Colour each trajectory by the tile (partition) it was assigned to
    traj.plot(axes=ax,label="trajectory", color=color_map[row.partitionKey % num_tiles])
plt.title("XY Tile Partition and Trajectory Projection")
plt.xlabel("Lon")
plt.ylabel("Lat")

In [None]:
import pandas as pd
from functools import reduce
from pymeos import *

class MeosWrap:
    """Pickling mixin that serialises an object through its string form.

    ``__getstate__`` returns ``str(self)``; ``__setstate__`` rebuilds an object
    of the same class from that string and adopts its ``_inner`` attribute.
    Both initialize MEOS first, since they may run in a fresh process.
    """

    def __setstate__(self, state):
        """Restore ``_inner`` from the pickled string ``state``.

        Args:
            state: the string previously returned by ``__getstate__``.
        """
        pymeos_initialize()
        #print("Im being unpickled: ", state)
        # Construct through the class: `self` is an instance and is not callable,
        # so `self(state)` would raise TypeError.
        self._inner = type(self)(state)._inner

    def __getstate__(self):
        """Return the string form of the object as its pickle state."""
        pymeos_initialize()
        #print("Im being pickled: ", self.__str__())
        return self.__str__()


class TGeogPointInstWrap(TGeogPointInst, MeosWrap):
    """TGeogPointInst variant that restores itself from a string when unpickled.

    Only ``__setstate__`` is defined here; everything else comes from the bases.
    """

    def __setstate__(self, state):
        """Rebuild the instant from ``state`` and adopt its ``_inner`` attribute.

        Args:
            state: the string form of the instant.
        """
        pymeos_initialize()
        #print("Im being unpickled: ", state)
        rebuilt = TGeogPointInst(state)
        self._inner = rebuilt._inner

class TGeogPointInstWrap2(TGeogPointInst):
    """``TGeogPointInst`` variant whose pickling hooks print what they handle."""

    def __getstate__(self):
        """Print and return the string form of this object as pickled state."""
        text = self.__str__()
        print("Im being pickled: ", text)
        return text

    def __setstate__(self, state):
        """Print ``state`` and restore ``_inner`` from it, passing ``srid=0``.

        :param state: value handed over by pickle; it is passed to the
            ``TGeogPointInst`` constructor.
        """
        print("Im being unpickled: ", state)
        rebuilt = TGeogPointInst(state, srid=0)
        self._inner = rebuilt._inner

pymeos_initialize()

# Three sample instants built through the picklable wrapper class.
# NOTE(review): tpoint2 and tpoint3 carry coordinates outside the usual
# longitude/latitude range for SRID 4326 -- confirm this is intentional.
tpoint = TGeogPointInstWrap(
    "POINT(40.87294006347656 1.9229736328125)@2022-06-27 00:00:00+00",
    srid=4326,
)
tpoint2 = TGeogPointInstWrap(
    "POINT(100.87294006347656 105.9229736328125)@2022-06-28 23:05:00+00",
    srid=4326,
)
tpoint3 = TGeogPointInstWrap(
    "POINT(200.87294006347656 300.9229736328125)@2022-06-29 12:30:00+00",
    srid=4326,
)

# Text of the sample sequence. Both sequences below are parsed from this
# exact same text, so it is written out only once.
sequence_wkt = """
[POINT(26.717873431266625 35.62088788565943)@2022-06-27 00:00:10+00, POINT(26.737689971923828 35.60289001464844)@2022-06-27 00:00:20+00, POINT(26.757373809814453 35.58494567871094)@2022-06-27 00:00:30+00, POINT(26.775684356689453 35.5682373046875)@2022-06-27 00:00:40+00, POINT(26.796297113946146 35.54947675284693)@2022-06-27 00:00:50+00, POINT(26.81591033935547 35.5316162109375)@2022-06-27 00:01:00+00, POINT(26.83492457613032 35.514236708818856)@2022-06-27 00:01:10+00, POINT(26.853098767869017 35.49771066439354)@2022-06-27 00:01:20+00, POINT(26.87454548287899 35.47811217227225)@2022-06-27 00:01:30+00, POINT(26.8938299950133 35.4604688741393)@2022-06-27 00:01:40+00, POINT(26.913289820894285 35.4426859192929)@2022-06-27 00:01:50+00, POINT(26.932048391788566 35.42536848682468)@2022-06-27 00:02:00+00, POINT(26.95144977975399 35.407585531978285)@2022-06-27 00:02:10+00, POINT(26.971492767333984 35.38920593261719)@2022-06-27 00:02:20+00, POINT(26.990135679853726 35.372159278998936)@2022-06-27 00:02:30+00, POINT(27.009770819481385 35.35419011520127)@2022-06-27 00:02:40+00, POINT(27.02870470412234 35.33682613049523)@2022-06-27 00:02:50+00, POINT(27.04782485961914 35.31925964355469)@2022-06-27 00:03:00+00, POINT(27.066981538813167 35.30177229541843)@2022-06-27 00:03:10+00, POINT(27.086266050947476 35.284082445047666)@2022-06-27 00:03:20+00, POINT(27.10470199584961 35.26702880859375)@2022-06-27 00:03:30+00, POINT(27.12261199951172 35.25054931640625)@2022-06-27 00:03:40+00, POINT(27.143783569335938 35.23109436035156)@2022-06-27 00:03:50+00, POINT(27.162967604033803 35.21342468261719)@2022-06-27 00:04:00+00, POINT(27.1823618363361 35.19561767578125)@2022-06-27 00:04:10+00, POINT(27.200522909359055 35.17881774902344)@2022-06-27 00:04:20+00, POINT(27.220745086669922 35.16025349245233)@2022-06-27 00:04:30+00, POINT(27.239112854003906 35.14326192564884)@2022-06-27 00:04:40+00, POINT(27.25788116455078 35.12599104541843)@2022-06-27 00:04:50+00, POINT(27.277507781982422 
35.10783567266949)@2022-06-27 00:05:00+00, POINT(27.295703887939453 35.09093721034163)@2022-06-27 00:05:10+00, POINT(27.315150669642858 35.073028564453125)@2022-06-27 00:05:20+00, POINT(27.333412170410156 35.056116136453916)@2022-06-27 00:05:30+00, POINT(27.351248604910715 35.039520263671875)@2022-06-27 00:05:40+00, POINT(27.370698889907526 35.02143859863281)@2022-06-27 00:05:50+00, POINT(27.389373779296875 35.004024182335804)@2022-06-27 00:06:00+00, POINT(27.408599853515625 34.98605501853813)@2022-06-27 00:06:10+00, POINT(27.427139282226562 34.96878413830773)@2022-06-27 00:06:20+00, POINT(27.445865553252553 34.95135498046875)@2022-06-27 00:06:30+00, POINT(27.464618682861328 34.9339165121822)@2022-06-27 00:06:40+00, POINT(27.484760284423828 34.91515596034163)@2022-06-27 00:06:50+00, POINT(27.50343167051977 34.897796630859375)@2022-06-27 00:07:00+00, POINT(27.52246856689453 34.88010212526483)@2022-06-27 00:07:10+00, POINT(27.539806365966797 34.863901946504235)@2022-06-27 00:07:20+00, POINT(27.55550462372449 34.84918212890625)@2022-06-27 00:07:30+00, POINT(27.57473069794324 34.83119201660156)@2022-06-27 00:07:40+00, POINT(27.59176254272461 34.81525485798464)@2022-06-27 00:07:50+00, POINT(27.59176254272461 34.81525485798464)@2022-06-27 00:12:40+00)

"""

tpointseq = TGeogPointSeq(sequence_wkt)

tpointseq2 = TGeogPointSeq(sequence_wkt)


def bds(boxes: pd.Series):
    """Fold ``boxes`` into one value by chaining ``.union`` left to right.

    :param boxes: iterable of objects exposing ``union(other)``.
    :return: the accumulated union; a single-element input is returned as is.
    :raises TypeError: raised by ``reduce`` when ``boxes`` is empty.
    """
    def _merge(accumulated, nxt):
        return accumulated.union(nxt)

    return reduce(_merge, boxes)

def bb(x):
    """Call ``pymeos_initialize()`` and return ``x.bounding_box()``.

    :param x: object exposing a ``bounding_box()`` method.
    :return: whatever ``x.bounding_box()`` returns.
    """
    pymeos_initialize()
    bounds = x.bounding_box()
    return bounds

# `pickle` is used below but is not imported explicitly in the visible part
# of this notebook; import it here so the cell does not depend on a star
# import happening to expose it.
import pickle

dftest = [tpoint, tpoint2, tpoint3]

# Feed both sample sequences to the extent aggregator and print the result.
tagg = TemporalPointExtentAggregator.start_aggregation()
tagg.add(tpointseq)
tagg.add(tpointseq2)

agg = tagg.aggregation()
print(agg)
print("Im the bounds..", )

# Round-trip one wrapped instant through pickle; this exercises the
# __getstate__/__setstate__ hooks of the wrapper class.
hexa = pickle.dumps(tpoint)
pickle.loads(hexa)

# Last expression of the cell: shown as the cell output.
TGeogPointInst('Point(44.53415175615731 -88.8822081030869)@2022-06-27 00:00:10')