In [1]:
from pymeos import pymeos_initialize, pymeos_finalize, TGeogPointInst, TGeogPointSeq
from pysparkmeos.UDT import TGeogPointInstUDT

# Important: always initialize the MEOS library before using any PyMEOS type.
pymeos_initialize()

try:
    # Build a temporal geography point sequence directly from its text form.
    sequence_from_string = TGeogPointSeq(string='[Point(10.0 10.0)@2019-09-01 00:00:00+01, Point(20.0 20.0)@2019-09-02 00:00:00+01, Point(10.0 10.0)@2019-09-03 00:00:00+01]')
    print(f'Output: {sequence_from_string}')

    # Build a sequence from individual instants instead, passing both
    # lower_inc and upper_inc as True.
    sequence_from_points = TGeogPointSeq(
        instant_list=[
            TGeogPointInst(string='Point(10.0 10.0)@2019-09-01 00:00:00+01'),
            TGeogPointInst(string='Point(20.0 20.0)@2019-09-02 00:00:00+01'),
            TGeogPointInst(string='Point(10.0 10.0)@2019-09-03 00:00:00+01'),
        ],
        lower_inc=True,
        upper_inc=True,
    )
    speed = sequence_from_points.speed()
    print(f'Speeds: {speed}')

    # A single instant built from a longitude/latitude pair and a timestamp.
    point = TGeogPointInst("POINT(-85.5262662926499 33.139048430879235)@2022-06-27 00:00:00+00")
    print(point)
finally:
    # Always finalize MEOS at the end, even if one of the calls above raised.
    pymeos_finalize()

ModuleNotFoundError: No module named 'pysparkmeos'

In [2]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9.7 (from pyspark)
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl.metadata (1.5 kB)
Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.5/200.5 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488513 sha256=c92e97ce36351c5ea3b3c6aaaaca67970d73a142b17abaa4b95ecba617a7b688
  Stored in directory: /root/.cache/pip/wheels/92/09/11/aa01d01a7f005fda8a66ad71d2be7f8aa341bddafb27eee3c7
Successfully built pyspark
Installing collected packages: py4j, pyspar

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql.functions import from_unixtime, col, udf, collect_list, count
from pymeos import pymeos_initialize, pymeos_finalize, TGeogPointInst, TGeogPointSeq

def main():
    """Run the PyMEOS-on-PySpark demo end to end.

    Steps:
      1. Initialize PyMEOS and create (or reuse) a local Spark session.
      2. Read the ``icao24``, ``time``, ``lat`` and ``lon`` columns from a CSV
         file and drop rows without coordinates.
      3. Add a ``Point`` column holding the text form of a ``TGeogPointInst``
         built from each row.
      4. Group rows by ``icao24``, build one ``TGeogPointSeq`` per group and
         show the raw row count next to the sequence's ``num_instants()``.

    Everything is printed or shown; the function returns ``None``.
    """
    # Initialize PyMEOS
    pymeos_initialize("UTC")

    # Initialize a Spark session
    spark = (
        SparkSession.builder
        .appName("PySpark UDF Example with PyMEOS")
        .master("local[*]")
        .config("spark.default.parallelism", 4)
        .getOrCreate()
    )

    # Define the UDF for creating a TGeogPointInst
    @udf(returnType=StringType())
    def create_point_udf(lat, lon, time):
        """Return the text form of the TGeogPointInst at (lon, lat) and ``time``."""
        # UDF bodies execute in Spark's Python workers rather than in the
        # driver, so MEOS is initialized here as well.
        pymeos_initialize("UTC")
        return str(TGeogPointInst(f"Point({lon} {lat})@{time}"))

    @udf(returnType=IntegerType())
    def create_pointseq_instants_udf(lats, lons, times):
        """Return ``num_instants()`` of the TGeogPointSeq built from the lists.

        ``lats``, ``lons`` and ``times`` are parallel lists; their entries are
        zipped together and ordered by time before the sequence is built.
        """
        # Initialize PyMEOS
        pymeos_initialize("UTC")

        # Combine lat, lon, and time into a list of tuples and sort them by time
        combined = sorted(zip(lats, lons, times), key=lambda x: x[2])

        points_inst_list = [TGeogPointInst(f"Point({lon} {lat})@{time}") for lat, lon, time in combined]
        point_seq = TGeogPointSeq(instant_list=points_inst_list, lower_inc=True, upper_inc=True)
        return point_seq.num_instants()

    # Get the value of 'spark.default.parallelism'
    default_parallelism = spark.sparkContext.getConf().get("spark.default.parallelism")
    print(f"spark.default.parallelism: {default_parallelism}")

    # Read data from a CSV file
    data_path = "data/states_2022-06-27-00.csv"  # Update this with your CSV file path
    df = spark.read.csv(data_path, header=True, inferSchema=True).select("icao24", "time", "lat", "lon")

    # Rows without coordinates cannot be turned into points.
    df = df.dropna(subset=["lat", "lon"])
    # Convert the 'time' column to the correct format
    # NOTE(review): from_unixtime presumably renders in the Spark session time
    # zone while MEOS is initialized with "UTC" — confirm the two agree.
    df = df.withColumn("time", from_unixtime(col("time"), "yyyy-MM-dd' 'HH:mm:ss"))

    # Apply the UDF to the DataFrame to create a new 'Point' column
    df_with_points = df.withColumn("Point", create_point_udf("lat", "lon", "time"))

    num_partitions = df.rdd.getNumPartitions()
    print(f"Number of partitions in the DataFrame: {num_partitions}")

    # Show the transformed DataFrame
    print("Transformed DataFrame with Points:")
    df_with_points.show()

    # Group by 'icao24' and aggregate 'lat', 'lon', and 'time' into separate lists
    df_grouped = df_with_points.groupby("icao24").agg(
        collect_list("lat").alias("lats"),
        collect_list("lon").alias("lons"),
        collect_list("time").alias("times"),
        count("time").alias("SeqInstantsTotal")
    )

    # Apply the UDF to convert the lists of latitudes, longitudes, and timestamps to a TGeogPointSeq
    # NOTE(review): the recorded output shows PointSeqInstants below
    # SeqInstantsTotal for most groups; presumably the sequence constructor
    # drops redundant instants — confirm against the PyMEOS documentation.
    df_with_pointseq = df_grouped.withColumn(
        "PointSeqInstants",
        create_pointseq_instants_udf(col("lats"), col("lons"), col("times"))
    ).select("icao24", "SeqInstantsTotal", "PointSeqInstants")

    # Show the result
    print("DataFrame with Point Sequences:")
    df_with_pointseq.show(truncate=False)

# Entry point: run the demo when this code executes as "__main__".
if __name__ == "__main__":
    main()

spark.default.parallelism: 4


                                                                                

Number of partitions in the DataFrame: 4
Transformed DataFrame with Points:


                                                                                

+------+-------------------+-------------------+-------------------+--------------------+
|icao24|               time|                lat|                lon|               Point|
+------+-------------------+-------------------+-------------------+--------------------+
|34718e|2022-06-27 00:00:00|  40.87294006347656|    1.9229736328125|POINT(1.922973632...|
|ac6364|2022-06-27 00:00:00| 33.139048430879235|  -85.5262662926499|POINT(-85.5262662...|
|406471|2022-06-27 00:00:00|   51.9085693359375| 1.8383026123046875|POINT(1.838302612...|
|a04417|2022-06-27 00:00:00|  33.75718688964844| -83.45837028659122|POINT(-83.4583702...|
|c04aa1|2022-06-27 00:00:00|  42.86952209472656|  -79.3079393963481|POINT(-79.3079393...|
|4d21ea|2022-06-27 00:00:00|  48.92765808105469|   8.00579364483173|POINT(8.005793644...|
|4ca9cc|2022-06-27 00:00:00|  47.56517028808594|  8.388679504394531|POINT(8.388679504...|
|a20f1a|2022-06-27 00:00:00|  41.99859101893538| -77.70578783611916|POINT(-77.7057878...|
|152019|20

[Stage 26:>                                                         (0 + 1) / 1]

+------+----------------+----------------+
|icao24|SeqInstantsTotal|PointSeqInstants|
+------+----------------+----------------+
|0100a3|167             |142             |
|0100e7|122             |115             |
|0100f6|112             |56              |
|0101bb|73              |37              |
|010207|76              |48              |
|01022e|359             |355             |
|02a1a7|165             |123             |
|02a1ca|4               |4               |
|04015c|215             |121             |
|040172|161             |149             |
|040203|181             |141             |
|06809b|276             |232             |
|06a03d|143             |114             |
|06a041|155             |127             |
|06a04e|58              |24              |
|06a062|140             |132             |
|06a064|31              |26              |
|06a09a|78              |32              |
|06a0a5|131             |33              |
|06a0a8|305             |252             |
+------+---

                                                                                