In [1]:
# Install the package located in the current working directory using pip
# from the `python3` found on the shell PATH.
# NOTE(review): the recorded output of this cell shows the command failed
# because '.' contains neither 'setup.py' nor 'pyproject.toml' -- presumably
# the notebook has to be run from (or pointed at) the project root; confirm
# the intended directory before re-running.
!python3 -m pip install .

[31mERROR: Directory '.' is not installable. Neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m

In [2]:
from pymeos import pymeos_initialize, pymeos_finalize, TGeogPointInst, TGeogPointSeq

# MEOS must be initialised before any PyMEOS object is created.
pymeos_initialize()

# 1) Parse a whole geography-point sequence from its textual representation.
sequence_from_string = TGeogPointSeq(
    string='[Point(10.0 10.0)@2019-09-01 00:00:00+01, Point(20.0 20.0)@2019-09-02 00:00:00+01, Point(10.0 10.0)@2019-09-03 00:00:00+01]'
)
print(f'Output: {sequence_from_string}')

# 2) Build a sequence from three individually parsed instants, with both
#    bounds of the sequence inclusive.
instant_texts = [
    'Point(10.0 10.0)@2019-09-01 00:00:00+01',
    'Point(20.0 20.0)@2019-09-02 00:00:00+01',
    'Point(10.0 10.0)@2019-09-03 00:00:00+01',
]
sequence_from_points = TGeogPointSeq(
    instant_list=[TGeogPointInst(string=text) for text in instant_texts],
    lower_inc=True,
    upper_inc=True,
)

# Speed of the instant-built sequence.
speed = sequence_from_points.speed()
print(f'Speeds: {speed}')

# 3) A single instant created from a positional text argument.
point = TGeogPointInst(
    "POINT(-85.5262662926499 33.139048430879235)@2022-06-27 00:00:00+00"
)
print(point)

# 4) Time span covered by the instant-built sequence; also kept in `ts`.
print(sequence_from_points.timespan())
ts = sequence_from_points.timespan()


Output: [POINT(10 10)@2019-08-31 23:00:00+00, POINT(20 20)@2019-09-01 23:00:00+00, POINT(10 10)@2019-09-02 23:00:00+00]
Speeds: Interp=Step;[17.84556057812839@2019-08-31 23:00:00+00, 17.84556057812839@2019-09-02 23:00:00+00]
POINT(-85.5262662926499 33.139048430879235)@2022-06-27 00:00:00+00
[2019-08-31 23:00:00+00, 2019-09-02 23:00:00+00]


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql.functions import from_unixtime, col, udf, collect_list, count, monotonically_increasing_id
from pymeos import pymeos_initialize, pymeos_finalize, TGeogPointInst, TGeogPointSeq
from pysparkmeos.partitions.grid.grid_partitioner import GridPartition

import random

def main():
    """Run the PySpark + PyMEOS demo pipeline.

    Steps, in order:
      1. Initialise MEOS and create (or reuse) a local Spark session.
      2. Read the flight-state CSV, keep ``icao24``/``time``/``lat``/``lon``,
         drop rows without coordinates and format ``time`` as text.
      3. Add a ``Point`` column holding the string form of a
         ``TGeogPointInst`` built from each row, and show it.
      4. Group rows by ``icao24`` and, per group, count the instants of the
         ``TGeogPointSeq`` built from the collected values.
      5. Repartition the grouped rows into 10 RDD partitions and print how
         many rows landed in each one.
      6. Show the per-aircraft instant counts.

    The Spark session is stopped and MEOS is finalised when the function
    exits, including when it exits through an exception or an interrupt.

    Returns:
        None. All results are printed / shown.
    """
    import zlib  # local import: only needed by the partition function below

    # Number of partitions used for the RDD repartitioning demo.
    num_rdd_partitions = 10

    # Initialize PyMEOS (driver side)
    pymeos_initialize("UTC")

    spark = None
    try:
        # Initialize a Spark session.
        # The session time zone is pinned to UTC because `from_unixtime`
        # renders timestamps in the session time zone, while the resulting
        # offset-less strings are parsed by MEOS initialised with "UTC".
        spark = (
            SparkSession.builder
            .appName("PySpark UDF Example with PyMEOS")
            .master("local[*]")
            .config("spark.default.parallelism", 4)
            .config("spark.sql.session.timeZone", "UTC")
            .getOrCreate()
        )

        # Define the UDF for creating a TGeogPointInst
        @udf(returnType=StringType())
        def create_point_udf(lat, lon, time):
            """Return the string form of the TGeogPointInst for one row."""
            # MEOS is initialised inside the UDF -- presumably because the UDF
            # runs in Spark's Python workers rather than in the driver process.
            pymeos_initialize("UTC")
            point_inst = TGeogPointInst(f"Point({lon} {lat})@{str(time)}")
            return str(point_inst)

        @udf(returnType=IntegerType())
        def create_pointseq_instants_udf(lats, lons, times):
            """Return the number of instants of the sequence built from the lists."""
            # Initialize PyMEOS (see note in create_point_udf)
            pymeos_initialize("UTC")

            # Combine lat, lon, and time into a list of tuples and sort them by time
            combined = sorted(zip(lats, lons, times), key=lambda x: x[2])

            points_inst_list = [
                TGeogPointInst(f"Point({lon} {lat})@{time}")
                for lat, lon, time in combined
            ]
            point_seq = TGeogPointSeq(
                instant_list=points_inst_list, lower_inc=True, upper_inc=True
            )
            return point_seq.num_instants()

        # Get the value of 'spark.default.parallelism'
        default_parallelism = spark.sparkContext.getConf().get("spark.default.parallelism")
        print(f"spark.default.parallelism: {default_parallelism}")

        # Read data from a CSV file
        data_path = "../../states_2022-06-27-00.csv"  # Update this with your CSV file path
        df = spark.read.csv(data_path, header=True, inferSchema=True).select(
            "icao24", "time", "lat", "lon"
        )

        df = df.dropna(subset=["lat", "lon"])
        # Convert the 'time' column (Unix seconds) to 'yyyy-MM-dd HH:mm:ss' text
        df = df.withColumn("time", from_unixtime(col("time"), "yyyy-MM-dd' 'HH:mm:ss"))

        # Apply the UDF to the DataFrame to create a new 'Point' column
        df_with_points = df.withColumn("Point", create_point_udf("lat", "lon", "time"))

        num_partitions = df.rdd.getNumPartitions()
        print(f"Number of partitions in the DataFrame: {num_partitions}")

        # Show the transformed DataFrame
        print("Transformed DataFrame with Points:")
        df_with_points.show()

        # Group by 'icao24' and aggregate 'lat', 'lon', and 'time' into separate lists
        df_grouped = df_with_points.groupby("icao24").agg(
            collect_list("lat").alias("lats"),
            collect_list("lon").alias("lons"),
            collect_list("time").alias("times"),
            count("time").alias("SeqInstantsTotal")
        )

        # Apply the UDF to convert the lists of latitudes, longitudes, and
        # timestamps to a TGeogPointSeq and keep its number of instants
        df_with_pointseq = df_grouped.withColumn(
            "PointSeqInstants",
            create_pointseq_instants_udf(col("lats"), col("lons"), col("times"))
        ).select("icao24", "SeqInstantsTotal", "PointSeqInstants")

        def partitioner(key):
            """Map a key to a stable non-negative integer.

            A partition function must return the same value for the same key
            every time it is evaluated (tasks can be retried or recomputed),
            so a CRC32 of the key text is used instead of a random number.
            ``partitionBy`` reduces the value modulo the partition count.
            """
            return zlib.crc32(str(key).encode("utf-8"))

        #grid_partitioner = GridPartitioner(partition_dimensions = [3, 3])
        partitionedRDD = (
            df_grouped.rdd
            .map(lambda row: (row[0], row))  # key each row by its first column (icao24)
            .partitionBy(num_rdd_partitions, partitioner)
        )

        # Function to count rows per partition
        def count_in_partition(idx, iterator):
            """Return ``[(partition index, number of rows in the partition)]``."""
            return [(idx, sum(1 for _ in iterator))]

        # Using mapPartitionsWithIndex to count rows per partition
        partition_counts = partitionedRDD.mapPartitionsWithIndex(count_in_partition).collect()

        # Print the results
        for partition_id, cnt in partition_counts:
            print(f"Partition {partition_id} has {cnt} rows")

        # Show the result
        print("DataFrame with Point Sequences:")
        df_with_pointseq.show(truncate=False)
    finally:
        # Release the Spark session and MEOS even on error or KeyboardInterrupt.
        if spark is not None:
            spark.stop()
        pymeos_finalize()

# Entry point: run the pipeline only when this code executes as the top-level
# module (`__name__` is "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

KeyboardInterrupt: 