# BerlinMOD Queries

So far we have replicated the BerlinMOD Pymeos tutorial using Pyspark. Now we will execute the BerlinMOD queries.

In [1]:
cd "../mobilitydb-berlinmod-sf0.1/"

/data/mobilitydb-berlinmod-sf0.1


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
ls

[0m[01;32minstants.csv[0m*  [01;32mpoints.csv[0m*       [01;32mtrips.csv[0m*       [01;32mvehicles.csv[0m*
[01;32mlicences.csv[0m*  [01;32mregions.csv[0m*      trips_small.csv  vehicles_small.csv
[01;32mperiods.csv[0m*   [01;34mspark-warehouse[0m/  vehicle_ids.txt


In [3]:
rm -R spark-warehouse/

## Imports

In [4]:
from pymeos import *
from pymeos.plotters import *

from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

from pysparkmeos.UDT.MeosDatatype import *
from pysparkmeos.partitions.grid.grid_partitioner import GridPartition
from pysparkmeos.utils.udt_appender import udt_append
from pysparkmeos.utils.utils import *


from pysparkmeos.UDF.udf import *
from pysparkmeos.partitions.mobilityrdd import MobilityRDD


import random, datetime, os, sys
from datetime import timedelta
from functools import partial
from datetime import datetime, timezone
import contextily as cx
import distinctipy
import geopandas as gpd
import pandas as pd
import shapely.geometry as shp

import matplotlib.pyplot as plt
import numpy as np
from shapely import wkb, box, from_wkb
from typing import Union
from time import time

## Spark Initialization

In [5]:
def startspark():
    # Initialize PyMEOS
    pymeos_initialize("UTC")
    
    os.environ['PYSPARK_DRIVER_PYTHON_OPTS']= "notebook"
    os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
    os.environ['PYSPARK_PYTHON'] = sys.executable
    
    #.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
    # Initialize a Spark session
    spark = SparkSession.builder \
        .appName("BerlinMOD with PySpark") \
        .master("local[3]") \
        .config("spark.default.parallelism", 3) \
        .config("spark.executor.memory", "3g") \
        .config("spark.executor.cores", 1) \
        .config("spark.driver.memory", "2g") \
        .config("spark.driver.maxResultSize", 0) \
        .config("spark.sql.allowMultipleTableArguments.enabled", True) \
        .getOrCreate()
    
    # spark.sparkContext.setLogLevel("INFO")
    
    # Append the UDT mapping to the PyMEOS classes
    udt_append()
    
    # Get the value of 'spark.default.parallelism'
    default_parallelism = spark.sparkContext.getConf().get("spark.default.parallelism")
    print(f"spark.default.parallelism: {default_parallelism}")

    # Register udfs in Spark SQL
    register_udfs_under_spark_sql(spark)
    
    return spark

spark = startspark()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/25 18:21:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/05/25 18:21:18 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


spark.default.parallelism: 3


## Load Tables
We will use the power of Spark SQL to read in the raw dataframes and then create the tables.

In [6]:
ls

[0m[01;32minstants.csv[0m*  [01;32mperiods.csv[0m*  [01;32mregions.csv[0m*  trips_small.csv  [01;32mvehicles.csv[0m*
[01;32mlicences.csv[0m*  [01;32mpoints.csv[0m*   [01;32mtrips.csv[0m*    vehicle_ids.txt  vehicles_small.csv


In [7]:
def load_table(
    spark, 
    path, 
    tablename, 
    partition_key=None, 
    transformation_query=None,
    partition_query = None,
    partitioner_class = None,
    partitioner_args = {},
    **kwargs
):
    print("Reading raw csv ", path)
    rawdf = spark.read.csv(path, **kwargs)

    print("Creating temp view of raw table")
    rawdf.createOrReplaceTempView(f"{tablename}RawNoCache")

    print("Schema and statistics of raw table")
    rawdf.printSchema()
    rawdf.describe().show()
    print(f"Creating final table {tablename} based on {tablename}Raw, partitioned by {partition_key}.")
    spark.sql(f"""DROP TABLE IF EXISTS {tablename}""")

    if transformation_query:
        rawdf = spark.sql(transformation_query)
        rawdf.createOrReplaceTempView(f"{tablename}RawNoCache")
        spark.sql(f"CACHE TABLE {tablename}Raw SELECT * FROM {tablename}RawNoCache")
        spark.sql(f"SELECT * FROM {tablename}Raw LIMIT 5").show()
        #spark.catalog.dropTempView(f"{tablename}RawNoCache")
    else:
        spark.sql(f"CACHE TABLE {tablename}Raw SELECT * FROM {tablename}RawNoCache")
        #spark.catalog.dropTempView(f"{tablename}RawNoCache")
    partitioner = None
    if partition_query:
        bounds = rawdf.rdd.mapPartitions(bounds_calculate_map).reduce(bounds_calculate_reduce)
        print("Bounds: ", bounds)
        partitioner = partitioner_class(bounds=bounds, **partitioner_args)
        grid = partitioner.as_spark_table()
        grid.cache()
        grid.show()
        grid.createOrReplaceTempView("grid")
        print("Creating partitioned table... ")
        partitionedTable = spark.sql(partition_query)
        partitionedTable.createOrReplaceTempView(f"{tablename}Raw")


    start = time()
    
    if partition_key:
        df = spark.sql(f"""
        CREATE TABLE {tablename}NoCache
        USING parquet
        PARTITIONED BY ({partition_key})
        AS SELECT * FROM {tablename}Raw
        """)
    else:
        df = spark.sql(f"""
        CREATE TABLE {tablename}NoCache
        USING parquet
        AS SELECT * FROM {tablename}Raw
        """)
        
    end = time()
    
    if partition_key:
        print(f"{tablename} partitions:")
        spark.sql(f"""
        SHOW PARTITIONS {tablename}NoCache
        """).show()
    print(f"Final table created in {end-start} seconds")

    spark.sql(f"CACHE TABLE {tablename} SELECT * FROM {tablename}NoCache")

    df = spark.table(f"{tablename}")
    
    print(f"Final table {tablename} schema:")
    df.printSchema()
    
    #Drop the temporary view
    #spark.catalog.dropTempView(f"{tablename}Raw")
    return df, (start, end, end-start)


def load_all_tables(configs):
    tables = {}
    stats = {}
    for tablename, config in configs.items():
        table, stat = load_table(**config)
        tables[tablename] = table
        stats[tablename] = stat
    return tables, stats

### Instants

In [8]:
#instants, statsinstants = load_table(spark, "instants.csv", 'instants', inferSchema=True, header=True)
#instants.show()

### Licences

In [9]:
#licences, statslicences = load_table(spark, "licences.csv", 'licences', inferSchema=True, header=True)
#licences.show()

### Periods

In [10]:
transperiod = """
SELECT periodid, beginp, endp, tstzspan(period) AS period FROM periodsRawNoCache
"""
#periods, statsperiods = load_table(spark, "periods.csv", 'periods', transformation_query=transperiod, inferSchema=True, header=True)
#periods.show()

### Points

In [11]:
transpoints = """
SELECT pointid, posx, posy, geometry_from_hexwkb(geom) AS geom FROM pointsRawNoCache
"""
#points, statspoints = load_table(spark, "points.csv", 'points', transformation_query=transpoints, inferSchema=True, header=True)
#points.show()

### Regions

In [12]:
transregions = "SELECT regionid, geometry_from_hexwkb(geom) AS geom FROM regionsRawNoCache"
#regions, statsregions = load_table(spark, "regions.csv", 'regions', transformation_query=transregions, inferSchema=True, header=True)
#regions.show()

### Trips
Note: Use trips_small for testing.

In [13]:
from pyspark.sql.types import Row
from typing import Iterator

schema = StructType([
    StructField("tripid", IntegerType()),
    StructField("vehid", IntegerType()),
    StructField("day", IntegerType()),
    StructField("seqno", IntegerType()),
    StructField("sourcenode", IntegerType()),
    StructField("targetnode", StringType()),
    StructField("trip", TGeomPointSeqSetUDT()),
    StructField("trajectory", GeometryUDT()),
    StructField("license", StringType()),
    StructField("partitionKey", IntegerType())
])

@F.udtf(returnType=schema)
class PartitionUDTF:
    def eval(self, row: Row):
        pymeos_initialize()
        sequence_id = row.tripid
        trajectory = row.trip
        #print(trajectory,type(trajectory))
        #print(row.grid[0])
        #grid = [STBoxWrap(f"SRID=4326;{tile.__str__().strip('SRID=4326;')}") for tile in row.grid]
        grid = row.grid
        gridids = row.gridids
        partitioned = [(key, trajectory.at(tile)) for key, tile in zip(gridids, grid)]
        #print(trajectory)
        #print(grid)
        #print(partitioned)
        count = 0
        responses = []
        for partition_key, partition_traj in partitioned:
            count += 1
            if partition_traj is None:
                continue
            else:
                response = (sequence_id, row.vehid, row.day, row.seqno, row.sourcenode, row.targetnode, partition_traj, row.trajectory, row.licence, partition_key)
                yield response
                #seqs = partition_traj.segments()
                #print(seqs)
                #for partition_traj_seq in seqs:
                #    response = (sequence_id, row.vehid, row.day, row.seqno, row.sourcenode, row.targetnode, partition_traj_seq, row.trajectory, row.licence, partition_key)
                    #responses.append(response)
                    #yield response
        #for response in responses:
        #    yield response


spark.udtf.register("PartitionUDTF", PartitionUDTF)

<pyspark.sql.udtf.UserDefinedTableFunction at 0x7fffbcaef8b0>

In [14]:
# We need to add the transformation query to the trips table and inject it to the config.
parttrips = """
    SELECT * 
    FROM PartitionUDTF(
        TABLE(
                SELECT 
                    *, 
                    (SELECT collect_list(tile) FROM grid) AS grid, 
                    (SELECT collect_list(tileid) FROM grid) AS gridids
                FROM tripsRaw
        )
    )
"""

In [15]:

transtrips = "SELECT tripid, vehid, day, seqno, sourcenode, targetnode, trip_from_hexwkb(trip) AS trip, geometry_from_hexwkb(trajectory) AS trajectory, licence FROM tripsRawNoCache"

"""
trips, statstrips = load_table(
    spark, "trips_small.csv", 'trips', 
    transformation_query=transtrips,
    partition_key= 'partitionKey',
    partition_query=parttrips,
    partitioner_class=GridPartition,
    partitioner_args = {'cells_per_side': 3},
    inferSchema=True,
    header=True
)
trips.show()
"""

'\ntrips, statstrips = load_table(\n    spark, "trips_small.csv", \'trips\', \n    transformation_query=transtrips,\n    partition_key= \'partitionKey\',\n    partition_query=parttrips,\n    partitioner_class=GridPartition,\n    partitioner_args = {\'cells_per_side\': 3},\n    inferSchema=True,\n    header=True\n)\ntrips.show()\n'

### Vehicles
Note: Also read vehicles_small for testing.

In [16]:
#vehicles, statsvehicles = load_table(spark, "vehicles_small.csv", 'vehicles', inferSchema=True, header=True)
#vehicles.show()

## Queries

First queries take a general approach and are only used to measure overall performance.

In [17]:
from time import time
def query_exec(query, spark, execute=True, explain=False, explainmode=''):
    plan = None
    if explain:
        plan = spark.sql(f"EXPLAIN {explainmode} {query}").collect()[0].plan
    result = spark.sql(query)
    start = time()
    if execute:
        result.show()
    end = time()
    print("Query execution time: ", end-start, " seconds.")
    return result, (start, end, end-start), plan


def retrieve_exec_stats(queries, starts, ends, durations, plans):
    return pd.DataFrame({"queries": queries, "start": starts, "end": ends, "duration": durations, "plan": plans})


def run_all_queries(queries, spark, execute=True, explain=True, explainmode='', printplan=False):
    """ Utility function to run all queries through subsequent experiments """
    qdfs = []
    starts = []
    ends = []
    durations = []
    plans = []
    for querytext in queries:
        qdf, qstats, plan = query_exec(querytext, spark, execute, explain, explainmode)
        qdfs.append(qdf)
        starts.append(qstats[0])
        ends.append(qstats[1])
        durations.append(qstats[2])
        plans.append(plan)
        if printplan:
            print(plan)
    exec_stats = retrieve_exec_stats(queries, starts, ends, durations, plans)
    return qdfs, exec_stats

### Query 1: What are the models of the vehicles with licence plate numbers from QueryLicences?

In [18]:
querytext1 = """
    SELECT l.licence, v.model
    FROM licences l, vehicles v
    WHERE l.licence = v.licence
"""
#q1, q1stats, plan1 = query_exec(querytext1, spark, explain=True)
#if plan1:
#    print(plan1)

### Query 2: How many vehicles exist that are 'passenger' cars?

In [19]:
querytext2 = """
    SELECT COUNT(licence) AS PassengerCarCount
    FROM vehicles
    WHERE type='passenger'
"""
#q2, q2stats, plan2 = query_exec(querytext2, spark, explain=True)
#if plan2:
#    print(plan2)

### Query 3: Where have the vehicles with licences from QueryLicences1 been at each of the instants from QueryInstants1?

In [20]:
from datetime import datetime

querytext3 = """
    WITH
    veh_w_lic AS (
        SELECT v.vehid, l.licence, v.model
        FROM licences l, vehicles v
        WHERE l.licence = v.licence
    ),
    veh_trips AS (
        SELECT t.* 
        FROM veh_w_lic vw, trips t
        WHERE t.vehid = vw.vehid
    ),
    tile_instants AS (
        SELECT /*+ BROADCAST(gr) */ gr.tile, i.instant
        FROM grid gr, instants i
        WHERE contains_stbox_stbox(gr.tile, i.instant) = TRUE
    )
    SELECT /*+ BROADCAST(i) */ vt.vehid, vt.tripid, vt.trip, i.instant, tpoint_at(vt.trip, i.instant) AS pos
    FROM veh_trips vt, tile_instants i
"""
#q3, q3stats, plan3 = query_exec(querytext3, spark, explain=True)
#if plan3:
#    print(plan3)

### Query 4: Which licence plate numbers belong to vehicles that have passed the points from QueryPoints?

In [21]:
querytext4 = """
    WITH 
    vehids_intersect AS (
        SELECT t.vehid
        FROM trips t, points p
        WHERE ever_touches(t.trip, p.geom) = TRUE
    )
    SELECT vi.vehid, v.licence
    FROM vehids_intersect vi, vehicles v
"""
#q4, q4stats, plan4 = query_exec(querytext4, spark, explain=True)
#if plan4:
#    print(plan4)

In [22]:
dummy_df = spark.createDataFrame(["0"], "string").toDF("temp_clm")
dummy_df.show()

24/05/25 18:21:31 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


+--------+
|temp_clm|
+--------+
|       0|
+--------+



In [23]:
queries = [querytext1, querytext2] #, querytext3] #, querytext4]

## Experiments

### Experiment 1: Run Queries ASIS
First we are going to run the queries without any improvement or partitioning strategy.

In [24]:
run_exp_1 = False

#### Create Tables

In [25]:
rm -R spark-warehouse/

/bin/rm: cannot remove 'spark-warehouse/': No such file or directory


In [26]:
configs_exp1 = {
    'instants': {'spark': spark, 'path': 'instants.csv', 'tablename': 'instants', 'inferSchema': True, 'header': True},
    'licences': {'spark': spark, 'path': 'licences.csv', 'tablename': 'licences', 'inferSchema': True, 'header': True},
    'periods':  {'spark': spark, 'path': 'periods.csv', 'tablename': 'periods', 'transformation_query': transperiod, 'inferSchema': True, 'header': True},
    'points':   {'spark': spark, 'path': 'points.csv', 'tablename': 'points', 'transformation_query': transpoints, 'inferSchema': True, 'header': True},
    'regions':  {'spark': spark, 'path': 'regions.csv', 'tablename': 'regions', 'transformation_query':transregions, 'inferSchema': True, 'header': True},
    'trips':    {'spark': spark, 'path': 'trips_small.csv', 'tablename': 'trips', 'transformation_query':transtrips, 'inferSchema': True, 'header': True},
    'vehicles': {'spark': spark, 'path': 'vehicles_small.csv', 'tablename': 'vehicles', 'inferSchema': True, 'header': True}
}

In [27]:
if run_exp_1:
    tables, stats = load_all_tables(configs_exp1)

In [28]:
@F.udtf(returnType="num: int, squared: int")
class SquareNumbers:
    def eval(self, start: int, end: int):
        for num in range(start, end + 1):
            yield (num, num * num)

SquareNumbers(F.lit(1), F.lit(3)).show()

[Stage 2:>                                                          (0 + 1) / 1]

+---+-------+
|num|squared|
+---+-------+
|  1|      1|
|  2|      4|
|  3|      9|
+---+-------+



                                                                                

In [29]:
"""
 |-- tripid: integer (nullable = true)
 |-- vehid: integer (nullable = true)
 |-- day: date (nullable = true)
 |-- seqno: integer (nullable = true)
 |-- sourcenode: integer (nullable = true)
 |-- targetnode: integer (nullable = true)
 |-- trip: pythonuserdefined (nullable = true)
 |-- trajectory: pythonuserdefined (nullable = true)
"""
schema = StructType([
    StructField("point", TGeomPointInstUDT())
])
@F.udtf(returnType=schema)
class ExplodeGeomSeq:
    def eval(self, trip: TGeomPointSeqWrap):
        #print(trip['trip'])
        #trip = trip.trip
        pymeos_initialize()
        instants = trip['trip'].instants()
        for i in instants:
            yield i,

spark.udtf.register("explodeGeomSeq", ExplodeGeomSeq)

<pyspark.sql.udtf.UserDefinedTableFunction at 0x7fffbca9ddf0>

In [30]:
if run_exp_1:
    spark.sql("SELECT * FROM explodeGeomSeq(TABLE(SELECT trip FROM trips))").show()

#### Execute Queries

In [31]:
if run_exp_1:
    qdfs_exp1, stats_exp1 = run_all_queries(queries, spark, explain=True, printplan=True)

In [32]:
if run_exp_1:
    for (_id, rdd) in spark.sparkContext._jsc.getPersistentRDDs().items():
        rdd.unpersist()
        print("Unpersisted {} rdd".format(_id))
spark.stop()

### Experiment 2: Partition Trips by vehid, HashPartitioning

In [33]:
run_exp_2 = False

In [34]:
rm -R rm -R spark-warehouse/

/bin/rm: cannot remove 'rm': No such file or directory
/bin/rm: cannot remove 'spark-warehouse/': No such file or directory


In [35]:
if run_exp_2:
    spark = startspark()

#### Create Tables

In [36]:
configs_exp2 = {
    'instants': {'spark': spark, 'path': 'instants.csv', 'tablename': 'instants', 'inferSchema': True, 'header': True},
    'licences': {'spark': spark, 'path': 'licences.csv', 'tablename': 'licences', 'inferSchema': True, 'header': True},
    'periods':  {'spark': spark, 'path': 'periods.csv', 'tablename': 'periods', 'transformation_query': transperiod, 'inferSchema': True, 'header': True},
    'points':   {'spark': spark, 'path': 'points.csv', 'tablename': 'points', 'transformation_query': transpoints, 'inferSchema': True, 'header': True},
    'regions':  {'spark': spark, 'path': 'regions.csv', 'tablename': 'regions', 'transformation_query':transregions, 'inferSchema': True, 'header': True},
    'trips':    {'spark': spark, 'path': 'trips_small.csv', 'tablename': 'trips', 'partition_key': 'vehid', 'transformation_query':transtrips, 'inferSchema': True, 'header': True},
    'vehicles': {'spark': spark, 'path': 'vehicles_small.csv', 'tablename': 'vehicles', 'inferSchema': True, 'header': True}
}

In [37]:
if run_exp_2:
    tables, stats = load_all_tables(configs_exp2)

#### Execute Queries

In [38]:
if run_exp_2:
    qdfs_exp2, stats_exp2 = run_all_queries(queries, spark, explain=True, printplan=True)

In [39]:
#spark.stop()

### Experiment 3: Partition by Trip, RegularGrid

In [40]:
run_exp_3 = True

In [41]:
rm -R rm -R spark-warehouse/

/bin/rm: cannot remove 'rm': No such file or directory
/bin/rm: cannot remove 'spark-warehouse/': No such file or directory


In [42]:
if run_exp_3:
    spark = startspark()

24/05/25 18:21:39 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


spark.default.parallelism: 3


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 39980)
Traceback (most recent call last):
  File "/usr/lib/python3.9/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.9/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.9/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.9/socketserver.py", line 720, in __init__
    self.handle()
  File "/usr/local/lib/python3.9/dist-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/lib/python3.9/dist-packages/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
  File "/usr/local/lib/python3.9/dist-packages/pyspark/accumulators.py", line 271, in accum_updates
    num_updates = read_int(self.rf

In [43]:
configs_exp3 = {
    'instants': {'spark': spark, 'path': 'instants.csv', 'tablename': 'instants', 'inferSchema': True, 'header': True},
    'licences': {'spark': spark, 'path': 'licences.csv', 'tablename': 'licences', 'inferSchema': True, 'header': True},
    'periods':  {'spark': spark, 'path': 'periods.csv', 'tablename': 'periods', 'transformation_query': transperiod, 'inferSchema': True, 'header': True},
    'points':   {'spark': spark, 'path': 'points.csv', 'tablename': 'points', 'transformation_query': transpoints, 'inferSchema': True, 'header': True},
    'regions':  {'spark': spark, 'path': 'regions.csv', 'tablename': 'regions', 'transformation_query':transregions, 'inferSchema': True, 'header': True},
    'trips':    {
        'spark': spark, 
        'path': 'trips_small.csv', 
        'tablename': 'trips', 
        'partition_key': 'partitionKey', 
        'transformation_query':transtrips,
        'partition_query': parttrips,
        'partitioner_class': GridPartition,
        'partitioner_args': {'cells_per_side': 3},
        'inferSchema': True, 
        'header': True},
    'vehicles': {'spark': spark, 'path': 'vehicles_small.csv', 'tablename': 'vehicles', 'inferSchema': True, 'header': True}
}

In [44]:
if run_exp_3:
    spark.udtf.register("PartitionUDTF", PartitionUDTF)
    tables, stats = load_all_tables(configs_exp3)

Reading raw csv  instants.csv


                                                                                

Creating temp view of raw table
Schema and statistics of raw table
root
 |-- instantid: integer (nullable = true)
 |-- instant: timestamp (nullable = true)

+-------+------------------+
|summary|         instantid|
+-------+------------------+
|  count|               100|
|   mean|              50.5|
| stddev|29.011491975882016|
|    min|                 1|
|    max|               100|
+-------+------------------+

Creating final table instants based on instantsRaw, partitioned by None.


                                                                                

Final table created in 4.297498464584351 seconds
Final table instants schema:
root
 |-- instantid: integer (nullable = true)
 |-- instant: timestamp (nullable = true)

Reading raw csv  licences.csv
Creating temp view of raw table
Schema and statistics of raw table
root
 |-- licenceid: integer (nullable = true)
 |-- licence: string (nullable = true)
 |-- vehid: integer (nullable = true)



                                                                                

+-------+------------------+--------+------------------+
|summary|         licenceid| licence|             vehid|
+-------+------------------+--------+------------------+
|  count|               101|     101|               101|
|   mean|              51.0|    NULL|319.46534653465346|
| stddev|29.300170647967224|    NULL| 175.0106604956644|
|    min|                 1|B-BJ 115|                 9|
|    max|               101|B-[U 177|               622|
+-------+------------------+--------+------------------+

Creating final table licences based on licencesRaw, partitioned by None.


                                                                                

Final table created in 2.0499801635742188 seconds
Final table licences schema:
root
 |-- licenceid: integer (nullable = true)
 |-- licence: string (nullable = true)
 |-- vehid: integer (nullable = true)

Reading raw csv  periods.csv
Creating temp view of raw table
Schema and statistics of raw table
root
 |-- periodid: integer (nullable = true)
 |-- beginp: timestamp (nullable = true)
 |-- endp: timestamp (nullable = true)
 |-- period: string (nullable = true)

+-------+------------------+--------------------+
|summary|          periodid|              period|
+-------+------------------+--------------------+
|  count|               100|                 100|
|   mean|              50.5|                NULL|
| stddev|29.011491975882016|                NULL|
|    min|                 1|[2020-06-01 00:45...|
|    max|               100|[2020-06-11 21:18...|
+-------+------------------+--------------------+

Creating final table periods based on periodsRaw, partitioned by None.


                                                                                

+--------+--------------------+--------------------+--------------------+
|periodid|              beginp|                endp|              period|
+--------+--------------------+--------------------+--------------------+
|       1|2020-06-09 11:15:...|2020-06-09 20:38:...|[2020-06-09 11:15...|
|       2|2020-06-10 10:55:...|2020-06-11 01:01:...|[2020-06-10 10:55...|
|       3|2020-06-04 06:42:...|2020-06-05 02:50:...|[2020-06-04 06:42...|
|       4|2020-06-05 04:39:...|2020-06-06 05:48:...|[2020-06-05 04:39...|
|       5|2020-06-06 09:10:...|2020-06-07 03:59:...|[2020-06-06 09:10...|
+--------+--------------------+--------------------+--------------------+



                                                                                

Final table created in 2.219373941421509 seconds


                                                                                

Final table periods schema:
root
 |-- periodid: integer (nullable = true)
 |-- beginp: timestamp (nullable = true)
 |-- endp: timestamp (nullable = true)
 |-- period: pythonuserdefined (nullable = true)

Reading raw csv  points.csv
Creating temp view of raw table
Schema and statistics of raw table
root
 |-- pointid: integer (nullable = true)
 |-- posx: double (nullable = true)
 |-- posy: double (nullable = true)
 |-- geom: string (nullable = true)



24/05/25 18:22:18 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+-------+------------------+-----------------+-----------------+--------------------+
|summary|           pointid|             posx|             posy|                geom|
+-------+------------------+-----------------+-----------------+--------------------+
|  count|               100|              100|              100|                 100|
|   mean|              50.5|486384.3413598945|6594038.933758076|                NULL|
| stddev|29.011491975882016|7200.526060474747|6552.156274876073|                NULL|
|    min|                 1|472428.0634008836|6577421.541139536|0101000020110F000...|
|    max|               100| 498913.875699313|6607119.513588189|0101000020110F000...|
+-------+------------------+-----------------+-----------------+--------------------+

Creating final table points based on pointsRaw, partitioned by None.


                                                                                

+-------+------------------+-----------------+--------------------+
|pointid|              posx|             posy|                geom|
+-------+------------------+-----------------+--------------------+
|      1| 476191.0852037612|6589454.831155596|POINT (476191.085...|
|      2| 485998.9668637461|6580934.403927697|POINT (485998.966...|
|      3|486927.13764603145|  6584864.3484669|POINT (486927.137...|
|      4|491514.42461848777|6594412.284642856|POINT (491514.424...|
|      5| 493018.1394320724|6602300.271879816|POINT (493018.139...|
+-------+------------------+-----------------+--------------------+



                                                                                

Final table created in 1.8431105613708496 seconds
Final table points schema:
root
 |-- pointid: integer (nullable = true)
 |-- posx: double (nullable = true)
 |-- posy: double (nullable = true)
 |-- geom: pythonuserdefined (nullable = true)

Reading raw csv  regions.csv
Creating temp view of raw table
Schema and statistics of raw table
root
 |-- regionid: integer (nullable = true)
 |-- geom: string (nullable = true)

+-------+------------------+--------------------+
|summary|          regionid|                geom|
+-------+------------------+--------------------+
|  count|               100|                 100|
|   mean|              50.5|                NULL|
| stddev|29.011491975882016|                NULL|
|    min|                 1|0103000020110F000...|
|    max|               100|0103000020110F000...|
+-------+------------------+--------------------+

Creating final table regions based on regionsRaw, partitioned by None.


                                                                                

+--------+--------------------+
|regionid|                geom|
+--------+--------------------+
|       1|POLYGON ((483571....|
|       2|POLYGON ((485438....|
|       3|POLYGON ((486542....|
|       4|POLYGON ((488077....|
|       5|POLYGON ((482151....|
+--------+--------------------+



                                                                                

Final table created in 1.9060332775115967 seconds
Final table regions schema:
root
 |-- regionid: integer (nullable = true)
 |-- geom: pythonuserdefined (nullable = true)

Reading raw csv  trips_small.csv
Creating temp view of raw table
Schema and statistics of raw table
root
 |-- tripid: integer (nullable = true)
 |-- vehid: integer (nullable = true)
 |-- day: date (nullable = true)
 |-- seqno: integer (nullable = true)
 |-- sourcenode: integer (nullable = true)
 |-- targetnode: integer (nullable = true)
 |-- trip: string (nullable = true)
 |-- trajectory: string (nullable = true)
 |-- licence: string (nullable = true)



                                                                                

+-------+-----------------+------------------+------------------+------------------+-----------------+--------------------+--------------------+-------+
|summary|           tripid|             vehid|             seqno|        sourcenode|       targetnode|                trip|          trajectory|licence|
+-------+-----------------+------------------+------------------+------------------+-----------------+--------------------+--------------------+-------+
|  count|               91|                91|                91|                91|               91|                  91|                  91|      0|
|   mean|304.3296703296703|10.956043956043956|2.5934065934065935| 39454.89010989011|39454.89010989011|                NULL|                NULL|   NULL|
| stddev|204.9905176638967| 7.067786213065726| 1.666520140079134|28341.445037436115|28341.44503743612|                NULL|                NULL|   NULL|
|    min|                1|                 1|                 1|              116

                                                                                

+------+-----+----------+-----+----------+----------+--------------------+--------------------+-------+
|tripid|vehid|       day|seqno|sourcenode|targetnode|                trip|          trajectory|licence|
+------+-----+----------+-----+----------+----------+--------------------+--------------------+-------+
|     1|    1|2020-06-01|    1|     79113|     66276|[POINT(496253.840...|LINESTRING (49625...|   NULL|
|     2|    1|2020-06-01|    2|     66276|     79113|[POINT(481241.171...|LINESTRING (48124...|   NULL|
|     3|    1|2020-06-02|    1|     79113|     66276|[POINT(496253.840...|LINESTRING (49625...|   NULL|
|     4|    1|2020-06-02|    2|     66276|     79113|[POINT(481241.171...|LINESTRING (48124...|   NULL|
|     5|    1|2020-06-03|    1|     79113|     66276|[POINT(496253.840...|LINESTRING (49625...|   NULL|
+------+-----+----------+-----+----------+----------+--------------------+--------------------+-------+



                                                                                

Bounds:  STBOX XT(((473277.05262936745,6579811.389156611),(498784.34433982597,6606871.682578203)),[2020-06-01 06:01:41.054+00, 2020-06-11 19:30:26.096307+00])
+------+--------------------+
|tileid|                tile|
+------+--------------------+
|     0|STBOX XT(((473277...|
|     1|STBOX XT(((473277...|
|     2|STBOX XT(((473277...|
|     3|STBOX XT(((473277...|
|     4|STBOX XT(((473277...|
|     5|STBOX XT(((473277...|
|     6|STBOX XT(((473277...|
|     7|STBOX XT(((473277...|
|     8|STBOX XT(((473277...|
|     9|STBOX XT(((481779...|
|    10|STBOX XT(((481779...|
|    11|STBOX XT(((481779...|
|    12|STBOX XT(((481779...|
|    13|STBOX XT(((481779...|
|    14|STBOX XT(((481779...|
|    15|STBOX XT(((481779...|
|    16|STBOX XT(((481779...|
|    17|STBOX XT(((481779...|
|    18|STBOX XT(((490281...|
|    19|STBOX XT(((490281...|
+------+--------------------+
only showing top 20 rows

Creating partitioned table... 


                                                                                

trips partitions:
+---------------+
|      partition|
+---------------+
| partitionKey=0|
| partitionKey=1|
|partitionKey=10|
|partitionKey=12|
|partitionKey=13|
|partitionKey=14|
|partitionKey=17|
| partitionKey=2|
|partitionKey=21|
|partitionKey=22|
|partitionKey=23|
|partitionKey=24|
|partitionKey=25|
|partitionKey=26|
| partitionKey=3|
| partitionKey=4|
| partitionKey=5|
| partitionKey=6|
| partitionKey=7|
| partitionKey=8|
+---------------+
only showing top 20 rows

Final table created in 22.16624641418457 seconds


                                                                                

Final table trips schema:
root
 |-- tripid: integer (nullable = true)
 |-- vehid: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- seqno: integer (nullable = true)
 |-- sourcenode: integer (nullable = true)
 |-- targetnode: string (nullable = true)
 |-- trip: pythonuserdefined (nullable = true)
 |-- trajectory: pythonuserdefined (nullable = true)
 |-- license: string (nullable = true)
 |-- partitionKey: integer (nullable = true)

Reading raw csv  vehicles_small.csv
Creating temp view of raw table
Schema and statistics of raw table
root
 |-- vehid: integer (nullable = true)
 |-- licence: string (nullable = true)
 |-- type: string (nullable = true)
 |-- model: string (nullable = true)

+-------+-----------------+-------+-----+--------+
|summary|            vehid|licence| type|   model|
+-------+-----------------+-------+-----+--------+
|  count|                3|      3|    3|       3|
|   mean|             11.0|   NULL| NULL|    NULL|
| stddev|8.717797887081348|   NULL

                                                                                

Final table created in 2.4541289806365967 seconds


                                                                                

Final table vehicles schema:
root
 |-- vehid: integer (nullable = true)
 |-- licence: string (nullable = true)
 |-- type: string (nullable = true)
 |-- model: string (nullable = true)



In [45]:
spark.sql("SELECT tripid, COUNT(trip) AS cnt FROM trips GROUP BY tripid LIMIT 5").show()

                                                                                

+------+---+
|tripid|cnt|
+------+---+
|   471|  3|
|   481|  3|
|   472|  2|
|    28|  5|
|   436|  1|
+------+---+



In [46]:
trip = spark.sql("SELECT * FROM trips LIMIT 1").collect()[0].trip
instant = spark.sql("SELECT * FROM instants LIMIT 1").collect()[0].instant

print(trip.at(instant))

None


#### Execute Queries

In [47]:
if run_exp_3:
    qdfs_exp3, stats_exp3 = run_all_queries(queries, spark, explain=True, printplan=True)

+-------+-----+
|licence|model|
+-------+-----+
|B-CJ 17| Opel|
+-------+-----+

Query execution time:  0.7265965938568115  seconds.
== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [licence#6268, model#6273]
   +- BroadcastHashJoin [licence#6268], [licence#6271], Inner, BuildRight, false
      :- Filter isnotnull(licence#6268)
      :  +- Scan In-memory table licences [licence#6268], [isnotnull(licence#6268)]
      :        +- InMemoryRelation [licenceid#6267, licence#6268, vehid#6269], StorageLevel(disk, memory, deserialized, 1 replicas)
      :              +- *(1) ColumnarToRow
      :                 +- FileScan parquet spark_catalog.default.licencesnocache[licenceid#823,licence#824,vehid#825] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/data/mobilitydb-berlinmod-sf0.1/spark-warehouse/licencesnocache], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<licenceid:int,licence:string,vehid:int>
      +- Broadca

In [53]:
%%time
querytext3 = """
    WITH
    littletrips AS (
        SELECT * FROM trips
    ),
    veh_w_lic AS (
        SELECT v.vehid, l.licence, v.model
        FROM licences l, vehicles v
        WHERE l.licence = v.licence
    ),
    veh_trips AS (
        SELECT t.* 
        FROM veh_w_lic vw, littletrips t
        WHERE t.vehid = vw.vehid
    ),
    tile_instants AS (
        SELECT /*+ BROADCAST(gr) */ gr.tile, i.instant
        FROM grid gr, instants i
        WHERE contains_stbox_stbox(gr.tile, i.instant) = TRUE
    )
    SELECT /*+ BROADCAST(i) */ vt.vehid, vt.tripid, vt.trip, i.instant, tpoint_at(vt.trip, i.instant) AS pos
    FROM veh_trips vt, tile_instants i
"""

q3 = spark.sql(querytext3)
q3.show()

def delete_nulls(partition):
    for row in partition:
        if row.pos != None:
            yield row
    
q3.rdd.mapPartitions(delete_nulls).take(5)

24/05/25 18:30:23 WARN ExtractPythonUDFFromJoinCondition: The join condition:contains_stbox_stbox(tile#4306, instant#9473)#9474 of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.
                                                                                

+-----+------+--------------------+--------------------+----+
|vehid|tripid|                trip|             instant| pos|
+-----+------+--------------------+--------------------+----+
|   17|   464|{[POINT(479113.66...|2020-06-01 19:44:...|NULL|
|   17|   464|{[POINT(479113.66...|2020-06-01 19:44:...|NULL|
|   17|   464|{[POINT(479113.66...|2020-06-01 19:44:...|NULL|
|   17|   464|{[POINT(479113.66...|2020-06-01 19:44:...|NULL|
|   17|   464|{[POINT(479113.66...|2020-06-01 19:44:...|NULL|
|   17|   464|{[POINT(479113.66...|2020-06-01 19:44:...|NULL|
|   17|   464|{[POINT(479113.66...|2020-06-01 19:44:...|NULL|
|   17|   464|{[POINT(479113.66...|2020-06-01 19:44:...|NULL|
|   17|   464|{[POINT(479113.66...|2020-06-01 19:44:...|NULL|
|   17|   464|{[POINT(479113.66...|2020-06-05 23:09:...|NULL|
|   17|   464|{[POINT(479113.66...|2020-06-05 23:09:...|NULL|
|   17|   464|{[POINT(479113.66...|2020-06-05 23:09:...|NULL|
|   17|   464|{[POINT(479113.66...|2020-06-05 23:09:...|NULL|
|   17| 

24/05/25 18:31:11 WARN ExtractPythonUDFFromJoinCondition: The join condition:contains_stbox_stbox(tile#4306, instant#9473)#9474 of the join plan contains PythonUDF only, it will be moved out and the join plan will be turned to cross join.
ERROR:root:Exception while sending command.                         (0 + 1) / 1]
Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.9/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.9/dist-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending

Py4JError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob

In [50]:
%%time
@F.udf(returnType=BooleanType())
def contains_stbox_stbox(stbox, other):
    pymeos_initialize()
    return stbox.contains(other)
spark.udf.register("contains_stbox_stbox", contains_stbox_stbox)

CPU times: user 28.2 ms, sys: 5.83 ms, total: 34.1 ms
Wall time: 132 ms


<pyspark.sql.udf.UserDefinedFunction at 0x7fffbcb42f40>

In [69]:
spark.sql("SELECT * FROM grid").show(truncate=False)

+------+---------------------------------------------------------------------------------------------------------------------------------------------+
|tileid|tile                                                                                                                                         |
+------+---------------------------------------------------------------------------------------------------------------------------------------------+
|0     |STBOX XT(((-179.99978893675143,-89.99724849779159),(-60.00591066159541,-30.00028948827336)),[2020-06-01 06:01:41+00, 2020-06-04 18:31:16+00])|
|1     |STBOX XT(((-179.99978893675143,-89.99724849779159),(-60.00591066159541,-30.00028948827336)),[2020-06-04 18:31:16+00, 2020-06-08 07:00:51+00])|
|2     |STBOX XT(((-179.99978893675143,-89.99724849779159),(-60.00591066159541,-30.00028948827336)),[2020-06-08 07:00:51+00, 2020-06-11 19:30:26+00])|
|3     |STBOX XT(((-179.99978893675143,-30.00028948827336),(-60.00591066159541,29.996669521244

In [70]:
spark.sql("SELECT * FROM instants").show(truncate=False)

+---------+--------------------------+
|instantid|instant                   |
+---------+--------------------------+
|1        |2020-06-01 19:44:49.709717|
|2        |2020-06-05 23:09:46.756569|
|3        |2020-06-02 07:23:47.01265 |
|4        |2020-06-03 17:45:26.880351|
|5        |2020-06-06 21:50:37.821979|
|6        |2020-06-05 04:20:39.881892|
|7        |2020-06-03 21:27:35.094863|
|8        |2020-06-06 23:55:02.695356|
|9        |2020-06-10 01:19:31.530931|
|10       |2020-06-02 05:29:31.40646 |
|11       |2020-05-31 23:17:56.475136|
|12       |2020-06-10 14:09:38.90966 |
|13       |2020-06-07 04:35:05.556855|
|14       |2020-06-08 08:21:40.758555|
|15       |2020-06-06 10:06:55.084251|
|16       |2020-06-01 08:40:40.189497|
|17       |2020-06-03 19:43:24.094119|
|18       |2020-06-07 14:21:51.717996|
|19       |2020-06-01 01:31:49.319809|
|20       |2020-06-10 00:09:16.655234|
+---------+--------------------------+
only showing top 20 rows

