# BerlinMOD Queries

So far we have replicated the BerlinMOD PyMEOS tutorial using PySpark. Now we will execute a subset of the BerlinMOD queries.

In [None]:
cd "../mobilitydb-berlinmod-sf0.1/"

In [None]:
ls -lh

In [None]:
rm -R spark-warehouse/

## Imports

In [None]:
from pymeos import *
from pymeos.plotters import *

from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

from pysparkmeos.UDT.MeosDatatype import *

from pysparkmeos.partitions.grid.grid_partitioner import GridPartition
from pysparkmeos.partitions.kdtree_partitioner import KDTreePartition
from pysparkmeos.partitions.adaptive_partitioner_spark import AdaptiveBinsPartitionerSpark
from pysparkmeos.partitions.approx_adaptive_partitioner import ApproximateAdaptiveBinsPartitioner

from pysparkmeos.utils.udt_appender import *
from pysparkmeos.utils.utils import *

from pysparkmeos.UDF.udf import *
from pysparkmeos.UDTF.BerlinMOD import *

from pysparkmeos.BerlinMOD.config import load_config
from pysparkmeos.BerlinMOD.queries import *
from pysparkmeos.BerlinMOD.transformation_queries import *
from pysparkmeos.BerlinMOD.partition_queries import *
from pysparkmeos.BerlinMOD.func import *

import random, datetime, os, sys
from datetime import timedelta
from functools import partial
from datetime import datetime, timezone
import contextily as cx
import distinctipy
import geopandas as gpd
import pandas as pd
import shapely.geometry as shp

import matplotlib.pyplot as plt
import numpy as np
from shapely import wkb, box, from_wkb
from typing import Union
from time import time

## Spark Initialization

In [None]:
def startspark():
    """Initialize PyMEOS and return the Spark session used by this notebook.

    Steps, in order: initialize PyMEOS with "UTC", point the PySpark
    interpreter environment variables at the running Python executable,
    build (or reuse) the session, append the UDT mapping, print the configured
    default parallelism, and register the project UDFs and UDTFs.

    Returns:
        The session obtained from ``SparkSession.builder ... getOrCreate()``.
    """
    pymeos_initialize("UTC")

    # Driver and workers use the same interpreter as this kernel.
    os.environ.update({
        'PYSPARK_DRIVER_PYTHON_OPTS': "notebook",
        'PYSPARK_DRIVER_PYTHON': sys.executable,
        'PYSPARK_PYTHON': sys.executable,
    })

    # Session options, applied to the builder in this order.
    session_options = [
        ("spark.default.parallelism", 12),
        ("spark.executor.memory", "3g"),
        ("spark.executor.cores", 1),
        ("spark.driver.memory", "2g"),
        ("spark.driver.maxResultSize", 0),
        ("spark.sql.execution.arrow.maxRecordsPerBatch", "500"),
        ("spark.sql.allowMultipleTableArguments.enabled", True),
    ]
    builder = SparkSession.builder.appName("BerlinMOD with PySpark").master("local[3]")
    for option, value in session_options:
        builder = builder.config(option, value)
    session = builder.getOrCreate()

    # Append the UDT mapping to the PyMEOS classes.
    udt_append()

    # Echo the parallelism the session was configured with.
    parallelism = session.sparkContext.getConf().get("spark.default.parallelism")
    print(f"spark.default.parallelism: {parallelism}")

    # Make the project UDFs and UDTFs available to Spark SQL.
    register_udfs_under_spark_sql(session)
    register_udtfs_under_spark_sql(session)

    return session


spark = startspark()

## Load Tables
We will use the power of Spark SQL to read in the raw dataframes and then create the tables.

In [None]:
ls

## Experiments

Here you can run an experiment. Select the experiment to run by setting `run_exp_number` in the cell below.  
Available experiments:
1. Run Queries AS-IS (default PySpark partitioning).
2. Run Queries with Trips partitioned by vehid, using Hash Partitioning.
3. Run Queries with Trips partitioned by trip, using RegularGrid.
4. Run Queries with Trips partitioned by trip, using KDTreePartitioning.
5. Run Queries with Trips partitioned by trip, using AdaptiveBinsPartitioning with Spark background.
6. Run Queries with Trips partitioned by trip, using ApproximateAdaptiveBinsPartitioning.

In [None]:
# Change to your desired experiment number.
# It is used further down as the key into `experiment_configs` (keys 1-6).
run_exp_number = 3

# Select the queries to run.
# Numbers that have no entry in the `queries` dict defined further down are skipped.
querynumbers = [1, 2, 3, 4, 5, 6, 11, 13]

### Set up the configurations for the experiment

In [None]:
# Input CSV file for each dataset, given as relative file names.
# NOTE(review): 'licences' and 'vehicles' have a path but no entry in the
# transformation dicts below — confirm load_config handles tables without one.
paths = {
    'trips': 'trips_small.csv',
    'instants': 'instants.csv',
    'licences': 'licences.csv',
    'periods': 'periods.csv',
    'points': 'points.csv',
    'regions': 'regions.csv',
    'vehicles': 'vehicles_small.csv'
}

# Per-table transformation queries passed as `trans_queries` for experiments 1 and 2.
transformation_queries_simple = {
    'trips': transtripssimple,
    'instants': transinstantssimple,
    'periods': transperiodsimple,
    'points': transpointssimple,
    'regions': transregionssimple
}

# Per-table transformation queries passed as `trans_queries` for experiments 3-6
# (the ones that are given a partitioner class).
transformation_queries = {
    'trips': transtrips,
    'instants': transinstants,
    'periods': transperiod,
    'points': transpoints,
    'regions': transregions
}

# Partition query per table, passed as `part_queries` for experiments 3-6; only 'trips' has one.
partition_queries = {
    'trips': parttrips
}

# Partition key per table, passed as `partition_keys` for experiments 3-6.
# NOTE(review): 'tileid' is presumably a column produced by the partition query — confirm.
partition_keys = {
    'trips': 'tileid'
}

# Passed as `num_buckets` for experiments 2-6 (experiment 1 passes None).
num_buckets = 64
# Passed unchanged to load_config as `inferSchema` / `header` for every experiment.
# NOTE(review): presumably forwarded to the CSV reader — confirm in load_config.
inferSchema = True
header = True

In [None]:
# Experiment 1 (queries AS-IS): simple transformation queries, with every
# partitioning-related option left unset.
configs_exp1 = load_config(
    # Session and input files
    spark=spark,
    paths=paths,
    inferSchema=inferSchema,
    header=header,
    # Table transformations
    trans_queries=transformation_queries_simple,
    # Partitioning: none
    part_queries=None,
    partition_keys=None,
    partitioner_class=None,
    partitioner_args=None,
    num_buckets=None,
)

In [None]:
# Experiment 2 (hash partitioning): simple transformation queries, trips keyed
# by 'vehid' with `num_buckets` buckets; no partition query or partitioner class.
configs_exp2 = load_config(
    # Session and input files
    spark=spark,
    paths=paths,
    inferSchema=inferSchema,
    header=header,
    # Table transformations
    trans_queries=transformation_queries_simple,
    # Partitioning: by key only
    part_queries=None,
    partition_keys={'trips': 'vehid'},
    partitioner_class=None,
    partitioner_args=None,
    num_buckets=num_buckets,
)

In [None]:
# Experiment 3 (regular grid): full transformation and partition queries,
# with GridPartition configured for 8 cells per side.
configs_exp3 = load_config(
    # Session and input files
    spark=spark,
    paths=paths,
    inferSchema=inferSchema,
    header=header,
    # Table transformations
    trans_queries=transformation_queries,
    # Partitioning
    part_queries=partition_queries,
    partition_keys=partition_keys,
    partitioner_class=GridPartition,
    partitioner_args={'cells_per_side': 8},
    num_buckets=num_buckets,
)

In [None]:
# Experiment 4 (KD-tree): full transformation and partition queries, with
# KDTreePartition over the x, y and t dimensions and a maximum depth of 11.
configs_exp4 = load_config(
    # Session and input files
    spark=spark,
    paths=paths,
    inferSchema=inferSchema,
    header=header,
    # Table transformations
    trans_queries=transformation_queries,
    # Partitioning
    part_queries=partition_queries,
    partition_keys=partition_keys,
    partitioner_class=KDTreePartition,
    partitioner_args={
        'moving_objects': None,
        'dimensions': ['x', 'y', 't'],
        'max_depth': 11,
    },
    num_buckets=num_buckets,
)

In [None]:
# Experiment 5 (adaptive bins on Spark): full transformation and partition
# queries, with AdaptiveBinsPartitionerSpark given the 'trip' column of
# 'tripsRaw', 8 tiles and the x, y and t dimensions.
configs_exp5 = load_config(
    # Session and input files
    spark=spark,
    paths=paths,
    inferSchema=inferSchema,
    header=header,
    # Table transformations
    trans_queries=transformation_queries,
    # Partitioning
    part_queries=partition_queries,
    partition_keys=partition_keys,
    partitioner_class=AdaptiveBinsPartitionerSpark,
    partitioner_args={
        'spark': spark,
        'dfname': 'tripsRaw',
        'colname': 'trip',
        'num_tiles': 8,
        'dimensions': ['x', 'y', 't'],
        'utc': "UTC",
    },
    num_buckets=num_buckets,
)

In [None]:
# Experiment 6 (approximate adaptive bins): full transformation and partition
# queries, with ApproximateAdaptiveBinsPartitioner given the 'trip' column of
# table "tripsRaw" (no DataFrame passed), 8 tiles and the x, y and t dimensions.
configs_exp6 = load_config(
    # Session and input files
    spark=spark,
    paths=paths,
    inferSchema=inferSchema,
    header=header,
    # Table transformations
    trans_queries=transformation_queries,
    # Partitioning
    part_queries=partition_queries,
    partition_keys=partition_keys,
    partitioner_class=ApproximateAdaptiveBinsPartitioner,
    partitioner_args={
        'spark': spark,
        'df': None,
        'colname': 'trip',
        'num_tiles': 8,
        'dimensions': ['x', 'y', 't'],
        'utc': "UTC",
        'tablename': "tripsRaw",
    },
    num_buckets=num_buckets,
)

In [None]:
# Map experiment numbers 1-6 to their configurations, in list order.
experiment_configs = dict(
    enumerate(
        [configs_exp1, configs_exp2, configs_exp3, configs_exp4, configs_exp5, configs_exp6],
        start=1,
    )
)

# Fail with a descriptive message (still a KeyError) when the selected experiment does not exist.
if run_exp_number not in experiment_configs:
    raise KeyError(
        f"Unknown experiment number {run_exp_number!r}; "
        f"choose one of {sorted(experiment_configs)}."
    )
config = experiment_configs[run_exp_number]

# Query text for each supported BerlinMOD query number.
queries = {
    1: querytext1,
    2: querytext2,
    3: querytext3,
    4: querytext4,
    5: querytext5,
    6: querytext6,
    11: querytext11,
    13: querytext13
}

# Unknown query numbers are still skipped, but are now reported instead of
# being dropped silently.
unknown_querynumbers = [querynum for querynum in querynumbers if querynum not in queries]
if unknown_querynumbers:
    print(f"Skipping query numbers with no query text: {unknown_querynumbers}")

# Query texts to execute, in the order given by `querynumbers`.
queries_to_run = [queries[querynum] for querynum in querynumbers if querynum in queries]

### Run the experiment

#### Create Tables

In [None]:
# Run the table-loading step for the selected experiment `config`.
# NOTE(review): `stats` presumably holds per-table load statistics — confirm in load_all_tables.
tables, stats = load_all_tables(spark, config)

#### Execute Queries

In [None]:
# Execute the selected query texts against the Spark session.
# NOTE(review): `explain=True, printplan=False` presumably computes each query plan
# without printing it — confirm in run_all_queries.
qdfs_exp, stats_exp = run_all_queries(queries_to_run, spark, explain=True, printplan=False)

## Mapping the regions and trips

In [None]:
# Draw the Brussels outline, the partition tiles, the regions and every trip on one map.
# The figure is bound to `fig` rather than `_`: assigning `_` by hand stops IPython
# from updating its `_` / `__` / `___` output-history variables.
fig, ax = plt.subplots(1, 1, figsize=(15, 15))
ax.set_title("Regions, partition tiles and trips")

# Region outline; the "geom" column is parsed as hex-encoded WKB.
brussels = pd.read_csv(
    "brussels_region.csv", converters={"geom": partial(wkb.loads, hex=True)}
)
brussels = gpd.GeoDataFrame(brussels, geometry="geom")
brussels_geom = brussels["geom"][0]  # first geometry; not used further in this cell
brussels.plot(ax=ax, alpha=0.3, color='black')
cx.add_basemap(ax, alpha=0.3)

# Partition tiles, drawn as unfilled black outlines.
# NOTE(review): the 'grid' table is presumably only created for experiments that
# use a partitioner class (3-6) — confirm before running this with experiment 1 or 2.
grid = spark.table('grid')
for gridrow in grid.toLocalIterator():
    gridrow.tile.plot_xy(axes=ax, color="black", draw_filling=False)

# Distinct regions, each drawn as a light-blue polygon.
regions = spark.table('regions').select("regionid", "geom").distinct()
for regionrow in regions.toLocalIterator():
    region_shape = gpd.GeoSeries([regionrow.geom])
    region_shape.plot(ax=ax, alpha=0.7, color='lightblue')

# All trips, coloured by moving-object id modulo the palette size.
# To plot only a sample, insert e.g. `.sample(0.1, seed=3)` before `.select(...)`.
trips = spark.table('trips').select('movingobjectid', 'movingobject')
colors = ['orange', 'red', 'pink', 'green', 'purple']
for triprow in trips.toLocalIterator():
    TemporalPointSequenceSetPlotter.plot_xy(
        triprow.movingobject,
        axes=ax,
        show_markers=True,
        show_grid=False,
        color=colors[int(triprow.movingobjectid) % len(colors)],
    )