In [1]:
cd '../../MobilityDB-BerlinMOD/BerlinMOD/BerlinMOD_0_005_CSV_new'

/data/MobilityDB-BerlinMOD/BerlinMOD/BerlinMOD_0_005_CSV_new


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pyspark.sql.functions as F

from pysparkmeos.UDT.MeosDatatype import *
from pysparkmeos.utils.udt_appender import *

import os, sys

## Initialize

In [3]:
# Initialize PyMEOS
pymeos_initialize("UTC")

os.environ['PYSPARK_DRIVER_PYTHON_OPTS']= "notebook"
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable
os.environ['PYSPARK_PYTHON'] = sys.executable

#     #.config("spark.default.parallelism", 12) \
#    .config("spark.memory.offHeap.enabled","true") \
#    .config("spark.memory.offHeap.size","1g") \
# Initialize a Spark session
spark = SparkSession.builder \
    .appName("Preprocess BerlinMOD CSV") \
    .master("local[*]") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.memory", "4g") \
    .config("spark.sql.execution.arrow.maxRecordsPerBatch", "100") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
    .getOrCreate()

# spark.sparkContext.setLogLevel("INFO")

# Append the UDT mapping to the PyMEOS classes
udt_append()

# Get the value of 'spark.default.parallelism'
default_parallelism = spark.sparkContext.getConf().get("spark.default.parallelism")

#spark.sparkContext.getConf().install_pypi_package("pyarrow==0.14.1")

print(f"spark.default.parallelism: {default_parallelism}")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/02 08:33:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


spark.default.parallelism: None


## Read the dataframes

In [4]:
csvs = os.listdir()

queryinstants = spark.read.csv('queryinstants.csv', header=True, inferSchema=True)
querypoints = spark.read.csv('querypoints.csv', header=True, inferSchema=True)
datamcar = spark.read.csv('datamcar.csv', header=True, inferSchema=True)
queryperiods = spark.read.csv('queryperiods.csv', header=True, inferSchema=True)
streets = spark.read.csv('streets.csv', header=True, inferSchema=True)
queryregions = spark.read.csv('queryregions.csv', header=True, inferSchema=True)
trips = spark.read.csv('trips.csv', header=True, inferSchema=True)
querylicences = spark.read.csv('querylicences.csv', header=True, inferSchema=True)

queryinstants.show(3, truncate=False)
querypoints.show(3, truncate=False)
datamcar.show(3, truncate=False)
queryperiods.show(3, truncate=False)
streets.show(3, truncate=False)
queryregions.show(3, truncate=False)
trips.show(3, truncate=False)
querylicences.show(3, truncate=False)

queryinstants.printSchema()
querypoints.printSchema()
datamcar.printSchema()
queryperiods.printSchema()
streets.printSchema()
queryregions.printSchema()
trips.printSchema()
querylicences.printSchema()

                                                                                

+---+-----------------------+
|Id |Instant                |
+---+-----------------------+
|1  |2007-05-28 19:08:40.114|
|2  |2007-05-29 11:38:44.224|
|3  |2007-05-29 09:17:42.536|
+---+-----------------------+
only showing top 3 rows

+---+-------+-------+
|Id |Pos_x  |Pos_y  |
+---+-------+-------+
|1  |23144.0|11963.0|
|2  |13838.0|15113.0|
|3  |16786.0|11668.0|
+---+-------+-------+
only showing top 3 rows

+----+-------+---------+--------+
|Moid|Licence|Type     |Model   |
+----+-------+---------+--------+
|1   |B-RL 1 |passenger|Audi    |
|2   |B-XW 2 |passenger|Audi    |
|3   |B-BH 3 |passenger|Multicar|
+----+-------+---------+--------+
only showing top 3 rows

+---+-----------------------+-----------------------+
|Id |Begin                  |End                    |
+---+-----------------------+-----------------------+
|1  |2007-05-28 04:08:41.33 |2007-05-28 15:00:46.322|
|2  |2007-05-28 07:39:14.538|2007-05-29 05:07:26.278|
|3  |2007-05-29 05:39:24.335|2007-05-29 11:26:44.682|

### Add TsTzSpan to QueryPeriods

In [5]:
@F.udf(returnType=TsTzSpanUDT())
def to_period_udf(begin, end):
    pymeos_initialize()
    return TsTzSpan(lower=begin, upper=end)

queryperiods = queryperiods.withColumn("TsTzSpan", to_period_udf("begin", "end"))
queryperiods.select("Id", "TsTzSpan").show(3, truncate=False)
queryperiods.printSchema()

[Stage 24:>                                                         (0 + 1) / 1]

+---+------------------------------------------------+
|Id |TsTzSpan                                        |
+---+------------------------------------------------+
|1  |[2007-05-28 04:08:41+00, 2007-05-28 15:00:46+00)|
|2  |[2007-05-28 07:39:14+00, 2007-05-29 05:07:26+00)|
|3  |[2007-05-29 05:39:24+00, 2007-05-29 11:26:44+00)|
+---+------------------------------------------------+
only showing top 3 rows

root
 |-- Id: integer (nullable = true)
 |-- Begin: timestamp (nullable = true)
 |-- End: timestamp (nullable = true)
 |-- TsTzSpan: pythonuserdefined (nullable = true)



                                                                                

### Optimize Trips table with PyMEOS

In [9]:
import pandas as pd
from typing import Iterator
import pickle

@F.pandas_udf(returnType=StringType())
def berlinmod_trip_to_trajectory(seqs: pd.Series) -> pd.Series:
    return seqs.apply(lambda seq: seq_to_trajectory(seq))

def seq_to_trajectory(seq):
    pymeos_initialize()
    return TGeogPointSeqSet(sequence_list=seq).__str__()

def get_seq(xstart, ystart, xend, yend, tstart, tend):
    pymeos_initialize()
    return TGeogPointSeqWrap(f"[POINT({xstart} {ystart})@{tstart}, POINT({xend} {yend})@{tend})").__str__()

@F.pandas_udf("string")
def berlinmod_trip_to_seq(xstart: pd.Series, 
                          ystart: pd.Series, 
                          xend: pd.Series, 
                          yend: pd.Series, 
                          tstart: pd.Series, 
                          tend: pd.Series) -> pd.Series:
    temp_df = pd.DataFrame({
        'xstart': xstart,
        'ystart': ystart,
        'xend': xend,
        'yend': yend,
        'tstart': tstart,
        'tend': tend
    })
    return temp_df.apply(lambda row: get_seq(row['xstart'], row['ystart'], row['xend'], row['yend'], row['tstart'], row['tend']), axis=1)

trips = spark.read.csv('trips.csv', header=True, inferSchema=True).limit(10000)
trips.repartition("Moid", "Tripid")
trips = trips.withColumn("Sequence", berlinmod_trip_to_seq("Xstart", "Ystart", "Xend", "Yend", "Tstart", "Tend"))

trips.show()

trips = trips.groupby("Moid", "Tripid").agg(
    berlinmod_trip_to_trajectory(F.collect_list("Sequence")).alias("Trajectory")
)

trips.show()

                                                                                

+----+------+--------------------+--------------------+-------+-------+-------+-------+--------------------+
|Moid|Tripid|              Tstart|                Tend| Xstart| Ystart|   Xend|   Yend|            Sequence|
+----+------+--------------------+--------------------+-------+-------+-------+-------+--------------------+
|   1|     1| 2007-05-27 00:00:00|2007-05-28 08:36:...|12785.0| 1308.0|12785.0| 1308.0|[POINT(-175 -48)@...|
|   1|     2|2007-05-28 08:36:...|2007-05-28 08:36:...|12785.0| 1308.0|12793.0| 1310.2|[POINT(-175 -48)@...|
|   1|     2|2007-05-28 08:36:...|2007-05-28 08:36:...|12793.0| 1310.2|12808.7| 1314.5|[POINT(-167 -50.2...|
|   1|     2|2007-05-28 08:36:...|2007-05-28 08:36:...|12808.7| 1314.5|12824.8| 1318.9|[POINT(-151.29999...|
|   1|     2|2007-05-28 08:36:...|2007-05-28 08:36:...|12824.8| 1318.9|12840.9| 1323.3|[POINT(-135.20000...|
|   1|     2|2007-05-28 08:36:...|2007-05-28 08:36:...|12840.9| 1323.3|12855.2|1327.22|[POINT(-119.10000...|
|   1|     2|2007-0

[Stage 45:>                                                         (0 + 1) / 1]

+----+------+--------------------+
|Moid|Tripid|          Trajectory|
+----+------+--------------------+
|   1|     1|{[POINT(-175 -48)...|
|   1|     2|{[POINT(-175 -48)...|
|   1|     3|{[POINT(-109 4)@2...|
|   1|     4|{[POINT(-109 4)@2...|
|   1|     5|{[POINT(-175 -48)...|
|   1|     6|{[POINT(-175 -48)...|
|   1|     7|{[POINT(-109 4)@2...|
|   1|     8|{[POINT(-109 4)@2...|
|   1|     9|{[POINT(-175 -48)...|
|   2|    10|{[POINT(-4 -55)@2...|
|   2|    11|{[POINT(-4 -55)@2...|
|   2|    12|{[POINT(-123 72)@...|
|   2|    13|{[POINT(-123 72)@...|
|   2|    14|{[POINT(-4 -55)@2...|
|   2|    15|{[POINT(-4 -55)@2...|
|   2|    16|{[POINT(11 37)@20...|
|   2|    17|{[POINT(11 37)@20...|
|   2|    18|{[POINT(-4 -55)@2...|
|   2|    19|{[POINT(-4 -55)@2...|
|   2|    20|{[POINT(-123 72)@...|
+----+------+--------------------+
only showing top 20 rows



                                                                                

In [10]:
trips.write.csv("trips_proc.csv", header=True, mode="overwrite")
queryperiods.write.csv("queryperiods_proc.csv", header=True, mode="overwrite")

                                                                                

In [None]:
"""
SELECT tgeogpointSeq(ARRAY[tgeogpoint 'Point(1 1)@2001-01-01 08:00:00',
'Point(2 2)@2001-01-01 08:05:00']);
"""
pymeos_initialize()
tps = TGeogPointSeq("[POINT(1 1)@2001-01-01 08:00:00, Point(0 0)@2001-01-01 08:05:00)")
print(tps)