In [1]:
# Put the project root (OpenCEP) on sys.path so that imports such as
# `from base.Pattern import Pattern` resolve from this notebook.
import os
import pathlib
import sys

# `__file__` is only defined when this code runs as a script/module; when it
# is absent, the current working directory is used instead.
if "__file__" in globals():
    nb_dir = pathlib.Path(__file__).parent
else:
    nb_dir = pathlib.Path.cwd()

# The project root is the parent of `nb_dir`, stored as an absolute string.
project_root = str(nb_dir.joinpath("..").resolve())

# Prepend only once so re-running the cell does not duplicate the entry.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [2]:
from datetime import timedelta
from CEP import CEP
from base.Pattern import Pattern
from base.PatternStructure import (
    SeqOperator,
    PrimitiveEventStructure,
    KleeneClosureOperator,
)
from condition.CompositeCondition import AndCondition
from condition.Condition import Variable, SimpleCondition
from condition.KCCondition import KCIndexCondition
from stream.FileStream import FileOutputStream
from stream.DataFrameStream import CitiBikeDataFrameInputStream
from plugin.citibike.CitiBike2 import (
    CitiBikeTripEventTypeClassifier,
    CitiBikeDataFormatter,
)
import test
from tree.PatternMatchStorage import TreeStorageParameters
from parallel.ParallelExecutionParameters import (
    DataParallelExecutionParametersHirzelAlgorithm,
)
from parallel.ParallelExecutionPlatforms import ParallelExecutionPlatforms


In [3]:
# "Hot paths" pattern: a Kleene run of CitiBikeTrip events bound to `a`
# (max_size=5) followed by a single CitiBikeTrip bound to `b`.
trip_chain = KleeneClosureOperator(
    PrimitiveEventStructure("CitiBikeTrip", "a"), max_size=5
)
closing_trip = PrimitiveEventStructure("CitiBikeTrip", "b")
hot_path_structure = SeqOperator(trip_chain, closing_trip)

# Conditions inside the Kleene run, both evaluated with offset=-1
# (presumably comparing each trip with its neighbour -- confirm in KCCondition).
# 1) The compared trips carry the same bike id.
chain_same_bike = KCIndexCondition(
    names={"a"},
    getattr_func=lambda x: x["bikeid"],
    relation_op=lambda a1, a2: a1 == a2,
    offset=-1,
)
# 2) The start station of one trip equals the end station of the other.
#    Station ids are parsed via float first, then truncated to int.
chain_linked_stations = KCIndexCondition(
    names={"a"},
    getattr_func=lambda x: (
        int(float(x["startstationid"])),
        int(float(x["endstationid"])),
    ),
    relation_op=lambda a1, a2: a1[0] == a2[1],
    offset=-1,
)

# Conditions linking the last element bound to `a` (x[-1]) with `b`:
# same bike id, and `b` starts at the station where that element ended.
handover_same_bike = SimpleCondition(
    Variable("a", lambda x: x[-1]["bikeid"]),
    Variable("b", lambda x: x["bikeid"]),
    relation_op=lambda a, b: a == b,
)
handover_linked_station = SimpleCondition(
    Variable("a", lambda x: int(float(x[-1]["endstationid"]))),
    Variable("b", lambda x: int(float(x["startstationid"]))),
    relation_op=lambda a, b: a == b,
)

# `b` must end at the station whose id renders as "111111".
closing_at_target_station = SimpleCondition(
    Variable("b", lambda x: int(float(x["endstationid"]))),
    relation_op=lambda end_id: str(end_id) == "111111",
)

# Third argument: a 61-minute timedelta (presumably the matching time
# window -- confirm against Pattern's signature).
citibikeHotPathsPattern = Pattern(
    hot_path_structure,
    AndCondition(
        chain_same_bike,
        chain_linked_stations,
        handover_same_bike,
        handover_linked_station,
        closing_at_target_station,
    ),
    timedelta(minutes=61),
)

In [4]:
# Baseline engine: only the pattern list is supplied, no parallel-execution
# or storage parameters.
baseline_patterns = [citibikeHotPathsPattern]
cep = CEP(baseline_patterns)

Creating evaluation manager...
 - Parallel execution: None
 - Storage: None
 - Using ParallelExecutionModes.SEQUENTIAL execution mode
!!! Using default evaluation mechanism parameters...
!!! Creating tree-based evaluation mechanism...
Tree calling create_storage_unit with storage_params: TreeStorageParameters(sort_storage=False, attributes_priorities={}, clean_up_interval=10, prioritize_sorting_by_timestamp=True, enable_load_shedding=False, load_shedding_threshold=1000, load_shedding_drop_rate=0.1, load_shedding_strategy=random,latency_bound=None)
InternalNode creating storage: sort=False, sorting_key=None
UnsortedPatternMatchStorage created with storage_params: TreeStorageParameters(sort_storage=False, attributes_priorities={}, clean_up_interval=10, prioritize_sorting_by_timestamp=True, enable_load_shedding=False, load_shedding_threshold=1000, load_shedding_drop_rate=0.1, load_shedding_strategy=random,latency_bound=None)
InternalNode creating storage: sort=False, sorting_key=None
Unso

# Baseline - small dataset

In [None]:
# Baseline run on the small dataset -- currently disabled (every line below is
# commented out). Uncommenting it would read the small CSV into `events` and
# call `cep.run` with a FileOutputStream and a CitiBikeDataFormatter.
# input_file = "../test/EventFiles/201901-citibike-tripdata-1-fabricated-small.csv"
# output_file = "output_citibike_baseline_small.txt"

# events = CitiBikeDataFrameInputStream(
#     input_file,
#     timestamp_column="starttime",
# )

# cep.run(
#     events,
#     FileOutputStream("../test/demo/Matches/testing", output_file),
#     CitiBikeDataFormatter(),
# )

Starting CEP evaluation...
zad unit
starttime 174382.728190416
Using optimized DataFrame input stream processing
Processing event in TreeBasedEvaluationMechanism: {'tripduration': '320', 'starttime': Timestamp('2019-01-01 00:01:47.401000'), 'stoptime': Timestamp('2019-01-01 00:07:07.581000'), 'startstationid': '3160', 'endstationid': '3283', 'bikeid': '15839'}
Playing new event on tree: {'tripduration': '320', 'starttime': 2019-01-01 00:01:47.401000, 'stoptime': 2019-01-01 00:07:07.581000, 'startstationid': '3160', 'endstationid': '3283', 'bikeid': '15839'}, trying to find matches
we are updating event count 0
Playing new event. Event types listeners: {'CitiBikeTrip': [<tree.nodes.LeafNode.LeafNode object at 0x116503470>, <tree.nodes.LeafNode.LeafNode object at 0x1193149b0>]}
SortedPatternMatchStorage.add() called Key: 2019-01-01 00:01:47.401000, length of partial matches: 0
Current events in pattern match: [{'tripduration': '320', 'starttime': 2019-01-01 00:01:47.401000, 'stoptime': 2

488.00609

In [None]:
# Parallelization only - big dataset

In [None]:


# Data-parallel settings for the Hirzel algorithm on the threading platform.
worker_threads = 8  # how many threads
partition_attribute = "bikeid"  # presumably the attribute events are partitioned by

dp_params = DataParallelExecutionParametersHirzelAlgorithm(
    platform=ParallelExecutionPlatforms.THREADING,
    units_number=worker_threads,
    key=partition_attribute,
)

# Rebinds `cep`: later cells that call `cep.run` use this parallel engine.
cep = CEP(
    [citibikeHotPathsPattern],
    parallel_execution_params=dp_params,
)

Creating evaluation manager...
 - Parallel execution: <parallel.ParallelExecutionParameters.DataParallelExecutionParametersHirzelAlgorithm object at 0x11c963ef0>
 - Storage: None
 - Using ParallelExecutionModes.DATA_PARALLELISM execution mode
!!! Using default evaluation mechanism parameters...
!!! Creating tree-based evaluation mechanism...
Tree calling create_storage_unit with storage_params: TreeStorageParameters(sort_storage=False, attributes_priorities={}, clean_up_interval=10, prioritize_sorting_by_timestamp=True, enable_load_shedding=False, load_shedding_threshold=1000, load_shedding_drop_rate=0.1, load_shedding_strategy=random,latency_bound=None)
InternalNode creating storage: sort=False, sorting_key=None
UnsortedPatternMatchStorage created with storage_params: TreeStorageParameters(sort_storage=False, attributes_priorities={}, clean_up_interval=10, prioritize_sorting_by_timestamp=True, enable_load_shedding=False, load_shedding_threshold=1000, load_shedding_drop_rate=0.1, load_

In [6]:
# Parallel-only experiment, full fabricated dataset.
# `output_file` is consumed by the cell that calls `cep.run`.
output_file = "output_citibike_only_parallel.txt"
input_file = "../test/EventFiles/201901-citibike-tripdata-1-fabricated.csv"

# Input stream over the CSV; "starttime" is passed as the timestamp column.
events = CitiBikeDataFrameInputStream(input_file, timestamp_column="starttime")


In [None]:
# Parallel-only run on the big dataset -- currently disabled (commented out).
# cep.run(
#     events,
#     FileOutputStream("../test/demo/Matches/testing", output_file),
#     CitiBikeDataFormatter(),
# )
# ran for 513 min

Starting CEP evaluation...
zad unit
starttime 176627.172104041
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
Using optimized DataFrame input stream processing
Playing new event on tree: {'tripduration': '300', 'starttime': 2019-01-01 00:12:17.931000, 'stoptime': 2019-01-01 00:17:18.830000, 'startstationid': '3394', 'endstationid': '3398', 'bikeid': '18636'}, trying to find matchesPlaying new event on tree: {'tripduration': '123', 'starttime': 2019-01-01 00:12:19.603000, 'stoptime': 2019-01-01 00:14:23.104000, 'startstationid': '432', 'endstationid': '3656', 'bikeid': '35206'}, trying to find matches
we are updating event count 0
Pl

# Parallelization only - small dataset

In [4]:
# Parallel-only experiment, small fabricated dataset.
# `output_file` is consumed by the cell that calls `cep.run`.
output_file = "output_citibike_only_parallel-small.txt"
input_file = "../test/EventFiles/201901-citibike-tripdata-1-fabricated-small.csv"

# Input stream over the CSV; "starttime" is passed as the timestamp column.
events = CitiBikeDataFrameInputStream(input_file, timestamp_column="starttime")

In [8]:
# Output stream built from the demo "testing" path and `output_file`
# (presumably directory + file name -- confirm against FileOutputStream).
match_sink = FileOutputStream("../test/demo/Matches/testing", output_file)
trip_formatter = CitiBikeDataFormatter()

# Kept as the cell's final expression so the notebook displays the returned value.
cep.run(events, match_sink, trip_formatter)

Starting CEP evaluation...
zad unit
starttime 182255.820851041
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
Using optimized DataFrame input stream processing
Playing new event on tree: {'tripduration': '320', 'starttime': 2019-01-01 00:01:47.401000, 'stoptime': 2019-01-01 00:07:07.581000, 'startstationid': '3160', 'endstationid': '3283', 'bikeid': '15839'}, trying to find matches
we are updating event count 0
Playing new event. Event types listeners: {'CitiBikeTrip': [<tree.nodes.LeafNode.LeafNode object at 0x11d7d29f0>, <tree.nodes.LeafNode.LeafNode object at 0x11d7d2fc0>]}
SortedPatternMatchStorage.add() called Key: 2019-01-01 0

0.301408

# Parallelization & Load shedding

In [11]:
# Storage settings with load shedding enabled ("oldest" strategy).
# NOTE(review): in the output recorded for this cell the "Storage:" line echoes
# these values, yet the following "Tree calling create_storage_unit" line prints
# sort_storage=False / enable_load_shedding=False -- confirm the settings
# actually reach the tree storage.
load_shedding_params = TreeStorageParameters(
    enable_load_shedding=True,
    load_shedding_strategy="oldest",
    load_shedding_threshold=15,
    load_shedding_drop_rate=0.3,
    sort_storage=True,
    clean_up_interval=10,
)

# Data-parallel settings for the Hirzel algorithm on the threading platform.
worker_threads = 8  # how many threads
partition_attribute = "bikeid"  # presumably the attribute events are partitioned by
dp_params = DataParallelExecutionParametersHirzelAlgorithm(
    platform=ParallelExecutionPlatforms.THREADING,
    units_number=worker_threads,
    key=partition_attribute,
)

# Rebinds `cep` to an engine configured with both parameter sets.
cep = CEP(
    [citibikeHotPathsPattern],
    parallel_execution_params=dp_params,
    storage_params=load_shedding_params,
)


Creating evaluation manager...
 - Parallel execution: <parallel.ParallelExecutionParameters.DataParallelExecutionParametersHirzelAlgorithm object at 0x11d7e0860>
 - Storage: TreeStorageParameters(sort_storage=True, attributes_priorities={}, clean_up_interval=10, prioritize_sorting_by_timestamp=True, enable_load_shedding=True, load_shedding_threshold=15, load_shedding_drop_rate=0.3, load_shedding_strategy=oldest,latency_bound=None)
 - Using ParallelExecutionModes.DATA_PARALLELISM execution mode
!!! Using default evaluation mechanism parameters...
!!! Creating tree-based evaluation mechanism...
Tree calling create_storage_unit with storage_params: TreeStorageParameters(sort_storage=False, attributes_priorities={}, clean_up_interval=10, prioritize_sorting_by_timestamp=True, enable_load_shedding=False, load_shedding_threshold=1000, load_shedding_drop_rate=0.1, load_shedding_strategy=random,latency_bound=None)
InternalNode creating storage: sort=False, sorting_key=None
UnsortedPatternMatchS

In [12]:
# Parallelization + load shedding experiment, full fabricated dataset.
# `output_file` is consumed by the cell that calls `cep.run`.
output_file = "output_citibike_both.txt"
input_file = "../test/EventFiles/201901-citibike-tripdata-1-fabricated.csv"

# Input stream over the CSV; "starttime" is passed as the timestamp column.
events = CitiBikeDataFrameInputStream(input_file, timestamp_column="starttime")

In [None]:
# Output stream built from the demo "testing" path and `output_file`
# (presumably directory + file name -- confirm against FileOutputStream).
match_sink = FileOutputStream("../test/demo/Matches/testing", output_file)
trip_formatter = CitiBikeDataFormatter()

# ran for an hour; the output recorded below ends in a KeyboardInterrupt.
# Kept as the cell's final expression so the notebook displays the returned value.
cep.run(events, match_sink, trip_formatter)

Starting CEP evaluation...
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
Using optimized DataFrame input stream processing
Playing new event on tree: {'tripduration': '320', 'starttime': 2019-01-01 00:01:47.401000, 'stoptime': 2019-01-01 00:07:07.581000, 'startstationid': '3160', 'endstationid': '3283', 'bikeid': '15839'}, trying to find matchesPlaying new event on tree: {'tripduration': '316', 'starttime': 2019-01-01 00:04:43.736000, 'stoptime': 2019-01-01 00:10:00.608000, 'startstationid': '519', 'endstationid': '518', 'bikeid': '32723'}, trying to find matches
we are updating event count 96
Playing new event. Event type

KeyboardInterrupt: 

UnsortedPatternMatchStorage.add() called! Total matches: 98
into unsorted add
Playing new event on tree: {'tripduration': '198', 'starttime': 2019-01-01 09:35:26.889000, 'stoptime': 2019-01-01 09:38:44.941000, 'startstationid': '3154', 'endstationid': '3375', 'bikeid': '33671'}, trying to find matches
we are updating event count 2451
Playing new event. Event types listeners: {'CitiBikeTrip': [<tree.nodes.LeafNode.LeafNode object at 0x11d87fbc0>, <tree.nodes.LeafNode.LeafNode object at 0x11d87c080>]}
SortedPatternMatchStorage.add() called Key: 2019-01-01 09:35:26.889000, length of partial matches: 76
Current events in pattern match: [{'tripduration': '198', 'starttime': 2019-01-01 09:35:26.889000, 'stoptime': 2019-01-01 09:38:44.941000, 'startstationid': '3154', 'endstationid': '3375', 'bikeid': '33671'}]
we are in add
UnsortedPatternMatchStorage.add() called! Total matches: 104
into unsorted add
UnsortedPatternMatchStorage.add() called! Total matches: 105
into unsorted add
UnsortedPatt

# Load shedding 2

In [4]:
# Second load-shedding experiment: same storage settings ("oldest" strategy).
# NOTE(review): in the output recorded for this cell the "Storage:" line echoes
# these values, yet the following "Tree calling create_storage_unit" line prints
# sort_storage=False / enable_load_shedding=False -- confirm the settings
# actually reach the tree storage.
load_shedding_params = TreeStorageParameters(
    enable_load_shedding=True,
    load_shedding_strategy="oldest",
    load_shedding_threshold=15,
    load_shedding_drop_rate=0.3,
    sort_storage=True,
    clean_up_interval=10,
)

# Data-parallel settings for the Hirzel algorithm on the threading platform.
worker_threads = 8  # how many threads
partition_attribute = "bikeid"  # presumably the attribute events are partitioned by
dp_params = DataParallelExecutionParametersHirzelAlgorithm(
    platform=ParallelExecutionPlatforms.THREADING,
    units_number=worker_threads,
    key=partition_attribute,
)

# Rebinds `cep` to an engine configured with both parameter sets.
cep = CEP(
    [citibikeHotPathsPattern],
    parallel_execution_params=dp_params,
    storage_params=load_shedding_params,
)


Creating evaluation manager...
 - Parallel execution: <parallel.ParallelExecutionParameters.DataParallelExecutionParametersHirzelAlgorithm object at 0x119b5bf20>
 - Storage: TreeStorageParameters(sort_storage=True, attributes_priorities={}, clean_up_interval=10, prioritize_sorting_by_timestamp=True, enable_load_shedding=True, load_shedding_threshold=15, load_shedding_drop_rate=0.3, load_shedding_strategy=oldest,latency_bound=None)
 - Using ParallelExecutionModes.DATA_PARALLELISM execution mode
!!! Using default evaluation mechanism parameters...
!!! Creating tree-based evaluation mechanism...
Tree calling create_storage_unit with storage_params: TreeStorageParameters(sort_storage=False, attributes_priorities={}, clean_up_interval=10, prioritize_sorting_by_timestamp=True, enable_load_shedding=False, load_shedding_threshold=1000, load_shedding_drop_rate=0.1, load_shedding_strategy=random,latency_bound=None)
InternalNode creating storage: sort=False, sorting_key=None
UnsortedPatternMatchS

In [5]:
# Second parallelization + load shedding run, full fabricated dataset.
# `output_file` is consumed by the cell that calls `cep.run`.
output_file = "output_citibike_both-2.txt"
input_file = "../test/EventFiles/201901-citibike-tripdata-1-fabricated.csv"

# Input stream over the CSV; "starttime" is passed as the timestamp column.
events = CitiBikeDataFrameInputStream(input_file, timestamp_column="starttime")

In [None]:
# Output stream built from the demo "testing" path and `output_file`
# (presumably directory + file name -- confirm against FileOutputStream).
match_sink = FileOutputStream("../test/demo/Matches/testing", output_file)
trip_formatter = CitiBikeDataFormatter()

# Kept as the cell's final expression so the notebook displays the returned value.
cep.run(events, match_sink, trip_formatter)

Starting CEP evaluation...
zad unit
starttime 186641.458923
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
zad unit
Using generic file input stream processing
Using optimized DataFrame input stream processing
Playing new event on tree: {'tripduration': '320', 'starttime': 2019-01-01 00:01:47.401000, 'stoptime': 2019-01-01 00:07:07.581000, 'startstationid': '3160', 'endstationid': '3283', 'bikeid': '15839'}, trying to find matchesPlaying new event on tree: {'tripduration': '316', 'starttime': 2019-01-01 00:04:43.736000, 'stoptime': 2019-01-01 00:10:00.608000, 'startstationid': '519', 'endstationid': '518', 'bikeid': '32723'}, trying to find matches
we are updating event count 0
Playin