In [8]:
# Put the OpenCEP project root on sys.path so that absolute imports such as
# `from base.Pattern import Pattern` resolve when run from this notebook.
import os
import pathlib
import sys

# `__file__` is only defined when this code runs as a script/module; in that
# case use the file's directory, otherwise fall back to the working directory.
if "__file__" in globals():
    nb_dir = pathlib.Path(__file__).parent
else:
    nb_dir = pathlib.Path.cwd()

# The project root is the resolved parent of the notebook directory.
project_root = str(nb_dir.joinpath("..").resolve())

# Prepend only once so that re-running the cell does not duplicate the entry.
if project_root not in sys.path:
    sys.path.insert(0, project_root)

In [9]:
from datetime import timedelta
from CEP import CEP
from base.Pattern import Pattern
from base.PatternStructure import (
    SeqOperator,
    PrimitiveEventStructure,
    KleeneClosureOperator,
)
from condition.CompositeCondition import AndCondition
from condition.Condition import Variable, SimpleCondition
from condition.KCCondition import KCIndexCondition
from stream.FileStream import FileOutputStream
from stream.DataFrameStream import CitiBikeDataFrameInputStream
from plugin.citibike.CitiBike2 import (
    CitiBikeTripEventTypeClassifier,
    CitiBikeDataFormatter,
)
import test

In [None]:
# --- Event structure -------------------------------------------------------
# A Kleene closure over "CitiBikeTrip" events named `a` (max_size=5), followed
# in sequence by a single "CitiBikeTrip" event named `b`.
hot_path_structure = SeqOperator(
    KleeneClosureOperator(
        PrimitiveEventStructure("CitiBikeTrip", "a"),
        max_size=5,
    ),
    PrimitiveEventStructure("CitiBikeTrip", "b"),
)

# --- Conditions on the members of `a` --------------------------------------
# NOTE(review): offset=-1 presumably pairs each closure member with its
# predecessor -- confirm against KCIndexCondition before relying on this.

# The two compared members must carry the same "bikeid".
same_bike_within_a = KCIndexCondition(
    names={"a"},
    getattr_func=lambda trip: trip["bikeid"],
    relation_op=lambda first, second: first == second,
    offset=-1,
)

# Each member is mapped to an (start station, end station) tuple of ints; the
# first tuple's start station must equal the second tuple's end station.
chained_stations_within_a = KCIndexCondition(
    names={"a"},
    getattr_func=lambda trip: (
        int(float(trip["startstationid"])),
        int(float(trip["endstationid"])),
    ),
    relation_op=lambda first, second: first[0] == second[1],
    offset=-1,
)

# --- Conditions linking `a` to `b` -----------------------------------------
# `a` is indexed with [-1] here, i.e. its last element is compared with `b`.

# Last element of `a` and `b` must have the same "bikeid".
same_bike_a_to_b = SimpleCondition(
    Variable("a", lambda trips: trips[-1]["bikeid"]),
    Variable("b", lambda trip: trip["bikeid"]),
    relation_op=lambda a_bike, b_bike: a_bike == b_bike,
)

# End station of the last element of `a` must equal the start station of `b`.
a_ends_where_b_starts = SimpleCondition(
    Variable("a", lambda trips: int(float(trips[-1]["endstationid"]))),
    Variable("b", lambda trip: int(float(trip["startstationid"]))),
    relation_op=lambda a_end, b_start: a_end == b_start,
)

# --- Condition on `b` alone ------------------------------------------------
# The end station id of `b`, rendered as a string, must be "490".
b_ends_at_target_station = SimpleCondition(
    Variable("b", lambda trip: int(float(trip["endstationid"]))),
    relation_op=lambda station_id: str(station_id) in {"490"},
)

# All five conditions are combined with AndCondition, in the original order.
hot_path_condition = AndCondition(
    same_bike_within_a,
    chained_stations_within_a,
    same_bike_a_to_b,
    a_ends_where_b_starts,
    b_ends_at_target_station,
)

# Final pattern: structure + condition + a 61-minute timedelta as third argument.
citibikeHotPathsPattern = Pattern(
    hot_path_structure,
    hot_path_condition,
    timedelta(minutes=61),
)

In [11]:
"""def create_sample_preprocessor():
return CitiBikeDataFrameInputStream.create_citibike_preprocessor(
    bike_ids_filter={5206, 5215, 5220}  # Only include these bike IDs
)
"""

# Trip-data CSV, addressed relative to the notebook's working directory.
trip_data_csv = "../test/EventFiles/201901-citibike-tripdata-1-med.csv"

# Build the input stream; "starttime" is passed as the timestamp column.
events = CitiBikeDataFrameInputStream(
    trip_data_csv,
    timestamp_column="starttime",
    # preprocessor=create_sample_preprocessor()
)

# Sanity check: show the first five rows of the stream's dataframe.
preview_rows = events.dataframe[0:5]
print(preview_rows)

  tripduration               starttime                stoptime startstationid  \
0          320 2019-01-01 00:01:47.401 2019-01-01 00:07:07.581           3160   
1          316 2019-01-01 00:04:43.736 2019-01-01 00:10:00.608            519   
2          591 2019-01-01 00:06:03.997 2019-01-01 00:15:55.438           3171   
3         2719 2019-01-01 00:07:03.545 2019-01-01 00:52:22.650            504   
4          303 2019-01-01 00:07:35.945 2019-01-01 00:12:39.502            229   

  endstationid bikeid  
0         3283  15839  
1          518  32723  
2         3154  27451  
3         3709  21579  
4          503  35379  


In [None]:
from parallel.ParallelExecutionParameters import DataParallelExecutionParametersHirzelAlgorithm
from parallel.ParallelExecutionPlatforms import ParallelExecutionPlatforms

# Tunables for the data-parallel (Hirzel algorithm) execution parameters.
WORKER_THREADS = 8  # how many threads
# NOTE(review): presumably the event attribute used to partition the stream
# across units -- confirm against DataParallelExecutionParametersHirzelAlgorithm.
PARTITION_KEY = "bikeid"

dp_params = DataParallelExecutionParametersHirzelAlgorithm(
    platform=ParallelExecutionPlatforms.THREADING,
    units_number=WORKER_THREADS,
    key=PARTITION_KEY,
)

'dp_params = DataParallelExecutionParametersHirzelAlgorithm(\n    platform=ParallelExecutionPlatforms.THREADING,\n    units_number=8,  # how many threads\n    key="bikeid",\n)'

In [None]:
# Build the CEP engine for the single hot-paths pattern, handing it the
# parallel execution parameters.
# NOTE(review): the saved output of this cell reports "Parallel execution: None"
# and SEQUENTIAL mode -- re-run on a fresh kernel to confirm dp_params is honoured.
cep = CEP(
    [citibikeHotPathsPattern],
    parallel_execution_params=dp_params,
)

Creating evaluation manager...
 - Parallel execution: None
 - Storage: None
 - Using ParallelExecutionModes.SEQUENTIAL execution mode
!!! Using default evaluation mechanism parameters...
!!! Creating tree-based evaluation mechanism...
Tree calling create_storage_unit with storage_params: TreeStorageParameters(sort_storage=False, attributes_priorities={}, clean_up_interval=10, prioritize_sorting_by_timestamp=True, enable_load_shedding=False, load_shedding_threshold=1000, load_shedding_drop_rate=0.1, load_shedding_strategy=random)
InternalNode creating storage: sort=False, sorting_key=None
UnsortedPatternMatchStorage created with storage_params: TreeStorageParameters(sort_storage=False, attributes_priorities={}, clean_up_interval=10, prioritize_sorting_by_timestamp=True, enable_load_shedding=False, load_shedding_threshold=1000, load_shedding_drop_rate=0.1, load_shedding_strategy=random)
InternalNode creating storage: sort=False, sorting_key=None
UnsortedPatternMatchStorage created with s

In [None]:
# Destination for the results: a file output stream built from a directory and a file name.
matches_output = FileOutputStream("../test/demo/Matches", "output_citibike_hirtz.txt")

# Formatter passed to the engine alongside the event stream.
trip_formatter = CitiBikeDataFormatter()

# Run the engine over the loaded events; as the last expression of the cell,
# any non-None return value is displayed by the notebook.
cep.run(events, matches_output, trip_formatter)

Starting CEP evaluation...
Using optimized DataFrame input stream processing
Processing event in TreeBasedEvaluationMechanism: {'tripduration': '320', 'starttime': Timestamp('2019-01-01 00:01:47.401000'), 'stoptime': Timestamp('2019-01-01 00:07:07.581000'), 'startstationid': '3160', 'endstationid': '3283', 'bikeid': '15839'}
Playing new event on tree: {'tripduration': '320', 'starttime': 2019-01-01 00:01:47.401000, 'stoptime': 2019-01-01 00:07:07.581000, 'startstationid': '3160', 'endstationid': '3283', 'bikeid': '15839'}, trying to find matches
Playing new event. Event types listeners: {'CitiBikeTrip': [<tree.nodes.LeafNode.LeafNode object at 0x0000025ED2B3EF60>, <tree.nodes.LeafNode.LeafNode object at 0x0000025ED2B3F410>]}
SortedPatternMatchStorage.add() called Key: 2019-01-01 00:01:47.401000, length of partial matches: 0
Current events in pattern match: [{'tripduration': '320', 'starttime': 2019-01-01 00:01:47.401000, 'stoptime': 2019-01-01 00:07:07.581000, 'startstationid': '3160',