In [1]:
##############################
####  Importing XES files  ###
##############################
import os
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.util import constants
import pandas as pd
from pm4py.objects.log.util import dataframe_utils
from pm4py.objects.conversion.log import converter as filtered_log_converter
from pm4py.objects.conversion.log import converter as log_converter

In [None]:
log = xes_importer.apply('./data/BPI_Challenge_2019.xes')
print('Number of events:', len(log))

  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 251734/251734 [04:09<00:00, 1010.32it/s]

Number of events: 251734





: 

In [None]:
print('Number of cases:', len(log))
print('Number of events:', len([i for case in log._list for i in case._list]))
print('Event attributes (columns) in the dataset:')
for attr in pm4py.get_event_attributes(log):
    print(f"- {attr}")

print("Start Time: ", min(pm4py.get_event_attribute_values(log, "time:timestamp")))
print("End Time: ",max(pm4py.get_event_attribute_values(log, "time:timestamp")))

dataframe = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)
print(dataframe.columns)
dataframe

Number of cases: 251734


Number of events: 1595923
Event attributes (columns) in the dataset:
- time:timestamp
- concept:name
- User
- Cumulative net worth (EUR)
- org:resource
Start Time:  1948-01-26 22:59:00+00:00
End Time:  2020-04-09 21:59:00+00:00


In [None]:
pm4py.get_event_attributes(log)

: 

### Filter for all cases that have started after 01012018 and have started with create purchase order item.

In [None]:
# Filter for cases that start with "create purchase order item"
filtered_log = pm4py.filter_start_activities(log, ["Create Purchase Order Item"], retain=True)

# Further filter for cases starting after January 1, 2018
filtered_log = pm4py.filter_time_range(
    filtered_log,
    "2018-01-01 00:00:00",
    "2025-05-15 00:00:00",
    mode="traces_contained",
    case_id_key="concept:name",
    timestamp_key="time:timestamp"
)

# show coverage of the filtered log
print(f"Number of cases before filtering: {len(log)}")
print(f"Number of cases after filtering: {len(filtered_log)}")
print(f"Number of events before filtering: {len([i for case in log._list for i in case._list])}")
print(f"Number of events after filtering: {len([i for case in filtered_log._list for i in case._list])}")

Number of cases before filtering: 251734
Number of cases after filtering: 199867
Number of events before filtering: 1595923
Number of events after filtering: 1264983


In [None]:
print(min(pm4py.get_event_attribute_values(filtered_log, "time:timestamp")))
print(max(pm4py.get_event_attribute_values(filtered_log, "time:timestamp")))

2018-01-01 02:42:00+00:00
2020-04-09 21:59:00+00:00


### Filter all cases within the timeframe and bucket based on item category

In [None]:
# Define the groups we want to filter
item_categories = {
    "group_3_way_matching_after_GR": ["3-way match, invoice before GR"],
    "group_3_way_matching_before_GR": ["3-way match, invoice after GR"],
    "group_3_way_matching": ["2-way match"],
    "group_consignment": ["Consignment"]
}

# Track cases that have been grouped
grouped_case_ids = set()
total_cases_before = len(filtered_log)
total_events_before = len([i for case in filtered_log._list for i in case._list])

# Process each group one by one
for group_name, category_values in item_categories.items():
    # Filter the log
    group = pm4py.filter_event_attribute_values(filtered_log, "case:Item Category", category_values, level="case", retain=True)
    
    # Print stats
    print(f"Number of cases in {group_name}: {len(group)}")
    
    # Export to XES file
    output_path = f"./data/{group_name}.xes"
    pm4py.write_xes(group, output_path)
    print(f"Exported {group_name} to {output_path}")
    
    # Add case IDs to the grouped set
    for case in group:
        grouped_case_ids.add(case.attributes["concept:name"])
    
    # Clear the variable to free memory
    del group

# Filter out grouped cases to get "other" cases
group_other = pm4py.filter_log(lambda case: case.attributes["concept:name"] not in grouped_case_ids, filtered_log)
print(f"Number of 'other' cases: {len(group_other)}")

# Export other group
pm4py.write_xes(group_other, "./data/group_other.xes")
print("Exported group_other to ./data/group_other.xes")

# Show overall coverage
total_grouped_cases = len(grouped_case_ids) + len(group_other)
print(f"\nSummary:")
print(f"Number of cases before filtering: {total_cases_before}")
print(f"Number of cases processed in all groups: {total_grouped_cases}")
print(f"Number of events before filtering: {total_events_before}")

Number of cases in group_3_way_matching_after_GR: 0


exporting log, completed traces :: : 0it [00:00, ?it/s]

Exported group_3_way_matching_after_GR to ./data/group_3_way_matching_after_GR.xes





Number of cases in group_3_way_matching_before_GR: 0


exporting log, completed traces :: : 0it [00:00, ?it/s]

Exported group_3_way_matching_before_GR to ./data/group_3_way_matching_before_GR.xes





Number of cases in group_3_way_matching: 0


exporting log, completed traces :: : 0it [00:00, ?it/s]

Exported group_3_way_matching to ./data/group_3_way_matching.xes





Number of cases in group_consignment: 0


exporting log, completed traces :: : 0it [00:00, ?it/s]

Exported group_consignment to ./data/group_consignment.xes





AttributeError: module 'pm4py' has no attribute 'filter_log'