This Jupyter notebook processes event-log data: for each event it adds the event's position within its case, together with the activity and the timestamp of the next event in the same case. Finally, it saves the dataframe as a CSV file so that it is more convenient to use in later steps.

### Importing libraries

In [2]:
import pandas as pd
import numpy as np
import pm4py
import os

### Processing the file

In [3]:
# Path to the raw XES event log; change it to your desired file path.
unprocessed_file_path = "data/extracted/BPI_Challenge_2018.xes"
# Read the XES file with pm4py and convert the result to a pandas DataFrame.
log = pm4py.read_xes(unprocessed_file_path)
df = pm4py.convert_to_dataframe(log)
# Preview the first rows of the dataframe.
df.head()



parsing log, completed traces ::   0%|          | 0/43809 [00:00<?, ?it/s]

Unnamed: 0,success,org:resource,docid_uuid,doctype,subprocess,docid,activity,note,eventid,identity:id,...,case:concept:name,case:penalty_amount1,case:payment_actual1,case:amount_applied1,case:penalty_amount2,case:payment_actual2,case:amount_applied2,case:penalty_amount3,case:payment_actual3,case:amount_applied3
0,True,0;n/a,CD3DC291-76C6-420A-B3F1-7C808970915B,Payment application,Application,-18008611495569447,mail income,none,,510B5333-731A-40FD-B7D6-FC149E50E961,...,8b99873a6136cfa6,,,,,,,,,
1,True,0;n/a,CD3DC291-76C6-420A-B3F1-7C808970915B,Payment application,Application,-18008611495569447,mail valid,none,,F1DD45EF-80BF-46A5-97D6-CC5886DD2D23,...,8b99873a6136cfa6,,,,,,,,,
2,True,0;n/a,7CB69360-6D10-426F-A426-DDE3E24E4334,Entitlement application,Main,-18008615298673397,mail valid,none,,AA02CA32-D021-4264-A7CB-660A9D603EFC,...,8b99873a6136cfa6,,,,,,,,,
3,True,0;n/a,7CB69360-6D10-426F-A426-DDE3E24E4334,Entitlement application,Main,-18008615298673397,mail valid,none,,097D1E41-3CDB-4652-ABF1-EAEFC0410FA0,...,8b99873a6136cfa6,,,,,,,,,
4,True,fb5fa8,CCBAA174-CDD7-4D32-892E-F14197C65B8A,Parcel document,Main,-72051858488795160,initialize,none,-7.205185848879516e+16,96CBE6E6-9774-4DF8-842B-073F4FDCE2B8,...,8b99873a6136cfa6,,,,,,,,,


In [None]:
# Parse time:timestamp into datetime values. Two layouts are tried -- one with a
# fractional-seconds part and one without -- and any value that does not match a
# layout becomes NaT for that attempt (errors='coerce').
raw_timestamps = df['time:timestamp']
timestamp_formats = ('%Y-%m-%d %H:%M:%S.%f%z', '%Y-%m-%d %H:%M:%S%z')
with_nanosec, without_nanosec = (
    pd.to_datetime(raw_timestamps, errors='coerce', format=layout)
    for layout in timestamp_formats
)
# Prefer the fractional-seconds parse; fall back to the whole-seconds parse
# wherever the first attempt produced a missing value.
df['time:timestamp'] = with_nanosec.fillna(without_nanosec)

In [None]:
# Sort the events chronologically on time:timestamp.
# A stable algorithm is requested explicitly: the default (quicksort) is not
# stable, so rows that share the same timestamp could be reordered relative to
# each other. The position / next-activity / next-timestamp columns are derived
# from row order within a case, so ties must keep their original order.
df = df.sort_values('time:timestamp', kind='mergesort')

In [None]:
# Restart the index: replace it with a fresh default integer index and discard
# the old index values (drop=True) instead of keeping them as a column.
df = df.reset_index(drop=True)

### Adding the necessary columns

In [4]:
# Adding the position to the dataframe: the 1-based running count of each event
# within its case (case:concept:name), following the current row order.
df['position'] = df.groupby('case:concept:name').cumcount() + 1

# Adding the next activity (concept:name) within the same case to the dataframe.
# If there is no next activity (last event of a case), it is filled in with 'No_Activity'.
# NOTE: despite its name, this column holds the next *activity*, not a case id;
# the column name is kept unchanged so existing consumers of the CSV keep working.
df['next_case:concept:name'] = df.groupby('case:concept:name')['concept:name'].shift(-1).fillna('No_Activity')

# Adding the next timestamp within the same case to the dataframe.
# shift(-1) already leaves a missing value where there is no next event,
# so no additional fill is needed.
df['next_timestamp'] = df.groupby('case:concept:name')['time:timestamp'].shift(-1)

In [15]:
# Spot-check a single case: show its first rows so the position and
# next-activity columns can be inspected by eye.
example_case_id = '8b99873a6136cfa6'
is_example_case = df['case:concept:name'].eq(example_case_id)
df.loc[is_example_case].head()

Unnamed: 0,success,org:resource,docid_uuid,doctype,subprocess,docid,activity,note,eventid,identity:id,...,case:amount_applied1,case:penalty_amount2,case:payment_actual2,case:amount_applied2,case:penalty_amount3,case:payment_actual3,case:amount_applied3,position,next_case:concept:name,next_timestamp
0,True,0;n/a,CD3DC291-76C6-420A-B3F1-7C808970915B,Payment application,Application,-18008611495569447,mail income,none,,510B5333-731A-40FD-B7D6-FC149E50E961,...,,,,,,,,1,mail valid,2015-05-08 00:00:00+00:00
1,True,0;n/a,CD3DC291-76C6-420A-B3F1-7C808970915B,Payment application,Application,-18008611495569447,mail valid,none,,F1DD45EF-80BF-46A5-97D6-CC5886DD2D23,...,,,,,,,,2,mail valid,2015-05-08 00:00:00+00:00
2,True,0;n/a,7CB69360-6D10-426F-A426-DDE3E24E4334,Entitlement application,Main,-18008615298673397,mail valid,none,,AA02CA32-D021-4264-A7CB-660A9D603EFC,...,,,,,,,,3,mail valid,2015-05-08 00:00:00+00:00
3,True,0;n/a,7CB69360-6D10-426F-A426-DDE3E24E4334,Entitlement application,Main,-18008615298673397,mail valid,none,,097D1E41-3CDB-4652-ABF1-EAEFC0410FA0,...,,,,,,,,4,initialize,2015-06-10 11:16:28+00:00
4,True,fb5fa8,CCBAA174-CDD7-4D32-892E-F14197C65B8A,Parcel document,Main,-72051858488795160,initialize,none,-7.205185848879516e+16,96CBE6E6-9774-4DF8-842B-073F4FDCE2B8,...,,,,,,,,5,initialize,2015-06-10 11:16:29+00:00


### Saving the csv file

In [None]:
# Directory the preprocessed CSV is written to.
save_path = 'data/preprocessed/'
# Remember whether the directory was already there so the message below is
# printed only when it is actually created.
directory_existed = os.path.isdir(save_path)
# exist_ok=True keeps the call safe even if the directory appears between the
# check above and the creation; intermediate directories are created as needed.
os.makedirs(save_path, exist_ok=True)
if not directory_existed:
    print(f"Directory {save_path} created.")

In [None]:
# Change the file name to your desired file name
file_name = 'BPI_Challenge_2018.csv'

# Saving the dataframe to a csv file (without the index column).
# os.path.join builds the target path so it stays valid whether or not
# save_path ends with a path separator.
df.to_csv(os.path.join(save_path, file_name), index=False)