This Jupyter notebook processes an event log. For every event it adds the event's position within its case, plus the activity and the timestamp of the next event in the same case. Finally, it saves the dataframe as a CSV file for further steps.

### Importing libraries

In [None]:
import pandas as pd
import numpy as np
import pm4py
import os

### Processing the file

In [None]:
# Relative path of the XES event log to process.
# Change the path to your desired file path
unprocessed_file_path = "data/extracted/BPI_Challenge_2012.xes"
# Read the XES file with pm4py, then convert the result to a dataframe.
log = pm4py.read_xes(unprocessed_file_path)
df = pm4py.convert_to_dataframe(log)
# Show the first rows as the cell output to sanity-check the import.
df.head()

In [None]:
# Parse time:timestamp into datetimes. Two layouts are tried: one with
# fractional seconds (%f) and one without. errors='coerce' turns values that
# do not match a layout into NaT, so each row is taken from whichever parse
# succeeded (rows matching neither layout stay NaT).
raw_timestamps = df['time:timestamp']
with_nanosec, without_nanosec = (
    pd.to_datetime(raw_timestamps, errors='coerce', format=layout)
    for layout in ('%Y-%m-%d %H:%M:%S.%f%z', '%Y-%m-%d %H:%M:%S%z')
)
df['time:timestamp'] = with_nanosec.fillna(without_nanosec)

In [None]:
# Sort all events chronologically. A stable algorithm is requested explicitly:
# the default (quicksort) may reorder rows whose timestamps are equal, which
# would scramble the recorded order of simultaneous events and, with it, any
# per-case ordering derived from the sorted dataframe.
df = df.sort_values('time:timestamp', kind='mergesort')

In [None]:
# Give the dataframe a fresh 0..n-1 index that follows the current row order;
# the previous index values are discarded rather than kept as a column.
df = df.set_axis(pd.RangeIndex(len(df)), axis=0)

### Adding the necessary columns

In [None]:
# Group the events by case once; all three derived columns use this grouping.
case_groups = df.groupby('case:concept:name')

# position: 1-based rank of the event within its case, in current row order.
position = case_groups.cumcount() + 1

# next_concept:name: activity of the following event in the same case.
# The last event of a case has no successor and is filled with 'No_Activity'.
next_activity = case_groups['concept:name'].shift(-1).fillna('No_Activity')

# next_timestamp: timestamp of the following event in the same case.
# shift() already leaves the last event of each case as a missing value,
# so no additional fillna is needed.
next_timestamp = case_groups['time:timestamp'].shift(-1)

# Attach the derived columns in the same order as before.
df['position'] = position
df['next_concept:name'] = next_activity
df['next_timestamp'] = next_timestamp

In [None]:
# Spot check: display every event of one case so the position and
# next-activity values can be inspected by eye.
example_case_rows = df['case:concept:name'].eq('173688')
df.loc[example_case_rows]

### Saving the csv file

In [None]:
# Directory that will receive the preprocessed CSV.
save_path = 'data/preprocessed/'

# Create the directory (including missing parents) when it is absent.
# exist_ok=True keeps the call from failing if the directory appears between
# the check and the creation. If the path exists but is not a directory,
# makedirs raises here instead of the problem surfacing later at write time.
directory_was_missing = not os.path.isdir(save_path)
os.makedirs(save_path, exist_ok=True)
if directory_was_missing:
    print(f"Directory {save_path} created.")

In [None]:
# Change the file name to your desired file name
file_name = 'BPI_Challenge_2012.csv'

# Build the target path with os.path.join so it is correct whether or not
# save_path ends with a path separator, then write the dataframe to CSV
# without the index column.
df.to_csv(os.path.join(save_path, file_name), index=False)