In [1]:
import pandas as pd
import pm4py
from datetime import datetime

In [2]:
# Dataset-specific settings: column names and timestamp layout of the CSV event log
case_column = "case concept:name"  # column passed to pm4py as the case identifier
event_column = "event concept:name"  # column passed to pm4py as the activity name
timestamp_column = "event time:timestamp"  # column holding the event timestamp
timeformat_timestamp = "%d-%m-%Y %H:%M:%S.%f"  # strptime-style layout used to parse timestamp_column

In [3]:
def import_csv(file_path):
    """Load an event log from CSV and convert it into a pm4py-formatted dataframe.

    The timestamp column is parsed with ``timeformat_timestamp``, the rows are
    sorted by case and timestamp, and the frame is passed through
    ``pm4py.format_dataframe``. As a side effect the start/end activity counts
    and the number of events and cases are printed.

    Parameters
    ----------
    file_path : str or path-like
        Location of a comma-separated event log containing the columns named
        by ``case_column``, ``event_column`` and ``timestamp_column``.

    Returns
    -------
    pandas.DataFrame
        The event log as returned by ``pm4py.format_dataframe``.

    Raises
    ------
    ValueError
        If a timestamp does not match ``timeformat_timestamp``.
    """
    event_log = pd.read_csv(file_path, sep=',')

    # Parse the whole timestamp column in one vectorised call instead of
    # running datetime.strptime row by row in Python.
    event_log[timestamp_column] = pd.to_datetime(event_log[timestamp_column],
                                                 format=timeformat_timestamp)

    # Order the events per case and, within a case, chronologically.
    event_log = event_log.sort_values(by=[case_column, timestamp_column])

    # Change the format into a pm4py event log.
    # NOTE(review): the timestamp column is already datetime at this point, so
    # this timest_format does not describe it; presumably it only influences how
    # pm4py treats the remaining string columns (e.g. values such as
    # 2011-10-01T00:38:44.546+02:00) - confirm before changing it.
    event_log = pm4py.format_dataframe(event_log, case_id=case_column, activity_key=event_column,
                                       timestamp_key=timestamp_column, timest_format='%Y-%m-%d %H:%M:%Sz')

    # Report which activities open and close the cases.
    start_activities = pm4py.get_start_activities(event_log)
    end_activities = pm4py.get_end_activities(event_log)
    print("Start activities: {}\n\nEnd activities: {}".format(start_activities, end_activities))

    # Report how many events and how many instances (cases) there are.
    num_events = len(event_log)
    num_cases = len(event_log[case_column].unique())
    print(' ')
    print("Number of events: {}\nNumber of cases: {}".format(num_events, num_cases))

    return event_log

In [4]:
# Relative path of the CSV event log to load
file = "data/BPI_Challenge_2012-test.csv"

# Parse the file into a pm4py-formatted dataframe; import_csv also prints summary statistics
event_log = import_csv(file)

Start activities: {'A_SUBMITTED': 2618}

End activities: {'A_DECLINED': 739, 'W_Completeren aanvraag': 466, 'W_Valideren aanvraag': 454, 'W_Nabellen offertes': 389, 'W_Afhandelen leads': 388, 'W_Nabellen incomplete dossiers': 114, 'A_CANCELLED': 45, 'O_CANCELLED': 15, 'W_Beoordelen fraude': 8}
 
Number of events: 47823
Number of cases: 2618


In [5]:
# Preview the first and the last rows of the formatted event log
print(event_log.head())
print(event_log.tail())

         eventID   case concept:name                  case REG_DATE  \
0  44964012621824             206324  2012-02-03T17:17:11.047+01:00   
1  44964012621825             206324  2012-02-03T17:17:11.047+01:00   
2  44964012621826             206324  2012-02-03T17:17:11.047+01:00   
3  44968307589120             206327  2012-02-03T17:23:41.949+01:00   
4  44968307589121             206327  2012-02-03T17:23:41.949+01:00   

   case AMOUNT_REQ event concept:name event lifecycle:transition  \
0             2500        A_SUBMITTED                   COMPLETE   
1             2500  A_PARTLYSUBMITTED                   COMPLETE   
2             2500         A_DECLINED                   COMPLETE   
3             6000        A_SUBMITTED                   COMPLETE   
4             6000  A_PARTLYSUBMITTED                   COMPLETE   

     event time:timestamp case:concept:name       concept:name  \
0 2012-02-03 17:17:11.047            206324        A_SUBMITTED   
1 2012-02-03 17:17:11.323       

In [6]:
# Which features would be good predictors?
#   - the previous event
#   - whether certain events have already taken place earlier in the sequence

In [7]:
# Order the whole log chronologically before deriving the per-case history features.
# A stable sort keeps events that share a timestamp in their existing relative order;
# the default quicksort gives no such guarantee and could therefore scramble the
# "previous event" of simultaneous events within a case.
event_log = event_log.sort_values('event time:timestamp', kind='mergesort')

def add_previous_events(instance, activity_column='event concept:name',
                        lifecycle_column='event lifecycle:transition'):
    """Add history features to the events of a single case.

    The following columns are added, in this order:

    * ``previous_events``: space-separated names of all earlier events,
      prefixed by ``'start'``.
    * ``previous_event``: the directly preceding event (``'start'`` on the
      first row).
    * ``previous lifecycle``: lifecycle transition of the preceding event
      (``'start'`` on the first row).
    * ``start`` plus one indicator column per event name that precedes at
      least one row: ``1.0`` on every row after the first occurrence of that
      event and ``NaN`` elsewhere (``start`` is ``1.0`` on every row).

    Parameters
    ----------
    instance : pandas.DataFrame
        Events of one case, already sorted by timestamp.
    activity_column : str, optional
        Column holding the event name.
    lifecycle_column : str, optional
        Column holding the lifecycle transition of the event.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; ``reset_index()`` moves the
        previous index into a column. The input frame is not modified. An
        empty input is returned without any feature columns.

    Notes
    -----
    Event names are assumed not to collide with existing column names.
    """
    instance = instance.reset_index()
    n_events = len(instance)
    if n_events == 0:
        # Nothing to annotate: no feature columns are created for an empty case.
        return instance

    activities = instance[activity_column].tolist()
    lifecycles = instance[lifecycle_column].tolist()

    # Running, space-separated history of everything that happened before each row.
    history = ['start']
    prefixes = []
    for activity in activities:
        prefixes.append(' '.join(history))
        history.append(str(activity))

    # Whole columns are assigned at once instead of cell-by-cell .loc writes.
    instance['previous_events'] = prefixes
    instance['previous_event'] = ['start'] + activities[:-1]
    instance['previous lifecycle'] = ['start'] + lifecycles[:-1]

    # Position of the first occurrence of each event that precedes some row.
    # 'start' precedes every row, hence -1. Dict order fixes the column order.
    first_seen = {'start': -1}
    for position, activity in enumerate(activities[:-1]):
        first_seen.setdefault(activity, position)

    missing = float('nan')
    for name, first_position in first_seen.items():
        # An indicator is set only on rows strictly after the first occurrence.
        instance[name] = [1.0 if row > first_position else missing
                          for row in range(n_events)]

    return instance

# Build the feature-enriched log case by case.
# DataFrame.append was removed in pandas 2.0 and re-copies the accumulated frame on
# every call, so the per-case frames are collected in a list and concatenated once.
instance_list = pd.unique(event_log['case concept:name'])  # case ids in order of first appearance

annotated_instances = []
# sort=False keeps the groups in order of first appearance (the same order as
# instance_list), and groupby splits the log in one pass instead of one boolean
# scan of the whole log per case.
for instance_id, instance in event_log.groupby('case concept:name', sort=False):
    annotated_instances.append(add_previous_events(instance))

# pd.concat raises on an empty list, so fall back to an empty frame for an empty log.
new_event_log = pd.concat(annotated_instances) if annotated_instances else pd.DataFrame()

print(new_event_log)

    index        eventID   case concept:name                  case REG_DATE  \
0       0  44964012621824             206324  2012-02-03T17:17:11.047+01:00   
1       1  44964012621825             206324  2012-02-03T17:17:11.047+01:00   
2       2  44964012621826             206324  2012-02-03T17:17:11.047+01:00   
0       3  44968307589120             206327  2012-02-03T17:23:41.949+01:00   
1       4  44968307589121             206327  2012-02-03T17:23:41.949+01:00   
..    ...             ...                ...                            ...   
1   38039  56203942035457             214376  2012-02-29T23:51:16.799+01:00   
2   38040  56203942035458             214376  2012-02-29T23:51:16.799+01:00   
3   38096  56203942035459             214376  2012-02-29T23:51:16.799+01:00   
4   38099  56203942035460             214376  2012-02-29T23:51:16.799+01:00   
5   38100  56203942035461             214376  2012-02-29T23:51:16.799+01:00   

    case AMOUNT_REQ  event concept:name event lifec

In [7]:
# Persist the feature-enriched event log as a CSV file (path is relative to the working directory)
new_event_log.to_csv('added_features_event_log_test.csv')