In [2]:
import pandas as pd
import pm4py


In [24]:
def import_csv(file_path):
    """Read a CSV event log, format it for pm4py and print a short summary.

    Parameters
    ----------
    file_path : str or path-like
        Comma-separated file that contains at least the columns
        'case concept:name', 'event concept:name' and 'event time:timestamp'.

    Returns
    -------
    pandas.DataFrame
        The dataframe returned by ``pm4py.format_dataframe``.
    """
    case_col = 'case concept:name'
    activity_col = 'event concept:name'
    time_col = "event time:timestamp"

    raw_log = pd.read_csv(file_path, sep=',')

    # Parse the time column into real timestamps before handing the frame
    # to pm4py (example value noted by the author: 2011-10-01T00:38:44.546+02:00).
    raw_log[time_col] = pd.to_datetime(raw_log[time_col])

    # Let pm4py mark the case id, activity and timestamp columns.
    event_log = pm4py.format_dataframe(
        raw_log,
        case_id=case_col,
        activity_key=activity_col,
        timestamp_key=time_col,
        timest_format='%Y-%m-%d %H:%M:%Sz',
    )

    # Report the activities that open and close the cases.
    start_activities = pm4py.get_start_activities(event_log)
    end_activities = pm4py.get_end_activities(event_log)
    print(f"Start activities: {start_activities}\n\nEnd activities: {end_activities}")

    # Report the size of the log: events (rows) and cases (distinct case ids).
    num_events = len(event_log)
    num_cases = len(event_log[case_col].unique())
    print(' ')
    print(f"Number of events: {num_events}\nNumber of cases: {num_cases}")

    return event_log

In [25]:
# CSV event log to load; the path is relative to the working directory.
file = "data/BPI_Challenge_2012-training2.csv"
event_log = import_csv(file_path=file)

Start activities: {'A_SUBMITTED': 10469}

End activities: {'A_DECLINED': 2690, 'W_Valideren aanvraag': 1953, 'W_Afhandelen leads': 1864, 'W_Completeren aanvraag': 1702, 'W_Nabellen offertes': 1286, 'A_CANCELLED': 433, 'W_Nabellen incomplete dossiers': 318, 'O_CANCELLED': 171, 'W_Beoordelen fraude': 49, 'W_Wijzigen contractgegevens': 2, 'A_ACTIVATED': 1}
 
Number of events: 199668
Number of cases: 10469


In [5]:
# Show the first and the last rows of the log, one preview after the other.
print(event_log.head(), event_log.tail(), sep='\n')

    Unnamed: 0  index  eventID   case concept:name  \
0            0      0         0             173688   
1            1      1         1             173688   
2            2      2         2             173688   
3            3      3         3             173688   
89          89     89         4             173688   

                       case REG_DATE  case AMOUNT_REQ      event concept:name  \
0   2011-10-01 00:38:44.546000+02:00            20000             A_SUBMITTED   
1   2011-10-01 00:38:44.546000+02:00            20000       A_PARTLYSUBMITTED   
2   2011-10-01 00:38:44.546000+02:00            20000           A_PREACCEPTED   
3   2011-10-01 00:38:44.546000+02:00            20000  W_Completeren aanvraag   
89  2011-10-01 00:38:44.546000+02:00            20000  W_Completeren aanvraag   

   event lifecycle:transition     event time:timestamp  check  \
0                    COMPLETE  2011-10-01 00:38:44.546  train   
1                    COMPLETE  2011-10-01 00:38:44.880  tr

In [6]:
# Which features would be good predictors?
# - the previous event
# - whether certain events have already taken place earlier in the sequence


In [None]:
# Order the events chronologically. A stable algorithm is requested because
# the default quicksort may reorder events that share the same timestamp,
# and the history features derived below depend on the event order.
event_log = event_log.sort_values('event time:timestamp', kind='mergesort')

def add_previous_events(instance):
    """Add history features to the events of one case.

    Parameters
    ----------
    instance : pandas.DataFrame
        Events of a single case, already sorted by timestamp. Must contain
        the column 'event concept:name'. The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A re-indexed copy of ``instance`` (the former index is kept as a
        column by ``reset_index()``) with these additional columns:

        * ``previous_events`` - the history of the case before the event,
          joined with single spaces and always beginning with the marker
          ``'start'``.
        * ``previous_event`` - the last entry of that history (``'start'``
          for the first event).
        * one indicator column per history entry (``'start'`` and every
          activity that precedes at least one event): ``1.0`` from the first
          row whose history contains the entry, ``NaN`` before that.

        An empty ``instance`` is returned re-indexed without extra columns.
    """
    instance = instance.reset_index()
    activities = instance['event concept:name'].tolist()
    if not activities:
        return instance
    n_events = len(activities)

    # Collect the feature values in plain lists and assign every column once;
    # writing single cells through .loc inside nested loops is quadratic.
    history_text = 'start'
    last_entry = 'start'
    previous_events = []
    previous_event = []
    # history entry -> first row whose history contains it
    # (dicts keep insertion order, which fixes the indicator column order)
    first_row_with = {'start': 0}
    for row_nr, activity in enumerate(activities):
        previous_events.append(history_text)
        previous_event.append(last_entry)
        history_text = history_text + ' ' + str(activity)
        last_entry = activity
        # The activity of the final row never becomes history for any row.
        if row_nr + 1 < n_events:
            first_row_with.setdefault(activity, row_nr + 1)

    instance['previous_events'] = previous_events
    instance['previous_event'] = previous_event
    # The history only grows, so each indicator is NaN up to the first row
    # that contains the entry and 1.0 from there on.
    for entry, first_row in first_row_with.items():
        instance[entry] = [float('nan')] * first_row + [1.0] * (n_events - first_row)

    return instance

# Case ids in order of first appearance in the (time-sorted) log.
instance_list = pd.unique(event_log['case concept:name'])

# Derive the history features case by case. groupby(sort=False) yields the
# cases in order of first appearance and keeps the row order inside each
# case, without scanning the whole log once per case id.
# The per-case frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and copied the accumulated
# frame on every call.
instance_frames = []
for instance_id, instance in event_log.groupby('case concept:name', sort=False):
    instance_frames.append(add_previous_events(instance))

# pd.concat raises on an empty list, so fall back to an empty frame.
new_event_log = pd.concat(instance_frames) if instance_frames else pd.DataFrame()

print(new_event_log)

In [None]:
# Persist the enriched event log next to the notebook.
new_event_log.to_csv(path_or_buf='added_features_event_log.csv')