In [7]:
# imports
import time
import timeit
import pandas as pd
import statistics
import numpy as np
from datetime import datetime
import re
from numpy import arange
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.metrics import accuracy_score

In [32]:
def prep_data(df):
    """Build the per-event feature table used to predict the time until the next event.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with the columns "case concept:name", "case REG_DATE"
        (string; anything from the first "." or "+" onwards is stripped before
        parsing as %Y-%m-%dT%H:%M:%S), "event concept:name",
        "event lifecycle:transition" and "event time:timestamp". The timestamp
        column must already hold datetimes, because the ``.dt`` accessor is
        used on it. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by case and timestamp, with these columns added:
        "index" (row number after sorting), "Time Since Registration",
        "day_of_week", "month_of_year", "minutes_since_reg", "hour",
        "position" (1-based rank of the event inside its case),
        "shifted_deltatimes" (seconds until the next event of the same case,
        floored at 0.001, NaN for the last event of a case), "previous_event",
        "previous_lifecycle" ("START" for the first event of a case), one 0/1
        column per distinct previous event telling whether it has occurred so
        far in the case, "last_event" (1 on the final event of each case) and
        "log_y" (natural log of "shifted_deltatimes").

    Prints the elapsed preparation time in minutes.
    """
    case_column = "case concept:name"
    registration_time_column = "case REG_DATE"
    event_column = "event concept:name"
    lifecycle_column = "event lifecycle:transition"
    timestamp_column = "event time:timestamp"
    time_since_registration_column = "Time Since Registration"
    timeformat_registration = "%Y-%m-%dT%H:%M:%S"
    start_time = timeit.default_timer()

    # Sort first: sort_values returns a new frame, so every assignment below
    # lands on that copy and re-running the function on the same input works.
    df = df.sort_values(by=[case_column, timestamp_column]).reset_index(drop=True)
    df = df.reset_index()  # keep the sorted row number as an explicit 'index' column

    # Registration time: drop fractional seconds / UTC offset, then parse.
    registration_text = [
        re.sub(r'\..*|\+.*', '', raw_value, flags=re.DOTALL)
        for raw_value in df[registration_time_column]
    ]
    df[registration_time_column] = pd.to_datetime(registration_text, format=timeformat_registration)

    # Calendar and elapsed-time features.
    since_registration = df[timestamp_column] - df[registration_time_column]
    df[time_since_registration_column] = since_registration
    df["day_of_week"] = df[timestamp_column].dt.dayofweek
    df["month_of_year"] = df[timestamp_column].dt.month
    # The tiny offset keeps the value strictly positive when the delta is zero.
    df["minutes_since_reg"] = (since_registration.dt.total_seconds() + 0.000001) / 60
    df["hour"] = df[timestamp_column].dt.hour

    by_case = df.groupby(case_column)
    df["position"] = by_case.cumcount() + 1

    # Seconds until the next event *of the same case*. Shifting inside the
    # group leaves the last event of every case as NaN instead of measuring the
    # gap to the first event of the following case.
    next_timestamp = by_case[timestamp_column].shift(-1)
    seconds_to_next = (next_timestamp - df[timestamp_column]).dt.total_seconds()
    # Floor at 1 ms so the log transform below stays finite; NaN is untouched.
    df["shifted_deltatimes"] = seconds_to_next.clip(lower=0.001)

    # Previous event / lifecycle inside the case; the first event gets 'START'.
    df["previous_event"] = by_case[event_column].shift(1).fillna("START")
    df["previous_lifecycle"] = by_case[lifecycle_column].shift(1).fillna("START")

    # One 0/1 column per previous event: 1 from the first row of the case where
    # that event is the previous event onwards (running maximum of the one-hot
    # encoding). Column order follows first appearance, as before.
    all_events = list(df["previous_event"].unique())
    one_hot = pd.get_dummies(df["previous_event"], dtype=int)
    happened_so_far = one_hot.groupby(df[case_column]).cummax()[all_events]
    df = pd.concat([df, happened_so_far], axis=1)

    # Rows of a case are contiguous after sorting, so the last occurrence of a
    # case id is the final event of that case.
    df["last_event"] = (~df[case_column].duplicated(keep="last")).astype(int)

    # print time and add log column of time until next event
    print((timeit.default_timer() - start_time) / 60)
    df["log_y"] = np.log(df["shifted_deltatimes"])
    return df

def prep_data_reg(df):
    """Prepare `df` with prep_data and keep only rows whose 'last_event' flag is 0.

    Returns a copy of the filtered frame.
    """
    prepared = prep_data(df)
    not_final_event = prepared['last_event'] == 0
    return prepared[not_final_event].copy()

def MAPE(Y_actual,Y_Predicted):
    """Mean absolute percentage error of the predictions, expressed in percent."""
    relative_error = (Y_actual - Y_Predicted) / Y_actual
    return np.mean(np.abs(relative_error)) * 100

def MAE(Y_actual,Y_Predicted):
    """Mean absolute error between actual and predicted values (same unit as the inputs)."""
    absolute_error = np.abs(Y_actual - Y_Predicted)
    return np.mean(absolute_error)

def split_train_data(df_train, df_test):
    """Cut the training log so it ends before the test period begins.

    The "event time:timestamp" column of BOTH frames is parsed in place from
    strings (format %d-%m-%Y %H:%M:%S.%f) to datetimes; this side effect is
    kept on purpose. The second half of `df_train` is then scanned for the
    first row whose timestamp is later than the earliest test timestamp, and
    everything before that row is returned.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Event logs with a string "event time:timestamp" column.
        NOTE(review): the scan only starts at the midpoint of `df_train`, so it
        presumably expects rows in roughly chronological order - confirm.

    Returns
    -------
    pandas.DataFrame
        Copy of the leading rows of `df_train`. If no scanned row is later than
        the earliest test timestamp, a copy of the whole frame is returned
        (previously this case raised UnboundLocalError).
    """
    timestamp_column = "event time:timestamp"
    timeformat_timestamp = "%d-%m-%Y %H:%M:%S.%f"
    for frame in (df_train, df_test):
        frame[timestamp_column] = pd.to_datetime(frame[timestamp_column], format=timeformat_timestamp)

    earliest_test_time = df_test[timestamp_column].min()
    scan_start = round(len(df_train) / 2)
    is_late = (df_train[timestamp_column].iloc[scan_start:] > earliest_test_time).to_numpy()
    if not is_late.any():
        # Nothing in the scanned part overlaps the test period: keep everything.
        return df_train.copy()

    cut = scan_start + int(is_late.argmax())  # argmax gives the first True
    print(df_train[timestamp_column].iloc[cut])
    return df_train.iloc[:cut].copy()

def prep_data2(df):
    """Build the per-event feature table with a one-hot encoding of the previous event.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with the columns "case concept:name", "case REG_DATE"
        (string; anything from the first "." or "+" onwards is stripped before
        parsing as %Y-%m-%dT%H:%M:%S), "event concept:name",
        "event lifecycle:transition" and "event time:timestamp". The timestamp
        column must already hold datetimes, because the ``.dt`` accessor is
        used on it. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by case and timestamp, with these columns added:
        "index" (row number after sorting), "Time Since Registration",
        "day_of_week", "month_of_year", "minutes_since_reg", "hour",
        "position" (1-based rank of the event inside its case),
        "shifted_deltatimes" (seconds until the next event of the same case,
        floored at 0.001, NaN for the last event of a case), "previous_event",
        "previous_lifecycle" ("START" for the first event of a case),
        "last_event" (1 on the final event of each case), "log_y" (natural log
        of "shifted_deltatimes") and one 0/1 indicator column per distinct
        value of "previous_event".

    Prints the elapsed preparation time in minutes.
    """
    case_column = "case concept:name"
    registration_time_column = "case REG_DATE"
    event_column = "event concept:name"
    lifecycle_column = "event lifecycle:transition"
    timestamp_column = "event time:timestamp"
    time_since_registration_column = "Time Since Registration"
    timeformat_registration = "%Y-%m-%dT%H:%M:%S"
    start_time = timeit.default_timer()

    # Sort first: sort_values returns a new frame, so every assignment below
    # lands on that copy and re-running the function on the same input works.
    df = df.sort_values(by=[case_column, timestamp_column]).reset_index(drop=True)
    df = df.reset_index()  # keep the sorted row number as an explicit 'index' column

    # Registration time: drop fractional seconds / UTC offset, then parse.
    registration_text = [
        re.sub(r'\..*|\+.*', '', raw_value, flags=re.DOTALL)
        for raw_value in df[registration_time_column]
    ]
    df[registration_time_column] = pd.to_datetime(registration_text, format=timeformat_registration)

    # Calendar and elapsed-time features.
    since_registration = df[timestamp_column] - df[registration_time_column]
    df[time_since_registration_column] = since_registration
    df["day_of_week"] = df[timestamp_column].dt.dayofweek
    df["month_of_year"] = df[timestamp_column].dt.month
    # The tiny offset keeps the value strictly positive when the delta is zero.
    df["minutes_since_reg"] = (since_registration.dt.total_seconds() + 0.000001) / 60
    df["hour"] = df[timestamp_column].dt.hour

    by_case = df.groupby(case_column)
    df["position"] = by_case.cumcount() + 1

    # Seconds until the next event *of the same case*. Shifting inside the
    # group leaves the last event of every case as NaN instead of measuring the
    # gap to the first event of the following case.
    next_timestamp = by_case[timestamp_column].shift(-1)
    seconds_to_next = (next_timestamp - df[timestamp_column]).dt.total_seconds()
    # Floor at 1 ms so the log transform below stays finite; NaN is untouched.
    df["shifted_deltatimes"] = seconds_to_next.clip(lower=0.001)

    # Previous event / lifecycle inside the case; the first event gets 'START'.
    df["previous_event"] = by_case[event_column].shift(1).fillna("START")
    df["previous_lifecycle"] = by_case[lifecycle_column].shift(1).fillna("START")

    # Rows of a case are contiguous after sorting, so the last occurrence of a
    # case id is the final event of that case.
    df["last_event"] = (~df[case_column].duplicated(keep="last")).astype(int)

    # print time and add log column of time until next event
    print((timeit.default_timer() - start_time) / 60)
    df["log_y"] = np.log(df["shifted_deltatimes"])

    # dtype=int keeps the indicators as 0/1 integers on every pandas version
    # (pandas 2.0 switched the get_dummies default to bool).
    dummies = pd.get_dummies(df["previous_event"], dtype=int)
    return pd.concat([df, dummies], axis=1)

def prep_data_reg2(df):
    """Prepare `df` with prep_data2 and keep only rows whose 'last_event' flag is 0.

    Returns a copy of the filtered frame.
    """
    prepared = prep_data2(df)
    not_final_event = prepared['last_event'] == 0
    return prepared[not_final_event].copy()

In [33]:
# Load the BPI Challenge 2012 training and test event logs; the paths are
# relative to the directory the notebook is run from.
df = pd.read_csv('data/BPI_Challenge_2012-training.csv')
df_t = pd.read_csv('data/BPI_Challenge_2012-test.csv')

In [34]:
# split_train_data parses "event time:timestamp" to datetimes in place on both
# df and df_t. prep_data2 depends on that (its own timestamp parsing is
# commented out), so this call has to run before the two prep calls below.
df_split = split_train_data(df,df_t)
# Regression tables: prep_data_reg2 drops the rows flagged last_event == 1.
df_train = prep_data_reg2(df_split)
df_test = prep_data_reg2(df_t)

2012-02-03 17:18:54.290000
1.7853828316666902
0.21400695833332672


In [36]:
# Inspect the prepared training table (the rich display truncates rows and columns).
df_train

Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Time Since Registration,day_of_week,...,O_SENT,O_SENT_BACK,START,W_Afhandelen leads,W_Beoordelen fraude,W_Completeren aanvraag,W_Nabellen incomplete dossiers,W_Nabellen offertes,W_Valideren aanvraag,W_Wijzigen contractgegevens
0,0,0,173688,2011-10-01 00:38:44,20000,A_SUBMITTED,COMPLETE,2011-10-01 00:38:44.546,0 days 00:00:00.546000,5,...,0,0,1,0,0,0,0,0,0,0
1,1,1,173688,2011-10-01 00:38:44,20000,A_PARTLYSUBMITTED,COMPLETE,2011-10-01 00:38:44.880,0 days 00:00:00.880000,5,...,0,0,0,0,0,0,0,0,0,0
2,2,2,173688,2011-10-01 00:38:44,20000,A_PREACCEPTED,COMPLETE,2011-10-01 00:39:37.906,0 days 00:00:53.906000,5,...,0,0,0,0,0,0,0,0,0,0
3,3,3,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,SCHEDULE,2011-10-01 00:39:38.875,0 days 00:00:54.875000,5,...,0,0,0,0,0,0,0,0,0,0
4,4,4,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,START,2011-10-01 11:36:46.437,0 days 10:58:02.437000,5,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199660,199660,44951127719937,206315,2012-02-03 17:04:58,7000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:04:58.813,0 days 00:00:00.813000,4,...,0,0,0,0,0,0,0,0,0,0
199662,199662,44955422687232,206318,2012-02-03 17:07:38,5000,A_SUBMITTED,COMPLETE,2012-02-03 17:07:38.334,0 days 00:00:00.334000,4,...,0,0,1,0,0,0,0,0,0,0
199663,199663,44955422687233,206318,2012-02-03 17:07:38,5000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:07:38.843,0 days 00:00:00.843000,4,...,0,0,0,0,0,0,0,0,0,0
199665,199665,44959717654528,206321,2012-02-03 17:08:39,2000,A_SUBMITTED,COMPLETE,2012-02-03 17:08:39.200,0 days 00:00:00.200000,4,...,0,0,1,0,0,0,0,0,0,0


In [37]:
# Single definition of the model inputs so the train and test matrices cannot
# drift apart. The order matters: model.coef_ follows it.
feature_columns = [
    'day_of_week', 'hour', 'month_of_year', 'position', 'minutes_since_reg',
    'case AMOUNT_REQ', 'START', 'A_SUBMITTED', 'A_PARTLYSUBMITTED',
    'A_PREACCEPTED', 'W_Completeren aanvraag', 'A_DECLINED', 'A_ACCEPTED',
    'A_FINALIZED', 'O_SELECTED', 'O_CREATED', 'O_SENT', 'W_Nabellen offertes',
    'O_CANCELLED', 'W_Afhandelen leads', 'A_CANCELLED', 'O_SENT_BACK',
    'W_Valideren aanvraag', 'O_ACCEPTED', 'A_APPROVED', 'A_REGISTERED',
    'A_ACTIVATED', 'O_DECLINED', 'W_Nabellen incomplete dossiers',
    'W_Beoordelen fraude',
]

# Strict selection on the training side: a misspelt or missing column raises.
X = df_train[feature_columns]
y = df_train['log_y']

cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

# Ridge regression on the log target; alpha is picked by cross-validated MAE.
model = RidgeCV(alphas=arange(0, 1, 0.01), cv=cv, scoring='neg_mean_absolute_error')

model.fit(X, y)

print(f'Alpha: {model.alpha_}')

# An indicator column that never occurs in the test split is filled with 0
# instead of raising KeyError; with all columns present this is a plain select.
X_test = df_test.reindex(columns=feature_columns, fill_value=0)
Y_test = df_test['shifted_deltatimes']
# The model predicts log-seconds; exponentiate to compare in seconds.
ridge_predict = np.exp(model.predict(X_test))
ridge_MAPE = MAPE(Y_test,ridge_predict)
ridge_MAE = MAE(Y_test,ridge_predict)
# R2 is computed on the log scale, i.e. the scale the model was fitted on.
r2 = model.score(X_test,df_test['log_y'])
print(f'MAPE: {ridge_MAPE}')
print(f'MAE: {ridge_MAE/3600} hours')
print(f'R2: {r2}')

Alpha: 0.03
MAPE: 701496.1587222911
MAE: 9.530618055932312 hours
R2: 0.4348927550698244


In [None]:
# NOTE(review): this cell repeats the ridge fit and evaluation of the In [37]
# cell and has no recorded execution; consider removing one of the two.

# Single definition of the model inputs so the train and test matrices cannot
# drift apart. The order matters: model.coef_ follows it.
feature_columns = [
    'day_of_week', 'hour', 'month_of_year', 'position', 'minutes_since_reg',
    'case AMOUNT_REQ', 'START', 'A_SUBMITTED', 'A_PARTLYSUBMITTED',
    'A_PREACCEPTED', 'W_Completeren aanvraag', 'A_DECLINED', 'A_ACCEPTED',
    'A_FINALIZED', 'O_SELECTED', 'O_CREATED', 'O_SENT', 'W_Nabellen offertes',
    'O_CANCELLED', 'W_Afhandelen leads', 'A_CANCELLED', 'O_SENT_BACK',
    'W_Valideren aanvraag', 'O_ACCEPTED', 'A_APPROVED', 'A_REGISTERED',
    'A_ACTIVATED', 'O_DECLINED', 'W_Nabellen incomplete dossiers',
    'W_Beoordelen fraude',
]

# Strict selection on the training side: a misspelt or missing column raises.
X = df_train[feature_columns]
y = df_train['log_y']

cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

# Ridge regression on the log target; alpha is picked by cross-validated MAE.
model = RidgeCV(alphas=arange(0, 1, 0.01), cv=cv, scoring='neg_mean_absolute_error')

model.fit(X, y)

print(f'Alpha: {model.alpha_}')

# An indicator column that never occurs in the test split is filled with 0
# instead of raising KeyError; with all columns present this is a plain select.
X_test = df_test.reindex(columns=feature_columns, fill_value=0)
Y_test = df_test['shifted_deltatimes']
# The model predicts log-seconds; exponentiate to compare in seconds.
ridge_predict = np.exp(model.predict(X_test))
ridge_MAPE = MAPE(Y_test,ridge_predict)
ridge_MAE = MAE(Y_test,ridge_predict)
# R2 is computed on the log scale, i.e. the scale the model was fitted on.
r2 = model.score(X_test,df_test['log_y'])
print(f'MAPE: {ridge_MAPE}')
print(f'MAE: {ridge_MAE/3600} hours')
print(f'R2: {r2}')

In [38]:
# Fitted ridge coefficients, one per feature column in the order they were
# passed to fit().
# NOTE(review): the recorded output shows about -3.7e9 for the 7th feature
# ('START'); this looks like a collinearity / conditioning problem - confirm.
model.coef_

array([ 5.05698284e-02,  7.90357299e-02,  8.45807231e-03,  1.91786689e-02,
       -9.00419107e-06, -2.63848332e-06, -3.69433512e+09,  4.50964467e+00,
        4.97113333e-03,  2.04004906e+00,  7.95090073e-01, -3.51447311e+00,
       -1.07878945e+01,  9.67749572e-01,  2.05893261e+00, -4.18521413e-01,
        2.28285058e+00,  5.02739317e+00, -1.57739590e+00, -4.12299957e-01,
       -4.40605035e+00, -4.40678094e+00,  4.10610140e+00, -6.36926289e+00,
       -2.63261834e+00, -2.63974782e-01, -4.17866519e-01, -3.42910073e+00,
        1.23151406e-01, -8.41922452e-02])

In [None]:
# MAE: 9.45 hours
# R2: 0.34