In [139]:
# imports
import time
import timeit
import pandas as pd
import statistics
import numpy as np
from datetime import datetime
import re
from numpy import arange
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score

In [125]:
# read the data (2012)
df = pd.read_csv('data/BPI_Challenge_2012-training.csv')

df_test = pd.read_csv('data/BPI_Challenge_2012-test.csv')

# Defining database-specific variables
case_column = "case concept:name"
registration_time_column = "case REG_DATE"
event_column = "event concept:name"
timestamp_column = "event time:timestamp"
position_column = "Position"
baseline_next_event_column = "Baseline Prediction for Next Activity"
pos_event_next_event_column = "Position&Event Prediction for Next Activity"
baseline_next_timestamp_column = "Baseline Prediction for Next Timestamp"
pos_event_next_timestamp_column = "Position&Event Prediction for Next Timestamp"
time_since_registration_column = "Time Since Registration"
day_of_week_column = "day_of_week"
month_of_year_column = "month_of_year"
offer_sent = 'offer_sent_already' # the name a of a sent offer state within event_column
timeformat_registration = "%Y-%m-%dT%H:%M:%S" # new time format
timeformat_timestamp = "%d-%m-%Y %H:%M:%S.%f"
predicted_shift_column = 'Predicted event from baseline'

df.head(10)

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,A_SUBMITTED,COMPLETE,01-10-2011 00:38:44.546
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 00:38:44.880
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,A_PREACCEPTED,COMPLETE,01-10-2011 00:39:37.906
3,3,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,SCHEDULE,01-10-2011 00:39:38.875
4,4294967296,173691,2011-10-01T08:08:58.256+02:00,5000,A_SUBMITTED,COMPLETE,01-10-2011 08:08:58.256
5,4294967297,173691,2011-10-01T08:08:58.256+02:00,5000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 08:09:02.195
6,4294967298,173691,2011-10-01T08:08:58.256+02:00,5000,A_PREACCEPTED,COMPLETE,01-10-2011 08:09:56.648
7,4294967299,173691,2011-10-01T08:08:58.256+02:00,5000,W_Completeren aanvraag,SCHEDULE,01-10-2011 08:09:59.578
8,8589934592,173694,2011-10-01T08:10:30.287+02:00,7000,A_SUBMITTED,COMPLETE,01-10-2011 08:10:30.287
9,8589934593,173694,2011-10-01T08:10:30.287+02:00,7000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 08:10:30.591


In [126]:
df[registration_time_column]= [re.sub('\..*|\+.*','',a,flags=re.DOTALL) for a in df[registration_time_column]]
df[registration_time_column] = [datetime.strptime(date, timeformat_registration) for date in df[registration_time_column]]
df[timestamp_column] = [datetime.strptime(date, timeformat_timestamp) for date in df[timestamp_column]]
df = df.sort_values(by=[case_column, timestamp_column])
df.reset_index(inplace=True, drop=True)# sort values by user and time of event
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214377 entries, 0 to 214376
Data columns (total 7 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   eventID                     214377 non-null  int64         
 1   case concept:name           214377 non-null  int64         
 2   case REG_DATE               214377 non-null  datetime64[ns]
 3   case AMOUNT_REQ             214377 non-null  int64         
 4   event concept:name          214377 non-null  object        
 5   event lifecycle:transition  214377 non-null  object        
 6   event time:timestamp        214377 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(3), object(2)
memory usage: 11.4+ MB


In [136]:
df[time_since_registration_column] = df[timestamp_column] - df[registration_time_column] # Adding time since registration
df[day_of_week_column] = df[timestamp_column].dt.dayofweek
df[month_of_year_column] = df[timestamp_column].dt.month
df

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Time Since Registration,day_of_week,month_of_year,...,A_APPROVED,A_ACTIVATED,O_CANCELLED,A_CANCELLED,W_Afhandelen leads,A_DECLINED,O_DECLINED,W_Nabellen incomplete dossiers,W_Beoordelen fraude,W_Wijzigen contractgegevens
0,0,173688,2011-10-01 00:38:44,20000,A_SUBMITTED,COMPLETE,2011-10-01 00:38:44.546,0 days 00:00:00.546000,5,10,...,0,0,0,0,0,0,0,0,0,0
1,1,173688,2011-10-01 00:38:44,20000,A_PARTLYSUBMITTED,COMPLETE,2011-10-01 00:38:44.880,0 days 00:00:00.880000,5,10,...,0,0,0,0,0,0,0,0,0,0
2,2,173688,2011-10-01 00:38:44,20000,A_PREACCEPTED,COMPLETE,2011-10-01 00:39:37.906,0 days 00:00:53.906000,5,10,...,0,0,0,0,0,0,0,0,0,0
3,3,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,SCHEDULE,2011-10-01 00:39:38.875,0 days 00:00:54.875000,5,10,...,0,0,0,0,0,0,0,0,0,0
4,4,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,START,2011-10-01 11:36:46.437,0 days 10:58:02.437000,5,10,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214372,44955422687236,206318,2012-02-03 17:07:38,5000,A_DECLINED,COMPLETE,2012-02-03 17:55:57.294,0 days 00:48:19.294000,4,2,...,0,0,0,0,0,0,0,0,0,0
214373,44955422687237,206318,2012-02-03 17:07:38,5000,W_Afhandelen leads,COMPLETE,2012-02-03 17:56:00.998,0 days 00:48:22.998000,4,2,...,0,0,0,0,0,0,0,0,0,0
214374,44959717654528,206321,2012-02-03 17:08:39,2000,A_SUBMITTED,COMPLETE,2012-02-03 17:08:39.200,0 days 00:00:00.200000,4,2,...,0,0,0,0,0,0,0,0,0,0
214375,44959717654529,206321,2012-02-03 17:08:39,2000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:08:39.459,0 days 00:00:00.459000,4,2,...,0,0,0,0,0,0,0,0,0,0


In [128]:
minutes_since_reg= 'minutes_since_reg'
df[minutes_since_reg]=[((i.total_seconds()+0.000001)/60) for i in df[time_since_registration_column]]
hour = 'hour'
df[hour]  = [i.hour for i in df[timestamp_column]]
df[offer_sent] = 0
offer_sent_mask = df[event_column] == offer_sent
case_check = df[offer_sent_mask][[case_column,timestamp_column]].copy().reset_index(drop=True)
for i in range(len(case_check)):
    case_name = case_check.iloc[i,0]
    case_time = case_check.iloc[i,1]
    mask = (df[case_column]==case_name)&(df[timestamp_column]>=case_time)
    df.loc[mask, offer_sent] = 1


In [129]:
df[position_column] = df.groupby([case_column]).cumcount()+1
length_process = df.groupby([case_column]).size().to_dict()

In [130]:
shifted_deltatimes_list = df[timestamp_column].diff().shift(periods=-1)
shifted_deltatimes = 'shifted_deltatimes'
df[shifted_deltatimes] = [(i.total_seconds()) for i in shifted_deltatimes_list]
mask1  = (df[shifted_deltatimes]<0.01)
mask2 = (df[shifted_deltatimes]<0)
df.loc[mask1, shifted_deltatimes]=0
df.loc[mask2, shifted_deltatimes]=np.nan
df

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Time Since Registration,day_of_week,month_of_year,minutes_since_reg,hour,offer_sent_already,Position,shifted_deltatimes
0,0,173688,2011-10-01 00:38:44,20000,A_SUBMITTED,COMPLETE,2011-10-01 00:38:44.546,0 days 00:00:00.546000,Saturday,October,0.009100,0,0,1,0.334
1,1,173688,2011-10-01 00:38:44,20000,A_PARTLYSUBMITTED,COMPLETE,2011-10-01 00:38:44.880,0 days 00:00:00.880000,Saturday,October,0.014667,0,0,2,53.026
2,2,173688,2011-10-01 00:38:44,20000,A_PREACCEPTED,COMPLETE,2011-10-01 00:39:37.906,0 days 00:00:53.906000,Saturday,October,0.898433,0,0,3,0.969
3,3,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,SCHEDULE,2011-10-01 00:39:38.875,0 days 00:00:54.875000,Saturday,October,0.914583,0,0,4,39427.562
4,4,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,START,2011-10-01 11:36:46.437,0 days 10:58:02.437000,Saturday,October,658.040617,11,0,5,356.871
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214372,44955422687236,206318,2012-02-03 17:07:38,5000,A_DECLINED,COMPLETE,2012-02-03 17:55:57.294,0 days 00:48:19.294000,Friday,February,48.321567,17,0,5,3.704
214373,44955422687237,206318,2012-02-03 17:07:38,5000,W_Afhandelen leads,COMPLETE,2012-02-03 17:56:00.998,0 days 00:48:22.998000,Friday,February,48.383300,17,0,6,
214374,44959717654528,206321,2012-02-03 17:08:39,2000,A_SUBMITTED,COMPLETE,2012-02-03 17:08:39.200,0 days 00:00:00.200000,Friday,February,0.003333,17,0,1,0.259
214375,44959717654529,206321,2012-02-03 17:08:39,2000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:08:39.459,0 days 00:00:00.459000,Friday,February,0.007650,17,0,2,39.653


In [131]:
df['previous_event']=0
df['previous_lifecycle']=0
all_ids = list(df[case_column].unique())

for ids in all_ids:
    df2 = df[df[case_column]==ids].copy()
    df.loc[df[case_column]==ids, 'previous_event']= df2[event_column].shift(periods=1)
    df.loc[df[case_column]==ids, 'previous_lifecycle']= df2['event lifecycle:transition'].shift(periods=1)
df.loc[df['previous_event'].isnull(), 'previous_event']='START'
df.loc[df['previous_lifecycle'].isnull(), 'previous_lifecycle']='START'

In [None]:
df.reset_index(inplace=True)

In [142]:
start_time = timeit.default_timer()

all_events = list(df['previous_event'].unique())
for event in all_events:
    df[event] = 0
print(timeit.default_timer() - start_time)
df

0.03333139999995183


Unnamed: 0,level_0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Time Since Registration,...,A_APPROVED,A_ACTIVATED,O_CANCELLED,A_CANCELLED,W_Afhandelen leads,A_DECLINED,O_DECLINED,W_Nabellen incomplete dossiers,W_Beoordelen fraude,W_Wijzigen contractgegevens
0,0,0,0,173688,2011-10-01 00:38:44,20000,A_SUBMITTED,COMPLETE,2011-10-01 00:38:44.546,0 days 00:00:00.546000,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,173688,2011-10-01 00:38:44,20000,A_PARTLYSUBMITTED,COMPLETE,2011-10-01 00:38:44.880,0 days 00:00:00.880000,...,0,0,0,0,0,0,0,0,0,0
2,2,2,2,173688,2011-10-01 00:38:44,20000,A_PREACCEPTED,COMPLETE,2011-10-01 00:39:37.906,0 days 00:00:53.906000,...,0,0,0,0,0,0,0,0,0,0
3,3,3,3,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,SCHEDULE,2011-10-01 00:39:38.875,0 days 00:00:54.875000,...,0,0,0,0,0,0,0,0,0,0
4,4,4,4,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,START,2011-10-01 11:36:46.437,0 days 10:58:02.437000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214372,214372,214372,44955422687236,206318,2012-02-03 17:07:38,5000,A_DECLINED,COMPLETE,2012-02-03 17:55:57.294,0 days 00:48:19.294000,...,0,0,0,0,0,0,0,0,0,0
214373,214373,214373,44955422687237,206318,2012-02-03 17:07:38,5000,W_Afhandelen leads,COMPLETE,2012-02-03 17:56:00.998,0 days 00:48:22.998000,...,0,0,0,0,0,0,0,0,0,0
214374,214374,214374,44959717654528,206321,2012-02-03 17:08:39,2000,A_SUBMITTED,COMPLETE,2012-02-03 17:08:39.200,0 days 00:00:00.200000,...,0,0,0,0,0,0,0,0,0,0
214375,214375,214375,44959717654529,206321,2012-02-03 17:08:39,2000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:08:39.459,0 days 00:00:00.459000,...,0,0,0,0,0,0,0,0,0,0


In [143]:
#takes 10 mins
start_time = timeit.default_timer()
a=0
for ids in all_ids:
    index_case = df[df[case_column]==ids]['index'].max()
    for i in range(a,index_case+1):
        mask = ((df[case_column]==ids)& (df['index']>=i))
        prev_event = df.loc[mask,'previous_event'].reset_index(drop=True)[0]
        df.loc[mask, prev_event]=1
    a = index_case+1
print(timeit.default_timer() - start_time)

26
65
124
127
130
139
153
165
179
203
280
315
335
338
414
420
475
523
563
582
620
644
686
689
715
721
750
753
756
765
771
778
886
921
949
985
1005
1018
1024
1065
1068
1116
1126
1155
1158
1182
1202
1214
1217
1247
1250
1261
1287
1307
1310
1313
1321
1342
1348
1351
1379
1415
1418
1421
1482
1485
1488
1494
1497
1500
1527
1533
1539
1542
1545
1573
1583
1601
1604
1616
1731
1734
1740
1772
1820
1866
1887
1935
1938
2027
2049
2055
2065
2089
2107
2132
2153
2156
2159
2185
2225
2228
2234
2266
2307
2313
2335
2341
2355
2361
2425
2428
2488
2538
2580
2630
2637
2645
2648
2651
2778
2781
2784
2844
2850
2856
2886
2889
2930
2956
2982
3017
3078
3119
3129
3151
3154
3166
3172
3178
3200
3206
3212
3218
3336
3342
3356
3387
3415
3452
3487
3523
3526
3586
3592
3595
3637
3640
3643
3671
3700
3703
3713
3716
3719
3729
3745
3761
3767
3785
3788
3808
3855
3867
3870
3873
3879
3897
3903
3955
3965
3975
3981
4016
4058
4088
4098
4141
4213
4216
4284
4290
4293
4309
4315
4349
4357
4394
4447
4459
4465
4477
4517
4552
4598
4601
4611
464

In [144]:
df

Unnamed: 0,level_0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Time Since Registration,...,A_APPROVED,A_ACTIVATED,O_CANCELLED,A_CANCELLED,W_Afhandelen leads,A_DECLINED,O_DECLINED,W_Nabellen incomplete dossiers,W_Beoordelen fraude,W_Wijzigen contractgegevens
0,0,0,0,173688,2011-10-01 00:38:44,20000,A_SUBMITTED,COMPLETE,2011-10-01 00:38:44.546,0 days 00:00:00.546000,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,173688,2011-10-01 00:38:44,20000,A_PARTLYSUBMITTED,COMPLETE,2011-10-01 00:38:44.880,0 days 00:00:00.880000,...,0,0,0,0,0,0,0,0,0,0
2,2,2,2,173688,2011-10-01 00:38:44,20000,A_PREACCEPTED,COMPLETE,2011-10-01 00:39:37.906,0 days 00:00:53.906000,...,0,0,0,0,0,0,0,0,0,0
3,3,3,3,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,SCHEDULE,2011-10-01 00:39:38.875,0 days 00:00:54.875000,...,0,0,0,0,0,0,0,0,0,0
4,4,4,4,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,START,2011-10-01 11:36:46.437,0 days 10:58:02.437000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214372,214372,214372,44955422687236,206318,2012-02-03 17:07:38,5000,A_DECLINED,COMPLETE,2012-02-03 17:55:57.294,0 days 00:48:19.294000,...,0,0,0,0,1,0,0,0,0,0
214373,214373,214373,44955422687237,206318,2012-02-03 17:07:38,5000,W_Afhandelen leads,COMPLETE,2012-02-03 17:56:00.998,0 days 00:48:22.998000,...,0,0,0,0,1,1,0,0,0,0
214374,214374,214374,44959717654528,206321,2012-02-03 17:08:39,2000,A_SUBMITTED,COMPLETE,2012-02-03 17:08:39.200,0 days 00:00:00.200000,...,0,0,0,0,0,0,0,0,0,0
214375,214375,214375,44959717654529,206321,2012-02-03 17:08:39,2000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:08:39.459,0 days 00:00:00.459000,...,0,0,0,0,0,0,0,0,0,0


In [105]:

X = df[[day_of_week_column, hour, month_of_year_column,offer_sent,position_column,minutes_since_reg]]
y = df[shifted_deltatimes]

#define cross-validation method to evaluate model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

#define model
model = RidgeCV(alphas=arange(0, 1, 0.01), cv=cv, scoring='neg_mean_absolute_error')

#fit model
model.fit(X, y)

#display lambda that produced the lowest test MSE
print(model.alpha_)



KeyboardInterrupt: 