In [104]:
# imports
import pandas as pd
import statistics
import numpy as np
from datetime import datetime
import re
from numpy import arange
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import accuracy_score

In [76]:
# read the data (2012)
df = pd.read_csv('data/BPI_Challenge_2012-training.csv')

df_test = pd.read_csv('data/BPI_Challenge_2012-test.csv')

# Defining database-specific variables
case_column = "case concept:name"
registration_time_column = "case REG_DATE"
event_column = "event concept:name"
timestamp_column = "event time:timestamp"
position_column = "Position"
baseline_next_event_column = "Baseline Prediction for Next Activity"
pos_event_next_event_column = "Position&Event Prediction for Next Activity"
baseline_next_timestamp_column = "Baseline Prediction for Next Timestamp"
pos_event_next_timestamp_column = "Position&Event Prediction for Next Timestamp"
time_since_registration_column = "Time Since Registration"
day_of_week_column = "day_of_week"
month_of_year_column = "month_of_year"
offer_sent = 'O_SENT' # the name a of a sent offer state within event_column
timeformat_registration = "%Y-%m-%dT%H:%M:%S" # new time format
timeformat_timestamp = "%d-%m-%Y %H:%M:%S.%f"
predicted_shift_column = 'Predicted event from baseline'

df.head(10)

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,A_SUBMITTED,COMPLETE,01-10-2011 00:38:44.546
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 00:38:44.880
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,A_PREACCEPTED,COMPLETE,01-10-2011 00:39:37.906
3,3,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,SCHEDULE,01-10-2011 00:39:38.875
4,4294967296,173691,2011-10-01T08:08:58.256+02:00,5000,A_SUBMITTED,COMPLETE,01-10-2011 08:08:58.256
5,4294967297,173691,2011-10-01T08:08:58.256+02:00,5000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 08:09:02.195
6,4294967298,173691,2011-10-01T08:08:58.256+02:00,5000,A_PREACCEPTED,COMPLETE,01-10-2011 08:09:56.648
7,4294967299,173691,2011-10-01T08:08:58.256+02:00,5000,W_Completeren aanvraag,SCHEDULE,01-10-2011 08:09:59.578
8,8589934592,173694,2011-10-01T08:10:30.287+02:00,7000,A_SUBMITTED,COMPLETE,01-10-2011 08:10:30.287
9,8589934593,173694,2011-10-01T08:10:30.287+02:00,7000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 08:10:30.591


In [77]:
df[registration_time_column]= [re.sub('\..*|\+.*','',a,flags=re.DOTALL) for a in df[registration_time_column]]
df[registration_time_column] = [datetime.strptime(date, timeformat_registration) for date in df[registration_time_column]]
df[timestamp_column] = [datetime.strptime(date, timeformat_timestamp) for date in df[timestamp_column]]
df = df.sort_values(by=[case_column, timestamp_column])
df.reset_index(inplace=True, drop=True)# sort values by user and time of event
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214377 entries, 0 to 214376
Data columns (total 7 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   eventID                     214377 non-null  int64         
 1   case concept:name           214377 non-null  int64         
 2   case REG_DATE               214377 non-null  datetime64[ns]
 3   case AMOUNT_REQ             214377 non-null  int64         
 4   event concept:name          214377 non-null  object        
 5   event lifecycle:transition  214377 non-null  object        
 6   event time:timestamp        214377 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(3), object(2)
memory usage: 11.4+ MB


In [78]:
df[time_since_registration_column] = df[timestamp_column] - df[registration_time_column] # Adding time since registration
df[day_of_week_column] = df[timestamp_column].dt.day_name()
df[month_of_year_column] = df[timestamp_column].dt.month_name()
df

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Time Since Registration,day_of_week,month_of_year
0,0,173688,2011-10-01 00:38:44,20000,A_SUBMITTED,COMPLETE,2011-10-01 00:38:44.546,0 days 00:00:00.546000,Saturday,October
1,1,173688,2011-10-01 00:38:44,20000,A_PARTLYSUBMITTED,COMPLETE,2011-10-01 00:38:44.880,0 days 00:00:00.880000,Saturday,October
2,2,173688,2011-10-01 00:38:44,20000,A_PREACCEPTED,COMPLETE,2011-10-01 00:39:37.906,0 days 00:00:53.906000,Saturday,October
3,3,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,SCHEDULE,2011-10-01 00:39:38.875,0 days 00:00:54.875000,Saturday,October
4,4,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,START,2011-10-01 11:36:46.437,0 days 10:58:02.437000,Saturday,October
...,...,...,...,...,...,...,...,...,...,...
214372,44955422687236,206318,2012-02-03 17:07:38,5000,A_DECLINED,COMPLETE,2012-02-03 17:55:57.294,0 days 00:48:19.294000,Friday,February
214373,44955422687237,206318,2012-02-03 17:07:38,5000,W_Afhandelen leads,COMPLETE,2012-02-03 17:56:00.998,0 days 00:48:22.998000,Friday,February
214374,44959717654528,206321,2012-02-03 17:08:39,2000,A_SUBMITTED,COMPLETE,2012-02-03 17:08:39.200,0 days 00:00:00.200000,Friday,February
214375,44959717654529,206321,2012-02-03 17:08:39,2000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:08:39.459,0 days 00:00:00.459000,Friday,February


In [79]:
minutes_since_reg= 'minutes_since_reg'
df[minutes_since_reg]=[np.log((i.total_seconds()+0.000001)*60) for i in df[time_since_registration_column]]
hour = 'hour'
df[hour]  = [i.hour for i in df[timestamp_column]]
df[offer_sent] = 0
offer_sent_mask = df[event_column] == offer_sent
case_check = df[offer_sent_mask][[case_column,timestamp_column]].copy().reset_index(drop=True)
for i in range(len(case_check)):
    case_name = case_check.iloc[i,0]
    case_time = case_check.iloc[i,1]
    mask = (df[case_column]==case_name)&(df[timestamp_column]>=case_time)
    df.loc[mask, offer_sent] = 1


In [80]:
df[position_column] = df.groupby([case_column]).cumcount()+1
length_process = df.groupby([case_column]).size().to_dict()

In [102]:
shifted_deltatimes_list = df[timestamp_column].diff().shift(periods=-1)
shifted_deltatimes = 'shifted_deltatimes'
df[shifted_deltatimes] = [(i.total_seconds()) for i in shifted_deltatimes_list]
mask1  = (df[shifted_deltatimes]<0.01)
mask2 = (df[shifted_deltatimes]<0)
df.loc[mask1, shifted_deltatimes]=0
df.loc[mask2, shifted_deltatimes]=np.nan
df

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Time Since Registration,day_of_week,month_of_year,minutes_since_reg,hour,O_SENT,Position,shifted_deltatimes
0,0,173688,2011-10-01 00:38:44,20000,A_SUBMITTED,COMPLETE,2011-10-01 00:38:44.546,0 days 00:00:00.546000,Saturday,October,3.489210,0,0,1,0.334
1,1,173688,2011-10-01 00:38:44,20000,A_PARTLYSUBMITTED,COMPLETE,2011-10-01 00:38:44.880,0 days 00:00:00.880000,Saturday,October,3.966512,0,0,2,53.026
2,2,173688,2011-10-01 00:38:44,20000,A_PREACCEPTED,COMPLETE,2011-10-01 00:39:37.906,0 days 00:00:53.906000,Saturday,October,8.081586,0,0,3,0.969
3,3,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,SCHEDULE,2011-10-01 00:39:38.875,0 days 00:00:54.875000,Saturday,October,8.099402,0,0,4,39427.562
4,4,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,START,2011-10-01 11:36:46.437,0 days 10:58:02.437000,Saturday,October,14.677956,11,0,5,356.871
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214372,44955422687236,206318,2012-02-03 17:07:38,5000,A_DECLINED,COMPLETE,2012-02-03 17:55:57.294,0 days 00:48:19.294000,Friday,February,12.066567,17,0,5,3.704
214373,44955422687237,206318,2012-02-03 17:07:38,5000,W_Afhandelen leads,COMPLETE,2012-02-03 17:56:00.998,0 days 00:48:22.998000,Friday,February,12.067844,17,0,6,
214374,44959717654528,206321,2012-02-03 17:08:39,2000,A_SUBMITTED,COMPLETE,2012-02-03 17:08:39.200,0 days 00:00:00.200000,Friday,February,2.484912,17,0,1,0.259
214375,44959717654529,206321,2012-02-03 17:08:39,2000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:08:39.459,0 days 00:00:00.459000,Friday,February,3.315642,17,0,2,39.653


In [95]:
df[timestamp_column].diff().shift(periods=-1)

In [96]:
shifted_deltatimes_list

0        0 days 00:00:00.334000
1        0 days 00:00:53.026000
2        0 days 00:00:00.969000
3        0 days 10:57:07.562000
4        0 days 00:05:56.871000
                  ...          
214370   0 days 00:41:36.288000
214371   0 days 00:06:28.186000
214372   0 days 00:00:03.704000
214374   0 days 00:00:00.259000
214375   0 days 00:00:39.653000
Name: event time:timestamp, Length: 203908, dtype: timedelta64[ns]

In [105]:

X = df[[day_of_week_column, hour, month_of_year_column,offer_sent,position_column,minutes_since_reg]]
y = df[shifted_deltatimes]

#define cross-validation method to evaluate model
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

#define model
model = RidgeCV(alphas=arange(0, 1, 0.01), cv=cv, scoring='neg_mean_absolute_error')

#fit model
model.fit(X, y)

#display lambda that produced the lowest test MSE
print(model.alpha_)



KeyboardInterrupt: 