In [50]:
# imports
import time
import timeit
import pandas as pd
import statistics
import numpy as np
from datetime import datetime
import re
from numpy import arange
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, mean_absolute_percentage_error
from sklearn.metrics import accuracy_score

In [8]:
# read the data (2012)
df = pd.read_csv('data/BPI_Challenge_2012-training.csv')

df_test = pd.read_csv('data/BPI_Challenge_2012-test.csv')

# Defining database-specific variables
case_column = "case concept:name"
registration_time_column = "case REG_DATE"
event_column = "event concept:name"
timestamp_column = "event time:timestamp"
position_column = "Position"
baseline_next_event_column = "Baseline Prediction for Next Activity"
pos_event_next_event_column = "Position&Event Prediction for Next Activity"
baseline_next_timestamp_column = "Baseline Prediction for Next Timestamp"
pos_event_next_timestamp_column = "Position&Event Prediction for Next Timestamp"
time_since_registration_column = "Time Since Registration"
day_of_week_column = "day_of_week"
month_of_year_column = "month_of_year"
offer_sent = 'offer_sent_already' # the name a of a sent offer state within event_column
timeformat_registration = "%Y-%m-%dT%H:%M:%S" # new time format
timeformat_timestamp = "%d-%m-%Y %H:%M:%S.%f"
predicted_shift_column = 'Predicted event from baseline'
df.head(10)

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,A_SUBMITTED,COMPLETE,01-10-2011 00:38:44.546
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 00:38:44.880
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,A_PREACCEPTED,COMPLETE,01-10-2011 00:39:37.906
3,3,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,SCHEDULE,01-10-2011 00:39:38.875
4,4294967296,173691,2011-10-01T08:08:58.256+02:00,5000,A_SUBMITTED,COMPLETE,01-10-2011 08:08:58.256
5,4294967297,173691,2011-10-01T08:08:58.256+02:00,5000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 08:09:02.195
6,4294967298,173691,2011-10-01T08:08:58.256+02:00,5000,A_PREACCEPTED,COMPLETE,01-10-2011 08:09:56.648
7,4294967299,173691,2011-10-01T08:08:58.256+02:00,5000,W_Completeren aanvraag,SCHEDULE,01-10-2011 08:09:59.578
8,8589934592,173694,2011-10-01T08:10:30.287+02:00,7000,A_SUBMITTED,COMPLETE,01-10-2011 08:10:30.287
9,8589934593,173694,2011-10-01T08:10:30.287+02:00,7000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 08:10:30.591


In [204]:
df[registration_time_column]= [re.sub('\..*|\+.*','',a,flags=re.DOTALL) for a in df[registration_time_column]]
df[registration_time_column] = [datetime.strptime(date, timeformat_registration) for date in df[registration_time_column]]
df[timestamp_column] = [datetime.strptime(date, timeformat_timestamp) for date in df[timestamp_column]]
df = df.sort_values(by=[case_column, timestamp_column])
df.reset_index(inplace=True, drop=True)
df.reset_index(inplace=True)
df

Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp
0,0,0,173688,2011-10-01 00:38:44,20000,A_SUBMITTED,COMPLETE,2011-10-01 00:38:44.546
1,1,1,173688,2011-10-01 00:38:44,20000,A_PARTLYSUBMITTED,COMPLETE,2011-10-01 00:38:44.880
2,2,2,173688,2011-10-01 00:38:44,20000,A_PREACCEPTED,COMPLETE,2011-10-01 00:39:37.906
3,3,3,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,SCHEDULE,2011-10-01 00:39:38.875
4,4,4,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,START,2011-10-01 11:36:46.437
...,...,...,...,...,...,...,...,...
214372,214372,44955422687236,206318,2012-02-03 17:07:38,5000,A_DECLINED,COMPLETE,2012-02-03 17:55:57.294
214373,214373,44955422687237,206318,2012-02-03 17:07:38,5000,W_Afhandelen leads,COMPLETE,2012-02-03 17:56:00.998
214374,214374,44959717654528,206321,2012-02-03 17:08:39,2000,A_SUBMITTED,COMPLETE,2012-02-03 17:08:39.200
214375,214375,44959717654529,206321,2012-02-03 17:08:39,2000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:08:39.459


In [205]:
df[time_since_registration_column] = df[timestamp_column] - df[registration_time_column] # Adding time since registration
df[day_of_week_column] = df[timestamp_column].dt.dayofweek
df[month_of_year_column] = df[timestamp_column].dt.month
df

Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Time Since Registration,day_of_week,month_of_year
0,0,0,173688,2011-10-01 00:38:44,20000,A_SUBMITTED,COMPLETE,2011-10-01 00:38:44.546,0 days 00:00:00.546000,5,10
1,1,1,173688,2011-10-01 00:38:44,20000,A_PARTLYSUBMITTED,COMPLETE,2011-10-01 00:38:44.880,0 days 00:00:00.880000,5,10
2,2,2,173688,2011-10-01 00:38:44,20000,A_PREACCEPTED,COMPLETE,2011-10-01 00:39:37.906,0 days 00:00:53.906000,5,10
3,3,3,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,SCHEDULE,2011-10-01 00:39:38.875,0 days 00:00:54.875000,5,10
4,4,4,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,START,2011-10-01 11:36:46.437,0 days 10:58:02.437000,5,10
...,...,...,...,...,...,...,...,...,...,...,...
214372,214372,44955422687236,206318,2012-02-03 17:07:38,5000,A_DECLINED,COMPLETE,2012-02-03 17:55:57.294,0 days 00:48:19.294000,4,2
214373,214373,44955422687237,206318,2012-02-03 17:07:38,5000,W_Afhandelen leads,COMPLETE,2012-02-03 17:56:00.998,0 days 00:48:22.998000,4,2
214374,214374,44959717654528,206321,2012-02-03 17:08:39,2000,A_SUBMITTED,COMPLETE,2012-02-03 17:08:39.200,0 days 00:00:00.200000,4,2
214375,214375,44959717654529,206321,2012-02-03 17:08:39,2000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:08:39.459,0 days 00:00:00.459000,4,2


In [206]:
minutes_since_reg= 'minutes_since_reg'
df[minutes_since_reg]=[((i.total_seconds()+0.000001)/60) for i in df[time_since_registration_column]]
hour = 'hour'
df[hour]  = [i.hour for i in df[timestamp_column]]
df[offer_sent] = 0
offer_sent_mask = df[event_column] == offer_sent
case_check = df[offer_sent_mask][[case_column,timestamp_column]].copy().reset_index(drop=True)
for i in range(len(case_check)):
    case_name = case_check.iloc[i,0]
    case_time = case_check.iloc[i,1]
    mask = (df[case_column]==case_name)&(df[timestamp_column]>=case_time)
    df.loc[mask, offer_sent] = 1


In [207]:
df[position_column] = df.groupby([case_column]).cumcount()+1
length_process = df.groupby([case_column]).size().to_dict()

In [208]:
shifted_deltatimes_list = df[timestamp_column].diff().shift(periods=-1)
shifted_deltatimes = 'shifted_deltatimes'
df[shifted_deltatimes] = [(i.total_seconds()) for i in shifted_deltatimes_list]
mask1  = (df[shifted_deltatimes]<0.001)
mask2 = (df[shifted_deltatimes]<0)
df.loc[mask1, shifted_deltatimes]=0
df.loc[mask2, shifted_deltatimes]=np.nan
df

Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Time Since Registration,day_of_week,month_of_year,minutes_since_reg,hour,offer_sent_already,Position,shifted_deltatimes
0,0,0,173688,2011-10-01 00:38:44,20000,A_SUBMITTED,COMPLETE,2011-10-01 00:38:44.546,0 days 00:00:00.546000,5,10,0.009100,0,0,1,0.334
1,1,1,173688,2011-10-01 00:38:44,20000,A_PARTLYSUBMITTED,COMPLETE,2011-10-01 00:38:44.880,0 days 00:00:00.880000,5,10,0.014667,0,0,2,53.026
2,2,2,173688,2011-10-01 00:38:44,20000,A_PREACCEPTED,COMPLETE,2011-10-01 00:39:37.906,0 days 00:00:53.906000,5,10,0.898433,0,0,3,0.969
3,3,3,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,SCHEDULE,2011-10-01 00:39:38.875,0 days 00:00:54.875000,5,10,0.914583,0,0,4,39427.562
4,4,4,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,START,2011-10-01 11:36:46.437,0 days 10:58:02.437000,5,10,658.040617,11,0,5,356.871
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214372,214372,44955422687236,206318,2012-02-03 17:07:38,5000,A_DECLINED,COMPLETE,2012-02-03 17:55:57.294,0 days 00:48:19.294000,4,2,48.321567,17,0,5,3.704
214373,214373,44955422687237,206318,2012-02-03 17:07:38,5000,W_Afhandelen leads,COMPLETE,2012-02-03 17:56:00.998,0 days 00:48:22.998000,4,2,48.383300,17,0,6,
214374,214374,44959717654528,206321,2012-02-03 17:08:39,2000,A_SUBMITTED,COMPLETE,2012-02-03 17:08:39.200,0 days 00:00:00.200000,4,2,0.003333,17,0,1,0.259
214375,214375,44959717654529,206321,2012-02-03 17:08:39,2000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:08:39.459,0 days 00:00:00.459000,4,2,0.007650,17,0,2,39.653


In [209]:
df['previous_event']=0
df['previous_lifecycle']=0
all_ids = list(df[case_column].unique())

for ids in all_ids:
    df2 = df[df[case_column]==ids].copy()
    df.loc[df[case_column]==ids, 'previous_event']= df2[event_column].shift(periods=1)
    df.loc[df[case_column]==ids, 'previous_lifecycle']= df2['event lifecycle:transition'].shift(periods=1)
df.loc[df['previous_event'].isnull(), 'previous_event']='START'
df.loc[df['previous_lifecycle'].isnull(), 'previous_lifecycle']='START'

In [210]:
all_events = list(df['previous_event'].unique())
for event in all_events:
    df[event] = 0
df

Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Time Since Registration,day_of_week,...,A_APPROVED,A_ACTIVATED,O_CANCELLED,A_CANCELLED,W_Afhandelen leads,A_DECLINED,O_DECLINED,W_Nabellen incomplete dossiers,W_Beoordelen fraude,W_Wijzigen contractgegevens
0,0,0,173688,2011-10-01 00:38:44,20000,A_SUBMITTED,COMPLETE,2011-10-01 00:38:44.546,0 days 00:00:00.546000,5,...,0,0,0,0,0,0,0,0,0,0
1,1,1,173688,2011-10-01 00:38:44,20000,A_PARTLYSUBMITTED,COMPLETE,2011-10-01 00:38:44.880,0 days 00:00:00.880000,5,...,0,0,0,0,0,0,0,0,0,0
2,2,2,173688,2011-10-01 00:38:44,20000,A_PREACCEPTED,COMPLETE,2011-10-01 00:39:37.906,0 days 00:00:53.906000,5,...,0,0,0,0,0,0,0,0,0,0
3,3,3,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,SCHEDULE,2011-10-01 00:39:38.875,0 days 00:00:54.875000,5,...,0,0,0,0,0,0,0,0,0,0
4,4,4,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,START,2011-10-01 11:36:46.437,0 days 10:58:02.437000,5,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214372,214372,44955422687236,206318,2012-02-03 17:07:38,5000,A_DECLINED,COMPLETE,2012-02-03 17:55:57.294,0 days 00:48:19.294000,4,...,0,0,0,0,0,0,0,0,0,0
214373,214373,44955422687237,206318,2012-02-03 17:07:38,5000,W_Afhandelen leads,COMPLETE,2012-02-03 17:56:00.998,0 days 00:48:22.998000,4,...,0,0,0,0,0,0,0,0,0,0
214374,214374,44959717654528,206321,2012-02-03 17:08:39,2000,A_SUBMITTED,COMPLETE,2012-02-03 17:08:39.200,0 days 00:00:00.200000,4,...,0,0,0,0,0,0,0,0,0,0
214375,214375,44959717654529,206321,2012-02-03 17:08:39,2000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:08:39.459,0 days 00:00:00.459000,4,...,0,0,0,0,0,0,0,0,0,0


In [211]:
#takes 10 mins
start_time = timeit.default_timer()
df['last_event'] = 0
a=0
for ids in all_ids:
    index_case = df[df[case_column]==ids]['index'].max()
    df.loc[index_case,'last_event']=1
    for i in range(a,index_case+1):
        mask = ((df[case_column]==ids)& (df['index']>=i))
        prev_event = df.loc[mask,'previous_event'].reset_index(drop=True)[0]
        df.loc[mask, prev_event]=1
    a = index_case+1
print(timeit.default_timer() - start_time)

520.7884987000016


In [212]:
df

Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Time Since Registration,day_of_week,...,A_ACTIVATED,O_CANCELLED,A_CANCELLED,W_Afhandelen leads,A_DECLINED,O_DECLINED,W_Nabellen incomplete dossiers,W_Beoordelen fraude,W_Wijzigen contractgegevens,last_event
0,0,0,173688,2011-10-01 00:38:44,20000,A_SUBMITTED,COMPLETE,2011-10-01 00:38:44.546,0 days 00:00:00.546000,5,...,0,0,0,0,0,0,0,0,0,0
1,1,1,173688,2011-10-01 00:38:44,20000,A_PARTLYSUBMITTED,COMPLETE,2011-10-01 00:38:44.880,0 days 00:00:00.880000,5,...,0,0,0,0,0,0,0,0,0,0
2,2,2,173688,2011-10-01 00:38:44,20000,A_PREACCEPTED,COMPLETE,2011-10-01 00:39:37.906,0 days 00:00:53.906000,5,...,0,0,0,0,0,0,0,0,0,0
3,3,3,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,SCHEDULE,2011-10-01 00:39:38.875,0 days 00:00:54.875000,5,...,0,0,0,0,0,0,0,0,0,0
4,4,4,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,START,2011-10-01 11:36:46.437,0 days 10:58:02.437000,5,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214372,214372,44955422687236,206318,2012-02-03 17:07:38,5000,A_DECLINED,COMPLETE,2012-02-03 17:55:57.294,0 days 00:48:19.294000,4,...,0,0,0,1,0,0,0,0,0,0
214373,214373,44955422687237,206318,2012-02-03 17:07:38,5000,W_Afhandelen leads,COMPLETE,2012-02-03 17:56:00.998,0 days 00:48:22.998000,4,...,0,0,0,1,1,0,0,0,0,1
214374,214374,44959717654528,206321,2012-02-03 17:08:39,2000,A_SUBMITTED,COMPLETE,2012-02-03 17:08:39.200,0 days 00:00:00.200000,4,...,0,0,0,0,0,0,0,0,0,0
214375,214375,44959717654529,206321,2012-02-03 17:08:39,2000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:08:39.459,0 days 00:00:00.459000,4,...,0,0,0,0,0,0,0,0,0,0


In [213]:
df_reg = df.copy()

In [215]:
mask_nan = df_reg['last_event']==0
df_reg = df_reg[mask_nan].copy()

In [57]:
def prep_data(df):
    # definitions
    case_column = "case concept:name"
    registration_time_column = "case REG_DATE"
    event_column = "event concept:name"
    timestamp_column = "event time:timestamp"
    position_column = "position"
    baseline_next_event_column = "Baseline Prediction for Next Activity"
    pos_event_next_event_column = "Position&Event Prediction for Next Activity"
    baseline_next_timestamp_column = "Baseline Prediction for Next Timestamp"
    pos_event_next_timestamp_column = "Position&Event Prediction for Next Timestamp"
    time_since_registration_column = "Time Since Registration"
    day_of_week_column = "day_of_week"
    month_of_year_column = "month_of_year"
    offer_sent = 'offer_sent_already' # the name a of a sent offer state within event_column
    timeformat_registration = "%Y-%m-%dT%H:%M:%S" # new time format
    timeformat_timestamp = "%d-%m-%Y %H:%M:%S.%f"
    predicted_shift_column = 'Predicted event from baseline'
    start_time = timeit.default_timer()

    #make timestamps
    df[registration_time_column] = [re.sub('\..*|\+.*', '', a, flags=re.DOTALL) for a in df[registration_time_column]]
    df[registration_time_column] = [datetime.strptime(date, timeformat_registration) for date in df[registration_time_column]]
    df[timestamp_column] = [datetime.strptime(date, timeformat_timestamp) for date in df[timestamp_column]]

    #sort and order dataframe
    df = df.sort_values(by=[case_column, timestamp_column])
    df.reset_index(inplace=True, drop=True)
    df.reset_index(inplace=True)

    # add time related columns
    df[time_since_registration_column] = df[timestamp_column] - df[registration_time_column] # Adding time since registration
    df[day_of_week_column] = df[timestamp_column].dt.dayofweek
    df[month_of_year_column] = df[timestamp_column].dt.month
    minutes_since_reg= 'minutes_since_reg'
    df[minutes_since_reg]=[((i.total_seconds()+0.000001)/60) for i in df[time_since_registration_column]]
    hour = 'hour'
    df[hour]  = [i.hour for i in df[timestamp_column]]

    '''df[offer_sent] = 0
    offer_sent_mask = df[event_column] == offer_sent
    case_check = df[offer_sent_mask][[case_column,timestamp_column]].copy().reset_index(drop=True)
    for i in range(len(case_check)):
        case_name = case_check.iloc[i,0]
        case_time = case_check.iloc[i,1]
        mask = (df[case_column]==case_name)&(df[timestamp_column]>=case_time)
        df.loc[mask, offer_sent] = 1'''

    # add position column
    df[position_column] = df.groupby([case_column]).cumcount()+1

    # add time until next event column
    shifted_deltatimes_list = df[timestamp_column].diff().shift(periods=-1)
    shifted_deltatimes = 'shifted_deltatimes'
    df[shifted_deltatimes] = [(i.total_seconds()) for i in shifted_deltatimes_list]
    mask1  = (df[shifted_deltatimes]<0.001)
    mask2 = (df[shifted_deltatimes]<0)
    df.loc[mask1, shifted_deltatimes]=0.001
    df.loc[mask2, shifted_deltatimes]=np.nan

    # add previous event and previous lifecycle column
    df['previous_event']=0
    df['previous_lifecycle']=0
    all_ids = list(df[case_column].unique())

    for ids in all_ids:
        df2 = df[df[case_column]==ids].copy()
        df.loc[df[case_column]==ids, 'previous_event']= df2[event_column].shift(periods=1)
        df.loc[df[case_column]==ids, 'previous_lifecycle']= df2['event lifecycle:transition'].shift(periods=1)
    df.loc[df['previous_event'].isnull(), 'previous_event']='START'
    df.loc[df['previous_lifecycle'].isnull(), 'previous_lifecycle']='START'

    # add binary columns of what events already happened in a case
    all_events = list(df['previous_event'].unique())
    for event in all_events:
        df[event] = 0
    df['last_event'] = 0
    a=0
    for ids in all_ids:
        index_case = df[df[case_column]==ids]['index'].max()
        df.loc[index_case,'last_event']=1
        for i in range(a,index_case+1):
            mask = ((df[case_column]==ids)& (df['index']>=i))
            prev_event = df.loc[mask,'previous_event'].reset_index(drop=True)[0]
            df.loc[mask, prev_event]=1
        a = index_case+1

    # print time and add log column of time until next event
    print((timeit.default_timer() - start_time)/60)
    df['log_y'] = np.log(df['shifted_deltatimes'])
    return df

def prep_data_reg(df):
    df_prep = prep_data(df)
    mask_nan = df_prep['last_event']==0
    df_reg = df_prep[mask_nan].copy()
    return df_reg

def MAPE(Y_actual,Y_Predicted):
    mape = np.mean(np.abs((Y_actual - Y_Predicted)/Y_actual))*100
    return mape

def MAE(Y_actual,Y_Predicted):
    mae = np.mean(np.abs((Y_actual - Y_Predicted)))
    return mae

In [58]:
df = pd.read_csv('data/BPI_Challenge_2012-training.csv')
df_t = pd.read_csv('data/BPI_Challenge_2012-test.csv')

In [59]:
df_train = prep_data_reg(df)
df_test = prep_data_reg(df_t)

13.128087163333324
1.1282564616666604


Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Time Since Registration,day_of_week,...,W_Valideren aanvraag,O_ACCEPTED,A_APPROVED,A_REGISTERED,A_ACTIVATED,O_DECLINED,W_Nabellen incomplete dossiers,W_Beoordelen fraude,last_event,log_y
0,0,44964012621824,206324,2012-02-03 17:17:11,2500,A_SUBMITTED,COMPLETE,2012-02-03 17:17:11.047,0 days 00:00:00.047000,4,...,0,0,0,0,0,0,0,0,0,-1.287354
1,1,44964012621825,206324,2012-02-03 17:17:11,2500,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:17:11.323,0 days 00:00:00.323000,4,...,0,0,0,0,0,0,0,0,0,3.454454
3,3,44968307589120,206327,2012-02-03 17:23:41,6000,A_SUBMITTED,COMPLETE,2012-02-03 17:23:41.949,0 days 00:00:00.949000,4,...,0,0,0,0,0,0,0,0,0,-0.588787
4,4,44968307589121,206327,2012-02-03 17:23:41,6000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:23:42.504,0 days 00:00:01.504000,4,...,0,0,0,0,0,0,0,0,0,3.710519
5,5,44968307589122,206327,2012-02-03 17:23:41,6000,A_PREACCEPTED,COMPLETE,2012-02-03 17:24:23.379,0 days 00:00:42.379000,4,...,0,0,0,0,0,0,0,0,0,-0.396010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47817,47817,56203942035456,214376,2012-02-29 23:51:16,15000,A_SUBMITTED,COMPLETE,2012-02-29 23:51:16.799,0 days 00:00:00.799000,2,...,0,0,0,0,0,0,0,0,0,-0.471605
47818,47818,56203942035457,214376,2012-02-29 23:51:16,15000,A_PARTLYSUBMITTED,COMPLETE,2012-02-29 23:51:17.423,0 days 00:00:01.423000,2,...,0,0,0,0,0,0,0,0,0,3.781094
47819,47819,56203942035458,214376,2012-02-29 23:51:16,15000,W_Afhandelen leads,SCHEDULE,2012-02-29 23:52:01.287,0 days 00:00:45.287000,2,...,0,0,0,0,0,0,0,0,0,10.448293
47820,47820,56203942035459,214376,2012-02-29 23:51:16,15000,W_Afhandelen leads,START,2012-03-01 09:26:46.736,0 days 09:35:30.736000,3,...,0,0,0,0,0,0,0,0,0,3.919634


In [60]:
X = df_train[['day_of_week', 'hour', 'month_of_year','position','minutes_since_reg', 'case AMOUNT_REQ', 'START', 'A_SUBMITTED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED', 'W_Completeren aanvraag', 'A_DECLINED', 'A_ACCEPTED', 'A_FINALIZED', 'O_SELECTED', 'O_CREATED', 'O_SENT', 'W_Nabellen offertes', 'O_CANCELLED', 'W_Afhandelen leads', 'A_CANCELLED', 'O_SENT_BACK', 'W_Valideren aanvraag', 'O_ACCEPTED', 'A_APPROVED', 'A_REGISTERED', 'A_ACTIVATED', 'O_DECLINED', 'W_Nabellen incomplete dossiers', 'W_Beoordelen fraude']]
y = df_train['log_y']

cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

model = RidgeCV(alphas=arange(0, 1, 0.01), cv=cv, scoring='neg_mean_absolute_error')

model.fit(X, y)

print(f'Alpha: {model.alpha_}')

X_test = df_test[['day_of_week', 'hour', 'month_of_year','position','minutes_since_reg', 'case AMOUNT_REQ', 'START', 'A_SUBMITTED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED', 'W_Completeren aanvraag', 'A_DECLINED', 'A_ACCEPTED', 'A_FINALIZED', 'O_SELECTED', 'O_CREATED', 'O_SENT', 'W_Nabellen offertes', 'O_CANCELLED', 'W_Afhandelen leads', 'A_CANCELLED', 'O_SENT_BACK', 'W_Valideren aanvraag', 'O_ACCEPTED', 'A_APPROVED', 'A_REGISTERED', 'A_ACTIVATED', 'O_DECLINED', 'W_Nabellen incomplete dossiers', 'W_Beoordelen fraude']]
Y_test = df_test['shifted_deltatimes']
ridge_predict = np.exp(model.predict(X_test))
ridge_MAPE = MAPE(Y_test,ridge_predict)
ridge_MAE = MAE(Y_test,ridge_predict)
r2 = model.score(X_test,df_test['log_y'])
print(f'MAPE: {ridge_MAPE}')
print(f'MAE: {ridge_MAE/3600} hours')
print(f'R2: {r2}')

Alpha: 0.0
MAPE: 511745.1158522866
MAE: 9.531905645772383 hours
R2: 0.33949739994279193


In [32]:
model.coef_

array([ 3.00222933e+03,  1.52395234e+03, -5.17032126e+02,  1.12166025e+03,
       -9.68733787e-01, -6.74174558e-02,  0.00000000e+00, -1.08622676e+03,
        8.92583015e+03,  6.65769665e+03,  1.78508300e+04, -5.03832073e+04,
       -3.60168820e+04, -2.49832007e+03, -2.47479019e+03, -6.76265977e+02,
       -9.95758273e+02,  1.16587848e+05, -2.72583520e+04, -6.51639002e+03,
       -9.48356534e+04, -1.00267332e+05,  5.09112402e+04, -2.45518943e+04,
       -1.44746797e+04, -6.79883379e+03, -6.58249324e+03, -4.92650048e+04,
       -4.48670972e+04, -5.60597345e+03])

In [37]:
Y_test2 = df_test['shifted_deltatimes']
ridge_predict2 = np.exp(model.predict(X_test))
ridge_MAPE2 = MAPE(Y_test2,ridge_predict2)
ridge_MAE2 = MAE(Y_test2,ridge_predict2)
print(f'MAPE: {ridge_MAPE2}')
print(f'MAE: {ridge_MAE2}')

MAPE: 511745.1158522866
MAE: 34314.86032478058


In [52]:
print(mean_squared_error(Y_test2,ridge_predict2))
print(r2_score(Y_test2,ridge_predict2))
print(mean_absolute_error(Y_test2,ridge_predict2))
print(mean_absolute_percentage_error(Y_test2,ridge_predict2))


18366516961.876656
-0.06737323207863244
34314.860324780726
5117.451158522869


In [54]:
model.score(X_test,df_test['log_y'])

0.33949739994279193