In [22]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
import psutil

import math

np.set_printoptions(formatter={'float_kind':'{:f}'.format})
pd.set_option('display.float_format', lambda x: '%.3f' % x)
plt.style.use('seaborn')

## Loading data and preprocessing for Linear Model

In [23]:
def data_split(df):
    """returns 10% of the data"""
    return df[: int((len(df)/10))]


def del_intersection(train, test):
    lst_tr = train['case concept:name'].unique().tolist()
    lst_te = test['case concept:name'].unique().tolist()

    lst_int = set(lst_tr).intersection(lst_te)

    train = train[~train['case concept:name'].isin(lst_int)]
    test = test[~test['case concept:name'].isin(lst_int)]
    return train, test

In [24]:
path = r"D:/University/Year 2/Q3/DBL/Data/BPI Challenge 2012/"
data_train = pd.read_csv(f'{path}BPI_Challenge_2012-training.csv')
data_test = pd.read_csv(f'{path}BPI_Challenge_2012-test.csv')
df_data = pd.read_pickle("D:/University/Year 2/Q3/DBL/Data/processed2012.pkl")

data = pd.concat([data_train, data_test]) 

df_lin = data.copy()

df_lin['event time:timestamp'] = pd.to_datetime(df_lin['event time:timestamp'], dayfirst=True)
df_lin['case REG_DATE'] = pd.to_datetime(df_lin['case REG_DATE'])

df_lin.sort_values(by=['event time:timestamp'], inplace=True)
df_lin.reset_index(inplace=True, drop=True)

# remove whitespace at beginning and end of column name
df_lin.columns = df_lin.columns.str.strip(' ')

# data_split(df_lin)

In [25]:
pd.set_option('display.float_format', lambda x: '%.3f' % x)
unixTransform = lambda x: time.mktime(x.timetuple())
secondsTransform = lambda x: x.total_seconds()

df_data["timestamp"] = df_data["event time:timestamp"].copy()
df_data["event time:timestamp"] = pd.to_datetime(df_data["event time:timestamp"], dayfirst=True)

df_data.sort_values(by=['event time:timestamp'], inplace=True)
df_data.columns = df_data.columns.str.strip()

df_naive = df_data.copy()

In [26]:
df_processed = df_data.copy()

In [27]:
# SPLITS

data_split(df_lin)
data_split(df_naive)
data_split(df_processed)

Unnamed: 0,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,time of day,weekday,Unix,next event,...,prevTime,day,month,hour,day_of_week,hour_cos,hour_sin,day_of_week_cos,day_of_week_sin,timestamp
0,173688,2011-10-01T00:38:44.546+02:00,20000,A_SUBMITTED,COMPLETE,2011-10-01 00:38:44.546,00:38:44.546,Saturday,1317422324,A_PARTLYSUBMITTED,...,NaT,1,10,0.000,5.236,1.000,0.000,0.500,-0.866,2011-10-01 00:38:44.546
1,173688,2011-10-01T00:38:44.546+02:00,20000,A_PARTLYSUBMITTED,COMPLETE,2011-10-01 00:38:44.880,00:38:44.880,Saturday,1317422324,A_PREACCEPTED,...,2011-10-01 00:38:44.546,1,10,0.000,5.236,1.000,0.000,0.500,-0.866,2011-10-01 00:38:44.880
2,173688,2011-10-01T00:38:44.546+02:00,20000,A_PREACCEPTED,COMPLETE,2011-10-01 00:39:37.906,00:39:37.906,Saturday,1317422377,W_Completeren aanvraag,...,2011-10-01 00:38:44.880,1,10,0.000,5.236,1.000,0.000,0.500,-0.866,2011-10-01 00:39:37.906
3,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,SCHEDULE,2011-10-01 00:39:38.875,00:39:38.875,Saturday,1317422378,W_Completeren aanvraag,...,2011-10-01 00:39:37.906,1,10,0.000,5.236,1.000,0.000,0.500,-0.866,2011-10-01 00:39:38.875
4,173691,2011-10-01T08:08:58.256+02:00,5000,A_SUBMITTED,COMPLETE,2011-10-01 08:08:58.256,08:08:58.256,Saturday,1317449338,A_PARTLYSUBMITTED,...,NaT,1,10,2.185,5.236,-0.577,0.817,0.500,-0.866,2011-10-01 08:08:58.256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26215,179188,2011-10-23T12:14:03.577+02:00,15000,W_Completeren aanvraag,START,2011-10-24 11:06:15.224,11:06:15.224,Monday,1319447175,W_Completeren aanvraag,...,2011-10-24 09:56:42.066,24,10,3.005,0.000,-0.991,0.136,1.000,0.000,2011-10-24 11:06:15.224
26216,177609,2011-10-17T18:38:21.448+02:00,47000,O_SENT_BACK,COMPLETE,2011-10-24 11:06:37.730,11:06:37.730,Monday,1319447197,W_Valideren aanvraag,...,2011-10-24 11:05:54.882,24,10,3.005,0.000,-0.991,0.136,1.000,0.000,2011-10-24 11:06:37.730
26217,177609,2011-10-17T18:38:21.448+02:00,47000,W_Valideren aanvraag,SCHEDULE,2011-10-24 11:06:38.945,11:06:38.945,Monday,1319447198,W_Nabellen offertes,...,2011-10-24 11:06:37.730,24,10,3.005,0.000,-0.991,0.136,1.000,0.000,2011-10-24 11:06:38.945
26218,177609,2011-10-17T18:38:21.448+02:00,47000,W_Nabellen offertes,COMPLETE,2011-10-24 11:06:40.112,11:06:40.112,Monday,1319447200,W_Valideren aanvraag,...,2011-10-24 11:06:38.945,24,10,3.005,0.000,-0.991,0.136,1.000,0.000,2011-10-24 11:06:40.112


In [28]:
import random
random.seed(41)

df_lin = df_lin[df_lin['case concept:name'].isin(np.random.choice(df_lin['case concept:name'].unique(), size=10000, replace=False))]

In [29]:
# Remove timezone information
def remove_timezone(dt):
    return dt.replace(tzinfo=None)

# Calculate time difference since case start
df_lin['case REG_DATE'] = df_lin['case REG_DATE'].apply(remove_timezone);
df_lin['days_since_start'] = df_lin['event time:timestamp'] - df_lin['case REG_DATE'];
df_lin['days_since_start'] = df_lin['days_since_start'].dt.days;
df_lin = df_lin.iloc[:-1];

In [30]:
from sklearn.model_selection import train_test_split

df_lin, df_test = train_test_split(df_lin, test_size=0.2, shuffle=False)

In [31]:
df_data.head()

Unnamed: 0,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,time of day,weekday,Unix,next event,...,prevTime,day,month,hour,day_of_week,hour_cos,hour_sin,day_of_week_cos,day_of_week_sin,timestamp
0,173688,2011-10-01T00:38:44.546+02:00,20000,A_SUBMITTED,COMPLETE,2011-10-01 00:38:44.546,00:38:44.546,Saturday,1317422324,A_PARTLYSUBMITTED,...,NaT,1,10,0.0,5.236,1.0,0.0,0.5,-0.866,2011-10-01 00:38:44.546
1,173688,2011-10-01T00:38:44.546+02:00,20000,A_PARTLYSUBMITTED,COMPLETE,2011-10-01 00:38:44.880,00:38:44.880,Saturday,1317422324,A_PREACCEPTED,...,2011-10-01 00:38:44.546,1,10,0.0,5.236,1.0,0.0,0.5,-0.866,2011-10-01 00:38:44.880
2,173688,2011-10-01T00:38:44.546+02:00,20000,A_PREACCEPTED,COMPLETE,2011-10-01 00:39:37.906,00:39:37.906,Saturday,1317422377,W_Completeren aanvraag,...,2011-10-01 00:38:44.880,1,10,0.0,5.236,1.0,0.0,0.5,-0.866,2011-10-01 00:39:37.906
3,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,SCHEDULE,2011-10-01 00:39:38.875,00:39:38.875,Saturday,1317422378,W_Completeren aanvraag,...,2011-10-01 00:39:37.906,1,10,0.0,5.236,1.0,0.0,0.5,-0.866,2011-10-01 00:39:38.875
4,173691,2011-10-01T08:08:58.256+02:00,5000,A_SUBMITTED,COMPLETE,2011-10-01 08:08:58.256,08:08:58.256,Saturday,1317449338,A_PARTLYSUBMITTED,...,NaT,1,10,2.185,5.236,-0.577,0.817,0.5,-0.866,2011-10-01 08:08:58.256


## Naive Models

### Events

In [32]:
def simplePred(df,lst, cEvnt, cNw, cPrd):
    '''loops through the list of events and adds them to the df'''
    i = 0
    while i < len(lst):
        df.loc[df[cEvnt] == lst[i], cPrd] = df[df[cEvnt] == lst[i]][cNw].mode()[0]
        i += 1
    RAM = ('RAM memory % used:', psutil.virtual_memory()[2])
    CPU = ('CPU % used:', psutil.cpu_percent())
    df, RAM, CPU

def simpleAccuracy(df, cReal, cPrd):
    '''computes a simple accuracy score'''
    return len(df[df[cReal] == df[cPred]]) / len(df)

In [33]:
cEvent = 'event concept:name'
cNew = 'next event'
cPred = 'predicted next event'

df_naive[cNew] = df_naive[cEvent].shift(-1)
df_naive[cPred] = np.nan
lstEvents = df_naive[cEvent].unique().tolist()

In [34]:
# naive_event, naive_event_ram, naive_event_cpu = simplePred(df_naive, lstEvents, cEvent, cNew, cPred)
naive_acc = simpleAccuracy(df_naive, cNew, cPred)

In [35]:
naive_acc = simpleAccuracy(df_naive, cNew, cPred)
print(f'The accuracy for the naive event prediction is {round(naive_acc, 4)}, this prediction is log based')
# print(f'The naive event prediction has the following usage {naive_event_ram}')
# print(f'The naive event prediction has the following usage {naive_event_cpu}')

The accuracy for the naive event prediction is 0.0, this prediction is log based


### Time

In [36]:
def EventTime(data):
    memory = {} # Stores active events, key=event, value=index of start
    for i, j in data[["event concept:name"]].iterrows():
        
        if j[0] not in memory.keys() and data.loc[i]["event lifecycle:transition"].lower() == "start":
            memory[j[0]] = i
        
        elif j[0] in memory.keys():
            time = pd.to_datetime(data.loc[i]["event time:timestamp"], dayfirst=True) - pd.to_datetime(data.loc[memory[j[0]]]["event time:timestamp"], dayfirst=True)           # print(time)
            data.loc[i,"Completion Time"] = time.total_seconds()
            del memory[j[0]] # Removes the completed event from active event dictionary(memory)
    
    return data # Returns a new dataframe with event completion time. Can be improved.

def AverageTime(df_train):
    """
    1. Computes the event completion time for events that don't have a "start" by taking the 
       difference between said event's completion time and the previous (i-1) event time. Still have 
       to determine how correct this is, as several events can run in parallel.
    2. Sets new index on df and computes the remaining event times (for events that have a start and completion time)
       by running the EventTime function case by case. Can be improved.
    3. Returns average completion time per event.
    
    """
    df_train["Completion Time"] = 0
    df_train.replace(0,np.NaN, inplace=True)
    for i in range(1, df_train['Completion Time'].shape[0]-1): # 1
        if pd.isnull(df_train['Completion Time'][i]):
            if df_train['event lifecycle:transition'][i].lower() == 'complete':
                time = pd.to_datetime(df_train["event time:timestamp"][i], dayfirst=True) - pd.to_datetime(df_train["event time:timestamp"][i-1], dayfirst=True)          
                df_train['Completion Time'][i] = time.total_seconds()
    
    df_train.set_index(["case concept:name", "eventID"], inplace=True) # 2
    
    for idx, new_df in df_train.groupby(level=0): # Adds event completion time, case by case.
        df_train.loc[idx] = EventTime(new_df)
        
    Average_time = df_train.groupby(['event concept:name'])[['Completion Time']].mean() # 3
    return Average_time

#Average time in seconds
def AddAverageTime(data):
    data['Average time till next event'] = 0
    avr_time = AverageTime(data)
    avr_time.reset_index(inplace=True)
    data.reset_index(inplace=True)
    for i in range(0, data['Completion Time'].shape[0]):
        abc = data['event concept:name'][i]
        time = avr_time.loc[avr_time['event concept:name'] == abc, 'Completion Time']
        time_list = list(time)
        data['Average time till next event'][i] = time_list[0]
    RAM = ('RAM memory % used:', psutil.virtual_memory()[2])
    CPU = ('CPU % used:', psutil.cpu_percent())
    return data, RAM, CPU

In [37]:
# naive_time, naive_time_ram, naive_time_cpu = AddAverageTime(df_naive)

### Decision Tree

In [39]:
# assign long column names to variables for easier use
cases = "case concept:name"
reg_date = "case REG_DATE"
amount_req = "case AMOUNT_REQ"
event_name = "event concept:name"
lifecycle = "event lifecycle:transition"
tmstmp = "event time:timestamp"
nxt_event = "next event"
dtime = "delta time"

In [40]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

event_encoder = LabelEncoder()
time_of_day_encoder = OrdinalEncoder()

labels = df_processed[event_name].unique()
event_encoder.fit(labels);

In [41]:
df_processed[[event_name, nxt_event]] = df_processed[[event_name, nxt_event]].apply(event_encoder.fit_transform)
df_processed["time of day"] = time_of_day_encoder.fit_transform(df_processed[["time of day"]])

In [50]:
tree_data = df_processed[[amount_req, event_name, nxt_event, "time of day", "case concept:name"]]

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

tscv = TimeSeriesSplit()

train_tree, test_tree = train_test_split(tree_data, test_size=0.2, random_state=42, shuffle=False)

In [64]:
# train_tree

In [54]:
# delete intersection cases
train_tree, test_tree = del_intersection(train_tree, test_tree)

In [56]:
X_train_tree = train_tree[[amount_req, event_name, "time of day"]]
X_test_tree = test_tree[[amount_req, event_name, "time of day"]]

y_train_tree = train_tree[[nxt_event]]
y_test_tree = test_tree[[nxt_event]]

In [57]:
tree_reg = DecisionTreeClassifier(random_state=42)
tree_reg.fit(X_train_tree, y_train_tree);

In [None]:
# this is obviously false
prediction_tree = tree_reg.predict(X_train_tree)
tree_mse = mean_squared_error(y_train_tree, prediction_tree)
tree_rmse = np.sqrt(tree_mse)
tree_rmse

In [None]:
params = {"max_depth" : list(range(5,12))}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42), params, cv=tscv)

In [None]:
final_model = grid_search_cv.fit(X_train_tree, y_train_tree).best_estimator_

In [None]:
print(f'The best estimator is: {final_model}')

In [None]:
y_pred_tree = final_model.predict(X_test_tree)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

prec_score = precision_score(y_test_tree, y_pred_tree, average="weighted", zero_division=0)
rec_score = recall_score(y_test_tree, y_pred_tree, average="weighted", zero_division=0)
F1_score = f1_score(y_test_tree, y_pred_tree, average="weighted", zero_division=0)
acc_score = accuracy_score(y_test_tree, y_pred_tree)

print(f'The accuracy of the model is {acc_score}.')
print(f'The precision of the model is {prec_score}, using weighted average.')
print(f'The recall of the model is {rec_score}, using weighted average.')
print(f'The f1-score of the model is {F1_score}, using weighted average.')

### Time

In [59]:
df_encodedTime = df_processed.copy()

In [60]:
df_encodedTime["Completion Time"] = np.nan
df_encodedTime["Completion Time"] = (df_encodedTime["nextTime"] - df_encodedTime["event time:timestamp"]).apply(secondsTransform)
df_encodedTime.dropna(inplace=True)

df_encodedTime["Time Average"] = 0

for name, group in df_encodedTime[["event concept:name", "Completion Time"]].groupby(["event concept:name"]):
    mean = group["Completion Time"].mean()
    df_encodedTime.loc[df_encodedTime["event concept:name"] == name, "Time Average"] = mean

In [63]:
# df_encodedTime

In [65]:
# X = df_encodedTime[['event concept:name', "case AMOUNT_REQ", "Time Average"]]
# y = df_encodedTime['Completion Time']

train_time, test_time = train_test_split(df_encodedTime, test_size=0.3, random_state=42, shuffle=False)

train_time, test_time = del_intersection(train_time, test_time)

In [66]:
X_train_time = train_time[['event concept:name', "case AMOUNT_REQ", "Time Average"]]
X_test_time = test_time[['event concept:name', "case AMOUNT_REQ", "Time Average"]]

y_train_time = train_time['Completion Time']
y_test_time = test_time['Completion Time']

In [68]:
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(X_train_time, y_train_time)

y_pred = regr.predict(X_test_time)
# % of negative predicted values
(len(y_pred[y_pred<0])/len(y_pred))*100

13.2014139296943

In [69]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test_time, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test_time, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test_time, y_pred)))

Mean Absolute Error: 56051.62936958207
Mean Squared Error: 17188625796.689976
Root Mean Squared Error: 131105.39957107021


## Linear Model 2

### Function definitions

In [None]:
import statsmodels.api as sm



def sliding_window(window_size, df):
    """transforms df_lin into supervised form 
    with rolling window implementations

    Args:
        window_size (int): size of rolling window

    Returns:
        (X, Y): tuple of input and output arrays
    """

    windows = list(df.rolling(window=window_size))
    for i in windows[window_size-1:]:    
        # split into X and Y
        temp = i.to_numpy()
        temp = [item for sublist in temp for item in sublist]
        Y.append(temp.pop(-1))
        X.append(temp[1:])
    return None


def cross_validate(X, Y):
    """Creates a timesseries split and calculates 
    cross validation error fitted on a given estimator

    Args:
        X (array): input array 
        Y (array): output array

    Returns:
        output, model (tuple): list of true y and predicted + model
    """
    output = []
    ts = TimeSeriesSplit(gap=175, max_train_size=None, n_splits=5, test_size=None)

    for train_index, test_index in ts.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        
        model = linear_model.LinearRegression().fit(X_train, y_train)
        y_pred = model.predict(X_test)
        output.append((y_test, y_pred))
        
    return output, model

### Encoding

In [None]:
from sklearn.preprocessing import MinMaxScaler

def encode(df):
    scaler = MinMaxScaler()
    scaler.fit(df[['case AMOUNT_REQ']])
    df['case AMOUNT_REQ'] = scaler.transform(df[['case AMOUNT_REQ']])

    # Get dummy variables and encode lifecycle
    df_dummies_lifecycle = pd.get_dummies(df['event lifecycle:transition'], prefix='Lifecycle', drop_first=True)
    df = df.loc[:, df.columns != 'event lifecycle:transition'].copy().join(df_dummies_lifecycle)


    # dummy variables encoded
    df_dummies_name = pd.get_dummies(df['event concept:name'], prefix='Event Name', drop_first=True)
    df.drop('event concept:name', axis=1, inplace=True)
    df = df.join(df_dummies_name)
    return df

### Feature engineering (I think?)

In [None]:
def preprocess(df, log, outliers):
    df = df.copy()
    # Calculate time difference between current and next row
    df['time_diff'] = df['event time:timestamp'].shift(-1);
    df['time_diff'] = df['time_diff'] - df['event time:timestamp'] 
    df['time_diff'] = df['time_diff'].dt.total_seconds();
    # df['time_diff'] = df['time_diff'].round();
    df = df.iloc[:-1];
    
    df = df[df['time_diff']>=0]

    if log == True:
        df['time_diff'] = np.log(df['time_diff'].replace(0, np.nan))
        df['time_diff'] = df['time_diff'].replace(np.nan, 0)
    
    
    # Adding time features
    df['day'] = df['event time:timestamp'].dt.day;
    df['month'] = df['event time:timestamp'].dt.month;
    df['hour'] = df['event time:timestamp'].dt.hour;
    df['day_of_week'] = df['event time:timestamp'].dt.weekday;

    # Cyclical encoding
    df["hour"] = 2 * math.pi * df["hour"] / df["hour"].max()
    df["hour_cos"] = np.cos(df["hour"])
    df["hour_sin"] = np.sin(df["hour"])
    df["day_of_week"] = 2 * math.pi * df["day_of_week"] / df["day_of_week"].max()
    df["day_of_week_cos"] = np.cos(df["day_of_week"])
    df["day_of_week_sin"] = np.sin(df["day_of_week"])

    # drop unnecessary columns
    df.drop(columns=['event time:timestamp', 'eventID', 'case REG_DATE',
                        'day_of_week', 'hour'], inplace=True)

    # put columns in right order
    cols = df.columns.tolist()
    cols.remove('time_diff')
    cols = cols + ['time_diff']
    df = df[cols]

    if outliers == 'capped':
        # Filter outliers
        df.loc[df['time_diff']>600, 'time_diff'] = 601
    elif outliers == 'removed':
        df = df[df['time_diff']<=600]
    elif outliers == 'keep':
        pass
    else:
        print('ERROR')
        
        
    return df
    

In [None]:
df_lin = encode(df_lin)
df_lin = df_lin.groupby('case concept:name').apply(lambda x: preprocess(x, log=True, outliers='removed'))
df_lin.drop('case concept:name', axis=1, inplace = True)

df_lin = df_lin[~df_lin['hour_cos'].isna()]
df_lin = df_lin[~df_lin['day_of_week_cos'].isna()]

In [None]:
X = []
Y = []
df_lin.groupby('case concept:name').apply(lambda x: sliding_window(3, x))
test = cross_validate(np.array(X), np.array(Y))
model = test[1]

In [None]:
test = test[0]
mse = np.mean([mean_squared_error(i[0], i[1]) for i in test])
r2 = np.mean([r2_score(i[0], i[1]) for i in test])
mae = np.mean([mean_absolute_error(i[0], i[1]) for i in test])
rmse = np.mean([mean_squared_error(i[0], i[1], squared=False) for i in test])
# print(f'mse: {math.exp(mse)}\nr^2: {r2}\nrmse: {math.exp(rmse)}\nmae:{math.exp(mae)}')
print(f'mse: {mse}\nr^2: {r2}\nrmse: {rmse}\nmae:{mae}')

In [None]:
y_true = pd.DataFrame(test[-1][0])
y_pred = pd.DataFrame(test[-1][1])
df_error = pd.concat([y_true, y_pred], axis=1)
df_error.columns = ['y_true', 'y_pred']
df_error.reset_index(inplace=True)
df_error['error'] = df_error['y_true'] - df_error['y_pred']

In [None]:
fig, ax = plt.subplots(figsize=(10,7))
# df_error.plot.hist(x='index', y='error', figsize=(10,5), ax=ax, bins=10)
sns.histplot(data=df_error, x='error', ax=ax, color='salmon')
ax.set_title('Error distribution with log transform', size=30)
ax.set_ylabel('count', fontsize=20)
ax.set_xlabel('Error in e^seconds', fontsize=20)
plt.yticks(fontsize=20);
plt.xticks(fontsize=20);

### Test set

In [None]:
df_test = encode(df_test)

df_test = df_test.groupby('case concept:name').apply(lambda x: preprocess(x, log=True, outliers='keep'))
df_test.drop('case concept:name', axis=1, inplace = True)

df_test = df_test[~df_test['hour_cos'].isna()]
df_test = df_test[~df_test['day_of_week_cos'].isna()]

In [None]:
X = []
Y = []
df_test.groupby('case concept:name').apply(lambda x: sliding_window(3, x))
y_pred = model.predict(X)

In [None]:
mse = mean_squared_error(Y, y_pred)
r2 = r2_score(Y, y_pred)
mae = mean_absolute_error(Y,y_pred )
rmse = mean_squared_error(Y, y_pred, squared=False)
print(f'mse: {math.exp(mse)}\nr^2: {r2}\nrmse: {math.exp(rmse)}\nmae:{math.exp(mae)}')

---