In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
import datetime
import math

np.set_printoptions(formatter={'float_kind':'{:f}'.format})

plt.style.use('seaborn')

In [2]:
# path = r"/home/dazai/Documents/Processmining/Data/"
# df_train = pd.read_csv(f'{path}BPI_Challenge_2012-training.csv')
# df_test = pd.read_csv(f'{path}BPI_Challenge_2012-test.csv')


# df_data = pd.concat([df_train, df_test])
# df_data['event time:timestamp'] = pd.to_datetime(df_data['event time:timestamp'], dayfirst=False)
# df_data['case REG_DATE'] = pd.to_datetime(df_data['case REG_DATE'], dayfirst=True)

# # df_data['event time:timestamp'] = df_data['event time:timestamp'].to_timestamp()
# df_data.sort_values(by=['event time:timestamp'], inplace=True)
# df_data.reset_index(inplace=True, drop=True)

# # remove whitespace at beginning and end of column name
# df_data.columns = df_data.columns.str.strip()

# Everything above is put into a pickle
df_data = pd.read_pickle('/home/dazai/Documents/Processmining/Data/pickle.pkl')


In [3]:

def sliding_window(df_data, window_size, response_variable): 
    """Iterates over input data and creates batches 
    with variable lag based on response_variable

    Args:
        df_data (dataframe): input dataframe
        window_size (int): amount of lags
        response_variable (str): column we want to predict

    Returns:
        (X, Y) (tuple): tuple of arrays transformed using sliding window
    """
    start = 0
    X = []
    Y = []

    for i in range(len(df_data)):
        if i >= window_size:
            window = df_data.iloc[start:i+1]
            temp = window.iloc[:window_size].to_numpy()
            X.append([item for sublist in temp for item in sublist])
            Y.append(window.iloc[-1][response_variable])
            start +=1
            
    return np.array(X), np.array(Y)
    
# Remove timezone information
def remove_timezone(dt):
    return dt.replace(tzinfo=None)
 
df_data['case REG_DATE'] = df_data['case REG_DATE'].apply(remove_timezone)

## Encoding

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(df_data[['case AMOUNT_REQ']])
df_data['case AMOUNT_REQ'] = scaler.transform(df_data[['case AMOUNT_REQ']])

# Get unique activities and encode them
Y = df_data['event concept:name'].unique()
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(Y)

# Get dummy variables and encode lifecycle
df_dummies_lifecycle = pd.get_dummies(df_data['event lifecycle:transition'], prefix='Lifecycle', drop_first=True)
df_data = df_data.loc[:, df_data.columns != 'event lifecycle:transition'].copy().join(df_dummies_lifecycle)

# Encode event concept name
df_data['event concept:name'] = label_encoder.transform(df_data['event concept:name'])


## Preprocessing

Note: The following cell is not used for time regression as the error is much lower without filter

In [5]:
def filter(df):
    # Check if any time difference between case start and first event time is larger than 1 second
    for i in df['case concept:name'].unique():
        start = df[df['case concept:name']==i].iloc[0]
        if (start['event time:timestamp'] - start['case REG_DATE']) > datetime.timedelta(seconds=1):
            display(start.to_frame())
            
    bad_traces = df[df['case REG_DATE'] > df['event time:timestamp']]['case concept:name'].unique()
    
    return df[~df['case concept:name'].isin(bad_traces)]


# Example of a trace with events starting before the starting time of the trace itself.
# df_data[df_data['case REG_DATE'] > df_data['event time:timestamp']]['case concept:name']
# df_data[df_data['case concept:name']==177531]
    

In [6]:
# Calculate time difference between current and next row
df_data['time_diff'] = df_data['event time:timestamp'].shift(1);
df_data['time_diff'] = df_data['event time:timestamp'] - df_data['time_diff']
df_data['time_diff'] = df_data['time_diff'].dt.total_seconds();
df_data['time_diff'] = df_data['time_diff'].round();
df_data = df_data.iloc[1:];


# Calculate time difference since case start
df_data['case REG_DATE'] = df_data['case REG_DATE'].apply(remove_timezone);
df_data['days_since_start'] = df_data['event time:timestamp'] - df_data['case REG_DATE'];
df_data['days_since_start'] = df_data['days_since_start'].dt.days;
df_data = df_data.iloc[:-1];

# Adding time features
df_data['day'] = df_data['event time:timestamp'].dt.day;
df_data['month'] = df_data['event time:timestamp'].dt.month;
df_data['hour'] = df_data['event time:timestamp'].dt.hour;
df_data['day_of_week'] = df_data['event time:timestamp'].dt.weekday;


Note: we can see that 99% of the events have their time between current and next event smaller than 10 minutes in order to keep these stats we just assign 601 to everything. This happens in 18% of the traces. Capping this before scaling the data greatly reduces the error on predictions

In [7]:
print(len(df_data[df_data['time_diff']<600])/len(df_data))
print(len(df_data[df_data['time_diff']>600]['case concept:name'].unique())/len(df_data['case concept:name'].unique()))
df_data.loc[df_data['time_diff']>600, 'time_diff'] = 601


0.9904080122655398
0.18132497898678077


## Evaluation
Note: error measures are done on the capped time we did at the last note

In [8]:
from sklearn.ensemble import RandomForestClassifier
# from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score


def cross_validate(X, Y, estimator, tree_based):
    """Creates a timesseries split and calculates 
    cross validation error fitted on a given estimator

    Args:
        X (array): input array 
        Y (array): output array
        estimator (sklearn_model): Tree based machine learning model

    Returns:
        accuracies (list): list of cross validation accuracies
    """
    errors = []
    ts = TimeSeriesSplit(gap=10, max_train_size=None, n_splits=5, test_size=None)
    
    for train_index, test_index in ts.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        
        clf = estimator
        clf = clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        
        # Tree based and linear regression have different evaluation metrics
        if tree_based:
            error = accuracy_score(y_test, y_pred)
            errors.append(error)
        else:
            mse = mean_squared_error(y_test, y_pred)
            errors.append(scaler.inverse_transform(np.array(mse).reshape(1, -1))[0][0])
    return errors

        

## Tree based regression, Only run the cells under this header and restart if you want to use a different type of estimator

In [9]:
df_data.drop(columns=['event time:timestamp', 'eventID', 'case concept:name', 'case REG_DATE'], inplace=True)

bins = 10

label_encoder = LabelEncoder()
# print confidence intervals and encode them
print(pd.cut(df_data['time_diff'], bins=bins, retbins=True))
df_data['time_diff'] = label_encoder.fit_transform(pd.cut(df_data['time_diff'], bins=bins, retbins=True)[0])


X, Y = sliding_window(df_data, 3, 'time_diff')
for i in range(2,10):
    print(f'estimators:{i} mean error:{np.mean(cross_validate(X,Y, RandomForestClassifier(n_estimators=10), True))}')

(1         (-0.601, 60.1]
2         (-0.601, 60.1]
3         (-0.601, 60.1]
4         (540.9, 601.0]
5         (-0.601, 60.1]
               ...      
262194    (-0.601, 60.1]
262195    (120.2, 180.3]
262196    (180.3, 240.4]
262197    (120.2, 180.3]
262198    (240.4, 300.5]
Name: time_diff, Length: 262198, dtype: category
Categories (10, interval[float64, right]): [(-0.601, 60.1] < (60.1, 120.2] < (120.2, 180.3] < (180.3, 240.4] ... (360.6, 420.7] < (420.7, 480.8] < (480.8, 540.9] < (540.9, 601.0]], array([-0.601000, 60.100000, 120.200000, 180.300000, 240.400000,
       300.500000, 360.600000, 420.700000, 480.800000, 540.900000,
       601.000000]))
estimators:2 mean error:0.882450399322639
estimators:3 mean error:0.8829721503924575
estimators:4 mean error:0.8827661960227923
estimators:5 mean error:0.8832650632737591
estimators:6 mean error:0.8835900135014532
estimators:7 mean error:0.8816082747888968


mean accuracy per amount of estimators <br><br>
estimators:2 mean error:0.8830819927229456 <br>
estimators:3 mean error:0.8833245612027735 <br>
estimators:4 mean error:0.8821437561500263 <br>
estimators:5 mean error:0.8826517769285338 <br>
estimators:6 mean error:0.8829309595185244 <br>
estimators:7 mean error:0.8822352914254331 <br>
estimators:8 mean error:0.8826563536923041 <br>
estimators:9 mean error:0.8829492665736058 <br>


## Linear regression, Only run the cells under this header and restart if you want to use a different type of estimator

In [None]:
df_time = df_data.copy()

# Cyclical encoding
df_time["hour"] = 2 * math.pi * df_time["hour"] / df_time["hour"].max()
df_time["hour_cos"] = np.cos(df_time["hour"])
df_time["hour_sin"] = np.sin(df_time["hour"])
df_time.drop(columns='hour', inplace=True)

df_time["day_of_week"] = 2 * math.pi * df_time["day_of_week"] / df_time["day_of_week"].max()
df_time["day_of_week_cos"] = np.cos(df_time["day_of_week"])
df_time["day_of_week_sin"] = np.sin(df_time["day_of_week"])
df_time.drop(columns='day_of_week', inplace=True)

scaler = MinMaxScaler()
scaler.fit(df_time[['time_diff']])
df_time['time_diff'] = scaler.transform(df_time[['time_diff']])


In [None]:
df_time.drop(columns=['event time:timestamp', 'eventID', 'case concept:name', 'case REG_DATE'], inplace=True)

X, Y = sliding_window(df_time, 3, 'time_diff')
np.mean(cross_validate(X,Y, linear_model.LinearRegression(),False))

# Event prediction
The sliding window implementation is log based, for time prediction this seems to work well. However event prediction seem to be more reliant on information within traces.

In [None]:
df_data.drop(columns=['day', 'month'])

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

from sklearn import tree

def cross_validate(X, Y):
    mses = []

    for train_index, test_index in ts.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        clf = RandomForestClassifier(n_estimators=10)
        clf = clf.fit(X_train, y_train)

        
        y_pred = clf.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        mses.append(accuracy)
    return mses

In [None]:
X, Y = sliding_window(df_data, 3, 'event concept:name')


In [None]:
np.mean(cross_validate(X, Y))
