In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
import datetime

np.set_printoptions(formatter={'float_kind':'{:f}'.format})

plt.style.use('seaborn')

In [2]:
# path = r"/home/dazai/Documents/Processmining/Data/"
# df_train = pd.read_csv(f'{path}BPI_Challenge_2012-training.csv')
# df_test = pd.read_csv(f'{path}BPI_Challenge_2012-test.csv')


# df_data = pd.concat([df_train, df_test])
# df_data['event time:timestamp'] = pd.to_datetime(df_data['event time:timestamp'], dayfirst=False)
# df_data['case REG_DATE'] = pd.to_datetime(df_data['case REG_DATE'], dayfirst=True)

# # df_data['event time:timestamp'] = df_data['event time:timestamp'].to_timestamp()
# df_data.sort_values(by=['event time:timestamp'], inplace=True)
# df_data.reset_index(inplace=True, drop=True)

# # remove whitespace at beginning and end of column name
# df_data.columns = df_data.columns.str.strip()

# Everything above is put into a pickle
df_data = pd.read_pickle('/home/dazai/Documents/Processmining/Data/pickle.pkl')


In [3]:

def sliding_window(df_data, window_size, response_variable):
    start = 0
    X = []
    Y = []

    for i in range(len(df_data)):
        if i >= window_size:
            window = df_data.iloc[start:i+1]
            temp = window.iloc[:window_size].to_numpy()
            X.append([item for sublist in temp for item in sublist])
            Y.append(window.iloc[-1][response_variable])
            start +=1
            
    return np.array(X), np.array(Y)
    
# Remove timezone information
def remove_timezone(dt):
    return dt.replace(tzinfo=None)
 
df_data['case REG_DATE'] = df_data['case REG_DATE'].apply(remove_timezone)

# Time Regression

In [4]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df_data[['case AMOUNT_REQ']])
df_data['case AMOUNT_REQ'] = scaler.transform(df_data[['case AMOUNT_REQ']])

In [5]:
# Get unique activities and encode them
Y = df_data['event concept:name'].unique()
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(Y)

# Get dummy variables and encode lifecycle
df_dummies_lifecycle = pd.get_dummies(df_data['event lifecycle:transition'], prefix='Lifecycle', drop_first=True)
df_data = df_data.loc[:, df_data.columns != 'event lifecycle:transition'].copy().join(df_dummies_lifecycle)

# Encode event concept name
df_data['event concept:name'] = label_encoder.transform(df_data['event concept:name'])


In [6]:
# Check if any time difference between case start and first event time is larger than 1 second
for i in df_data['case concept:name'].unique():
    start = df_data[df_data['case concept:name']==i].iloc[0]
    if (start['event time:timestamp'] - start['case REG_DATE']) > datetime.timedelta(seconds=1):
        display(start.to_frame())
        
        
bad_traces = df_data[df_data['case REG_DATE'] > df_data['event time:timestamp']]['case concept:name'].unique()
    
df_data = df_data[~df_data['case concept:name'].isin(bad_traces)]
    

Example of a case containing events before the start of the trace

In [7]:
# df_data[df_data['case REG_DATE'] > df_data['event time:timestamp']]['case concept:name']
# df_data[df_data['case concept:name']==177531]

In [8]:
# Calculate time difference between current and next row
df_data['time_diff'] = df_data['event time:timestamp'].shift(-1);
df_data['time_diff'] = df_data['time_diff'] - df_data['event time:timestamp'];
df_data['time_diff'] = df_data['time_diff'].dt.total_seconds();
df_data['time_diff'] = df_data['time_diff'].round();

# Calculate time difference since case start
df_data['case REG_DATE'] = df_data['case REG_DATE'].apply(remove_timezone);
df_data['time_since_start'] = df_data['event time:timestamp'] - df_data['case REG_DATE'];
df_data['time_since_start'] = df_data['time_since_start'].dt.days;
df_data = df_data.iloc[:-1];

# Adding time features
df_data['day'] = df_data['event time:timestamp'].dt.day;
df_data['month'] = df_data['event time:timestamp'].dt.month;
df_data['hour'] = df_data['event time:timestamp'].dt.hour;

Note: we can see that 98.3% of the events have their time between current and next event larger than 10 minutes in order to keep these stats we just assign 601 to everything. This happens in 23% of the traces.

In [9]:
print(len(df_data[df_data['time_diff']<600])/len(df_data))
print(len(df_data[df_data['time_diff']>600]['case concept:name'].unique())/len(df_data['case concept:name'].unique()))
df_data.loc[df_data['time_diff']>600, 'time_diff'] = 601


0.9830521561209127
0.22706333973128598


In [10]:
scaler = MinMaxScaler()
scaler.fit(df_data[['time_diff']])
df_data['time_diff'] = scaler.transform(df_data[['time_diff']])

df_data.drop(columns=['event time:timestamp', 'eventID', 'case concept:name', 'case REG_DATE'], inplace=True)


In [11]:
X, Y = sliding_window(df_data, 3, 'time_diff')
ts = TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)

def cross_validate(X, Y):
    for train_index, test_index in ts.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        reg = linear_model.LinearRegression()
        reg.fit(X_train, y_train)   

        y_pred = reg.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        print(scaler.inverse_transform(np.array(mse).reshape(1, -1)))

Note: cross validation error measure is done on the capped time we did at the last note

In [12]:
cross_validate(X, Y)

[[17.541530]]
[[13.008707]]
[[13.256605]]
[[11.605092]]
[[19.596198]]


In [13]:
# Cyclical encoding
# df_data["x_norm"] = 2 * math.pi * df_data["x"] / df_data["x"].max()

# df_data["cos_x"] = np.cos(df_data["x_norm"])
# df_data["sin_x"] = np.sin(df_data["x_norm"])

# Event prediction