In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
import math

np.set_printoptions(formatter={'float_kind':'{:f}'.format})
pd.set_option('display.float_format', lambda x: '%.3f' % x)
plt.style.use('seaborn')

## Loading data and preprocessing

In [2]:
df_data = pd.read_pickle('/home/dazai/Documents/Processmining/Data/pickle.pkl')

Uncomment the cell below and run it if you dont have a pickle of preprocessed data

In [3]:
# path = r"/home/dazai/Documents/Processmining/Data/"
# df_train = pd.read_csv(f'{path}BPI_Challenge_2012-training.csv')
# df_test = pd.read_csv(f'{path}BPI_Challenge_2012-test.csv')


# df_data = pd.concat([df_train, df_test])
# df_data['event time:timestamp'] = pd.to_datetime(df_data['event time:timestamp'], dayfirst=True)
# df_data['case REG_DATE'] = pd.to_datetime(df_data['case REG_DATE'])

# df_data.sort_values(by=['event time:timestamp'], inplace=True)
# df_data.reset_index(inplace=True, drop=True)

# # remove whitespace at beginning and end of column name
# df_data.columns = df_data.columns.str.strip(' ')

## Function definitions

In [4]:
def sliding_window(window_size):
    """transforms df_data into supervised form 
    with rolling window implementations

    Args:
        window_size (int): size of rolling window

    Returns:
        (X, Y): tuple of input and output arrays
    """
    X = []
    Y = []

    windows = list(df_data.rolling(window=window_size))
    for i in windows[window_size-1:]:    
        # split into X and Y
        temp = i.to_numpy()
        temp = [item for sublist in temp for item in sublist]
        Y.append(temp.pop(-1))
        X.append(temp)
    return np.array(X), np.array(Y)


def cross_validate(X, Y):
    """Creates a timesseries split and calculates 
    cross validation error fitted on a given estimator

    Args:
        X (array): input array 
        Y (array): output array
        estimator (sklearn_model): Tree based machine learning model

    Returns:
        accuracies (list): list of cross validation accuracies
    """
    errors = []
    ts = TimeSeriesSplit(gap=10, max_train_size=None, n_splits=5, test_size=None)

    for train_index, test_index in ts.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = Y[train_index], Y[test_index]
        
        clf = linear_model.LinearRegression()
        clf = clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        

        mse = mean_squared_error(y_test, y_pred)
        errors.append(mse)
    return errors

        
    
# Remove timezone information
def remove_timezone(dt):
    return dt.replace(tzinfo=None)
 
df_data['case REG_DATE'] = df_data['case REG_DATE'].apply(remove_timezone)

## Encoding

In [5]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(df_data[['case AMOUNT_REQ']])
df_data['case AMOUNT_REQ'] = scaler.transform(df_data[['case AMOUNT_REQ']])

# Get dummy variables and encode lifecycle
df_dummies_lifecycle = pd.get_dummies(df_data['event lifecycle:transition'], prefix='Lifecycle', drop_first=True)
df_data = df_data.loc[:, df_data.columns != 'event lifecycle:transition'].copy().join(df_dummies_lifecycle)


# dummy variables encoded
df_dummies_name = pd.get_dummies(df_data['event concept:name'], prefix='Event Name', drop_first=True)
df_data.drop('event concept:name', axis=1, inplace=True)
df_data = df_data.join(df_dummies_name)

## Feature engineering (I think?)

In [6]:
# Calculate time difference between current and next row
df_data['time_diff'] = df_data['event time:timestamp'].shift(-1);
df_data['time_diff'] = df_data['time_diff'] - df_data['event time:timestamp'] 
df_data['time_diff'] = df_data['time_diff'].dt.total_seconds();
# df_data['time_diff'] = df_data['time_diff'].round();
df_data = df_data.iloc[:-1];

# Filter outliers
# df_data.loc[df_data['time_diff']>600, 'time_diff'] = 601
df_data = df_data[df_data['time_diff']<=600]

# Calculate time difference since case start
df_data['case REG_DATE'] = df_data['case REG_DATE'].apply(remove_timezone);
df_data['days_since_start'] = df_data['event time:timestamp'] - df_data['case REG_DATE'];
df_data['days_since_start'] = df_data['days_since_start'].dt.days;
df_data = df_data.iloc[:-1];

# Adding time features
df_data['day'] = df_data['event time:timestamp'].dt.day;
df_data['month'] = df_data['event time:timestamp'].dt.month;
df_data['hour'] = df_data['event time:timestamp'].dt.hour;
df_data['day_of_week'] = df_data['event time:timestamp'].dt.weekday;

# Cyclical encoding
df_data["hour"] = 2 * math.pi * df_data["hour"] / df_data["hour"].max()
df_data["hour_cos"] = np.cos(df_data["hour"])
df_data["hour_sin"] = np.sin(df_data["hour"])
df_data["day_of_week"] = 2 * math.pi * df_data["day_of_week"] / df_data["day_of_week"].max()
df_data["day_of_week_cos"] = np.cos(df_data["day_of_week"])
df_data["day_of_week_sin"] = np.sin(df_data["day_of_week"])

# drop unnecessary columns
df_data.drop(columns=['event time:timestamp', 'eventID', 'case concept:name', 'case REG_DATE', 'day_of_week', 'hour'], inplace=True)

# put columns in right order
cols = df_data.columns.tolist()
cols.remove('time_diff')
cols = cols + ['time_diff']
df_data = df_data[cols]


In [7]:
X, Y = sliding_window(5)
cross_validate(X, Y)

[1892.0392908253148,
 1993.7187287617103,
 3685.817597814194,
 1711.2514610945411,
 2346.429837422544]

### Results based on window_size of 5
Mse without filtering outliers: [225325.63533256113,
 285941.6154406469,
 45467736.95099862,
 216166.30340970002,
 855288.0102282097] <br>mean: a lot <br> <br>
Mse removed outliers : [1892.0392908253148,
 1993.7187287617103,
 3685.817597814194,
 1711.2514610945411,
 2346.429837422544] <br>mean: 2325.851383183661<br><br>
Mse with capped outliers: [3884.040469677467,
 4085.3240826743213,
 73244.44519472372,
 3520.115231681768,
 4315.405208939567] <br>mean: 17809.86603753937

Checks

In [8]:
# np.mean([1940.9284421407326,
#  2055.2853996454674,
#  3018.4338305175916,
#  1752.2514213197157,
#  2414.388346213773])


# counter = 0
# for i in range(100000,260000,5000):
#     if counter < 150:
#         display(df_data.iloc[[i]])
#     counter += 1


# counter = 0
# for i in df_data[df_data['time_diff']>28800].index:
#     if counter < 5:
#         display(df_data.loc[i-3:i+3])
#     counter += 1


# def sliding_window(df_data, window_size, response_variable): 
#     """Iterates over input data and creates batches 
#     with variable lag based on response_variable

#     Args:
#         df_data (dataframe): input dataframe
#         window_size (int): amount of lags
#         response_variable (str): column we want to predict

#     Returns:
#         (X, Y) (tuple): tuple of arrays transformed using sliding window
#     """
#     start = 0
#     X = []
#     Y = []

#     for i in range(len(df_data)):
#         if i >= window_size:
#             window = df_data.iloc[start:i+1]
#             temp = window.iloc[:window_size].to_numpy()
#             X.append([item for sublist in temp for item in sublist])
#             Y.append(window.iloc[-1][response_variable])
#             start +=1
            
#     return np.array(X), np.array(Y)
    