In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import random
from sklearn import tree
from sklearn.cluster import KMeans

In [2]:
df_train = pd.read_csv('BPI_Challenge_2012-training.csv')
df_test = pd.read_csv('BPI_Challenge_2012-test.csv')


df_data = pd.concat([df_train, df_test])
df_data['event time:timestamp'] = pd.to_datetime(df_data['event time:timestamp'], dayfirst=True)
# df_data['event time:timestamp'] = df_data['event time:timestamp'].to_timestamp()
df_data.sort_values(by=['event time:timestamp'])
df_data.reset_index(inplace=True, drop=True)
# df_data.drop(['case description', 'event org:resource'], axis=1, inplace=True)

# remove whitespace at beginning and end of column name
df_data.columns = df_data.columns.str.strip()

In [3]:
# Get unique activities and encode them
Y = df_data['event concept:name'].unique()
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(Y)

# Randomly selecting a few to test
randomYs = random.choices(Y, k=5)
print(randomYs)
label_encoder.transform(randomYs)

['O_ACCEPTED', 'W_Nabellen incomplete dossiers', 'W_Afhandelen leads', 'O_DECLINED', 'A_SUBMITTED']


array([10, 20, 17, 13,  9])

In [4]:
# Get dummy variables and encode lifecycle
df_dummies_lifecycle = pd.get_dummies(df_data['event lifecycle:transition'], prefix='Lifecycle', drop_first=True)
df_encoded = df_data.loc[:, df_data.columns != 'event lifecycle:transition'].copy().join(df_dummies_lifecycle)

# Encode event concept name
df_encoded['event concept:name'] = label_encoder.transform(df_data['event concept:name'])
df_encoded

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event time:timestamp,Lifecycle_SCHEDULE,Lifecycle_START
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,9,2011-10-01 00:38:44.546,0,0
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,6,2011-10-01 00:38:44.880,0,0
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,7,2011-10-01 00:39:37.906,0,0
3,3,173688,2011-10-01T00:38:44.546+02:00,20000,19,2011-10-01 00:39:38.875,1,0
4,4294967296,173691,2011-10-01T08:08:58.256+02:00,5000,9,2011-10-01 08:08:58.256,0,0
...,...,...,...,...,...,...,...,...
262195,54666343743523,213276,2012-02-27T14:12:41.868+01:00,15000,20,2012-03-14 15:59:28.309,0,1
262196,54666343743524,213276,2012-02-27T14:12:41.868+01:00,15000,20,2012-03-14 16:00:09.680,0,0
262197,49495203119136,209595,2012-02-15T10:10:36.503+01:00,13000,21,2012-03-14 16:02:03.883,0,1
262198,52342766436386,211624,2012-02-21T23:38:40.044+01:00,35000,20,2012-03-14 16:04:46.192,0,1


In [5]:
def EventTime(data):
    memory = {} # Stores active events, key=event, value=index of start
    for i, j in data[["event concept:name"]].iterrows():
        
        if j[0] not in memory.keys() and data.loc[i]["Lifecycle_SCHEDULE"]==0 and  data.loc[i]["Lifecycle_START"]==1:
            memory[j[0]] = i
        
        elif j[0] in memory.keys():
            time = pd.to_datetime(data.loc[i]["event time:timestamp"], dayfirst=True) - pd.to_datetime(data.loc[memory[j[0]]]["event time:timestamp"], dayfirst=True)           # print(time)
            data.loc[i,"Completion Time"] = time.total_seconds()
            del memory[j[0]] # Removes the completed event from active event dictionary(memory)
    
    return data # Returns a new dataframe with event completion time. Can be improved.



In [6]:
def AverageTime(df_train):
    """
    1. Computes the event completion time for events that don't have a "start" by taking the 
       difference between said event's completion time and the previous (i-1) event time. Still have 
       to determine how correct this is, as several events can run in parallel.
    2. Sets new index on df and computes the remaining event times (for events that have a start and completion time)
       by running the EventTime function case by case. Can be improved.
    3. Returns average completion time per event.
    
    """
    df_train["Completion Time"] = 0
    df_train["Completion Time"].replace(0,np.NaN, inplace=True)
    
    for i in range(1, df_train['Completion Time'].shape[0]-1): # 1
        if pd.isnull(df_train['Completion Time'][i]):
            if df_train['Lifecycle_SCHEDULE'][i] == 0 and df_train['Lifecycle_START'][i] == 0:
                time = pd.to_datetime(df_train["event time:timestamp"][i], dayfirst=True) - pd.to_datetime(df_train["event time:timestamp"][i-1], dayfirst=True)          
                df_train['Completion Time'][i] = time.total_seconds()
    df_train.set_index(["case concept:name", "eventID"], inplace=True) # 2
    
    for idx, new_df in df_train.groupby(level=0): # Adds event completion time, case by case.
        df_train.loc[idx] = EventTime(new_df)
        
    Average_time = df_train.groupby(['event concept:name'])[['Completion Time']].mean() # 3
    return Average_time



In [7]:
%time
AverageTime(df_encoded)

Wall time: 0 ns


Unnamed: 0_level_0,Completion Time
event concept:name,Unnamed: 1_level_1
0,54.821758
1,8.858934
2,22.117776
3,23.838522
4,37.052715
5,21.63863
6,0.572195
7,32.928399
8,9.359576
9,213.027539


In [22]:
import pickle

In [23]:
df_encoded.to_pickle('df_encoded.pickle')

In [15]:
df_encoded2 = df_encoded.copy()
df_encoded2.dropna(inplace=True)

In [16]:
X = df_encoded2[['event concept:name', 'Lifecycle_SCHEDULE', 'Lifecycle_START']]
y = df_encoded2['Completion Time']

In [17]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [18]:
from sklearn import linear_model
regr = linear_model.LinearRegression()
regr.fit(x_train, y_train)

LinearRegression()

In [19]:
y_pred = regr.predict(x_test)

In [21]:
from sklearn import metrics
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 535.7083709999283
Mean Squared Error: 27311898.03192462
Root Mean Squared Error: 5226.078647697967
