In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn')
import seaborn as sns
import os


In [5]:
# change to own path!
path = r"/home/dazai/Documents/Process mining/Data/printer data/"
df_train = pd.read_csv(f'{path}/event-log-training.csv')
df_test = pd.read_csv(f'{path}/event-log-test.csv')


# cleaning
df_data = pd.concat([df_train, df_test])
df_data['event time:timestamp'] = pd.to_datetime(df_data['event time:timestamp'], dayfirst=True)
df_data = df_data.sort_values(by=['event time:timestamp'])
df_data.reset_index(inplace=True, drop=True)
df_data.drop(['case description', 'event org:resource'], axis=1, inplace=True)

# remove whitespace at beginning and end of column name
df_data.columns = df_data.columns.str.strip()


## splitting data

In [None]:
def split(df_data, percent):
    """input percentage of testing data to split"""
    index = round(df_data.shape[0]*(percent/100))
    
    testset = df_data.iloc[index:]
    testset.reset_index(drop=True, inplace=True)
    return df_data.iloc[:index], testset

df_train = split(df_data, 80)[0]

df_test = split(df_data, 80)[1]

Note on advanced splitting:
we have defined advanced splitting as splitting the data by a certain percentage split (80/20 traing/test) where all traces starting in the training set will not be included in the test set. However in this case all traces start in the first month of the dataset. We will use the simple split accordingly.

# Baseline model next event

In [None]:
# assign column names to variables for ease

cEvent = 'event concept:name'
cNew = 'next event'
cPred = 'predicted next event'

df_data[cNew] = df_data[cEvent].shift(-1)
df_data[cPred] = np.nan
df_data.head()

Unnamed: 0,eventID,case concept:name,case Class,event concept:name,event lifecycle:transition,event time:timestamp,next event,predicted next event
0,0,1,Print,Job,start,1970-01-01 01:00:00,Remote Print,
1,1,1,Print,Remote Print,complete,1970-01-01 01:15:00,Read Print Options,
2,2,1,Print,Read Print Options,complete,1970-01-01 01:26:00,Rasterization,
3,3,1,Print,Rasterization,start,1970-01-01 01:38:00,Interpretation,
4,4,1,Print,Interpretation,start,1970-01-01 01:51:00,Unformatted Text,


In [None]:
lstEvents = df_data[cEvent].unique().tolist()

# helper function
def simplePred(df,lst, cEvnt, cNw, cPrd):
    '''loops through the list of events and adds them to the df'''
    i = 0
    while i < len(lst):
        df.loc[df[cEvnt] == lst[i], cPrd] = df[df[cEvnt] == lst[i]][cNw].mode()[0]
        i += 1
    return df

def simpleAccuracy(df, cReal, cPrd):
    '''computes a simple accuracy score'''
    return len(df[df[cReal] == df[cPred]]) / len(df)

In [None]:
simplePred(df_data, lstEvents, cEvent, cNew, cPred)

Unnamed: 0,eventID,case concept:name,case Class,event concept:name,event lifecycle:transition,event time:timestamp,next event,predicted next event
0,0,1,Print,Job,start,1970-01-01 01:00:00,Remote Print,Remote Print
1,1,1,Print,Remote Print,complete,1970-01-01 01:15:00,Read Print Options,Read Print Options
2,2,1,Print,Read Print Options,complete,1970-01-01 01:26:00,Rasterization,Rasterization
3,3,1,Print,Rasterization,start,1970-01-01 01:38:00,Interpretation,Interpretation
4,4,1,Print,Interpretation,start,1970-01-01 01:51:00,Unformatted Text,Screening
...,...,...,...,...,...,...,...,...
40990,412316860519,96,Print,Heated Roller Spin Stop,complete,1970-12-26 15:42:00,Fusing,Fusing
40991,412316860520,96,Print,Fusing,complete,1970-12-26 16:01:00,Wipe Toner on Drum,Wipe Toner on Drum
40992,412316860521,96,Print,Wipe Toner on Drum,complete,1970-12-26 16:26:00,Erase Charge on Drum,Erase Charge on Drum
40993,412316860522,96,Print,Erase Charge on Drum,complete,1970-12-26 16:50:00,Job,Writing


In [None]:
simpleAccuracy(df_data, cNew, cPred)

0.5876814245639712

In [None]:
confusion_matrix = pd.crosstab(df_data['next event'], df_data['predicted next event'], rownames=['Actual'], colnames=['Predicted'])
confusion_matrix.style.background_gradient(cmap ='viridis').set_properties(**{'font-size': '20px'})

Predicted,A/D Conversion,Accumulate Images,Apply Pressure,Calc Quantization Error,Calc Total Neighbor Quant Error,Collect Copy/Scan Options,Collect Image,Compression,Current Page Image,Drum Spin Start,Drum Spin Stop,Emit Laser,Erase Charge on Drum,Error Diffusion Method,Filtered Image,Focus Light Beam,Fusing,Heated Roller Spin Start,Illuminate Document,Interpolation,Interpretation,Job,Move Scan Head,Neighbor Quant Error Packingl,Paper Roller Spin Start,Paper Roller Spin Stop,Photo Quality Reproduction,Photolens Travel to Drum,Place Doc,Pressure Roller Spin Start,Pressure Roller Spin Stop,Rasterization,Read Print Options,Remote Print,Rendering,Reverse Charges,Rotate,Scanner Compensation,Screening,Send FTP,Store Image,Store Quantizing Pixel,Subtract,Table Based Multilevel Quantizer,Transfer Toner (drum to paper),Wipe Toner on Drum,Writing,Y-Zoom,Zooming
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1
A/D Conversion,479,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
AM Screening,0,0,0,0,0,0,0,0,115,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,81,0,0,0,0,0,0,0,0,0,0
Accumulate Images,0,304,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,123,0,0,0,0,0,0,0,0,0,0
Apply Heat,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,174,395,0,0,0,0,0,0,0,0,0,0,0,323,405,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Apply Negative Charge on Drum,0,0,0,0,0,0,0,0,0,194,0,0,0,0,0,0,0,0,0,0,0,0,0,0,265,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Apply Pressure,0,0,435,0,0,0,0,0,0,0,0,0,0,0,0,0,200,299,0,0,0,0,0,0,0,0,0,0,0,362,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Calc Quantization Error,0,0,0,101,0,0,0,0,0,0,0,0,0,93,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Calc Total Neighbor Quant Error,0,0,0,0,195,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Coat Light Toner on Drum,0,0,0,0,0,0,0,0,0,0,0,0,96,0,0,0,0,0,0,0,0,0,0,0,441,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
Coat Toner on Drum,0,0,0,0,0,0,0,0,0,0,0,0,333,0,0,0,0,0,0,0,0,0,0,0,1152,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
