In [1]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import psutil
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*" and later
# releases dropped the old names, so use whichever name this installation provides and
# fall back to the default style instead of failing in the very first cell.
_seaborn_style = next(
    (name for name in ("seaborn-v0_8", "seaborn") if name in plt.style.available),
    "default",
)
plt.style.use(_seaborn_style)

# Raise pandas' display limits so long/wide frames are not truncated when shown.
pd.set_option("display.max_rows", 10000)
pd.set_option("display.max_columns", 10000)

Wall time: 2 s


In [2]:
# Load the training and test event logs from the working directory.
df_train, df_test = map(pd.read_csv, ('Printer--training.csv', 'Printer--test.csv'))

# Preview only: drop() returns a new frame that is displayed as the cell output;
# df_train itself still contains both columns afterwards.
df_train.drop(columns=['case description', 'event org:resource'])

Unnamed: 0,eventID,case concept:name,case Class,event concept:name,event lifecycle:transition,event time:timestamp,Completion Time
0,,1,Print,Job,start,01-01-1970 01:00:00.000,
1,1.000000e+00,1,Print,Remote Print,complete,01-01-1970 01:15:00.000,
2,2.000000e+00,1,Print,Read Print Options,complete,01-01-1970 01:26:00.000,
3,3.000000e+00,1,Print,Rasterization,start,01-01-1970 01:38:00.000,
4,4.000000e+00,1,Print,Interpretation,start,01-01-1970 01:51:00.000,
...,...,...,...,...,...,...,...
36623,4.294967e+10,18,Print,Pressure Roller Spin Stop,complete,25-12-1970 15:51:00.000,
36624,4.294967e+10,18,Print,Fusing,complete,25-12-1970 16:05:00.000,
36625,4.294967e+10,18,Print,Wipe Toner on Drum,complete,25-12-1970 16:10:00.000,
36626,4.294967e+10,18,Print,Erase Charge on Drum,complete,25-12-1970 16:22:00.000,


In [3]:
def EventTime(data):
    """
    Fill in "Completion Time" (in seconds) for events that have a "start" row.

    The rows of `data` are scanned in order. A row whose lifecycle transition is
    "start" (case-insensitive) marks its event name as active. The next row with
    the same event name closes it: that row's "Completion Time" is set to the
    time elapsed since the start row, and the event is no longer active.
    Any other row is left untouched.

    NOTE(review): the closing row is matched on event name only; its transition
    is not checked, so a second "start" of an already active event is treated as
    its completion. Confirm this is intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns "event concept:name",
        "event lifecycle:transition" and "event time:timestamp" (day-first
        date strings). "Completion Time" is created (as NaN) when missing.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in; it is modified in place.
    """
    if "Completion Time" not in data.columns:
        data["Completion Time"] = np.nan

    # Work positionally on plain columns: this avoids integer lookups on a
    # label-indexed row Series (deprecated in pandas 2.x), avoids building a full
    # row per access, and keeps working when index labels are duplicated.
    event_names = data["event concept:name"].tolist()
    transitions = data["event lifecycle:transition"].tolist()
    timestamps = pd.to_datetime(data["event time:timestamp"], dayfirst=True)
    completion_col = data.columns.get_loc("Completion Time")

    active = {}  # Active events: key=event name, value=row position of its start
    for pos, (event, transition) in enumerate(zip(event_names, transitions)):
        if event in active:
            # pop() also removes the completed event from the active set.
            elapsed = timestamps.iloc[pos] - timestamps.iloc[active.pop(event)]
            data.iloc[pos, completion_col] = elapsed.total_seconds()
        elif str(transition).lower() == "start":
            active[event] = pos

    return data


In [4]:
def AverageTime(df_train):
    """
    Add a "Completion Time" column (seconds) to df_train and return its mean per event name.

    1. Gives every "complete" row a first estimate: the time elapsed since the previous
       row (i-1) of the frame. NOTE(review): the previous row can belong to another case
       or to an event running in parallel, so how correct this estimate is still has to
       be determined.
    2. Sets (case, event id) as the index and runs EventTime case by case, which
       overwrites the estimate for events that have both a start and a completion row.
    3. Returns the average completion time per event.

    Side effects: df_train is modified in place. It gains the "Completion Time" column
    and its index is replaced by ("case concept:name", event id), so those two columns
    are no longer regular columns until the caller resets the index.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Event log with the columns "case concept:name", "eventID " (or "eventID"),
        "event concept:name", "event lifecycle:transition" and "event time:timestamp"
        (day-first date strings).

    Returns
    -------
    pandas.DataFrame
        Indexed by "event concept:name" with one column, "Completion Time", holding the
        mean duration in seconds.
    """
    # 1
    # The column is built directly from NaN-able values. Seeding it with 0 and replacing
    # every 0 in the frame with NaN would also blank legitimate zeros in other columns
    # (for example an event id of 0).
    timestamps = pd.to_datetime(df_train["event time:timestamp"], dayfirst=True)
    since_previous_row = timestamps.diff().dt.total_seconds()  # NaN for the first row
    is_complete = df_train["event lifecycle:transition"].str.lower() == "complete"
    # where() keeps the estimate on "complete" rows (including the last row) and leaves
    # NaN everywhere else.
    df_train["Completion Time"] = since_previous_row.where(is_complete)

    # 2
    # The raw export is expected to name the column "eventID " (trailing space); fall back
    # to the trimmed spelling when the header was cleaned. TODO confirm against the CSV.
    event_id_col = "eventID " if "eventID " in df_train.columns else "eventID"
    df_train.set_index(["case concept:name", event_id_col], inplace=True)

    # Adds event completion time, case by case. Results are written back by row position,
    # which does not depend on index alignment or on the index being unique.
    completion_col = df_train.columns.get_loc("Completion Time")
    for positions in df_train.groupby(level=0).indices.values():
        case_events = EventTime(df_train.iloc[positions])
        df_train.iloc[positions, completion_col] = case_events["Completion Time"].to_numpy()

    # 3
    Average_time = df_train.groupby(['event concept:name'])[['Completion Time']].mean()
    return Average_time

In [None]:
def AddAverageTime(data):
    """
    Add the per-event average completion time to every row of `data`.

    Runs AverageTime on `data` and stores, for each row, the average completion time of
    that row's event name in the column 'Average time till next event'. Also samples
    system RAM and CPU usage via psutil.

    Side effects: `data` is modified in place. It receives the columns created by
    AverageTime plus 'Average time till next event', and its index is reset so the
    columns AverageTime moved into the index become regular columns again.

    Parameters
    ----------
    data : pandas.DataFrame
        Event log in the format expected by AverageTime.

    Returns
    -------
    tuple
        (data, RAM, CPU) where RAM and CPU are (label, percentage) tuples.
    """
    # psutil reports CPU usage relative to the previous call, and the first call returns a
    # meaningless 0.0. Prime the counter here so the reading at the end covers the work
    # done inside this function.
    psutil.cpu_percent(interval=None)

    # Create the column up front so it keeps its position ahead of "Completion Time".
    data['Average time till next event'] = np.nan
    avr_time = AverageTime(data)
    data.reset_index(inplace=True)

    # Look the averages up from `data` itself (not from a notebook-level frame) and assign
    # the whole column at once rather than through chained row-by-row assignment.
    average_by_event = avr_time['Completion Time']
    data['Average time till next event'] = data['event concept:name'].map(average_by_event)

    RAM = ('RAM memory % used:', psutil.virtual_memory().percent)
    CPU = ('CPU % used:', psutil.cpu_percent(interval=None))
    return data, RAM, CPU

In [None]:
%%time
# Runs the pipeline on the training log. AddAverageTime modifies df_train in place and
# returns (data, RAM, CPU); as the last expression of the cell, that tuple is displayed.
AddAverageTime(df_train)

In [None]:
# Time accuracy: a row counts as correct only when its measured completion time equals
# the per-event average exactly. Rows without a measured completion time are ignored.
# (Original author's note: since we take the average, the accuracy is 0.)
measured_seconds = df_train['Completion Time']
average_seconds = df_train['Average time till next event']
has_measurement = measured_seconds.notna()

total_count = int(has_measurement.sum())
correct = int((has_measurement & (measured_seconds == average_seconds)).sum())
wrong = total_count - correct