## Import libraries

In [1]:
from func import Initial_Log, Optimal_Process
import pandas as pd
import warnings
import os
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings; consider
# narrowing it to a specific category or module.
warnings.filterwarnings('ignore')

## Load data

In [2]:
# Load the raw event log; the file uses ';' as its field separator.
log = pd.read_csv(
    'example_log.csv',
    sep=';',
)

In [3]:
# Preview the first 10 rows of the raw log.
log.head(10)

Unnamed: 0,ids,events,time
0,travel permit 76455,Start trip,2016-10-05 00:00:00
1,travel permit 76455,End trip,2016-10-05 00:00:00
2,travel permit 76455,Permit SUBMITTED by EMPLOYEE,2017-04-06 13:32:10
3,travel permit 76455,Permit FINAL_APPROVED by SUPERVISOR,2017-04-06 13:32:28
4,travel permit 76455,Declaration SUBMITTED by EMPLOYEE,2017-04-07 13:38:14
5,travel permit 76455,Declaration FINAL_APPROVED by SUPERVISOR,2017-04-07 13:40:17
6,travel permit 76455,Request Payment,2017-04-11 15:03:43
7,travel permit 76455,Payment Handled,2017-04-13 17:30:53
8,travel permit 76665,Start trip,2016-11-21 00:00:00
9,travel permit 76665,End trip,2016-12-22 00:00:00


## Prepare data

In [4]:
%%time
# Build the project's Initial_Log wrapper around the raw log. The three strings are
# column names of `log` (ids / events / time); presumably they identify the case id,
# event name and timestamp columns — confirm against func.Initial_Log.
# timeformat is a strptime-style pattern that matches the values in the `time` column.
ilog = Initial_Log(log, "ids", "events", "time", timeformat="%Y-%m-%d %H:%M:%S")
# Prepared frame; the preview in the next cell shows extra next_event / next_time columns.
prep_log = ilog.get_frame()

Wall time: 1.76 s


In [5]:
# Preview the first rows of the prepared log.
prep_log.head()

Unnamed: 0,index,ids,events,time,next_event,next_time
0,0,travel permit 76455,Start trip,2016-10-05 00:00:00,End trip,2016-10-05 00:00:00
1,1,travel permit 76455,End trip,2016-10-05 00:00:00,Permit SUBMITTED by EMPLOYEE,2017-04-06 13:32:10
2,2,travel permit 76455,Permit SUBMITTED by EMPLOYEE,2017-04-06 13:32:10,Permit FINAL_APPROVED by SUPERVISOR,2017-04-06 13:32:28
3,3,travel permit 76455,Permit FINAL_APPROVED by SUPERVISOR,2017-04-06 13:32:28,Declaration SUBMITTED by EMPLOYEE,2017-04-07 13:38:14
4,4,travel permit 76455,Declaration SUBMITTED by EMPLOYEE,2017-04-07 13:38:14,Declaration FINAL_APPROVED by SUPERVISOR,2017-04-07 13:40:17


## Get the top 10 chains of event sequences

In [6]:
# Ask the log wrapper for its top event chains.
# NOTE(review): 10 is presumably the number of chains to return — confirm in func.Initial_Log.
df_topchain = ilog.get_top_chain_sequences(10)

In [7]:
# Preview the top-chain table (columns: chain number, chain frequency, step number, event name).
df_topchain.head(12)

Unnamed: 0,ChainNumber,ChainFrequency,StepNumberOfChain,EventName
0,1,956,1,Permit SUBMITTED by EMPLOYEE
1,1,956,2,Permit APPROVED by ADMINISTRATION
2,1,956,3,Permit FINAL_APPROVED by SUPERVISOR
3,1,956,4,Start trip
4,1,956,5,End trip
5,1,956,6,Declaration SUBMITTED by EMPLOYEE
6,1,956,7,Declaration APPROVED by ADMINISTRATION
7,1,956,8,Declaration FINAL_APPROVED by SUPERVISOR
8,1,956,9,Request Payment
9,1,956,10,Payment Handled


## Find optimal process

In [8]:
# Create the optimal-process helper from the Initial_Log wrapper built earlier.
op = Optimal_Process(ilog)

#### Set the base sequence used to search for the optimal process

In [9]:
# Reference sequence of events passed to the optimal-sequence search.
for_compare = [
    'Permit SUBMITTED by EMPLOYEE',
    'Permit APPROVED by ADMINISTRATION',
    'Start trip',
    'End trip',
    'Declaration SUBMITTED by EMPLOYEE',
    'Declaration APPROVED by ADMINISTRATION',
    'Request Payment',
    'Payment Handled',
]

In [10]:
%%time
b, s = op.get_faster_similar_sequence(for_compare, best_seq_ind = 0)

Wall time: 997 ms


In [11]:
# The best sequence is a single string with events joined by "-|>";
# split it back into an ordered list of event names.
split_events = b.split("-|>")
split_events

['Permit SUBMITTED by EMPLOYEE',
 'Permit APPROVED by ADMINISTRATION',
 'Start trip',
 'End trip',
 'Permit FINAL_APPROVED by SUPERVISOR',
 'Declaration SUBMITTED by EMPLOYEE',
 'Declaration APPROVED by ADMINISTRATION',
 'Declaration FINAL_APPROVED by SUPERVISOR',
 'Request Payment',
 'Payment Handled']

### Convert the best sequence to a DataFrame

In [12]:
# Pair every event with its successor and look up that transition's value in `s`;
# each row is [start event, end event, value for the pair].
data = [
    [current_event, following_event, s[(current_event, following_event)]]
    for current_event, following_event in zip(split_events, split_events[1:])
]
df_res = pd.DataFrame(data, columns=["Start_event", "End_event", "Median timedelta"])

In [13]:
# Display the per-transition table for the best sequence.
df_res

Unnamed: 0,Start_event,End_event,Median timedelta
0,Permit SUBMITTED by EMPLOYEE,Permit APPROVED by ADMINISTRATION,"-1 day, 23:59:59"
1,Permit APPROVED by ADMINISTRATION,Start trip,"-1 day, 4:38:40"
2,Start trip,End trip,"-4 days, 0:00:00"
3,End trip,Permit FINAL_APPROVED by SUPERVISOR,"-1 day, 22:23:44"
4,Permit FINAL_APPROVED by SUPERVISOR,Declaration SUBMITTED by EMPLOYEE,17:16:10.500000
5,Declaration SUBMITTED by EMPLOYEE,Declaration APPROVED by ADMINISTRATION,"-1 day, 23:57:52.500000"
6,Declaration APPROVED by ADMINISTRATION,Declaration FINAL_APPROVED by SUPERVISOR,"1 day, 18:39:25.500000"
7,Declaration FINAL_APPROVED by SUPERVISOR,Request Payment,"-1 day, 20:04:32"
8,Request Payment,Payment Handled,"-1 day, 19:04:50.500000"


Median timedelta: the difference in median event execution time between the whole log and the optimal sequence.

## Get the ids that have the best sequence

In [14]:
# Look up the ids associated with the best sequence string `b`
# (the output is an array of id strings).
# NOTE(review): presumably an exact match on the full sequence — confirm in func.Optimal_Process.
op.get_id_by_sequence(b)

array(['travel permit 10066', 'travel permit 10741',
       'travel permit 13644', 'travel permit 13759',
       'travel permit 15784', 'travel permit 19950',
       'travel permit 23492', 'travel permit 23726',
       'travel permit 24909', 'travel permit 29224',
       'travel permit 29453', 'travel permit 30171',
       'travel permit 31729', 'travel permit 34005',
       'travel permit 34706', 'travel permit 35196',
       'travel permit 35430', 'travel permit 35537',
       'travel permit 35581', 'travel permit 36093',
       'travel permit 38284', 'travel permit 3830', 'travel permit 40778',
       'travel permit 43176', 'travel permit 45381',
       'travel permit 47865', 'travel permit 48225',
       'travel permit 50376', 'travel permit 51646', 'travel permit 5172',
       'travel permit 52957', 'travel permit 55048',
       'travel permit 55722', 'travel permit 56883',
       'travel permit 56893', 'travel permit 62655',
       'travel permit 65783', 'travel permit 71537',
  