In [1]:
import pandas as pd
import networkx as nx
import numpy as np
from tqdm import tqdm

from april import Dataset
from april.processmining import ProcessMap
from april.fs import get_event_log_files
from april.fs import get_process_model_files

# Event Log Information

A summary of the event logs used in the evaluation. Only logs whose `p` value equals 0.3 are loaded below.

In [2]:
# Collect the names of the event logs whose `p` value is exactly 0.3.
logs = sorted(e.name for e in get_event_log_files() if e.p == 0.3)

# Column order of the summary table; each row built below must follow it.
columns = [
    'name', 'base_name',
    'num_cases', 'num_events', 'num_activities',
    'num_attributes', 'attribute_keys', 'attribute_dims',
    'min_attribute_dim', 'max_attribute_dim',
    'min_case_len', 'max_case_len', 'mean_case_len',
]

# One row of statistics per event log.
df = []
for log in tqdm(logs):
    d = Dataset(log)

    # Entry 0 of `attribute_dims` is reported as `num_activities`; the
    # remaining entries are reported as the event attributes.
    extra_attribute_dims = d.attribute_dims[1:].astype(int)
    if extra_attribute_dims.size:
        dim_min = extra_attribute_dims.min()
        dim_max = extra_attribute_dims.max()
    else:
        # No attributes besides the first entry: min/max are undefined.
        dim_min = dim_max = None

    case_lens = d.case_lens
    df.append([
        log,
        log.partition('-')[0],            # text before the first '-' in the log name
        d.num_cases,
        d.num_events,
        d.attribute_dims[0].astype(int),  # num_activities
        d.num_attributes - 1,             # first entry is reported as num_activities instead
        d.attribute_keys[1:],
        extra_attribute_dims,
        dim_min,
        dim_max,
        case_lens.min(),
        case_lens.max(),
        case_lens.mean().round(2),
    ])

event_logs = pd.DataFrame(df, columns=columns)

100%|██████████| 1/1 [00:00<?, ?it/s]


## Basis for Table 1 in the Paper

In [3]:
# Display the full per-log statistics table (one row per loaded event log).
event_logs

Unnamed: 0,name,base_name,num_cases,num_events,num_activities,num_attributes,attribute_keys,attribute_dims,min_attribute_dim,max_attribute_dim,min_case_len,max_case_len,mean_case_len
0,paper-0.3-1,paper,5000,66814,27,1,[user],[13],13,13,9,17,13.36


In [4]:
# Aggregate the headline statistics per base name: how many logs share the
# base name, plus the smallest and largest value of each statistic.
summary_columns = ['num_activities', 'num_cases', 'num_events', 'min_attribute_dim', 'max_attribute_dim']
(
    event_logs[['base_name'] + summary_columns]
    .groupby('base_name')
    .agg(['count', 'min', 'max'])
)

Unnamed: 0_level_0,num_activities,num_activities,num_activities,num_cases,num_cases,num_cases,num_events,num_events,num_events,min_attribute_dim,min_attribute_dim,min_attribute_dim,max_attribute_dim,max_attribute_dim,max_attribute_dim
Unnamed: 0_level_1,count,min,max,count,min,max,count,min,max,count,min,max,count,min,max
base_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2
paper,1,27,27,1,5000,5000,1,66814,66814,1,13,13,1,13,13


# Process Model Information

In [5]:
# Build one row of structural statistics per process model file.
maps = sorted(get_process_model_files())
stat_columns = ['nodes', 'edges', 'num_variants', 'max_case_len', 'density', 'in_deg', 'out_deg']

df = []
for process_map in tqdm(maps):
    model = ProcessMap.from_plg(process_map)
    graph = model.graph
    variants = model.variants

    # The degree views yield (node, degree) pairs; average the degree part.
    in_degree = np.mean([degree for _, degree in graph.in_degree()])
    out_degree = np.mean([degree for _, degree in graph.out_degree()])

    df.append([
        graph.number_of_nodes(),
        graph.number_of_edges(),
        len(variants.cases),
        variants.max_case_len,
        nx.density(graph),
        in_degree,
        out_degree,
    ])

# Rows are indexed by the sorted entries of `maps`.
process_models = pd.DataFrame(df, index=maps, columns=stat_columns)

100%|██████████| 8/8 [00:00<00:00, 48.96it/s]


In [7]:
# Show the models in a fixed presentation order instead of the sorted order
# of `maps`. An earlier version of this list also ended with 'testing';
# that entry is not selected here.
model_order = ['paper', 'p2p', 'small', 'medium', 'large', 'huge', 'gigantic', 'wide']
process_models.loc[model_order].round(2)


Unnamed: 0,nodes,edges,num_variants,max_case_len,density,in_deg,out_deg
paper,16,18,8,12,0.08,1.12,1.12
p2p,15,18,8,11,0.09,1.2,1.2
small,22,26,6,10,0.06,1.18,1.18
medium,34,48,25,8,0.04,1.41,1.41
large,44,56,28,12,0.03,1.27,1.27
huge,56,75,39,11,0.02,1.34,1.34
gigantic,80,119,71,11,0.02,1.49,1.49
wide,36,53,19,7,0.04,1.47,1.47
