In [32]:
import sys
sys.executable

'/home/proxpxd/anaconda3/envs/common311/bin/python'

In [33]:
import pandas as pd
from datetime import datetime
import json
from functools import reduce
import operator as op
import networkx as nx
from more_itertools import split_when, split_at, unique_everseen, flatten, repeatfunc, side_effect
from itertools import chain, islice
from statistics import mean
import numpy as np

In [34]:
df = pd.read_csv('repairExample.csv')

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7734 entries, 0 to 7733
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Case ID               7734 non-null   int64  
 1   Activity              7734 non-null   object 
 2   Resource              7734 non-null   object 
 3   Start Timestamp       7734 non-null   object 
 4   Complete Timestamp    7734 non-null   object 
 5   Variant               7734 non-null   object 
 6   Variant index         7734 non-null   int64  
 7   (case) description    7734 non-null   object 
 8   defectFixed           2508 non-null   object 
 9   defectType            1104 non-null   float64
 10  lifecycle:transition  7734 non-null   object 
 11  numberRepairs         2508 non-null   float64
 12  phoneType             1104 non-null   object 
dtypes: float64(2), int64(2), object(9)
memory usage: 785.6+ KB


In [36]:
list(df.groupby('Case ID'))

[(1,
     Case ID          Activity  Resource          Start Timestamp  \
  0        1          Register    System  1970/01/02 13:23:00.000   
  1        1    Analyze Defect   Tester3  1970/01/02 13:23:00.000   
  2        1  Repair (Complex)  SolverC1  1970/01/02 13:31:00.000   
  3        1       Test Repair   Tester3  1970/01/02 13:49:00.000   
  4        1       Inform User    System  1970/01/02 14:10:00.000   
  5        1    Archive Repair    System  1970/01/02 14:10:00.000   
  
          Complete Timestamp    Variant  Variant index  \
  0  1970/01/02 13:23:00.000  Variant 2              2   
  1  1970/01/02 13:30:00.000  Variant 2              2   
  2  1970/01/02 13:49:00.000  Variant 2              2   
  3  1970/01/02 13:55:00.000  Variant 2              2   
  4  1970/01/02 14:10:00.000  Variant 2              2   
  5  1970/01/02 14:10:00.000  Variant 2              2   
  
             (case) description defectFixed  defectType lifecycle:transition  \
  0  Simulated proce

## Useful Functions

**Compose** - helper function that is mathematical function composition

**Merge** - merge function used to join the results of the *EventIdentifiers*

In [37]:
def compose(*functions):
    def inner(arg):
        for f in functions:
            arg = f(arg)
        return arg
    return inner

def merge(*structs):
    structs = list(filter(bool, structs))
    match structs[0]:
        case list():
            return reduce(op.add, structs)  # ? merge
        case dict():
            result = {}
            keys = set((key for struct in structs for key in struct.keys()))
            for key in keys:
                elems = (struct.get(key) for struct in structs if struct.get(key) is not None)
                result[key] = merge(*tuple(elems))
            return result
        case _:
            elems = set(structs)
            if len(elems) > 1:
                raise ValueError
            return elems[0]
            

In [38]:
'''
logExample.csv
purchasingExample.csv
repairExample.csv
sepsisExample.csv
'''

all_filenames = [
    'logExample.csv',
    'purchasingExample.csv',
    'repairExample.csv',
    'sepsisExample.csv',
    'first_ad.csv',
    'second_ad.csv',
]

to_filter = ['repairExample.csv', 'first_ad.csv', 'second_ad.csv'] or all_filenames[:]
used_filenames = list(filter(to_filter.__contains__, all_filenames))
dataframes = {filename: pd.read_csv(filename) for filename in used_filenames}

In [39]:
cols = [list(dataframe.columns) for dataframe in dataframes.values()]
for kind, columns in zip(used_filenames, cols):
    print(f'{kind:>22}: ', columns)
    print(f'{"- ":>25}', next((col for col in columns if 'case' in col.lower()), None))
    print(f'{"- ":>25}', next((col for col in columns if 'activ' in col.lower()), None))
    print(f'{"- ":>25}', [col for col in columns if 'timestamp' in col.lower() or 'date' in col.lower()])

     repairExample.csv:  ['Case ID', 'Activity', 'Resource', 'Start Timestamp', 'Complete Timestamp', 'Variant', 'Variant index', '(case) description', 'defectFixed', 'defectType', 'lifecycle:transition', 'numberRepairs', 'phoneType']
                       -  Case ID
                       -  Activity
                       -  ['Start Timestamp', 'Complete Timestamp']
          first_ad.csv:  ['Case ID', 'Activity', 'Start Timestamp', 'End Timestamp']
                       -  Case ID
                       -  Activity
                       -  ['Start Timestamp', 'End Timestamp']
         second_ad.csv:  ['Case ID', 'Activity', 'Start Timestamp', 'End Timestamp']
                       -  Case ID
                       -  Activity
                       -  ['Start Timestamp', 'End Timestamp']


## Processing

In [40]:
def map_names(dataframe):
    mapper = {
        'Complete Timestamp': 'End Timestamp',
        'Start Date': 'Start Timestamp',
        'End Date': 'End Timestamp',
    }
    renamed = dataframe.rename(columns=mapper)
    return renamed

def add_start_timestamp_if_needed(dataframe):
    dataframe = dataframe.copy()
    if 'Start Timestamp' not in dataframe:
        dataframe['Start Timestamp'] = dataframe['End Timestamp']
    return dataframe

def map_timestamps(dataframe):
    dataframe['Start Timestamp'] = pd.to_datetime(dataframe['Start Timestamp'])
    dataframe['End Timestamp'] = pd.to_datetime(dataframe['End Timestamp'])
    return dataframe

def add_duration(dataframe):
    dataframe['Duration'] = dataframe['Start Timestamp'] - dataframe['End Timestamp']
    return dataframe

normalise = compose(
    map_names,
    add_start_timestamp_if_needed,
    map_timestamps,
    add_duration,
)

dataframes = dict(zip(dataframes.keys(), map(normalise, dataframes.values())))

## Pairing

### Pairing Events

With this function, we are pairing up 2 events that are sequential together for deeper analysis.
For a single case where the sequence would be A -> B -> C -> D -> E, the function would return a dataframe with following information:
| Pairs | Times | Mean | Standard Deviation|
| --- | --- | --- | --- |
| A, B | ALL POSSIBLE TIME LENGTHS OF EVENT A AS A LIST | MEAN(GIVEN LIST) | STD(GIVEN LIST) |
| B, C | ALL POSSIBLE TIME LENGTHS OF EVENT A AS A LIST | MEAN(GIVEN LIST) | STD(GIVEN LIST) |
| C, D | ALL POSSIBLE TIME LENGTHS OF EVENT A AS A LIST | MEAN(GIVEN LIST) | STD(GIVEN LIST) |
| D, E | ALL POSSIBLE TIME LENGTHS OF EVENT A AS A LIST | MEAN(GIVEN LIST) | STD(GIVEN LIST) |
| D, B | ALL POSSIBLE TIME LENGTHS OF EVENT A AS A LIST | MEAN(GIVEN LIST) | STD(GIVEN LIST) |

In [41]:
def get_seq_as_pairs(df):
    
    df.sort_values(by=['Case ID', 'Start Timestamp'], inplace=True)
    grouped = df.groupby('Case ID')
    
    result = []
    for case_id, group in grouped:
        activities = group['Activity'].tolist()
        start_times = group['Start Timestamp'].tolist()
        complete_times = group['End Timestamp'].tolist()
        case_result = [[from_activity, (te-t0).total_seconds(), to_activity] for from_activity, t0, te, to_activity in  zip(activities, start_times, complete_times, activities[1:])]
        result.append(case_result)
    
    pairs_time = {}
    for from_activity, period, to_activity in (case for row in result for case in row):
        pairs_time.setdefault((from_activity, to_activity), []).append(period)
    
    reshaped_data = {'pairs': [], 'times': []}
    for key, values in pairs_time.items():
        reshaped_data['pairs'].append(key)
        reshaped_data['times'].append(values)
    
    pairs_df = pd.DataFrame(reshaped_data)
    
    for index, row in pairs_df.iterrows():
        times_list = (row.iloc[1])
        pairs_df.at[index, 'mean'] = mean(times_list)
        pairs_df.at[index, 'std'] = np.std(times_list)
    
    return pairs_df

In [42]:
next(iter(dataframes.values()))

Unnamed: 0,Case ID,Activity,Resource,Start Timestamp,End Timestamp,Variant,Variant index,(case) description,defectFixed,defectType,lifecycle:transition,numberRepairs,phoneType,Duration
0,1,Register,System,1970-01-02 13:23:00,1970-01-02 13:23:00,Variant 2,2,Simulated process instance,,,complete,,,0 days 00:00:00
1,1,Analyze Defect,Tester3,1970-01-02 13:23:00,1970-01-02 13:30:00,Variant 2,2,Simulated process instance,,6.0,complete,,T2,-1 days +23:53:00
2,1,Repair (Complex),SolverC1,1970-01-02 13:31:00,1970-01-02 13:49:00,Variant 2,2,Simulated process instance,,,complete,,,-1 days +23:42:00
3,1,Test Repair,Tester3,1970-01-02 13:49:00,1970-01-02 13:55:00,Variant 2,2,Simulated process instance,True,,complete,0.0,,-1 days +23:54:00
4,1,Inform User,System,1970-01-02 14:10:00,1970-01-02 14:10:00,Variant 2,2,Simulated process instance,,,complete,,,0 days 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7729,999,Restart Repair,System,1970-01-22 20:38:00,1970-01-22 20:38:00,Variant 11,11,Simulated process instance,,,complete,,,0 days 00:00:00
7730,999,Repair (Simple),SolverS3,1970-01-22 20:38:00,1970-01-22 20:48:00,Variant 11,11,Simulated process instance,,,complete,,,-1 days +23:50:00
7731,999,Test Repair,Tester2,1970-01-22 20:48:00,1970-01-22 20:55:00,Variant 11,11,Simulated process instance,True,,complete,2.0,,-1 days +23:53:00
7732,999,Inform User,System,1970-01-22 20:56:00,1970-01-22 20:56:00,Variant 11,11,Simulated process instance,,,complete,,,0 days 00:00:00


# Identifying

**SingleEventTypeIdentifier** is a class from which the rest of the classes inherit to give access to function avvailable in parent (**EventTypeIdentifier**)

In [43]:
class SingleEventTypeIdentifier:
    def __init__(self, parent, name: str = None):
        self.name = name or self.__class__.__name__.replace('EventIdentifier', '')
        self.parent = parent
        
    def identify(self, dataframe: pd.DataFrame, earlier_data: dict):
        raise NotImplementedErrror

**EventTypeIdentifier** - class that contains the identifiers for specific events and contains common logic

In [44]:
class EventTypeIdentifier:
    def __init__(self):
        self._dataframe: pd.DataFrame = None
        self.identifiers: list[SingleEventTypeIdentifier] = []
        self.paths = None
        self.cycles = None
        self.cyclied_paths = None

    @property
    def dataframe(self):
        return self._dataframe

    @dataframe.setter
    def dataframe(self, to_set):
        self._dataframe = to_set
        self.analyse()

    @property
    def n_activities(self):
        return len(df['Activity'].unique())

    def analyse(self, dataframe: pd.DataFrame = None):
        dataframe = dataframe or self.dataframe
        self.paths = self.get_paths(dataframe)
        self.analyse_cycles()

    def get_paths(self, dataframe):
        grouped_df = dataframe.groupby('Case ID')['Activity'].agg(list).reset_index()
        return tuple(set(map(tuple, grouped_df['Activity'])))

    def analyse_cycles(self, paths=None):
        paths = paths or self.paths
        paths_with_cycles = list(filter(lambda pc: pc[1], ((path, self.get_cycles(path)) for path in paths)))
        # self.cycles = list(unique_everseen((cycle for path, cycles in self.paths_with_cycles for cycle in cycles)))
        self.cyclied_paths = self.get_cyclied_paths(paths_with_cycles)
        self.cycles = self.get_normalised_cycles(self.cyclied_paths)
        # print('cyclied_paths')
        # for i, cyclied_path in enumerate(self.cyclied_paths):
        #     print(i, cyclied_path)
        # print('#'*20)
        # print('cycles')
        # for i, cycle in enumerate(self.cycles):
        #     print(i, cycle)
        
        
    def get_cycles(self, path):
        G = self.create_graph(path)
        cycles = tuple(nx.simple_cycles(G))
        return cycles

    def create_graph(self, path):
        G = nx.DiGraph()
        for i in range(len(path) - 1):
            G.add_edge(path[i], path[i + 1])
        return G

    def get_cyclied_paths(self, paths_with_cycles):
        cyclied_paths = []
        for path, cycles in paths_with_cycles:
            cycle_nodes = tuple((node for cycle in cycles for node in cycle))
            partial_cyclied_paths = map(tuple, filter(bool, split_when(path, lambda n1, n2: (n1 in cycle_nodes) ^ (n2 in cycle_nodes))))
            cyclied_path = tuple(islice(chain(partial_cyclied_paths, repeatfunc(tuple)), 3))
            cyclied_paths.append(cyclied_path)
        return list(set(cyclied_paths))

    def get_normalised_cycles(self, cyclied_paths):
        cyclied_paths = cyclied_paths or self.cyclied_paths
        succesfully_ending = filter(lambda bma: bma[2], cyclied_paths)
        # p = side_effect(lambda bma: print(f'- {bma[0]}\n- {bma[1]}\n- {bma[2]}\n\n'), succesfully_ending)
        cycles = map(lambda bma: bma[1], succesfully_ending)
        normalised = unique_everseen(map(compose(unique_everseen, tuple), cycles))
        return list(normalised)

    def get_nth(self, n: int, of: str, reverse=False):
        self._dataframe.sort_values(by=of, ascending=False)
        if n < 0: 
            n = -n
            reverse = True
        groupped_of = self._dataframe.groupby('Case ID')[of]
        # print(self._dataframe.loc[groupped_of.tail(1).index]['Activity'].value_counts().to_dict())
        activity_indices = (groupped_of.nth(n) if not reverse else groupped_of.tail(n)).index
        if n == 1 and reverse:  # since tail has some problems working
            activity_indices = groupped_of.idxmax()
        activity_df = self._dataframe.loc[activity_indices]
        activities_dict = activity_df['Activity'].value_counts().to_dict()
        return activities_dict

    def filter_any_of_type(self, data: dict[str, list], event_type: str):
        return {activity: descriptions for activity, descriptions in data.items() if any(description['type'] == event_type for description in descriptions)}

    def identify(self, dataframe):
        self.dataframe = dataframe
        curr_data = {}
        for identifier in self.identifiers:
            curr_result_data = identifier.identify(dataframe, curr_data)
            curr_data = merge(curr_data, curr_result_data)
        # print(json.dumps(curr_data, indent=4))
        return curr_data



In [70]:
class StartEventIdentifier(SingleEventTypeIdentifier): 
    def identify(self, dataframe: pd.DataFrame, earlier_data ):
        partial_data = self.parent.get_nth(0, of='Start Timestamp')
        data = {activity: [{'type': self.name, 'n_occurences': n_occurences}] for activity, n_occurences in partial_data.items()}
        return data
        
class TerminationEventIdentifier(SingleEventTypeIdentifier):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.not_always_lasts = None
    
    def identify(self, dataframe: pd.DataFrame, earlier_data):
        prelim_end_events = self.parent.get_nth(-1, of='End Timestamp')
        self.init_not_always_lasts(dataframe, prelim_end_events.keys())
        data = {activity: [{'type': 'Termination', 'n_occurences': n_occurences}, {'type': self.get_subevent_name(activity)}] for activity, n_occurences in prelim_end_events.items()}
        return data

    def init_not_always_lasts(self, dataframe: pd.DataFrame, prelim_end_events: list[str]) -> list[str]:
        not_always_lasts = dataframe\
            .groupby('Case ID')['Activity']\
            .apply(lambda group: group[group.isin(prelim_end_events)])\
            .groupby('Case ID')\
            .apply(lambda group: group[:-1])\
            .to_list()
        self.not_always_lasts = list(set(not_always_lasts))

    def get_subevent_name(self, activity):
        return 'Error' if activity in self.not_always_lasts else 'End'


class IntermediateEventIdentifier(SingleEventTypeIdentifier):
     def identify(self, dataframe: pd.DataFrame, earlier_data: dict):
         partial_data = list(dataframe[
            (dataframe['Start Timestamp'] > dataframe.groupby('Case ID')['Start Timestamp'].transform('min')) &
            (dataframe['End Timestamp'] < dataframe.groupby('Case ID')['End Timestamp'].transform('max'))
         ]['Activity'].unique())
         data = {activity: [{'type': self.name}] for activity in partial_data}
         return data

class TimerLoopEventIdentifier(SingleEventTypeIdentifier):
    def identify(self, dataframe: pd.DataFrame, earlier_data: dict):
        std_threshold = 1.2
        pairs_df = get_seq_as_pairs(dataframe)
        same_activity_statistics = ((from_activity, statistics['times'], statistics['mean']) for ((from_activity, to_activity), statistics) in pairs_df.groupby('pairs') if from_activity == to_activity)
        timer_loop_activities = (activity for activity, times, mean in same_activity_statistics if mean != max(times) and np.std(times) > std_threshold)
        data = {activity:[{'type': self.name}] for activity in timer_loop_activities}                
        return data

class TimerExceptionEventIdentifier(SingleEventTypeIdentifier):
    def identify(self, dataframe: pd.DataFrame, earlier_data: dict):
        pairs_df = get_seq_as_pairs(dataframe)
        std_threshold = 1.2

        occurrences = {}
        for group in pairs_df.groupby('pairs'):
            pair = group[0]
            from_activity = pair[0]
            to_activity = pair[1]
            info = [to_activity, group[1]['mean'], group[1]['times']]
            occurrences.setdefault(from_activity, []).append(info)
        
        candidates = [from_activity for from_activity, all_info in occurrences.items() if len(all_info) > 1]        
        candidates_with_times = {from_activity: [time for info in occurrences[from_activity] for time in info[2]] for from_activity in candidates}

        exception_results = []
        for candidate, times in candidates_with_times.items():
            
            longest = float('-inf')
            
            for time_set in times:
                if longest < max(time_set):
                    longest = max(time_set)
            
            for time_set in times:
                mean_time = mean(time_set)
            
                if mean_time == longest or np.std(time_set) < std_threshold:
                    exception_results.append(candidate)
                    
        data = {activity: [{'type': self.name}] for activity in exception_results}
        return data

In [71]:
eti = EventTypeIdentifier()
eti.identifiers.extend((
    StartEventIdentifier(eti),
    TerminationEventIdentifier(eti),
    IntermediateEventIdentifier(eti),
    TimerLoopEventIdentifier(eti),
    TimerExceptionEventIdentifier(eti),
))
for name, dataframe in dataframes.items():
    print('#'*5, name, '#'*20)
    result = eti.identify(dataframe)
    for activity, activity_result in result.items():
        print(f'{" "*3}{activity}:', activity_result)

##### repairExample.csv ####################
Analyze Defect 3
Inform User 5
Repair (Complex) 2
Repair (Simple) 2
Restart Repair 3
Test Repair 3
   Archive Repair: [{'type': 'Termination', 'n_occurences': 890}, {'type': 'End'}]
   Restart Repair: [{'type': 'Intermediate'}, {'type': 'TimerException'}]
   Register: [{'type': 'Start', 'n_occurences': 1104}]
   Test Repair: [{'type': 'Termination', 'n_occurences': 76}, {'type': 'Error'}, {'type': 'Intermediate'}]
   Inform User: [{'type': 'Termination', 'n_occurences': 136}, {'type': 'Error'}, {'type': 'Intermediate'}, {'type': 'TimerException'}]
   Repair (Simple): [{'type': 'Intermediate'}]
   Repair (Complex): [{'type': 'Termination', 'n_occurences': 2}, {'type': 'Error'}, {'type': 'Intermediate'}]
##### first_ad.csv ####################
B 2
   E: [{'type': 'Intermediate'}]
   C: [{'type': 'Intermediate'}]
   A: [{'type': 'Start', 'n_occurences': 13}]
   B: [{'type': 'Intermediate'}]
   D: [{'type': 'Termination', 'n_occurences': 13}, {'

### StartEventIdentifier
Simply identifies event that has no predecessors

### TerminationEventIdentifier
Termination Event detects two tyes of events:
- End which is the last event that never occurs in before any other event.
- Error which is an event that appears to be last, but during a normal flow has succesors.
Both events are marked as Termination Events

### IntermediateEventIdentifier
Is simple an event that is neither a start nor end event

### Timer Exception Event Identifier

We use the 'get_seq_as_pairs' function to get the pairs dataframe. 
Then use it by grouping up the pairs and saving all the possible next events after the current selected event with information such as time list of the first event and mean of the time list inside a dictionary. This information can be shown in the following format:
{First Event : [Second Event, Mean of the time list, Time list (Every time length that first event takes before second event occurs in the whole data)], [Another possible second event,...]}
Then we check the number of possible second events for the given first event, if the number is bigger than 1, we take them as candidates for this event.
Since this event type is interruptive, we take the longest time possible for the first event, and if the mean of time equals to this longest time possible for a special second event, we label the first event as Timer Exception Event.

- We also determine a very low standard deviation threshold since a real data can be a little different and not exactly the same time for all of the data, so we check the standard deviation as well.
- Since in such an event, the first event might take different times for the conditions that interruption does not occur, we do not take into consideration those different times. We are only taking into consideration where the event is interrupted at a given maximum time.

### Timer Loop Event Identifier

We use the 'get_seq_as_pairs' function to get the pairs dataframe. We follow a similar but simpler approach compared to Timer Exception Event Identification. While grouping the pairs dataframe by pairs again, we check for if for any given pair first and the second element are the same events. If that is the case, we check with a similar method of Timer Exception Event Identification the maximum time and mean of time since it is an interruptive event as well. Also again, we check for standard deviation for possible small differences in real time examples.

### Artificial Data
The data in first_ad and second_ad is created artificially in order to test the Timer Event identifiers.
- first_ad: Timer Loop Event identification. 
- second_ad: Timer Exception Event identification.

The data is created manually without using any algorithm, and we tried to simulate reality by adding imperfections such as short delays for interruptive events. And we are detecting this delays and ignoring them by using a threshold with standard deviation in order to make it more realistic and generic. 

The differene between the first and the second artificial data is that on the second_ad the second event on the sequence is same as the first one in order to detect the loop, but on the first_ad, the events are different.

Here is an example of creation of the data;

```
new_row_list = [ {'Case ID': case,'Activity': 'A', 'Start Timestamp': datetime(2023, 1, day, 9, 0), 'End Timestamp': datetime(2023, 1, day, 9, 30)},
                 {'Case ID': case,'Activity': 'B', 'Start Timestamp': datetime(2023, 1, day, 9, 30), 'End Timestamp': datetime(2023, 1, day, 10, 16)},
                 {'Case ID': case,'Activity': 'E', 'Start Timestamp': datetime(2023, 1, day, 10, 16), 'End Timestamp': datetime(2023, 1, day, 10, 30)},
                 {'Case ID': case,'Activity': 'C', 'Start Timestamp': datetime(2023, 1, day, 10, 30), 'End Timestamp': datetime(2023, 1, day, 11, 0)},
                 {'Case ID': case,'Activity': 'D', 'Start Timestamp': datetime(2023, 1, day, 11, 45), 'End Timestamp': datetime(2023, 1, day, 12, 15)}
]
```

### Other approachs
On the last version we are using the get_seq_as_pairs function that is mentioned before in order to detect the loops and cycles. Previously we were trying to use the 'networkx' library to detect these patterns. This library detects the cycles for each case and handles them individually, but also isn't able to detect the cause of the cycle or get the first or last event in the cycle (the event that is a part of the cycle and appears first or last in the given case). This could be detected easily, but the first problem was that there were many exceptions to this in the used data, and second problem was that it is possible to have multiple cycles in a single case.

We were able to surpass these problems with the given functions analyse_cycles and get_cyclied_paths, but we have chosen to move on with a more generic and precise solution using sequences.

### get_nth function
Returns the nth event for any given column for all cases. We use this function for start and end event identifiers using '0' with 'Start Timestamp' and '-1' with 'End Timestamp' for starting, and ending events.

For example, using it with '0'th activity with 'Start Timestamp' column, we get a dictionary that has keys showing first activity when the case is ordered by Start Timestamps with number of occurences as values.