In [None]:
import pprint
from copy import deepcopy
import pandas as pd
import numpy as np
from known_ko_utils import *

from IPython.display import display

import pm4py
from pm4py.algo.filtering.log.start_activities import start_activities_filter
from pm4py.algo.filtering.pandas.end_activities import end_activities_filter

# Show every float that pandas renders with exactly two decimal places.
pd.set_option('display.float_format', lambda value: f'{value:.2f}')


# Goal of this notebook: decide which of the rejected cases were knock-outs.
# Sketch of the procedure when the "negative end" activity is known:
"""
for every case:
    if eventually_follows(start, negative_end):
        case <- Knockout
        for every activity in case:
            if directly_follows(activity, negative_end):
                activity <- Knockout
"""
# Open idea for when the "negative end" activity is NOT known:
# possibly take the next-to-last activity of the completed cases; any case that
# lacks that activity and still ended in rejection would then count as a knock-out.

In [None]:
# Load the event log and its configuration through the project helper, then list
# the distinct values found in the 'task' column as a quick sanity check.
log_df, config = preprocess(config_file="./config/consulta.yml")
task_labels = set(log_df['task'])
pprint.pprint(task_labels)

In [None]:
# Determine the start/end activities and fix the "negative" end activity.

# Rename the log columns to the names pm4py expects.
df = pm4py.format_dataframe(
    log_df,
    case_id='caseid',
    activity_key='task',
    timestamp_key='end_timestamp',
    start_timestamp_key="start_timestamp",
)

# TODO: some issue with the start filter, so the start activity is hard-coded
# instead of being derived from the log. The disabled derivation was:
#   start_activities = start_activities_filter.get_start_activities(df)
#   top_start_activity = list(start_activities.keys())[0]
#   filtered_log_st = start_activities_filter.apply(df, [top_start_activity])
# NOTE(review): start_activities_filter is imported from pm4py's `log` filtering
# package while end_activities_filter comes from the `pandas` one, and `df` is a
# DataFrame - possibly the cause of the issue; confirm.
top_start_activity = "Traer informacion estudiante - banner"

# The negative end activity is hard-coded as well; the alternative that was tried
# is `list(end_activities.keys())[0]`.
top_end_activity = "Cancelar Solicitud"

end_activities = end_activities_filter.get_end_activities(df)
filtered_log_end = end_activities_filter.apply(df, [top_end_activity])

# Optional inspection of the filtered logs (disabled):
#   display(filtered_log_st.info())
#   display(filtered_log_end.info())

In [None]:
# Eventually/Directly-follows experiments
#
# Every case in which the start activity is eventually followed by the negative end
# activity is flagged as knocked out. For each such case all of its rows in `log_df`
# receive the knock-out activity, the textual activity prefix and the prefix length.
# The labelled log is finally cached as a pickle for the cells further down.

# Position, counted from the end of the start-time-ordered case, of the event that is
# taken as the knock-out activity.
# NOTE(review): the original remark called this the "next-to-last" activity, yet the
# code reads the third-from-last event; the offset is kept unchanged - confirm which
# one is intended for this log.
KNOCKOUT_EVENT_OFFSET = 3

rejected = pm4py.filter_eventually_follows_relation(df, [(top_start_activity, top_end_activity)])
rejected = pm4py.convert_to_dataframe(rejected)

# Mark Knocked-out cases & their knock-out activity.
# All four columns are (re)initialised here so that re-running the cell never keeps
# stale values and the columns exist even when no case was rejected.
log_df['knocked_out_case'] = False
# object dtype: the column holds False for "no knock-out" and an activity name (str)
# otherwise; writing strings into a bool column is a deprecated upcast in pandas.
log_df['knockout_activity'] = np.full(len(log_df), False, dtype=object)
log_df['knockout_prefix'] = None
log_df['knockout_prefix_length'] = np.nan

# Compare case ids as text.
# NOTE(review): pm4py.format_dataframe is understood to cast case ids to strings, which
# would make a direct comparison against a numeric 'caseid' column match nothing - confirm.
caseid_as_text = log_df['caseid'].astype(str)

too_short_cases = []
gr = rejected.groupby('case:concept:name')
for group, case_df in gr:
    # A case needs at least KNOCKOUT_EVENT_OFFSET events for the positional lookup
    # below; shorter cases are reported instead of raising an IndexError.
    if len(case_df) < KNOCKOUT_EVENT_OFFSET:
        too_short_cases.append(group)
        continue

    # Stable sort: events sharing a start timestamp keep their incoming order, so the
    # selected event does not depend on the sorting algorithm's tie handling.
    sorted_case = case_df.sort_values("start_timestamp", kind="mergesort")
    # The prefix is taken from the same ordering the knock-out activity is read from.
    case_activities = sorted_case['concept:name'].values
    knockout_activity = sorted_case.iloc[-KNOCKOUT_EVENT_OFFSET]['concept:name']

    case_rows = caseid_as_text == str(group)
    log_df.loc[case_rows, 'knocked_out_case'] = True
    log_df.loc[case_rows, 'knockout_activity'] = knockout_activity
    log_df.loc[case_rows, 'knockout_prefix'] = repr(case_activities)
    log_df.loc[case_rows, 'knockout_prefix_length'] = len(case_activities)
    # TODO: maybe also capture the "decision path" that led to knock-out?
    # TODO: fix prefix notion, until when to count?
    # TODO: capture data / stats about case attributes here?

if too_short_cases:
    print(f"Skipped {len(too_short_cases)} rejected case(s) with fewer than "
          f"{KNOCKOUT_EVENT_OFFSET} events, e.g. {too_short_cases[:5]}")

#display(log_df.head())
display(log_df[log_df['knocked_out_case']==True].head())

log_df.to_pickle("./cache/consulta_with_knockouts.pkl")

In [None]:
# Experiments with RIPPER algorithm
import wittgenstein as lw
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score
import seaborn as sns
import matplotlib.pyplot as plt

# Load the cached `by_case` table and drop its 'k_proto_cluster' column.
by_case = pd.read_pickle("./cache/consulta_intercase_and_kproto_by_case")
by_case = by_case.drop(columns='k_proto_cluster')

# Knock-out labels cached by the labelling cell. That cell writes the same label to
# every event row of a case, so the table is collapsed to one row per case first;
# merging the per-event rows would multiply the rows of `by_case` and inflate both
# the counts plotted below and the data used for rule learning.
full = pd.read_pickle("./cache/consulta_with_knockouts.pkl")
full = full[['caseid', 'knocked_out_case', 'knockout_activity']].drop_duplicates(subset='caseid')

by_case = by_case.merge(full, on="caseid", how="left")

# Horizontal bar chart: how often each activity was the knock-out activity.
# Rows labelled False (not knocked out) are excluded.
for_plot = by_case[by_case['knockout_activity']!=False]
ko_counts = for_plot['knockout_activity'].value_counts(ascending=False)

sns.set(rc={'figure.facecolor':'white'})
ax = sns.countplot(y=for_plot['knockout_activity'], order=ko_counts.index)
# Write the absolute count inside each bar (bars follow the order given above).
ax.bar_label(label_type="center",  container=ax.containers[0], labels=ko_counts.values)
ax.set_title("Knock-out activity frequency")
pass

In [None]:
# For every knock-out activity, fit a RIPPER rule set that separates the cases knocked
# out by that activity from the cases that were not knocked out, then print the rules
# together with precision, recall and the number of rule conditions on a held-out split.
ko_activities = set(by_case['knockout_activity'])

# Iterate in a fixed (textual) order so the printed report is reproducible; the
# iteration order of a set of strings can differ between kernel restarts.
for activity in sorted(ko_activities, key=str):
    # Skip the "not knocked out" marker (False) as well as missing labels (NaN/None),
    # which the left merge leaves behind for cases without label information.
    if not activity or pd.isna(activity):
        continue

    # Keep only cases knocked out by current activity and non-knocked out ones
    _by_case = by_case[by_case['knockout_activity'].isin([activity, False])]

    try:
        train, test = train_test_split(_by_case.drop(columns=['knockout_activity', 'caseid', 'user']), test_size=.33, random_state=42)

        ripper_clf = lw.RIPPER(random_state=42, max_rules=5)
        ripper_clf.fit(train, class_feat='knocked_out_case')

        # Visualize Rules
        print(f"\n\n** RIPPER knock-out rules for the activity \"{activity}\":\n")
        ripper_clf.out_model()

        # Performance metrics
        # TODO:skip rule display if problematic & report
        X_test = test.drop(['knocked_out_case'], axis=1)
        y_test = test['knocked_out_case']
        precision = ripper_clf.score(X_test, y_test, precision_score)
        recall = ripper_clf.score(X_test, y_test, recall_score)
        cond_count = ripper_clf.ruleset_.count_conds()

        print(f'precision: {precision:.2f}, recall: {recall:.2f}, conds: {cond_count}')

    except Exception as exc:
        # Best effort: a failure for one activity must not stop the remaining ones,
        # but it is reported instead of being silently swallowed. Catching Exception
        # (not a bare except) keeps KeyboardInterrupt able to abort the loop.
        print(f"\n\n** Skipped activity \"{activity}\" - {type(exc).__name__}: {exc}")
        continue