In [1]:
# Training an XGBoost model to predict the suffix of the processes
import pandas as pd
import numpy as np
import xgboost as xgb
from aux_functions import split_data
import matplotlib.pyplot as plt

In [2]:
# Load the preprocessed BPI Challenge 2012 log; the path is relative to the
# notebook's working directory.
df = pd.read_csv('data/preprocessed/BPI_Challenge_2012.csv')

In [3]:
# Remove the three timestamp/date columns from the frame and display the result.
dropping_clomns = ['time:timestamp', 'case:REG_DATE', 'next_timestamp']
df = df.drop(columns=dropping_clomns)
df

Unnamed: 0,org:resource,lifecycle:transition,concept:name,case:concept:name,case:AMOUNT_REQ,position,next_concept:name
0,112.0,COMPLETE,A_SUBMITTED,173688,20000,1,A_PARTLYSUBMITTED
1,112.0,COMPLETE,A_PARTLYSUBMITTED,173688,20000,2,A_PREACCEPTED
2,112.0,COMPLETE,A_PREACCEPTED,173688,20000,3,W_Completeren aanvraag
3,112.0,SCHEDULE,W_Completeren aanvraag,173688,20000,4,W_Completeren aanvraag
4,112.0,COMPLETE,A_SUBMITTED,173691,5000,1,A_PARTLYSUBMITTED
...,...,...,...,...,...,...,...
262195,10863.0,START,W_Nabellen incomplete dossiers,213276,15000,36,W_Nabellen incomplete dossiers
262196,10863.0,COMPLETE,W_Nabellen incomplete dossiers,213276,15000,37,No_Activity
262197,11169.0,START,W_Nabellen offertes,209595,13000,33,No_Activity
262198,11203.0,START,W_Nabellen incomplete dossiers,211624,35000,35,W_Nabellen incomplete dossiers


In [4]:
# Histogram of trace lengths: how many cases have each number of events.
# trace_lengths maps case id -> number of rows for that case; value_counts()
# lists the longest cases first, so the histogram is filled in that order.
trace_lengths = df['case:concept:name'].value_counts().to_dict()
trace_lengths_frequency = {}
for length in trace_lengths.values():
    trace_lengths_frequency[length] = trace_lengths_frequency.get(length, 0) + 1

trace_lengths_frequency

{175: 1,
 170: 1,
 167: 1,
 163: 1,
 161: 1,
 159: 1,
 151: 1,
 142: 1,
 141: 1,
 137: 1,
 133: 2,
 131: 1,
 130: 2,
 128: 1,
 127: 2,
 126: 1,
 125: 2,
 124: 1,
 122: 1,
 119: 2,
 118: 3,
 116: 2,
 115: 2,
 113: 2,
 112: 2,
 111: 3,
 110: 1,
 109: 4,
 108: 2,
 107: 3,
 106: 1,
 104: 4,
 103: 1,
 102: 2,
 101: 2,
 100: 3,
 99: 3,
 98: 2,
 97: 4,
 96: 3,
 95: 4,
 94: 4,
 93: 3,
 92: 5,
 91: 5,
 90: 7,
 89: 2,
 88: 5,
 87: 4,
 86: 13,
 85: 6,
 84: 10,
 83: 4,
 82: 14,
 81: 10,
 80: 11,
 79: 9,
 78: 16,
 77: 12,
 76: 15,
 75: 13,
 74: 18,
 73: 14,
 72: 17,
 71: 17,
 70: 26,
 69: 28,
 68: 22,
 67: 15,
 66: 26,
 65: 12,
 64: 30,
 63: 22,
 62: 29,
 61: 34,
 60: 46,
 59: 31,
 58: 38,
 57: 46,
 56: 47,
 55: 44,
 54: 61,
 53: 42,
 52: 50,
 51: 53,
 50: 82,
 49: 70,
 48: 78,
 47: 69,
 46: 90,
 45: 84,
 44: 95,
 43: 94,
 42: 117,
 41: 104,
 40: 116,
 39: 119,
 38: 147,
 37: 129,
 36: 144,
 35: 157,
 34: 175,
 33: 160,
 32: 202,
 31: 176,
 30: 207,
 29: 142,
 28: 260,
 27: 145,
 26: 264,
 25: 143,

In [5]:
# Re-key the histogram in ascending order of trace length.
trace_lengths_frequency = {
    length: trace_lengths_frequency[length]
    for length in sorted(trace_lengths_frequency)
}
# Running total: each entry is the number of traces whose length is less than
# or equal to the key.
cum_tace_lengths_frequency = {}
cumulative_sum = 0
for length in trace_lengths_frequency:
    cumulative_sum = cumulative_sum + trace_lengths_frequency[length]
    cum_tace_lengths_frequency[length] = cumulative_sum

cum_tace_lengths_frequency

{3: 3429,
 6: 5303,
 7: 5597,
 8: 5925,
 9: 6106,
 10: 6467,
 11: 6594,
 12: 6820,
 13: 6904,
 14: 7059,
 15: 7113,
 16: 7295,
 17: 7347,
 18: 7540,
 19: 7596,
 20: 7818,
 21: 7885,
 22: 8121,
 23: 8218,
 24: 8504,
 25: 8647,
 26: 8911,
 27: 9056,
 28: 9316,
 29: 9458,
 30: 9665,
 31: 9841,
 32: 10043,
 33: 10203,
 34: 10378,
 35: 10535,
 36: 10679,
 37: 10808,
 38: 10955,
 39: 11074,
 40: 11190,
 41: 11294,
 42: 11411,
 43: 11505,
 44: 11600,
 45: 11684,
 46: 11774,
 47: 11843,
 48: 11921,
 49: 11991,
 50: 12073,
 51: 12126,
 52: 12176,
 53: 12218,
 54: 12279,
 55: 12323,
 56: 12370,
 57: 12416,
 58: 12454,
 59: 12485,
 60: 12531,
 61: 12565,
 62: 12594,
 63: 12616,
 64: 12646,
 65: 12658,
 66: 12684,
 67: 12699,
 68: 12721,
 69: 12749,
 70: 12775,
 71: 12792,
 72: 12809,
 73: 12823,
 74: 12841,
 75: 12854,
 76: 12869,
 77: 12881,
 78: 12897,
 79: 12906,
 80: 12917,
 81: 12927,
 82: 12941,
 83: 12945,
 84: 12955,
 85: 12961,
 86: 12974,
 87: 12978,
 88: 12983,
 89: 12985,
 90: 12992,


In [6]:
# Express each cumulative count as a fraction (0-1, not 0-100) of all traces.
total = sum(trace_lengths_frequency.values())
cum_tace_lengths_frequency_percentage = {
    length: count / total
    for length, count in cum_tace_lengths_frequency.items()
}

cum_tace_lengths_frequency_percentage

{3: 0.2620157408114923,
 6: 0.40521127836784593,
 7: 0.4276763200122259,
 8: 0.4527393596699014,
 9: 0.466569878505387,
 10: 0.4941545044700848,
 11: 0.5038587911668068,
 12: 0.5211278367845954,
 13: 0.527546420111561,
 14: 0.5393902345839383,
 15: 0.5435164667227019,
 16: 0.5574233972644609,
 17: 0.5613968059906778,
 18: 0.5761442653014441,
 19: 0.5804233208527546,
 20: 0.5973867196454496,
 21: 0.6025063039657675,
 22: 0.6205394666462902,
 23: 0.6279514021548102,
 24: 0.6498051501490029,
 25: 0.6607320241460992,
 26: 0.6809047146022771,
 27: 0.6919844120119202,
 28: 0.7118514556430046,
 29: 0.7227019179338274,
 30: 0.7385191411324215,
 31: 0.7519676014365401,
 32: 0.7674027661037671,
 33: 0.7796286391075112,
 34: 0.7930006877053565,
 35: 0.8049973255902805,
 36: 0.8160006112936502,
 37: 0.8258577214029189,
 38: 0.8370902422251089,
 39: 0.8461832352716436,
 40: 0.8550469931993582,
 41: 0.8629938106517918,
 42: 0.8719339802857797,
 43: 0.8791166806754794,
 44: 0.8863757927714526,
 45: 0

In [7]:
# Keep only the cases with at most 50 events. Going by the cumulative counts
# shown earlier, longer traces account for only about 8% of the cases.
events_per_case = df.groupby('case:concept:name').size()
short_case_ids = events_per_case[events_per_case <= 50].index
df = df[df['case:concept:name'].isin(short_case_ids)]

In [8]:
# Split the filtered log into train and test frames with the project helper
# imported from aux_functions. The helper prints a size report; on this run it
# shows that some cases and rows were dropped, so train and test together are
# smaller than df.
# NOTE(review): 0.8 is presumably the target train fraction -- confirm in
# aux_functions.split_data.
train, test = split_data(df, 0.8) 

Original size: 193369
Train size: 140459
Test size: 30912
Ratio: 0.8196194221892852
Dropped cases in both sets: 666
Dropped rows from dataset: 21998


In [34]:
# Columns kept in every generated prefix/suffix window: the case identifier,
# the activity name, and the event's position.
prediction_columns = ['case:concept:name', 'concept:name', 'position']

In [35]:
# Build (prefix, suffix) pairs from the train set.
# For every case and every cut point i >= PREFIX_LENGTH, the prefix is the
# PREFIX_LENGTH events immediately before i and the suffix is ALL remaining
# events of that case (variable length, never empty). Cases with at most
# PREFIX_LENGTH events contribute no pairs.
PREFIX_LENGTH = 5

prefix = []
suffix = []
# groupby splits the frame in one pass instead of re-filtering the whole train
# set once per case. sort=False yields the cases in order of first appearance
# (the same order as iterating over .unique()) and keeps the row order inside
# each case.
for case, case_df in train.groupby('case:concept:name', sort=False):
    # Select the model columns once per case rather than once per window.
    case_events = case_df[prediction_columns]
    # NOTE(review): rows are taken in their existing order; this assumes each
    # case is already ordered by 'position' -- confirm against split_data.
    for i in range(PREFIX_LENGTH, len(case_events)):
        prefix.append(case_events.iloc[i - PREFIX_LENGTH:i])
        suffix.append(case_events.iloc[i:])

In [38]:
# Inspect the last generated prefix window.
prefix[-1]

Unnamed: 0,case:concept:name,concept:name,position
209618,207855,A_SUBMITTED,1
209619,207855,A_PARTLYSUBMITTED,2
209620,207855,W_Afhandelen leads,3
209653,207855,W_Afhandelen leads,4
209676,207855,A_DECLINED,5


In [39]:
# Inspect the suffix paired with the last generated prefix.
suffix[-1]

Unnamed: 0,case:concept:name,concept:name,position
209677,207855,W_Afhandelen leads,6
