In this file we train a XGboost model on our data on the BPI_Challenge_2012 dataset

### Importing the libraries

In [1]:
import xgboost as xgb
import pandas as pd
from aux_functions import split_data
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

### Loading the data

In [2]:
df = pd.read_csv('data/preprocessed/BPI_Challenge_2012.csv')
df.head()

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,position,next_concept:name,next_timestamp
0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000,1,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+00:00
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000,2,A_PREACCEPTED,2011-10-01 00:39:37.906000+00:00
2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000,3,W_Completeren aanvraag,2011-10-01 00:39:38.875000+00:00
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000,4,W_Completeren aanvraag,2011-10-01 11:36:46.437000+00:00
4,112.0,COMPLETE,A_SUBMITTED,2011-10-01 08:08:58.256000+00:00,2011-10-01 08:08:58.256000+00:00,173691,5000,1,A_PARTLYSUBMITTED,2011-10-01 08:09:02.195000+00:00


### Specifying the columns

In [3]:
# Defining what category each column fall into
dropping_columns = ['time:timestamp', 'case:REG_DATE', 'case:concept:name', 'next_timestamp', 'org:resource', 'case:AMOUNT_REQ','lifecycle:transition']
numerical_columns = ['position']
categorical_columns = ['concept:name']
target_column = 'next_concept:name'

### Feature engineering

In [4]:
# Convert the target column to category
original_target = df[target_column].copy()
df[target_column] = df[target_column].astype('category')
df[target_column] = df[target_column].cat.codes

# One-hot encoding of the categorical columns 
df = pd.get_dummies(df, columns=categorical_columns)

# Columns that were one_hot_encoded
one_hot_encoded_columns = [col for col in df.columns if (col not in numerical_columns) and (col != target_column) and (col not in dropping_columns)]

### Splitting the dataset into train and test dataframes

In [5]:
# Splitting the data
train_df, test_df = split_data(df, ratio=0.8, report=True)
X_train = train_df[numerical_columns + one_hot_encoded_columns]
y_train = train_df[target_column]
X_test = test_df[numerical_columns + one_hot_encoded_columns]
y_test = test_df[target_column]

# Getting the feature types for xgboost
ft = ['q' if feature in numerical_columns else 'c' for feature in X_train.columns]

Original size: 262200
Train size: 186785
Test size: 37781
Ratio: 0.8317599280389729
Dropped cases in both sets: 888
Dropped rows from dataset: 37634


### Training the model

In [6]:
# Create DMatrix for training and testing
train_dmatrix = xgb.DMatrix(X_train, y_train, feature_types=ft, enable_categorical=True)
test_dmatrix = xgb.DMatrix(data=X_test, label=y_test, feature_types=ft, enable_categorical=True)

In [7]:
# Create XGBoost parameters
params = {
    'eta': 0.3,  # the training step for each iteration
    'objective': 'multi:softmax',  # for multi-class classification
    'num_class': len(df[target_column].unique()),  # number of classes
    'eval_metric': 'merror' # evaluation metric
}

In [8]:
# Train the XGBoost model
xgb_model = xgb.train(params, train_dmatrix, num_boost_round=10)

### Predicting the result and Evaluation

In [9]:
# Evaluate the model
predictions = xgb_model.predict(test_dmatrix)

In [10]:
# find the mapping with the orginal_target and encoded one
mapping = dict(zip(df[target_column], original_target))

In [11]:
def predict_first(X_test, index, ft):
    single_instance = X_test.loc[index]
    single_instance = single_instance.values.reshape(1, -1)
    single_instance_dmatrix = xgb.DMatrix(data=single_instance, feature_names=list(X_test.columns), feature_types=ft, enable_categorical=True)
    single_instance_prediction = xgb_model.predict(single_instance_dmatrix)
    return single_instance_prediction

In [12]:
def get_first_position(X_test, last_event_array_position):
    single_instance = X_test.iloc[last_event_array_position]
    position = int(single_instance.iloc[0])
    return position

print(get_first_position(X_test, 45))

3


In [13]:
#takes prediction and makes it into one hot encoded df
def format_prediction(prediction, position, column_names):
    temp_df = pd.DataFrame(False, index=[0], columns=column_names)
    temp_df.insert(0, 'position', position, True)
    temp_df['concept:name_'+prediction] = temp_df['concept:name_'+prediction].replace([False], [True])
    return temp_df


In [14]:
def predict_next(df, ft):
    single_instance = df.iloc[0]
    single_instance = single_instance.values.reshape(1, -1)
    single_instance_dmatrix = xgb.DMatrix(data=single_instance, feature_names=list(X_test.columns), feature_types=ft, enable_categorical=True)
    single_instance_prediction = xgb_model.predict(single_instance_dmatrix)
    return single_instance_prediction

In [15]:
def combine_pos_event(position, event):
    hot_pre = 'concept:name_'
    if hot_pre in event:
        event = event.replace(hot_pre, '')
    array = [position, event]
    return array


In [16]:
def update_prefix(prefix, addon):
    #print(addon)
    prefix.append(addon)
    return prefix

In [17]:
#returns name of column wiht true value
#aka the event in that position
def column_with_true_value(row):
    for col in row.index:
        if row[col] == True:
            return col
    return None  # Return None if no True value found

In [18]:
# estabishing case concept name index dictionary
X_test['case:concept:name'] = df.loc[X_test.index]['case:concept:name']
case_dict = {}

for case in X_test['case:concept:name'].unique():
    case_dict[case] = X_test[X_test["case:concept:name"] == case].index

X_test = X_test.drop(columns='case:concept:name')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test['case:concept:name'] = df.loc[X_test.index]['case:concept:name']


In [19]:
#create fucntion to find and create an array of indexes to a certain trace
#input case:concept:name of a trace
#output array of indexes of all event belonging to that trace in order

def find_trace_index_array(caseconceptname, position):
    return case_dict[caseconceptname][position]

In [20]:
def compute_levensthein_distance(predicted, actual):
    """
    calculating the distance between the two lists of activities
    :param predicted: list of activities
    :param actual: list of activities
    :return: distance between the two lists
    """

    # if the actual list is empty, return the length of the predicted list
    if len(actual) == 0:
        return len(predicted)

    # if the predicted list is empty, return the length of the actual list
    if len(predicted) == 0:
        return len(actual)

    # creating a matrix with the size of the two lists
    matrix = [[0 for _ in range(len(actual) + 1)] for _ in range(len(predicted) + 1)]

    # filling the first row and the first column of the matrix
    for i in range(len(predicted) + 1):
        matrix[i][0] = i
    for j in range(len(actual) + 1):
        matrix[0][j] = j

    # filling the matrix
    for i in range(1, len(predicted) + 1):
        for j in range(1, len(actual) + 1):
            if predicted[i - 1] == actual[j - 1]:
                matrix[i][j] = matrix[i - 1][j - 1]
            else:
                matrix[i][j] = min(matrix[i - 1][j] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j - 1] + 1)

    return matrix[len(predicted)][len(actual)]

In [21]:
from tqdm import tqdm
summ = 0
counter = 0
length = 0
maxx = 0
for case in tqdm(case_dict.keys()):
    trace = case
    trace_position = 0
    prefix = []
    actual_suffix = []
    predicted_suffix = []
    for i in range(1, 100):
        index = find_trace_index_array(trace, trace_position)
        if i == 1:
            prefixevent = column_with_true_value(X_test.loc[index])
            new_entry = combine_pos_event(trace_position, prefixevent)
            prefix = update_prefix(prefix, new_entry)
            prediction = mapping[predict_first(X_test, index, ft)[0]]
            trace_position += 1
        else:
            new_entry = combine_pos_event(trace_position, prediction)
            prefix = update_prefix(prefix, new_entry)
            prediction = mapping[predict_next(df_prediction, ft)[0]]
            trace_position += 1
        actual_suffix.append(mapping[y_test.loc[index]])
        predicted_suffix.append(prediction)
        if mapping[y_test.loc[index]] == 'No_Activity':
            break

        if prediction == 'No_Activity':
            break

        df_prediction = format_prediction(prediction, trace_position, one_hot_encoded_columns)
    if len(actual_suffix) > maxx:
        maxx = len(actual_suffix)
    summ += compute_levensthein_distance(actual_suffix, predicted_suffix)
    counter += 1
    length += len(actual_suffix)
summ/counter
    

  0%|          | 0/2123 [00:00<?, ?it/s]

100%|██████████| 2123/2123 [01:31<00:00, 23.27it/s]


3.8973151201130474

In [22]:
#predicting single case
trace = 207882
trace_position = 0
prefix = []
actual_suffix = []
predicted_suffix = []
for i in range(1, 100):
    index = find_trace_index_array(trace, trace_position)
    if i == 1:
        prefixevent = column_with_true_value(X_test.loc[index])
        new_entry = combine_pos_event(trace_position, prefixevent)
        prefix = update_prefix(prefix, new_entry)
        prediction = mapping[predict_first(X_test, index, ft)[0]]
        trace_position += 1
    else:
        new_entry = combine_pos_event(trace_position, prediction)
        prefix = update_prefix(prefix, new_entry)
        prediction = mapping[predict_next(df_prediction, ft)[0]]
        trace_position += 1
    actual_suffix.append(mapping[y_test.loc[index]])
    predicted_suffix.append(prediction)
    if mapping[y_test.loc[index]] == 'No_Activity':
        break

    if prediction == 'No_Activity':
        break

    df_prediction = format_prediction(prediction, trace_position, one_hot_encoded_columns)
if len(actual_suffix) > maxx:
    maxx = len(actual_suffix)
summ += compute_levensthein_distance(actual_suffix, predicted_suffix)
counter += 1
length += len(actual_suffix)
summ/counter 

3.8983050847457625

In [23]:
# find the mapping with the orginal_target and encoded one
mapping = dict(zip(df[target_column], original_target))
# predicting a single instance 
trace = 207870
trace_position = 0
index = find_trace_index_array(trace, trace_position)
xtest_case = X_test.loc[index].copy()

single_instance = X_test.loc[index].values.reshape(1, -1)
single_instance_dmatrix = xgb.DMatrix(data=single_instance, feature_names=list(X_test.columns), feature_types=ft, enable_categorical=True)
single_instance_prediction = xgb_model.predict(single_instance_dmatrix)
# Finding the mapping of the prediction to the actual value
print(f'Prediction: {mapping[single_instance_prediction[0]]},\nActual: {mapping[y_test.loc[index]]}')



Prediction: A_PARTLYSUBMITTED,
Actual: A_PARTLYSUBMITTED
