In this notebook we train an XGBoost model on the BPI_Challenge_2012 dataset to predict the next activity of a running case.

### Importing the libraries

In [1]:
import xgboost as xgb
import pandas as pd
from aux_functions import split_data
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

### Loading the data

In [2]:
# Load the preprocessed event log; the path is relative to the notebook's working directory
df = pd.read_csv('data/preprocessed/BPI_Challenge_2012.csv')
# Preview the first rows to check the columns were parsed as expected
df.head()

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,position,next_concept:name,next_timestamp
0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000,1,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+00:00
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000,2,A_PREACCEPTED,2011-10-01 00:39:37.906000+00:00
2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000,3,W_Completeren aanvraag,2011-10-01 00:39:38.875000+00:00
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000,4,W_Completeren aanvraag,2011-10-01 11:36:46.437000+00:00
4,112.0,COMPLETE,A_SUBMITTED,2011-10-01 08:08:58.256000+00:00,2011-10-01 08:08:58.256000+00:00,173691,5000,1,A_PARTLYSUBMITTED,2011-10-01 08:09:02.195000+00:00


### Specifying the columns

In [3]:
# Defining which category each column falls into
# Columns that are never used as model inputs (they are excluded when the feature list is built)
dropping_columns = ['time:timestamp', 'case:REG_DATE', 'case:concept:name', 'next_timestamp', 'org:resource', 'case:AMOUNT_REQ','lifecycle:transition']
# Numeric features used as-is
numerical_columns = ['position']
# Categorical features that are one-hot encoded
categorical_columns = ['concept:name']
# Label column: the activity name recorded for the following event
target_column = 'next_concept:name'

### Feature engineering

In [4]:
# Keep the original string labels so the integer codes can be mapped back to activity names
original_target = df[target_column].copy()
# Encode the target as integer category codes
df[target_column] = df[target_column].astype('category').cat.codes

# Replace every categorical feature with one indicator column per category
df = pd.get_dummies(df, columns=categorical_columns)

# The indicator columns are whatever remains once numeric, target and dropped columns are excluded
one_hot_encoded_columns = [
    col
    for col in df.columns
    if not (col in numerical_columns or col == target_column or col in dropping_columns)
]

['concept:name_A_ACCEPTED',
 'concept:name_A_ACTIVATED',
 'concept:name_A_APPROVED',
 'concept:name_A_CANCELLED',
 'concept:name_A_DECLINED',
 'concept:name_A_FINALIZED',
 'concept:name_A_PARTLYSUBMITTED',
 'concept:name_A_PREACCEPTED',
 'concept:name_A_REGISTERED',
 'concept:name_A_SUBMITTED',
 'concept:name_O_ACCEPTED',
 'concept:name_O_CANCELLED',
 'concept:name_O_CREATED',
 'concept:name_O_DECLINED',
 'concept:name_O_SELECTED',
 'concept:name_O_SENT',
 'concept:name_O_SENT_BACK',
 'concept:name_W_Afhandelen leads',
 'concept:name_W_Beoordelen fraude',
 'concept:name_W_Completeren aanvraag',
 'concept:name_W_Nabellen incomplete dossiers',
 'concept:name_W_Nabellen offertes',
 'concept:name_W_Valideren aanvraag',
 'concept:name_W_Wijzigen contractgegevens']

### Splitting the dataset into train and test dataframes

In [5]:
# Split the events into train and test frames.
# NOTE(review): ratio=0.8 is presumably the target train fraction and report=True presumably
# prints the size summary shown in the cell output - confirm in aux_functions.split_data.
train_df, test_df = split_data(df, ratio=0.8, report=True)

# Model inputs (numeric + one-hot columns) and labels for each split
X_train, y_train = train_df[numerical_columns + one_hot_encoded_columns], train_df[target_column]
X_test, y_test = test_df[numerical_columns + one_hot_encoded_columns], test_df[target_column]

# Feature types handed to XGBoost: 'q' for the numeric columns, 'c' for everything else
ft = ['c' if feature not in numerical_columns else 'q' for feature in X_train.columns]

Original size: 262200
Train size: 186785
Test size: 37781
Ratio: 0.8317599280389729
Dropped cases in both sets: 888
Dropped rows from dataset: 37634


### Training the model

In [6]:
# Wrap both splits in XGBoost DMatrix containers, declaring the type of every feature column
train_dmatrix = xgb.DMatrix(
    data=X_train,
    label=y_train,
    feature_types=ft,
    enable_categorical=True,
)
test_dmatrix = xgb.DMatrix(
    data=X_test,
    label=y_test,
    feature_types=ft,
    enable_categorical=True,
)

In [7]:
# XGBoost training configuration
params = dict(
    eta=0.3,                                # the training step for each iteration
    objective='multi:softmax',              # multi-class classification
    num_class=df[target_column].nunique(),  # number of distinct target codes
    eval_metric='merror',                   # evaluation metric
)

In [8]:
# Fit the booster on the training DMatrix for 10 boosting rounds
xgb_model = xgb.train(
    params=params,
    dtrain=train_dmatrix,
    num_boost_round=10,
)

### Predicting the result and Evaluation

In [9]:
# Predict on the whole test DMatrix; the scores are computed from `predictions` in a later cell
predictions = xgb_model.predict(test_dmatrix)

In [11]:
# Take the test data
# [] [] [] [] ([]) ---> (next)
# Output is an array of processes --> compare with the actual ones
# Input: array of prefix processes (only the last one is used)
#
# Turn the prediction into a one-hot encoding
# Write a for loop over the sequence: for every process in the sequence, one-hot encode it and predict the next

In [95]:
# Lookup table from the encoded target code back to the original activity label
mapping = {code: label for code, label in zip(df[target_column], original_target)}

In [94]:
def predict_first(X_test, last_event_array_position, ft):
    """Predict from a single row of ``X_test`` with the trained booster.

    Args:
        X_test: feature DataFrame; its column names are passed to XGBoost as feature names.
        last_event_array_position: integer (iloc) position of the row to predict from.
        ft: list of XGBoost feature types, one entry per column of ``X_test``.

    Returns:
        Whatever ``xgb_model.predict`` returns for the one-row matrix (callers read element 0).

    Note:
        Uses the notebook-level ``xgb_model``; the training cell must have been run first.
    """
    # Reshape the selected row into a 1 x n_features matrix
    row_matrix = X_test.iloc[last_event_array_position].values.reshape(1, -1)
    row_dmatrix = xgb.DMatrix(
        data=row_matrix,
        feature_names=list(X_test.columns),
        feature_types=ft,
        enable_categorical=True,
    )
    return xgb_model.predict(row_dmatrix)

In [93]:
def get_first_position(X_test, last_event_array_position):
    """Return the value in the first column of the selected row, converted to ``int``.

    Args:
        X_test: feature DataFrame; the first column is read (callers in this notebook
            build it with 'position' as the first column).
        last_event_array_position: integer (iloc) position of the row to read.

    Returns:
        int: the first column's value for that row.
    """
    return int(X_test.iloc[last_event_array_position].iloc[0])

# Quick check: first-column value of the test row at integer position 45
print(get_first_position(X_test, 45))

3


In [92]:
def add_dummy_columns(df, column_names, fill_value=0):
    """Return a copy of ``df`` extended with any of ``column_names`` it does not have yet.

    Existing columns are left untouched. New columns are appended on the right, in the
    order they first appear in ``column_names``, and filled with ``fill_value``.

    Args:
        df: input DataFrame (not modified).
        column_names: iterable of column names that must exist in the result.
            Repeated names are added only once.
        fill_value: constant used for the new columns. Defaults to 0; pass ``False``
            to match boolean one-hot columns.

    Returns:
        pandas.DataFrame: a new frame with the original columns followed by the added ones.
    """
    # dict.fromkeys de-duplicates while preserving order, so a repeated name cannot
    # produce two identically named columns.
    missing = [col for col in dict.fromkeys(column_names) if col not in df.columns]
    if not missing:
        return df.copy()

    filler = pd.DataFrame(fill_value, index=df.index, columns=missing)
    # No copy-back of existing columns is needed: only absent names are added,
    # so nothing in ``df`` can be overwritten by the concatenation.
    return pd.concat([df, filler], axis=1)

In [91]:
#takes prediction and makes it into one hot encoded df
def format_prediction(prediction, position, column_names):
    temp_df = pd.DataFrame(False, index=[0], columns=column_names)
    temp_df.insert(0, 'position', position, True)
    temp_df['concept:name_'+prediction] = temp_df['concept:name_'+prediction].replace([False], [True])
    return temp_df


In [90]:
def predict_next(df, ft):
    """Predict from the first row of ``df`` with the trained booster.

    Intended for a one-row frame holding a previously predicted event, so that the
    prediction can be fed back in as the next model input.

    Args:
        df: feature DataFrame; only its first row is used. Its columns must be the
            features the model was trained on, in the same order.
        ft: list of XGBoost feature types, one entry per column of ``df``.

    Returns:
        Whatever ``xgb_model.predict`` returns for the one-row matrix (callers read element 0).

    Note:
        Uses the notebook-level ``xgb_model``; the training cell must have been run first.
    """
    single_instance = df.iloc[0].values.reshape(1, -1)
    # Label the features with the columns of the frame actually passed in, not with a
    # notebook-level X_test, so the function does not depend on hidden global state.
    single_instance_dmatrix = xgb.DMatrix(
        data=single_instance,
        feature_names=list(df.columns),
        feature_types=ft,
        enable_categorical=True,
    )
    return xgb_model.predict(single_instance_dmatrix)

In [48]:
# Predict the activity that follows one test row and re-encode it as the next model input
event_position = 45
# predict_first accepts (X_test, row position, feature types); predict_next only takes (df, ft),
# so calling it with three arguments fails on a fresh kernel.
prediction = predict_first(X_test, event_position, ft)
pred = mapping[prediction[0]]
# The predicted event is one step after the observed one, so derive its position
# from the row instead of hard-coding it.
frame = format_prediction(pred, get_first_position(X_test, event_position) + 1, one_hot_encoded_columns)

Unnamed: 0,W_Completeren aanvraag
0,True


In [106]:
# Roll the model forward from one test row: the first step predicts from the real row,
# every later step predicts from the previous prediction re-encoded as a one-hot row.
# NOTE(review): the "Actual" value is read from consecutive rows of y_test, which assumes
# those rows belong to the same case as the starting row - confirm against split_data.
event_position = 1
max_steps = 9  # maximum number of events to predict
for i in range(1, max_steps + 1):
    if i == 1:
        prediction = mapping[predict_first(X_test, event_position, ft)[0]]
        position = get_first_position(X_test, event_position) + 1
    else:
        prediction = mapping[predict_next(df_prediction, ft)[0]]
        position += 1
    # 'No_Activity' marks the end of a case, so there is nothing further to predict
    if prediction == 'No_Activity':
        break

    df_prediction = format_prediction(prediction, position, one_hot_encoded_columns)

    # Stop once there is no ground-truth row left to compare against
    actual_index = event_position - 1 + i
    if actual_index >= len(y_test):
        break
    print(f'Prediction: {prediction},\nActual: {mapping[y_test.iloc[actual_index]]}')


    

Prediction: W_Afhandelen leads,
Actual: W_Afhandelen leads
Prediction: W_Afhandelen leads,
Actual: W_Afhandelen leads
Prediction: A_DECLINED,
Actual: W_Afhandelen leads
Prediction: W_Afhandelen leads,
Actual: W_Afhandelen leads


In [105]:
# Lookup table from the encoded target code back to the original activity label
mapping = dict(zip(df[target_column], original_target))

# Predict a single test row through the shared helper rather than repeating the
# DMatrix boilerplate. The row index is named row_index so it is not confused with
# the 'position' feature / trace-position variable used by the rollout loop.
row_index = 50
single_instance_prediction = predict_first(X_test, row_index, ft)

# Translate both the predicted and the actual code back to activity names
print(f'Prediction: {mapping[single_instance_prediction[0]]},\nActual: {mapping[y_test.iloc[row_index]]}')
mapping

Prediction: W_Completeren aanvraag,
Actual: W_Completeren aanvraag


{6: 'A_PARTLYSUBMITTED',
 7: 'A_PREACCEPTED',
 19: 'W_Completeren aanvraag',
 4: 'A_DECLINED',
 9: 'No_Activity',
 17: 'W_Afhandelen leads',
 0: 'A_ACCEPTED',
 14: 'O_SELECTED',
 5: 'A_FINALIZED',
 12: 'O_CREATED',
 15: 'O_SENT',
 21: 'W_Nabellen offertes',
 11: 'O_CANCELLED',
 3: 'A_CANCELLED',
 18: 'W_Beoordelen fraude',
 16: 'O_SENT_BACK',
 22: 'W_Valideren aanvraag',
 20: 'W_Nabellen incomplete dossiers',
 10: 'O_ACCEPTED',
 2: 'A_APPROVED',
 1: 'A_ACTIVATED',
 8: 'A_REGISTERED',
 13: 'O_DECLINED',
 23: 'W_Wijzigen contractgegevens'}

In [12]:
# Calculate the weighted f1, precision and recall, plus the accuracy.
# The recorded run emitted sklearn's undefined-metric warning, i.e. at least one class has
# an undefined score. zero_division=0 scores such classes as 0 explicitly; this is the same
# value the default uses, without the warning.
f1 = f1_score(y_test, predictions, average='weighted', zero_division=0)
precision = precision_score(y_test, predictions, average='weighted', zero_division=0)
recall = recall_score(y_test, predictions, average='weighted', zero_division=0)
accuracy = accuracy_score(y_test, predictions)

print(f'F1: {f1:.5f}, Precision: {precision:.5f}, Recall: {recall:.5f}, Accuracy: {accuracy:.5f}')

F1: 0.64459, Precision: 0.63291, Recall: 0.68212, Accuracy: 0.68212


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [13]:
# Lookup table from the encoded target code back to the original activity label
mapping = {code: label for code, label in zip(df[target_column], original_target)}

In [15]:
# Predict a single test row through the shared helper rather than repeating the
# DMatrix boilerplate. The row index is named row_index so it is not confused with
# the 'position' feature / trace-position variable used by the rollout loop.
row_index = 100
single_instance_prediction = predict_first(X_test, row_index, ft)

# Translate both the predicted and the actual code back to activity names
print(f'Prediction: {mapping[single_instance_prediction[0]]},\nActual: {mapping[y_test.iloc[row_index]]}')

Prediction: O_SELECTED,
Actual: O_SELECTED
