In this notebook we train an XGBoost model on the BPI_Challenge_2012 dataset.

### Importing the libraries

In [None]:
import xgboost as xgb
import pandas as pd
from aux_functions import split_data
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

### Loading the data

In [None]:
# Read the preprocessed BPI_Challenge_2012 CSV into a DataFrame.
data_path = 'data/preprocessed/BPI_Challenge_2012.csv'
df = pd.read_csv(data_path)
# Notebook display: preview the first rows.
df.head()

### Specifying the columns

In [None]:
# Role of each column in the rest of the notebook.

# Columns that are left out of the feature matrix.
dropping_columns = [
    'time:timestamp',
    'case:REG_DATE',
    'case:concept:name',
    'next_timestamp',
]

# Columns passed to the model as numerical features.
# NOTE(review): 'org:resource' is treated as numerical here — confirm it is not
# an identifier that should be handled as a category.
numerical_columns = [
    'org:resource',
    'case:AMOUNT_REQ',
    'position',
]

# Columns that are one-hot encoded before training.
categorical_columns = [
    'lifecycle:transition',
    'concept:name',
]

# Column holding the label the model predicts.
target_column = 'next_concept:name'

### Feature engineering

In [None]:
# Keep the raw labels so encoded predictions can be translated back later.
original_target = df[target_column].copy()

# Replace the target with its integer category codes.
# NOTE(review): pandas assigns code -1 to missing values — confirm the target
# column contains no NaNs before these codes are used as class labels.
target_as_category = df[target_column].astype('category')
df[target_column] = target_as_category.cat.codes

# One-hot encode the categorical feature columns.
df = pd.get_dummies(data=df, columns=categorical_columns)

# Every column that is not numerical, not the target and not dropped is
# treated as a one-hot encoded column.
non_dummy_columns = set(numerical_columns) | set(dropping_columns) | {target_column}
one_hot_encoded_columns = [col for col in df.columns if col not in non_dummy_columns]
# Notebook display: show the resulting column list.
one_hot_encoded_columns

### Splitting the dataset into train and test dataframes

In [None]:
# Split the frame into a train and a test part with the project helper.
# NOTE(review): split_data is defined in aux_functions; ratio=0.8 presumably
# means 80% of the data goes to training — confirm against the helper.
train_df, test_df = split_data(df, ratio=0.8, report=True)

# Model inputs: the numerical columns first, followed by the one-hot columns.
feature_columns = numerical_columns + one_hot_encoded_columns
X_train, y_train = train_df[feature_columns], train_df[target_column]
X_test, y_test = test_df[feature_columns], test_df[target_column]

# Per-column feature-type flags for XGBoost:
# 'q' for the numerical columns, 'c' for every other column.
numerical_lookup = set(numerical_columns)
ft = ['q' if name in numerical_lookup else 'c' for name in X_train.columns]

### Training the model

In [None]:
# Wrap the train and test splits in XGBoost DMatrix containers.
# Both matrices use the same per-column feature types and categorical handling.
train_dmatrix = xgb.DMatrix(
    data=X_train,
    label=y_train,
    feature_types=ft,
    enable_categorical=True,
)
test_dmatrix = xgb.DMatrix(
    data=X_test,
    label=y_test,
    feature_types=ft,
    enable_categorical=True,
)

In [None]:
# Training configuration for XGBoost.
# The number of classes is the number of distinct encoded target values in
# the full frame (train and test together).
target_classes = df[target_column].unique()
params = dict(
    eta=0.3,                      # learning rate (step-size shrinkage)
    objective='multi:softmax',    # multi-class objective
    num_class=len(target_classes),
    eval_metric='merror',         # multi-class error metric
)

In [None]:
# Fit a booster on the training matrix for 10 boosting rounds.
boosting_rounds = 10
xgb_model = xgb.train(params=params, dtrain=train_dmatrix, num_boost_round=boosting_rounds)

### Predicting the result and Evaluation

In [None]:
# Predict a class for every row of the test matrix with the trained booster.
predictions = xgb_model.predict(data=test_dmatrix)

In [None]:
# Test-set metrics. F1, precision and recall use the 'weighted' average;
# accuracy is computed over all test rows.
averaging = 'weighted'
f1 = f1_score(y_true=y_test, y_pred=predictions, average=averaging)
precision = precision_score(y_true=y_test, y_pred=predictions, average=averaging)
recall = recall_score(y_true=y_test, y_pred=predictions, average=averaging)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

summary = f'F1: {f1:.5f}, Precision: {precision:.5f}, Recall: {recall:.5f}, Accuracy: {accuracy:.5f}'
print(summary)

In [None]:
# Per-class precision / recall / F1 table for the test set.
report = classification_report(y_true=y_test, y_pred=predictions)
print(report)

In [None]:
# Score the model on the training set as well: comparing these numbers with
# the test-set metrics shows whether the model is overfitting.
# The results use train_-prefixed names (matching train_accuracy) so that the
# test-set f1 / precision / recall variables are not overwritten.
train_predictions = xgb_model.predict(train_dmatrix)
train_f1 = f1_score(y_train, train_predictions, average='weighted')
train_precision = precision_score(y_train, train_predictions, average='weighted')
train_recall = recall_score(y_train, train_predictions, average='weighted')
train_accuracy = accuracy_score(y_train, train_predictions)

print(f'Training F1: {train_f1:.5f}, Training Precision: {train_precision:.5f}, Training Recall: {train_recall:.5f}, Training Accuracy: {train_accuracy:.5f}')

In [None]:
# Build a lookup from encoded target value to original label.
# Rows are paired by position and reduced to the first occurrence of each
# code before the dictionary is built.
label_pairs = pd.DataFrame({
    'code': df[target_column].to_numpy(),
    'label': original_target.to_numpy(),
}).drop_duplicates('code')
mapping = dict(zip(label_pairs['code'], label_pairs['label']))

In [None]:
# Predict the label of a single test row and compare it with the true label.
position = 45

# Double brackets keep the row as a one-row DataFrame, so column names and
# dtypes reach XGBoost the same way as for the full test matrix (no
# object-dtype array, no separate feature_names argument).
single_instance = X_test.iloc[[position]]
single_instance_dmatrix = xgb.DMatrix(data=single_instance, feature_types=ft, enable_categorical=True)
single_instance_prediction = xgb_model.predict(single_instance_dmatrix)

# The prediction comes back as a float array; cast to int so the dictionary
# lookup uses the same key type as the encoded target values.
predicted_code = int(single_instance_prediction[0])
actual_code = int(y_test.iloc[position])
print(f'Prediction: {mapping[predicted_code]},\nActual: {mapping[actual_code]}')

In [None]:
# Predict the label of a second test row and compare it with the true label.
position = 600

# Double brackets keep the row as a one-row DataFrame, so column names and
# dtypes reach XGBoost the same way as for the full test matrix (no
# object-dtype array, no separate feature_names argument).
single_instance = X_test.iloc[[position]]
single_instance_dmatrix = xgb.DMatrix(data=single_instance, feature_types=ft, enable_categorical=True)
single_instance_prediction = xgb_model.predict(single_instance_dmatrix)

# The prediction comes back as a float array; cast to int so the dictionary
# lookup uses the same key type as the encoded target values.
predicted_code = int(single_instance_prediction[0])
actual_code = int(y_test.iloc[position])
print(f'Prediction: {mapping[predicted_code]},\nActual: {mapping[actual_code]}')

In [None]:
# Plot the 10 most important features, ranked by the 'weight' importance type.
from pathlib import Path

xgb.plot_importance(xgb_model, max_num_features=10, importance_type='weight')
fig = plt.gcf()
fig.set_size_inches(20, 20)

# The larger default font only applies to text created after this point
# (the title below). rcParams is global, so it also affects later figures.
plt.rcParams.update({'font.size': 18})
plt.ylabel('Feature Names', fontsize=15)
plt.xlabel('Feature Importance', fontsize=15)
plt.yticks(fontsize=12)
plt.xticks(fontsize=12)
plt.title('Contribution of each Feature to the Model XGBoost 2012')

# Lay the figure out only after every label and the title exist, so the
# title is included when the margins are computed.
plt.tight_layout()

# Make sure the output folder exists; savefig does not create it.
output_path = Path('figs/feature_importance_XGboost_2012.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path)
plt.show()