In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import datetime
import psutil
np.random.seed(42)
from sklearn.preprocessing import LabelEncoder
import random
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [13]:
# Load the two BPI Challenge 2017 event-log files and stack them into a single
# frame; the cells below derive features from, and re-split, this combined log.
train = pd.read_csv('BPI Challenge 2017-training.csv')
test = pd.read_csv('BPI Challenge 2017-test.csv')
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both files keep their own default 0-based row labels, so labels would repeat
# in `data` and any label-based lookup or alignment could hit several rows.
data = pd.concat([train, test], ignore_index=True)

In [14]:
# Per-case neighbour features, built by shifting columns within each
# 'case concept:name' group. Rows at the edge of a case have no neighbour and
# get a sentinel string instead of NaN.
# NOTE(review): shift() works on the current row order, so this assumes the
# rows of each case are already in chronological order - TODO confirm.

# Target: the event name on the following row of the same case.
data['next_event'] = (
    data.groupby('case concept:name')['event concept:name']
    .shift(-1)
    .fillna('LAST EVENT')
)

# Feature: the event name on the preceding row of the same case.
data['prev_event'] = (
    data.groupby('case concept:name')['event concept:name']
    .shift(1)
    .fillna('FIRST EVENT')
)

# Feature: the lifecycle transition on the preceding row of the same case.
data['prev_lifecycle'] = (
    data.groupby('case concept:name')['event lifecycle:transition']
    .shift(1)
    .fillna('FIRST EVENT')
)

In [16]:
def evaluate(model, test_features, test_labels):
    """Print accuracy, precision, recall and F1 for a fitted model; return accuracy.

    Parameters
    ----------
    model : object exposing ``predict(test_features)``.
    test_features : feature matrix passed to ``model.predict``.
    test_labels : true labels the predictions are compared against.

    Returns
    -------
    The accuracy score. Precision, recall and F1 are only printed.
    """
    predicted = model.predict(test_features)

    # Precision/recall/F1 share the same options: weighted averaging across
    # classes, and 0 (instead of a warning) where a score is undefined.
    metric_options = {"average": "weighted", "zero_division": 0}

    accuracy = accuracy_score(test_labels, predicted)
    precision = precision_score(test_labels, predicted, **metric_options)
    recall = recall_score(test_labels, predicted, **metric_options)
    f1 = f1_score(test_labels, predicted, **metric_options)

    print(f'The accuracy of the model is {accuracy}.')
    print(f'The precision of the model is {precision}, using weighted average.')
    print(f'The recall of the model is {recall}, using weighted average.')
    print(f'The f1-score of the model is {f1}, using weighted average.')

    return accuracy

In [17]:
def split(data, X, y, smpl_size):
    """Split features and labels into train/test parts without shuffling.

    Parameters
    ----------
    data : not used by this function; accepted so existing callers keep working.
    X : feature table.
    y : label Series (must support ``to_frame()``).
    smpl_size : forwarded to ``train_test_split`` as ``test_size``.

    Returns
    -------
    ``(X_train, X_test, y_train, y_test)`` where both label parts have been
    converted to single-column DataFrames.
    """
    # shuffle=False keeps the incoming row order. random_state is still passed,
    # although no shuffling is requested here.
    features_train, features_test, labels_train, labels_test = train_test_split(
        X, y, test_size=smpl_size, random_state=42, shuffle=False
    )
    return features_train, features_test, labels_train.to_frame(), labels_test.to_frame()

In [18]:
def encode(data, X_train, X_test, y_train, y_test):
    """Integer-encode the categorical feature columns of the train/test features.

    Two column pairs are encoded, each pair sharing one vocabulary so that a
    value gets the same code in the "current" and the "previous" column:

    * ``"event concept:name"`` / ``"prev_event"``
    * ``"event lifecycle:transition"`` / ``"prev_lifecycle"``

    Each vocabulary is built from BOTH columns of the pair in ``data``. Fitting
    on the ``prev_*`` column alone would miss any value that only ever occurs
    on the last row of a case (it is never the "previous" value of anything),
    and ``LabelEncoder.transform`` raises ``ValueError`` on unseen labels.

    Parameters
    ----------
    data : full frame containing all four columns; used only to build the
        vocabularies.
    X_train, X_test : feature frames containing the four columns. They are
        copied first, so the caller's objects are not modified.
    y_train, y_test : returned unchanged.

    Returns
    -------
    ``(X_train, X_test, y_train, y_test)`` with the four feature columns
    replaced by integer codes in the two returned feature frames.
    """
    # Work on copies: the inputs are typically slices of a larger frame, and
    # writing into them in place mutates caller state (and can trigger
    # pandas' SettingWithCopyWarning).
    X_train = X_train.copy()
    X_test = X_test.copy()

    column_pairs = (
        ["event concept:name", "prev_event"],
        ["event lifecycle:transition", "prev_lifecycle"],
    )
    for columns in column_pairs:
        # A fresh encoder per pair keeps the two vocabularies independent.
        encoder = LabelEncoder()
        encoder.fit(pd.unique(data[columns].to_numpy().ravel()))
        X_train[columns] = X_train[columns].apply(encoder.transform)
        X_test[columns] = X_test[columns].apply(encoder.transform)

    return X_train, X_test, y_train, y_test

In [19]:
# Model inputs: the current and previous event name and lifecycle transition.
FEATURE_COLUMNS = [
    "event concept:name",
    "prev_event",
    "event lifecycle:transition",
    'prev_lifecycle',
]
# Prediction target: the derived 'next_event' column.
TARGET_COLUMN = "next_event"

X = data[FEATURE_COLUMNS]
y = data[TARGET_COLUMN]

In [22]:
# Split with a test size of 0.3 (split() does not shuffle), then integer-encode
# the categorical feature columns of both parts.
X_train, X_test, y_train, y_test = encode(data, *split(data, X, y, 0.3))

In [21]:
# Baseline random forest with fixed hyperparameters and a fixed seed.
base_model = RandomForestClassifier(
    n_estimators=10,
    max_depth=160,
    bootstrap=True,
    criterion='entropy',
    random_state=42,
)
# split() returns y_train as a single-column DataFrame. Flatten it to 1-D so
# fit() receives labels of shape (n_samples,) rather than a column vector,
# which scikit-learn would convert with a DataConversionWarning.
base_model.fit(X_train, np.ravel(y_train))
base_accuracy = evaluate(base_model, X_test, y_test)

  


The accuracy of the model is 0.861353938799105.
The precision of the model is 0.8464720336601855, using weighted average.
The recall of the model is 0.861353938799105, using weighted average.
The f1-score of the model is 0.8455375185534018, using weighted average.
