In [1]:
import numpy as np
import pandas as pd
import os
import psutil

np.random.seed(42)

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
plt.rcParams["figure.figsize"] = [16, 11]
plt.rcParams["figure.autolayout"] = True
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV


In [2]:
def data_split(df):
    """returns 10% of the data"""
    return df[: int((len(df)/10))]


def EventTime(data):
    
    for i in list(data["case concept:name"].unique()):
        data.loc[data["case concept:name"] == i, "nextTIME"] = data.loc[data["case concept:name"] == i, 
                                                                "event time:timestamp"].shift(-1)
    return data


def next_event(data, lst, case, nxt, name):
    """function to add the next event of a trace"""
    for i in lst:
        data.loc[data[case] == i, nxt] = data.loc[data[case] == i, name].shift(-1)
    return data

def prev_event(data, lst, case, prv, name):
    """function to add the next event of a trace"""
    for i in lst:
        data.loc[data[case] == i, prv] = data.loc[data[case] == i, name].shift(1)
    return data

In [3]:
df_train = pd.read_csv('BPI_Challenge_2012-training.csv')
df_test = pd.read_csv('BPI_Challenge_2012-test.csv')
df_data = pd.concat([df_train, df_test])

In [4]:
# split the data to 10% of total data, to help with runtime for demo
df_data = data_split(df_data)

pd.set_option('display.float_format', lambda x: '%.3f' % x)
unixTransform = lambda x: time.mktime(x.timetuple())
secondsTransform = lambda x: x.total_seconds()

df_data["timestamp"] = df_data["event time:timestamp"].copy()
df_data["event time:timestamp"] = pd.to_datetime(df_data["event time:timestamp"], dayfirst=True)

df_data.sort_values(by=['event time:timestamp'], inplace=True)
df_data.columns = df_data.columns.str.strip()

In [5]:
# assign long column names to variables for easier use
cases = "case concept:name"
reg_date = "case REG_DATE"
amount_req = "case AMOUNT_REQ"
event_name = "event concept:name"
lifecycle = "event lifecycle:transition"
tmstmp = "event time:timestamp"
nxt_event = "next event"
prv_event = 'previous event'
dtime = "delta time"

In [6]:
df_complete = df_data[df_data[lifecycle] == df_data[lifecycle].unique()[0]].copy()

In [7]:
lst_events = df_complete[cases].unique().tolist()

In [8]:
# add next events per trace
next_event(df_complete, lst_events, cases, nxt_event, event_name);

In [9]:
# add time till next event per trace
EventTime(df_complete);

In [10]:
prev_event(df_complete, lst_events, cases, prv_event, event_name);

In [11]:
df_processed = df_complete.drop([lifecycle], axis=1).copy()


In [12]:
event_encoder = LabelEncoder()
time_of_day_encoder = OrdinalEncoder()

labels = df_processed[event_name].unique()
event_encoder.fit(labels);

In [13]:
df_processed['time of day'] = df_processed["timestamp"].str.split(expand=True)[1]

In [14]:
df_processed[[event_name, nxt_event, prv_event]] = df_processed[[event_name, nxt_event, prv_event]].apply(event_encoder.fit_transform)
df_processed["time of day"] = time_of_day_encoder.fit_transform(df_processed[["time of day"]])

In [15]:
X = df_processed[[amount_req, event_name, prv_event]]
y = df_processed[nxt_event]

In [16]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, shuffle=False)

# Random Hyperparameter Grid

### https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [40]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [41]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)# Fit the random search model
rf_random.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, verbose=2)

In [42]:
rf_random.best_params_

{'n_estimators': 1600,
 'min_samples_split': 5,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': True}

In [22]:
def evaluate(model, test_features, test_labels):
    y_pred = model.predict(x_test)
    prec_score = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    rec_score = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    F1_score = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    acc_score = accuracy_score(y_test, y_pred)

    print(f'The accuracy of the model is {acc_score}.')
    print(f'The precision of the model is {prec_score}, using weighted average.')
    print(f'The recall of the model is {rec_score}, using weighted average.')
    print(f'The f1-score of the model is {F1_score}, using weighted average.')
    
    return acc_score

In [26]:
base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(x_train, y_train)
base_accuracy = evaluate(base_model, x_train, y_train)

The accuracy of the model is 0.6488563259471051.
The precision of the model is 0.6554693616434675, using weighted average.
The recall of the model is 0.6488563259471051, using weighted average.
The f1-score of the model is 0.6209804992585465, using weighted average.


In [66]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, x_test, y_test)

The accuracy of the model is 0.6588634739099357.
The precision of the model is 0.6662702949725373, using weighted average.
The recall of the model is 0.6588634739099357, using weighted average.
The f1-score of the model is 0.613446793399547, using weighted average.


In [69]:
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Improvement of 1.54%.


In [67]:
best_random.feature_importances_

array([0.0385619 , 0.45269283, 0.50874527])

# Grid Search with Cross Validation


In [28]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [1, 10, 15, 20],
    'max_features': [2, 3],
    'min_samples_leaf': [1, 2, 3],
    'min_samples_split': [3, 5, 10],
    'n_estimators': [100, 1400, 1600, 2000]
}# Create a based model
rf = RandomForestClassifier()# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [29]:
# Fit the grid search to the data
grid_search.fit(x_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 288 candidates, totalling 864 fits


{'bootstrap': True,
 'max_depth': 10,
 'max_features': 2,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 2000}

In [30]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, x_test, y_test)

The accuracy of the model is 0.6767333809864189.
The precision of the model is 0.6785113784560418, using weighted average.
The recall of the model is 0.6767333809864189, using weighted average.
The f1-score of the model is 0.6373846895148406, using weighted average.


In [31]:
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

Improvement of 4.30%.


In [76]:
best_grid.feature_importances_

array([0.05971821, 0.4454768 , 0.49480499])

# Grid V2


### https://towardsdatascience.com/optimizing-hyperparameters-in-random-forest-classification-ec7741f9d3f6

In [17]:
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [5, 8, 15, 25, 30],
    'max_features': [2, 3],
    'min_samples_leaf': [1, 2, 5, 10],
    'min_samples_split': [2, 5, 10, 15, 100],
    'n_estimators': [100, 300, 500, 800, 1200]
}
rf = RandomForestClassifier()# Instantiate the grid search model
grid_search2 = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [18]:
# Fit the grid search to the data
grid_search2.fit(x_train, y_train)
grid_search2.best_params_

Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


{'bootstrap': True,
 'max_depth': 8,
 'max_features': 2,
 'min_samples_leaf': 1,
 'min_samples_split': 15,
 'n_estimators': 1200}

In [24]:
best_grid2 = grid_search2.best_estimator_
grid_accuracy2 = evaluate(best_grid2, x_test, y_test)

The accuracy of the model is 0.672265904217298.
The precision of the model is 0.6751476704278561, using weighted average.
The recall of the model is 0.672265904217298, using weighted average.
The f1-score of the model is 0.6285485501309177, using weighted average.


In [27]:
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy2 - base_accuracy) / base_accuracy))

Improvement of 3.61%.


In [None]:
best_grid2.feature_importances_

### As we can see from the results the improvement is really small, considering the time it needs to run, I think it is not worth our time. 
"Hyperparameter tuning can be advantageous in creating a model that is better at classification. In the case of a random forest, it may not be necessary, as random forests are already very good at classification. "