In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import time
import datetime
import psutil
np.random.seed(42)
from sklearn.preprocessing import LabelEncoder
import random
random.random_state = 0
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

In [19]:
train = pd.read_csv('BPI Challenge 2017-training.csv')
test = pd.read_csv('BPI Challenge 2017-test.csv')
data = pd.concat([train, test])

In [20]:
# event shifting
data['next_event'] = data.groupby('case concept:name')['event concept:name'].shift(-1)
data["prev_event"] = data.groupby('case concept:name')['event concept:name'].shift(1)
data["prev_lifecycle"] = data.groupby('case concept:name')['event lifecycle:transition'].shift(1)

In [21]:
event_encoder = LabelEncoder()

In [22]:
def random_trace_split(data):

    test = data[data["case concept:name"].isin(random.sample(data["case concept:name"].tolist(), round(len(data["case concept:name"].unique())/8)))]

    train = data[~data["case concept:name"].isin(test["case concept:name"].unique())]

    labels_name = data["event concept:name"].unique()
    event_encoder.fit(labels_name)
    train[["event concept:name", "next_event", "prev_event"]] = train[["event concept:name", "next_event", "prev_event"]].apply(event_encoder.fit_transform)
    test[["event concept:name", "next_event", "prev_event"]] = test[["event concept:name", "next_event", "prev_event"]].apply(event_encoder.fit_transform)


    labels_lifecycle = data["event lifecycle:transition"].unique()
    event_encoder.fit(labels_lifecycle)
    train[["event lifecycle:transition", 'prev_lifecycle']] = train[["event lifecycle:transition", 'prev_lifecycle']].apply(event_encoder.fit_transform)
    test[["event lifecycle:transition", 'prev_lifecycle']] = test[["event lifecycle:transition", 'prev_lifecycle']].apply(event_encoder.fit_transform)

    X_test = test[[
#                    "case AMOUNT_REQ", 
                   "event concept:name", "prev_event", "event lifecycle:transition", 'prev_lifecycle']]
    y_test = test["next_event"].values

    X_train = train[[
#                     "case AMOUNT_REQ", 
                    "event concept:name", "prev_event", "event lifecycle:transition", 'prev_lifecycle']]
    y_train = train["next_event"].values
    return X_train, X_test, y_train, y_test

In [23]:
X_train, X_test, y_train, y_test = random_trace_split(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [25]:
max_features = ['auto', 'log2']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
criterion = ['gini', 'entropy']
random_grid = {'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion}

In [31]:
def evaluate(model, test_features, test_labels):
    y_pred = model.predict(X_test)
    prec_score = precision_score(y_test, y_pred, average="weighted", zero_division=0)
    rec_score = recall_score(y_test, y_pred, average="weighted", zero_division=0)
    F1_score = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    acc_score = accuracy_score(y_test, y_pred)

    print(f'The accuracy of the model is {acc_score}.')
    print(f'The precision of the model is {prec_score}, using weighted average.')
    print(f'The recall of the model is {rec_score}, using weighted average.')
    print(f'The f1-score of the model is {F1_score}, using weighted average.')
    
    return acc_score



In [35]:
base_model = RandomForestClassifier(n_estimators=10,random_state=42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_train, y_train)

The accuracy of the model is 0.8837560039006427.
The precision of the model is 0.8702997781928496, using weighted average.
The recall of the model is 0.8837560039006427, using weighted average.
The f1-score of the model is 0.8694969376601162, using weighted average.


In [None]:
#The accuracy of the model is 0.8837560039006427.
#The precision of the model is 0.8702997781928496, using weighted average.
#The recall of the model is 0.8837560039006427, using weighted average.
#The f1-score of the model is 0.8694969376601162, using weighted average

In [36]:
base_model.feature_importances_

array([0.33421777, 0.27947688, 0.20062442, 0.18568092])

In [None]:
#array([0.33421777, 0.27947688, 0.20062442, 0.18568092])


In [45]:
rf = RandomForestClassifier(n_estimators=10,random_state=42)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, scoring='accuracy')# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=True, criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2; total time=   6.0s
[CV] END bootstrap=True, criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2; total time=   5.5s
[CV] END bootstrap=True, criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2; total time=   5.8s
[CV] END bootstrap=False, criterion=entropy, max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2; total time=   4.7s
[CV] END bootstrap=False, criterion=entropy, max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2; total time=   4.6s
[CV] END bootstrap=False, criterion=entropy, max_depth=50, max_features=auto, min_samples_leaf=1, min_samples_split=2; total time=   5.2s
[CV] END bootstrap=False, criterion=gini, max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=

RandomizedSearchCV(cv=3,
                   estimator=RandomForestClassifier(n_estimators=10,
                                                    random_state=42),
                   n_iter=100,
                   param_distributions={'bootstrap': [True, False],
                                        'criterion': ['gini', 'entropy'],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'log2'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10]},
                   random_state=42, scoring='accuracy', verbose=2)

In [52]:
results = pd.DataFrame(rf_random.cv_results_)

In [71]:
results.sort_values('rank_test_score', inplace=True) 

In [82]:
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_criterion,param_bootstrap,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
89,5.622437,0.308862,1.062451,0.026260,10,1,log2,80,gini,False,"{'min_samples_split': 10, 'min_samples_leaf': ...",0.889203,0.882747,0.862528,0.878159,0.011363,1
99,5.592264,0.274193,1.085084,0.036510,5,1,log2,,gini,False,"{'min_samples_split': 5, 'min_samples_leaf': 1...",0.889191,0.882738,0.862540,0.878156,0.011353,2
12,4.442567,0.224793,0.903339,0.013086,5,1,auto,,gini,False,"{'min_samples_split': 5, 'min_samples_leaf': 1...",0.889191,0.882738,0.862540,0.878156,0.011353,2
95,5.489658,0.055350,1.031860,0.017192,5,1,auto,60,gini,False,"{'min_samples_split': 5, 'min_samples_leaf': 1...",0.889191,0.882738,0.862540,0.878156,0.011353,2
13,4.797953,0.308969,1.104418,0.118015,10,2,log2,20,gini,False,"{'min_samples_split': 10, 'min_samples_leaf': ...",0.889214,0.882718,0.862531,0.878155,0.011361,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20,5.322373,0.028304,1.149573,0.005675,5,4,log2,10,entropy,False,"{'min_samples_split': 5, 'min_samples_leaf': 4...",0.888633,0.882374,0.862433,0.877813,0.011172,96
74,4.889700,0.039135,1.045399,0.009774,10,1,auto,10,entropy,False,"{'min_samples_split': 10, 'min_samples_leaf': ...",0.888731,0.882368,0.861684,0.877594,0.011547,97
21,5.539213,0.702590,1.086522,0.042307,10,1,log2,10,entropy,False,"{'min_samples_split': 10, 'min_samples_leaf': ...",0.888731,0.882368,0.861684,0.877594,0.011547,97
77,5.294918,0.207141,1.017829,0.007029,2,4,log2,10,gini,False,"{'min_samples_split': 2, 'min_samples_leaf': 4...",0.882455,0.875952,0.857623,0.872010,0.010514,99


In [54]:
rf_random.best_params_

{'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'log2',
 'max_depth': 80,
 'criterion': 'gini',
 'bootstrap': False}

In [None]:
#{'min_samples_split': 10,
# 'min_samples_leaf': 1,
# 'max_features': 'log2',
# 'max_depth': 80,
# 'criterion': 'gini',
# 'bootstrap': False}

In [55]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

The accuracy of the model is 0.8837196625096153.
The precision of the model is 0.870227155088374, using weighted average.
The recall of the model is 0.8837196625096153, using weighted average.
The f1-score of the model is 0.869462885411546, using weighted average.


In [None]:
#The accuracy of the model is 0.8837196625096153.
#The precision of the model is 0.870227155088374, using weighted average.
#The recall of the model is 0.8837196625096153, using weighted average.
#The f1-score of the model is 0.869462885411546, using weighted average.

In [56]:
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Improvement of -0.00%.


In [57]:
best_random.feature_importances_

array([0.34756019, 0.272267  , 0.20991309, 0.17025972])

In [None]:
#array([0.34756019, 0.272267  , 0.20991309, 0.17025972])

In [58]:
from sklearn.model_selection import GridSearchCV

In [60]:
param_grid = {
    'bootstrap': [True, False],
    'max_depth': [80],
    'max_features': ['auto', 'log2'],
    'min_samples_leaf': [1],
    'min_samples_split': [10],
    'criterion': ['gini', 'entropy'],
    'n_estimators': [10, 50, 100, 200]
}

rf = RandomForestClassifier()# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, scoring='accuracy')



In [61]:
grid_search.fit(X_train, y_train)
grid_search.best_params_

Fitting 3 folds for each of 32 candidates, totalling 96 fits


{'bootstrap': True,
 'criterion': 'gini',
 'max_depth': 80,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 100}

In [None]:
#{'bootstrap': True,
# 'criterion': 'gini',
# 'max_depth': 80,
# 'max_features': 'log2',
# 'min_samples_leaf': 1,
# 'min_samples_split': 10,
# 'n_estimators': 100}

In [62]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)

The accuracy of the model is 0.8837438901036335.
The precision of the model is 0.8702337060701957, using weighted average.
The recall of the model is 0.8837438901036335, using weighted average.
The f1-score of the model is 0.8694810225054729, using weighted average.


In [None]:
#The accuracy of the model is 0.8837438901036335.
#The precision of the model is 0.8702337060701957, using weighted average.
#The recall of the model is 0.8837438901036335, using weighted average.
#The f1-score of the model is 0.8694810225054729, using weighted average.

In [63]:
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

Improvement of -0.00%.


In [64]:
best_grid.feature_importances_

array([0.37183513, 0.23795614, 0.21980817, 0.17040056])

In [None]:
#array([0.37183513, 0.23795614, 0.21980817, 0.17040056])

In [65]:
results2 = pd.DataFrame(grid_search.cv_results_)

In [73]:
results2.sort_values('rank_test_score', inplace=True) 

In [81]:
results2

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bootstrap,param_criterion,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
6,125.946505,3.968314,18.941077,0.497318,True,gini,80,log2,1,10,100,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.889217,0.88275,0.862537,0.878168,0.011364,1
8,10.892582,0.170529,1.887396,0.02984,True,entropy,80,auto,1,10,10,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.889214,0.882724,0.86256,0.878166,0.011349,2
22,128.217235,2.757529,19.095713,0.78714,False,gini,80,log2,1,10,100,"{'bootstrap': False, 'criterion': 'gini', 'max...",0.889212,0.882732,0.862549,0.878164,0.011354,3
18,125.280601,2.656169,19.480678,0.452778,False,gini,80,auto,1,10,100,"{'bootstrap': False, 'criterion': 'gini', 'max...",0.889214,0.882738,0.862537,0.878163,0.011361,4
21,61.653585,1.689483,9.228623,0.245591,False,gini,80,log2,1,10,50,"{'bootstrap': False, 'criterion': 'gini', 'max...",0.8892,0.882732,0.862543,0.878158,0.011353,5
1,70.50205,2.22828,10.42244,0.702287,True,gini,80,auto,1,10,50,"{'bootstrap': True, 'criterion': 'gini', 'max_...",0.889209,0.882698,0.862566,0.878157,0.011341,6
26,118.63401,2.502314,18.975838,0.971527,False,entropy,80,auto,1,10,100,"{'bootstrap': False, 'criterion': 'entropy', '...",0.889223,0.882701,0.862549,0.878157,0.011354,6
15,200.244801,5.990138,36.137974,0.86526,True,entropy,80,log2,1,10,200,"{'bootstrap': True, 'criterion': 'entropy', 'm...",0.889223,0.882695,0.862551,0.878156,0.011352,8
27,245.315382,15.171571,50.878374,2.967358,False,entropy,80,auto,1,10,200,"{'bootstrap': False, 'criterion': 'entropy', '...",0.889223,0.882698,0.862543,0.878155,0.011356,9
25,57.455302,2.39967,8.853323,0.074098,False,entropy,80,auto,1,10,50,"{'bootstrap': False, 'criterion': 'entropy', '...",0.889223,0.882701,0.862534,0.878153,0.01136,10
