## Download file models.csv if you want to view different models' performance or add results to the file.

In [1]:
import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 10000)
pd.set_option("display.max_columns", 10000)

In [2]:
df_train = pd.read_csv('BPI_Challenge_2012-training.csv')
df_test = pd.read_csv('BPI_Challenge_2012-test.csv')

df_data = pd.concat([df_train, df_test])

df_data.drop(["eventID ","case REG_DATE", "event lifecycle:transition"],axis=1,inplace=True)

In [3]:
df_data.head(20)

Unnamed: 0,case concept:name,case AMOUNT_REQ,event concept:name,event time:timestamp
0,173688,20000,A_SUBMITTED,01-10-2011 00:38:44.546
1,173688,20000,A_PARTLYSUBMITTED,01-10-2011 00:38:44.880
2,173688,20000,A_PREACCEPTED,01-10-2011 00:39:37.906
3,173688,20000,W_Completeren aanvraag,01-10-2011 00:39:38.875
4,173691,5000,A_SUBMITTED,01-10-2011 08:08:58.256
5,173691,5000,A_PARTLYSUBMITTED,01-10-2011 08:09:02.195
6,173691,5000,A_PREACCEPTED,01-10-2011 08:09:56.648
7,173691,5000,W_Completeren aanvraag,01-10-2011 08:09:59.578
8,173694,7000,A_SUBMITTED,01-10-2011 08:10:30.287
9,173694,7000,A_PARTLYSUBMITTED,01-10-2011 08:10:30.591


In [4]:
df_data["next event"] = 0
df_data["prev event"] = 0

In [5]:
def next_event(data, lst):
    """function to add the next and previous event of a trace"""
    for i in lst:
        data.loc[data["case concept:name"] == i, "next event"] = data.loc[data["case concept:name"] == i, "event concept:name"].shift(-1)
    
        data.loc[data["case concept:name"] == i, "prev event"] = data.loc[data["case concept:name"] == i, "event concept:name"].shift(1)
    
    return data

In [6]:
%%time
next_event(df_data, df_data["case concept:name"].unique().tolist())

Wall time: 25.4 s


Unnamed: 0,case concept:name,case AMOUNT_REQ,event concept:name,event time:timestamp,next event,prev event
0,173688,20000,A_SUBMITTED,01-10-2011 00:38:44.546,A_PARTLYSUBMITTED,
1,173688,20000,A_PARTLYSUBMITTED,01-10-2011 00:38:44.880,A_PREACCEPTED,A_SUBMITTED
2,173688,20000,A_PREACCEPTED,01-10-2011 00:39:37.906,W_Completeren aanvraag,A_PARTLYSUBMITTED
3,173688,20000,W_Completeren aanvraag,01-10-2011 00:39:38.875,W_Completeren aanvraag,A_PREACCEPTED
4,173691,5000,A_SUBMITTED,01-10-2011 08:08:58.256,A_PARTLYSUBMITTED,
...,...,...,...,...,...,...
47818,213276,15000,W_Nabellen incomplete dossiers,14-03-2012 15:59:28.309,W_Nabellen incomplete dossiers,W_Nabellen incomplete dossiers
47819,213276,15000,W_Nabellen incomplete dossiers,14-03-2012 16:00:09.680,,W_Nabellen incomplete dossiers
47820,209595,13000,W_Nabellen offertes,14-03-2012 16:02:03.883,,W_Nabellen offertes
47821,211624,35000,W_Nabellen incomplete dossiers,14-03-2012 16:04:46.192,W_Nabellen incomplete dossiers,W_Nabellen incomplete dossiers


In [7]:
# Check a trace
df_data.loc[df_data["case concept:name"] == df_data["case concept:name"].unique().tolist()[1261]];

In [8]:
df_encoded = df_data.copy()

In [9]:
df_encoded.iloc[-1] = 0

df_encoded.loc[47822, "next event"] = "A_SUBMITTED"

df_encoded.iloc[-1]

df_encoded.replace(0, np.nan, inplace=True);

In [10]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

event_encoder = LabelEncoder()
time_of_day_encoder = OrdinalEncoder()

labels = df_encoded["event concept:name"].unique()
event_encoder.fit(labels)
df_encoded[["event concept:name", "next event", "prev event"]] = df_encoded[["event concept:name", "next event", "prev event"]].apply(event_encoder.fit_transform)

In [11]:
labels, len(labels);

In [12]:
#next event doesnt have A SUBMITTED.

print([i for i in df_encoded["prev event"].unique() if i not in df_encoded["next event"].unique()])

print(len(df_encoded["event concept:name"].unique()))
print(len(df_encoded["next event"].unique()))
print(len(df_encoded["prev event"].unique()))

[]
25
25
25


In [13]:
# Check an encoded trace
df_encoded.loc[df_encoded["case concept:name"] == df_encoded["case concept:name"].unique().tolist()[1261]]

Unnamed: 0,case concept:name,case AMOUNT_REQ,event concept:name,event time:timestamp,next event,prev event
17875,177651.0,50000.0,9,17-10-2011 20:35:46.307,6,24
17876,177651.0,50000.0,6,17-10-2011 20:35:46.726,7,9
17877,177651.0,50000.0,7,17-10-2011 20:36:25.612,19,6
17878,177651.0,50000.0,19,17-10-2011 20:36:26.008,19,7
17879,177651.0,50000.0,19,17-10-2011 20:36:39.880,0,19
17894,177651.0,50000.0,0,17-10-2011 20:41:33.475,14,19
17911,177651.0,50000.0,14,17-10-2011 20:44:43.085,5,0
17912,177651.0,50000.0,5,17-10-2011 20:44:43.085,12,14
17913,177651.0,50000.0,12,17-10-2011 20:44:44.269,15,5
17914,177651.0,50000.0,15,17-10-2011 20:44:44.308,21,12


In [14]:
# Time of day implementation, not sure if correct.
df_encoded["time of day"] = df_encoded["event time:timestamp"].str.split(expand=True)[1]
time_of_day_encoder = OrdinalEncoder()
df_encoded["time of day"] = time_of_day_encoder.fit_transform(df_encoded[["time of day"]])

In [15]:
df_encoded.drop(["event time:timestamp", "case concept:name"], axis=1, inplace=True)

In [16]:
df_encoded.dropna(inplace=True)

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


X_train, X_test, y_train, y_test = train_test_split(df_encoded[["case AMOUNT_REQ", "event concept:name", "prev event", "time of day"]], 
                                df_encoded["next event"].values, test_size=0.2, random_state=42, shuffle=False)

In [18]:
def trainRFC(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test):
    """
    Grid Search CV. Manually set the values and parameters to test in this function.
    """
    rfc = RandomForestClassifier()
    
    params_rfc = {"n_estimators":np.arange(5, 100)}
    
    
    #Remove cv for faster code
    rfc_gs = GridSearchCV(rfc, params_rfc, cv=5, scoring="accuracy")
    rfc_gs.fit(X_train, y_train)
    rfc_best = rfc_gs.best_estimator_
     
    # Check best n_neigbors value
    print(rfc_gs.best_params_)
    
    prediction = rfc_best.predict(X_test)

    print(classification_report(y_test, prediction))
    print(confusion_matrix(y_test, prediction))
    
    return rfc_best



models = {}
def add(lst,dct=models):
    """
    Adds a model's results and data to a dict.
    """
    dct[len(dct)+1] = lst


    
    
def testRFC():
    """
    For testing a RFC by inputting n_estimators and max_depth, rest = default
    """
    testRFC = RandomForestClassifier(n_estimators=int(input("n_estimators: ")), max_depth=int(input("max_depth: ")), random_state=0)
    testRFC.fit(X_train, y_train)
    y_pred = testRFC.predict(X_test)
    print("MSE: ",mean_squared_error(y_test, y_pred))
    print("Accuracy: ",accuracy_score(y_test, y_pred))
    print("Precision: ",precision_score(y_test, y_pred, average="weighted", zero_division=0))
    print("Recall: ",recall_score(y_test, y_pred, average="weighted", zero_division=0))
    print("F1: ",f1_score(y_test, y_pred, average="weighted", zero_division=0))    

def nRFC(n_estimators=False, max_depth=False, min_samples_leaf=False, criterion=False,bootstrap=False):
    """
    Trains a new RFC model and adds its results and data to a dict.
    Input: as argument or as input, if no arguments are specified it automatically prompts an input box.
    """
    if not n_estimators:
        n_estimators=int(input("n_estimators: "))
    if not max_depth:
        max_depth=int(input("max_depth: "))
    if not min_samples_leaf:
        min_samples_leaf=int(input("min_samples_leaf: "))
    if not criterion:
        criterion=str(input("criterion: "))
    
    #bootstrap = bool(input("bootstrap(T/F): "))
    print("n_estimators: ", n_estimators)
    print("max_depth: ", max_depth)
    print("min_samples_leaf: ", min_samples_leaf)
    print("criterion: ", criterion)
    print("bootstrap: ", bootstrap)
    
    
    newRFC = RandomForestClassifier(bootstrap=bootstrap,criterion=criterion,n_estimators=n_estimators, 
                                     max_depth=max_depth, min_samples_leaf=min_samples_leaf,random_state=0)
    newRFC.fit(X_train, y_train)
    y_pred = newRFC.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    
    features = newRFC.feature_names_in_
    importance = newRFC.feature_importances_
    values = [acc, f1, n_estimators, max_depth, min_samples_leaf, bootstrap, criterion, list(features), list(importance)]
    print(values)
    add(values)
    
    
def save_models(data):
    """
    Run to save model data as csv
    """
    df_models_old = pd.read_csv("models.csv")
    df_models = pd.DataFrame.from_dict(models, columns = ["accuracy", "f1", "n_estimators", "max_depth", "min_samples_leaf", "bootstrap", "criterion", "predictors", "predictor importances"], orient="index")
    df_models = pd.concat([df_models_old, df_models])
    df_models.drop_duplicates(subset=["accuracy", "f1"], inplace=True)
    df_models.to_csv('models.csv', index=False)
    return df_models



df_models = pd.DataFrame.from_dict(models, columns = ["accuracy", "f1", "n_estimators", "max_depth", "min_samples_leaf", "bootstrap", "criterion", "predictors", "predictor importances"], orient="index")
df_models = save_models(df_models)

In [19]:
df_models.head(5)

Unnamed: 0,accuracy,f1,n_estimators,max_depth,min_samples_leaf,bootstrap,criterion,predictors,predictor importances
0,0.814943,0.793232,12,12,4,True,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.008260270405499408, 0.4871278531101007, 0.4..."
1,0.815401,0.793864,90,12,4,True,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.007948105983414333, 0.4719249087041311, 0.4..."
2,0.815496,0.794006,75,12,4,True,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.007926367095683373, 0.4710246967483686, 0.4..."
3,0.814009,0.790918,5,12,4,False,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.00792509195269893, 0.5065354277371766, 0.44..."
4,0.814905,0.79277,10,12,4,False,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.006545774308586411, 0.5136902523239764, 0.4..."


In [20]:
nRFC(10,12,4,"entropy", False)

n_estimators:  10
max_depth:  12
min_samples_leaf:  4
criterion:  entropy
bootstrap:  False
[0.8149052214043251, 0.7927698646131112, 10, 12, 4, False, 'entropy', ['case AMOUNT_REQ', 'event concept:name', 'prev event', 'time of day'], [0.006545774308586411, 0.5136902523239764, 0.4412304066434768, 0.03853356672396044]]


In [21]:
#Can also be used empty
#nRFC()

In [22]:
%%time
# for i in [8,9,10,11]:
#     nRFC(90,i,4,"entropy", False)

Wall time: 0 ns


In [23]:
# Run to view and/or save all models
save_models(df_models).sort_values(by="accuracy", ascending=0)

Unnamed: 0,accuracy,f1,n_estimators,max_depth,min_samples_leaf,bootstrap,criterion,predictors,predictor importances
27,0.816011,0.794954,90,12,4,False,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.0058678741875894174, 0.4849924811231534, 0...."
29,0.815878,0.794861,110,12,4,False,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.005818044430765628, 0.48014713310653234, 0...."
31,0.815878,0.794782,130,12,4,False,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.00583357869257726, 0.4816857311007901, 0.47..."
30,0.81584,0.794746,120,12,4,False,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.005805415447788359, 0.479759968347332, 0.47..."
34,0.815744,0.794289,300,12,4,False,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.005891843013575961, 0.4740153998654865, 0.4..."
32,0.815725,0.794588,140,12,4,False,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.005849339081926089, 0.47895464397689935, 0...."
28,0.815706,0.794588,100,12,4,False,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.005862358003214306, 0.48059341115315596, 0...."
5,0.815687,0.79407,15,12,4,False,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.006309408148767006, 0.4796073560653351, 0.4..."
33,0.815649,0.79454,150,12,4,False,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.005852228187745282, 0.47984300859455725, 0...."
11,0.815535,0.794067,45,12,4,False,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.006017182390517102, 0.48009597763116724, 0...."
