In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import random
random.random_state = 0

# # next command ensures that plots appear inside the notebook
# %matplotlib inline
# import matplotlib as mpl
# import matplotlib.pyplot as plt
# import seaborn as sns  # also improves the look of plots
# sns.set()
# plt.rcParams['figure.figsize'] = 10, 5  # default hor./vert. size of plots, in inches
# plt.rcParams['lines.markeredgewidth'] = 1  # to fix issue with seaborn box plots; needed after import seaborn

pd.set_option('display.max_rows', 1000)
pd.set_option("display.max_columns", 1000)

In [2]:
%%time
df_train = pd.read_csv('BPI_Challenge_2012-training.csv')
df_test = pd.read_csv('BPI_Challenge_2012-test.csv')

df_data = pd.concat([df_train, df_test])

df_data.drop(["eventID ","case REG_DATE"],axis=1,inplace=True)

df_data["next event"] = 0
df_data["prev event"] = 0

def next_event(data, lst):
    """function to add the next and previous event of a trace"""
    for i in lst:
        data.loc[data["case concept:name"] == i, "next event"] = data.loc[data["case concept:name"] == i, "event concept:name"].shift(-1)
    
        data.loc[data["case concept:name"] == i, "prev event"] = data.loc[data["case concept:name"] == i, "event concept:name"].shift(1)
    
    return data


next_event(df_data, df_data["case concept:name"].unique().tolist())
# Check a trace
#df_data.loc[df_data["case concept:name"] == df_data["case concept:name"].unique().tolist()[1261]];

df_encoded = df_data.copy()

df_encoded.iloc[-1] = 0

df_encoded.loc[47822, "next event"] = "A_SUBMITTED"

df_encoded.iloc[-1]

df_encoded.replace(0, np.nan, inplace=True)


from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

event_encoder = LabelEncoder()
time_of_day_encoder = OrdinalEncoder()

labels = df_encoded["event concept:name"].unique()
event_encoder.fit(labels)
df_encoded[["event concept:name", "next event", "prev event"]] = df_encoded[["event concept:name", "next event", "prev event"]].apply(event_encoder.fit_transform)

# Time of day implementation, not sure if correct.
df_encoded["time of day"] = df_encoded["event time:timestamp"].str.split(expand=True)[1]
time_of_day_encoder = OrdinalEncoder()
df_encoded["time of day"] = time_of_day_encoder.fit_transform(df_encoded[["time of day"]])

#df_encoded.drop(["event time:timestamp", "case concept:name"], axis=1, inplace=True)

df_encoded.dropna(inplace=True)

Wall time: 28.2 s


In [3]:
# New: Event lifecycle encoding
labels = df_encoded["event lifecycle:transition"].unique()
event_encoder.fit(labels)
df_encoded[["event lifecycle:transition"]] = df_encoded[["event lifecycle:transition"]].apply(event_encoder.fit_transform)


In [4]:
models = {}

## 31% test

0 = sklearn split 0.2

1 = 31-33% incorrect

2 = 26-27.5% incorrect

3 = 20-21.5% incorrect

4 = fixed lol 21% /8

5 = 24.5-25.5% /6.5

In [5]:
# To confirm split is proper.
test = df_encoded[df_encoded["case concept:name"].isin(random.sample(df_encoded["case concept:name"].tolist(), round(len(df_encoded["case concept:name"].unique())/8)))]["case concept:name"].unique()

train = df_encoded[~df_encoded["case concept:name"].isin(test)]["case concept:name"].unique()

test.shape, train.shape, pd.Series(test).isin(train).sum()

((1460,), (11626,), 0)

In [6]:
len(df_encoded[df_encoded["case concept:name"].isin(random.sample(df_encoded["case concept:name"].tolist(), round(len(df_encoded["case concept:name"].unique())/8)))])/len(df_encoded)

0.21377789982760462

In [7]:
def add(lst,dct=models):
    """
    Adds a model's results and data to a dict.
    """
    dct[len(dct)+1] = lst

def random_trace_split(data):

    test = data[data["case concept:name"].isin(random.sample(data["case concept:name"].tolist(), round(len(data["case concept:name"].unique())/8)))]

    train = data[~data["case concept:name"].isin(test["case concept:name"].unique())]

    X_test = test[[
#                    "case AMOUNT_REQ", 
                   "event concept:name", "prev event", "time of day", "event lifecycle:transition"]]
    y_test = test["next event"].values

    X_train = train[[
#                     "case AMOUNT_REQ", 
                    "event concept:name", "prev event", "time of day", "event lifecycle:transition"]]
    y_train = train["next event"].values
    return X_train, X_test, y_train, y_test
    
    
def testRFC():
    """
    For testing a RFC by inputting n_estimators and max_depth, rest = default
    """
    testRFC = RandomForestClassifier(n_estimators=int(input("n_estimators: ")), max_depth=int(input("max_depth: ")), random_state=0)
    testRFC.fit(X_train, y_train)
    y_pred = testRFC.predict(X_test)
    print("MSE: ",mean_squared_error(y_test, y_pred))
    print("Accuracy: ",accuracy_score(y_test, y_pred))
    print("Precision: ",precision_score(y_test, y_pred, average="weighted", zero_division=0))
    print("Recall: ",recall_score(y_test, y_pred, average="weighted", zero_division=0))
    print("F1: ",f1_score(y_test, y_pred, average="weighted", zero_division=0))    

def nRFC(n_estimators=False, max_depth=False, min_samples_leaf=False, criterion=False,bootstrap=False, random_sampling=4):
    """
    Trains a new RFC model and adds its results and data to a dict.
    Input: as argument or as input, if no arguments are specified it automatically prompts an input box.
    """
    if not n_estimators:
        n_estimators=int(input("n_estimators: "))
    if not max_depth:
        max_depth=int(input("max_depth: "))
    if not min_samples_leaf:
        min_samples_leaf=int(input("min_samples_leaf: "))
    if not criterion:
        criterion=str(input("criterion: "))
    
    #bootstrap = bool(input("bootstrap(T/F): "))
    print("n_estimators: ", n_estimators)
    print("max_depth: ", max_depth)
    print("min_samples_leaf: ", min_samples_leaf)
    print("criterion: ", criterion)
    print("bootstrap: ", bootstrap)
    print("Random Sampling; ", random_sampling)
    
    newRFC = RandomForestClassifier(bootstrap=bootstrap,
                                    criterion=criterion,
                                    n_estimators=n_estimators, 
                                    max_depth=max_depth, 
                                    min_samples_leaf=min_samples_leaf,
                                    random_state=0
                                   )
    newRFC.fit(X_train, y_train)
    y_pred = newRFC.predict(X_test)
    
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)
    
    features = newRFC.feature_names_in_
    importance = newRFC.feature_importances_
    values = [acc, f1, n_estimators, max_depth, min_samples_leaf, bootstrap, criterion, 
              list(features), list(importance), random_sampling]
    print(values)
    add(values)
    
    
def save_models(data):
    """
    Run to save model data as csv
    """
    df_models_old = pd.read_csv("models.csv")
    df_models = pd.DataFrame.from_dict(models, columns = ["accuracy", "f1", "n_estimators", "max_depth", "min_samples_leaf", "bootstrap", "criterion", "predictors", "predictor importances", "Random Sampling"], orient="index")
    df_models = pd.concat([df_models_old, df_models])
#     df_models["Random Sampling"] = False
    df_models.drop_duplicates(subset=["accuracy", "f1", "max_depth", "n_estimators"], inplace=True)
    df_models.to_csv('models.csv', index=False)
    return df_models



df_models = pd.DataFrame.from_dict(models, columns = ["accuracy", "f1", "n_estimators", "max_depth", "min_samples_leaf", "bootstrap", "criterion", "predictors", "predictor importances", "Random Sampling"], orient="index")
df_models = save_models(df_models)

X_train, X_test, y_train, y_test = random_trace_split(df_encoded)

In [8]:
nRFC(40,10,2,"entropy", True)

n_estimators:  40
max_depth:  10
min_samples_leaf:  2
criterion:  entropy
bootstrap:  True
Random Sampling;  4
[0.8386295928500497, 0.821924604709998, 40, 10, 2, True, 'entropy', ['event concept:name', 'prev event', 'time of day', 'event lifecycle:transition'], [0.43892419526835197, 0.4518347302650027, 0.03134251807221229, 0.077898556394433], 4]


In [10]:
# Run to view and/or save all models
print(len(save_models(df_models)))
df = save_models(df_models).sort_values(by="accuracy", ascending=0)
# df[df["Random Sampling"] == 0]
df[~df["Random Sampling"].isin([1,2,3])]

155


Unnamed: 0,accuracy,f1,n_estimators,max_depth,min_samples_leaf,bootstrap,criterion,predictors,predictor importances,Random Sampling
149,0.839248,0.823701,40,10,2,True,entropy,"['event concept:name', 'prev event', 'time of ...","[0.4362121007723876, 0.45573194926429633, 0.02...",4
148,0.838908,0.821228,40,10,2,True,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.006328651553305071, 0.4426299785954487, 0.4...",4
154,0.83863,0.821925,40,10,2,True,entropy,"['event concept:name', 'prev event', 'time of ...","[0.43892419526835197, 0.4518347302650027, 0.03...",4
151,0.837251,0.822291,84,12,1,True,entropy,"['event concept:name', 'prev event', 'time of ...","[0.423798913541065, 0.45734259106998537, 0.040...",5
153,0.837231,0.823024,84,12,1,True,entropy,"['event concept:name', 'prev event', 'time of ...","[0.4269117673224626, 0.4559603637463897, 0.039...",4
152,0.83618,0.818684,84,10,1,True,entropy,"['event concept:name', 'prev event', 'time of ...","[0.41907638273627756, 0.4708935925347491, 0.03...",5
150,0.835868,0.817807,40,10,2,True,entropy,"['event concept:name', 'prev event', 'time of ...","[0.4373455164870468, 0.4526808781053216, 0.030...",5
126,0.833462,0.812425,40,10,2,True,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.005791232654363697, 0.4587869652322273, 0.5...",4
145,0.833168,0.812256,40,10,2,False,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.004671634827906434, 0.4459892595878514, 0.5...",4
143,0.833132,0.812273,43,10,1,True,entropy,"['case AMOUNT_REQ', 'event concept:name', 'pre...","[0.005701982742502893, 0.4487961320015824, 0.5...",4
