This is the Jupyter notebook in which we train the baseline model. The model we are using is a Decision Tree.

# Importing Libraries

In [None]:
# loading the packages
import pandas as pd
import pm4py
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Loading the Data

In [None]:
# Read the baseline dataset from the working directory and preview its first ten rows
baseline_csv = 'baseline_df.csv'
df = pd.read_csv(baseline_csv)
df.head(n=10)

In [None]:
# Display the column labels of the loaded dataframe
df.columns

# Choosing the features and the prediction target

In [None]:
# The only input column used by the baseline model (kept as a list so df[feature] is a DataFrame)
feature =['position']
# The label column the model is trained to predict
target = 'next_activity'

In [None]:
# Split the dataframe into the model inputs (X) and the labels to predict (y)
X, y = df[feature], df[target]

# Splitting the data for training the model

In [None]:
# Number of rows available before the train/test split
len(X)

In [None]:
# Hold out part of the rows for testing. The value is passed as `test_size`, i.e. it is the
# fraction of rows that ends up in the test set; the fixed seed keeps the split reproducible.
train_test_ratio = 0.2
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=train_test_ratio, random_state=42
)
f'The length of the train set: {len(X_train)} The length of the test set: {len(X_test)} with the ratio of: {train_test_ratio}'

# Creating and Training the Model

In [None]:
# Create a decision tree limited to a single split (max depth of 1).
# random_state is fixed because scikit-learn randomly permutes the features at each split,
# so the fitted tree is only guaranteed to be reproducible when a seed is given.
depth = 1
model = DecisionTreeClassifier(max_depth=depth, random_state=42)

In [None]:
# Train the decision tree on the training split (features X_train, labels y_train)
model.fit(X_train, y_train)

In [None]:
# Draw the fitted tree.
# class_names is passed as a plain list of strings in the order of model.classes_:
# handing over the raw NumPy array is rejected by some scikit-learn releases and
# breaks when the class labels are not strings.
plt.figure(figsize=(20,10))
plot_tree(
    model,
    filled=True,
    feature_names=feature,
    class_names=[str(label) for label in model.classes_],
)
plt.show()

# Predicting Labels

In [None]:
# Predict the next activity for every row of the test set.
# Only the feature columns are passed, so the cell can be re-run even if extra columns
# (e.g. actual/predicted labels) have been attached to X_test in the meantime.
y_pred = model.predict(X_test[feature])
y_pred

In [None]:
# Put the actual and the predicted labels next to the test features in a separate frame.
# X_test itself is left untouched: writing columns into it can trigger pandas'
# SettingWithCopyWarning (it is a slice produced by the split) and would make a re-run of
# model.predict(X_test) fail, because the model was fitted on the feature columns only.
# y_test is aligned on the index, y_pred (an array) is assigned positionally.
test_results = X_test.assign(
    Actual_Next_Activity=y_test,
    Predicted_Next_Activity=y_pred,
)
test_results.head(30)

# Evaluating Model

In [None]:
# Accuracy, precision, recall and F1 of the model, computed separately (without the report).
# For single-label multiclass data the micro-averaged precision, recall and F1 are all equal
# to the accuracy, so on their own they add no information. The macro averages (unweighted
# mean over classes) are therefore reported as well.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='micro')
recall = recall_score(y_test, y_pred, average='micro')
f1 = f1_score(y_test, y_pred, average='micro')

# zero_division=0 scores classes that are never predicted as 0 without emitting a warning
macro_precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
macro_recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
macro_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

(
    f'Accuracy: {accuracy} Precision: {precision} Recall: {recall} F1: {f1} '
    f'Macro precision: {macro_precision} Macro recall: {macro_recall} Macro F1: {macro_f1}'
)

In [None]:
# Per-class precision/recall/F1 on the test set.
# zero_division=0 keeps the default score of 0 for classes the model never predicts but
# suppresses the UndefinedMetricWarning that would otherwise be emitted for them.
report = classification_report(y_test, y_pred, zero_division=0)
# Concatenate instead of passing two print arguments: the separator space would shift the
# report's header row out of alignment with the rows below it.
print("Classification Report:\n" + report)

## Evaluating the correctness of the Model

We need to see if the model is performing as intended

In [None]:
# Which distinct labels does the model actually predict on the test set?
predicted_labels = np.unique(y_pred)
f'The unique values of the predicted activities are: {predicted_labels}'

In [None]:
# For each position: the most frequent (modal) next activity and the number of labelled rows.
# The mode is used because `max` returns the largest label (alphabetically last for strings),
# not the dominant one. mode() is sorted, so ties resolve to the first label in sort order;
# .get(0) yields None for a position that has no non-null label at all.
(
    df.groupby('position')['next_activity']
    .agg(most_frequent=lambda labels: labels.mode().get(0), count='count')
    .head(20)
)

In [None]:
# Frequency of every next activity within each position (first 20 rows of the result)
next_activity_by_position = df.groupby('position')['next_activity']
next_activity_by_position.value_counts().head(20)

Now we check in how many positions each of these labels is the dominant one.

In [None]:
# Count in how many positions each label is the dominant (most frequent) next activity.
# The mode is used because `max` returns the largest label (alphabetically last for strings)
# rather than the most frequent one. Ties resolve to the first label in sort order, and
# positions without any non-null label yield None and are ignored by value_counts().
dominant_per_position = df.groupby('position')['next_activity'].agg(
    lambda labels: labels.mode().get(0)
)
dominant_per_position.value_counts()

In [None]:
# Overall frequency of each next activity across the whole dataframe
df['next_activity'].value_counts()