# Baseline predictor
This baseline predicts the remaining time of an event as the average remaining time, computed on the training data, of all events at the same index (position) in their trace. For instance:
- 0 (i.e. first event) -> remaining time 20 days
- 1 (i.e. second event) -> remaining time 19 days
- etc.

In [19]:
# import basic libraries
import pandas as pd

# import machine learning libraries
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [20]:
# Read the one-hot encoded train/test features and their targets in one pass;
# the order of the paths matches the order of the names on the left-hand side.
X_train, y_train, X_test, y_test = map(
    pd.read_csv,
    (
        "data/generated/onehot/X_train.csv",
        "data/generated/onehot/y_train.csv",
        "data/generated/onehot/X_test.csv",
        "data/generated/onehot/y_test.csv",
    ),
)

In [21]:
# Put the features and the target side by side so the target can be grouped by
# a feature column. pd.concat(axis=1) aligns rows on the index; both frames come
# straight from read_csv without an index column, so they share the default
# 0..n-1 index and line up row by row.
train_df = pd.concat([X_train, y_train], axis=1)

# Baseline lookup table: event position in the trace -> mean remaining time of
# all training events at that position.
avg_remaining_time_prediction = (
    train_df.groupby('event_index_in_trace')['remaining_time']
    .mean()
    .to_dict()
)
avg_remaining_time_prediction

{0: 21.56490223789831,
 1: 21.564900766368048,
 2: 21.564897975498404,
 3: 21.53650286039639,
 4: 21.535334967293327,
 5: 21.460155108139098,
 6: 20.772059856437355,
 7: 20.742206537343307,
 8: 20.53616120930162,
 9: 20.42029045378137,
 10: 20.302678023086038,
 11: 20.230294037037474,
 12: 20.170058130075216,
 13: 19.287558644274746,
 14: 19.159300112500052,
 15: 17.350095809485005,
 16: 16.54838739869388,
 17: 15.362780415034418,
 18: 13.912131562282445,
 19: 12.382569008588646,
 20: 11.394884358980457,
 21: 10.181579582742136,
 22: 9.450740349748504,
 23: 8.657775090326615,
 24: 7.9906058390217085,
 25: 7.467446925731016,
 26: 7.148316784131735,
 27: 6.889234453321498,
 28: 6.614520577931787,
 29: 6.487127631464566,
 30: 6.265910386359027,
 31: 6.142512936869482,
 32: 5.902304294087206,
 33: 5.76717843810364,
 34: 5.6141192890342175,
 35: 5.531261156823756,
 36: 5.455929878263389,
 37: 5.360479738358569,
 38: 5.286282574879618,
 39: 5.24119531711402,
 40: 5.195516070322444,
 41: 5.18

## Make predictions

In [22]:
# Predict by looking up each test event's position in the trace in the baseline
# table. Series.map leaves NaN for positions that never occurred in the training
# data, and a NaN prediction makes the sklearn metrics below raise, so those
# events fall back to the overall mean remaining time of the training set.
X_test['y_pred'] = (
    X_test['event_index_in_trace']
    .map(avg_remaining_time_prediction)
    .fillna(y_train['remaining_time'].mean())
)
X_test.head()

Unnamed: 0.1,Unnamed: 0,case:concept:name,org:resource,lifecycle:transition,time:timestamp,case:RequestedAmount,event_index_in_trace,Action_Created,Action_Deleted,Action_Obtained,...,case:LoanGoal_Existing loan takeover,case:LoanGoal_Extra spending limit,case:LoanGoal_Home improvement,case:LoanGoal_Motorcycle,case:LoanGoal_Not speficied,"case:LoanGoal_Other, see explanation",case:LoanGoal_Remaining debt home,case:LoanGoal_Tax payments,case:LoanGoal_Unknown,y_pred
0,1080782,Application_796205430,User_54,complete,2016-11-22 09:22:17.274000+00:00,0.0,0,1,0,0,...,0,0,0,0,0,0,0,0,1,21.564902
1,1080783,Application_796205430,User_54,schedule,2016-11-22 09:22:17.285000+00:00,0.0,1,1,0,0,...,0,0,0,0,0,0,0,0,1,21.564901
2,1080784,Application_796205430,User_54,start,2016-11-22 09:22:17.288000+00:00,0.0,2,0,0,1,...,0,0,0,0,0,0,0,0,1,21.564898
3,1080785,Application_796205430,User_54,complete,2016-11-22 09:22:17.291000+00:00,0.0,3,0,0,0,...,0,0,0,0,0,0,0,0,1,21.536503
4,1080786,Application_796205430,User_54,complete,2016-11-22 09:24:43.370000+00:00,0.0,4,0,0,0,...,0,0,0,0,0,0,0,0,1,21.535335


## Calculate metrics
The MSE on the baseline is around 100. Taking the square root gives an RMSE of about 10, which means that the predictions are typically off by roughly 10 days.

The R2 score of approximately 0.3 means that roughly 30% of the variance in the target variable is explained by the baseline. Ideally, we want our predictor to have a higher R2 score, closer to 1.

In [23]:
# Mean squared error of the baseline predictions against the true remaining time
mean_squared_error(
    y_true=y_test["remaining_time"],
    y_pred=X_test["y_pred"],
)

100.9851377666187

In [24]:
# Coefficient of determination (R2) of the baseline predictions
r2_score(
    y_true=y_test["remaining_time"],
    y_pred=X_test["y_pred"],
)

0.29140523713312516