In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, f1_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.feature_selection import SelectKBest, f_regression


In [2]:
df_2012 = pd.read_csv('clean_BPI_2012.csv')

In [3]:
df_2012.columns

Index(['Unnamed: 0', 'org:resource', 'lifecycle:transition', 'concept:name',
       'time:timestamp', 'case:REG_DATE', 'case:concept:name',
       'case:AMOUNT_REQ', 'timestamp_difference'],
      dtype='object')

# 2012

In [4]:
# Step 1: Calculate new columns
df_2012['time:timestamp'] = pd.to_datetime(df_2012['time:timestamp'])
df_2012['event_seq'] = df_2012.groupby('case:concept:name').cumcount() + 1
df_2012['time_to_next_event'] = df_2012.groupby('case:concept:name')['time:timestamp'].transform(lambda x: x.diff().shift(-1))
df_2012['time_to_next_event_seconds'] = df_2012['time_to_next_event'].dt.total_seconds().fillna(0)


In [5]:
# Convert 'time:timestamp' to a numerical format
df_2012['timestamp_numeric'] = (df_2012['time:timestamp'] - df_2012['time:timestamp'].min()).dt.total_seconds()

# Define features and target variable
features = ['event_seq', 'case:concept:name', 'timestamp_numeric']
X = df_2012[features]
y = df_2012['time_to_next_event_seconds']

# Preprocessing for numeric and categorical features
numeric_features = ['event_seq', 'timestamp_numeric']
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_features = ['case:concept:name']
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Feature Selection: SelectKBest to be integrated into the pipeline
feature_selection = SelectKBest(f_regression)

# Pipeline for preprocessing and regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selection),
    ('regressor', LinearRegression())
])

# Hyperparameters to tune
param_grid = {
    'feature_selection__k': [3, 5, 7],  # Assuming you want to test different numbers of features
    'regressor__fit_intercept': [True, False],
}

# Split the data into training + validation, and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training + validation set into separate training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

In [6]:
# Nested CV for hyperparameter tuning
inner_cv = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
outer_scores = cross_val_score(inner_cv, X, y, cv=5)

print(f"Nested CV score (mean squared error): {np.mean(outer_scores)}")

# Train the final model on the entire training and validation set
inner_cv.fit(X_train_val, y_train_val)

# Evaluate the model on the test set
y_test_pred = inner_cv.predict(X_test)
print(f"Test R^2: {r2_score(y_test, y_test_pred)}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred))}")

# Convert y_test and y_pred to categorical data for Accuracy and F1 Score, if applicable
estimator = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
y_test_binned = estimator.fit_transform(y_test.values.reshape(-1, 1)).squeeze()
y_pred_binned = estimator.transform(y_test_pred.reshape(-1, 1)).squeeze()

accuracy = accuracy_score(y_test_binned, y_pred_binned)
f1 = f1_score(y_test_binned, y_pred_binned, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")


Nested CV score (mean squared error): -273983083024.36295
Test R^2: 0.003975776808128217
Test RMSE: 525586.032440032
Accuracy: 0.9789929771243172
F1 Score: 0.9686009605263263




In [7]:
# Predict on the full dataset and save the predictions
df_2012['predicted_time_to_next_event_seconds'] = inner_cv.predict(X)
df_2012 = df_2012.drop('timestamp_numeric', axis=1)

df_2012.head(20)

Unnamed: 0.1,Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,timestamp_difference,event_seq,time_to_next_event,time_to_next_event_seconds,predicted_time_to_next_event_seconds
0,0,112,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546,01:10:2011 00:38:44.546000,173688,20000,,1,0 days 00:00:00.334000,0.334,90813.118572
1,1,112,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880,01:10:2011 00:38:44.546000,173688,20000,0 days 00:00:00.334000,2,0 days 00:00:53.026000,53.026,90489.889103
2,2,112,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906,01:10:2011 00:38:44.546000,173688,20000,0 days 00:00:53.026000,3,0 days 00:00:00.969000,0.969,90166.215805
3,3,112,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875,01:10:2011 00:38:44.546000,173688,20000,0 days 00:00:00.969000,4,12 days 09:25:48.050000,1070748.05,89842.980989
4,26,112,COMPLETE,A_SUBMITTED,2011-10-01 08:08:58.256,01:10:2011 08:08:58.256000,173691,5000,0 days 07:29:19.381000,1,0 days 00:00:03.939000,3.939,90585.57952
5,27,112,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 08:09:02.195,01:10:2011 08:08:58.256000,173691,5000,0 days 00:00:03.939000,2,0 days 00:00:54.453000,54.453,90262.319687
6,28,112,COMPLETE,A_PREACCEPTED,2011-10-01 08:09:56.648,01:10:2011 08:08:58.256000,173691,5000,0 days 00:00:54.453000,3,0 days 00:00:02.930000,2.93,89938.634369
7,29,112,SCHEDULE,W_Completeren aanvraag,2011-10-01 08:09:59.578,01:10:2011 08:08:58.256000,173691,5000,0 days 00:00:02.930000,4,9 days 03:02:37.034000,788557.034,89615.383034
8,65,112,COMPLETE,A_SUBMITTED,2011-10-01 08:10:30.287,01:10:2011 08:10:30.287000,173694,7000,0 days 00:00:30.709000,1,0 days 00:00:00.304000,0.304,90584.804334
9,66,112,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 08:10:30.591,01:10:2011 08:10:30.287000,173694,7000,0 days 00:00:00.304000,2,0 days 00:00:42.435000,42.435,90261.575119


In [8]:
df_2012.to_csv('BPI_2012_LR.csv', index=False)