In [2]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, f1_score

In [4]:
df_2017 = pd.read_csv('/Users/alexraudvee/Desktop/TU_e/year_2/Q3Y2_Process_Mining/process_mining_data/clean_BPI_2017.csv')

  df_2017 = pd.read_csv('/Users/alexraudvee/Desktop/TU_e/year_2/Q3Y2_Process_Mining/process_mining_data/clean_BPI_2017.csv')


In [6]:
df_2017.columns

Index(['Unnamed: 0', 'level_0', 'index', 'Action', 'org:resource',
       'concept:name', 'EventOrigin', 'EventID', 'lifecycle:transition',
       'time:timestamp', 'case:LoanGoal', 'case:ApplicationType',
       'case:concept:name', 'case:RequestedAmount', 'FirstWithdrawalAmount',
       'NumberOfTerms', 'Accepted', 'MonthlyCost', 'Selected', 'CreditScore',
       'OfferedAmount', 'OfferID', 'timestamp_difference'],
      dtype='object')

In [8]:
df_2017 = df_2017.drop(columns=['Unnamed: 0', 'level_0', 'index'])

# 2017

In [9]:
# Step 1: Calculate new columns
df_2017['time:timestamp'] = pd.to_datetime(df_2017['time:timestamp'])
df_2017['event_seq'] = df_2017.groupby('case:concept:name').cumcount() + 1
df_2017['time_to_next_event'] = df_2017.groupby('case:concept:name')['time:timestamp'].transform(lambda x: x.diff().shift(-1))
df_2017['time_to_next_event_seconds'] = df_2017['time_to_next_event'].dt.total_seconds().fillna(0)


In [10]:
# Convert 'time:timestamp' to a numerical format
df_2017['timestamp_numeric'] = (df_2017['time:timestamp'] - df_2017['time:timestamp'].min()).dt.total_seconds()

# Define features and target variable
features = ['event_seq', 'case:concept:name', 'timestamp_numeric']
X = df_2017[features]
y = df_2017['time_to_next_event_seconds']

# Preprocessing for numeric and categorical features
numeric_features = ['event_seq', 'timestamp_numeric']
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_features = ['case:concept:name']
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Feature Selection: SelectKBest to be integrated into the pipeline
feature_selection = SelectKBest(f_regression)

# Pipeline for preprocessing and regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selection),
    ('regressor', LinearRegression())
])

# Hyperparameters to tune
param_grid = {
    'feature_selection__k': [3, 5, 7],  # Assuming you want to test different numbers of features
    'regressor__fit_intercept': [True, False],
}

# Split the data into training + validation, and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split the training + validation set into separate training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

In [13]:
# Nested CV for hyperparameter tuning
inner_cv = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
outer_scores = cross_val_score(inner_cv, X, y, cv=3)

print(f"Nested CV score (mean squared error): {np.mean(outer_scores)}")

# Train the final model on the entire training and validation set
inner_cv.fit(X_train_val, y_train_val)

# Evaluate the model on the test set
y_test_pred = inner_cv.predict(X_test)
print(f"Test R^2: {r2_score(y_test, y_test_pred)}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred))}")

# Convert y_test and y_pred to categorical data for Accuracy and F1 Score, if applicable
estimator = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
y_test_binned = estimator.fit_transform(y_test.values.reshape(-1, 1)).squeeze()
y_pred_binned = estimator.transform(y_test_pred.reshape(-1, 1)).squeeze()

accuracy = accuracy_score(y_test_binned, y_pred_binned)
f1 = f1_score(y_test_binned, y_pred_binned, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"F1 Score: {f1}")

Nested CV score (mean squared error): -286087081084.1196
Test R^2: 0.0006287202500351308
Test RMSE: 535786.8470253097
Accuracy: 0.9990966538727323
F1 Score: 0.9986451849098417




In [12]:
# Predict on the full dataset and save the predictions
df_2017['predicted_time_to_next_event_seconds'] = inner_cv.predict(X)
df_2017 = df_2017.drop('timestamp_numeric', axis=1)

df_2017.head(20)

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,...,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,timestamp_difference,event_seq,time_to_next_event,time_to_next_event_seconds,predicted_time_to_next_event_seconds
0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 09:51:15.304,Existing loan takeover,New credit,Application_652823628,...,,,,,,,1,0 days 00:00:00.048000,0.048,39573.545485
1,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 09:51:15.352,Existing loan takeover,New credit,Application_652823628,...,,,,,,0 days 00:00:00.048000,2,0 days 00:00:00.422000,0.422,39573.545399
2,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 09:51:15.774,Existing loan takeover,New credit,Application_652823628,...,,,,,,0 days 00:00:00.422000,3,0 days 00:01:20.618000,80.618,39573.54465
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 09:52:36.392,Existing loan takeover,New credit,Application_652823628,...,,,,,,0 days 00:01:20.618000,4,0 days 00:00:00.011000,0.011,39573.401445
4,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 09:52:36.403,Existing loan takeover,New credit,Application_652823628,...,,,,,,0 days 00:00:00.011000,5,0 days 00:00:00.010000,0.01,39573.401425
5,statechange,User_1,A_Concept,Application,ApplState_642383566,complete,2016-01-01 09:52:36.413,Existing loan takeover,New credit,Application_652823628,...,,,,,,0 days 00:00:00.010000,6,4 days 23:34:08.079000,430448.079,39573.401407
6,Created,User_1,A_Create Application,Application,Application_1691306052,complete,2016-01-01 10:16:11.500,Home improvement,New credit,Application_1691306052,...,,,,,,0 days 00:23:35.087000,1,0 days 00:00:00.049000,0.049,39570.887728
7,statechange,User_1,A_Submitted,Application,ApplState_284636842,complete,2016-01-01 10:16:11.549,Home improvement,New credit,Application_1691306052,...,,,,,,0 days 00:00:00.049000,2,0 days 00:00:00.191000,0.191,39570.887641
8,Created,User_1,W_Handle leads,Workflow,Workitem_831373279,schedule,2016-01-01 10:16:11.740,Home improvement,New credit,Application_1691306052,...,,,,,,0 days 00:00:00.191000,3,0 days 00:01:19.833000,79.833,39570.887302
9,Deleted,User_1,W_Handle leads,Workflow,Workitem_1299098074,withdraw,2016-01-01 10:17:31.573,Home improvement,New credit,Application_1691306052,...,,,,,,0 days 00:01:19.833000,4,0 days 00:00:00.011000,0.011,39570.745491


In [8]:
df_2017.to_csv('BPI_2017_LR.csv', index=False)