In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, f1_score , make_scorer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, KBinsDiscretizer, PolynomialFeatures


In [2]:
df_2017 = pd.read_csv('BPI_2017_naive.csv')

  df_2017 = pd.read_csv('BPI_2017_naive.csv')


In [3]:
df_2017.columns

Index(['Unnamed: 0', 'Action', 'org:resource', 'concept:name', 'EventOrigin',
       'EventID', 'lifecycle:transition', 'time:timestamp', 'case:LoanGoal',
       'case:ApplicationType', 'case:concept:name', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'Selected', 'CreditScore', 'OfferedAmount', 'OfferID',
       'timestamp_difference', 'event_seq', 'time_to_next_event',
       'time_to_next_event_seconds', 'predicted_time_to_next_event_seconds',
       'elapsed_time_from_start', 'predicted_start_time', 'predicted_step'],
      dtype='object')

In [4]:
print(df_2017.shape)

(1201090, 28)


In [5]:
print('time_to_next_event_seconds' in df_2017.columns)

True


In [6]:
X = df_2017.drop('time_to_next_event_seconds', axis=1)
y = df_2017['time_to_next_event_seconds']
print(X.shape, y.shape)

(1201090, 27) (1201090,)


### finding columns for features

In [7]:
num_columns = df_2017.select_dtypes(include=[np.number]).columns
num_imputer = SimpleImputer(strategy='median')
df_2017[num_columns] = num_imputer.fit_transform(df_2017[num_columns])

# Impute missing values for categorical columns
cat_columns = df_2017.select_dtypes(include=['object', 'category']).columns
cat_imputer = SimpleImputer(strategy='most_frequent', fill_value='missing')
df_2017[cat_columns] = cat_imputer.fit_transform(df_2017[cat_columns])

# Encoding categorical variables
label_encoders = {}
for column in cat_columns:
    le = LabelEncoder()
    df_2017[column] = le.fit_transform(df_2017[column])
    label_encoders[column] = le

# Calculate correlation for numerical features
correlations = df_2017[num_columns].corrwith(df_2017['time_to_next_event_seconds']).sort_values(ascending=False)
print("Correlations with the target:\n", correlations)

# Prepare data for mutual information calculation
# Drop the target variable and ensure no NaN values exist
X = df_2017.drop(columns=['time_to_next_event_seconds'])
y = df_2017['time_to_next_event_seconds']

# Calculate mutual information
mi_scores = mutual_info_regression(X, y, random_state=0)
mi_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
print("\nMutual Information Scores:\n", mi_scores)


Correlations with the target:
 time_to_next_event_seconds              1.000000
predicted_time_to_next_event_seconds    0.240850
case:RequestedAmount                    0.004244
NumberOfTerms                          -0.000543
MonthlyCost                            -0.000624
OfferedAmount                          -0.000823
FirstWithdrawalAmount                  -0.000990
CreditScore                            -0.001973
event_seq                              -0.004017
Unnamed: 0                             -0.013705
dtype: float64

Mutual Information Scores:
 time_to_next_event                      5.847363
concept:name                            0.796160
predicted_time_to_next_event_seconds    0.657222
lifecycle:transition                    0.656644
Action                                  0.616599
timestamp_difference                    0.423684
elapsed_time_from_start                 0.374872
org:resource                            0.273443
Unnamed: 0                              0.2

### finding R^2 values

In [8]:
features = ['Action', 'timestamp_difference', 'elapsed_time_from_start', 'event_seq', 'lifecycle:transition', 'org:resource'] 
target = 'time_to_next_event_seconds'

X = df_2017[features]
y = df_2017[target]

# Split the data into training, validation, and test sets
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42) # 0.25 * 0.8 = 0.2

# Define the model pipeline
numeric_features = ['timestamp_difference', 'elapsed_time_from_start', 'event_seq']
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ])

model_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('regressor', LinearRegression())])

# Nested Cross-Validation for model evaluation
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

outer_scores = []
for train_idx, test_idx in outer_cv.split(X_train_val):
    X_train, X_test = X_train_val.iloc[train_idx], X_train_val.iloc[test_idx]
    y_train, y_test = y_train_val.iloc[train_idx], y_train_val.iloc[test_idx]
    
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    r2 = r2_score(y_test, y_pred)
    
    outer_scores.append(r2)

# Final model training and evaluation on the test set
model_pipeline.fit(X_train_val, y_train_val)
y_test_pred = model_pipeline.predict(X_test)

final_r2 = r2_score(y_test, y_test_pred)
print(f'Cross-validated R^2 scores: {outer_scores}')
print(f'Mean R^2 score: {np.mean(outer_scores)}')
print(f'Final R^2 score on the test set: {final_r2}')


Cross-validated R^2 scores: [0.023319041932128082, 0.02298809262033963, 0.02182064144792739, 0.0229912704934937, 0.022809650943973914]
Mean R^2 score: 0.022785739487572544
Final R^2 score on the test set: 0.0228118398089725


In [9]:
# this one has better R^2 with cross validation
# Create a pipeline with feature transformation and a regressor

pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False),
    StandardScaler(),
    Ridge()
)

# Define parameter grid
param_grid = {
    'polynomialfeatures__degree': [1, 2, 3],
    'ridge__alpha': [0.1, 1, 10, 100]
}

# Nested CV with parameter tuning
inner_cv = KFold(n_splits=5, shuffle=True, random_state=42)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)

# GridSearchCV for hyperparameter tuning within inner CV
grid_search = GridSearchCV(pipeline, param_grid, cv=inner_cv, scoring='r2')

# Outer CV to evaluate model performance
outer_scores = cross_val_score(grid_search, X_train_val, y_train_val, cv=outer_cv)
final_r2 = outer_scores.mean()

print("Cross-validated scores:", outer_scores)
print("Final Cross-Validated R² Score:", final_r2)

Cross-validated scores: [0.16656406 0.16608336 0.16147583 0.16608973 0.16427455]
Final Cross-Validated R² Score: 0.16489750686523558


In [10]:
# Define a pipeline with preprocessing and model
pipeline = Pipeline([
    ('poly', PolynomialFeatures(include_bias=False)),  # exclude bias to avoid multicollinearity
    ('scaling', StandardScaler()),
    ('ridge', Ridge())
])

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'poly__degree': [1, 2, 3],  # Consider the degrees of polynomial features
    'ridge__alpha': np.logspace(-4, 4, 10)  # Use logspace for a broader range of alpha
}

# Configure the inner cross-validation procedure
cv_inner = KFold(n_splits=5, shuffle=True, random_state=42)

# Configure GridSearchCV for model selection and hyperparameter tuning
grid_search = GridSearchCV(pipeline, param_grid, cv=cv_inner, scoring='r2', n_jobs=-1)

# Fit GridSearchCV on the training+validation set
grid_search.fit(X_train_val, y_train_val)

# Output the best parameters and the corresponding score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation R² Score:", grid_search.best_score_)

# Retrain the model on the entire training+validation set using the best parameters
best_model = grid_search.best_estimator_

# Predict on the test set using the best model
y_pred_test = best_model.predict(X_test)

# Evaluate the final model on the test set using R² score
final_r2_score = r2_score(y_test, y_pred_test)
print("Final R² Score on Test Set:", final_r2_score)


Best Parameters: {'poly__degree': 3, 'ridge__alpha': 0.3593813663804626}
Best Cross-Validation R² Score: 0.16489754509925583
Final R² Score on Test Set: 0.1645139534896637


In [14]:
# Train the model on the entire training + validation set
pipeline.fit(X_train_val, y_train_val)

# Make predictions on the entire dataset to get the 'predicted_time_to_next_event_seconds_LR' column
df_2017['predicted_time_to_next_event_seconds_LR'] = pipeline.predict(X)


df_2017.head(5)

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,OfferID,timestamp_difference,event_seq,time_to_next_event,time_to_next_event_seconds,predicted_time_to_next_event_seconds,elapsed_time_from_start,predicted_start_time,predicted_step,predicted_time_to_next_event_seconds_LR
0,0.0,0,0,4,0,233761,1,0,5,1,...,1,1,1.0,101273,0.05,0.04,0,112,1,-98361.86768
1,1.0,4,0,8,0,62626,1,1,5,1,...,1,47,2.0,101647,0.42,0.3,48,113,3,437186.950006
2,2.0,0,0,22,2,551964,3,2,5,1,...,1,421,3.0,170859,80.62,35.17,470,125,7,143331.423486
3,3.0,1,0,22,2,701704,6,3,5,1,...,1,73357,4.0,101236,0.01,35.17,33119,128,7,-5807.634161
4,4.0,0,0,21,2,630431,3,4,5,1,...,1,10,5.0,101235,0.01,0.01,33126,127,6,126610.622856


### previous: run libraries+ path first

In [7]:
# Preprocess and feature engineering
df_2017['time:timestamp'] = pd.to_datetime(df_2017['time:timestamp'])
df_2017['event_seq'] = df_2017.groupby('case:concept:name').cumcount() + 1
df_2017['time_to_next_event'] = df_2017.groupby('case:concept:name')['time:timestamp'].transform(lambda x: x.diff().shift(-1))
df_2017['time_to_next_event_seconds'] = df_2017['time_to_next_event'].dt.total_seconds().fillna(0)
df_2017['timestamp_numeric'] = (df_2017['time:timestamp'] - df_2017['time:timestamp'].min()).dt.total_seconds()

# Define features and target variable
features = ['Action', 'timestamp_numeric', 'event_seq', 'lifecycle:transition', 'org:resource']  # Adjusted feature names as necessary
X = df_2017[features]
y = df_2017['time_to_next_event_seconds']

# Preprocessing for numeric and categorical features
numeric_features = ['timestamp_numeric', 'event_seq']
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

categorical_features = ['Action', 'lifecycle:transition', 'org:resource']
categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)

In [8]:
# Feature Selection: SelectKBest to be integrated into the pipeline
feature_selection = SelectKBest(f_regression)

# Pipeline for preprocessing and regression
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selection),
    ('regressor', LinearRegression())
])

# Hyperparameters to tune
param_grid = {
    'feature_selection__k': [3, 5, 7],  # Test different numbers of features
    'regressor__fit_intercept': [True, False],
}

In [11]:
# Split the data
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)  # 0.25 * 0.8 = 0.2

# Nested CV for hyperparameter tuning
inner_cv = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_mean_squared_error')
outer_scores = cross_val_score(inner_cv, X, y, cv=3)

print(f"Nested CV score (mean squared error): {np.mean(outer_scores)}")

inner_cv.fit(X_train_val, y_train_val)

# Evaluate the model on the test set
y_test_pred = inner_cv.predict(X_test)
print(f"Test R^2: {r2_score(y_test, y_test_pred)}")
print(f"Test RMSE: {np.sqrt(mean_squared_error(y_test, y_test_pred))}")

Nested CV score (mean squared error): -284739947032.86066
Test R^2: 0.03395272338960176
Test RMSE: 526778.2146094837


In [12]:
# Predict on the full dataset and save the predictions
df_2017['predicted_time_to_next_event_seconds_LR'] = inner_cv.predict(X)
#df_2017 = df_2017.drop('timestamp_numeric', axis=1)

df_2017.head(20)

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,timestamp_difference,event_seq,time_to_next_event,time_to_next_event_seconds,predicted_time_to_next_event_seconds,elapsed_time_from_start,predicted_start_time,predicted_step,timestamp_numeric,predicted_time_to_next_event_seconds_LR
0,0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 09:51:15.304,Existing loan takeover,New credit,...,,1,0 days 00:00:00.048000,0.048,0.04,0 days 00:00:00,4 days 18:38:55.010316100,A_Create Application,0.0,104695.731061
1,1,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 09:51:15.352,Existing loan takeover,New credit,...,0 days 00:00:00.048000,2,0 days 00:00:00.422000,0.422,0.3,0 days 00:00:00.048000,4 days 18:50:11.306445745,A_Submitted,0.048,152350.505033
2,2,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 09:51:15.774,Existing loan takeover,New credit,...,0 days 00:00:00.422000,3,0 days 00:01:20.618000,80.618,35.17,0 days 00:00:00.470000,5 days 03:53:50.679851693,W_Handle leads,0.47,113383.403919
3,3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 09:52:36.392,Existing loan takeover,New credit,...,0 days 00:01:20.618000,4,0 days 00:00:00.011000,0.011,35.17,0 days 00:01:21.088000,5 days 16:10:11.014999397,W_Handle leads,81.088,2166.101394
4,4,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 09:52:36.403,Existing loan takeover,New credit,...,0 days 00:00:00.011000,5,0 days 00:00:00.010000,0.01,0.01,0 days 00:01:21.099000,5 days 08:22:12.629814592,W_Complete application,81.099,113383.403919
5,5,statechange,User_1,A_Concept,Application,ApplState_642383566,complete,2016-01-01 09:52:36.413,Existing loan takeover,New credit,...,0 days 00:00:00.010000,6,4 days 23:34:08.079000,430448.079,179447.74,0 days 00:01:21.109000,5 days 05:51:35.263530324,A_Concept,81.109,152350.505033
6,6,Created,User_1,A_Create Application,Application,Application_1691306052,complete,2016-01-01 10:16:11.500,Home improvement,New credit,...,0 days 00:23:35.087000,1,0 days 00:00:00.049000,0.049,0.04,0 days 00:00:00,4 days 18:38:55.010316100,A_Create Application,1496.196,104695.731061
7,7,statechange,User_1,A_Submitted,Application,ApplState_284636842,complete,2016-01-01 10:16:11.549,Home improvement,New credit,...,0 days 00:00:00.049000,2,0 days 00:00:00.191000,0.191,0.3,0 days 00:00:00.049000,4 days 18:50:11.306445745,A_Submitted,1496.245,152350.505033
8,8,Created,User_1,W_Handle leads,Workflow,Workitem_831373279,schedule,2016-01-01 10:16:11.740,Home improvement,New credit,...,0 days 00:00:00.191000,3,0 days 00:01:19.833000,79.833,35.17,0 days 00:00:00.240000,5 days 03:53:50.679851693,W_Handle leads,1496.436,113383.403919
9,9,Deleted,User_1,W_Handle leads,Workflow,Workitem_1299098074,withdraw,2016-01-01 10:17:31.573,Home improvement,New credit,...,0 days 00:01:19.833000,4,0 days 00:00:00.011000,0.011,35.17,0 days 00:01:20.073000,5 days 16:10:11.014999397,W_Handle leads,1576.269,2166.101394
