In [1]:
!pip3 install xgboost



In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from datetime import datetime, timedelta
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder, KBinsDiscretizer, PolynomialFeatures

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [5]:
df_2017 = pd.read_csv('../process_mining_data/BPI_2017_naive.csv')

FileNotFoundError: [Errno 2] No such file or directory: '../process_mining_data/BPI_2017_naive.csv'

In [3]:
df_2017.columns

Index(['Unnamed: 0', 'Action', 'org:resource', 'concept:name', 'EventOrigin',
       'EventID', 'lifecycle:transition', 'time:timestamp', 'case:LoanGoal',
       'case:ApplicationType', 'case:concept:name', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'Selected', 'CreditScore', 'OfferedAmount', 'OfferID',
       'timestamp_difference', 'event_seq', 'time_to_next_event',
       'time_to_next_event_seconds', 'predicted_time_to_next_event_seconds',
       'elapsed_time_from_start', 'predicted_start_time', 'predicted_step'],
      dtype='object')

### finding columns for features

In [4]:
num_columns = df_2017.select_dtypes(include=[np.number]).columns
num_imputer = SimpleImputer(strategy='median')
df_2017[num_columns] = num_imputer.fit_transform(df_2017[num_columns])

# Impute missing values for categorical columns
cat_columns = df_2017.select_dtypes(include=['object', 'category']).columns
cat_imputer = SimpleImputer(strategy='most_frequent', fill_value='missing')
df_2017[cat_columns] = cat_imputer.fit_transform(df_2017[cat_columns])

# Encoding categorical variables
label_encoders = {}
for column in cat_columns:
    le = LabelEncoder()
    df_2017[column] = le.fit_transform(df_2017[column])
    label_encoders[column] = le

# Calculate correlation for numerical features
correlations = df_2017[num_columns].corrwith(df_2017['time_to_next_event_seconds']).sort_values(ascending=False)
print("Correlations with the target:\n", correlations)

# Prepare df_2017 for mutual information calculation
# Drop the target variable and ensure no NaN values exist
X = df_2017.drop(columns=['time_to_next_event_seconds'])
y = df_2017['time_to_next_event_seconds']

# Calculate mutual information
mi_scores = mutual_info_regression(X, y, random_state=0)
mi_scores = pd.Series(mi_scores, index=X.columns).sort_values(ascending=False)
print("\nMutual Information Scores:\n", mi_scores)


Correlations with the target:
 time_to_next_event_seconds              1.000000
predicted_time_to_next_event_seconds    0.240850
case:RequestedAmount                    0.004244
NumberOfTerms                          -0.000543
MonthlyCost                            -0.000624
OfferedAmount                          -0.000823
FirstWithdrawalAmount                  -0.000990
CreditScore                            -0.001973
event_seq                              -0.004017
Unnamed: 0                             -0.013705
dtype: float64

Mutual Information Scores:
 time_to_next_event                      5.847363
concept:name                            0.796160
predicted_time_to_next_event_seconds    0.657222
lifecycle:transition                    0.656644
Action                                  0.616599
timestamp_difference                    0.423684
elapsed_time_from_start                 0.374872
org:resource                            0.273443
Unnamed: 0                              0.2

### Preprocessing

In [6]:
X = df_2017.drop(['case:concept:name', 'Unnamed: 0','concept:name', 'EventOrigin',
       'EventID', 'case:LoanGoal', 'case:ApplicationType', 'case:concept:name', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'Selected', 'CreditScore', 'OfferedAmount', 'OfferID', 'predicted_time_to_next_event_seconds'], axis=1)  # Drop target and non-feature columns
y = df_2017['time_to_next_event_seconds']  # Target for prediction

# Split data into training, validation, and test sets (60%, 20%, 20%)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)


In [7]:
# Preprocessing for numerical and categorical columns
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ])

# Fit the preprocessor on the training data and transform both training and validation data
X_train_transformed = preprocessor.fit_transform(X_train)
X_val_transformed = preprocessor.transform(X_val)

# Initialize the model
model = XGBRegressor(objective='reg:squarederror')

# Fit the model using the transformed training set and validate using the transformed validation set
model.fit(X_train_transformed, y_train, eval_set=[(X_val_transformed, y_val)], early_stopping_rounds=10, verbose=False)

# Transform the test set and predict
X_test_transformed = preprocessor.transform(X_test)
y_pred_test = model.predict(X_test_transformed)

# Evaluate the model on the test set
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
print(f'Test RMSE: {rmse_test}')

r2_score_value = r2_score(y_test, y_pred_test)
print(f'R² score: {r2_score_value}')
# For predicting and adding new columns, you need to transform the whole dataset
X_transformed = preprocessor.transform(X)
df_2017['predicted_time_to_next_event_seconds_XG'] = model.predict(X_transformed)



Test RMSE: 21108.93365961263
R² score: 0.998448772988457


In [8]:
df_2017.head(20)

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,OfferID,timestamp_difference,event_seq,time_to_next_event,time_to_next_event_seconds,predicted_time_to_next_event_seconds,elapsed_time_from_start,predicted_start_time,predicted_step,predicted_time_to_next_event_seconds_XG
0,0.0,0,0,4,0,233761,1,0,5,1,...,1,1,1.0,101273,0.05,0.04,0,112,1,-5.474391
1,1.0,4,0,8,0,62626,1,1,5,1,...,1,47,2.0,101647,0.42,0.3,48,113,3,11.20448
2,2.0,0,0,22,2,551964,3,2,5,1,...,1,421,3.0,170859,80.62,35.17,470,125,7,-5.474391
3,3.0,1,0,22,2,701704,6,3,5,1,...,1,73357,4.0,101236,0.01,35.17,33119,128,7,14.96282
4,4.0,0,0,21,2,630431,3,4,5,1,...,1,10,5.0,101235,0.01,0.01,33126,127,6,-5.474391
5,5.0,4,0,3,0,169679,1,5,5,1,...,1,9,6.0,422649,430448.08,179447.74,33133,126,0,428079.8
6,6.0,0,0,4,0,219156,1,6,7,1,...,1,295515,1.0,101274,0.05,0.04,0,112,1,53.6515
7,7.0,4,0,8,0,130913,1,7,7,1,...,1,48,2.0,101416,0.19,0.3,49,113,3,11.20448
8,8.0,0,0,22,2,1134767,3,8,7,1,...,1,190,3.0,170276,79.83,35.17,240,125,7,-5.474391
9,9.0,1,0,22,2,552218,6,9,7,1,...,1,72734,4.0,101236,0.01,35.17,32485,128,7,14.96282


In [9]:
df_2017['predicted_time_to_next_event_seconds_XG'].head(20)

0    -5.474391e+00
1     1.120448e+01
2    -5.474391e+00
3     1.496282e+01
4    -5.474391e+00
5     4.280798e+05
6     5.365150e+01
7     1.120448e+01
8    -5.474391e+00
9     1.496282e+01
10   -5.474391e+00
11    4.280798e+05
12    5.365150e+01
13    1.120448e+01
14   -5.474391e+00
15    1.496282e+01
16   -5.474391e+00
17    1.059108e+06
18    5.365150e+01
19    1.120448e+01
Name: predicted_time_to_next_event_seconds_XG, dtype: float32

In [10]:
df_2017.to_csv('BPI_2017_naive_XG.csv', index=False)