# Catboost predictor

In [71]:
# import basic libraries
import pandas as pd

# import machine learning libraries
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score, precision_score, f1_score

#import catboost
from catboost import CatBoostRegressor

In [72]:
#import the data
X_train = pd.read_csv("data/generated/new_documents/X_train.csv")
y_train = pd.read_csv("data/generated/new_documents/y_train.csv")
X_test = pd.read_csv("data/generated/new_documents/X_test.csv")
y_test = pd.read_csv("data/generated/new_documents/y_test.csv")

In [73]:
X_test.head(50)

Unnamed: 0.1,Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,event_index_in_trace
0,1080782,Created,User_54,A_Create Application,Application,Application_796205430,complete,2016-11-22 09:22:17.274000+00:00,Unknown,New credit,...,0.0,,,,,,,,,0
1,1080783,Created,User_54,A_Create ApplicationW_Complete application,Workflow,Workitem_1369226671,schedule,2016-11-22 09:22:17.285000+00:00,Unknown,New credit,...,0.0,,,,,,,,,1
2,1080784,Obtained,User_54,A_Create ApplicationW_Complete applicationW_Co...,Workflow,Workitem_1550461371,start,2016-11-22 09:22:17.288000+00:00,Unknown,New credit,...,0.0,,,,,,,,,2
3,1080785,statechange,User_54,A_Create ApplicationW_Complete applicationW_Co...,Application,ApplState_293815231,complete,2016-11-22 09:22:17.291000+00:00,Unknown,New credit,...,0.0,,,,,,,,,3
4,1080786,statechange,User_54,A_Create ApplicationW_Complete applicationW_Co...,Application,ApplState_1815775481,complete,2016-11-22 09:24:43.370000+00:00,Unknown,New credit,...,0.0,,,,,,,,,4
5,1080787,Created,User_54,A_Create ApplicationW_Complete applicationW_Co...,Offer,Offer_399297264,complete,2016-11-22 09:25:50.242000+00:00,Unknown,New credit,...,0.0,3081.61,18.0,False,426.13,False,0.0,7300.0,,5
6,1080788,statechange,User_54,A_Create ApplicationW_Complete applicationW_Co...,Offer,OfferState_1589699485,complete,2016-11-22 09:25:50.921000+00:00,Unknown,New credit,...,0.0,,,,,,,,Offer_399297264,6
7,1080789,statechange,User_54,A_Create ApplicationW_Complete applicationW_Co...,Offer,OfferState_2089218629,complete,2016-11-22 09:26:06.138000+00:00,Unknown,New credit,...,0.0,,,,,,,,Offer_399297264,7
8,1080790,Deleted,User_54,A_Create ApplicationW_Complete applicationW_Co...,Workflow,Workitem_225326741,complete,2016-11-22 09:26:06.151000+00:00,Unknown,New credit,...,0.0,,,,,,,,,8
9,1080791,Created,User_54,A_Create ApplicationW_Complete applicationW_Co...,Workflow,Workitem_216769643,schedule,2016-11-22 09:26:06.160000+00:00,Unknown,New credit,...,0.0,,,,,,,,,9


In [74]:
y_test = y_test['remaining_time']
y_train = y_train['remaining_time']

### Pre-process the data

In [75]:
# Convert timestamp to a pandas datetime object
X_train['timestamp'] = pd.to_datetime(X_train['time:timestamp'],format='ISO8601')

# Extract relevant features
X_train['year'] = X_train['timestamp'].dt.year
X_train['month'] = X_train['timestamp'].dt.month
X_train['day'] = X_train['timestamp'].dt.day
X_train['hour'] = X_train['timestamp'].dt.hour
X_train['minute'] = X_train['timestamp'].dt.minute
X_train['second'] = X_train['timestamp'].dt.second
X_train['microsecond'] = X_train['timestamp'].dt.microsecond 

# Drop the original timestamp column
X_train = X_train.drop(['time:timestamp', 'timestamp'], axis=1)

In [76]:
# Convert timestamp to a pandas datetime object
X_test['timestamp'] = pd.to_datetime(X_test['time:timestamp'],format='ISO8601')

# Extract relevant features
X_test['year'] = X_test['timestamp'].dt.year
X_test['month'] = X_test['timestamp'].dt.month
X_test['day'] = X_test['timestamp'].dt.day
X_test['hour'] = X_test['timestamp'].dt.hour
X_test['minute'] = X_test['timestamp'].dt.minute
X_test['second'] = X_test['timestamp'].dt.second
X_test['microsecond'] = X_test['timestamp'].dt.microsecond 

# Drop the original timestamp column
X_test = X_test.drop(['time:timestamp', 'timestamp'], axis=1)

In [77]:
# Drop the offer columns
X_train = X_train.drop(['FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted','MonthlyCost','Selected','CreditScore','OfferedAmount','OfferID','Unnamed: 0'], axis=1)
# Drop the offer columns
X_test = X_test.drop(['FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted','MonthlyCost','Selected','CreditScore','OfferedAmount','OfferID','Unnamed: 0'], axis=1)

In [78]:
#Drop some other unnecessary columns
#X_train = X_train.drop(['Action', 'EventID','case:ApplicationType','EventOrigin','lifecycle:transition'],axis=1)

#Drop some other unnecessary columns
#X_test = X_test.drop(['Action', 'EventID','case:ApplicationType','EventOrigin','lifecycle:transition'],axis=1)

In [79]:
#dropping the ID column
#X_train = X_train.drop(['case:concept:name'],axis=1)
#X_test = X_test.drop(['case:concept:name'],axis=1)

In [80]:
# Separate categorical and numeric features

categorical_features = ['org:resource','concept:name','case:LoanGoal','case:concept:name','Action', 'EventID','case:ApplicationType','EventOrigin','lifecycle:transition']
numeric_features = X_train.columns.difference(categorical_features)

# Convert categorical features to string
X_train[categorical_features] = X_train[categorical_features].astype(str)
X_test[categorical_features] = X_test[categorical_features].astype(str)

In [81]:
X_test.dtypes

Action                   object
org:resource             object
concept:name             object
EventOrigin              object
EventID                  object
lifecycle:transition     object
case:LoanGoal            object
case:ApplicationType     object
case:concept:name        object
case:RequestedAmount    float64
event_index_in_trace      int64
year                      int32
month                     int32
day                       int32
hour                      int32
minute                    int32
second                    int32
microsecond               int32
dtype: object

### Building the model

In [82]:
# Create a CatBoost regressor
model = CatBoostRegressor(iterations=500, depth=6, learning_rate=0.01, loss_function='RMSE')

In [83]:
# Train the model
model.fit(X_train, y_train, cat_features=categorical_features, eval_set=(X_test, y_test), early_stopping_rounds=10, verbose=10)

0:	learn: 13.4422899	test: 11.9293129	best: 11.9293129 (0)	total: 1.74s	remaining: 14m 28s
10:	learn: 12.7055358	test: 11.5892344	best: 11.5892344 (10)	total: 6.8s	remaining: 5m 2s
20:	learn: 12.0634996	test: 11.2879457	best: 11.2879457 (20)	total: 10.1s	remaining: 3m 51s
30:	learn: 11.4921115	test: 11.0133206	best: 11.0133206 (30)	total: 17.3s	remaining: 4m 21s
40:	learn: 10.9633044	test: 10.8057750	best: 10.8057750 (40)	total: 25.9s	remaining: 4m 49s
50:	learn: 10.4898112	test: 10.6544860	best: 10.6544860 (50)	total: 33.6s	remaining: 4m 55s
60:	learn: 10.0784772	test: 10.5144843	best: 10.5144843 (60)	total: 39.3s	remaining: 4m 42s
70:	learn: 9.7215891	test: 10.3991125	best: 10.3991125 (70)	total: 48.2s	remaining: 4m 51s
80:	learn: 9.3957588	test: 10.2548510	best: 10.2548510 (80)	total: 54.1s	remaining: 4m 39s
90:	learn: 9.1077357	test: 10.0802255	best: 10.0802255 (90)	total: 57.7s	remaining: 4m 19s
100:	learn: 8.8598645	test: 9.9405842	best: 9.9405842 (100)	total: 60s	remaining: 3m 5

<catboost.core.CatBoostRegressor at 0x1ad5949bb50>

In [84]:
# Make predictions on the test set
y_pred = model.predict(X_test)

### Evaluate the model

In [85]:
# MSE
mean_squared_error(y_test, y_pred)

91.87029750260531

In [86]:
# R2
r2_score(y_test, y_pred)

0.3553624512171861

In [88]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score

# Assuming y_test and y_pred are your actual and predicted values, respectively
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)
r2 = r2_score(y_test, y_pred)
explained_variance = explained_variance_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R²): {r2}")
print(f"Explained Variance Score: {explained_variance}")

Mean Squared Error (MSE): 91.87029750260531
Mean Absolute Error (MAE): 7.642977644476506
Root Mean Squared Error (RMSE): 9.584899451877694
R-squared (R²): 0.3553624512171861
Explained Variance Score: 0.3555169765838544


In [None]:
#Precision
precision = precision_score(y_test, y_pred, average='weighted')

In [None]:
#F1
f1 = f1_score(y_test, y_pred, average='weighted')

### Feature importance

In [None]:
import matplotlib.pyplot as plt

# Assuming 'model' is your trained CatBoostRegressor
feature_importance = model.get_feature_importance()

# Get feature names from the original DataFrame (X_train)
feature_names = X_train.columns

# Create a DataFrame to store feature importance values with their corresponding names
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

# Sort the DataFrame by importance in descending order
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Plot feature importance
plt.figure(figsize=(10, 6))
plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.xlabel('Feature Importance')
plt.ylabel('Feature')
plt.title('CatBoost Feature Importance')
plt.show()

In [None]:
X_train.head()