Install required packages

In [None]:
pip install pycox 

Import libraries

In [None]:
import pandas as pd
import torch 
import torchtuples as tt 
from pycox.models import CoxPH
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from pycox.evaluation import EvalSurv

Data Pre-processing

In [None]:
# Load the event log from 'bpic.csv' (path is relative to the working directory)
data = pd.read_csv('bpic.csv')

In [None]:
# Rename the raw event-log columns to shorter, friendlier names
column_names = {
    'Unnamed: 0': 'Case_ID',
    'time:timestamp': 'Complete Timestamp',
    'concept:name': 'Activity',
    'case:ApplicationType': 'ApplicationType',
    'case:LoanGoal': 'LoanGoal',
    'case:RequestedAmount': 'RequestedAmount',
    'org:resource': 'user',
}
data = data.rename(columns=column_names)

In [None]:
# Format for DeepSurv: label = 1 => the event happens, label = 0 => the event doesn't happen.
# Every row is flagged as an observed event.
data['label'] = 1

# Drop the columns that are not used by the rest of the notebook
data = data.drop(columns=['EventID', 'OfferID', 'Case_ID'])

In [None]:
# Preview the first five rows
data.head(n=5)

In [None]:
# Sort the events by case ("case:concept:name") and, within each case, by timestamp.
#
# The CSV was read without date parsing, so the timestamp column is parsed here first;
# otherwise it would be ordered as text instead of chronologically.
data['Complete Timestamp'] = pd.to_datetime(data['Complete Timestamp'])

# ignore_index=True renumbers the rows 0..n-1 in sorted order, so later steps that
# combine label-aligned pandas objects with positional NumPy arrays stay row-consistent.
data = data.sort_values(by=["case:concept:name", "Complete Timestamp"], ignore_index=True)

In [None]:
# Re-arrange the dataframe: move the case identifier to column position 1
temp_col = data.pop('case:concept:name')    # removes the column and returns it
data.insert(1, 'case:concept:name', temp_col)

Feature Engineering

In [None]:
# Create a new feature 'duration': seconds elapsed since the previous event of the same case

data['Complete Timestamp'] = pd.to_datetime(data['Complete Timestamp'])

grouped_df = data.groupby('case:concept:name')

# Timestamp of the previous row within each case (NaT for the first row of a case)
previous_timestamp = grouped_df['Complete Timestamp'].shift(1)

# Difference to the previous timestamp, expressed in seconds
elapsed = data['Complete Timestamp'] - previous_timestamp
data['duration'] = elapsed.dt.total_seconds()

# The first event of a case has no predecessor, so its duration is set to 0
data['duration'] = data['duration'].fillna(0)

In [None]:


# Categorical features
cat_feats = [
    'Action',
    'Activity',
    'EventOrigin',
    'lifecycle:transition',
    'LoanGoal',
    'ApplicationType',
    'Accepted',
    'Selected',
]

# Numerical features
num_feats = [
    'RequestedAmount',
    'FirstWithdrawalAmount',
    'NumberOfTerms',
    'MonthlyCost',
    'CreditScore',
    'OfferedAmount',
    'duration',
]


In [None]:
# Fill missing values in the numerical features with 0

null_counts = data.isnull().sum()
null_columns = null_counts[null_counts > 0].index

# Numerical columns that actually contain missing values
num_null_columns = [col for col in null_columns if col in num_feats]

# Assign the filled columns back to the frame. Calling fillna(..., inplace=True) on
# `data[col]` is chained assignment: it works on an intermediate object, triggers a
# warning in recent pandas and leaves `data` untouched under Copy-on-Write.
if num_null_columns:
    data[num_null_columns] = data[num_null_columns].fillna(0)

In [None]:
# Encode the features to numerical representations

# One-hot encode the categorical features
one_hot_encoder = OneHotEncoder()
one_hot_features = one_hot_encoder.fit_transform(data[cat_feats])

# The above line returns a sparse matrix, so we convert it to a dense numpy array
one_hot_features = one_hot_features.toarray()

# Integer-encode the case identifier
label_encoder = LabelEncoder()
int_feature = label_encoder.fit_transform(data['case:concept:name'])

# Concatenate the one-hot encoded features with the numerical features.
# (An earlier concatenation with `int_feature` was removed: its result was
# overwritten by this line before it was ever used.)
final_features = np.concatenate((one_hot_features, data[num_feats].values), axis=1)

# final_features is now a numpy array holding the one-hot and numerical features


In [None]:
# Create a pandas DataFrame holding the encoded features under readable column names

# One column name per (categorical feature, category) pair, in the encoder's output order
one_hot_columns = [
    f"{col}_{c}"
    for col, categories in zip(cat_feats, one_hot_encoder.categories_)
    for c in categories
]

# `one_hot_features` and `int_feature` are plain arrays in the row order of `data`,
# so the numerical columns must be joined by row position as well. Assigning
# `data[num_feats]` directly would align on index labels instead, which pairs the
# wrong rows whenever `data` does not carry a 0..n-1 index.
df_final = pd.concat(
    [
        pd.Series(int_feature, name='case:concept:name'),           # integer-encoded case id, first column
        pd.DataFrame(one_hot_features, columns=one_hot_columns),    # one-hot encoded features
        data[num_feats].reset_index(drop=True),                     # numerical features, positional
    ],
    axis=1,
)

# Column order: 'case:concept:name', then the one-hot columns, then the numerical features
df_final = df_final[['case:concept:name'] + one_hot_columns + num_feats]


In [None]:
# 'duration' is the target variable: take it out of the feature table and keep its values
duration = df_final.pop('duration').values

In [None]:
# DeepSurv expects the target as a (duration, event) pair per row.

label = data['label']

# Two-column array: durations first, event flags second
tar = np.stack([duration, label], axis=1)
df_target = pd.DataFrame(tar, columns=['duration', 'event'])

In [None]:
# Split the data into train, validation and test groups.
#
# Every case ('case:concept:name') is split on its own, in row order: the first ~60% of
# its rows go to training, the next ~20% to validation and all remaining rows to test.
# Using the remainder for test means that no row is lost to rounding, and that a case too
# short for a 20% share does not end up with its whole history in the test set
# (a slice such as `group[-0:]` returns the entire group).

# Fresh 0..n-1 indexes so that features and targets can be matched row by row
x = df_final.reset_index(drop=True)
y = df_target.reset_index(drop=True)
assert len(x) == len(y), "features and targets must have the same number of rows"

grouped = x.groupby('case:concept:name')

# Per-case pieces of each feature set
x_train_list = []
x_val_list = []
x_test_list = []

for _, group in grouped:
    n = len(group)
    tr_size = int(n * 0.60)  # 60% for training
    vl_size = int(n * 0.20)  # 20% for validation; the rest is the test share

    x_train_list.append(group.iloc[:tr_size])
    x_val_list.append(group.iloc[tr_size:tr_size + vl_size])
    x_test_list.append(group.iloc[tr_size + vl_size:])

x_train = pd.concat(x_train_list)
x_val = pd.concat(x_val_list)
x_test = pd.concat(x_test_list)

# Pick the targets of exactly the rows selected above. Slicing `y` from its start
# (e.g. `y[:tr_size]`) would return rows unrelated to the current case.
y_train = y.loc[x_train.index]
y_val = y.loc[x_val.index]
y_test = y.loc[x_test.index]


In [None]:
# Discard the 'case:concept:name' column, as it's not a feature, per se.
#
# The column is dropped by name rather than by position: a positional slice
# (`iloc[:, 1:]`) silently removes a real feature when this cell is run a second
# time. With errors='ignore' a repeated run is a no-op.
x_train = x_train.drop(columns='case:concept:name', errors='ignore')
x_test = x_test.drop(columns='case:concept:name', errors='ignore')
x_val = x_val.drop(columns='case:concept:name', errors='ignore')

In [None]:
# DeepSurv requires arrays in 'float32': turn each feature table into a float32 NumPy array.
x_train, x_val, x_test = (
    features.values.astype('float32') for features in (x_train, x_val, x_test)
)

Format the data as required by DeepSurv.

In [None]:
def get_target(df):
    """Return the 'duration' and 'event' columns of `df` as a tuple of arrays."""
    return df['duration'].values, df['event'].values


y_train = get_target(y_train)
y_val = get_target(y_val)
y_test = get_target(y_test)

# Keep the test durations and events under their own names for the evaluation step
durations_test, events_test = y_test

In [None]:
# Convert both target arrays (durations and events) of every split to float32

y_train = tuple(arr.astype('float32') for arr in y_train)
y_val = tuple(arr.astype('float32') for arr in y_val)
y_test = tuple(arr.astype('float32') for arr in y_test)

In [None]:
# Validation features and targets bundled as one (input, target) pair for model.fit
val = x_val, y_val

Configuration & Modelling

In [None]:
# Configure the model, initialize it & fit the data

# Seed NumPy and PyTorch so the stochastic parts of a run can be repeated after
# Restart & Run All (bit-for-bit equality may still depend on the hardware used).
np.random.seed(1234)
_ = torch.manual_seed(123)

# Network settings passed to MLPVanilla below
in_features = x_train.shape[1]      # one input per feature column
num_nodes = [64, 64]
out_features = 1
batch_norm = True
dropout = 0.2
output_bias = False
batch_size=64
optimizer = tt.optim.AdamWR(decoupled_weight_decay=0.001, cycle_eta_multiplier=0.8,
                            cycle_multiplier=2)

net = tt.practical.MLPVanilla(in_features, num_nodes, out_features, batch_norm,
                              dropout, output_bias=output_bias)

model = CoxPH(net, optimizer)

# Training settings: at most `epochs` epochs, with an early-stopping callback
epochs = 100
callbacks = [tt.callbacks.EarlyStopping()]
verbose = True

# `val` is handed over as validation data; the returned training log is kept in `log`
log = model.fit(x_train, y_train, batch_size, epochs, callbacks, verbose,
                val_data=val, val_batch_size=batch_size)

In [None]:
# Predict survival estimates for the test set


# Baseline hazards are computed first and the return value is discarded.
# No data is passed, so presumably pycox falls back to the training data kept
# from `fit` — confirm against the pycox documentation.
_ = model.compute_baseline_hazards()
# Predictions for x_test, returned as a DataFrame (see the pycox docs for its exact layout)
surv = model.predict_surv_df(x_test)

In [None]:
# Plot the survival function of one prediction against time

N_TIME_POINTS = 80      # number of leading rows of `surv` to plot
SAMPLE_COLUMN = 50000   # preferred column of `surv` (presumably one column per test sample — confirm)

# Clamp the column position so that a smaller test set does not raise an IndexError
sample_column = min(SAMPLE_COLUMN, surv.shape[1] - 1)

surv.iloc[:N_TIME_POINTS, sample_column].plot()
plt.title(f'Predicted survival function (column {sample_column})')
plt.ylabel('S(t | x)')
_ = plt.xlabel('Time')

Evaluation

In [None]:
# Evaluate on (at most) the first `n_eval` predictions together with their durations and events
n_eval = 30000
ev = EvalSurv(
    surv.iloc[:, :n_eval],
    durations_test[:n_eval],
    events_test[:n_eval],
    censor_surv='km',
)

In [None]:
# Concordance score of the predictions held by `ev`
# (going by the method name, a time-dependent concordance index — see the pycox EvalSurv docs).
ev.concordance_td()