# LSTM predictor - Frequency encoded
Implementation of the LSTM remaining-time predictor trained on the frequency-encoded event log.

In [34]:
# import basic libraries
import pandas as pd
import numpy as np

# import machine learning libraries
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# import deep learning libraries (Keras)
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.preprocessing.sequence import pad_sequences

In [35]:
# Read the four CSV splits (features and targets, train and test)
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/generated/frequency/X_train.csv",
        "data/generated/frequency/y_train.csv",
        "data/generated/frequency/X_test.csv",
        "data/generated/frequency/y_test.csv",
    )
)
# Preview the first three test events
X_test.head(3)

Unnamed: 0.1,Unnamed: 0,EventID,time:timestamp,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,MonthlyCost,Selected,CreditScore,...,EventOrigin_Workflow,lifecycle:transition_ate_abort,lifecycle:transition_complete,lifecycle:transition_resume,lifecycle:transition_schedule,lifecycle:transition_start,lifecycle:transition_suspend,lifecycle:transition_withdraw,case:ApplicationType_Limit raise,case:ApplicationType_New credit
0,1093289,Application_1000386745,2016-11-25 14:31:09.852000+00:00,Application_1000386745,5000.0,,,,,,...,0,0,1,0,0,0,0,0,0,1
1,1093290,ApplState_508603983,2016-11-25 14:31:11.088000+00:00,Application_1000386745,5000.0,,,,,,...,0,0,1,0,0,0,0,0,0,1
2,1093291,Workitem_1662375286,2016-11-25 14:31:11.309000+00:00,Application_1000386745,5000.0,,,,,,...,1,0,0,0,1,0,0,0,0,1


### Pre-process the data

In [36]:
# Carry the target next to the features so both can be grouped per case later on
X_train = X_train.assign(remaining_time=y_train['remaining_time'])
X_test = X_test.assign(remaining_time=y_test['remaining_time'])

In [37]:
# Replace every missing value with 0 (this also covers the attached remaining_time column)
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

In [38]:
# Columns removed from both splits before modelling
columns_2_drop = [
    'Unnamed: 0',
    'EventID',
    'Selected',
]

In [39]:
# Drop the columns listed in columns_2_drop, plus any other leftover
# 'Unnamed: ...' column. The X_test preview shows 'Unnamed: 0.1' next to
# 'Unnamed: 0'; it looks like a saved row index (0, 1, 2, ...) and must not
# be fed to the model as a feature.
def _drop_unused_columns(frame):
    """Return `frame` without columns_2_drop and without leftover 'Unnamed:' columns."""
    leftover_index_columns = [
        column for column in frame.columns
        if str(column).startswith('Unnamed:') and column not in columns_2_drop
    ]
    # columns_2_drop is still dropped strictly: a missing expected column raises KeyError.
    return frame.drop(columns=list(columns_2_drop) + leftover_index_columns)


X_test = _drop_unused_columns(X_test)
X_train = _drop_unused_columns(X_train)

In [40]:
# Turn the case names into integer ids, independently for each split.
# The single encoder is refit on every call, so it ends up fitted on the train split.
label_encoder = LabelEncoder()
test_case_ids = label_encoder.fit_transform(X_test['case:concept:name'])
train_case_ids = label_encoder.fit_transform(X_train['case:concept:name'])
X_test = X_test.assign(case_id=test_case_ids)
X_train = X_train.assign(case_id=train_case_ids)

In [41]:
# Order the events case by case and, within a case, by timestamp
# (the timestamps are compared as read from the CSV, without explicit datetime parsing)
sort_keys = ['case_id', 'time:timestamp']
X_test = X_test.sort_values(by=sort_keys)
X_train = X_train.sort_values(by=sort_keys)

In [42]:
# Build one (timesteps, features) sequence per case and pad every case to a
# common length, using the same code path for the test and the train split.
#
# Identifier, ordering and target columns are kept out of the model inputs:
# leaving 'remaining_time' among the features would hand the network its own
# target, and 'case_id' is an arbitrary label that is encoded separately per split.
NON_FEATURE_COLUMNS = ['case:concept:name', 'time:timestamp', 'case_id', 'remaining_time']

# Timesteps per padded case, shared by both splits so their shapes always agree.
MAX_SEQUENCE_LENGTH = 180


def build_padded_sequences(events, maxlen=MAX_SEQUENCE_LENGTH):
    """Convert a sorted event frame into padded per-case feature and target arrays.

    Parameters
    ----------
    events : pandas.DataFrame
        One row per event, already sorted by case and timestamp, containing
        'case_id', 'remaining_time' and the feature columns.
    maxlen : int
        Number of timesteps in the output. Shorter cases are zero-padded at
        the end; longer cases are truncated by pad_sequences (its default
        truncation removes the earliest events).

    Returns
    -------
    (features, targets) : tuple of float32 arrays
        features has shape (cases, maxlen, n_features),
        targets has shape (cases, maxlen).
    """
    feature_columns = [column for column in events.columns if column not in NON_FEATURE_COLUMNS]

    case_features = []
    case_targets = []
    # sort=False keeps the cases in order of first appearance; rows keep frame order.
    for _, case_events in events.groupby('case_id', sort=False):
        case_features.append(case_events[feature_columns].values)
        case_targets.append(case_events['remaining_time'].values)

    padded_features = pad_sequences(case_features, maxlen=maxlen, padding='post', dtype='float32')
    padded_targets = pad_sequences(case_targets, maxlen=maxlen, padding='post', dtype='float32')
    return padded_features, padded_targets


X_test, y_test = build_padded_sequences(X_test)
X_train, y_train = build_padded_sequences(X_train)

# The LSTM input layer needs identical timesteps and feature counts in both splits.
assert X_train.shape[1:] == X_test.shape[1:], (X_train.shape, X_test.shape)

In [44]:
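# Normalize the features using Min-Max scaling based on the training set
# NOTE: disabled. MinMaxScaler expects 2-D input, but X_train / X_test are
# 3-D padded arrays at this point, so no `scaler` object is created here.
#scaler = MinMaxScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)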

In [45]:
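# Reshape the data for LSTM input (originally written for a univariate time series)
# NOTE: disabled. The padded arrays are already 3-D (cases, timesteps, features),
# which is the layout passed to the LSTM layer, so no reshape is applied.
#X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
#X_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))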

In [51]:
# Number of timesteps per padded test case (second axis of X_test)
X_test.shape[1]

180

### Building the model

In [46]:
# Define the LSTM model.
#
# y_train holds one remaining-time target per event, shape (cases, timesteps),
# so the network has to emit one value per timestep. With the default
# return_sequences=False the single (cases, 1) output would be silently
# broadcast against every (mostly padded) target inside the MSE loss.
n_timesteps, n_features = X_train.shape[1], X_train.shape[2]

model = Sequential([
    # return_sequences=True -> one 50-dim hidden state per timestep
    LSTM(50, return_sequences=True, input_shape=(n_timesteps, n_features)),
    Dropout(0.1),
    # Dense acts on the last axis -> (cases, timesteps, 1), one prediction per event
    Dense(1),
])
# NOTE: zero-padded timesteps still contribute to the loss.
model.compile(optimizer='adam', loss='mean_squared_error')

In [47]:
# Fit for 10 epochs in mini-batches of 16; the test split is used as validation data
model.fit(
    x=X_train,
    y=y_train,
    batch_size=16,
    epochs=10,
    verbose=1,
    validation_data=(X_test, y_test),
)

Epoch 1/10

KeyboardInterrupt: 

In [None]:
# Run the trained network over every padded test case
y_pred = model.predict(x=X_test)

In [None]:
# No scaling was applied to the inputs or targets (the MinMaxScaler cell is
# commented out), so there is no `scaler` to invert and no separate
# `y_pred_normalized`: the predictions are already in the units of
# remaining_time. Only drop the trailing length-1 axis left by the final
# Dense(1) layer so the array lines up with the targets.
y_pred = np.asarray(y_pred)
if y_pred.ndim > 1 and y_pred.shape[-1] == 1:
    y_pred = np.squeeze(y_pred, axis=-1)

### Evaluate the model

In [None]:
# Mean squared error between the targets and the predictions
mean_squared_error(y_true=y_test, y_pred=y_pred)

In [None]:
# R2
# Flatten both arrays first: on 2-D inputs r2_score computes one R2 per column
# (here: per timestep) and averages them, which gives late, almost entirely
# padded timesteps the same weight as early ones. Flattening yields a single
# overall R2 across all events.
r2_score(np.ravel(y_test), np.ravel(y_pred))

In [None]:
# Compute the error metrics by hand from the element-wise residuals
residuals = y_test - y_pred
mse = np.mean(np.square(residuals))
mae = np.mean(np.abs(residuals))

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")

### Feature importance

In [None]:
import matplotlib.pyplot as plt

# Permutation feature importance for the trained Keras model.
#
# A Keras Sequential model has no get_feature_importance() (that is a CatBoost
# API), and X_train is a padded NumPy array here, so it has no .columns either.
# Instead, each feature is shuffled across cases and the resulting increase in
# test MSE is taken as its importance.
rng = np.random.default_rng(42)  # fixed seed so the ranking is reproducible

# Work on a private copy so X_test itself is never modified.
X_eval = np.array(X_test, dtype='float32', copy=True)
y_eval = np.asarray(y_test, dtype='float32')
n_cases, _, n_features = X_eval.shape


def _prediction_mse(features):
    """Return the MSE between the model's predictions for `features` and y_eval."""
    predictions = np.asarray(model.predict(features, verbose=0))
    # Collapse to 2-D: (cases, timesteps) for per-event outputs,
    # (cases, 1) when the model emits a single value per case.
    predictions = predictions.reshape(predictions.shape[0], -1)
    return float(np.mean((y_eval - predictions) ** 2))


baseline_mse = _prediction_mse(X_eval)

feature_importance = np.zeros(n_features)
for feature_idx in range(n_features):
    original_values = X_eval[:, :, feature_idx].copy()
    # Swap whole per-case sequences of this one feature between cases.
    X_eval[:, :, feature_idx] = original_values[rng.permutation(n_cases)]
    feature_importance[feature_idx] = _prediction_mse(X_eval) - baseline_mse
    # Restore the column before moving on to the next feature.
    X_eval[:, :, feature_idx] = original_values

# Column names are not available on the padded arrays, so features are
# labelled by their position along the last axis.
feature_names = [f'feature_{feature_idx}' for feature_idx in range(n_features)]

# Create a DataFrame to store feature importance values with their corresponding names
feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})

# Set the threshold for visibility (in units of MSE increase)
threshold = 0.01  # Adjust this threshold as needed

# Filter features based on the threshold
important_features = feature_importance_df[feature_importance_df['Importance'] >= threshold]

# Sort the DataFrame by importance in descending order
important_features = important_features.sort_values(by='Importance', ascending=False)

# Plot feature importance for only the relevant features
plt.figure(figsize=(10, 6))
plt.barh(important_features['Feature'], important_features['Importance'])
plt.xlabel('Increase in MSE after shuffling the feature')
plt.ylabel('Feature')
plt.title('LSTM Permutation Feature Importance (Above Threshold)')
plt.show()
