In this Jupyter notebook we train a simple Random Forest model on the three data sets BPI Challenge 2012, 2017, and 2018. We are mainly interested in how well Random Forest performs on the three data sets, but we also want to observe how including more features and feeding more data to the model affects its performance.

### Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from aux_functions import split_data

Setup

In [None]:
# Notebook-wide settings.
# NOTE(review): presumably the train share of the split and the random seed —
# confirm against the cells that consume them.
proportion, rng = 0.8, 42

# BPI Challenge 2012 Data Set

Importing Data

In [None]:
# Read the preprocessed BPI Challenge 2012 CSV into a dataframe and show its
# first rows as the cell output.
df_2012 = pd.read_csv(
    filepath_or_buffer='data/preprocessed/BPI_Challenge_2012.csv',
)
df_2012.head()

Splitting the Data Set

In [None]:
# Split the 2012 data into a train and a test part.
# The ratio comes from `proportion` (defined in the Setup cell, value 0.8)
# rather than a repeated literal, so the split is configured in one place.
df_train_2012, df_test_2012 = split_data(df_2012, ratio=proportion, report=True)

Training the Model

In [None]:
# Display the column labels of the 2012 dataframe.
df_2012.columns

In [None]:
# Column the model should predict, and the columns it predicts it from.
target_column_2012 = 'next_concept:name'
predictor_columns_2012 = [
    'position',
    'concept:name',
    'lifecycle:transition',
    'case:AMOUNT_REQ',
]

# Random forest classifier with a fixed seed (0), run as two parallel jobs.
clf_2012 = RandomForestClassifier(random_state=0, n_jobs=2)

In [None]:
# Implementing one-hot encoding on columns
# X_train_2012 = pd.get_dummies(df_train_2012[predictor_columns_2012])
# y_train_2012 = df_train_2012[target_column_2012]
# X_train_2012

In [None]:
# Label encoding (one integer code per distinct value) is used here instead of
# one-hot encoding. The single encoder is re-fitted on each predictor column in
# turn; the target column is taken unchanged.
le = LabelEncoder()
y_train_2012 = df_train_2012[target_column_2012]
X_train_2012 = df_train_2012[predictor_columns_2012].apply(
    lambda column: le.fit_transform(column)
)
X_train_2012

In [None]:
# Fit the forest: learn the relation between the encoded training features
# and the training labels.
clf_2012.fit(X=X_train_2012, y=y_train_2012)

Testing the Model

In [None]:
# evaluating the model
# X_test_2012 = pd.get_dummies(df_test_2012[predictor_columns_2012])
# y_test_2012 = df_test_2012[target_column_2012]
# y_pred_2012 = clf_2012.predict(X_test_2012)

In [None]:
# Encode the test predictors with the value -> code mapping of the TRAINING
# split. Re-fitting a LabelEncoder on the test split derives the codes from the
# test values alone, so the same value can end up with a different integer than
# the one the forest was trained on.
# The codes below follow LabelEncoder's scheme (position of the value in the
# sorted unique training values); values never seen in training become -1.
# NOTE(review): assumes the predictor columns contain no missing values — confirm.
X_test_2012 = pd.DataFrame(
    {
        column: pd.Categorical(
            df_test_2012[column],
            categories=np.sort(df_train_2012[column].unique()),
        ).codes
        for column in predictor_columns_2012
    },
    index=df_test_2012.index,
)
y_test_2012 = df_test_2012[target_column_2012]
y_pred_2012 = clf_2012.predict(X_test_2012)

Result

In [None]:
# Score the 2012 test predictions. The per-class f1 / precision / recall values
# are combined with average='weighted'.
accuracy = accuracy_score(y_test_2012, y_pred_2012)
f1 = f1_score(y_test_2012, y_pred_2012, average='weighted')
precision = precision_score(y_test_2012, y_pred_2012, average='weighted')
recall = recall_score(y_test_2012, y_pred_2012, average='weighted')

# Only the last expression of a cell is displayed, so the summary string is the
# single trailing expression (a bare tuple placed before it would never show).
f'f1: {f1}, precision: {precision}, recall: {recall}, accuracy: {accuracy}'

In [None]:
# Plot the confusion matrix as an annotated seaborn heatmap.
# confusion_matrix orders its rows/columns by the sorted labels that occur in
# y_true or y_pred, which is not necessarily clf_2012.classes_ (the labels seen
# during training). The label set is therefore computed once and used for both
# the matrix and the tick labels, so every row/column carries its own label.
labels_2012 = np.union1d(y_test_2012, y_pred_2012)
conf_mat = confusion_matrix(y_test_2012, y_pred_2012, labels=labels_2012)
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=labels_2012, yticklabels=labels_2012, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
plt.show()

In [None]:
# Remove every 2012 variable from the namespace.
del df_2012, df_train_2012, df_test_2012
del X_train_2012, y_train_2012
del X_test_2012, y_test_2012, y_pred_2012
del clf_2012, predictor_columns_2012, target_column_2012

# DO NOT RUN THE CODE BELOW

# BPI Challenge 2018

Loading the Data

In [None]:
# Read the preprocessed BPI Challenge 2018 CSV into a dataframe and show its
# first rows as the cell output.
df_2018 = pd.read_csv(
    filepath_or_buffer='data/preprocessed/BPI_Challenge_2018.csv',
)
df_2018.head()

In [None]:
# Store an integer-coded copy of 'concept:name' in a new column
# 'concept:name_encoded'; the original column is kept.
le = LabelEncoder().fit(df_2018['concept:name'])
df_2018['concept:name_encoded'] = le.transform(df_2018['concept:name'])
df_2018.head()

In [None]:
# Column groups of the 2018 data set.
# NOTE(review): judging by the variable names these are the boolean-valued and
# integer-valued columns respectively — confirm against the CSV.
bolean_columns = [
    'success',
    'case:young farmer', 'case:selected_random',
    'case:penalty_AJLP', 'case:penalty_BGKV', 'case:penalty_AUVP',
    'case:small farmer',
    'case:penalty_BGP', 'case:penalty_C16', 'case:penalty_BGK',
    'case:penalty_AVUVP', 'case:penalty_CC', 'case:penalty_AVJLP',
    'case:penalty_C9',
    'case:rejected', 'case:greening',
    'case:penalty_C4', 'case:penalty_AVGP', 'case:penalty_ABP',
    'case:penalty_B6', 'case:penalty_B4', 'case:penalty_B5',
    'case:penalty_AVBP', 'case:penalty_B2',
    'case:selected_risk',
    'case:penalty_B3',
    'case:selected_manually',
    'case:penalty_AGP', 'case:penalty_B16', 'case:penalty_GP1',
    'case:basic payment',
    'case:penalty_B5F', 'case:penalty_V5',
    'case:redistribution',
    'case:penalty_JLP6', 'case:penalty_JLP7', 'case:penalty_JLP5',
    'case:penalty_JLP2', 'case:penalty_JLP3', 'case:penalty_JLP1',
]

int_columns = [
    'case:cross_compliance',
    'case:area',
    'case:payment_actual0',
    'case:amount_applied0',
    'case:year',
    'case:number_parcels',
]

In [None]:
# Column to predict, and the raw predictor columns: three fixed columns
# followed by everything listed in int_columns.
target_column_2018 = 'next_concept:name'
predictor_columns_2018 = ['position', 'concept:name', 'success', *int_columns]

In [None]:
# One-hot encode the predictors once and reuse the result.
# pd.get_dummies passes non-categorical columns through unchanged, so (provided
# the int_columns are numeric) its output already contains them: appending
# int_columns again would list those features twice, and concatenating the
# whole get_dummies frame onto df_2018 would duplicate every pass-through
# column inside the dataframe.
dummies_2018 = pd.get_dummies(df_2018[predictor_columns_2018])

# Final feature list: pass-through predictors plus the generated indicator
# columns, each exactly once.
predictor_plus_onehot_columns_2018 = dummies_2018.columns.to_list()
print(predictor_plus_onehot_columns_2018)

# Add only the columns that df_2018 does not have yet, then remove the raw
# 'concept:name' column that the indicator columns replace.
new_columns_2018 = [c for c in dummies_2018.columns if c not in df_2018.columns]
df_2018 = pd.concat([df_2018, dummies_2018[new_columns_2018]], axis=1)
df_2018.drop(columns=['concept:name'], inplace=True)
del dummies_2018

In [None]:
# Split the 2018 dataframe into a train and a test part (ratio 0.65).
df_train_2018, df_test_2018 = split_data(
    df_2018,
    ratio=0.65,
    report=True,
)

In [None]:
# Random forest classifier for the 2018 data: seed 42, two parallel jobs.
clf_2018 = RandomForestClassifier(random_state=42, n_jobs=2)

In [None]:
# Name of the column the 2018 model is trained to predict.
target_column_2018 = "next_concept:name"

In [None]:
# Training labels and training feature matrix. The features are the columns
# named in predictor_plus_onehot_columns_2018, selected from the train split.
y_train_2018 = df_train_2018[target_column_2018]
X_train_2018 = df_train_2018.loc[:, predictor_plus_onehot_columns_2018]
X_train_2018

In [None]:
# Instead of one-hot encoding, let's use label encoding
# X_train_2018 = df_train_2018[predictor_columns_2018].apply(le.fit_transform)
# y_train_2018 = df_train_2018[target_column_2018]
# X_train_2018

In [None]:
# Fit the 2018 forest: learn the relation between the training features and
# the training labels.
clf_2018.fit(X=X_train_2018, y=y_train_2018)

In [None]:
# Show the distinct target labels of the test split. They are read straight
# from df_test_2018 because y_test_2018 is only assigned in a later cell, so
# referencing it here raises NameError when the notebook runs top to bottom.
df_test_2018[target_column_2018].unique()

In [None]:
# Evaluate the model on the test split.
# The test matrix must have exactly the feature columns the forest was fitted
# on, so it is selected with the same column list as X_train_2018. Calling
# pd.get_dummies on df_test_2018[predictor_columns_2018] does not work here:
# 'concept:name' has been dropped from df_2018 after encoding (KeyError), and
# encoding the test split on its own would only produce indicator columns for
# the values that happen to occur in it.
X_test_2018 = df_test_2018[predictor_plus_onehot_columns_2018]
y_test_2018 = df_test_2018[target_column_2018]
y_pred_2018 = clf_2018.predict(X_test_2018)

# Re-select the training matrix and labels (same selection as in the training
# cell).
X_train_2018 = df_train_2018[predictor_plus_onehot_columns_2018]
y_train_2018 = df_train_2018[target_column_2018]

In [None]:
# Instead of one-hot encoding, let's use label encoding
# X_test_2018 = df_test_2018[predictor_columns_2018].apply(le.fit_transform)
# y_test_2018 = df_test_2018[target_column_2018]
# y_pred_2018 = clf_2018.predict(X_test_2018)

In [None]:
# Score the 2018 test predictions. The per-class f1 / precision / recall values
# are combined with average='weighted'.
accuracy = accuracy_score(y_test_2018, y_pred_2018)
f1 = f1_score(y_test_2018, y_pred_2018, average='weighted')
precision = precision_score(y_test_2018, y_pred_2018, average='weighted')
recall = recall_score(y_test_2018, y_pred_2018, average='weighted')

# Only the last expression of a cell is displayed, so the summary string is the
# single trailing expression (a bare tuple placed before it would never show).
f'f1: {f1}, precision: {precision}, recall: {recall}, accuracy: {accuracy}'

In [None]:
# Rank the features of the fitted 2018 forest by importance, highest first.
importances = clf_2018.feature_importances_
# Standard deviation of each feature's importance over the individual trees.
std = np.asarray(
    [tree.feature_importances_ for tree in clf_2018.estimators_]
).std(axis=0)
# Feature positions sorted by descending importance.
indices = np.argsort(importances)[::-1]

# One line per feature: rank, column name, importance value.
print("Feature ranking:")
for rank, feature_idx in enumerate(indices, start=1):
    print(f"{rank}. feature {X_train_2018.columns[feature_idx]} ({importances[feature_idx]})")