This Jupyter notebook conducts an in-depth analysis of the dataset, combining statistical insights with visualizations. Throughout the notebook, we present various visual representations and numerical summaries. Towards the end, we apply some changes to the DataFrame to make it suitable for training both a naive version and a completed version of the model. The final step saves the modified dataset in CSV format for further use.

## Importing Libraries 

In [None]:
# loading the packages
import pandas as pd
import pm4py
import numpy as np
import matplotlib.pyplot as plt

Importing the data (adjust the path below to match your own system)

In [None]:
# Load the BPI Challenge 2018 event log into a DataFrame.
# The XES log is converted to CSV once; every later run reuses the CSV, so
# nothing has to be commented in or out by hand between runs.
from pathlib import Path

XES_PATH = Path("data/extracted/BPI_Challenge_2018.xes")
CSV_PATH = Path("data/extracted/BPI_Challenge_2018.csv")

if not CSV_PATH.exists():
    # First run: parse the XES log and cache it as a CSV file.
    log = pm4py.read_xes(str(XES_PATH))
    pm4py.convert_to_dataframe(log).to_csv(CSV_PATH, index=False)

# Always read from the CSV so `df` is built the same way on every run.
df = pd.read_csv(CSV_PATH)
df.head(10)

## Short Summary of Data (All the datasets)

In [None]:
# Report the dataset dimensions.
n_rows, n_cols = df.shape
print(f'nr Rows: {n_rows}, nr Cols: {n_cols}\n')

# Show the column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would only append a stray "None" line.
df.info()

## Missing Values

In [None]:
# Number of missing (NaN) entries in every column
df.isna().sum()

In [None]:
# Share of missing values (in %) for every column that has at least one
# missing entry
null_counts = df.isnull().sum()
missing = null_counts[null_counts > 0] / df.shape[0] * 100
missing

Analysis of unique values for each attribute

In [None]:
# Print the number of distinct (non-null) values of every column
for col, n_distinct in df.nunique().items():
    print(f'{col}: {n_distinct}')

In [None]:
# checking how often each value of the 'org:resource' column occurs
df['org:resource'].value_counts()

In [None]:
# Distinct values that occur in the lifecycle:transition column
transitions = df['lifecycle:transition']
transitions.unique()

In [None]:
# Bar chart of how often each lifecycle:transition value occurs, followed
# by the same counts in text form
transition_counts = df['lifecycle:transition'].value_counts()

plt.figure(figsize=(10, 5))
transition_counts.plot(kind='bar')
plt.title('Freq of lifecycle:transition')
plt.show()

print(transition_counts)

In [None]:
# How often each activity (concept:name value) occurs
activities = df['concept:name']
activities.value_counts()

In [None]:
# Bar chart of the concept:name frequencies, with every bar annotated by
# its exact count
activity_counts = df['concept:name'].value_counts()

plt.figure(figsize=(20, 10))
activity_counts.plot(kind='bar')
for bar_pos, count in enumerate(activity_counts):
    # Put the label slightly above the top of the bar.
    plt.text(bar_pos, count + 0.2, str(count), ha='center', va='bottom')
plt.title('Freq of concept:name')
plt.show()

In [None]:
# Number of rows recorded for each case (case:concept:name value)
case_ids = df['case:concept:name']
case_ids.value_counts()

# Analysis specific to the BPI_Challenge_2018 dataset

In [None]:
# Share of missing values (in %) per column, limited to the columns that
# have at least one missing entry (same computation as in the
# "Missing Values" section)
total_rows = df.shape[0]
missing = df.isnull().sum()
missing = missing[missing > 0] / total_rows * 100
missing

In [None]:
# Collect every column whose non-null values all compare equal to True or
# False.
# NOTE(review): a column that is entirely NaN passes this check too (the
# test over an empty array is vacuously true), and numeric columns holding
# only 0/1 presumably pass as well since 0 == False and 1 == True - confirm
# that this is intended.
bool_cols = []
for col in df.columns:
    observed_values = df[col].dropna().unique()
    if np.isin(observed_values, [True, False]).all():
        bool_cols.append(col)
bool_cols, len(bool_cols)

In [None]:
# Copy of the frame without the columns collected in bool_cols
df_no_bool = df.drop(bool_cols, axis='columns')
df_no_bool.head()

In [None]:
# Listing the columns that remain after dropping the boolean columns
df_no_bool.columns

In [None]:
# isna() does not flag the literal string '0;n/a' in 'org:resource', so the
# affected rows are selected with an explicit comparison instead
is_placeholder = df_no_bool['org:resource'] == '0;n/a'
df_no_bool[is_placeholder]

In [None]:
# Turn the '0;n/a' placeholder into a real NaN so that isna() counts it,
# then report the share (in %) of rows without a resource
resource = df_no_bool['org:resource'].replace('0;n/a', np.nan)
df_no_bool['org:resource'] = resource
resource.isna().sum() / len(df_no_bool) * 100

In [None]:
# How often each value of the subprocess column occurs
subprocesses = df_no_bool['subprocess']
subprocesses.value_counts()

In [None]:
# Bar chart of the subprocess frequencies, with every bar annotated by its
# exact count
subprocess_counts = df_no_bool['subprocess'].value_counts()

plt.figure(figsize=(20, 10))
subprocess_counts.plot(kind='bar')
for bar_pos, count in enumerate(subprocess_counts):
    # Put the label slightly above the top of the bar.
    plt.text(bar_pos, count + 0.2, str(count), ha='center', va='bottom')
plt.title('Freq of subprocess')
plt.show()

## Analysis of the boolean columns

In [None]:
# Restrict the frame to the columns collected in bool_cols
df_bool = df.loc[:, bool_cols]

# Number of distinct non-null values in each of those columns
unique = df_bool.nunique()
unique

As shown above, all of the boolean columns are fully populated: none of them contains a NaN or otherwise missing value.

In [None]:
# For every case (all rows sharing one 'case:concept:name'), work out which
# columns hold exactly one distinct non-null value inside that case.
# nunique() ignores NaN, so a column that is entirely NaN within a case has
# zero distinct values and is recorded as non-constant.
groups = df.groupby(['case:concept:name'], as_index=False, sort=False)
dataframesGroupedList = [case_rows.reset_index(drop=True) for _, case_rows in groups]

# case id -> columns that are constant within that case
constantColumnsPerGroup = {}

# One entry per (case, column) pair; duplicates across cases are expected.
constantsColumnsTotal = []
inConstantsColumnsTotal = []

for dataframeGrouped in dataframesGroupedList:
    isConstant = {
        col: dataframeGrouped[col].nunique() == 1
        for col in dataframeGrouped.columns
    }
    constantColumns = [col for col, flag in isConstant.items() if flag]
    constantsColumnsTotal.extend(constantColumns)
    inConstantsColumnsTotal.extend(col for col, flag in isConstant.items() if not flag)

    # The index was reset above, so position 0 is the first row of the case.
    name = dataframeGrouped['case:concept:name'].iloc[0]
    constantColumnsPerGroup[name] = constantColumns

In [None]:
# Print each case id, then one "<case id> <column>" line per column that is
# constant within that case, then a separator
for constantColumnsKey, columnsOfCase in constantColumnsPerGroup.items():
    print(constantColumnsKey)
    for constantColumn in columnsOfCase:
        print(constantColumnsKey + ' ' + constantColumn)
    print('----------')

In [None]:
# Per-case constant-column analysis followed by its printout.
# NOTE(review): this cell recomputes and reprints exactly what the two
# preceding cells already produce - consider keeping only one copy.
groups = df.groupby(['case:concept:name'], as_index=False, sort=False)
dataframesGroupedList = [case_rows.reset_index(drop=True) for _, case_rows in groups]

# case id -> columns with exactly one distinct non-null value in that case
constantColumnsPerGroup = {}

# One entry per (case, column) pair; duplicates across cases are expected.
constantsColumnsTotal = []
inConstantsColumnsTotal = []

for dataframeGrouped in dataframesGroupedList:
    constantColumns = []
    for col in dataframeGrouped.columns:
        # nunique() ignores NaN, so an all-NaN column counts as non-constant.
        target = constantColumns if dataframeGrouped[col].nunique() == 1 else None
        if target is None:
            inConstantsColumnsTotal.append(col)
            continue
        target.append(col)
        constantsColumnsTotal.append(col)
    # The index was reset above, so position 0 is the first row of the case.
    name = dataframeGrouped['case:concept:name'].iloc[0]
    constantColumnsPerGroup[name] = constantColumns

# Print each case id, its constant columns, and a separator.
for constantColumnsKey, columnsOfCase in constantColumnsPerGroup.items():
    print(constantColumnsKey)
    for constantColumn in columnsOfCase:
        print(constantColumnsKey + ' ' + constantColumn)
    print('----------')

# Helper methods

def uniqueList(list1):
    """Return the distinct elements of ``list1`` in first-seen order.

    Args:
        list1: Iterable of values to de-duplicate.

    Returns:
        A new list containing each distinct element once, ordered by first
        appearance. The input is not modified.
    """
    # Materialize once so the fallback below still sees every element even
    # when ``list1`` is a one-shot iterator.
    items = list(list1)
    try:
        # dict keys are unique and keep insertion order, which gives a
        # linear-time de-duplication instead of rescanning the result list
        # for every element.
        return list(dict.fromkeys(items))
    except TypeError:
        # Unhashable elements (e.g. lists) cannot be dict keys; fall back to
        # the equality-based scan.
        unique_list = []
        for x in items:
            if x not in unique_list:
                unique_list.append(x)
        return unique_list

def intersectionList(lst1, lst2):
    """Return the elements of ``lst1`` that also occur in ``lst2``.

    The order and any duplicates follow ``lst1``.
    """
    return [item for item in lst1 if item in lst2]

def exceptList(lst1, lst2):
    """Return the elements of ``lst1`` that do not occur in ``lst2``.

    The order and any duplicates follow ``lst1``.
    """
    return [item for item in lst1 if item not in lst2]

# Reduce the per-case column lists to their distinct column names.
uniqueConstantsColumnsTotal = uniqueList(constantsColumnsTotal)
uniqueInConstantsColumnsTotal = uniqueList(inConstantsColumnsTotal)

# Split the columns into three classes:
#   - constant in some cases and non-constant in others
#   - constant in every case
#   - non-constant in every case
constantAndInConstant = intersectionList(uniqueConstantsColumnsTotal, uniqueInConstantsColumnsTotal)
alwaysConstant = exceptList(uniqueConstantsColumnsTotal, uniqueInConstantsColumnsTotal)
alwaysInConstant = exceptList(uniqueInConstantsColumnsTotal, uniqueConstantsColumnsTotal)

# Report each class as: title, number of columns, the columns themselves.
print('Constants and Inconstant', len(constantAndInConstant), constantAndInConstant, '---------', sep='\n')

print('Always constant', len(alwaysConstant), alwaysConstant, '---------', sep='\n')


print('Always inconstant', len(alwaysInConstant), alwaysInConstant, sep='\n')