In [31]:
import pandas as pd
import pm4py
import networkx as nx
import matplotlib.pyplot as plt
#import seaborn as sns
#from mlxtend.preprocessing import TransactionEncoder
#from mlxtend.frequent_patterns import apriori, association_rules
from pm4py.objects.log.importer.xes import importer
from sklearn.model_selection import train_test_split
import graphviz

# Data Reading And Cleaning

In [32]:
# Read the event log CSV file
csv_file_path = 'event_log.csv'  # Replace with your CSV file path
df = pd.read_csv(csv_file_path)

# Convert 'start_time' column to datetime if it's not already
df['start_time'] = pd.to_datetime(df['start_time'])

# Find events where start_time equals end_time
events_same_start_end = df[df['start_time'] == df['completion_time']]

# Get list of event names where start and end times are the same
event_names_same_start_end = events_same_start_end['event_label'].unique().tolist()

# Display or further process event names where start and end times are the same
print("Event names where start and end times are the same:")
print(event_names_same_start_end)

Event names where start and end times are the same:
['emergency_patient', 'releasing', 'patient_referal', 'time_for_intake', 'patient_left_due_to_long_wait']


In [33]:
# Step 1: Load the original CSV file
df = pd.read_csv('event_log.csv')

In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90430 entries, 0 to 90429
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   case_id          90430 non-null  int64 
 1   task_id          90430 non-null  int64 
 2   event_label      90430 non-null  object
 3   resource         21234 non-null  object
 4   start_time       90430 non-null  object
 5   completion_time  90430 non-null  object
 6   diagnosis        12128 non-null  object
dtypes: int64(2), object(5)
memory usage: 4.8+ MB


In [35]:
# Check for null cells in the entire DataFrame
null_cells = df.isnull()
print(True in null_cells)

False


In [36]:
df.head(10)

Unnamed: 0,case_id,task_id,event_label,resource,start_time,completion_time,diagnosis
0,0,0,emergency_patient,,2018-01-01 00:57:24.551134,2018-01-01 00:57:24.551134,
1,0,2,ER_treatment,ER_PRACTITIONER3,2018-01-01 00:57:24.551134,2018-01-01 02:30:33.427464,
2,0,3,releasing,,2018-01-01 02:30:33.427464,2018-01-01 02:30:33.427464,
3,1,1,emergency_patient,,2018-01-01 03:03:14.397047,2018-01-01 03:03:14.397047,B2
4,1,5,ER_treatment,ER_PRACTITIONER3,2018-01-01 03:03:14.397047,2018-01-01 04:20:35.559032,
5,2,4,emergency_patient,,2018-01-01 06:16:04.316754,2018-01-01 06:16:04.316754,B1
6,3,7,emergency_patient,,2018-01-01 07:06:12.075649,2018-01-01 07:06:12.075649,
7,4,9,emergency_patient,,2018-01-01 07:55:07.334212,2018-01-01 07:55:07.334212,B2
8,5,11,emergency_patient,,2018-01-01 08:16:09.204766,2018-01-01 08:16:09.204766,B1
9,2,8,ER_treatment,ER_PRACTITIONER3,2018-01-01 06:16:04.316754,2018-01-01 08:38:28.243767,


we now see that we have a lot of unneccesary columns so we will remove them and we will organize the events

In [37]:
# # Define the function to modify the dataframe in place
# def transform_event_label_and_resource(df):
#     for index, row in df.iterrows():
#         if row['resource'] != 'None':
#             # Change the event_label to the resource value
#             df.at[index, 'event_label'] = row['resource']
#             # Change the resource to 'None'
#             df.at[index, 'resource'] = 'None'
#     return df

# # Apply the transformation
# df = transform_event_label_and_resource(df)

# # Display the transformed dataframe
# df.head(10)

KeyboardInterrupt: 

In [None]:
# Remove the resource column
df = df.drop(columns=['resource'])

In [None]:
# # Define the function to transform the dataframe
# def split_start_completion(df):
#     new_rows = []
#     for index, row in df.iterrows():
#         # Create the _start row
#         if not row['event_label'] in event_names_same_start_end:
#             start_row = row.copy()
#             start_row['event_label'] = f"{row['event_label']}"
#             new_rows.append(start_row)
#         else:
#             new_rows.append(row)
    
#     # Create the new dataframe without the completion_time column
#     new_df = pd.DataFrame(new_rows).drop(columns=['completion_time'])
    
#     return new_df

# # Apply the transformation
# df = split_start_completion(df)

In [None]:
df = df.drop(columns=['task_id', 'diagnosis', 'completion_time'])

In [None]:
df.head(30)

In [None]:
sorted_df = df.sort_values(['case_id', 'start_time'], ascending=[True, True])

In [None]:
# Save the transformed dataframe to a new CSV file if needed
sorted_df.to_csv('after_cleaning.csv', index=False)

In [None]:
sorted_df.head(20)

In [None]:
# Prepare the dataframe for conversion to XES
sorted_df.rename(columns={'case_id': 'case:concept:name', 'event_label': 'concept:name', 'start_time': 'time:timestamp'}, inplace=True)
sorted_df['time:timestamp'] = pd.to_datetime(sorted_df['time:timestamp'])

# Convert the dataframe to an XES event log
event_log = pm4py.format_dataframe(sorted_df, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp')
xes_event_log = pm4py.convert_to_event_log(event_log)

# Save the XES event log to a file
pm4py.write_xes(xes_event_log, 'event_log.xes')

## Algorithms

### we will split the data to train and test

In [None]:
# Split the DataFrame into training and testing sets
train, test = train_test_split(sorted_df, test_size=0.4, random_state=42)

# Convert the dataframe to an XES event log
event_log_train = pm4py.format_dataframe(train, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp')
xes_event_log_train = pm4py.convert_to_event_log(event_log_train)

# Save the XES event log to a file
pm4py.write_xes(xes_event_log_train, 'event_log_train.xes')

# Convert the dataframe to an XES event log
event_log_test = pm4py.format_dataframe(test, case_id='case:concept:name', activity_key='concept:name', timestamp_key='time:timestamp')
xes_event_log_test = pm4py.convert_to_event_log(event_log_train)

# Save the XES event log to a file
pm4py.write_xes(xes_event_log_train, 'event_log_test.xes')

# Save the training and testing sets to separate CSV files
train.to_csv('train_set.csv', index=False)
test.to_csv('test_set.csv', index=False)

getting the Petri Net of the log

applying the alpha algorithm on the xes file

In [None]:
from pm4py.algo.discovery.alpha import algorithm as alpha_miner

# Load the event log
xes_event_log_train = pm4py.read_xes('event_log_train.xes')

# Apply the Alpha Miner algorithm to discover a Petri net
alpha_net, alpha_initial_marking, alpha_final_marking = alpha_miner.apply(xes_event_log_train)

# Save the Petri net to a PNML file
pm4py.write_pnml(alpha_net, alpha_initial_marking, alpha_final_marking, 'alpha_mined_petri_net.pnml')

# Visualize the Petri net
pm4py.view_petri_net(alpha_net, alpha_initial_marking, alpha_final_marking)

### Inductive Algorithm

In [None]:
# Load the event log
xes_event_log_train = pm4py.read_xes('event_log_train.xes')

# Apply the Inductive Miner algorithm to discover a Petri net
net, initial_marking, final_marking = pm4py.discover_petri_net_inductive(xes_event_log_train)

# Save the Petri net to a PNML file
pm4py.write_pnml(net, initial_marking, final_marking, 'inductive_mined_petri_net.pnml')
pm4py.view_petri_net(net, initial_marking, final_marking)

### Heuristic Algorithm

In [None]:
# Load the event log
xes_event_log_train = pm4py.read_xes('event_log_train.xes')

# Apply the Heuristic Miner algorithm to discover a Petri net
net, initial_marking, final_marking = pm4py.discover_petri_net_heuristics(xes_event_log_train)

# Save the Petri net to a PNML file
pm4py.write_pnml(net, initial_marking, final_marking, 'heuristic_mined_petri_net.pnml')
pm4py.view_petri_net(net, initial_marking, final_marking)

### Rechability Graph

In [None]:
#from pm4py.algo.analysis.petri_net.reachability import algorithm as reachability_algorithm

# Load the event log
xes_event_log_train = pm4py.read_xes('event_log_train.xes')

# Apply the Alpha Miner algorithm to discover a Petri net
net, initial_marking, final_marking = pm4py.discover_petri_net_alpha(xes_event_log_train)

# Get the reachability graph
reachability_graph = reachability_algorithm.apply(net, initial_marking)

# Visualize the reachability graph
gviz = reachability_algorithm.view(reachability_graph)
graphviz.Source(gviz)

### Performance Checking

In [None]:
import pm4py
from pm4py.algo.evaluation.replay_fitness import algorithm as replay_fitness
from pm4py.algo.evaluation.precision import algorithm as precision_evaluator
from pm4py.algo.evaluation.generalization import algorithm as generalization_evaluator
from pm4py.algo.evaluation.simplicity import algorithm as simplicity_evaluator

# Load the training and testing event logs
xes_event_log_train = pm4py.read_xes('event_log_train.xes')
xes_event_log_test = pm4py.read_xes('event_log_test.xes')

# Define a function to evaluate performance using the test log
def evaluate_performance(log, net, initial_marking, final_marking):
    # Fitness
    fitness_value = replay_fitness.apply(log, net, initial_marking, final_marking)['averageFitness']
    
    # Precision
    precision_value = precision_evaluator.apply(log, net, initial_marking, final_marking)
    
    # Generalization
    generalization_value = generalization_evaluator.apply(log, net, initial_marking, final_marking)
    
    # Simplicity
    simplicity_value = simplicity_evaluator.apply(net)
    
    return {
        "fitness": fitness_value,
        "precision": precision_value,
        "generalization": generalization_value,
        "simplicity": simplicity_value
    }


In [None]:
from pm4py.algo.discovery.inductive import algorithm as inductive_miner

# Apply the Inductive Miner algorithm to discover a Petri net
inductive_net, inductive_initial_marking, inductive_final_marking = pm4py.discover_petri_net_inductive(xes_event_log_train)

# Save the Petri net to a PNML file
pm4py.write_pnml(inductive_net, inductive_initial_marking, inductive_final_marking, 'inductive_mined_petri_net.pnml')
inductive_performance = evaluate_performance(xes_event_log_test, inductive_net, inductive_initial_marking, inductive_final_marking)

# Print the performance metrics
print("\nInductive Miner Performance on Test Log:")
print(inductive_performance)

In [None]:
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner

# Heuristic Miner
heuristic_net, heuristic_initial_marking, heuristic_final_marking = heuristics_miner.apply(xes_event_log_train)
heuristic_performance = evaluate_performance(xes_event_log_test, heuristic_net, heuristic_initial_marking, heuristic_final_marking)

# Print the performance metrics
print("\nHeuristic Miner Performance on Test Log:")
print(heuristic_performance)

In [46]:
import pm4py
from pm4py.algo.conformance.tokenreplay import algorithm as token_replay
import pandas as pd

# Step 1: Read the event log
log_2 = pm4py.read_xes('event_log_train.xes')

# Step 2: Discover a Petri net using inductive miner
net, im, fm = pm4py.discover_petri_net_inductive(log_2)

# Step 3: Perform token-based replay conformance checking
replay_result = token_replay.apply(log_2, net, im, fm)

# Convert replay results to a DataFrame manually
records = []
for case in replay_result:
    missing_tokens = case['missing_tokens']
    remaining_tokens = case['remaining_tokens']
    produced_tokens = case['produced_tokens']
    consumed_tokens = case['consumed_tokens']
    fit_traces = case['trace_is_fit']
    
    records.append({
        'missing_tokens': missing_tokens,
        'remaining_tokens': remaining_tokens,
        'produced_tokens': produced_tokens,
        'consumed_tokens': consumed_tokens,
        'fit_traces': fit_traces
    })

df = pd.DataFrame(records)

# Calculate the total missing, extra, produced, and consumed tokens
total_missing_tokens = df['missing_tokens'].sum()
total_extra_tokens = df['remaining_tokens'].sum()
total_produced_tokens = df['produced_tokens'].sum()
total_consumed_tokens = df['consumed_tokens'].sum()

# Calculate the fitness using the specified formula


# Print the summary
print(f"Total Missing Tokens: {total_missing_tokens}")
print(f"Total Remaining Tokens: {total_extra_tokens}")
print(f"Total Produced Tokens: {total_produced_tokens}")
print(f"Total Consumed Tokens: {total_consumed_tokens}")

fitness = 0.5 * (1 - total_missing_tokens / total_produced_tokens) + 0.5 * (1 - total_extra_tokens / total_consumed_tokens)
print(f"Fitness: {fitness:.4f}")

# Export the conformance checking results to a CSV file
csv_file_path = 'token_based_conformance_checking_results.csv'
df.to_csv(csv_file_path, index=False)

# Save the summary results to a text file
with open('conformance_summary.txt', 'w') as file:
    file.write(f"Total Missing Tokens: {total_missing_tokens}\n")
    file.write(f"Total Extra Tokens: {total_extra_tokens}\n")
    file.write(f"Total Produced Tokens: {total_produced_tokens}\n")
    file.write(f"Total Consumed Tokens: {total_consumed_tokens}\n")

parsing log, completed traces ::   0%|          | 0/450 [00:00<?, ?it/s]

replaying log with TBR, completed traces ::   0%|          | 0/195 [00:00<?, ?it/s]

Total Missing Tokens: 0
Total Remaining Tokens: 0
Total Produced Tokens: 13960
Total Consumed Tokens: 13960
Fitness: 1.0000


In [None]:
from pm4py.algo.discovery.alpha import algorithm as alpha_miner

# Apply the Alpha Miner algorithm to discover a Petri net
alpha_net, alpha_initial_marking, alpha_final_marking = alpha_miner.apply(xes_event_log_train)

# Evaluate performance on the test log
alpha_performance = evaluate_performance(xes_event_log_test, alpha_net, alpha_initial_marking, alpha_final_marking)

# Print the performance metrics
print("\nAlpha Miner Performance on Test Log:")
print(alpha_performance)

# Discovery part

we get from here what are the activities we start from (referal, emergency_patients and the er_treatment)

In [None]:
log = importer.apply('event_log.xes')

log_start = pm4py.get_start_activities(log)

# Convert dictionary to DataFrame
df = pd.DataFrame(list(log_start.items()), columns=['Activity', 'Count'])

# Specify the file path for CSV
csv_file_path = 'log_start_activities.csv'

# Export to CSV
df.to_csv(csv_file_path, index=False)

In [None]:
log = importer.apply('event_log.xes')

# Get end activities
log_end = pm4py.get_end_activities(log)
# Convert dictionary to DataFrame
end_df = pd.DataFrame(list(log_end.items()), columns=['Activity', 'Count'])

# Specify the file path for end activities CSV
csv_file_path_end = 'log_end_activities.csv'
# Export end activities to CSV
end_df.to_csv(csv_file_path_end, index=False)

now we will create traces of our activites 

In [None]:
# Mapping event labels to numbers
event_label_to_number = {}
current_number = 1  # Start with number 1

for event_label in sorted_df['concept:name'].unique():
    event_label_to_number[event_label] = current_number
    current_number += 1

# Create a new column 'number' based on event_label
sorted_df['number'] = sorted_df['concept:name'].map(event_label_to_number)

# Group by case_id and concatenate numbers into traces with spaces
traces = sorted_df.groupby('case:concept:name')['number'].apply(lambda x: ' '.join(map(str, x))).reset_index()

# Count occurrences of each trace
trace_counts = traces['number'].value_counts().reset_index()
trace_counts.columns = ['trace', 'count']

# Specify the file path for CSV
csv_file_path_trace_counts = 'trace_counts.csv'
csv_file_path_case_traces = 'case_traces.csv'

# Export trace counts to CSV
trace_counts.to_csv(csv_file_path_trace_counts, index=False)

# Export case_id and trace to CSV
traces.columns = ['case_id', 'trace']
traces.to_csv(csv_file_path_case_traces, index=False)

In [None]:
# Read the trace counts from CSV
trace_counts = pd.read_csv('trace_counts.csv')

def remove_consecutive_duplicates(trace):
    numbers = trace.split()
    result = [numbers[0]]
    for num in numbers[1:]:
        if num != result[-1]:
            result.append(num)
    return ' '.join(result)

# Apply the function to remove consecutive duplicates
trace_counts['trace'] = trace_counts['trace'].apply(remove_consecutive_duplicates)

# Group by the trace and sum the counts
deduplicated_trace_counts = trace_counts.groupby('trace')['count'].sum().reset_index()

# Sort the DataFrame by count in descending order
deduplicated_trace_counts = deduplicated_trace_counts.sort_values(by='count', ascending=False)

# Export the deduplicated and sorted trace counts to CSV
csv_file_path_deduplicated = 'trace_counts.csv'
deduplicated_trace_counts.to_csv(csv_file_path_deduplicated, index=False)

# Select the top 7 most frequent traces
top_traces = deduplicated_trace_counts.head(10)

# Create a bar chart of the top 7 trace counts
plt.figure(figsize=(10, 7))
plt.bar(top_traces['trace'], top_traces['count'], color='skyblue')
plt.xlabel('Trace')
plt.ylabel('Count')
plt.title('Most Frequent Trace Counts')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to make room for rotated x-axis labels

# Save the bar chart to a file
bar_chart_path = 'top_trace_counts_bar_chart.png'
plt.savefig(bar_chart_path)

# Display the bar chart
plt.show()


In [None]:
print(event_label_to_number)

In [None]:
import csv

# Your dictionary
data = {
    'emergency_patient': 1, 'ER_PRACTITIONER3': 2, 'releasing': 3, 'B_BED3': 4, 'B_BED25': 5, 'ER_PRACTITIONER4': 6,
    'ER_PRACTITIONER5': 7, 'B_BED26': 8, 'ER_PRACTITIONER6': 9, 'B_BED4': 10, 'patient_referal': 11, 'time_for_intake': 12,
    'INTAKE3': 13, 'OR1': 14, 'A_BED20': 15, 'INTAKE2': 16, 'OR4': 17, 'B_BED11': 18, 'INTAKE1': 19, 'OR2': 20, 'A_BED1': 21,
    'INTAKE4': 22, 'OR3': 23, 'A_BED15': 24, 'B_BED17': 25, 'B_BED5': 26, 'OR5': 27, 'A_BED16': 28, 'B_BED31': 29, 'A_BED14': 30,
    'B_BED18': 31, 'A_BED8': 32, 'B_BED35': 33, 'A_BED18': 34, 'A_BED17': 35, 'B_BED32': 36, 'B_BED27': 37, 'B_BED12': 38,
    'A_BED19': 39, 'A_BED21': 40, 'B_BED8': 41, 'A_BED22': 42, 'A_BED2': 43, 'A_BED30': 44, 'A_BED3': 45, 'A_BED28': 46,
    'B_BED33': 47, 'A_BED23': 48, 'B_BED37': 49, 'A_BED4': 50, 'A_BED9': 51, 'B_BED19': 52, 'B_BED6': 53, 'B_BED30': 54,
    'B_BED28': 55, 'B_BED9': 56, 'B_BED7': 57, 'B_BED29': 58, 'B_BED21': 59, 'B_BED38': 60, 'A_BED29': 61, 'A_BED24': 62,
    'B_BED34': 63, 'B_BED13': 64, 'B_BED40': 65, 'B_BED24': 66, 'A_BED5': 67, 'B_BED14': 68, 'B_BED10': 69, 'A_BED10': 70,
    'A_BED25': 71, 'patient_left_due_to_long_wait': 72, 'B_BED39': 73, 'B_BED23': 74, 'ER_PRACTITIONER7': 75, 'ER_PRACTITIONER1': 76,
    'B_BED22': 77, 'B_BED36': 78, 'B_BED20': 79, 'B_BED1': 80, 'B_BED2': 81, 'B_BED15': 82, 'B_BED16': 83, 'ER_PRACTITIONER8': 84,
    'ER_PRACTITIONER9': 85, 'A_BED11': 86, 'A_BED12': 87, 'A_BED26': 88, 'A_BED6': 89, 'A_BED13': 90, 'A_BED7': 91, 'A_BED27': 92,
    'ER_PRACTITIONER2': 93
}

# Specify the file path for CSV
csv_file_path = 'meaning.csv'

# Writing the dictionary to a CSV file
with open(csv_file_path, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Key', 'Value'])  # Writing header
    for key, value in data.items():
        writer.writerow([key, value])

print(f"Data has been written to {csv_file_path}")


In [None]:
# Count the number of events per case
case_event_counts = sorted_df['case:concept:name'].value_counts()

# Plot the distribution of case lengths
plt.figure(figsize=(10, 6))
sns.histplot(case_event_counts, kde=True, color='purple')
plt.title('Distribution of Events per Case')
plt.xlabel('Number of Events per Case')
plt.ylabel('Frequency')
plt.show()

# Display the case event counts
print(case_event_counts.describe())

In [None]:
# Select a few cases to visualize
sample_cases = sorted_df[sorted_df['case:concept:name'].isin(sorted_df['case:concept:name'].unique()[:5])]

# Plot the event sequences for the selected cases
plt.figure(figsize=(15, 8))
sns.lineplot(x='time:timestamp', y='concept:name', hue='case:concept:name', data=sample_cases, marker='o')
plt.title('Event Sequences for Sample Cases')
plt.xlabel('Timestamp')
plt.ylabel('Event Name')
plt.legend(title='Case ID')
plt.show()

In [None]:
# Prepare the data for sequence mining
cases = sorted_df.groupby('case:concept:name')['concept:name'].apply(list).values

# Encode the data
te = TransactionEncoder()
te_ary = te.fit(cases).transform(cases)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Apply the apriori algorithm to find frequent itemsets
frequent_itemsets = apriori(df, min_support=0.3, use_colnames=True)

# Generate association rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1)

# Display the results
print(frequent_itemsets)
print(rules)

# Conformance Checking

### Reply Method

### Aliganments Method