In [1]:
import pm4py
import io
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.visualization.petri_net import visualizer as pn_visualizer
from pm4py.visualization.bpmn import visualizer as bpmn_vis
from pm4py.statistics.traces.generic.log import case_statistics

In [2]:
from src.loader import load_data

In [3]:
log_domestic = load_data('domestic')
log_international = load_data('international')

parsing log, completed traces ::   0%|          | 0/10500 [00:00<?, ?it/s]

parsing log, completed traces ::   0%|          | 0/6449 [00:00<?, ?it/s]

In [4]:
dom_case_duration = pm4py.get_all_case_durations(log_domestic, activity_key='concept:name', case_id_key='case:concept:name',timestamp_key='time:timestamp')
int_case_duration = pm4py.get_all_case_durations(log_international, activity_key='concept:name', case_id_key='case:concept:name',timestamp_key='time:timestamp')

In [5]:
def convert_case_duration_to_hms(ms):
    seconds = ms /1000
    hours = int(ms // 3600)
    minutes = (seconds % 3600) // 60
    seconds = (seconds % 3600) % 60
    return hours, minutes, seconds

med_int_case_duration = np.mean(int_case_duration)
med_domestic_case_duration = np.mean(dom_case_duration)

int_h, int_m, int_s = convert_case_duration_to_hms(int(med_int_case_duration))
domestic_h, domestic_m, domestic_s = convert_case_duration_to_hms(int(med_domestic_case_duration))

print(f"Durchschnitt der Bearbeitungsdauer für internationale Reiseanträge: {int(int_h)} Stunden, {int(int_m)} Minuten, {int(int_s)} Sekunden")
print(f"Durchschnitt der Bearbeitungsdauer für inländische Reiseanträge: {int(domestic_h)} Stunden, {int(domestic_m)} Minuten, {int(domestic_s)} Sekunden")


Durchschnitt der Bearbeitungsdauer für internationale Reiseanträge: 2074 Stunden, 4 Minuten, 29 Sekunden
Durchschnitt der Bearbeitungsdauer für inländische Reiseanträge: 276 Stunden, 16 Minuten, 35 Sekunden


In [20]:
import pm4py
import pandas as pd
import numpy as np

# Funktion zur Berechnung der Aktivitätsdauer
def calculate_activity_durations(log):
    df = pm4py.convert_to_dataframe(log)
    df['duration'] = df.groupby('case:concept:name')['time:timestamp'].diff().dt.total_seconds()
    return df

# Aktivitätsdauer für jeden Log berechnen
df_domestic = calculate_activity_durations(log_domestic)
df_international = calculate_activity_durations(log_international)

# Funktion zur Berechnung von Statistiken pro Aktivität
def activity_duration_statistics(df):
    stats = df.groupby('concept:name')['duration'].agg(['mean', 'median', 'min', 'max', 'std']).reset_index()
    frequency = df['concept:name'].value_counts().reset_index()
    frequency.columns = ['concept:name', 'frequency']
    total_frequency = frequency['frequency'].sum()
    frequency['frequency_percentage'] = (frequency['frequency'] / total_frequency) * 100
    frequency['frequency_percentage'] = frequency['frequency_percentage'].round(1)  # Rundung auf eine Nachkommastelle
    frequency['frequency_percentage'] = frequency['frequency_percentage'].astype(str) + '%'  # Hinzufügen des % Zeichens
    stats = stats.merge(frequency, on='concept:name', how='left')
    return stats

# Statistiken für inländische und internationale Aktivitäten berechnen
domestic_stats = activity_duration_statistics(df_domestic)
international_stats = activity_duration_statistics(df_international)

# Sortierung nach 'median' in absteigender Reihenfolge
domestic_stats_sorted = domestic_stats.sort_values(by='median', ascending=False)
international_stats_sorted = international_stats.sort_values(by='median', ascending=False)

# Umrechnung der Dauer in Stunden
def convert_seconds_to_hours(seconds):
    if np.isnan(seconds):  # NaN-Werte behandeln
        return np.nan
    hours = seconds / 3600
    return hours

# Anwendung der Umrechnung auf die Spalten der Statistiken
for column in ['mean', 'median', 'min', 'max', 'std']:
    domestic_stats_sorted[column] = domestic_stats_sorted[column].apply(lambda x: convert_seconds_to_hours(x))
    international_stats_sorted[column] = international_stats_sorted[column].apply(lambda x: convert_seconds_to_hours(x))

# Berechnung des Verhältnisses von Median zu Standardabweichung und Umrechnung in Prozent
def calculate_median_std_ratio(median, std):
    if std == 0 or np.isnan(std):  # Vermeidung von Division durch Null oder NaN
        return "NaN"
    ratio = (median / std) * 100
    return f"{ratio:.1f}%"  # Nur eine Nachkommastelle und % Zeichen hinzufügen

# Hinzufügen einer neuen Spalte für das Verhältnis in Prozent
domestic_stats_sorted['median_to_std_ratio'] = domestic_stats_sorted.apply(lambda row: calculate_median_std_ratio(row['median'], row['std']), axis=1)
international_stats_sorted['median_to_std_ratio'] = international_stats_sorted.apply(lambda row: calculate_median_std_ratio(row['median'], row['std']), axis=1)

# Filterkriterien definieren
frequency_percentage_threshold = 5  # Beispiel: Mindestens 5% Anteil an der Gesamtfrequenz

# Filter auf die sortierten Daten anwenden
filtered_domestic_stats = domestic_stats_sorted[
    (domestic_stats_sorted['frequency_percentage'].str.rstrip('%').astype(float) > frequency_percentage_threshold)
]

filtered_international_stats = international_stats_sorted[
    (international_stats_sorted['frequency_percentage'].str.rstrip('%').astype(float) > frequency_percentage_threshold)
]

# Ergebnisse als Tabellen anzeigen
print("Filtered Domestic Declarations (based on frequency percentage)")
display(filtered_domestic_stats)

print("Filtered International Declarations (based on frequency percentage)")
display(filtered_international_stats)

Filtered Domestic Declarations (based on frequency percentage)


Unnamed: 0,concept:name,mean,median,min,max,std,frequency,frequency_percentage,median_to_std_ratio
15,Payment Handled,87.095872,77.772083,2.318333,6821.591667,93.108391,10044,17.8%,83.5%
16,Request Payment,75.94954,27.432361,0.0,5631.320278,172.368092,10040,17.8%,15.9%
3,Declaration FINAL_APPROVED by SUPERVISOR,48.509872,21.004444,0.000278,3458.328056,92.860446,10131,18.0%,22.6%
14,Declaration SUBMITTED by EMPLOYEE,127.063711,5.210833,0.008056,8566.134444,447.597124,11531,20.4%,1.2%
0,Declaration APPROVED by ADMINISTRATION,27.73905,0.010278,0.000278,6781.981389,197.660255,8202,14.5%,0.0%


Filtered International Declarations (based on frequency percentage)


Unnamed: 0,concept:name,mean,median,min,max,std,frequency,frequency_percentage,median_to_std_ratio
33,Start trip,1159.283671,678.477222,0.112222,10905.720833,1684.442552,6449,8.9%,40.3%
30,Permit SUBMITTED by EMPLOYEE,485.70954,110.145556,0.013056,8795.336944,1056.57447,6255,8.7%,10.4%
14,Declaration SUBMITTED by EMPLOYEE,282.416621,109.193194,0.011667,7968.318333,598.188363,8099,11.2%,18.3%
15,End trip,181.57821,96.0,0.0,15846.465833,616.657952,6449,8.9%,15.6%
16,Payment Handled,85.443832,77.201111,0.658889,6559.204167,115.87593,6187,8.6%,66.6%
31,Request Payment,76.681039,28.333611,0.0,6386.376944,148.773984,6183,8.6%,19.0%
5,Declaration FINAL_APPROVED by SUPERVISOR,63.666091,25.184167,0.000556,2087.458889,94.65097,6039,8.4%,26.6%
22,Permit FINAL_APPROVED by SUPERVISOR,49.83086,22.691944,0.0,1013.256389,64.864926,5381,7.5%,35.0%
0,Declaration APPROVED by ADMINISTRATION,35.67633,0.047778,0.000278,10129.095278,268.05293,5037,7.0%,0.0%
17,Permit APPROVED by ADMINISTRATION,10.682949,0.001111,0.000278,4414.245556,109.556452,4839,6.7%,0.0%
