In [1]:
import pandas as pd
import numpy as np
import gensim
from tqdm.auto import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN

In [2]:
# Load the raw event log; the path is relative to the notebook's working directory.
df = pd.read_csv("Team 1.csv")

#### Переименовываем столбцы

In [3]:
# Map the CSV headers onto the `case:concept:name` / `concept:name` /
# `time:timestamp` column names that the rest of the notebook refers to.
df = df.rename(
    columns={
        "case concept:name": "case:concept:name",
        "event concept:name": "concept:name",
        "event time:timestamp": "time:timestamp",
    }
)

# Word2Vec

#### Группируем по case:concept:name и формируем столбец событий в процессе (несортированный)

In [4]:
# Collapse the log to one row per case: all event names of the case are joined
# into a single space-separated string. Rows are not sorted by timestamp first,
# so the order inside each string is the order of the rows in `df`.
# NOTE(review): event names are joined with " " and the next cell splits on " "
# again, so multi-word event names end up as separate word tokens rather than
# one token per event (see the displayed head) - confirm this is intended.
word_two_vec_df = (
    df.groupby("case:concept:name")["concept:name"]
    .apply(" ".join)
    .reset_index()
)

#### Формируем массив

In [None]:
# Turn each space-joined string into a list of tokens (split on single spaces).
# NOTE(review): the column is overwritten in place, so this cell is not
# idempotent - re-run the groupby cell above before running it a second time.
word_two_vec_df["concept:name"] = word_two_vec_df["concept:name"].str.split(' ')

In [None]:
# Preview the first rows to check the token lists.
word_two_vec_df.head()

Unnamed: 0,case:concept:name,concept:name
0,2000000005_00001,"[SRM:, Created, SRM:, Document, Completed, SRM..."
1,2000000005_00002,"[SRM:, Created, SRM:, Awaiting, Approval, SRM:..."
2,2000000009_00001,"[Vendor, creates, invoice, SRM:, Created, SRM:..."
3,2000000011_00001,"[SRM:, Created, SRM:, Ordered, SRM:, Awaiting,..."
4,2000000019_00001,"[SRM:, Created, SRM:, In, Transfer, to, Execut..."


#### Получаем эмбеддинги

In [None]:
# Train Word2Vec on the per-case token lists.
# `seed` alone does not make training reproducible: gensim documents that a
# deterministic run also needs a single worker thread (and a fixed
# PYTHONHASHSEED across interpreter launches), hence `workers=1`.
# NOTE(review): `size` is the gensim < 4.0 spelling (4.0 renamed it to
# `vector_size`) - confirm the pinned gensim version before upgrading.
model = gensim.models.Word2Vec(
    word_two_vec_df["concept:name"],
    min_count=1,  # keep every token so the per-token lookup in the next cell never misses
    size=100,
    seed=42,
    workers=1,
)

In [None]:
# Represent every case by the element-wise mean of its token vectors,
# stored as a plain Python list of floats per case.
features = [
    np.asarray([model.wv[token] for token in tokens]).mean(axis=0).tolist()
    for tokens in word_two_vec_df["concept:name"]
]

# KMeans

#### В цикле подбираем оптимальное количество кластеров

In [None]:
# Sweep k = 2..19, fit KMeans on the case embeddings and record inertia and
# silhouette for each k; remember the k with the highest silhouette.
res = pd.DataFrame(columns=['k', 'inertia', 'silhouete'])
# Silhouette lies in [-1, 1]; start below that range so the best k is still
# picked when every score is zero or negative (starting at 0 would leave
# the cluster count at 0 in that case).
max_score_word_two_vec = -np.inf
word_two_vec_n_of_clusters = 0
for k in tqdm(range(2, 20)):
    km = KMeans(n_clusters=k, random_state=42)
    pred = km.fit_predict(features)
    # random_state only matters when sample_size is given; kept for parity
    # with the DBSCAN cell.
    score = silhouette_score(features, pred, random_state=42)
    if score > max_score_word_two_vec:
        max_score_word_two_vec = score
        word_two_vec_n_of_clusters = k
    # Row label is k itself, so the saved table is indexed by cluster count.
    res.loc[k] = (k, km.inertia_, score)
res

HBox(children=(IntProgress(value=0, max=348), HTML(value='')))

In [None]:
# Report the best silhouette score and the number of clusters that achieved it.
print(f"max_score: {max_score_word_two_vec} ",  f"number of clusters with max score: {word_two_vec_n_of_clusters} ")

In [None]:
# Persist the per-k KMeans results as a tab-separated file in the working directory.
res.to_csv("KMeans.csv", sep = "\t")

# DBSCAN

In [None]:
# Grid-search DBSCAN over eps / min_samples on the case embeddings and score
# every configuration with the silhouette coefficient.
res = pd.DataFrame(columns=['eps', 'min_samples', 'score'])

# Silhouette lies in [-1, 1]; start below that range so a negative best score
# is still recorded. The optimum stays None when no configuration is scorable.
max_score = -np.inf
op_eps = None
op_min_samples = None
n = 0
n_points = len(features)
for eps in [0.01, 0.05, 0.075, 0.1]:
    for min_samples in [2, 3, 5]:
        clustering = DBSCAN(eps=eps, min_samples=min_samples).fit(features)
        pred = clustering.labels_
        # silhouette_score raises unless 2 <= n_labels <= n_samples - 1, which
        # DBSCAN violates when e.g. every point is noise; record NaN instead.
        # NOTE(review): noise points (label -1) are scored as one more cluster;
        # consider masking them out before scoring.
        n_labels = np.unique(pred).size
        if 2 <= n_labels <= n_points - 1:
            score = silhouette_score(features, pred, random_state=42)
        else:
            score = np.nan
        res.loc[n] = (eps, min_samples, score)
        n += 1
        # The original compared an undefined `current_score` (NameError).
        # NaN never compares greater, so unscorable configurations are skipped.
        if score > max_score:
            max_score = score
            op_eps = eps
            op_min_samples = min_samples
res.head(70)

In [None]:
# Report the best silhouette score and the eps / min_samples pair that achieved it.
print(f"max_score: {max_score} ",  f"eps: {op_eps}, min_samples: {op_min_samples}")

In [None]:
# Persist the DBSCAN grid-search results as a tab-separated file in the working directory.
res.to_csv("DBSCAN.csv", sep = "\t")

# Fitness

In [None]:
import os
from pm4py.objects.log.importer.xes import factory as xes_importer
from pm4py.algo.discovery.alpha import factory as alpha_miner
from pm4py.algo.discovery.inductive import factory as inductive_miner
from pm4py.objects.conversion.log import factory as conversion_factory
from pm4py.evaluation.replay_fitness import factory as replay_factory

#### Получаем нужный формат

In [None]:
# Rebuild the event log from the raw CSV for the pm4py steps below:
# parse the timestamps, apply the notebook's column names, keep only the
# three columns that are needed, and order the events by case and time.
df = (
    pd.read_csv("Team 1.csv")
    .assign(**{
        "event time:timestamp": lambda raw: raw["event time:timestamp"].map(pd.Timestamp),
    })
    .rename(
        columns={
            "case concept:name": "case:concept:name",
            "event concept:name": "concept:name",
            "event time:timestamp": "time:timestamp",
        }
    )
    .loc[:, ["case:concept:name", "concept:name", "time:timestamp"]]
    .sort_values(by=["case:concept:name", "time:timestamp"])
)

In [None]:
# Convert the prepared DataFrame with pm4py's log conversion factory.
# NOTE(review): presumably yields an event log with the default variant -
# confirm against the installed pm4py version.
event_log = conversion_factory.apply(df)

#### Альфа Петри

In [None]:
# Run the alpha miner on the event log; the result is unpacked into the
# discovered net plus its initial and final markings.
alpha_petri, alpha_initial_marking, alpha_final_marking = alpha_miner.apply(event_log)

In [None]:
# Replay the event log on the alpha-miner net and print the resulting fitness.
# NOTE(review): presumably a dict of fitness metrics from the default replay
# variant - confirm against the installed pm4py version.
fitness_alpha = replay_factory.apply(event_log, alpha_petri, alpha_initial_marking, alpha_final_marking)
print("fitness_alpha=",fitness_alpha)