## Import lib

In [None]:
import pandas as pd
import os
import sys

from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

from functions import *

import warnings
warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
%matplotlib inline

## Load data 

In [None]:
# Read the semicolon-separated permit event log into a DataFrame.
PermitLog_Events = pd.read_csv(
    filepath_or_buffer='C:/Users/Documents/PermitLog_Events.csv',
    sep=';',
)
# Last expression of the cell: shows (n_rows, n_columns).
PermitLog_Events.shape

In [None]:
# Preview the three columns that are handed to Prepare in the next section.
PermitLog_Events.loc[:, ['case_name', 'concept:name', 'time:timestamp']].head()

## Get event sequences 

In [None]:
# Build the prepared event frame with the project helper `Prepare`
# (presumably provided by `from functions import *` — confirm).
# The three string arguments are column names of PermitLog_Events; timeformat is
# the strftime-style pattern passed for parsing 'time:timestamp'.
# NOTE(review): the result is used below with "case_name" and "transact" columns;
# how "transact" is derived is defined inside Prepare and not visible here.
res_df = Prepare(PermitLog_Events, 'concept:name', 'time:timestamp', 'case_name', timeformat="%Y-%m-%dT%H:%M:%S").get_result()

In [None]:
# Preview the first rows of the prepared frame (cell output).
res_df.head()

In [None]:
# Count the distinct case identifiers in the prepared frame and report the total
# (the printed message reads "Number of unique identifiers = N").
distinct_cases = res_df["case_name"].unique()
un_case = len(distinct_cases)
print(f"Количество уникальных идентификаторов = {un_case}")

## Build features with a pivot table

In [None]:
# Feature matrix: one row per case_name, one column per distinct `transact` value,
# each cell holding the number of matching rows (0 when the pair never occurs).
p_table = pd.pivot_table(
    res_df,
    index="case_name",
    columns="transact",
    aggfunc="size",
    fill_value=0,
)

In [None]:
# Preview the first rows of the count matrix (cell output).
p_table.head()

## KMeans section

In [None]:
# KMeans estimator with a fixed seed so cluster assignments are reproducible.
# Changes versus the previous call:
#   * `n_jobs` is gone: it was deprecated in scikit-learn 0.23 and removed in 1.0,
#     so passing it raises TypeError on current releases.
#   * algorithm='full' was renamed to 'lloyd' in scikit-learn 1.1 and the old
#     spelling was removed in 1.3; 'lloyd' selects the same classic algorithm.
# Requires scikit-learn >= 1.1.
# n_clusters is left at its default here; the number of clusters is passed later
# via kmeans_class.clustering(num_clusters=...).
model_kmeans = KMeans(random_state=17, algorithm='lloyd')

In [None]:
# Hand the estimator and the feature matrix to the project helper
# (presumably provided by `from functions import *` — confirm).
kmeans_class = KMeans_Clusterization(model_kmeans, p_table)

In [None]:
# Draw the elbow plot used to choose the number of clusters.
# NOTE(review): the range of k values tried is decided inside the helper; not visible here.
kmeans_class.draw_elbow_method_plot()

In [None]:
# Cluster the cases into 5 groups. The result is used below as a frame that
# has at least the "case_name" and "clusters" columns.
kmeans_result = kmeans_class.clustering(num_clusters=5)

In [None]:
# Attach each case's KMeans cluster label to its event rows
# (inner join on case_name, the pandas default).
kmeans_labels = kmeans_result[["case_name", "clusters"]]
df_kmeans_cl = res_df.merge(kmeans_labels, on="case_name")

In [None]:
# Show the columns of the merged frame: those of res_df plus "clusters".
df_kmeans_cl.columns

### Drawing graph

In [None]:
# Choose which cluster to visualise and extract its rows with the project helper.
# NOTE(review): with num_clusters=5 above, label 4 is presumably the last
# cluster (0-based labels) — confirm against the values in "clusters".
num_cluster = 4
for_draw = Select_cluster(df_kmeans_cl, num_cluster, cluster_col_name='clusters').select()

In [None]:
# Frequency graph for the selected KMeans cluster; the cluster number is embedded
# in the output name under graphs/.
# NOTE(review): 'All' presumably disables count filtering, which would make
# less_or_more irrelevant for this call — confirm in Frequency_graph.draw_freq.
# (`count_treshold` is the helper's own keyword spelling.)
fr = Frequency_graph(for_draw, filename=f'graphs/Frequency_kmeans_cluster_{num_cluster}')
fr.draw_freq(count_treshold = 'All', less_or_more = '>')

## DBSCAN section

In [None]:
# Same case_name x transact count matrix as in the feature section, recomputed
# here (presumably so the DBSCAN section can be run on its own).
p_table = pd.pivot_table(
    res_df,
    index="case_name",
    columns="transact",
    aggfunc="size",
    fill_value=0,
)

In [None]:
# Wrap the feature matrix in the project's DBSCAN helper
# (presumably provided by `from functions import *` — confirm).
dbscan_class = DBSCAN_Clusterization(p_table)

In [None]:
# Plot the helper's diagnostic graph for choosing eps, using its default view.
dbscan_class.epsilon_optimal_graph()

In [None]:
# Same graph restricted to a sub-range.
# NOTE(review): `ranges` presumably means [x_min, x_max, y_min, y_max] axis
# limits — confirm in DBSCAN_Clusterization.epsilon_optimal_graph.
dbscan_class.epsilon_optimal_graph(ranges=[5500, 7000, 0, 40])

In [None]:
# Run DBSCAN with eps=4 and min_samples=100 (eps presumably read off the graph
# above). The result is used below as a frame with "case_name" and "clusters".
dbscan_res = dbscan_class.clustering(eps_val=4, min_samples=100)

In [None]:
# Attach each case's DBSCAN cluster label to its event rows
# (inner join on case_name, the pandas default).
dbscan_labels = dbscan_res[["case_name", "clusters"]]
df_dbscan_cl = res_df.merge(dbscan_labels, on="case_name")

### Drawing graph

In [None]:
# Choose which DBSCAN cluster to visualise and extract its rows.
# Note: this rebinds num_cluster and for_draw from the KMeans section.
num_cluster = 2
for_draw = Select_cluster(df_dbscan_cl, num_cluster, cluster_col_name='clusters').select()

In [None]:
# Frequency graph for the selected DBSCAN cluster; the cluster number is embedded
# in the output name under graphs/.
# NOTE(review): 'All' presumably means no count filtering — confirm in
# Frequency_graph.draw_freq. (`count_treshold` is the helper's keyword spelling.)
fr = Frequency_graph(for_draw, filename=f'graphs/Frequency_dbscan_cluster_{num_cluster}')
fr.draw_freq(count_treshold = 'All')

### Other graphs

In [None]:
# Thresholded frequency graph for the same DBSCAN cluster.
# The threshold is bound once and reused in the output name so the two cannot
# drift apart: previously the file was named "..._more100" although the graph
# was drawn with a threshold of 50.
# NOTE(review): less_or_more='>' presumably keeps items whose count exceeds the
# threshold — confirm in Frequency_graph.draw_freq.
freq_threshold = 50
fr = Frequency_graph(for_draw, filename=f'graphs/Frequency_dbscan_cluster_{num_cluster}_more{freq_threshold}')
fr.draw_freq(count_treshold = freq_threshold, less_or_more = '>')

In [None]:
# Performance graph for the selected DBSCAN cluster, aggregated with the median;
# the cluster number is embedded in the output name under graphs/.
# NOTE(review): less_or_more='<' with time_treshold=8 presumably keeps items
# below 8 hours — confirm in Performance_graph.draw_perform.
pf = Performance_graph(for_draw, filename=f'graphs/Performance_dbscan_cluster_{num_cluster}')
pf.draw_perform(time_treshold = 8, type_value = 'median', less_or_more = '<')  # time_treshold ~ hours