In [1]:
import matplotlib.pyplot as plt

from pathlib import Path

from promg.modules.db_management import DBManagement
from tabulate import tabulate
import yaml

from promg import Configuration, DatabaseConnection, Performance, SemanticHeader, DatasetDescriptions, OcedPg, Query

import numpy as np
import pandas as pd
import seaborn as sns

pd.set_option('display.width', 2000)
%matplotlib inline


In [2]:
case_study = 'bpic14'
load = False
infer_high_level_events = False


In [3]:
conf_path = Path(case_study, 'config.yaml')
config = yaml.safe_load(open(conf_path))

print(f"These are the credentials that I expect to be set for the database.")
print(f"db_name: {config['db_name']}")
print(f"uri: {config['uri']}")
print(f"password: {config['password']}")
print("----------------------")
print(f"If you have other credentials, please change them at: {conf_path}")

These are the credentials that I expect to be set for the database.
db_name: neo4j
uri: bolt://localhost:7687
password: bpic2014
----------------------
If you have other credentials, please change them at: bpic14\config.yaml


In [4]:
config = Configuration.init_conf_with_config_file(conf_path)
db_connection = DatabaseConnection.set_up_connection(config=config)

In [5]:
if load:
    # Import the original modeled data
    %run./ 0_analysis_and_model.ipynb

In [6]:
if infer_high_level_events:
    # Import the original modeled data
    %run./ 1_0_1_infer_high_level_events.ipynb

In [8]:
# get the bag variants on the high_level
q_set_variants = '''
MATCH (ci_sc:CI_SC) <- [:CORR] - (e:HighLevelEvent)
MATCH (ci_sc) - [:RELATED_CI] -> (ci:ConfigurationItem)
WITH ci_sc, ci, e.activity AS activity, count(e) as event_count ORDER BY activity, event_count
WITH ci_sc, ci, activity,
CASE
WHEN event_count = 1 THEN "1"
WHEN event_count = 2 THEN "2"
WHEN event_count <= 10 THEN "3-10"
WHEN event_count <= 20 THEN "11-20"
WHEN event_count <= 100 THEN "21-100"
WHEN event_count <= 1000 THEN "101-1000"
ELSE ">1001" END AS event_count
WITH ci_sc.sysId as sysId, ci.ciType as type, ci.ciSubtype as subtype, collect(distinct activity) as set_variant, collect(distinct activity + " (" + event_count + ")") as multi_set_variant
RETURN  rtrim(reduce(str = "", act in set_variant | str + act + ", "),", ") as set_variant,
        rtrim(reduce(str = "", act in multi_set_variant | str + act + ", "),", ") as multi_set_variant,
        count(distinct sysId) as num_ci_sc
'''

result = pd.DataFrame(db_connection.exec_query(q_set_variants))

In [9]:
table = pd.pivot_table(result, index=['set_variant', 'multi_set_variant'], aggfunc="sum", sort=False)
table['%_set_variant'] = (
    round(
        table.groupby(['set_variant']).num_ci_sc.transform("sum") /
        sum(table['num_ci_sc']) * 100,
        2
    )
)
table.reset_index(inplace=True)
# table.set_index(['set_variant', '%_set_variant'], inplace=True)
table['%_multi_set_variant'] = (
    round(table.num_ci_sc / table.groupby(['set_variant', '%_set_variant']).num_ci_sc.transform("sum") * 100, 2))
set_variants = pd.pivot_table(table, index=['set_variant', '%_set_variant'], aggfunc={'num_ci_sc': "sum"})
set_variants

Unnamed: 0_level_0,Unnamed: 1_level_0,num_ci_sc
set_variant,%_set_variant,Unnamed: 2_level_1
Change,63.61,8212
"Change, Incident",0.25,32
"Change, Incident, Interaction",4.11,531
"Change, Interaction",0.55,71
Incident,3.66,472
"Incident, Interaction",15.79,2038
Interaction,12.04,1554


In [10]:
multi_set_variants = pd.pivot_table(table, index=['set_variant', '%_set_variant', 'multi_set_variant'])

In [11]:
# get the bag variants on the high_level
q_set_variants = '''
MATCH (ci_sc:CI_SC) <- [:CORR] - (e:HighLevelEvent)
MATCH (ci_sc) - [:RELATED_CI] -> (ci:ConfigurationItem)
WITH ci_sc, ci, e.activity AS activity, count(e) as event_count ORDER BY activity DESC
WITH ci_sc.sysId as sysId, ci.ciType as type, ci.ciSubtype as subtype, collect(distinct activity) as set_variant, collect(distinct activity + " (" + event_count + ")") as multi_set_variant
RETURN  type,
        subtype,
        rtrim(reduce(str = "", act in set_variant | str + act + ", "),", ") as set_variant,
        rtrim(reduce(str = "", act in multi_set_variant | str + act + ", "),", ") as multi_set_variant,
        count(distinct sysId) as num_ci_sc
'''

result = pd.DataFrame(db_connection.exec_query(q_set_variants))

In [12]:
table = pd.pivot_table(result, index=['type', 'subtype', 'set_variant', 'multi_set_variant'], aggfunc="sum")

table['%_set_variant'] = (
        round(
            table.groupby(['type', 'subtype', 'set_variant']).num_ci_sc.transform("sum") /
            table.groupby(['type', 'subtype']).num_ci_sc.transform("sum") * 100,
            2
        ).astype(str) + '%'
)
table.reset_index(inplace=True)
# table.set_index(['set_variant', '%_set_variant'], inplace=True)
table['%_multi_set_variant'] = (
    round(table.num_ci_sc / table.groupby(['set_variant', '%_set_variant']).num_ci_sc.transform("sum") * 100, 2))
table.sort_values(by='%_set_variant')
set_variants = pd.pivot_table(table, index=['set_variant', 'type', 'subtype', '%_set_variant'],
                              aggfunc={'num_ci_sc': "sum"})
set_variants

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,num_ci_sc
set_variant,type,subtype,%_set_variant,Unnamed: 4_level_1
Change,Phone,Number,97.56%,120
Change,application,Client Based Application,20.0%,3
Change,application,Desktop Application,76.87%,482
Change,application,Exchange,20.0%,1
Change,application,SAP,14.29%,1
...,...,...,...,...
"Interaction, Incident, Change",storage,Tape Library,8.7%,2
"Interaction, Incident, Change",subapplication,Citrix,20.0%,1
"Interaction, Incident, Change",subapplication,Server Based Application,3.95%,15
"Interaction, Incident, Change",subapplication,Standard Application,33.33%,1


In [13]:
# get the bag variants on the high_level
q_number_of_events = '''
MATCH (ci_sc:CI_SC) <- [:CORR] - (e:HighLevelEvent)
MATCH (ci_sc) - [:RELATED_CI] -> (ci:ConfigurationItem)
WHERE ci.ciType is not null AND ci.ciSubtype is not null AND ci.ciType <> "#N/B"
WITH ci_sc, ci, e.activity AS activity, count(e) as event_count ORDER BY activity
RETURN ci_sc.sysId as sysId, ci.ciType as type, ci.ciSubtype as subtype, activity, event_count
'''

result = pd.DataFrame(db_connection.exec_query(q_number_of_events))
result

Unnamed: 0,sysId,type,subtype,activity,event_count
0,LSR000699_WBS000161,computer,Linux Server,Change,4
1,HMD000002_WBS000195,hardware,MigratieDummy,Change,12
2,SUB000494_WBS000162,subapplication,Web Based Application,Change,8
3,SBA000167_WBS000296,application,Server Based Application,Change,4
4,SWT000091_WBS000207,networkcomponents,Switch,Change,84
...,...,...,...,...,...
15986,SUB000422_WBS000310,subapplication,Standard Application,Interaction,60
15987,WSR001512_WBS000102,computer,Windows Server,Interaction,38
15988,SBA000669_WBS000256,application,Server Based Application,Interaction,2
15989,SBA000543_WBS000139,application,Server Based Application,Interaction,6


In [14]:
# Pivot the DataFrame
pivot_df = result.pivot(
    index=['sysId', 'type', 'subtype'],
    columns='activity',
    values='event_count'
).reset_index()

pivot_df = pivot_df.fillna(0)
pivot_df

activity,sysId,type,subtype,Change,Incident,Interaction
0,ACS000001_WBS000252,applicationcomponent,Application Server,0.0,2.0,2.0
1,ADB000001_WBS000253,database,Applicatie Database,4.0,0.0,0.0
2,ADB000002_WBS000253,database,Applicatie Database,2.0,0.0,0.0
3,ADB000003_WBS000253,database,Applicatie Database,4.0,0.0,0.0
4,ADB000004_WBS000253,database,Applicatie Database,10.0,0.0,0.0
...,...,...,...,...,...,...
12788,ZOS000029_WBS000199,computer,zOS Server,2.0,0.0,0.0
12789,ZOS000030_WBS000199,computer,zOS Server,2.0,0.0,0.0
12790,ZOS000031_WBS000199,computer,zOS Server,2.0,0.0,0.0
12791,ZOS000032_WBS000199,computer,zOS Server,2.0,0.0,0.0


In [17]:
compare_df = pivot_df


def get_activity_order(row):
    activities = ["Change", "Interaction", "Incident"]
    sorted_activities = sorted(zip(activities, row[activities]), key=lambda x: x[1], reverse=True)
    return " >= ".join([f"{activity}" for activity, value in sorted_activities if value > 0])


# Apply the function to each row
compare_df["order"] = compare_df.apply(get_activity_order, axis=1)

#TODO ADD TOTAL PER BAG VARIANT (AND SORT)

# Display the updated DataFrame
# Group by 'app_wbs' and 'type', then aggregate by 'subtype' to find min and max
result = compare_df.groupby(['type', 'subtype', 'order']).agg(
    {'sysId': 'count'})
result['%'] = (round(result.sysId / result.groupby(level=0).sysId.transform("sum") * 100, 2))
result

Unnamed: 0_level_0,Unnamed: 1_level_0,activity,sysId,%
type,subtype,order,Unnamed: 3_level_1,Unnamed: 4_level_1
Phone,Number,Change,120,97.56
Phone,Number,Incident,1,0.81
Phone,Number,Interaction,1,0.81
Phone,Number,Interaction >= Incident,1,0.81
application,Citrix,Interaction,1,0.06
...,...,...,...,...
subapplication,Web Based Application,Incident,2,0.40
subapplication,Web Based Application,Interaction,8,1.60
subapplication,Web Based Application,Interaction >= Change >= Incident,2,0.40
subapplication,Web Based Application,Interaction >= Incident,32,6.41


In [15]:
import seaborn as sns

fig = px.box(data_frame=result, x="type", y="event_count", color="activity")
fig.show()

NameError: name 'px' is not defined

In [None]:
# Group by 'app_wbs' and 'type', then aggregate by 'subtype' to find min and max
result = pivot_df.groupby(['type']).agg(
    {
        'sysId': 'count',
        'Change': 'max',
        'Incident': 'max',
        'Interaction': 'max'
    })
result

In [None]:
table_reset = table.reset_index()
table_reset[table_reset['type'] == 'applicationcomponent']