In [1]:
import matplotlib.pyplot as plt

from pathlib import Path

from promg.modules.db_management import DBManagement
from tabulate import tabulate
import yaml

from promg import Configuration, DatabaseConnection, Performance, SemanticHeader, DatasetDescriptions, OcedPg, Query

import numpy as np
import pandas as pd

# Raise pandas' text display width (in characters) to 2000 for this notebook session.
pd.set_option('display.width', 2000)


In [2]:
# Folder name of the case study; the config path is built from it as <case_study>/config.yaml.
case_study = 'bpic14'
# When True, a later cell runs 0_analysis_and_model.ipynb first to import the modeled data.
load = False


In [3]:
conf_path = Path(case_study, 'config.yaml')
# Read the YAML inside a context manager so the file handle is closed as soon as
# parsing is done instead of being left open for the lifetime of the kernel.
with open(conf_path) as conf_file:
    config = yaml.safe_load(conf_file)

# Echo the connection settings this notebook expects so the reader can compare them
# with the local database before a connection is attempted.
# NOTE(review): the password is printed in clear text and ends up in the saved output.
print("These are the credentials that I expect to be set for the database.")
print(f"db_name: {config['db_name']}")
print(f"uri: {config['uri']}")
print(f"password: {config['password']}")
print("----------------------")
print(f"If you have other credentials, please change them at: {conf_path}")

These are the credentials that I expect to be set for the database.
db_name: neo4j
uri: bolt://localhost:7687
password: bpic2014
----------------------
If you have other credentials, please change them at: bpic14\config.yaml


In [4]:
# Build the promg configuration from the same YAML file and set up the database connection.
# NOTE: this rebinds `config` from the dict loaded via yaml.safe_load to whatever
# Configuration.init_conf_with_config_file returns.
config = Configuration.init_conf_with_config_file(conf_path)
db_connection = DatabaseConnection.set_up_connection(config=config)

In [5]:
if load:
    # Import the original modeled data
    %run./ 0_analysis_and_model.ipynb

# Check Objects without Any Events

In [6]:
# Per object label (KnowledgeDocument excluded): count the typed objects that do / do not
# have at least one incoming relationship from an Event node.
query = '''
    MATCH (i) - [:IS_OF_TYPE] -> (ot:ObjectType)
    WHERE not 'KnowledgeDocument' in labels(i)
    RETURN EXISTS((i) <- [] - (:Event)) as has_events, labels(i)[0] as _label, count(i) as cnt order by _label, has_events
'''

df_result = pd.DataFrame(db_connection.exec_query(query))

# One row per (label, has_events) pair holding the summed object count.
table = df_result.pivot_table(index=['_label', 'has_events'], aggfunc="sum")
# Total count per label (index level 0), broadcast back onto every row of that label.
label_totals = table['cnt'].groupby(level=0).transform("sum")
# Share of each row within its label, rendered as a percentage string.
table['%'] = (table['cnt'] / label_totals * 100).round(2).astype(str) + '%'
table

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt,%
_label,has_events,Unnamed: 2_level_1,Unnamed: 3_level_1
CI_SC,False,1353,8.81%
CI_SC,True,14003,91.19%
Change,True,17318,100.0%
ConfigurationItem,False,1318,8.71%
ConfigurationItem,True,13816,91.29%
Incident,True,46368,100.0%
Interaction,True,146553,100.0%
Resource,False,2,0.83%
Resource,True,240,99.17%
ServiceComponent,False,5,1.47%


The found numbers for 'Incident', 'Interaction', 'Change' correspond to the objects that have events deleted before the cutoff.

After deleting these objects, we might now have CI_SC that have no other object referring to them.

In [13]:
# Count CI_SC nodes grouped by (a) whether any Event has a relationship pointing at them and
# (b) whether they are connected to an Incident, Change or Interaction node.
query = '''MATCH (ci_sc:CI_SC)
RETURN EXISTS((ci_sc) <- [] - (:Event)) as has_events, EXISTS ((ci_sc) -- (:Incident|Change|Interaction)) as is_refered_to, count(ci_sc) as cnt'''

pd.DataFrame(db_connection.exec_query(query))

Unnamed: 0,has_events,is_refered_to,cnt
0,True,True,14003
1,False,False,323
2,False,True,1030


In [16]:
# For CI_SC nodes without any incoming Event relationship: list the relationship types that
# link them to Interaction / Change / Incident nodes, with the number of distinct CI_SC per
# (relationship type, neighbour labels) combination.
query = '''
MATCH (ci_sc:CI_SC) - [r] - (n:Interaction|Change|Incident)
WHERE NOT EXISTS((ci_sc) <- [] - (:Event))
RETURN type(r) as relation_type, labels(n) as label, count(distinct ci_sc) as cnt'''

pd.DataFrame(db_connection.exec_query(query))

Unnamed: 0,relation_type,label,cnt
0,CAUSED_BY_CI_SC,[Incident],1030


## FINDING
We can also delete the 323 CI_SC that are no longer referred to by any Incident, Change or Interaction.


## Explore some examples

In [9]:
# Fetch the sysId of every CI_SC that is not connected to any Incident, Change or Interaction.
query = '''
    MATCH (ci_sc:CI_SC)
    WHERE NOT EXISTS ((ci_sc) -- (:Incident|Change|Interaction))
    RETURN ci_sc.sysId as id
'''

df_result = pd.DataFrame(db_connection.exec_query(query))
df_result

Unnamed: 0,id
0,OVR000012_WBS000256
1,SBA000755_WBS000313
2,SBA000754_WBS000313
3,SBA000756_WBS000312
4,HMD000040_WBS000196
...,...
318,SBA000788_WBS000285
319,LAP000520_WBS000091
320,PRN000044_WBS000096
321,SSW000280_WBS000102


In [13]:
# One row per (CI_SC, incoming Event) pair; because of the OPTIONAL MATCH a CI_SC without
# events still yields one row, with a null event label. Each row also carries two flags:
# whether an Incident points at the CI_SC via CAUSED_BY_CI_SC and via AFFECTED_CI_SC.
query = '''
    MATCH (i:CI_SC)
    OPTIONAL MATCH (i) <- [] - (e:Event)
    RETURN labels(e)[0] as eventSources, i.sysId as id, EXISTS ((:Incident) - [:CAUSED_BY_CI_SC] -> (i)) as caused_by_incident, EXISTS ((:Incident) - [:AFFECTED_CI_SC] -> (i)) as affected_incident
'''

result = pd.DataFrame(db_connection.exec_query(query))


In [14]:
# Per `eventSources` value: count the `id` entries and sum the two boolean incident flags.
# dropna=False is passed through unchanged from the original call.
aggregations = {"id": "count", "affected_incident": "sum", "caused_by_incident": "sum"}
table = result.pivot_table(index=['eventSources'], aggfunc=aggregations, dropna=False)
table

Unnamed: 0_level_0,affected_incident,caused_by_incident,id
eventSources,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ChangeEvent,18539,21959,109267
IncidentActivityEvent,458676,451386,458676
IncidentEvent,137336,135230,137336
InteractionEvent,288144,286252,293106
,3,1032,1353


# Let's explore some CI_SC that have no events and neither an affected nor a caused_by incident

In [15]:
# sysId of every CI_SC that has neither an incoming relationship from an Event nor an
# incoming relationship from an Incident.
query = '''
    MATCH (i:CI_SC)
    WHERE NOT (EXISTS((i) <- [] - (:Event)) OR EXISTS((i) <- [] - (:Incident)))
    RETURN i.sysId
'''

pd.DataFrame(db_connection.exec_query(query))


Unnamed: 0,i.sysId
0,OVR000012_WBS000256
1,SBA000755_WBS000313
2,SBA000754_WBS000313
3,SBA000756_WBS000312
4,HMD000040_WBS000196
...,...
314,LAP000744_WBS000091
315,SBA000788_WBS000285
316,LAP000520_WBS000091
317,PRN000044_WBS000096


After exploring, it becomes clear that most of these CI_SC are extracted from Detail_Change.csv.

In [16]:
# For CI_SC with no incoming Event and no incoming Incident relationship: follow
# EXTRACTED_FROM to the Record and CONTAINS back to the Log, then count the matched rows per
# Log name. count(i) counts (CI_SC, Record) matches, not distinct CI_SC nodes.
query = '''
    MATCH (i:CI_SC) - [:EXTRACTED_FROM] - (r:Record) <- [:CONTAINS] - (l:Log)
    WHERE NOT (EXISTS((i) <- [] - (:Event)) OR EXISTS((i) <- [] - (:Incident)))
    RETURN l.name, count(i)
'''

pd.DataFrame(db_connection.exec_query(query))


Unnamed: 0,l.name,count(i)
0,Detail_Change.csv,331
1,BPIC14Interaction.csv,12


In Changes, the mentioned CI_SC are only the CI_SC they have affected. Let's explore the timestamps of these records.

In [21]:
# Restrict the previous selection to records contained in the 'Detail_Change.csv' log and
# group them by the first two space-separated words of changeType and by which of the six
# timestamp properties are present (IS NOT NULL); groups are ordered by record count, descending.
query = '''
    MATCH (i:CI_SC) - [:EXTRACTED_FROM] - (r:Record) <- [:CONTAINS] - (l:Log {name:'Detail_Change.csv'})
    WHERE NOT (EXISTS((i) <- [] - (:Event)) OR EXISTS((i) <- [] - (:Incident)))
    RETURN

        count(r) as count,
        reduce(result = "", item in split(r.changeType, " ")[0..2] | result + " " + item) as changeType,
        r.changeRecordOpenTime is NOT NULL as record_open,
        r.changeRecordCloseTime IS NOT NULL as record_close,
        r.plannedStart IS NOT NULL as planned_start,
        r.plannedEnd IS NOT NULL as planned_end,
        r.actualStart IS NOT NULL as actual_start,
        r.actualEnd IS NOT NULL as actual_end ORDER BY count DESC
'''

pd.DataFrame(db_connection.exec_query(query))


Unnamed: 0,count,changeType,record_open,record_close,planned_start,planned_end,actual_start,actual_end
0,106,Standard Activity,True,True,True,True,False,False
1,92,Standard Change,True,True,True,True,True,True
2,68,Standard Activity,True,True,True,True,True,True
3,33,Standard Change,True,True,True,True,False,False
4,21,Release Type,True,True,True,True,True,True
5,8,Release Type,True,True,True,True,False,False
6,3,Master Change,True,True,True,True,False,False


In [76]:
# For Detail_Change.csv records that have a changeRecordOpenTime: group by whether actualStart
# is at or after the open time, count the distinct changeIds per group, and average the
# duration from actualStart to changeRecordOpenTime. Records without an actualStart fall into
# a null group.
# NOTE(review): the duration runs from actualStart to changeRecordOpenTime, so it is negative
# when the change started after the record was opened; `.minutes` is presumably the duration
# expressed in whole minutes -- confirm against the Neo4j documentation.
query = '''
    MATCH (r:Record) <- [:CONTAINS] - (l:Log {name:'Detail_Change.csv'})
    WHERE r.changeRecordOpenTime IS NOT NULL
    RETURN r.actualStart >= r.changeRecordOpenTime as change_after_record, count(DISTINCT r.changeId), avg(duration.inSeconds(r.actualStart, r.changeRecordOpenTime).minutes) as avg_duration
'''

pd.DataFrame(db_connection.exec_query(query))




Unnamed: 0,change_after_record,count(DISTINCT r.changeId),avg_duration
0,True,15984,-21034.136737
1,,1310,
2,False,708,8547.720102


In [77]:
# List up to 100 distinct Detail_Change.csv changes whose actualStart lies before
# changeRecordOpenTime, ordered by the gap between the two timestamps (largest first).
query = '''
    MATCH (r:Record) <- [:CONTAINS] - (l:Log {name:'Detail_Change.csv'})
    WHERE r.changeRecordOpenTime IS NOT NULL AND r.actualStart < r.changeRecordOpenTime
    RETURN DISTINCT r.changeId, r.actualStart as actual, r.changeRecordOpenTime as record_open, duration.inSeconds(r.actualStart, r.changeRecordOpenTime).minutes as diff ORDER BY diff DESC limit 100
'''

pd.DataFrame(db_connection.exec_query(query))


Unnamed: 0,r.changeId,actual,record_open,diff
0,C00011089,2013-01-13T00:00:00.000000000+01:00,2014-01-13T09:42:00.000000000+01:00,526182
1,C00008371,2012-12-05T08:00:00.000000000+01:00,2013-12-05T08:40:00.000000000+01:00,525640
2,C00010471,2013-01-07T07:00:00.000000000+01:00,2014-01-06T13:30:00.000000000+01:00,524550
3,C00010470,2013-01-07T07:45:00.000000000+01:00,2014-01-06T13:29:00.000000000+01:00,524504
4,C00006938,2012-11-27T08:00:00.000000000+01:00,2013-11-21T09:20:00.000000000+01:00,517040
...,...,...,...,...
95,C00008073,2013-12-02T00:00:00.000000000+01:00,2013-12-03T10:57:00.000000000+01:00,2097
96,C00015876,2014-02-26T08:00:00.000000000+01:00,2014-02-27T18:11:00.000000000+01:00,2051
97,C00006948,2013-11-19T09:00:00.000000000+01:00,2013-11-20T18:30:00.000000000+01:00,2010
98,C00011955,2014-01-20T00:00:00.000000000+01:00,2014-01-21T08:56:00.000000000+01:00,1976
