In [2]:
import matplotlib.pyplot as plt

from pathlib import Path

from promg.modules.db_management import DBManagement
from tabulate import tabulate
import yaml

from promg import Configuration, DatabaseConnection, Performance, SemanticHeader, DatasetDescriptions, OcedPg, Query

import numpy as np
import pandas as pd

pd.set_option('display.width', 2000)
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go


In [3]:
# Notebook parameters.
# `case_study` is the directory that holds this case study's `config.yaml`.
# `load` gates the optional re-run of `0_analysis_and_model.ipynb` further down;
# leave it False when the database has already been populated.
case_study = 'bpic14'
load = False


In [4]:
# Locate the case study's configuration file and preview the connection
# settings, so the reader can check them against the local database.
conf_path = Path(case_study, 'config.yaml')

# Read the YAML inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
# `config` is a plain dict here; the next cell rebinds it to a promg Configuration.
with open(conf_path) as config_file:
    config = yaml.safe_load(config_file)

# NOTE(review): this echoes the database password into the saved cell output.
# Fine for a local demo database; clear the output before sharing otherwise.
print("These are the credentials that I expect to be set for the database.")
print(f"db_name: {config['db_name']}")
print(f"uri: {config['uri']}")
print(f"password: {config['password']}")
print("----------------------")
print(f"If you have other credentials, please change them at: {conf_path}")

These are the credentials that I expect to be set for the database.
db_name: neo4j
uri: bolt://localhost:7687
password: bpic2014
----------------------
If you have other credentials, please change them at: bpic14\config.yaml


In [5]:
# Build the promg Configuration from the same YAML file (this rebinds `config`,
# which held the raw YAML dict in the previous cell) and set up the database
# connection that every query cell below uses via `db_connection.exec_query`.
config = Configuration.init_conf_with_config_file(conf_path)
db_connection = DatabaseConnection.set_up_connection(config=config)

In [6]:
if load:
    # Import the original modeled data
    %run./ 0_analysis_and_model.ipynb

# Check Objects without Any Events

In [13]:
# Count, per first node label, how many non-Event/Log/Record/KnowledgeDocument
# nodes do and do not have an incoming relationship from an Event node.
query = '''
    MATCH (i)
    WHERE not 'Event'  in labels(i) and not 'Log' in labels(i) and not 'Record' in labels(i) and not 'KnowledgeDocument' in labels(i)
    RETURN EXISTS((i) <- [] - (:Event)) as has_events, labels(i)[0] as _label, count(i) as cnt order by _label, has_events
'''

object_counts = db_connection.exec_query(query)
df_result = pd.DataFrame(object_counts)

# One row per (label, has_events) pair, holding the node count.
table = pd.pivot_table(df_result, index=['_label', 'has_events'], aggfunc="sum")

# Share of each row within its label, rendered as a percentage string.
label_totals = table.groupby(level=0).cnt.transform("sum")
share = round(table.cnt / label_totals * 100, 2)
table['%'] = share.astype(str) + '%'
table

Unnamed: 0_level_0,Unnamed: 1_level_0,cnt,%
_label,has_events,Unnamed: 2_level_1,Unnamed: 3_level_1
CI_SC,False,2394,15.59%
CI_SC,True,12962,84.41%
Change,False,1307,7.26%
Change,True,16693,92.74%
ConfigurationItem,False,2354,15.55%
ConfigurationItem,True,12780,84.45%
Incident,False,441,0.94%
Incident,True,46616,99.06%
Interaction,True,147004,100.0%
Resource,True,242,100.0%


## Explore some examples

In [14]:
# List the individual nodes behind the `has_events == False` rows above:
# every node that is not an Event, Log, Record or KnowledgeDocument and that has
# no incoming relationship from an Event node. Returns the node's first label
# and its sysId, ordered by sysId.
query = '''
    MATCH (i)
    WHERE not 'Event'  in labels(i) and not 'Log' in labels(i) and not 'Record' in labels(i) and not 'KnowledgeDocument' in labels(i) AND NOT EXISTS((i) <- [] - (:Event))
    RETURN labels(i)[0] as _label, i.sysId ORDER BY i.sysId
'''

df_result = pd.DataFrame(db_connection.exec_query(query))
df_result

Unnamed: 0,_label,i.sysId
0,ConfigurationItem,ADB000060
1,CI_SC,ADB000060_WBS000253
2,ConfigurationItem,ADB000069
3,CI_SC,ADB000069_WBS000253
4,ConfigurationItem,ADB000070
...,...,...
6494,ConfigurationItem,XSR000047
6495,CI_SC,XSR000047_WBS000120
6496,ConfigurationItem,XSR000100
6497,CI_SC,XSR000100_WBS000120


In [29]:
# For every CI_SC node, optionally match the Event nodes pointing at it.
# The OPTIONAL MATCH yields one row per (CI_SC, event) pair, and a single row
# with a null event for a CI_SC without events. Each row carries the event's
# first label, the CI_SC sysId, and two flags telling whether any Incident has
# a CAUSED_BY_CI_SC / AFFECTED_CI_SC relationship to that CI_SC.
# The frame is kept in `result` because the next cell aggregates it.
query = '''
    MATCH (i:CI_SC)
    OPTIONAL MATCH (i) <- [] - (e:Event)
    RETURN labels(e)[0] as eventSources, i.sysId as id, EXISTS ((:Incident) - [:CAUSED_BY_CI_SC] -> (i)) as caused_by_incident, EXISTS ((:Incident) - [:AFFECTED_CI_SC] -> (i)) as affected_incident
'''

result = pd.DataFrame(db_connection.exec_query(query))


In [30]:
# Summarise the (CI_SC, event) rows per event source: number of rows, and how
# many of them have the affected / caused-by incident flags set.
# dropna=False is passed so that the group with a missing event source is kept.
aggregations = {"id": "count", "affected_incident": "sum", "caused_by_incident": "sum"}
table = result.pivot_table(index=['eventSources'], aggfunc=aggregations, dropna=False)
table

Unnamed: 0_level_0,affected_incident,caused_by_incident,id
eventSources,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ChangeEvent,9373,11129,53739
IncidentEvent,138038,135921,138038
InteractionEvent,289014,287112,294008
,0,1030,2394


# Let's explore some CI_SC that have no events and neither an affected nor a caused_by incident

In [31]:
# Select the CI_SC nodes that have neither an incoming relationship from an
# Event node nor an incoming relationship (of any type) from an Incident node,
# and return their sysId.
query = '''
    MATCH (i:CI_SC)
    WHERE NOT (EXISTS((i) <- [] - (:Event)) OR EXISTS((i) <- [] - (:Incident)))
    RETURN i.sysId
'''

pd.DataFrame(db_connection.exec_query(query))


Unnamed: 0,i.sysId
0,HMD000040_WBS000196
1,HMD000034_WBS000194
2,CBD000704_WBS000146
3,MQM000020_WBS000197
4,MQM000021_WBS000197
...,...
1359,SSW000120_WBS000197
1360,SSW000124_WBS000197
1361,SSW000128_WBS000197
1362,SSW000130_WBS000197


After exploring, it becomes clear that these CI_SC are extracted from Detail_Change.csv.

In [36]:
# For the same CI_SC selection (no events, no incident), follow EXTRACTED_FROM
# to the Record nodes and CONTAINS back to the Log node, then count per log name.
# NOTE(review): count(i) counts one per matched (CI_SC, record, log) path, not
# per distinct CI_SC, which would explain why the total can exceed the number of
# rows listed by the previous cell. Use count(DISTINCT i) if the number of
# distinct CI_SC is what is wanted.
query = '''
    MATCH (i:CI_SC) - [:EXTRACTED_FROM] - (r:Record) <- [:CONTAINS] - (l:Log)
    WHERE NOT (EXISTS((i) <- [] - (:Event)) OR EXISTS((i) <- [] - (:Incident)))
    RETURN l.name, count(i)
'''

pd.DataFrame(db_connection.exec_query(query))


Unnamed: 0,l.name,count(i)
0,Detail_Change.csv,1386


In the change records, the only CI_SC mentioned are the ones affected by the change. Let's explore the timestamps of these records.

In [52]:
# Profile the records behind the CI_SC that have no events and no incident:
# group them by a shortened change type and by which timestamp properties are
# present (IS NOT NULL flags), and count the records in each group.
# The shortened change type is built from the first two space-separated words
# of r.changeType; the reduce starts from "" and prepends a space before each
# word, so the resulting value carries a leading space.
query = '''
    MATCH (i:CI_SC) - [:EXTRACTED_FROM] - (r:Record) <- [:CONTAINS] - (l:Log)
    WHERE NOT (EXISTS((i) <- [] - (:Event)) OR EXISTS((i) <- [] - (:Incident)))
    RETURN

        count(r) as count,
        reduce(result = "", item in split(r.changeType, " ")[0..2] | result + " " + item) as changeType,
        r.changeRecordOpenTime is NOT NULL as record_open,
        r.changeRecordCloseTime IS NOT NULL as record_close,
        r.plannedStart IS NOT NULL as planned_start,
        r.plannedEnd IS NOT NULL as planned_end,
        r.actualStart IS NOT NULL as actual_start,
        r.actualEnd IS NOT NULL as actual_end ORDER BY count DESC
'''

pd.DataFrame(db_connection.exec_query(query))


Unnamed: 0,count,changeType,record_open,record_close,planned_start,planned_end,actual_start,actual_end
0,1053,Release Type,True,True,True,True,False,False
1,167,Standard Change,True,True,True,True,False,False
2,161,Standard Activity,True,True,True,True,False,False
3,3,Master Change,True,True,True,True,False,False
4,2,Standard Change,True,True,True,False,False,False


In [76]:
# Over all Detail_Change.csv records that have a changeRecordOpenTime, compare
# the actual start of the change with the moment the record was opened.
# Rows are grouped by whether actualStart >= changeRecordOpenTime; each group
# reports the number of distinct changeIds and the average gap in minutes.
# The comparison yields a null group (presumably records without actualStart —
# confirm against the data).
# NOTE(review): the duration is measured from actualStart to
# changeRecordOpenTime, so avg_duration is negative for the group where
# change_after_record is True and positive where it is False.
query = '''
    MATCH (r:Record) <- [:CONTAINS] - (l:Log {name:'Detail_Change.csv'})
    WHERE r.changeRecordOpenTime IS NOT NULL
    RETURN r.actualStart >= r.changeRecordOpenTime as change_after_record, count(DISTINCT r.changeId), avg(duration.inSeconds(r.actualStart, r.changeRecordOpenTime).minutes) as avg_duration
'''

pd.DataFrame(db_connection.exec_query(query))




Unnamed: 0,change_after_record,count(DISTINCT r.changeId),avg_duration
0,True,15984,-21034.136737
1,,1310,
2,False,708,8547.720102


In [77]:
# Inspect the changes whose actual start lies before the record was opened:
# return the distinct (changeId, actualStart, changeRecordOpenTime) combinations
# together with the gap in minutes from actualStart to changeRecordOpenTime,
# keeping the 100 largest gaps.
query = '''
    MATCH (r:Record) <- [:CONTAINS] - (l:Log {name:'Detail_Change.csv'})
    WHERE r.changeRecordOpenTime IS NOT NULL AND r.actualStart < r.changeRecordOpenTime
    RETURN DISTINCT r.changeId, r.actualStart as actual, r.changeRecordOpenTime as record_open, duration.inSeconds(r.actualStart, r.changeRecordOpenTime).minutes as diff ORDER BY diff DESC limit 100
'''

pd.DataFrame(db_connection.exec_query(query))


Unnamed: 0,r.changeId,actual,record_open,diff
0,C00011089,2013-01-13T00:00:00.000000000+01:00,2014-01-13T09:42:00.000000000+01:00,526182
1,C00008371,2012-12-05T08:00:00.000000000+01:00,2013-12-05T08:40:00.000000000+01:00,525640
2,C00010471,2013-01-07T07:00:00.000000000+01:00,2014-01-06T13:30:00.000000000+01:00,524550
3,C00010470,2013-01-07T07:45:00.000000000+01:00,2014-01-06T13:29:00.000000000+01:00,524504
4,C00006938,2012-11-27T08:00:00.000000000+01:00,2013-11-21T09:20:00.000000000+01:00,517040
...,...,...,...,...
95,C00008073,2013-12-02T00:00:00.000000000+01:00,2013-12-03T10:57:00.000000000+01:00,2097
96,C00015876,2014-02-26T08:00:00.000000000+01:00,2014-02-27T18:11:00.000000000+01:00,2051
97,C00006948,2013-11-19T09:00:00.000000000+01:00,2013-11-20T18:30:00.000000000+01:00,2010
98,C00011955,2014-01-20T00:00:00.000000000+01:00,2014-01-21T08:56:00.000000000+01:00,1976
