In [79]:
import matplotlib.pyplot as plt

from pathlib import Path

from promg.modules.db_management import DBManagement
from tabulate import tabulate
import yaml

from promg import Configuration, DatabaseConnection, Performance, SemanticHeader, DatasetDescriptions, OcedPg, Query


import pandas as pd
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go


In [80]:
case_study = 'bpic14'


In [81]:
conf_path = Path(case_study, 'config.yaml')
config = yaml.safe_load(open(conf_path))

print(f"These are the credentials that I expect to be set for the database.")
print(f"db_name: {config['db_name']}")
print(f"uri: {config['uri']}")
print(f"password: {config['password']}")
print("----------------------")
print(f"If you have other credentials, please change them at: {conf_path}")

These are the credentials that I expect to be set for the database.
db_name: neo4j
uri: bolt://localhost:7687
password: bpic2014
----------------------
If you have other credentials, please change them at: bpic14/config.yaml


In [82]:
config = Configuration.init_conf_with_config_file(conf_path)
db_connection = DatabaseConnection.set_up_connection(config=config)

Basic node type counts

In [83]:
query_1 = """
MATCH (n)
RETURN labels(n)[0] AS node_type, count(*) AS count
"""

result_1 = db_connection.exec_query(query_1)
summary_df_1 = pd.DataFrame(result_1)

fig_1 = px.bar(summary_df_1, x='node_type', y='count', title='Node Counts per Type')
fig_1.show()


Incidents per CI_SC (Affected)

In [84]:
query_2 = """
MATCH (ci:CI_SC)<-[:AFFECTED_CI_SC]-(i:Incident)
RETURN ci.sysId AS ci_sc, count(i) AS num_incidents
"""

df_incidents = pd.DataFrame(db_connection.exec_query(query_2))
print(df_incidents.head())

fig_2 = px.scatter(df_incidents, x='ci_sc', y='num_incidents',
                 title='Number of Incidents per CI_SC')
fig_2.show()


                 ci_sc  num_incidents
0  SUB000508_WBS000162            309
1  WBA000124_WBS000088            615
2  SBA000263_WBS000072           2176
3  WBA000058_WBS000073           1614
4  SBA000462_WBS000073           1698


Incidents per CI_SC (caused_by)

In [85]:
query_3 = """
MATCH (ci:CI_SC)<-[:CAUSED_BY_CI_SC]-(i:Incident)
RETURN ci.sysId AS ci_sc, count(i) AS num_incidents
"""

df_incidents_2 = pd.DataFrame(db_connection.exec_query(query_3))
print(df_incidents.head())

fig_3 = px.scatter(df_incidents_2, x='ci_sc', y='num_incidents',
                 title='Number of Incidents per CI_SC')
fig_3.show()

                 ci_sc  num_incidents
0  SUB000508_WBS000162            309
1  WBA000124_WBS000088            615
2  SBA000263_WBS000072           2176
3  WBA000058_WBS000073           1614
4  SBA000462_WBS000073           1698


Interactions per CI_SC (Affected)

In [86]:
query_4 = """
MATCH (ci:CI_SC)<-[:AFFECTED_CI_SC]-(i:Interaction)
RETURN ci.sysId AS ci_sc, count(i) AS num_interactions
"""

df_interactions = pd.DataFrame(db_connection.exec_query(query_4))
print(df_interactions.head())

fig_4 = px.scatter(df_interactions, x='ci_sc', y='num_interactions',
                 title='Number of Interactions per CI_SC')
fig_4.show()


                 ci_sc  num_interactions
0  HMD000002_WBS000195                 2
1  SBA000167_WBS000296                28
2  SBA000599_WBS000140                 1
3  ASW000010_WBS000284                23
4  STA000026_WBS000284                 2


Incidents vs Interactions (both affected)

In [87]:
df_combined = pd.merge(df_incidents, df_interactions, on='ci_sc', how='outer').fillna(0)

fig_5 = px.scatter(df_combined, x='num_interactions', y='num_incidents',
                 hover_data=['ci_sc'],
                 title='Incidents vs Interactions per affected CI_SC')
fig_5.show()


Changes on Affected vs Caused_by CI_SC

- affected_ci: the CI_SC affected by the interaction
- causing_ci: the CI_SC that caused the incident
- ch1_count: number of changes affecting the affected CI_SC
- ch2_count: number of changes affecting the causing CI_SC

Points along the diagonal show CIs that both cause and are affected by a similar number of changes. These might be tightly linked components.

In [88]:
query_5 = """
MATCH (c1:CI_SC)<-[:AFFECTED_CI_SC]-(int:Interaction)
      -[:RELATED_INCIDENT]->(inc:Incident)
      -[:CAUSED_BY_CI_SC]->(c2:CI_SC)
WHERE c1 <> c2
WITH c1, c2
OPTIONAL MATCH (c1)<-[:AFFECTED_CI_SC]-(ch1:Change)
WITH c1, c2, count(ch1) AS ch1_count
OPTIONAL MATCH (c2)<-[:AFFECTED_CI_SC]-(ch2:Change)
RETURN c1.sysId AS affected_ci,
       c2.sysId AS causing_ci,
       ch1_count,
       count(ch2) AS ch2_count
"""

df_q_5 = pd.DataFrame(db_connection.exec_query(query_5))
print(df_q_5.head())

fig_6 = px.scatter(
    df_q_5,
    x='ch1_count',
    y='ch2_count',
    hover_data=['affected_ci', 'causing_ci'],
    title='Changes on Affected vs Causing CI_SC',
    labels={'ch1_count': 'Changes on Affected CI_SC', 'ch2_count': 'Changes on Causing CI_SC'}
)
fig_6.show()



           affected_ci           causing_ci  ch1_count  ch2_count
0  SBA000759_WBS000296  SBA000167_WBS000296        168          4
1  SBA000028_WBS000030  SBA000599_WBS000140         11          6
2  WBA000148_WBS000172  SPF000017_WBS000136          2         17
3  SPF000018_WBS000136  SPF000017_WBS000136         14         17
4  ASW000010_WBS000284  STA000026_WBS000284        120          2


Correlation between Incidents, Interactions & Changes per CI_SC (Affected relations)

In [89]:
query_6 = """
MATCH (ci:CI_SC)
OPTIONAL MATCH (ci)<-[:AFFECTED_CI_SC]-(inc:Incident)
WITH ci, count(inc) AS num_incidents
OPTIONAL MATCH (ci)<-[:AFFECTED_CI_SC]-(int:Interaction)
WITH ci, num_incidents, count(int) AS num_interactions
OPTIONAL MATCH (ci)<-[:AFFECTED_CI_SC]-(ch:Change)
WITH ci, num_incidents, num_interactions, count(ch) AS num_changes
WHERE num_interactions > 0 OR num_changes > 0
RETURN ci.sysId AS ci_sc, num_incidents, num_interactions, num_changes
"""
df_q_6 = pd.DataFrame(db_connection.exec_query(query_6))
print(df_q_6.head())

fig_7 = px.scatter_matrix(
    df_q_6,
    dimensions=['num_incidents', 'num_interactions', 'num_changes'],
    hover_data=['ci_sc'],
    title='Correlation Between Incidents, Interactions & Changes per CI_SC Based on Affected relations (Filtered (>0))',
)

fig_7.update_traces(diagonal_visible=False)
fig_7.update_layout(height=700, width=900)
fig_7.show()

                 ci_sc  num_incidents  num_interactions  num_changes
0  HMD000002_WBS000195              2                 2           13
1  DTA000266_WBS000135              0                 0            2
2  LSR000699_WBS000161              0                 0            2
3  WSR000561_WBS000102              0                 0            1
4  ESC000024_WBS000232              0                 0            1


In [90]:
fig_3d = px.scatter_3d(
    df_q_6,
    x='num_interactions', y='num_incidents', z='num_changes',
    hover_data=['ci_sc'],
    title='3D Correlation Between Incidents, Interactions & Changes per CI_SC'
)
fig_3d.update_layout(height=700, width=700)
fig_3d.show()


### Teporal analysis for CI_SC

Durations

In [106]:
queries = {
    "CI_SC": """
        MATCH (ci:CI_SC)<-[:CORR]-(e:Event)
        WITH ci.sysId AS ci_sc, e.timestamp AS event_timestamp
        WITH ci_sc, min(event_timestamp) AS earliest_ts, max(event_timestamp) AS latest_ts
        RETURN ci_sc, earliest_ts, latest_ts,
            (duration.inSeconds(earliest_ts, latest_ts).seconds)/3600 AS duration_hours
        ORDER BY ci_sc
    """,
    "Change": """
        MATCH (ch:Change)<-[:CORR]-(e:Event)
        WITH ch.changeId AS change_id, e.timestamp AS event_timestamp
        WITH change_id, min(event_timestamp) AS earliest_ts, max(event_timestamp) AS latest_ts
        RETURN change_id, earliest_ts, latest_ts,
            (duration.inSeconds(earliest_ts, latest_ts).seconds)/3600 AS duration_hours
        ORDER BY change_id
    """,
    "Incident": """
        MATCH (inc:Incident)<-[:CORR]-(e:Event)
        WITH inc.incidentId AS incident_id, e.timestamp AS event_timestamp
        WITH incident_id, min(event_timestamp) AS earliest_ts, max(event_timestamp) AS latest_ts
        RETURN incident_id, earliest_ts, latest_ts,
            (duration.inSeconds(earliest_ts, latest_ts).seconds)/3600 AS duration_hours
        ORDER BY incident_id
    """,
    "Interaction": """
        MATCH (int:Interaction)<-[:CORR]-(e:Event)
        WITH int.interactionId AS interaction_id, e.timestamp AS event_timestamp
        WITH interaction_id, min(event_timestamp) AS earliest_ts, max(event_timestamp) AS latest_ts
        RETURN interaction_id, earliest_ts, latest_ts,
            (duration.inSeconds(earliest_ts, latest_ts).seconds)/3600 AS duration_hours
        ORDER BY interaction_id

    """
}

all_dfs = []
for obj_type, cypher_query in queries.items():
    df_dur = pd.DataFrame(db_connection.exec_query(cypher_query))
    if not df_dur.empty:
        df_dur["object_type"] = obj_type  
        all_dfs.append(df_dur)

df_dur = pd.concat(all_dfs, ignore_index=True)
df_dur["duration_hours"] = pd.to_numeric(df["duration_hours"], errors="coerce")

print(df_dur.head())
print(df_dur["object_type"].value_counts())


                 ci_sc                          earliest_ts  \
0       #N/B_WBS000284  2013-10-02T09:54:00.000000000+01:00   
1  ACS000001_WBS000252  2013-10-21T13:44:00.000000000+01:00   
2  ADB000001_WBS000253  2013-11-04T10:03:00.000000000+01:00   
3  ADB000002_WBS000253  2013-10-22T15:26:00.000000000+01:00   
4  ADB000003_WBS000253  2013-12-06T15:10:00.000000000+01:00   

                             latest_ts  duration_hours object_type change_id  \
0  2014-03-21T10:47:00.000000000+01:00            4080       CI_SC       NaN   
1  2013-10-23T16:47:00.000000000+01:00              51       CI_SC       NaN   
2  2013-11-12T09:14:00.000000000+01:00             191       CI_SC       NaN   
3  2013-10-22T15:26:00.000000000+01:00               0       CI_SC       NaN   
4  2014-01-14T14:00:00.000000000+01:00             934       CI_SC       NaN   

  incident_id interaction_id  
0         NaN            NaN  
1         NaN            NaN  
2         NaN            NaN  
3         NaN   

In [123]:
fig_8 = sp.make_subplots(
    rows=2, cols=2,
    subplot_titles=["CI_SC", "Change", "Incident", "Interaction"],
    shared_xaxes=False
)

colors = {
    "CI_SC": "red",
    "Change": "orange",
    "Incident": "blue",
    "Interaction": "green"
}

subplot_map = {
    "CI_SC": (1,1),
    "Change": (1,2),
    "Incident": (2,1),
    "Interaction": (2,2)
}

for obj_type, (row, col) in subplot_map.items():
    sub_df = df_dur[df_dur["object_type"] == obj_type]
    if not sub_df.empty:
        fig_8.add_trace(
            go.Histogram(
                x=sub_df["duration_hours"],
                name=obj_type,
                marker_color=colors[obj_type],
                nbinsx=70,
                hovertemplate=(
                    f"<b>{obj_type}</b><br>" +
                    "Duration range: %{x} hours<br>" +
                    "# of objects: %{y}<extra></extra>"
                )
            ),
            row=row, col=col
        )

fig_8.update_xaxes(title_text="Duration (hours)", row=2, col=1)
fig_8.update_xaxes(title_text="Duration (hours)", row=2, col=2)
fig_8.update_xaxes(title_text="Duration (hours)", row=1, col=1)
fig_8.update_xaxes(title_text="Duration (hours)", row=1, col=2)
fig_8.update_yaxes(title_text="# of CI_SC", row=1, col=1)
fig_8.update_yaxes(title_text="# of Changes", row=1, col=2)
fig_8.update_yaxes(title_text="# of Incidents", row=2, col=1)
fig_8.update_yaxes(title_text="# of Interactions", row=2, col=2)


fig_8.update_layout(
    height=900,
    width=1500,
    title_text="Distribution of Object Durations (hours) per Object Type",
    showlegend=False,
    bargap=0.1
)

fig_8.show()


Number of unique incidents/interactions before, during, after changes CI_SC+ duration of changes

In [125]:
query_7= """

// collect  events per CI_SC 
MATCH (ci:CI_SC)<-[:CORR]-(e:Event)
WITH ci.sysId AS ci_sc, e
ORDER BY e.timestamp

WITH ci_sc, collect(e) AS events

// find all Start → End ChangeEvent pairs
UNWIND events AS ev
WITH ci_sc, events, ev
WHERE ev:ChangeEvent AND ev.activity = "Start"
WITH ci_sc, events, ev AS start_event

// Find nearest End after Start
WITH ci_sc, events, start_event, 
     [e IN events WHERE e:ChangeEvent AND e.activity = "End" AND e.timestamp > start_event.timestamp] AS possible_ends
WITH ci_sc, start_event, head(possible_ends) AS end_event, events
WHERE end_event IS NOT NULL

// counts for each window
WITH ci_sc, start_event, end_event, events,
     [e IN events WHERE (e:IncidentEvent OR e:IncidentActivityEvent) AND e.timestamp < start_event.timestamp] AS inc_before,
     [e IN events WHERE (e:IncidentEvent OR e:IncidentActivityEvent) AND e.timestamp >= start_event.timestamp AND e.timestamp <= end_event.timestamp] AS inc_between,
     [e IN events WHERE (e:IncidentEvent OR e:IncidentActivityEvent) AND e.timestamp > end_event.timestamp] AS inc_after,
     [e IN events WHERE e:InteractionEvent AND e.timestamp < start_event.timestamp] AS int_before,
     [e IN events WHERE e:InteractionEvent AND e.timestamp >= start_event.timestamp AND e.timestamp <= end_event.timestamp] AS int_between,
     [e IN events WHERE e:InteractionEvent AND e.timestamp > end_event.timestamp] AS int_after

RETURN
  ci_sc,
  start_event.timestamp AS start_time,
  end_event.timestamp AS end_time,
  size(apoc.coll.toSet([e IN inc_before | e.incidentId])) AS n_incidents_before,
  size(apoc.coll.toSet([e IN inc_between | e.incidentId])) AS n_incidents_between,
  size(apoc.coll.toSet([e IN inc_after | e.incidentId])) AS n_incidents_after,
  size(apoc.coll.toSet([e IN int_before | e.interactionId])) AS n_interactions_before,
  size(apoc.coll.toSet([e IN int_between | e.interactionId])) AS n_interactions_between,
  size(apoc.coll.toSet([e IN int_after | e.interactionId])) AS n_interactions_after,
  (duration.inSeconds(datetime(start_event.timestamp), datetime(end_event.timestamp)).seconds)/3600 AS duration_hours
ORDER BY ci_sc, start_time


"""

df_q_7 = pd.DataFrame(db_connection.exec_query(query_7))
print(df_q_7.head())

                 ci_sc                           start_time  \
0  ADB000001_WBS000253  2013-11-04T10:03:00.000000000+01:00   
1  ADB000003_WBS000253  2013-12-06T15:10:00.000000000+01:00   
2  ADB000003_WBS000253  2014-01-14T13:00:00.000000000+01:00   
3  ADB000004_WBS000253  2013-11-29T12:54:00.000000000+01:00   
4  ADB000004_WBS000253  2014-01-21T08:58:00.000000000+01:00   

                              end_time  n_incidents_before  \
0  2013-11-12T09:14:00.000000000+01:00                   0   
1  2014-01-14T14:00:00.000000000+01:00                   0   
2  2014-01-14T14:00:00.000000000+01:00                   0   
3  2014-01-21T09:42:00.000000000+01:00                   0   
4  2014-01-21T09:42:00.000000000+01:00                   0   

   n_incidents_between  n_incidents_after  n_interactions_before  \
0                    0                  0                      0   
1                    0                  0                      0   
2                    0                  0   

In [126]:
# how many ci_sc have several records
counts = df_q_7['ci_sc'].value_counts()
multiple_rows = counts[counts > 1]
print(multiple_rows)


ci_sc
NET000425_WBS000118    729
NET000217_WBS000118    496
NET000426_WBS000254    385
DBR00114_WBS000224     296
DBR00113_WBS000224     217
                      ... 
WSR000963_WBS000102      2
WSR000553_WBS000102      2
MQM000346_WBS000197      2
MQM000152_WBS000197      2
LSR001054_WBS000330      2
Name: count, Length: 3707, dtype: int64


In [127]:
def to_pandas_ts(series):
    """Convert a Series of neo4j.time.DateTime or ISO strings to pandas UTC timestamps."""
    def _coerce(x):
        if hasattr(x, "to_native"):
            return x.to_native()
        return str(x) 
    return pd.to_datetime(series.map(_coerce), utc=True, errors="coerce")

df_q_7["start_time"] = to_pandas_ts(df_q_7["start_time"])
df_q_7["end_time"]   = to_pandas_ts(df_q_7["end_time"])

In [128]:
df_q_7

Unnamed: 0,ci_sc,start_time,end_time,n_incidents_before,n_incidents_between,n_incidents_after,n_interactions_before,n_interactions_between,n_interactions_after,duration_hours
0,ADB000001_WBS000253,2013-11-04 09:03:00+00:00,2013-11-12 08:14:00+00:00,0,0,0,0,0,0,191
1,ADB000003_WBS000253,2013-12-06 14:10:00+00:00,2014-01-14 13:00:00+00:00,0,0,0,0,0,0,934
2,ADB000003_WBS000253,2014-01-14 12:00:00+00:00,2014-01-14 13:00:00+00:00,0,0,0,0,0,0,1
3,ADB000004_WBS000253,2013-11-29 11:54:00+00:00,2014-01-21 08:42:00+00:00,0,0,0,0,0,0,1268
4,ADB000004_WBS000253,2014-01-21 07:58:00+00:00,2014-01-21 08:42:00+00:00,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
24488,ZOS000029_WBS000199,2014-03-01 18:00:00+00:00,2014-03-02 05:49:00+00:00,0,0,0,0,0,0,11
24489,ZOS000030_WBS000199,2014-03-01 18:00:00+00:00,2014-03-02 05:49:00+00:00,0,0,0,0,0,0,11
24490,ZOS000031_WBS000199,2014-03-01 18:00:00+00:00,2014-03-02 05:49:00+00:00,0,0,0,0,0,0,11
24491,ZOS000032_WBS000199,2014-03-01 18:00:00+00:00,2014-03-02 05:49:00+00:00,0,0,0,0,0,0,11


add visualization with average

In [129]:
fig_9 = sp.make_subplots(
    rows=2, cols=3,
    subplot_titles=(
        "Incidents vs Duration Before Change", "Incidents vs Duration During Change", "Incidents vs Duration After Change",
        "Interactions vs Duration Before Change", "Interactions vs Duration During Change", "Interactions vs Duration After Change"
    ),
    horizontal_spacing=0.08, vertical_spacing=0.18
)

def add_scatter(row, col, x, y, name):
    fig_9.add_trace(
        go.Scatter(
            x=x, y=y, mode="markers",
            name=name,
            text=df_q_7["ci_sc"],
            hovertemplate="CI_SC=%{text}<br>Duration=%{x:.2f} h<br>Count=%{y}<extra></extra>"
        ),
        row=row, col=col
    )

add_scatter(1, 1, df_q_7["duration_hours"], df_q_7["n_incidents_before"],  "Incidents (before)")
add_scatter(1, 2, df_q_7["duration_hours"], df_q_7["n_incidents_between"], "Incidents (during)")
add_scatter(1, 3, df_q_7["duration_hours"], df_q_7["n_incidents_after"],   "Incidents (after)")

add_scatter(2, 1, df_q_7["duration_hours"], df_q_7["n_interactions_before"],  "Interactions (before)")
add_scatter(2, 2, df_q_7["duration_hours"], df_q_7["n_interactions_between"], "Interactions (during)")
add_scatter(2, 3, df_q_7["duration_hours"], df_q_7["n_interactions_after"],   "Interactions (after)")

for c in (1, 2, 3):
    fig_9.update_xaxes(title_text="Change Duration (hours)", row=1, col=c)
    fig_9.update_xaxes(title_text="Change Duration (hours)", row=2, col=c)

fig_9.update_yaxes(title_text="# Incidents", row=1, col=1)
fig_9.update_yaxes(title_text="# Interactions", row=2, col=1)

fig_9.update_layout(
    height=750, width=1200,
    title="Distribution of Incidents and Interactions in different stages of Changes_9",
    showlegend=False
)

fig_9.show()
