In [None]:
# Setup and Import
from google.colab import drive
drive.mount('/content/drive')

!pip install pm4py
import pm4py
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from itertools import combinations

print("Libraries imported successfully")

In [None]:
# Load OCEL Data
# Exactly one log is active at a time. The commented-out lines are alternative
# datasets that can be swapped in; note that the last one uses the XML reader.
ocel = pm4py.read_ocel2_json("./drive/MyDrive/order-management.json")
# ocel = pm4py.read_ocel2_json("./drive/MyDrive/ContainerLogistics.json")
# ocel = pm4py.read_ocel2_json("./drive/MyDrive/datasets/ocel2-p2p.json")
# ocel = pm4py.read_ocel2_xml("./drive/MyDrive/datasets/agc.xml")
print(ocel)
# The extended table is built in the following cell, where it is actually used.
# Building it here as a bare mid-cell expression displayed nothing and only
# cost time, so that call was dropped.
print("OCEL data loaded successfully")

In [None]:
# Cell 3a: Setup and Object Type Extraction
def extract_object_types(ocel):
    """Return the object type names encoded in the extended table's columns.

    Columns whose label starts with ``ocel:type:`` are treated as object-type
    columns; the marker is stripped from each label to obtain the type name.

    Args:
        ocel: Object exposing ``get_extended_table()``, which must return a
            table with a ``columns`` attribute of string labels.

    Returns:
        list: Object type names, in the order their columns appear.
    """
    marker = 'ocel:type:'
    extended_table = ocel.get_extended_table()
    return [
        column.replace(marker, '')
        for column in extended_table.columns
        if column.startswith(marker)
    ]

# Extract object types and prepare data
# `object_types` drives every per-pair computation below; `events_df` is the
# extended table whose `ocel:type:<type>` columns the factor functions read.
object_types = extract_object_types(ocel)
events_df = ocel.get_extended_table()

# Imports used by the factor cells. NOTE(review): in the visible notebook this
# is the first place numpy is imported, so later cells depend on this cell
# having been run.
from itertools import combinations
import numpy as np
import pandas as pd

# All unordered pairs of object types. Every factor array below is indexed by
# a pair's position in `type_pairs`.
type_pairs = list(combinations(object_types, 2))
n_pairs = len(type_pairs)

print(f"Object types: {object_types}")

In [None]:
# Cell 3b: Factor 1 - O2O Relationships (40%)
# One strength slot per entry of `type_pairs` (filled by the loop at the end of
# this cell) plus a list of per-pair detail records.
o2o_strengths = np.zeros(n_pairs)
o2o_details = []

def calculate_o2o_strength(type1, type2, events_df):
    """Score how often two object types take part in the same events.

    An event (row) "involves" a type when its ``ocel:type:<type>`` column is
    non-null. The score is ``2 * shared / (n1 + n2)``, where ``n1`` and ``n2``
    are the numbers of rows involving each type and ``shared`` is the number
    of rows involving both.

    Args:
        type1: Name of the first object type.
        type2: Name of the second object type.
        events_df: Table with one ``ocel:type:<type>`` column per object type.

    Returns:
        tuple: ``(strength, details)``. ``strength`` is 0 when either column
        is missing or either type has no events. ``details`` holds the integer
        counts ``shared_events``, ``type1_events`` and ``type2_events``; the
        per-type counts are the real row counts even when the strength is 0
        (all zeros only when a column is missing).
    """
    type1_col = f'ocel:type:{type1}'
    type2_col = f'ocel:type:{type2}'

    if type1_col not in events_df.columns or type2_col not in events_df.columns:
        return 0, {"shared_events": 0, "type1_events": 0, "type2_events": 0}

    type1_mask = events_df[type1_col].notna()
    type2_mask = events_df[type2_col].notna()

    # Count on the boolean masks directly: this is row-based (so it stays
    # consistent even if index labels repeat) and avoids building Python sets
    # of index labels for every pair.
    type1_events = int(type1_mask.sum())
    type2_events = int(type2_mask.sum())
    shared_events = int((type1_mask & type2_mask).sum())

    # When either type has no events, `shared_events` is 0 and so is the score.
    total_events = type1_events + type2_events
    o2o_strength = (2 * shared_events) / total_events if total_events > 0 else 0

    details = {
        "shared_events": shared_events,
        "type1_events": type1_events,
        "type2_events": type2_events
    }

    return o2o_strength, details

# Score every object-type pair and keep both the raw strength (array slot `i`)
# and a detail record for later inspection.
print("Factor 1: O2O Relationships")
for i, (type1, type2) in enumerate(type_pairs):
    strength, details = calculate_o2o_strength(type1, type2, events_df)
    o2o_strengths[i] = strength

    # Fixed fields first, then the factor-specific counts.
    record = {
        "pair": f"{type1} ↔ {type2}",
        "type1": type1,
        "type2": type2,
        "strength": strength,
    }
    record.update(details)
    o2o_details.append(record)

    print(f"{type1} ↔ {type2}: {strength:.3f}")

In [None]:
# Cell 3c: Factor 2 - Shared Activities (30%)
# One strength slot per entry of `type_pairs` (filled by the loop at the end of
# this cell) plus a list of per-pair detail records.
activity_strengths = np.zeros(n_pairs)
activity_details = []

def calculate_activity_strength(type1, type2, events_df):
    """Jaccard overlap of the activities that involve each of two object types.

    For each type, the distinct ``ocel:activity`` values of the rows whose
    ``ocel:type:<type>`` column is non-null are collected; the score is
    ``|intersection| / |union|`` of the two activity sets.

    Args:
        type1: Name of the first object type.
        type2: Name of the second object type.
        events_df: Table with ``ocel:activity`` and ``ocel:type:<type>`` columns.

    Returns:
        tuple: ``(strength, details)``. ``strength`` is 0 when a type column is
        missing or a type has no rows. ``details`` holds the lists
        ``shared_activities``, ``type1_activities`` and ``type2_activities``
        (empty in the degenerate cases; element order is not defined).
    """
    empty_details = {"shared_activities": [], "type1_activities": [], "type2_activities": []}

    type_columns = [f'ocel:type:{name}' for name in (type1, type2)]
    if any(column not in events_df.columns for column in type_columns):
        return 0, empty_details

    # Index labels of the rows in which each type appears.
    row_labels = [events_df[events_df[column].notna()].index for column in type_columns]
    if any(len(labels) == 0 for labels in row_labels):
        return 0, empty_details

    first_activities, second_activities = (
        set(events_df.loc[labels, 'ocel:activity'].unique()) for labels in row_labels
    )
    common = first_activities & second_activities
    union = first_activities | second_activities

    if union:
        activity_strength = len(common) / len(union)
    else:
        activity_strength = 0

    return activity_strength, {
        "shared_activities": list(common),
        "type1_activities": list(first_activities),
        "type2_activities": list(second_activities)
    }

# Score every object-type pair, store the result, and report the overlap as
# "shared / union" activity counts.
print("Factor 2: Shared Activities")
for i, (type1, type2) in enumerate(type_pairs):
    strength, details = calculate_activity_strength(type1, type2, events_df)
    activity_strengths[i] = strength

    # Fixed fields first, then the factor-specific activity lists.
    record = {
        "pair": f"{type1} ↔ {type2}",
        "type1": type1,
        "type2": type2,
        "strength": strength,
    }
    record.update(details)
    activity_details.append(record)

    shared_count = len(details['shared_activities'])
    total_count = len(set(details['type1_activities']).union(details['type2_activities']))
    print(f"{type1} ↔ {type2}: {strength:.3f} ({shared_count}/{total_count})")
    if shared_count > 0:
        print(f"  Shared: {details['shared_activities']}")

In [None]:
# Cell 3d: Factor 3 - Co-occurrence in Cases (20%)
# One strength slot per entry of `type_pairs` (filled by the loop at the end of
# this cell) plus a list of per-pair detail records.
cooccurrence_strengths = np.zeros(n_pairs)
cooccurrence_details = []

def calculate_cooccurrence_strength(type1, type2, events_df):
    """Jaccard overlap of the event ids that involve each of two object types.

    A "case" in this function is a distinct ``ocel:eid`` value taken from the
    rows whose ``ocel:type:<type>`` column is non-null. The score is
    ``|intersection| / |union|`` of the two id sets.

    Args:
        type1: Name of the first object type.
        type2: Name of the second object type.
        events_df: Table with ``ocel:eid`` and ``ocel:type:<type>`` columns.

    Returns:
        tuple: ``(strength, details)``. ``strength`` is 0 when a type column is
        missing or a type has no rows. ``details`` holds the lists
        ``shared_cases``, ``type1_cases`` and ``type2_cases`` (empty in the
        degenerate cases; element order is not defined).
    """
    def _no_overlap():
        # Result used whenever the two types cannot be compared.
        return 0, {"shared_cases": [], "type1_cases": [], "type2_cases": []}

    type1_col, type2_col = f'ocel:type:{type1}', f'ocel:type:{type2}'
    available = events_df.columns
    if not (type1_col in available and type2_col in available):
        return _no_overlap()

    # Index labels of the rows in which each type appears.
    type1_rows = events_df[events_df[type1_col].notna()].index
    type2_rows = events_df[events_df[type2_col].notna()].index
    if not (len(type1_rows) and len(type2_rows)):
        return _no_overlap()

    type1_cases = set(events_df.loc[type1_rows, 'ocel:eid'].unique())
    type2_cases = set(events_df.loc[type2_rows, 'ocel:eid'].unique())

    shared_cases = type1_cases.intersection(type2_cases)
    all_cases = type1_cases.union(type2_cases)

    cooccurrence_strength = len(shared_cases) / len(all_cases) if all_cases else 0

    return cooccurrence_strength, {
        "shared_cases": list(shared_cases),
        "type1_cases": list(type1_cases),
        "type2_cases": list(type2_cases)
    }

# Score every object-type pair, store the result, and report the overlap as
# "shared / union" counts.
print("Factor 3: Co-occurrence in Cases")
for i, (type1, type2) in enumerate(type_pairs):
    strength, details = calculate_cooccurrence_strength(type1, type2, events_df)
    cooccurrence_strengths[i] = strength

    # Fixed fields first, then the factor-specific id lists.
    record = {
        "pair": f"{type1} ↔ {type2}",
        "type1": type1,
        "type2": type2,
        "strength": strength,
    }
    record.update(details)
    cooccurrence_details.append(record)

    shared_count = len(details['shared_cases'])
    total_count = len(set(details['type1_cases']).union(details['type2_cases']))
    print(f"{type1} ↔ {type2}: {strength:.3f} ({shared_count}/{total_count})")

In [None]:

# Cell 3e: Factor 4 - Temporal Proximity (10%) - FAST WITH TIME DIFFERENCES

# For each pair:
# Logic - 1. Find all the shared events.
#         2. For each unshared event, add a large time value (7 days).
#         3. For each shared event, add the timestamp difference.
#         4. The larger the total_time, the smaller the strength.
# One strength slot per entry of `type_pairs` (filled by the loop at the end of
# this cell) plus a list of per-pair detail records.
temporal_strengths = np.zeros(n_pairs)
temporal_details = []

def calculate_temporal_strength_with_diff(type1, type2, events_df):
    """Score the temporal proximity of two object types.

    Every row of ``events_df`` that does not involve both types adds a fixed
    penalty of seven days (in seconds); every shared row adds a time
    difference. The total is divided by the worst case (all rows unshared) and
    subtracted from 1, so a larger total gives a smaller strength.

    NOTE(review): the shared-event difference is first attempted by parsing
    the two ``ocel:type:<type>`` columns as datetimes. When that fails, the
    fallback does not use real timestamps: it draws two seeded uniform(0, 60)
    vectors and sums their absolute differences, so the contribution depends
    only on the number of shared events. Confirm this placeholder is intended.

    Args:
        type1: Name of the first object type.
        type2: Name of the second object type.
        events_df: Table with one ``ocel:type:<type>`` column per object type.

    Returns:
        tuple: ``(strength, details)`` with ``strength`` in ``[0, 1]`` and
        ``details`` holding ``total_time``, ``shared_events``,
        ``total_events`` and ``shared_time_contrib``. All four keys are
        present on every return path.
    """
    type1_col = f'ocel:type:{type1}'
    type2_col = f'ocel:type:{type2}'

    if type1_col not in events_df.columns or type2_col not in events_df.columns:
        # Include "shared_time_contrib" here too: callers read it for every pair.
        return 0, {
            "total_time": float('inf'),
            "shared_events": 0,
            "total_events": 0,
            "shared_time_contrib": 0
        }

    # Rows in which both types appear.
    shared_events_mask = events_df[type1_col].notna() & events_df[type2_col].notna()

    m = int(shared_events_mask.sum())
    n = len(events_df)

    # Penalty, in seconds, charged for each event the two types do not share.
    large_value = 7 * 24 * 3600
    total_time = (n - m) * large_value

    shared_time_contribution = 0
    if m > 0:
        shared_events_df = events_df[shared_events_mask]

        try:
            # 1: Works only if the type columns hold datetime-like values.
            obj1_times = pd.to_datetime(shared_events_df[type1_col])
            obj2_times = pd.to_datetime(shared_events_df[type2_col])
            time_diffs = abs((obj1_times - obj2_times).dt.total_seconds())
            shared_time_contribution = time_diffs.sum()
        except Exception:
            # 2: Synthetic 0-60 second differences (see NOTE above). A private
            # RandomState seeded with 42 yields the same numbers as seeding the
            # global generator, without disturbing the global RNG state that
            # other cells may rely on. `Exception` (rather than a bare except)
            # lets KeyboardInterrupt / SystemExit propagate.
            rng = np.random.RandomState(42)
            time_variations1 = rng.uniform(0, 60, m)
            time_variations2 = rng.uniform(0, 60, m)
            shared_time_contribution = abs(time_variations1 - time_variations2).sum()

    total_time += shared_time_contribution

    # Convert to strength: 0 when nothing is shared, approaching 1 as the
    # total shrinks.
    max_possible_time = n * large_value
    temporal_strength = max(0, 1 - (total_time / max_possible_time)) if max_possible_time > 0 else 0

    details = {
        "total_time": total_time,
        "shared_events": m,
        "total_events": n,
        "shared_time_contrib": shared_time_contribution
    }

    return temporal_strength, details

# Score every object-type pair for temporal proximity, with periodic progress
# reporting and a timing summary at the end.
print("Factor 4: Temporal Proximity (WITH TIME DIFFERENCES)")
print(f"Processing {len(type_pairs)} pairs with actual time differences...")
print("-" * 70)

import time
start_time = time.time()

for i, (type1, type2) in enumerate(type_pairs):
    strength, details = calculate_temporal_strength_with_diff(type1, type2, events_df)
    temporal_strengths[i] = strength
    temporal_details.append({
        "pair": f"{type1} ↔ {type2}",
        "type1": type1,
        "type2": type2,
        "strength": strength,
        **details
    })

    m = details['shared_events']
    n = details['total_events']
    total_time = details['total_time']
    shared_contrib = details['shared_time_contrib']

    print(f"{type1} ↔ {type2}: {strength:.3f} (shared:{m}/{n}, "
          f"total_time:{total_time:.0f}s, shared_contrib:{shared_contrib:.0f}s)")

    # Progress line every 25 pairs.
    if (i + 1) % 25 == 0:
        elapsed = time.time() - start_time
        # The clock may not have advanced yet on a coarse timer; avoid
        # dividing by zero and report an unbounded rate instead.
        rate = (i + 1) / elapsed if elapsed > 0 else float('inf')
        eta = (len(type_pairs) - i - 1) / rate if rate > 0 else 0
        print(f"  Progress: {i+1}/{len(type_pairs)} ({(i+1)/len(type_pairs)*100:.1f}%) - "
              f"Rate: {rate:.1f}/sec - ETA: {eta:.1f}s")

elapsed_total = time.time() - start_time
print(f"\nCompleted in {elapsed_total:.2f} seconds!")
# With fewer than two object types there are no pairs to average over.
if type_pairs:
    print(f"Average time per pair: {elapsed_total/len(type_pairs)*1000:.1f}ms")

In [None]:
# Cell 3f: Combined 4-Factor Strength Calculation
# Weighted sum of the four per-pair factor strengths, capped at 1.0.
weights = {"o2o": 0.4, "activity": 0.3, "cooccurrence": 0.2, "temporal": 0.1}
combined_strengths = np.zeros(n_pairs)
combined_details = []

print("Combined 4-Factor Strengths:")
for i, (type1, type2) in enumerate(type_pairs):
    # Factor values for this pair; all four arrays share the `type_pairs` order.
    o2o_str, act_str, coo_str, tmp_str = (
        o2o_strengths[i],
        activity_strengths[i],
        cooccurrence_strengths[i],
        temporal_strengths[i],
    )

    # Accumulate left to right so the floating-point result is unchanged.
    combined_strength = weights["o2o"] * o2o_str
    combined_strength = combined_strength + weights["activity"] * act_str
    combined_strength = combined_strength + weights["cooccurrence"] * coo_str
    combined_strength = combined_strength + weights["temporal"] * tmp_str

    combined_strength = min(combined_strength, 1.0)
    combined_strengths[i] = combined_strength

    record = {
        "pair": f"{type1} ↔ {type2}",
        "type1": type1,
        "type2": type2,
        "combined_strength": combined_strength,
    }
    record["o2o_strength"] = o2o_str
    record["activity_strength"] = act_str
    record["cooccurrence_strength"] = coo_str
    record["temporal_strength"] = tmp_str
    combined_details.append(record)

    print(f"{type1} ↔ {type2}: {combined_strength:.3f} "
          f"[O2O:{o2o_str:.3f} Act:{act_str:.3f} Coo:{coo_str:.3f} Tmp:{tmp_str:.3f}]")

In [None]:
# Adding edges to graph and assigning strength to edges
import numpy as np
import networkx as nx
from itertools import combinations
import time

def create_relationship_graph_and_cluster(object_types, combined_strengths):
    """Build a weighted graph over object types and return its connected components.

    Every pair with a strength greater than 0 becomes an edge carrying
    ``weight`` (the strength) and ``type`` (``"strong"``, ``"medium"`` or
    ``"weak"``). Edge types come from adaptive thresholds: the 75th and 25th
    percentiles of the non-zero strengths, unless the strengths are nearly
    uniform (range < 0.1 or coefficient of variation < 0.2), in which case all
    edges get one category chosen from the mean strength.

    Every object type is added as a node up front, so a type without any
    positive-strength relationship shows up as its own single-node cluster
    instead of disappearing from the result.

    Args:
        object_types: Sequence of object type names. Pairs are enumerated with
            ``itertools.combinations(object_types, 2)``.
        combined_strengths: Sequence with one strength per pair, in that same
            pair order.

    Returns:
        tuple: ``(clusters, G)`` where ``clusters`` is the list of node sets
        returned by ``nx.connected_components`` and ``G`` is the graph.

    Raises:
        ValueError: If ``combined_strengths`` does not have exactly one entry
            per object-type pair (e.g. a stale array from an earlier run).
    """
    type_pairs = list(combinations(object_types, 2))
    total_pairs = len(type_pairs)

    if len(combined_strengths) != total_pairs:
        raise ValueError(
            f"Expected {total_pairs} combined strengths (one per object-type pair), "
            f"got {len(combined_strengths)}"
        )

    G = nx.Graph()
    # Nodes would otherwise only be created as a side effect of add_edge.
    G.add_nodes_from(object_types)

    print(f"Processing {total_pairs} type pairs with all 4 factors...")

    all_strengths = []
    pair_strengths = {}

    print("\nUsing provided combined strengths array...")
    for i, (type1, type2) in enumerate(type_pairs):
        print(f"Processing pair {i+1}/{total_pairs}: {type1} ↔ {type2}")

        strength = combined_strengths[i]
        pair_strengths[(type1, type2)] = strength
        if strength > 0:
            all_strengths.append(strength)

        print(f"  → Strength: {strength:.3f}")

    if len(all_strengths) == 0:
        print("\nNo relationships found!")
        # No edges: every object type is its own cluster.
        return list(nx.connected_components(G)), G

    all_strengths = np.array(all_strengths)
    print(f"\nStrength Distribution:")
    print(f"  Non-zero relationships: {len(all_strengths)}")
    print(f"  Min: {all_strengths.min():.3f}")
    print(f"  Max: {all_strengths.max():.3f}")
    print(f"  Mean: {all_strengths.mean():.3f}")
    print(f"  Std: {all_strengths.std():.3f}")

    strength_range = all_strengths.max() - all_strengths.min()
    coefficient_of_variation = all_strengths.std() / all_strengths.mean() if all_strengths.mean() > 0 else 0

    print(f"  Range: {strength_range:.3f}")
    print(f"  Coefficient of Variation: {coefficient_of_variation:.3f}")

    if strength_range < 0.1 or coefficient_of_variation < 0.2:
        # Nearly uniform strengths: percentiles would split them arbitrarily,
        # so put every edge in one category. A threshold of 2 can never be
        # reached and a threshold just below the minimum is always reached.
        if all_strengths.mean() > 0.6:
            strong_threshold = all_strengths.min() - 0.001
            medium_threshold = -1
            category = "strong"
        elif all_strengths.mean() > 0.3:
            strong_threshold = 2
            medium_threshold = all_strengths.min() - 0.001
            category = "medium"
        else:
            strong_threshold = 2
            medium_threshold = 2
            category = "weak"

        print(f"\nEdge Case Detected: All strengths are similar (range={strength_range:.3f})")
        print(f"   Assigning all relationships as '{category}'")

    else:
        strong_threshold = np.percentile(all_strengths, 75)
        medium_threshold = np.percentile(all_strengths, 25)

        print(f"\nAdaptive Thresholds (Percentile-based):")
        print(f"  Strong (top 25%): ≥ {strong_threshold:.3f}")
        print(f"  Medium (middle 50%): {medium_threshold:.3f} to {strong_threshold:.3f}")
        print(f"  Weak (bottom 25%): < {medium_threshold:.3f}")

    print(f"\nBuilding graph with adaptive thresholds...")

    edge_counts = {"strong": 0, "medium": 0, "weak": 0}

    for (type1, type2), strength in pair_strengths.items():
        if strength > 0:
            if strength >= strong_threshold:
                edge_type = "strong"
            elif strength >= medium_threshold:
                edge_type = "medium"
            else:
                edge_type = "weak"

            G.add_edge(type1, type2, weight=strength, type=edge_type)
            edge_counts[edge_type] += 1
            print(f"  {type1} ↔ {type2}: {strength:.3f} ({edge_type})")

    print(f"\nGraph Construction Summary:")
    print(f"  Strong edges: {edge_counts['strong']}")
    print(f"  Medium edges: {edge_counts['medium']}")
    print(f"  Weak edges: {edge_counts['weak']}")
    print(f"  Total edges: {sum(edge_counts.values())}")

    # Clusters are connected components over ALL edges, regardless of type.
    clusters = list(nx.connected_components(G))
    print(f"\nClustering complete! Found {len(clusters)} clusters")

    return clusters, G

# Build the relationship graph from the combined strengths and time the call.
# `clusters` and `relationship_graph` are consumed by the visualization and
# cluster-analysis cells below.
start_time = time.time()

print("Starting adaptive threshold relationship analysis...")
clusters, relationship_graph = create_relationship_graph_and_cluster(object_types, combined_strengths)

end_time = time.time()
print(f"Total processing time: {end_time - start_time:.2f} seconds")

In [None]:
# Plotting heat map and Graph
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def visualize_clusters_separately(clusters, relationship_graph, object_types):
    """Show a similarity heatmap and a network drawing, then print summaries.

    Args:
        clusters: Iterable of node collections, one per cluster.
        relationship_graph: Graph whose edges carry ``weight`` and ``type``
            (``"strong"``, ``"medium"`` or ``"weak"``) attributes. Every edge
            endpoint must be listed in ``object_types``.
        object_types: List of object type names; fixes the row/column order of
            the heatmap.

    Returns:
        None. Two matplotlib figures are shown and statistics are printed.
    """
    # Similarity matrix: 1.0 on the diagonal, edge weight where an edge exists,
    # 0 elsewhere.
    n_types = len(object_types)
    similarity_matrix = np.zeros((n_types, n_types))
    np.fill_diagonal(similarity_matrix, 1.0)

    for u, v, data in relationship_graph.edges(data=True):
        i = object_types.index(u)
        j = object_types.index(v)
        similarity_matrix[i, j] = data['weight']
        similarity_matrix[j, i] = data['weight']

    # Heatmap
    plt.figure(figsize=(9, 9))
    sns.heatmap(similarity_matrix,
                xticklabels=object_types,
                yticklabels=object_types,
                annot=True,
                fmt='.3f',
                cmap='RdYlBu_r',
                center=0.5,
                square=True,
                linewidths=0.5,
                cbar_kws={'label': 'Similarity Strength'})

    plt.title('Object Type Similarity Matrix', fontsize=16, fontweight='bold')
    plt.xlabel('Object Types', fontsize=12)
    plt.ylabel('Object Types', fontsize=12)
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

    # Network graph
    plt.figure(figsize=(9, 9))

    # A fixed seed makes the layout (and thus the figure) reproducible.
    pos = nx.spring_layout(relationship_graph, k=3, iterations=50, seed=42)

    nx.draw_networkx_nodes(relationship_graph, pos,
                          node_color='lightblue',
                          node_size=3000,
                          alpha=0.7)

    nx.draw_networkx_labels(relationship_graph, pos,
                           font_size=10,
                           font_weight='bold')

    # Partition edges by category in one pass; unknown categories are ignored.
    edges_by_type = {'strong': [], 'medium': [], 'weak': []}
    for u, v, d in relationship_graph.edges(data=True):
        if d['type'] in edges_by_type:
            edges_by_type[d['type']].append((u, v))

    strong_edges = edges_by_type['strong']
    medium_edges = edges_by_type['medium']
    weak_edges = edges_by_type['weak']

    # (category, legend label, colour, line width, alpha), drawn in this order.
    edge_styles = (
        ('strong', 'Strong', 'red', 3, 0.8),
        ('medium', 'Medium', 'orange', 2, 0.6),
        ('weak', 'Weak', 'gray', 1, 0.4),
    )
    for edge_type, legend_label, color, width, alpha in edge_styles:
        if edges_by_type[edge_type]:
            nx.draw_networkx_edges(relationship_graph, pos, edgelist=edges_by_type[edge_type],
                                  edge_color=color, width=width, alpha=alpha, label=legend_label)

    edge_labels = {(u, v): f"{d['weight']:.3f}" for u, v, d in relationship_graph.edges(data=True)}
    nx.draw_networkx_edge_labels(relationship_graph, pos, edge_labels, font_size=8)

    plt.title('Object Type Relationship Network', fontsize=16, fontweight='bold')
    # Only the edge collections carry labels; without any drawn edges a legend
    # would be empty and matplotlib would warn.
    if strong_edges or medium_edges or weak_edges:
        plt.legend()
    plt.axis('off')
    plt.tight_layout()
    plt.show()

    # Cluster summary
    print(f"\nCluster Summary:")
    for i, cluster in enumerate(clusters, 1):
        print(f"Cluster {i}: {list(cluster)} ({len(cluster)} nodes)")

    # Network statistics
    total_edges = relationship_graph.number_of_edges()
    edge_types = {'strong': len(strong_edges), 'medium': len(medium_edges), 'weak': len(weak_edges)}

    print(f"\nNetwork Statistics:")
    for edge_type, count in edge_types.items():
        percentage = (count / total_edges * 100) if total_edges > 0 else 0
        print(f"  {edge_type.capitalize()} edges: {count} ({percentage:.1f}%)")

    print(f"  Total edges: {total_edges}")
    print(f"  Total clusters: {len(clusters)}")
    print(f"  Total object types: {len(object_types)}")

# Render the heatmap and network figures for the clusters computed above.
visualize_clusters_separately(clusters, relationship_graph, object_types)

In [None]:
# Cluster Analysis
def analyze_cluster_strong_relationships(clusters, relationship_graph):
    """Print a per-cluster, per-object report of relationships.

    For every object in every cluster the report lists its strong
    relationships and then all of its relationships to other members of the
    same cluster, both ordered by descending weight. A global summary of
    strong relationships and per-cluster edge counts follows.

    Args:
        clusters: Sequence of node collections, one per cluster.
        relationship_graph: Graph-like object providing ``edges(data=True)``,
            ``subgraph(nodes)`` and ``number_of_edges()``; each edge's data
            must contain ``weight`` and ``type``.

    Returns:
        None. The report is written with ``print``.
    """
    wide_rule = '-' * 70
    cluster_rule = '-' * 50
    object_rule = '-' * 40
    symbols = {'strong': '●', 'medium': '◐', 'weak': '○'}

    def by_weight(entry):
        # Sort key: the weight of an (other_node, weight, edge_type) entry.
        return entry[1]

    print(f"\n{wide_rule}")
    print("Clustering Analysis")
    print(f"{wide_rule}")

    for cluster_idx, cluster in enumerate(clusters, 1):
        members = list(cluster)

        print(f"\n{cluster_rule}")
        print(f"CLUSTER {cluster_idx}")
        print(f"{cluster_rule}")
        print(f"Cluster Members: {members}")
        print(f"Total Objects: {len(members)}")

        for obj in members:
            print(f"\n{object_rule}")
            print(f"OBJECT: {obj}")
            print(f"{object_rule}")

            # Edges touching `obj` whose other endpoint is a different member
            # of the same cluster (self-loops are skipped).
            neighbours = []
            for u, v, data in relationship_graph.edges(data=True):
                if u == obj:
                    partner = v
                elif v == obj:
                    partner = u
                else:
                    continue
                if partner != obj and partner in members:
                    neighbours.append((partner, data['weight'], data['type']))

            strong_only = sorted(
                (entry for entry in neighbours if entry[2] == 'strong'),
                key=by_weight,
                reverse=True,
            )

            if not strong_only:
                print("Strong Relationships: None")
            else:
                print(f"Strong Relationships ({len(strong_only)}):")
                for partner, weight, _ in strong_only:
                    print(f"  → {partner}: {weight:.3f}")

            # All relationships, for context.
            if not neighbours:
                print("\nAll Relationships: None (isolated node)")
            else:
                print(f"\nAll Relationships within Cluster ({len(neighbours)}):")
                for partner, weight, edge_type in sorted(neighbours, key=by_weight, reverse=True):
                    marker = symbols.get(edge_type, '?')
                    print(f"  {marker} {partner}: {weight:.3f} ({edge_type})")

    # Global summary of strong relationships.
    print(f"\n{wide_rule}")
    print("STRONG RELATIONSHIPS")
    print(f"{wide_rule}")

    strong_pairs = [
        (u, v)
        for u, v, data in relationship_graph.edges(data=True)
        if data['type'] == 'strong'
    ]
    print(f"Total Strong Relationships: {len(strong_pairs)}")

    # Number of strong edges incident to each object, in first-seen order.
    strong_counts = {}
    for u, v in strong_pairs:
        for endpoint in (u, v):
            strong_counts[endpoint] = strong_counts.get(endpoint, 0) + 1

    if strong_counts:
        print(f"\nObjects with Most Strong Relationships:")
        ranking = sorted(strong_counts.items(), key=lambda item: item[1], reverse=True)
        for obj, count in ranking[:5]:  # Top 5
            cluster_num = next(
                position
                for position, cluster in enumerate(clusters, 1)
                if obj in cluster
            )
            print(f"  {obj}: {count} strong relationships (Cluster {cluster_num})")

    # Per-cluster edge statistics.
    print(f"\nCluster Statistics:")
    for cluster_idx, cluster in enumerate(clusters, 1):
        members = list(cluster)
        cluster_view = relationship_graph.subgraph(members)

        strong_in_cluster = sum(
            1 for _, _, data in cluster_view.edges(data=True) if data['type'] == 'strong'
        )
        edges_in_cluster = cluster_view.number_of_edges()

        print(f"  Cluster {cluster_idx}: {len(members)} nodes, {edges_in_cluster} edges, {strong_in_cluster} strong edges")

    print(f"\n{wide_rule}")

# Print the per-cluster relationship report for the clusters computed above.
analyze_cluster_strong_relationships(clusters, relationship_graph)