In [1]:
import pickle
import networkx as nx
from collections import defaultdict
import time
import numpy as np
import dsbcorr
import random
from scipy.spatial import distance
from sklearn.cluster import AgglomerativeClustering
from itertools import groupby
import pandas as pd
from copy import deepcopy
from sklearn.preprocessing import normalize
from topcorr import tmfg

In [2]:
def extract_service_type(node_label):
    return '-'.join(node_label.split('-')[:2])

def weighted_random_walk(graph, start_node, walk_length, weight='weight'):
    """
    Perform a single weighted random walk starting from a given node.
    
    Parameters:
    - graph: NetworkX graph
    - start_node: Node to start the random walk from
    - walk_length: Length of the random walk
    - weight: Edge attribute to use as weight (default is 'weight')
    """
    path = [start_node]
    for _ in range(walk_length - 1):
        current_node = path[-1]
        neighbors = list(graph.neighbors(current_node))
        if not neighbors:  # If no neighbors, end the walk
            break
        
        # Get weights of edges to neighbors
        weights = np.array([graph[current_node][neighbor].get(weight, 1) for neighbor in neighbors])
        probabilities = weights / weights.sum()  # Normalize to get probabilities
        
        # Choose the next node based on the probabilities
        next_node = random.choices(neighbors, weights=probabilities, k=1)[0]
        path.append(next_node)
    
    return path

def characterize_nodes_weighted(graph, walk_length=10, num_walks=100, weight='weight'):
    """
    Characterize all nodes in the graph using weighted random walks.
    
    Parameters:
    - graph: NetworkX graph
    - walk_length: Length of each random walk
    - num_walks: Number of random walks per node
    - weight: Edge attribute to use as weight
    
    Returns:
    - node_characteristics: Dictionary of visit frequencies for each node
    """
    node_characteristics = {}
    for node in graph.nodes:
        # Collect weighted random walk paths starting from this node
        all_walks = [weighted_random_walk(graph, node, walk_length, weight) for _ in range(num_walks)]
        
        # Compute visit frequencies for each node
        visit_counts = {}
        for walk in all_walks:
            for visited_node in walk:
                visit_counts[visited_node] = visit_counts.get(visited_node, 0) + 1
        
        # Normalize frequencies
        total_visits = sum(visit_counts.values())
        visit_frequencies = {n: count / total_visits for n, count in visit_counts.items()}
        
        # Store the characteristics
        node_characteristics[node] = visit_frequencies
    return node_characteristics

def get_normal_label(labels):
    unique, counts = np.unique(labels, return_counts=True)
    max_count_index = np.argmax(counts)
    return unique[max_count_index]

def get_normal_mat(labels, r_dict, normal_label, fault_label):
    first_index = np.where(labels == fault_label)[0][0]
    normal_indexes = np.where((labels == normal_label) & (np.arange(len(labels)) < first_index))[0]
    avg_network = np.zeros((np.shape(r_dict[0])))
    for idx in normal_indexes:
        avg_network = avg_network + r_dict[idx]
    return avg_network/len(normal_indexes)

In [8]:
from tqdm import trange

hts_res_dict = {}
for experiment in trange(0,4):
    with open(f"../data/fault_data/hts_fault_{experiment}.pkl", "rb") as f:
        s_list, X = pickle.load(f)
    ind = 0
    mapping = {}
    for service in s_list:
        x = service.split('-')
        x = x[:-2]
        x = '-'.join(x)
        mapping[ind] = x
        ind+=1
        
    # Start FC-FDL
    win = 360
    step = 1
    Xc = deepcopy(X)
    Xdiff = np.diff(Xc[:1080,:],axis=0)
    Xdiff = normalize(Xdiff,axis=0)
    r_dict = dsbcorr.rolling_window(Xdiff[:,:], win, "tapered", step, 0.3, "tmfg", "pearsons")
    
    r_pos = {}
    for corr in r_dict:
        r_pos[corr]=deepcopy(r_dict[corr])
        r_pos[corr] += 1
    
    spectra = dsbcorr.all_spectra(r_pos, True)
    all_es = spectra.T
    all_es = all_es[:,:-1]
    
    start_time = time.time()
    landmarks = random.sample(range(0, all_es.shape[0], 1), int(len(all_es) / 4))
    dist = distance.cdist(all_es[landmarks, :], all_es, 'euclidean')
    coords = dsbcorr.LMDS(dist, landmarks, 2)
    
    n_clusters = 2
    clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage="average").fit(coords)
    labels=clustering.labels_
    
    # Find the order of first instance cluster appearance
    order_of_clusters = [key for key, _ in groupby(labels)]
    
    # Initialize an empty list to store unique clusters in the order of first appearance
    unique_clusters = []
    
    # Set to keep track of seen clusters
    seen = set()
    
    # Iterate through the labels
    for label in labels:
        if label not in seen:
            unique_clusters.append(label)
            seen.add(label)
    
    network_centroids = {}
    for i in range(0,2):
        avg_network = np.zeros((np.shape(r_dict[0])))
        for n in np.where(labels == unique_clusters[i])[0]:
            avg_network = avg_network+r_dict[n]
        avg_network = avg_network/len(np.where(labels == i))
        network_centroids[i] = deepcopy(avg_network)
        
    ms_slist = s_list
    service_type_mapping = {i: extract_service_type(label) for i, label in enumerate(ms_slist)}
    
    # normal_label = get_normal_label(labels)
    # normal_mat = get_normal_mat(labels, r_dict, 0, 1)
    normal_mat = network_centroids[0]
    # normal_mat = remove_indices(normal_mat, non_ms, non_ms)
    normal_mat = normal_mat/np.max(normal_mat)
    normal_mat[np.where((normal_mat < 0.1) & (normal_mat > -0.1))] = 0
    # normal_mat = nx.to_numpy_array(tmfg(normal_mat))
    
    
    abnormal_mat = network_centroids[1]
    # abnormal_mat = remove_indices(abnormal_mat, non_ms, non_ms)
    abnormal_mat = abnormal_mat/np.max(abnormal_mat)
    abnormal_mat[np.where((abnormal_mat < 0.1) & (abnormal_mat > -0.1))] = 0
    # abnormal_mat = nx.to_numpy_array(tmfg(abnormal_mat))
    
    sub_G = abnormal_mat - normal_mat
    matrix = np.abs(sub_G)
    # matrix = pmfg(np.abs(sub_G))  # Assuming pmfg function processes the matrix appropriately
    G = nx.from_numpy_array(sub_G)  # Create a graph from the adjacency matrix
    
    node_characteristics = characterize_nodes_weighted(G, walk_length=10, num_walks=100, weight='weight')
    total_f = {n:0 for n in range(0,47)}
    for i, characteristics in node_characteristics.items():
        for node in characteristics:
            total_f[node]+=characteristics[node]
    combined_frequencies = defaultdict(float)
    
    i = 0
    mapping = {}
    for service in s_list:
        x = service.split('-')
        x = x[:-2]
        x = '-'.join(x)
        mapping[i] = x
        i+=1
        
    for service_id, frequency in total_f.items():
        service_type = mapping.get(service_id, 'unknown')
        combined_frequencies[service_type] += frequency
    
    # Convert the combined frequencies dictionary to a pandas dataframe
    combined_frequencies_df = pd.DataFrame(
        list(combined_frequencies.items()), columns=["Service Type", "Combined Frequency"]
    )
    end_time = time.time()
    ####
    
    t_elapsed = end_time - start_time
    sorted_pr = combined_frequencies_df.sort_values(by="Combined Frequency", ascending=False)
    top_5 = list(sorted_pr.head(5)["Service Type"])
    hts_res_dict[experiment] = (top_5, t_elapsed)

100%|██████████| 4/4 [00:54<00:00, 13.69s/it]


In [7]:
fault_label = 1
normal_label = 0
first_index = np.where(labels == fault_label)[0][0]
normal_indexes = np.where((labels == normal_label) & (np.arange(len(labels)) < first_index))[0]
avg_network = np.zeros((np.shape(r_dict[0])))
for idx in normal_indexes:
    avg_network = avg_network + r_dict[idx]

In [None]:
import matplotlib.pyplot as plt
plt.plot(labels)

In [4]:
cp_res_dict = {}
for experiment in range(0,4):
    with open(f"../data/fault_data/cp_fault_{experiment}.pkl", "rb") as f:
        s_list, X = pickle.load(f)
    ind = 0
    mapping = {}
    for service in s_list:
        x = service.split('-')
        x = x[:-2]
        x = '-'.join(x)
        mapping[ind] = x
        ind+=1
    # Start CausalRCA
    win = 360
    step = 1
    Xc = deepcopy(X)
    Xdiff = np.diff(Xc[:1080,:],axis=0)
    Xdiff = normalize(Xdiff,axis=0)
    r_dict = dsbcorr.rolling_window(Xdiff[:,:], win, "tapered", step, 0.3, "tmfg", "pearsons")
    
    r_pos = {}
    for corr in r_dict:
        r_pos[corr]=deepcopy(r_dict[corr])
        r_pos[corr] += 1
    
    spectra = dsbcorr.all_spectra(r_pos, True)
    all_es = spectra.T
    all_es = all_es[:,:-1]
    
    start_time = time.time()
    landmarks = random.sample(range(0, all_es.shape[0], 1), int(len(all_es) / 4))
    dist = distance.cdist(all_es[landmarks, :], all_es, 'euclidean')
    coords = dsbcorr.LMDS(dist, landmarks, 2)
    
    n_clusters = 2
    clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage="average").fit(coords)
    labels=clustering.labels_
    
    # Find the order of first instance cluster appearance
    order_of_clusters = [key for key, _ in groupby(labels)]
    
    # Initialize an empty list to store unique clusters in the order of first appearance
    unique_clusters = []
    
    # Set to keep track of seen clusters
    seen = set()
    
    # Iterate through the labels
    for label in labels:
        if label not in seen:
            unique_clusters.append(label)
            seen.add(label)
    
    network_centroids = {}
    for i in range(0,2):
        avg_network = np.zeros((np.shape(r_dict[0])))
        for n in np.where(labels == unique_clusters[i])[0]:
            avg_network = avg_network+r_dict[n]
        avg_network = avg_network/len(np.where(labels == i))
        network_centroids[i] = deepcopy(avg_network)
        
    ms_slist = s_list
    service_type_mapping = {i: extract_service_type(label) for i, label in enumerate(ms_slist)}
    
    # normal_label = get_normal_label(labels)
    # normal_mat = get_normal_mat(labels, r_dict, normal_label, 1)
    normal_mat = network_centroids[0]
    # normal_mat = remove_indices(normal_mat, non_ms, non_ms)
    # normal_mat = normal_mat/np.max(normal_mat)
    normal_mat = normal_mat/len(network_centroids[0])
    normal_mat[np.where((normal_mat < 0.1) & (normal_mat > -0.1))] = 0
    normal_mat = nx.to_numpy_array(tmfg(normal_mat))
    
    
    abnormal_mat = network_centroids[1]
    # abnormal_mat = remove_indices(abnormal_mat, non_ms, non_ms)
    # abnormal_mat = abnormal_mat/np.max(abnormal_mat)
    abnormal_mat = normal_mat/len(network_centroids[1])
    abnormal_mat[np.where((abnormal_mat < 0.1) & (abnormal_mat > -0.1))] = 0
    abnormal_mat = nx.to_numpy_array(tmfg(abnormal_mat))
    
    sub_G = abnormal_mat - normal_mat
    matrix = np.abs(sub_G)
    # matrix = pmfg(np.abs(sub_G))  # Assuming pmfg function processes the matrix appropriately
    G = nx.from_numpy_array(sub_G)  # Create a graph from the adjacency matrix
    
    node_characteristics = characterize_nodes_weighted(G, walk_length=10, num_walks=100, weight='weight')
    total_f = {n:0 for n in range(0,47)}
    for i, characteristics in node_characteristics.items():
        for node in characteristics:
            total_f[node]+=characteristics[node]
    combined_frequencies = defaultdict(float)
    
    i = 0
    mapping = {}
    for service in s_list:
        x = service.split('-')
        x = x[:-2]
        x = '-'.join(x)
        mapping[i] = x
        i+=1
        
    for service_id, frequency in total_f.items():
        service_type = mapping.get(service_id, 'unknown')
        combined_frequencies[service_type] += frequency
    
    # Convert the combined frequencies dictionary to a pandas dataframe
    combined_frequencies_df = pd.DataFrame(
        list(combined_frequencies.items()), columns=["Service Type", "Combined Frequency"]
    )
    end_time = time.time()
    ####
    
    t_elapsed = end_time - start_time
    sorted_pr = combined_frequencies_df.sort_values(by="Combined Frequency", ascending=False)
    top_5 = list(sorted_pr.head(5)["Service Type"])
    cp_res_dict[experiment] = (top_5, t_elapsed)

In [27]:
print(hts_res_dict)

{0: (['user-memcached', 'text-service', 'compose-post-service', 'user-service', 'post-storage-service'], 14.031043291091919), 1: (['user-service', 'media-service', 'post-storage-service', 'user-memcached', 'text-service'], 13.797024488449097), 2: (['compose-post-service', 'user-timeline-service', 'media-service', 'text-service', 'post-storage-memcached'], 13.856050491333008), 3: (['home-timeline-service', 'compose-post-service', 'url-shorten-service', 'user-memcached', 'user-mention-service'], 13.882007360458374)}
