In [1]:
#import torch
#import torch.nn.functional as F
#from torch_geometric.nn import GATConv, SAGEConv
#from torch_geometric.data import Data
import networkx as nx
import pygraphviz as pgv
import matplotlib.pyplot as plt
import re
from bs4 import BeautifulSoup
import os
from hashlib import md5
#from sklearn.model_selection import train_test_split
from networkx.algorithms.isomorphism import GraphMatcher
from datetime import datetime
import pickle
import shutil
# Record the date and time at which this cell starts executing.
# datetime.now() is called without a tz argument, so the value is naive local time.
# NOTE(review): `dt` is not referenced anywhere in the visible part of the file.
dt = datetime.now()

import matplotlib.pyplot as plt
import re

def strip_html_tags(text):
    """Return the text content of *text* with any HTML markup removed.

    The input is parsed with BeautifulSoup's ``html.parser`` backend and the
    parser's ``stripped_strings`` fragments are concatenated with no
    separator between them.
    """
    soup = BeautifulSoup(text, "html.parser")
    fragments = list(soup.stripped_strings)
    return ''.join(fragments)

# Convert a clock-cycle label such as "@21+41k" into a single integer (21 + 41 = 62)
def convert_clock_cycle(label):
    """Turn a clock-cycle edge label into an integer cycle count.

    HTML markup is removed from *label* first. The cleaned text must start
    with ``@<base>``, optionally followed by ``+<n>k``; the result is
    ``base + n`` (``n`` counts as 0 when the ``+<n>k`` part is absent).

    Returns 0 when the cleaned label does not start with ``@<digits>``.
    """
    text = strip_html_tags(label)
    parsed = re.match(r'@(\d+)(\+(\d+)k)?', text)
    if parsed is None:
        return 0

    base_part, _, k_part = parsed.groups()
    total = int(base_part)
    if k_part:
        # The "+<n>k" term is added with a weight of one.
        total += int(k_part)
    return total

# Load a Graphviz .dot file as a directed graph whose edge labels are integers
def parse_dot_file(dot_path):
    """Read *dot_path* and return it as a ``networkx.DiGraph``.

    The file is loaded through pygraphviz and converted to a DiGraph. Every
    edge that carries a ``label`` attribute has that label replaced, in
    place, by the integer produced by ``convert_clock_cycle``; edges without
    a label are left untouched.
    """
    agraph = pgv.AGraph(file=dot_path)
    graph = nx.DiGraph(agraph)

    for _src, _dst, attrs in graph.edges(data=True):
        if 'label' not in attrs:
            continue
        attrs['label'] = convert_clock_cycle(attrs['label'])

    return graph

def sanitize_filename(filename):
    """Return *filename* with each of ``< > : " / \\ | ? *`` replaced by ``_``.

    All other characters (spaces, parentheses, dots, ...) are kept as they
    are, and the length of the string does not change.
    """
    invalid_chars = re.compile(r'[<>:"/\\|?*]')
    return invalid_chars.sub('_', filename)

# Fingerprint a knowledge graph so that equal patterns can be looked up by key
def hash_graph(G):
    """Return a hex digest that identifies the pattern of graph *G*.

    The Weisfeiler-Lehman graph hash of *G* is computed with networkx's
    default arguments (no node or edge attribute names are passed), and the
    resulting string is then hashed with MD5.
    """
    wl_digest = nx.weisfeiler_lehman_graph_hash(G)
    hasher = md5(wl_digest.encode('utf-8'))
    return hasher.hexdigest()

def plot_graph(G, title, highlight_edges=None):
    """Draw *G* and save the figure as ``<sanitized title>.png``.

    Args:
        G: graph to draw; the ``label`` edge attribute is shown on the edges.
        title: figure title. It is also used, after ``sanitize_filename``,
            as the stem of the output file name, so two calls with the same
            title write to the same file.
        highlight_edges: optional list of edges redrawn in thick red on top
            of the base drawing. Nothing extra is drawn when it is empty or
            ``None``.

    The file is written relative to the current working directory and the
    figure is closed afterwards.
    """
    # A fixed seed keeps the spring layout identical between runs.
    layout = nx.spring_layout(G, seed=42)
    plt.figure(figsize=(10, 8))
    nx.draw(
        G,
        layout,
        with_labels=True,
        node_size=500,
        node_color='lightblue',
        edge_color='gray',
        font_size=12,
        font_weight='bold',
    )

    # Edge labels on a semi-transparent white box.
    label_box = {'facecolor': 'white', 'edgecolor': 'none', 'alpha': 0.7}
    nx.draw_networkx_edge_labels(
        G,
        layout,
        edge_labels=nx.get_edge_attributes(G, 'label'),
        font_color='red',
        label_pos=0.3,
        bbox=label_box,
    )

    # Redraw the requested edges in red on top of the grey ones.
    if highlight_edges:
        nx.draw_networkx_edges(G, layout, edgelist=highlight_edges, edge_color='red', width=2.5)

    plt.title(title)

    # The title may contain characters that are not valid in file names.
    output_path = f'{sanitize_filename(title)}.png'
    plt.savefig(output_path)
    plt.close()
# Build the undirected "knowledge graph" view of a parsed graph
def create_knowledge_graph(G):
    """Return an undirected ``networkx.Graph`` built from *G*.

    Each node of *G* is copied over with a single attribute, ``features``,
    that holds the node's attribute dict from *G* (the same dict object, not
    a copy). Each edge of *G* is added with its attributes; because the
    result is undirected, edges ``u -> v`` and ``v -> u`` end up as one edge.
    """
    kg = nx.Graph()

    # Nodes: wrap the original attribute dict under the "features" key.
    for node, attrs in G.nodes(data=True):
        kg.add_node(node, features=attrs)

    # Edges: carry the original edge attributes across unchanged.
    for src, dst, edge_attrs in G.edges(data=True):
        kg.add_edge(src, dst, **edge_attrs)

    return kg

# Classify a graph as "Spinner", "Wormhole", "Tunnel and Wormhole", "Tunnel" or "Unknown"
def classify_pattern(G):
    """Classify the pattern of graph *G* from its edges.

    The rules are applied in this order:

    1. Edges are scanned in graph order. The first edge whose two endpoint
       names contain integers (the first run of digits in each name) that
       differ by 2 or more is the "long jump"; its second endpoint is the
       long-jump node. Edges with an endpoint name that has no digits are
       skipped.
    2. If a long jump exists, the nodes reachable from the long-jump node
       (depth-first search, including the node itself) are inspected:
         * any of them has a self-loop                   -> "Spinner"
         * otherwise, long-jump node is not M0..M3       -> "Wormhole"
         * otherwise, none of them is the first endpoint
           of an edge whose second endpoint is M0..M3    -> "Tunnel and Wormhole"
         * otherwise                                     -> "Unknown"
    3. If no long jump exists, "Tunnel" is returned when some edge has both
       endpoints among M0..M3, and "Unknown" otherwise.

    Args:
        G: a networkx graph whose node names are strings such as "M0".

    Returns:
        One of "Spinner", "Wormhole", "Tunnel and Wormhole", "Tunnel",
        "Unknown".
    """
    # The four entry nodes M0, M1, M2 and M3.
    entry_nodes = {f"M{i}" for i in range(4)}

    long_jump_node = None
    for u, v in G.edges(data=False):
        try:
            u_index = int(re.findall(r'\d+', u)[0])
            v_index = int(re.findall(r'\d+', v)[0])
        except (IndexError, ValueError):
            # At least one endpoint has no number in its name.
            continue

        if abs(u_index - v_index) >= 2:
            long_jump_node = v
            break

    if long_jump_node is None:
        has_entry_edge = any(
            u in entry_nodes and v in entry_nodes for u, v in G.edges(data=False)
        )
        return "Tunnel" if has_entry_edge else "Unknown"

    # One traversal serves both the self-loop check and the back-edge check.
    reachable = list(nx.dfs_postorder_nodes(G, source=long_jump_node))

    if any(G.has_edge(node, node) for node in reachable):
        return "Spinner"

    if long_jump_node not in entry_nodes:
        return "Wormhole"

    # Collect once every node that is the first endpoint of an edge leading
    # into an entry node, instead of rescanning all edges per reachable node.
    back_edge_sources = {u for u, v in G.edges(data=False) if v in entry_nodes}
    if any(node in back_edge_sources for node in reachable):
        return "Unknown"
    return "Tunnel and Wormhole"

# Base folder path
# Root directory of the data set being analysed. The commented-out line is an
# alternative data set location; both are absolute, machine-specific paths.
#base_folder = "/mnt/labdrive/zliu12/capri6_rootcause_data/lasersweep/lasersweep_asconsbox_bitslice/"
base_folder = "/media/dillibabu/PortableSSD/Exp/capri6_rootcause_data/lasersweep/lasersweep_verifypin5/Pinverify5/"
# Sub-folders of base_folder that are scanned for training and testing .dot files.
training_dotfiles_folder_path = os.path.join(base_folder, 'training_dotfiles') 
testing_dotfiles_folder_path = os.path.join(base_folder, 'testing_dotfiles') 
# Function to strip HTML tags from labels (if present)
# NOTE: strip_html_tags is already defined earlier in this file with the same
# body; this second definition rebinds the name and is the one in effect for
# every call made after this point.
def strip_html_tags(text):
    """Return the text content of *text* with HTML markup removed.

    The text fragments produced by BeautifulSoup's ``stripped_strings`` are
    joined without a separator.
    """
    parsed = BeautifulSoup(text, "html.parser")
    return ''.join(parsed.stripped_strings)



from collections import defaultdict
# Module-level accumulators, one independent list per core. The
# gather_and_classify_* functions below append matching file names to them.
core0, core1, core2, core3, core4, core5 = ([] for _ in range(6))



def gather_and_classify_training_dot_files(training_dotfiles_folder_path, exclude_folders=('coremajority',)):
    """Collect the training ``*uarchi_abstract.dot`` files and group them by core.

    A directory entry is accepted when its name ends with
    ``uarchi_abstract.dot``, does not contain ``core3`` (core3 files are
    gathered by the testing counterpart of this function) and does not
    contain any of the *exclude_folders* substrings.

    Accepted files are grouped by the first of ``core0``, ``core1``,
    ``core2``, ``core4``, ``core5`` found in the file name. The bare file
    name is also appended to the module-level list of the same name.
    Accepted files that match none of these are returned in the flat list
    only.

    Args:
        training_dotfiles_folder_path: directory to scan (not recursive).
        exclude_folders: iterable of substrings; a file whose name contains
            any of them is skipped.

    Returns:
        A tuple ``(training_dotfiles, dot_files, categories_count)``:
        the list of all accepted file paths, a ``defaultdict(list)`` mapping
        category -> file paths, and a ``defaultdict(int)`` mapping
        category -> number of files.

    Raises:
        OSError: if the directory cannot be listed (from ``os.listdir``).
    """
    dot_files = defaultdict(list)  # To store files by categories
    categories_count = defaultdict(int)  # To count files in each category
    training_dotfiles = []

    # os.listdir returns entries in arbitrary order; sorting makes the
    # returned lists (and everything derived from their order) reproducible.
    for dot_file in sorted(os.listdir(training_dotfiles_folder_path)):
        if not dot_file.endswith('uarchi_abstract.dot'):
            continue
        if 'core3' in dot_file or any(exclude in dot_file for exclude in exclude_folders):
            continue

        full_path = os.path.join(training_dotfiles_folder_path, dot_file)
        training_dotfiles.append(full_path)

        # No core3 branch: names containing 'core3' were rejected above.
        category = None
        if 'core0' in dot_file:
            category = 'core0'
            core0.append(dot_file)
        elif 'core1' in dot_file:
            category = 'core1'
            core1.append(dot_file)
        elif 'core2' in dot_file:
            category = 'core2'
            core2.append(dot_file)
        elif 'core4' in dot_file:
            category = 'core4'
            core4.append(dot_file)
        elif 'core5' in dot_file:
            category = 'core5'
            core5.append(dot_file)

        # Add more elif cases for other cores if needed

        if category:
            dot_files[category].append(full_path)
            categories_count[category] += 1

    # Print the total number of .dot files in each category
    for category, count in categories_count.items():
        print(f'Category: {category}, Number of .dot files: {count}')

    return training_dotfiles, dot_files, categories_count


def gather_and_classify_testing_dot_files(testing_dotfiles_folder_path, exclude_folders=('coremajority',)):
    """Collect the testing ``*uarchi_abstract.dot`` files, i.e. those of core3.

    A directory entry is accepted when its name ends with
    ``uarchi_abstract.dot``, contains ``core3`` and does not contain any of
    the *exclude_folders* substrings. Every accepted file therefore belongs
    to the single category ``core3``; its bare file name is also appended to
    the module-level ``core3`` list.

    Args:
        testing_dotfiles_folder_path: directory to scan (not recursive).
        exclude_folders: iterable of substrings; a file whose name contains
            any of them is skipped.

    Returns:
        A tuple ``(testing_dotfiles, dot_files, categories_count)``:
        the list of all accepted file paths, a ``defaultdict(list)`` mapping
        category -> file paths, and a ``defaultdict(int)`` mapping
        category -> number of files.

    Raises:
        OSError: if the directory cannot be listed (from ``os.listdir``).
    """
    dot_files = defaultdict(list)  # To store files by categories
    categories_count = defaultdict(int)  # To count files in each category
    testing_dotfiles = []

    # os.listdir returns entries in arbitrary order; sorting makes the
    # returned lists (and everything derived from their order) reproducible.
    for dot_file in sorted(os.listdir(testing_dotfiles_folder_path)):
        if not (dot_file.endswith('uarchi_abstract.dot') and 'core3' in dot_file):
            continue
        if any(exclude in dot_file for exclude in exclude_folders):
            continue

        full_path = os.path.join(testing_dotfiles_folder_path, dot_file)
        testing_dotfiles.append(full_path)

        # The filter above guarantees the name contains 'core3'.
        core3.append(dot_file)
        dot_files['core3'].append(full_path)
        categories_count['core3'] += 1

    # Print the total number of .dot files in each category
    for category, count in categories_count.items():
        print(f'Category: {category}, Number of .dot files: {count}')

    return testing_dotfiles, dot_files, categories_count


# Gather the input files. Both calls bind category_type and categories_count,
# so after the second call those two names hold the testing results only.
training_dot_file_paths,  category_type, categories_count = gather_and_classify_training_dot_files(training_dotfiles_folder_path)

testing_dot_file_paths, category_type, categories_count = gather_and_classify_testing_dot_files(testing_dotfiles_folder_path)
#-----------------------

# Stop early when no testing .dot files were found
if not testing_dot_file_paths:
    print("No .dot files found in the testing directory. Please check the directory path.")
else:
    # Load the list from the file
    #with open('train_dot_file_paths.pkl', 'rb') as file:
        #train_dot_file_paths = pickle.load(file)
    # Every gathered testing file is used as the test set (no split is done here)
    tb_cc_test_files = testing_dot_file_paths

    # Parse and prepare training knowledge graphs and collect patterns.
    # All four dicts are keyed by the pattern hash of the knowledge graph:
    #   train_patterns               -> last parsed graph with that hash
    #   pattern_count_train          -> number of files with that hash
    #   pattern_graphs_train         -> every parsed graph with that hash
    #   pattern_classification_train -> classification of the last graph with that hash
    train_patterns = {}
    pattern_count_train = {}
    pattern_graphs_train = {}
    pattern_classification_train = {}
    for path in training_dot_file_paths:
        train_G = parse_dot_file(path)
        train_kg = create_knowledge_graph(train_G)
        pattern_hash = hash_graph(train_kg)
        train_patterns[pattern_hash] = train_G
        pattern_count_train[pattern_hash] = pattern_count_train.get(pattern_hash, 0) + 1
        if pattern_hash not in pattern_graphs_train:
            pattern_graphs_train[pattern_hash] = []
        pattern_graphs_train[pattern_hash].append(train_G)
        # The directed graph, not the knowledge graph, is what gets classified.
        pattern_type = classify_pattern(train_G)
        pattern_classification_train[pattern_hash] = pattern_type

    # Parse and prepare testing knowledge graphs and collect patterns
    # (same four dict layouts as for training).
    test_patterns = {}
    pattern_count_test = {}
    pattern_graphs_test = {}
    pattern_classification_test = {}
    # Per-file counters; they are updated below but not printed in the visible
    # part of the file.
    seen_in_training = 0
    not_seen_in_training = 0
    # NOTE(review): unknown_pattern_dot_files and known_pattern_dot_files are
    # never filled in the visible part of the file.
    unknown_pattern_dot_files = []
    known_pattern_dot_files = []
    new_pattern_dot_files = []

    for path in tb_cc_test_files:
        test_G = parse_dot_file(path)
        test_kg = create_knowledge_graph(test_G)
        pattern_hash = hash_graph(test_kg)
        test_patterns[pattern_hash] = test_G
        pattern_count_test[pattern_hash] = pattern_count_test.get(pattern_hash, 0) + 1
        if pattern_hash not in pattern_graphs_test:
            pattern_graphs_test[pattern_hash] = []
        pattern_graphs_test[pattern_hash].append(test_G)
        pattern_type = classify_pattern(test_G)
        pattern_classification_test[pattern_hash] = pattern_type
        
        # One plot per test file, with every edge highlighted. The file name is
        # derived from the title, so files sharing a hash overwrite the same PNG.
        if pattern_hash in train_patterns:
            seen_in_training += 1
            plot_graph(test_G, title=f"Test Pattern Seen in Training: {pattern_hash}", highlight_edges=list(test_G.edges()))
        else:
            not_seen_in_training += 1
            plot_graph(test_G, title=f"Test Pattern Not Seen in Training: {pattern_hash}", highlight_edges=list(test_G.edges()))
            new_pattern_dot_files.append(path)
    
    # Split the distinct testing pattern hashes into those that also occur in
    # training ("known") and those that do not ("unknown"). These counts are per
    # distinct pattern, unlike the per-file counters above.
    unknown_patterns = []
    known_patterns = []
    no_unknown_pattern = 0
    no_known_pattern = 0

    for pattern_hash, pattern_type in pattern_classification_test.items():
        #print('pattern_hash', pattern_hash)
        if pattern_hash not in pattern_classification_train:
            unknown_patterns.append(pattern_hash)
            no_unknown_pattern += 1
        else:
            known_patterns.append(pattern_hash)
            no_known_pattern += 1
        


    # The per-hash counts add up to the number of files processed in each set.
    total_train_samples = sum(pattern_count_train.values())
    total_test_samples = sum(pattern_count_test.values())
    
    print("\n====== Training ======\n")
    print(f"Total number of campaigns in the training dataset: {total_train_samples}")
    # Plot all different patterns found in the training set
    # (one PNG per distinct hash, drawn from the last graph seen with that hash)
    print("\nPlotting all different patterns in the Training set:", len(pattern_classification_train.items()))
    for pattern_hash, G in train_patterns.items():
        plot_graph(G, title=f"Training Pattern: {pattern_hash} ({pattern_classification_train[pattern_hash]})")

    # Output the classification results
    print("\nWeird Machine classifications in Training:", len(pattern_classification_train.items()))
    for pattern_hash, pattern_type in pattern_classification_train.items():
        print(f"Pattern {pattern_hash}: {pattern_type} ({pattern_count_train[pattern_hash]} samples)")
    print("\n====== Testing ======\n")
    print(f"Total number of campaigns in the testing dataset: {total_test_samples}")   
    
    
    # Plot all different patterns found in the testing set
    print("\nPlotting all different patterns in the Testing set:", len(pattern_classification_test.items()))
    for pattern_hash, G in test_patterns.items():
        plot_graph(G, title=f"Testing Pattern: {pattern_hash} ({pattern_classification_test[pattern_hash]})")

    
    print("\nWeird machine classifications in Testing:", len(pattern_classification_test.items()))
    for pattern_hash, pattern_type in pattern_classification_test.items():
        print(f"Pattern {pattern_hash}: {pattern_type} ({pattern_count_test[pattern_hash]} samples)")

    # Final summary: known vs. new patterns and the files that produced new ones.
    print("=========== Result =========\n")
    print(f"Number of weird machine patterns in the testing dataset already seen in the training dataset: {no_known_pattern}")
    print(f"Number of weird machine patterns in the testing dataset not seen in the training dataset: {no_unknown_pattern}")
    print(f"New weird machine pattern(s) in the testing dataset: {unknown_patterns}")
    print(f"New weird machine pattern(s) in the testing dataset dot file(s): {new_pattern_dot_files}")
    


Category: core2, Number of .dot files: 429
Category: core5, Number of .dot files: 442
Category: core0, Number of .dot files: 433
Category: core1, Number of .dot files: 433
Category: core4, Number of .dot files: 434
Category: core3, Number of .dot files: 435


Total number of campaigns in the training dataset: 2171

Plotting all different patterns in the Training set: 74

Weird Machine classifications in Training: 74
Pattern 2dd3fd3b2c7df350241568aaf9797189: Wormhole (881 samples)
Pattern 184a37a7bc6649027d7bc630f251903f: Wormhole (243 samples)
Pattern 3e8ac51ed1241beac4ad7b88af311791: Tunnel (331 samples)
Pattern a6f2e5ee1911c56a1599d39c521ac1bb: Wormhole (186 samples)
Pattern e4a9c4ac4b84be7668569728c8f105ad: Wormhole (127 samples)
Pattern 24a21209ad3ad01e7e8177ff4b0ec23f: Wormhole (1 samples)
Pattern 0bc05b01d66b72cc33afc1b94ec6b455: Tunnel (182 samples)
Pattern 8e0df7746bea792a82e3ef7495e240d7: Tunnel (7 samples)
Pattern 6622bc2e2e6203d8bce93101bcfa9c1d: Spinner (15 samples)
Pattern