In [1]:
import os
import networkx as nx

def get_graph(dot_file):
    """Read a DOT file and return a reduced graph plus its longest-path endpoints.

    Every node that carries a ``shape`` attribute (the "channel" nodes) is
    removed; before removal, each of its incoming edge sources is connected
    to each of its outgoing edge targets so the surrounding nodes stay linked.

    Labels are collected for *all* nodes of the file (including the ones
    that are later removed), with one pair of surrounding double quotes
    stripped when present.

    Returns:
        ``(G, source_node, target_node, node_labels)`` where ``source_node``
        and ``target_node`` are the first and last node of
        ``nx.dag_longest_path(G)``. When the reduced graph has no nodes, or
        the longest path is empty, ``([], None, None, None)`` is returned.
    """
    graph = nx.nx_agraph.read_dot(dot_file)

    def _unquote(text):
        # Drop a leading and trailing double quote when both are present.
        if text.startswith('"') and text.endswith('"'):
            return text[1:-1]
        return text

    shaped_nodes = [name for name, data in graph.nodes(data=True) if 'shape' in data]

    # Labels are captured before any node is removed, so they cover every
    # node that was present in the file.
    node_labels = {
        name: _unquote(data.get('label', ''))
        for name, data in graph.nodes(data=True)
    }

    for shaped in shaped_nodes:
        # Bridge every predecessor to every successor, then drop the node.
        for predecessor, _ in graph.in_edges(shaped):
            for _, successor in graph.out_edges(shaped):
                graph.add_edge(predecessor, successor)

        graph.remove_node(shaped)

    nothing_found = ([], None, None, None)

    if graph.number_of_nodes() == 0:
        return nothing_found

    longest = nx.dag_longest_path(graph)
    if not longest:
        return nothing_found

    # The longest path's first node is the source, its last node the target.
    return graph, longest[0], longest[-1], node_labels

def find_all_paths_between_nodes(G, source, target):
    """Collect the results of ``nx.all_simple_paths(G, source, target)`` into a list."""
    return list(nx.all_simple_paths(G, source, target))

def find_repeated_patterns(lst):
    """Return the contiguous runs of ``lst`` that occur more than once.

    Only runs of length two or more whose elements are all distinct are
    considered. Each run is reported as a tuple; the result is a set, so a
    run that appears three or more times is still listed once.
    """
    seen = set()
    repeated = set()
    total = len(lst)

    for width in range(2, total + 1):
        for start in range(total - width + 1):
            window = tuple(lst[start:start + width])

            # Skip windows that contain the same element twice.
            if len(set(window)) != width:
                continue

            if window in seen:
                repeated.add(window)
            seen.add(window)

    return repeated

def process_dot_file(dot_file):
    """Report the first path of a DOT file whose labels contain a repeated pattern.

    The graph is loaded with ``get_graph``; every path returned by
    ``find_all_paths_between_nodes`` between the longest path's endpoints is
    turned into its label sequence and checked with
    ``find_repeated_patterns``. The first path that yields any pattern is
    written to ``./results/repetition_pattern.txt`` and processing stops.

    Results are *appended* to the output file: this function is called once
    per DOT file, and opening the file in write mode would discard the
    findings of every previously processed file.

    Args:
        dot_file: Path to the DOT file to analyse.

    Returns:
        None. Output goes to stdout and to the results file.
    """
    file_name = os.path.basename(dot_file)

    # skipping this file, hardcoded
    if file_name == 'raredisease_dag.dot':
        print("Skipping processing for:", file_name)
        return

    G, source_node, target_node, node_labels = get_graph(dot_file)
    print("Processing DOT file:", file_name)

    # get_graph returns an empty list in place of the graph when there is
    # nothing to analyse.
    if not G:
        return

    output_path = "./results/repetition_pattern.txt"

    for path in find_all_paths_between_nodes(G, source_node, target_node):
        labels = [node_labels[node] for node in path]
        repeated_patterns = find_repeated_patterns(labels)
        if not repeated_patterns:
            continue

        # Create the results directory on first use so open() cannot fail
        # on a fresh checkout.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "a") as output_file:
            output_file.write("File: {}\n".format(file_name))
            for pattern in repeated_patterns:
                output_file.write("Repeated Pattern: {}\n".format(pattern))
                output_file.write("Path: {}\n".format(labels))

        # Echo the last pattern written together with the path's labels.
        print(pattern)
        print(labels)
        print("Repeated patterns written to repetition_pattern.txt")

        # Only the first path with a repeated pattern is reported.
        break



# Run the repetition analysis on every ``.dot`` file found directly inside
# the DAG directory.
directory = "./dags/"
dot_filenames = (entry for entry in os.listdir(directory) if entry.endswith(".dot"))
for filename in dot_filenames:
    dot_file = os.path.join(directory, filename)
    process_dot_file(dot_file)


Processing DOT file: airrflow_dag.dot
Processing DOT file: ampliseq_dag.dot
Processing DOT file: atacseq_dag.dot
Processing DOT file: bacass_dag.dot
Processing DOT file: bamtofastq_dag.dot
Processing DOT file: cageseq_dag.dot
Processing DOT file: callingcards_dag.dot
Processing DOT file: chipseq_dag.dot
Processing DOT file: circdna_dag.dot
Processing DOT file: circrna_dag.dot
Processing DOT file: clipseq_dag.dot
Processing DOT file: coproid_dag.dot
Processing DOT file: createpanelrefs_dag.dot
Processing DOT file: createtaxdb_dag.dot
Processing DOT file: crisprseq_dag.dot
Processing DOT file: cutandrun_dag.dot
Processing DOT file: ddamsproteomics_dag.dot
Processing DOT file: demultiplex_dag.dot
Processing DOT file: denovohybrid_dag.dot
Processing DOT file: detaxizer_dag.dot
Processing DOT file: diaproteomics_dag.dot
Processing DOT file: differentialabundance_dag.dot
Processing DOT file: dualrnaseq_dag.dot
Processing DOT file: eager_dag.dot
Processing DOT file: epitopeprediction_dag.dot
