In [13]:
%run parsing.ipynb
from typing import List, Optional

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np

In [11]:
# https://networkx.github.io/documentation/latest/tutorial.html#edges
class WeightGraph:
    """Undirected graph whose edges carry an accumulated ``weight`` attribute.

    Thin wrapper around a ``networkx.Graph`` (exposed as ``self.g``) that adds
    up weight deltas per node pair and offers a few reporting helpers.
    """

    def __init__(self):
        self.g = nx.Graph()

    def add(self, a, b, delta):
        """Increase the weight of the edge between ``a`` and ``b`` by ``delta``.

        A pair without an edge is treated as having weight 0, so the first
        call for a pair creates the edge with weight ``delta``.
        """
        # add_edge creates missing endpoints and updates the attribute of an
        # already existing edge, so no separate add_node calls are required.
        self.g.add_edge(a, b, weight=self.get(a, b) + delta)

    def get(self, a, b):
        """Return the weight of the edge between ``a`` and ``b``.

        Returns 0 when there is no such edge, including the case where one or
        both nodes are not part of the graph at all.
        """
        if self.g.has_edge(a, b):
            return self.g[a][b]["weight"]
        return 0

    def print_statistics(self):
        """Print node count, edge count and the sizes of all connected components."""
        # https://networkx.github.io/documentation/latest/tutorial.html#analyzing-graphs
        components = list(nx.connected_components(self.g))
        component_sizes = ", ".join(str(len(c)) for c in components)
        print(f"WeightGraph statistics: {len(self.g.nodes)} nodes, "
              f"{len(self.g.edges)} edges, "
              f"{len(components)} connected component(s), with sizes: [{component_sizes}]")

    @staticmethod
    def _show_histogram(values, color, title):
        """Show one histogram of ``values`` with a dashed line at their mean."""
        # https://matplotlib.org/3.3.1/api/_as_gen/matplotlib.pyplot.hist.html
        if not values:
            # An empty sample has no mean (numpy would warn and produce NaN),
            # so report it instead of drawing an empty figure.
            print("Nothing to plot for: " + title)
            return
        plt.hist(values, "auto", facecolor=color, alpha=0.75)
        plt.axvline(np.mean(values), color='k', linestyle='dashed', linewidth=1)
        plt.xlabel('Coupling Strength')
        plt.ylabel('Amount')
        plt.title(title)
        plt.grid(True)
        plt.show()

    def show_weight_histogram(self):
        """Show two histograms: one of edge weights, one of per-node weight sums.

        The per-node value is the sum of the weights of all edges incident to
        that node.
        """
        edge_weights = [self.g[u][v]["weight"] for u, v in self.g.edges]
        self._show_histogram(edge_weights, 'b',
                             'Histogram of edge weights in coupling graph')

        node_weights = [
            sum(attrs["weight"] for attrs in self.g.adj[n].values())
            for n in self.g.nodes
        ]
        self._show_histogram(node_weights, 'g',
                             'Histogram of node weights in coupling graph')

In [12]:
class RepoFile:
    """A single file of a repository with lazily loaded content and syntax tree.

    ``repo`` must provide ``get_file_object_content(file_obj)`` and ``file_obj``
    must expose a ``path`` attribute. Content and tree are fetched on first use
    and cached on the instance afterwards.
    """

    def __init__(self, repo, file_obj):
        self.repo = repo
        self.file_obj = file_obj
        self.content = None  # filled by get_content()
        self.tree = None     # filled by get_tree()

    def get_path(self):
        """Return the path of the underlying file object."""
        return self.file_obj.path

    def get_content(self):
        """Return the file content, loading it from the repo on first access."""
        if self.content is None:
            self.content = self.repo.get_file_object_content(self.file_obj)
        return self.content

    def get_tree(self):
        """Return the parsed syntax tree, parsing the content on first access."""
        if self.tree is None:
            # java_parser is not defined in this notebook; presumably it is
            # provided by parsing.ipynb (loaded via %run) - confirm there.
            self.tree = java_parser.parse(self.get_content())
        return self.tree

    def node_text(self, node):
        """Return the source text covered by ``node`` as a ``str``.

        The content is sliced by the node's byte offsets and decoded as UTF-8.
        It is obtained through ``get_content()`` so this also works when
        neither the content nor the tree has been requested before.
        """
        return self.get_content()[node.start_byte:node.end_byte].decode("utf-8")

    def walk_tree(self, node_handler):
        """Call ``node_handler(logic_path, node)`` for every named AST node.

        The logic path starts out as the file path; see ``walk_tree_cursor``
        for how it is extended while descending.
        """
        self.walk_tree_cursor(self.get_tree().walk(), self.get_path(), node_handler)

    def walk_tree_cursor(self, cursor, prefix, node_handler):
        """Visit the subtree under ``cursor`` in pre-order, skipping unnamed nodes.

        ``node_handler(prefix, node)`` is invoked for each named node. For
        class, field and method declarations the declared name is appended to
        the prefix (``prefix + "/" + name``) for everything below that node;
        the declaration node itself is still reported with the outer prefix.
        Unnamed nodes are skipped together with their subtree. On return the
        cursor is positioned on the node it started at.
        """
        node = cursor.node
        if not node.is_named:
            return
        node_handler(prefix, node)

        # Locate the identifier that names this declaration, if it is one.
        # child_by_field_name may return None (e.g. a declaration without a
        # name child), in which case the prefix is simply left unchanged.
        name_node = None
        if node.type in ("class_declaration", "method_declaration"):
            name_node = node.child_by_field_name("name")
        elif node.type == "field_declaration":
            declarator = node.child_by_field_name("declarator")
            if declarator is not None:
                name_node = declarator.child_by_field_name("name")

        if name_node is not None:
            prefix = prefix + "/" + self.node_text(name_node)

        if cursor.goto_first_child():
            self.walk_tree_cursor(cursor, prefix, node_handler)
            while cursor.goto_next_sibling():
                self.walk_tree_cursor(cursor, prefix, node_handler)
            # restore the cursor so the caller can continue with its siblings
            cursor.goto_parent()


In [5]:
def calculate_structural_connections(repo) -> WeightGraph:
    """Build a coupling graph of the repo's files based on their import statements.

    Every file that declares a class is registered under
    ``<package>.<first class name>``. Each import statement that matches such a
    registered name adds 1 to the edge weight between the importing file and
    the file that declares the class. Graph nodes are file paths. Statistics
    and weight histograms of the resulting graph are printed/shown before it
    is returned.

    Args:
        repo: object providing ``get_file_objects()``; its items are wrapped
            in ``RepoFile``.

    Returns:
        The populated ``WeightGraph``.
    """
    coupling_graph = WeightGraph()

    # A package name is either a single identifier or a dotted scoped identifier.
    package_query_1 = JA_LANGUAGE.query("(package_declaration (identifier) @decl)")
    package_query_2 = JA_LANGUAGE.query("(package_declaration (scoped_identifier) @decl)")
    import_query = JA_LANGUAGE.query("(import_declaration (scoped_identifier) @decl)")
    class_query = JA_LANGUAGE.query("(class_declaration name: (identifier) @decl)")

    # Imports from these JDK package roots are skipped up front. The trailing
    # dot matters: a bare "java" prefix would also drop project packages such
    # as "javaslang.*" or "javaparser.*".
    jdk_import_prefixes = ("java.", "javax.")

    def _get_all_files() -> List[RepoFile]:
        return [RepoFile(repo, o) for o in repo.get_file_objects()]

    def _get_package(file) -> List[str]:
        """Return the file's package as a list of name segments ([] if none)."""
        root = file.get_tree().root_node
        # captures are indexed below as capture[0] == node
        packages = package_query_1.captures(root) + package_query_2.captures(root)
        assert len(packages) <= 1
        if not packages:
            return []
        return file.node_text(packages[0][0]).split(".")

    def _get_imports(file) -> List[str]:
        """Return the dotted names of all non-JDK imports of the file."""
        imported_names = (
            file.node_text(capture[0])
            for capture in import_query.captures(file.get_tree().root_node)
        )
        return [name for name in imported_names
                if not name.startswith(jdk_import_prefixes)]

    def _get_main_class_name(file) -> Optional[str]:
        """Return the name of the first captured class declaration, or None."""
        classes = class_query.captures(file.get_tree().root_node)
        if classes:
            return file.node_text(classes[0][0])
        return None

    def _mark_connected(a, b):
        coupling_graph.add(a, b, 1)

    #######

    files = _get_all_files()

    # Pass 1: map fully qualified class names to the path of the declaring file.
    full_class_name_to_id = {}
    for file in files:
        class_name = _get_main_class_name(file)
        if class_name is not None:
            full_class_name = ".".join(_get_package(file) + [class_name])
            full_class_name_to_id[full_class_name] = file.get_path()

    # Pass 2: connect each file to the files whose classes it imports.
    # Imports that do not match a registered class are ignored.
    for file in files:
        for imported_name in _get_imports(file):
            if imported_name in full_class_name_to_id:
                _mark_connected(file.get_path(), full_class_name_to_id[imported_name])

    coupling_graph.print_statistics()
    coupling_graph.show_weight_histogram()
    return coupling_graph