In [1]:
%run parsing.ipynb
%run util.ipynb
%run LocalRepo.ipynb
%run structural_parsing.ipynb
import networkx as nx
from networkx.readwrite import json_graph
import json
import matplotlib.pyplot as plt
import numpy as np
import math
import time
from typing import List
from stop_words import get_stop_words
from nltk.corpus import stopwords
import re
from nltk import WordNetLemmatizer, FreqDist
import string
from random import shuffle
from gensim.corpora.dictionary import Dictionary
from gensim import similarities
from gensim.models import LdaModel, LdaMulticore
from multiprocessing import Pool, TimeoutError, Process, Manager, Lock
from functools import partial
import pdb
from biterm.cbtm import oBTM 
from sklearn.feature_extraction.text import CountVectorizer
from biterm.utility import vec_to_biterms, topic_summuary # helper functions
from scipy.spatial import distance

In [2]:
# Root directory for persisted coupling graphs (pickle / JSON / HTML); one sub-directory per repository name.
METRICS_SAVE_PATH = "../metrics/"
# Root directory for the plain-text graph export; one sub-directory per repository name.
EXPORT_SAVE_PATH = "../export/"

# https://networkx.github.io/documentation/latest/tutorial.html#edges
class WeightGraph:
    """Undirected coupling graph for one "view" (e.g. structural, linguistic) of a repository.

    Edges carry a numeric ``weight`` attribute (the coupling strength) and nodes may
    carry a numeric ``support`` attribute. Nodes are expected to be "/"-separated
    path strings: several methods derive parent/child relations from the path.
    """

    def __init__(self, view_name):
        """Create an empty graph; ``view_name`` is used as the file name when saving."""
        self.g = nx.Graph()
        self.view_name = view_name

    def add(self, a, b, delta):
        """Increase the weight of edge ``a <-> b`` by ``delta`` (creating the edge if needed).

        Self-loops (``a == b``) are ignored.
        """
        if a == b:
            return
        new_value = self.get(a, b) + delta
        self.g.add_edge(a, b, weight=new_value)

    def get(self, a, b):
        """Return the weight of edge ``a <-> b``, or 0 if there is no such edge."""
        if a in self.g and b in self.g.adj[a]:
            return self.g.adj[a][b]["weight"]
        return 0

    def add_support(self, node, delta):
        """Increase the ``support`` value of ``node`` by ``delta`` (adding the node if needed)."""
        if node not in self.g.nodes:
            self.g.add_node(node)
        self.g.nodes[node]["support"] = self.get_support(node) + delta

    def get_support(self, node):
        """Return the ``support`` value of ``node``, or 0 if the node or the value is missing."""
        return self.g.nodes.get(node, {}).get("support", 0)

    def add_and_support(self, a, b, delta):
        """Add ``delta`` to the edge ``a <-> b`` and to the support of both end points."""
        self.add(a, b, delta)
        self.add_support(a, delta)
        self.add_support(b, delta)

    def cutoff_edges(self, minimum_weight):
        """Remove every edge whose weight is strictly below ``minimum_weight``."""
        fedges = [(a, b) for a, b, info in self.g.edges.data() if info["weight"] < minimum_weight]
        self.g.remove_edges_from(fedges)

    def cleanup(self, min_component_size = 5):
        """Remove all nodes that belong to a connected component smaller than ``min_component_size``.

        The default of 5 matches the previously hard-coded threshold.
        """
        # Collect first, remove afterwards: the graph must not change while the
        # connected-components generator is still being consumed.
        doomed_nodes = [
            node
            for component in list(nx.connected_components(self.g))
            if len(component) < min_component_size
            for node in component
        ]
        self.g.remove_nodes_from(doomed_nodes)

    def propagate_down(self, layers = 1, weight_factor = 0.2):
        """copy the connections of each node (scaled by weight_factor) to its children"""
        children_dict = self._get_children_dict()
        child_having_nodes = list(children_dict.keys())
        # deepest paths (most "/" separators) first
        child_having_nodes.sort(key=lambda path: -path.count('/'))
        for iteration in range(layers):
            # Changes are collected first and applied afterwards, so that all reads of
            # one iteration see the graph as it was at the start of that iteration.
            changes_to_apply = []
            for node in log_progress(child_having_nodes, desc="Propagating down coupling information, iteration " + str(iteration + 1) + "/" + str(layers)):
                # connections into the node's own sub-tree are not propagated
                connections_and_weights = [(conn, self.get(node, conn) * weight_factor) for conn in self.g[node] if not conn.startswith(node + "/")]
                for child in children_dict[node]:
                    for conn, val in connections_and_weights:
                        for conn_child in children_dict.get(conn, []):
                            changes_to_apply.append((child, conn_child, val))
            for a, b, delta in log_progress(changes_to_apply, desc="Applying changes, iteration " + str(iteration + 1) + "/" + str(layers)):
                self.add(a, b, delta)

    def dilate(self, iterations = 1, weight_factor = 0.2):
        """Couple the neighbours of every node with each other.

        For each pair of neighbours of a node, the smaller of the two (scaled)
        connection weights is added to the edge between the two neighbours.
        """
        all_nodes = list(self.g.nodes)
        for iteration in range(iterations):
            changes_to_apply = []
            for node in log_progress(all_nodes, desc="Dilating coupling information, iteration " + str(iteration + 1) + "/" + str(iterations)):
                connections_and_weights = [(conn, self.get(node, conn) * weight_factor) for conn in self.g[node] if not conn.startswith(node + "/")]
                for (c1, w1), (c2, w2) in all_pairs(connections_and_weights):
                    changes_to_apply.append((c1, c2, min(w1, w2)))
            for a, b, delta in log_progress(changes_to_apply, desc="Applying changes, iteration " + str(iteration + 1) + "/" + str(iterations)):
                self.add(a, b, delta)

    def _get_children_dict(self):
        """Map every node that has children in the graph to the set of its direct children.

        A node's parent is its path without the last "/"-separated segment; parents
        that are not themselves nodes of the graph are ignored.
        """
        all_nodes = list(self.g.nodes)
        children = {node: set() for node in all_nodes}
        for node in all_nodes:
            if "/" not in node:
                continue
            parent = node.rsplit("/", 1)[0]
            if parent in children:
                children[parent].add(node)
        return {node: kids for node, kids in children.items() if kids}

    def save(self, repo_name):
        """Pickle the graph to ``pickle_path(repo_name, view_name)``, creating the directory if needed."""
        os.makedirs(METRICS_SAVE_PATH + repo_name, exist_ok=True)
        # NOTE(review): write_gpickle / read_gpickle were removed in networkx 3.0 - confirm the pinned version.
        nx.write_gpickle(self.g, WeightGraph.pickle_path(repo_name, self.view_name))

    def get_max_weight(self):
        """Return the largest edge weight, or 0 for a graph without edges."""
        return max((self.g[a][b]["weight"] for a, b in self.g.edges), default=0)

    @staticmethod
    def load(repo_name, name):
        """Load the graph previously stored by ``save`` for the given repository and view name."""
        wg = WeightGraph(name)
        # NOTE(review): unpickling executes arbitrary code - only load files written by this notebook.
        wg.g = nx.read_gpickle(WeightGraph.pickle_path(repo_name, name))
        return wg

    @staticmethod
    def pickle_path(repo_name, name):
        """Return the path of the pickle file for the given repository and view name."""
        # see https://networkx.github.io/documentation/stable/reference/readwrite/gpickle.html
        return METRICS_SAVE_PATH + repo_name + "/" + name + ".gpickle"

    def json_save(self, repo_name):
        """Write the graph in node-link JSON format next to the pickle file."""
        data = json_graph.node_link_data(self.g)
        # like save(): do not rely on the directory already existing
        os.makedirs(METRICS_SAVE_PATH + repo_name, exist_ok=True)
        with open(METRICS_SAVE_PATH + repo_name + "/" + self.view_name + ".json", 'w') as outfile:
            json.dump(data, outfile)

    def html_save(self, repo_name):
        """Write an HTML page that embeds the graph as JSON and loads ``html_app.js``."""
        data = json.dumps(json_graph.node_link_data(self.g))
        # NOTE(review): the script URL contains a hard-coded _xsrf token - confirm it is
        # still required; a session token should not live in the source.
        # NOTE(review): node names are embedded unescaped inside a <script> tag.
        content = '<html><body><script type="text/javascript">const graph = ' + data + ';</script><script src="/files/metrics/html_app.js?_xsrf=2%7Ce163cb61%7Cb9245804a283415ecb4c641f0cf1f882%7C1601372106"></script></body></html>'
        os.makedirs(METRICS_SAVE_PATH + repo_name, exist_ok=True)
        with open(METRICS_SAVE_PATH + repo_name + "/" + self.view_name + ".html", 'w') as outfile:
            outfile.write(content)

    def plaintext_save(self, repo_name):
        """Export the graph as two lines of text.

        Line 1: all node names joined by ";". Line 2: all edges joined by ";",
        each written as ``<index a>,<index b>,<weight>`` with indices into line 1.
        """
        node_list = list(self.g.nodes)
        node2index = dict(zip(node_list, range(len(node_list))))
        content = ";".join(node_list) + "\n" + ";".join([str(node2index[a]) + "," + str(node2index[b]) + "," + str(d["weight"]) for a, b, d in self.g.edges(data=True)])
        os.makedirs(EXPORT_SAVE_PATH + repo_name, exist_ok=True)
        with open(EXPORT_SAVE_PATH + repo_name + "/" + self.view_name + ".graph.txt", "w") as f:
            f.write(content)

    def print_most_linked_nodes(self, amount = 10):
        """Print the ``amount`` edges with the highest weight."""
        print("Most linked nodes:")
        debug_list = sorted(list(self.g.edges.data()), key = lambda e: -e[2]["weight"])
        print([str(info["weight"]) + ": " + a + " <> " + b for a, b, info in debug_list[0:amount]])

    def print_statistics(self):
        """Print node/edge counts, connected component sizes and weight/support summaries."""
        # https://networkx.github.io/documentation/latest/tutorial.html#analyzing-graphs
        node_count = len(self.g.nodes)
        edge_count = len(self.g.edges)
        cc = sorted(list(nx.connected_components(self.g)), key= lambda e: -len(e))
        print("WeightGraph statistics: "
              + str(node_count) + " nodes, "
              + str(edge_count) + " edges, "
              + str(len(cc)) + " connected component(s), with sizes: ["
              + ", ".join([str(len(c)) for c in cc[0:20]])
              + "]")
        edge_weights = [self.g[e[0]][e[1]]["weight"] for e in self.g.edges]
        edge_weights.sort()
        node_supports = [self.get_support(n) for n in self.g.nodes]
        node_supports.sort()
        print("Edge weights:", edge_weights[0:5], "...", edge_weights[-5:], ", mean:", np.array(edge_weights).mean())
        print("Node support values:", node_supports[0:5], "...", node_supports[-5:], ", mean:", np.array(node_supports).mean())

    def show_weight_histogram(self):
        """Show histograms of edge weights, summed per-node edge weights and node supports."""
        edge_weights = [self.g[e[0]][e[1]]["weight"] for e in self.g.edges]
        show_histogram(edge_weights, 'Histogram of edge weights in coupling graph', 'Coupling Strength', 'Amount', 'b')

        node_weights = [sum([self.g[n][n2]["weight"] for n2 in self.g.adj[n]]) for n in self.g.nodes]
        show_histogram(node_weights, 'Histogram of node weights', 'Coupling Strength', 'Amount', 'g')

        node_supports = [self.get_support(n) for n in self.g.nodes]
        show_histogram(node_supports, 'Histogram of node support values', 'Support', 'Amount', 'g')

    def visualize(self, use_spring = False, with_labels = True):
        """Draw the graph with edge opacity proportional to edge weight.

        With ``use_spring`` the positions come from a Kamada-Kawai layout based on
        the per-edge ``distance`` attribute; otherwise networkx picks its default
        layout. As a side effect, every edge gets a ``distance`` attribute.
        """
        # https://networkx.github.io/documentation/latest/reference/generated/networkx.drawing.nx_pylab.draw_networkx.html
        for a, b in self.g.edges:
            self.g[a][b]["distance"] = 1. - self.g[a][b]["weight"] + 0.000001  # the value must not be exactly zero

        edge_weights = [self.g[a][b]["weight"] for a, b in self.g.edges]
        if edge_weights:
            mean_weight = np.array(edge_weights).mean()
            # full opacity is reached at twice the mean weight (or the maximum, if smaller)
            target_max_weight = min(max(edge_weights), mean_weight * 2)
        else:
            target_max_weight = 0  # graph without edges: nothing to scale

        plt.figure(figsize=(8, 8))
        VIZ_POW = 1
        # guard against a division by zero when there are no edges or all weights are zero
        max_w_fact = (1. / target_max_weight) ** VIZ_POW if target_max_weight > 0 else 0.

        layout = nx.drawing.layout.kamada_kawai_layout(self.g, weight="distance") if use_spring else None

        # RGBA black; the alpha channel is clamped to the valid range [0, 1]
        edge_colors = [(0., 0., 0., max(0., min(1., (w ** VIZ_POW) * max_w_fact))) for w in edge_weights]
        nx.draw(self.g, pos=layout, node_size=50, edge_color=edge_colors, node_color=[(0.121, 0.469, 0.703, 0.2)], with_labels=with_labels)

        plt.show()
        

In [3]:
# Upper bound used by get_commit_diff: commits whose diff (or whose resulting list of
# changed paths) has more entries than this are ignored.
MAX_COMMIT_FILES = 50
# NOTE(review): no live call to `timer` is visible in this part of the notebook - confirm it is still needed.
from timeit import default_timer as timer

# needs to be separate so that multiprocessing lib can find it
def get_commit_diff(commit_hash, repo):
    """Return the paths changed by a commit, at sub-file granularity where possible.

    Only commits with exactly one parent are analysed. For files with the
    repository's type extension both versions are parsed and compared so that
    individual members can be reported; all other files are reported by path.

    Returns:
        A list of changed paths, or None if the commit is skipped: it is not a
        single-parent commit, or it has at most one or more than
        MAX_COMMIT_FILES diff entries / resulting paths.
    """

    def walk_tree_cursor(cursor, prefix, content_bytes, node_handler):
        """Recursively visit the named nodes below ``cursor``.

        ``node_handler(logic_path, node)`` is called for every class, interface,
        enum, field, method and constructor declaration, where ``logic_path``
        is ``prefix`` extended by the declared name.
        """
        if not cursor.node.is_named:
            return

        def node_text(node):
            return decode(content_bytes[node.start_byte:node.end_byte])

        # cursor.current_field_name() is the role that this node has in its parent
        tree_node_names = []  # TODO keep in sync with structural and linguistic view as well as RepoFile class
        node_type = cursor.node.type
        if node_type in ("class_declaration", "interface_declaration", "enum_declaration", "method_declaration"):
            tree_node_names.append(node_text(cursor.node.child_by_field_name("name")))
        elif node_type == "field_declaration":
            # one field declaration may declare several variables
            declarators = [child for child in cursor.node.children if child.type == "variable_declarator"]
            tree_node_names += [node_text(d.child_by_field_name("name")) for d in declarators]
        elif node_type == "constructor_declaration":
            tree_node_names.append("constructor")

        for tree_node_name in tree_node_names:
            node_handler(prefix + "/" + tree_node_name, cursor.node)
        if len(tree_node_names) > 0:
            # descendants are registered below the first declared name
            prefix = prefix + "/" + tree_node_names[0]

        if cursor.goto_first_child():
            walk_tree_cursor(cursor, prefix, content_bytes, node_handler)
            while cursor.goto_next_sibling():
                walk_tree_cursor(cursor, prefix, content_bytes, node_handler)
            cursor.goto_parent()

    def walk_tree(tree, content_bytes, base_path) -> RepoTree:
        """Collect all declarations of a parsed file into a fresh RepoTree.

        Returns None (after printing a message) if walking the tree fails.
        """
        try:
            found_nodes = RepoTree(None, "")

            def handle(logic_path, ts_node):
                found_nodes.register(logic_path, ts_node)

            walk_tree_cursor(tree.walk(), base_path, content_bytes, handle)
            return found_nodes
        except Exception as e:
            # Best effort: the caller falls back to reporting the whole file as changed.
            # No debugger breakpoint here - this function is meant to run in
            # multiprocessing workers, where an interactive prompt cannot be served.
            print("Failed to parse file:", base_path, "Error:", e)
            return None

    error_query = JA_LANGUAGE.query("(ERROR) @err")

    def _has_error(tree) -> bool:
        """Report whether the parse tree contains more than one ERROR capture."""
        errors = error_query.captures(tree.root_node)
        # NOTE(review): a single ERROR capture is tolerated - confirm "> 1" (and not "> 0") is intended.
        return len(errors) > 1

    def blob_diff(diff) -> List[str]:
        """Return the changed paths for one diff entry (an empty list if the file is skipped)."""
        if diff.a_blob is None:
            return [diff.b_path]  # the file only exists on the b side of the diff
        elif diff.b_blob is None:
            return [diff.a_path]  # the file only exists on the a side of the diff
        path = diff.a_path
        if not path.endswith("." + repo.type_extension()):
            return [path]
        a_content = diff.a_blob.data_stream.read()
        if should_skip_file(a_content):
            return []
        b_content = diff.b_blob.data_stream.read()
        if should_skip_file(b_content):
            return []
        a_tree = java_parser.parse(a_content)
        b_tree = java_parser.parse(b_content)
        if _has_error(a_tree) or _has_error(b_tree):
            return [path]  # only the file-level change is known, no more details available
        a_repo_tree = walk_tree(a_tree, a_content, path)
        if a_repo_tree is None:
            return [path]
        b_repo_tree = walk_tree(b_tree, b_content, path)
        if b_repo_tree is None:
            return [path]
        return a_repo_tree.calculate_diff_to(b_repo_tree, a_content, b_content)

    c1 = repo.get_commit(commit_hash)
    if len(c1.parents) != 1:
        # Root commits and merge commits are skipped.
        # TODO how to do sub-file diffs for merge commits? One idea: intersect the
        # changed paths of the diffs against each of the two parents.
        return None

    diff = c1.diff(c1.parents[0])
    if len(diff) > MAX_COMMIT_FILES or len(diff) <= 1:  # checked before parsing, for performance
        return None
    diffs = [result for d in diff for result in blob_diff(d)]
    if len(diffs) > MAX_COMMIT_FILES or len(diffs) <= 1:
        return None
    return diffs

In [4]:
class MetricsGeneration:
    """Computes the coupling graphs (evolutionary, structural, linguistic) of one repository.

    For every view ``x`` there is a ``calculate_x_connections`` method that builds
    the graph and a ``post_x`` hook that post-processes it.
    """
    # ascii art: http://patorjk.com/software/taag/#p=display&f=Soft&t=STRUCTURAL%0A.%0ALINGUISTIC%0A.%0AEVOLUTIONARY%0A.%0ADYNAMIC
    def __init__(self, repo):
        self.repo = repo

    def calculate_evolutionary_connections(self) -> WeightGraph:
        r"""Couple nodes that were changed together in the same commit.

        Each analysed commit contributes ``2 / (number of changed paths)`` to every
        pair of changed nodes known to the repo tree, and 1 to the support of each
        such node. Edges below a weight of 0.005 are dropped at the end.

,------.,--.   ,--.,-----. ,--.   ,--. ,--.,--------.,--. ,-----. ,--.  ,--.  ,---.  ,------.,--.   ,--. 
|  .---' \  `.'  /'  .-.  '|  |   |  | |  |'--.  .--'|  |'  .-.  '|  ,'.|  | /  O  \ |  .--. '\  `.'  /  
|  `--,   \     / |  | |  ||  |   |  | |  |   |  |   |  ||  | |  ||  |' '  ||  .-.  ||  '--'.' '.    /   
|  `---.   \   /  '  '-'  '|  '--.'  '-'  '   |  |   |  |'  '-'  '|  | `   ||  | |  ||  |\  \    |  |    
`------'    `-'    `-----' `-----' `-----'    `--'   `--' `-----' `--'  `--'`--' `--'`--' '--'   `--'    
        """
        # MAX_COMMIT_FILES = 50  # Ignore too large commits. (constant moved)

        coupling_graph = WeightGraph("evolutionary")

        def processDiffs(diffs):
            # Commits that were skipped yield no diff list; an empty list would
            # otherwise cause a division by zero below.
            if not diffs:
                return
            # the score is based on the unfiltered amount of changed paths
            score = 2 / len(diffs)
            repo_tree = self.repo.get_tree()
            diffs = [d for d in diffs if repo_tree.has_node(d)]
            for f1, f2 in all_pairs(diffs):
                coupling_graph.add(f1, f2, score)
            for node in diffs:
                coupling_graph.add_support(node, 1)

        print("Discovering commits...")
        all_commits = list(self.repo.get_all_commits())
        print("Done!")
        # Fetch the tree once up front (presumably so that it is already built before
        # the parallel analysis starts - TODO confirm). This used the global `r` before,
        # which is not defined in this notebook section.
        self.repo.get_tree()
        print("Commits to analyze: " + str(len(all_commits)))

        map_parallel(
            all_commits,
            partial(get_commit_diff, repo=self.repo),
            processDiffs,
            "Analyzing commits",
        )

        coupling_graph.cutoff_edges(0.005)
        return coupling_graph

    def post_evolutionary(self, coupling_graph: WeightGraph):
        """Post-processing hook for the evolutionary graph (currently a no-op)."""
        pass


    def calculate_structural_connections(self) -> WeightGraph:
        r"""Couple nodes via imports, inheritance and member content, using StructuralContext.

 ,---. ,--------.,------. ,--. ,--. ,-----.,--------.,--. ,--.,------.   ,---.  ,--.                     
'   .-''--.  .--'|  .--. '|  | |  |'  .--./'--.  .--'|  | |  ||  .--. ' /  O  \ |  |                     
`.  `-.   |  |   |  '--'.'|  | |  ||  |       |  |   |  | |  ||  '--'.'|  .-.  ||  |                     
.-'    |  |  |   |  |\  \ '  '-'  ''  '--'\   |  |   '  '-'  '|  |\  \ |  | |  ||  '--.                  
`-----'   `--'   `--' '--' `-----'  `-----'   `--'    `-----' `--' '--'`--' `--'`-----'   
        """
        coupling_graph = WeightGraph("structural")

        context = StructuralContext(self.repo)
        context.couple_files_by_import(coupling_graph)
        context.couple_by_ineritance(coupling_graph)
        context.couple_members_by_content(coupling_graph)
        flush_unresolvable_vars()

        return coupling_graph

    def post_structural(self, coupling_graph: WeightGraph):
        """Post-process the structural graph: propagate couplings down to child nodes (2 layers, factor 0.5)."""
        coupling_graph.propagate_down(2, 0.5)
        # coupling_graph.dilate(1, 0.2)


    def calculate_linguistic_connections(self) -> WeightGraph:
        r"""Couple class members whose text is about similar topics.

        The words of every field, method and constructor (plus its class name) form
        one document. A biterm topic model is trained on all documents, and every
        pair of documents is coupled by the cosine similarity of their topic
        vectors raised to DOCUMENT_SIMILARITY_EXP. Edges below
        DOCUMENT_SIMILARITY_CUTOFF are dropped; node support is the document's word count.

,--.   ,--.,--.  ,--. ,----.   ,--. ,--.,--. ,---. ,--------.,--. ,-----.                                
|  |   |  ||  ,'.|  |'  .-./   |  | |  ||  |'   .-''--.  .--'|  |'  .--./                                
|  |   |  ||  |' '  ||  | .---.|  | |  ||  |`.  `-.   |  |   |  ||  |                                    
|  '--.|  ||  | `   |'  '--'  |'  '-'  '|  |.-'    |  |  |   |  |'  '--'\                                
`-----'`--'`--'  `--' `------'  `-----' `--'`-----'   `--'   `--' `-----'              
        """

        #
        # TODO: https://pypi.org/project/biterm/
        #

        # constants
        MIN_WORD_LENGTH = 2
        MAX_WORD_LENGTH = 50
        MIN_WORD_USAGES = 2  # any word used less often will be ignored
        MAX_DF = 0.95  # any terms that appear in a bigger proportion of the documents than this will be ignored (corpus-specific stop-words)
        MAX_FEATURES = 1500  # the size of the LDA thesaurus - amount of words to consider for topic learning
        TOPIC_COUNT = 40 # 40  # 100 according to paper
        BTM_ITERATIONS = 100  # 100 according to docs?
        # LDA_PASSES = 200  # how often to go through the corpus
        LDA_RANDOM_SEED = 42  # NOTE(review): currently not used by any live code in this method
        DOCUMENT_SIMILARITY_EXP = 8 # higher = lower equality values, lower = equality values are all closer to 1
        DOCUMENT_SIMILARITY_CUTOFF = 0.1  # in range [0 .. 1]: everything below this is dropped


        # keywords from python, TS and Java
        custom_stop_words = ["abstract", "and", "any", "as", "assert", "async", "await", "boolean", "break", "byte", "case", "catch", "char", "class", "const", "constructor", "continue", "debugger", "declare", "def", "default", "del", "delete", "do", "double", "elif", "else", "enum", "except", "export", "extends", "false", "False", "final", "finally", "float", "for", "from", "function", "get", "global", "goto", "if", "implements", "import", "in", "instanceof", "int", "interface", "is", "lambda", "let", "long", "module", "new", "None", "nonlocal", "not", "null", "number", "of", "or", "package", "pass", "private", "protected", "public", "raise", "require", "return", "set", "short", "static", "strictfp", "string", "super", "switch", "symbol", "synchronized", "this", "throw", "throws", "transient", "true", "True", "try", "type", "typeof", "var", "void", "volatile", "while", "with", "yield"]
        stop_words = set(list(get_stop_words('en')) + list(stopwords.words('english')) + custom_stop_words)
        # splits on non-word characters / underscores and at camelCase boundaries
        splitter = r"(?:[\W_]+|(?<![A-Z])(?=[A-Z])|(?<!^)(?=[A-Z][a-z]))"
        lemma = WordNetLemmatizer()
        printable_characters = set(string.printable)

        def _normalize_word(word):
            """Lower-case the word and lemmatize it, first as a noun and then as a verb."""
            return lemma.lemmatize(lemma.lemmatize(word.lower(), pos = "n"), pos = "v")

        def _get_text(content_string):
            """Turn raw text into a list of normalized words without stop words."""
            # https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
            # https://agailloty.rbind.io/en/project/nlp_clean-text/
            content_string = ''.join(c for c in content_string if c in printable_characters)
            words = re.split(splitter, content_string)
            words = [word for word in words if word not in stop_words]
            words = [_normalize_word(word) for word in words]
            words = [word for word in words if len(word) >= MIN_WORD_LENGTH and len(word) <= MAX_WORD_LENGTH]
            # filter again: normalization may have produced new stop words
            words = [word for word in words if word not in stop_words]
            return words

        def array_similarity(a, b):
            """given two arrays of numbers, how equal are they?"""
            return math.pow(1. - distance.cosine(a, b), DOCUMENT_SIMILARITY_EXP)  # TODO check for alternative distance metrics?

        # see https://docs.python.org/2/library/collections.html#collections.Counter
        freq_dist = FreqDist()

        files = self.repo.get_all_interesting_files()

        node_words = []  # List of (RepoTree-Node,wordList) - tuples
        for file in log_progress(files, desc="Extracting language corpus"):
            node = file.get_repo_tree_node()  # TODO unify with structural view code
            if node is None:
                continue  # TODO why / when does this happen?

            # TODO keep in sync with evolutionary and structural view as well as RepoFile class
            classes = node.get_descendants_of_type("class") + node.get_descendants_of_type("interface") + node.get_descendants_of_type("enum")
            for class_node in classes:
                fields = class_node.get_children_of_type("field")
                methods = class_node.get_children_of_type("method") + class_node.get_children_of_type("constructor")

                for member in fields + methods:
                    text = member.get_comment_and_own_text(file)
                    words = list(_get_text(class_node.name + " " + text))
                    for word in words:
                        freq_dist[word] += 1
                    # TODO: handle the empty list?!?
                    node_words.append((member, words))

        # Iterate over a snapshot of the keys: deleting from the mapping while
        # iterating over it directly raises a RuntimeError.
        for word in list(freq_dist):
            if freq_dist[word] < MIN_WORD_USAGES:
                del freq_dist[word]

        print("Amount of documents: " + str(len(node_words)))
        print("Total Amount of words: " + str(sum([len(b) for a, b in node_words])))
        print("Vocab size: " + str(len(freq_dist)))

        # https://pypi.org/project/biterm/
        print("Vectorizing words...")
        texts = [" ".join(words) for (node, words) in node_words]
        vec = CountVectorizer(max_df=MAX_DF, max_features=MAX_FEATURES, stop_words=None)
        X = vec.fit_transform(texts).toarray()
        # get_feature_names() was replaced by get_feature_names_out() in newer
        # scikit-learn releases; support both.
        if hasattr(vec, "get_feature_names_out"):
            feature_names = vec.get_feature_names_out()
        else:
            feature_names = vec.get_feature_names()
        vocab = np.array(feature_names)

        print("Instantiating BTM...")
        btm = oBTM(num_topics=TOPIC_COUNT, V=vocab)
        biterms = vec_to_biterms(X)
        print("Total Amount of biterms: " + str(sum([len(x) for x in biterms])))

        print("Training model...")
        btm.fit(biterms, iterations=BTM_ITERATIONS)
        topics = btm.transform(biterms)

        print("Generating topic output...")
        words_to_show_per_topic = 10
        topic_summuary(btm.phi_wz.T, X, vocab, words_to_show_per_topic)

        print("Generating coupling graph...")
        coupling_graph = WeightGraph("linguistic")
        document_count = len(node_words)
        paths = [member.get_path() for member, _ in node_words]
        # TODO filter out those earlier (happen when a btm-document is empty)
        has_topics = [not np.isnan(topics[i][0]) for i in range(document_count)]
        for n1 in log_progress(range(document_count), desc="Generating coupling graph"):
            if not has_topics[n1]:
                continue
            # every unordered pair is visited exactly once
            for n2 in range(n1 + 1, document_count):
                if not has_topics[n2]:
                    continue
                similarity = array_similarity(topics[n1], topics[n2])
                coupling_graph.add(paths[n1], paths[n2], similarity)

        for path, (node, words) in log_progress(list(zip(paths, node_words)), desc="Generating coupling graph step 2"):
            coupling_graph.add_support(path, len(words))

        print("Trimming graph...")
        coupling_graph.cutoff_edges(DOCUMENT_SIMILARITY_CUTOFF)

        return coupling_graph

    def post_linguistic(self, coupling_graph: WeightGraph):
        """Post-processing hook for the linguistic graph (currently a no-op)."""
        pass
            
    
    # -------------------------------------------------------------------------------------------

In [5]:
class MetricManager:
    """Entry point for obtaining coupling graphs, backed by the pickle files on disk."""

    @staticmethod
    def clear(repo, name):
        """Delete the stored graph of the given view for this repository, if there is one."""
        if not MetricManager._data_present(repo.name, name):
            return
        os.remove(WeightGraph.pickle_path(repo.name, name))

    @staticmethod
    def get(repo, name) -> WeightGraph:
        """Return the post-processed graph of the given view.

        A stored graph is loaded if present. Otherwise the graph is calculated,
        cleaned up and saved. In both cases the view's ``post_<name>`` hook is
        applied afterwards, so its changes are never part of the stored file.
        """
        already_stored = MetricManager._data_present(repo.name, name)
        if already_stored:
            print("Using precalculated " + name + " values")
            result = WeightGraph.load(repo.name, name)
        else:
            print("No precalculated " + name + " values found, starting calculations...")
            calculate = getattr(MetricsGeneration(repo), "calculate_" + name + "_connections")
            result = calculate()
            result.cleanup()
            print("Calculated " + name + " values, saving them now...")
            result.save(repo.name)

        post_process = getattr(MetricsGeneration(repo), "post_" + name)
        post_process(result)
        return result

    @staticmethod
    def _data_present(repo_name, name):
        """Tell whether a pickle file exists for the given repository and view name."""
        stored_file = WeightGraph.pickle_path(repo_name, name)
        return os.path.isfile(stored_file)