In [13]:
%run parsing.ipynb
%run util.ipynb
import json
import math
import os
import pdb
import re
import string
import time
from functools import partial
from multiprocessing import Pool, TimeoutError, Process, Manager, Lock
from typing import List

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from biterm.btm import oBTM 
from biterm.utility import vec_to_biterms, topic_summuary # helper functions
from gensim import similarities
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel, LdaMulticore
from networkx.readwrite import json_graph
from nltk import WordNetLemmatizer, FreqDist
from nltk.corpus import stopwords
from scipy.spatial import distance
from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words

In [11]:
METRICS_SAVE_PATH = "../metrics/"
EXPORT_SAVE_PATH = "../export/"

# https://networkx.github.io/documentation/latest/tutorial.html#edges
class WeightGraph:
    """Undirected graph whose edges carry a numeric "weight" attribute.

    Wraps a networkx.Graph (self.g) and adds accumulation of edge weights,
    pruning, persistence in several formats, and a few plots.
    view_name is used as the file name stem by all save methods.
    """

    def __init__(self, view_name):
        """Create an empty graph labelled view_name."""
        self.g = nx.Graph()
        self.view_name = view_name

    def add(self, a, b, delta):
        """Increase the weight of edge a-b by delta, creating nodes and edge if needed."""
        # add_edge creates missing endpoints and only overwrites the "weight"
        # attribute of an already existing edge
        self.g.add_edge(a, b, weight=self.get(a, b) + delta)

    def add_sym(self, a, b, delta):
        """Same as add(): the underlying graph is undirected, so one call covers both directions."""
        self.add(a, b, delta)

    def get(self, a, b):
        """Return the weight of edge a-b, or 0 if there is no such edge."""
        if a in self.g and b in self.g.adj[a]:
            return self.g.adj[a][b]["weight"]
        return 0

    def cutoff_edges(self, minimum_weight):
        """Remove every edge whose weight is strictly below minimum_weight (nodes are kept)."""
        too_light = [(a, b) for a, b, info in self.g.edges(data=True) if info["weight"] < minimum_weight]
        self.g.remove_edges_from(too_light)

    def cleanup(self, min_component_size=5):
        """Remove all connected components with fewer than min_component_size nodes."""
        # materialize the components first: the graph is modified while we go through them
        for component in list(nx.connected_components(self.g)):
            if len(component) < min_component_size:
                self.g.remove_nodes_from(list(component))

    def save(self, repo_name):
        """Pickle the graph to pickle_path(repo_name, view_name), creating the directory if needed."""
        WeightGraph._metrics_dir(repo_name)
        # NOTE(review): write_gpickle / read_gpickle are not available in networkx >= 3.0 - confirm the pinned version
        nx.write_gpickle(self.g, WeightGraph.pickle_path(repo_name, self.view_name))

    def get_max_weight(self):
        """Return the largest edge weight, or 0 for a graph without edges."""
        return max((self.g[a][b]["weight"] for a, b in self.g.edges), default=0)

    @staticmethod
    def load(repo_name, name):
        """Load the graph previously stored by save() for this repo and view name."""
        wg = WeightGraph(name)
        # unpickling executes code from the file: only load files written by this tool
        wg.g = nx.read_gpickle(WeightGraph.pickle_path(repo_name, name))
        return wg

    @staticmethod
    def pickle_path(repo_name, name):
        """Return the path of the pickle file for the given repo and view name."""
        # see https://networkx.github.io/documentation/stable/reference/readwrite/gpickle.html
        return METRICS_SAVE_PATH + repo_name + "/" + name + ".gpickle"

    @staticmethod
    def _metrics_dir(repo_name):
        """Return the metrics directory of repo_name, creating it if it does not exist yet."""
        directory = METRICS_SAVE_PATH + repo_name
        os.makedirs(directory, exist_ok=True)
        return directory

    def _edge_weights(self):
        """Return the weights of all edges as a list."""
        return [self.g[a][b]["weight"] for a, b in self.g.edges]

    def json_save(self, repo_name):
        """Write the graph in node-link JSON format to <metrics dir>/<view_name>.json."""
        data = json_graph.node_link_data(self.g)
        with open(WeightGraph._metrics_dir(repo_name) + "/" + self.view_name + ".json", 'w') as outfile:
            json.dump(data, outfile)

    def html_save(self, repo_name):
        """Write an HTML page embedding the node-link JSON to <metrics dir>/<view_name>.html."""
        data = json.dumps(json_graph.node_link_data(self.g))
        # NOTE(review): the script URL carries a hard-coded _xsrf token; it should probably be removed - confirm it is not needed
        content = '<html><body><script type="text/javascript">const graph = ' + data + ';</script><script src="/files/metrics/html_app.js?_xsrf=2%7Ce163cb61%7Cb9245804a283415ecb4c641f0cf1f882%7C1601372106"></script></body></html>'
        with open(WeightGraph._metrics_dir(repo_name) + "/" + self.view_name + ".html", 'w') as outfile:
            outfile.write(content)

    def plaintext_save(self, repo_name):
        """Write the graph as two lines of text to <export dir>/<view_name>.graph.txt.

        Line 1: node names joined by ";". Line 2: edges as "indexA,indexB,weight"
        joined by ";", where the indices refer to positions in line 1.
        """
        node_list = list(self.g.nodes)
        node2index = {node: index for index, node in enumerate(node_list)}
        edge_entries = [
            str(node2index[a]) + "," + str(node2index[b]) + "," + str(d["weight"])
            for a, b, d in self.g.edges(data=True)
        ]
        content = ";".join(node_list) + "\n" + ";".join(edge_entries)
        os.makedirs(EXPORT_SAVE_PATH + repo_name, exist_ok=True)
        with open(EXPORT_SAVE_PATH + repo_name + "/" + self.view_name + ".graph.txt", "w") as f:
            f.write(content)

    def print_statistics(self):
        """Print node/edge counts, connected component sizes and a summary of the edge weights."""
        # https://networkx.github.io/documentation/latest/tutorial.html#analyzing-graphs
        node_count = len(self.g.nodes)
        edge_count = len(self.g.edges)
        cc = list(nx.connected_components(self.g))
        print("WeightGraph statistics: "
              + str(node_count) + " nodes, "
              + str(edge_count) + " edges, "
              + str(len(cc)) + " connected component(s), with sizes: ["
              + ", ".join([str(len(c)) for c in cc])
              + "]")
        edge_weights = sorted(self._edge_weights())
        # avoid numpy's empty-mean warning on a graph without edges
        mean_weight = np.array(edge_weights).mean() if edge_weights else float("nan")
        print("Edge weights:", edge_weights[0:5], "...", edge_weights[-5:], ", mean:", mean_weight)

    @staticmethod
    def _plot_histogram(values, color, title):
        """Show a histogram of values with a dashed line at their mean."""
        # https://matplotlib.org/3.3.1/api/_as_gen/matplotlib.pyplot.hist.html
        plt.hist(values, "auto", facecolor=color, alpha=0.75)
        plt.axvline(np.array(values).mean(), color='k', linestyle='dashed', linewidth=1)
        plt.xlabel('Coupling Strength')
        plt.ylabel('Amount')
        plt.title(title)
        plt.grid(True)
        plt.show()

    def show_weight_histogram(self):
        """Show two histograms: the edge weights, and per node the sum of its incident edge weights."""
        WeightGraph._plot_histogram(self._edge_weights(), 'b', 'Histogram of edge weights in coupling graph')
        node_weights = [sum([self.g[n][n2]["weight"] for n2 in self.g.adj[n]]) for n in self.g.nodes]
        WeightGraph._plot_histogram(node_weights, 'g', 'Histogram of node weights')

    def visualize(self, use_spring = False, with_labels = True):
        """Draw the graph; edge opacity grows with the edge weight.

        use_spring: use a Kamada-Kawai layout driven by the per-edge "distance"
                    instead of the default layout of nx.draw.
        with_labels: forwarded to nx.draw.

        Side effect: stores a "distance" attribute (1 - weight + epsilon) on every edge.
        """
        # https://networkx.github.io/documentation/latest/reference/generated/networkx.drawing.nx_pylab.draw_networkx.html
        for a, b in self.g.edges:
            # NOTE(review): becomes negative for weights above 1 - confirm weights are normalized before drawing
            self.g[a][b]["distance"] = 1. - self.g[a][b]["weight"] + 0.000001  # the value must not be exactly zero

        edge_weights = self._edge_weights()
        if edge_weights:
            # weights above twice the mean are all drawn fully opaque
            target_max_weight = min(max(edge_weights), np.array(edge_weights).mean() * 2)
        else:
            target_max_weight = 0

        plt.figure(figsize=(8, 8))
        VIZ_POW = 1
        # no scaling possible (and none needed) if there are no edges or all weights are zero
        max_w_fact = (1. / target_max_weight) ** VIZ_POW if target_max_weight != 0 else 1.

        layout = nx.drawing.layout.kamada_kawai_layout(self.g, weight="distance") if use_spring else None

        edge_colors = [(0., 0., 0., min(1., (self.g[a][b]["weight"] ** VIZ_POW) * max_w_fact)) for a, b in self.g.edges]
        nx.draw(self.g, pos=layout, node_size=50, edge_color=edge_colors, node_color=[(0.121, 0.469, 0.703, 0.2)], with_labels=with_labels)

        plt.show()
        

In [None]:
MAX_COMMIT_FILES = 50
# needs to be separate so that multiprocessing lib can find it
def get_commit_diff(commit_hash, repo):
    """Return the list of paths changed by a commit, or None if the commit is skipped.

    Only commits with exactly one parent are analyzed. A commit is skipped (None)
    when it touches at most one file or more than MAX_COMMIT_FILES files, and
    again when the resulting list of changes has at most one or more than
    MAX_COMMIT_FILES entries.

    For a file present on both sides both versions are parsed; if parsing worked,
    the result of RepoTree.calculate_diff_to() is used, otherwise just the file path.
    """
    # declarations whose identifier sits directly in their own "name" field
    # TODO keep in sync with structural and linguistic view as well as RepoFile class
    name_field_types = ("class_declaration", "interface_declaration", "enum_declaration", "method_declaration")

    def walk_tree_cursor(cursor, prefix, content_bytes, node_handler):
        """Walk the subtree under the cursor, calling node_handler(logic_path, node) for each declaration found."""
        current = cursor.node
        if not current.is_named:
            return
        # cursor.current_field_name() is the role that this node has in its parent
        tree_node_name = None
        idfield = None
        if current.type in name_field_types:
            idfield = current.child_by_field_name("name")
        elif current.type == "field_declaration":
            declarator = current.child_by_field_name("declarator")
            if declarator is not None:
                idfield = declarator.child_by_field_name("name")
        elif current.type == "constructor_declaration":
            tree_node_name = "constructor"

        # child_by_field_name returns None for a missing field (e.g. in a partially
        # broken tree); such a node is not registered, but its children are still visited
        if idfield is not None:
            tree_node_name = content_bytes[idfield.start_byte:idfield.end_byte].decode("utf-8")

        if tree_node_name is not None:
            prefix = prefix + "/" + tree_node_name
            node_handler(prefix, current)

        if cursor.goto_first_child():
            walk_tree_cursor(cursor, prefix, content_bytes, node_handler)
            while cursor.goto_next_sibling():
                walk_tree_cursor(cursor, prefix, content_bytes, node_handler)
            cursor.goto_parent()

    def walk_tree(tree, content_bytes, base_path) -> RepoTree:
        """Collect all declarations of a parsed file in a RepoTree, with logic paths starting at base_path."""
        found_nodes = RepoTree(None, "")
        def handle(logic_path, ts_node):
            found_nodes.register(logic_path, ts_node)
        walk_tree_cursor(tree.walk(), base_path, content_bytes, handle)
        return found_nodes

    error_query = JA_LANGUAGE.query("(ERROR) @err")
    def _has_error(tree) -> bool:
        """True if the tree contains more than one ERROR capture."""
        # NOTE(review): a single ERROR capture is tolerated here - confirm "> 1" (rather than "> 0") is intended
        errors = error_query.captures(tree.root_node)
        return len(errors) > 1

    def blob_diff(diff) -> List[str]:
        """Return the changed paths for one file-level diff entry."""
        # the file exists on one side only: report the file as a whole
        if diff.a_blob is None:
            return [diff.b_path]
        elif diff.b_blob is None:
            return [diff.a_path]
        path = diff.a_path
        a_content = diff.a_blob.data_stream.read()
        b_content = diff.b_blob.data_stream.read()
        a_tree = java_parser.parse(a_content)
        b_tree = java_parser.parse(b_content)
        if _has_error(a_tree) or _has_error(b_tree):
            return [path] # I guess just the file changed, no more details available
        a_repo_tree = walk_tree(a_tree, a_content, path)
        b_repo_tree = walk_tree(b_tree, b_content, path)
        return a_repo_tree.calculate_diff_to(b_repo_tree, a_content, b_content)

    c1 = repo.get_commit(commit_hash)
    if len(c1.parents) != 1:
        # root commits and merge commits are skipped
        # TODO how to do sub-file diffs for merge commits?
        return None

    diff = c1.diff(c1.parents[0])
    # this check on the file count is duplicated here for performance
    # NOTE(review): it also drops single-file commits that may change several members - confirm this is wanted
    if len(diff) > MAX_COMMIT_FILES or len(diff) <= 1:
        return None
    diffs = [result for d in diff for result in blob_diff(d)]
    if len(diffs) > MAX_COMMIT_FILES or len(diffs) <= 1:
        return None
    return diffs

In [5]:
class MetricsGeneration:
    """Calculates the coupling graphs (evolutionary, structural, linguistic) of one repository."""
    # ascii art: http://patorjk.com/software/taag/#p=display&f=Soft&t=STRUCTURAL%0A.%0ALINGUISTIC%0A.%0AEVOLUTIONARY%0A.%0ADYNAMIC
    def __init__(self, repo):
        """Store the repository object whose commits and files are analyzed."""
        self.repo = repo
        
    def calculate_evolutionary_connections(self) -> WeightGraph:
        r"""Couple items that were changed together in the same commit.

        Every commit for which get_commit_diff() returns a change list adds
        2 / len(changes) to the edge of each pair of changed items. Edges that
        stay below a small minimum weight are removed at the end.

,------.,--.   ,--.,-----. ,--.   ,--. ,--.,--------.,--. ,-----. ,--.  ,--.  ,---.  ,------.,--.   ,--.
|  .---' \  `.'  /'  .-.  '|  |   |  | |  |'--.  .--'|  |'  .-.  '|  ,'.|  | /  O  \ |  .--. '\  `.'  /
|  `--,   \     / |  | |  ||  |   |  | |  |   |  |   |  ||  | |  ||  |' '  ||  .-.  ||  '--'.' '.    /
|  `---.   \   /  '  '-'  '|  '--.'  '-'  '   |  |   |  |'  '-'  '|  | `   ||  | |  ||  |\  \    |  |
`------'    `-'    `-----' `-----' `-----'    `--'   `--' `-----' `--'  `--'`--' `--'`--' '--'   `--'
        """
        # MAX_COMMIT_FILES = 50  # Ignore too large commits. (constant moved)
        PARALLEL_THREADS = 45  # more seems to make it worse again?
        PARALLEL_BATCH_SIZE = 2  # the size of packets delivered to worker processes
        MIN_EDGE_WEIGHT = 0.005  # edges below this accumulated weight are dropped
        
        coupling_graph = WeightGraph("evolutionary")
        
        def processDiffs(diffs):
            # the graph is only modified here, in the parent process
            score = 2 / len(diffs)
            for f1, f2 in all_pairs(diffs):
                coupling_graph.add_sym(f1, f2, score)
        
        all_commits = list(self.repo.get_all_commits())
        print("Commits to analyze: " + str(len(all_commits)))

        with Pool(processes=PARALLEL_THREADS) as pool:
            bar = log_progress(total=len(all_commits), desc="Analyzing commits", smoothing=0.1)
            try:
                # for single-process debugging, replace the pool call by a plain loop over
                # get_commit_diff(commit, repo=self.repo)
                diffs = pool.imap_unordered(partial(get_commit_diff, repo=self.repo), all_commits, PARALLEL_BATCH_SIZE)
                for elem in diffs:
                    if elem is not None:
                        processDiffs(elem)
                    bar.update()
            finally:
                bar.close()
        coupling_graph.cutoff_edges(MIN_EDGE_WEIGHT)
        return coupling_graph
    
    
    def calculate_structural_connections(self) -> WeightGraph:
        r"""Couple files and members through imports and through the types they mention.

        A file is coupled to every imported class that can be resolved inside the
        repository. Fields are coupled to the types in their declaration, methods
        and constructors to their return and parameter types.

 ,---. ,--------.,------. ,--. ,--. ,-----.,--------.,--. ,--.,------.   ,---.  ,--.
'   .-''--.  .--'|  .--. '|  | |  |'  .--./'--.  .--'|  | |  ||  .--. ' /  O  \ |  |
`.  `-.   |  |   |  '--'.'|  | |  ||  |       |  |   |  | |  ||  '--'.'|  .-.  ||  |
.-'    |  |  |   |  |\  \ '  '-'  ''  '--'\   |  |   '  '-'  '|  |\  \ |  | |  ||  '--.
`-----'   `--'   `--' '--' `-----'  `-----'   `--'    `-----' `--' '--'`--' `--'`-----'
        """
        STRENGTH_FILE_IMPORT = 1
        STRENGTH_MEMBER_CLASS = 1
        STRENGTH_METHOD_RETURN_CLASS = 1
        STRENGTH_METHOD_PARAM_CLASS = 1
        
        builtin_types = set(['void', 'String', 'byte', 'short', 'int', 'long', 'float', 'double', 'boolean', 'char', 'Byte', 'Short', 'Integer', 'Long', 'Float', 'Double', 'Boolean', 'Character'])
        stl_types = set(['ArrayList', 'List', 'LinkedList', 'Map', 'HashMap', 'Object'])
        ignored_types = builtin_types.union(stl_types)
        
        coupling_graph = WeightGraph("structural")

        error_query = JA_LANGUAGE.query("(ERROR) @err")
        package_query_1 = JA_LANGUAGE.query("(package_declaration (identifier) @decl)")
        package_query_2 = JA_LANGUAGE.query("(package_declaration (scoped_identifier) @decl)")
        import_query = JA_LANGUAGE.query("(import_declaration (scoped_identifier) @decl)")
        class_query = JA_LANGUAGE.query("(class_declaration name: (identifier) @decl)")

        def _has_error(file) -> bool:
            """True if the file's syntax tree contains more than one ERROR capture."""
            errors = error_query.captures(file.get_tree().root_node)
            return len(errors) > 1

        def _get_package(file) -> List[str]:
            """Return the package name split at the dots, or [] if there is not exactly one package declaration."""
            root = file.get_tree().root_node
            packages = package_query_1.captures(root) + package_query_2.captures(root)
            if len(packages) == 1:
                return file.node_text(packages[0][0]).split(".")
            if len(packages) > 1:
                # this used to drop into the debugger, which blocks unattended runs
                print("  More than one package declaration in " + file.get_path() + ", ignoring the package")
            return []

        def _get_imports(file) -> List[str]:
            """Return the imported names of the file, without those starting with "java"."""
            result = []
            for import_statement in import_query.captures(file.get_tree().root_node):
                import_string = file.node_text(import_statement[0])
                # NOTE(review): this prefix test also drops packages like "javax..." or "javassist..." - confirm intended
                if not import_string.startswith("java"):
                    result.append(import_string)
            return result

        def _get_main_class_name(file):
            """Return the name of the first class declaration captured in the file, or None."""
            classes = class_query.captures(file.get_tree().root_node)
            if len(classes) >= 1:
                return file.node_text(classes[0][0])
            return None

        #######

        # fully qualified class name -> path of the class node (or of the file if there is no class node)
        full_class_name_to_id = {}

        files = self._get_all_files()
        for file in log_progress(files, desc="Building Import Graph", smoothing=0.1):
            if _has_error(file):
                continue
            class_name = _get_main_class_name(file)
            if class_name is None:
                continue
            full_class_name = ".".join(_get_package(file) + [class_name])
            target_path = file.get_path()
            # the tree node can be missing (see the extraction loop below), fall back to the file path then
            tree_node = file.get_repo_tree_node()
            class_node = tree_node.find_node(class_name) if tree_node is not None else None
            if class_node is not None:
                target_path = class_node.get_path()
            full_class_name_to_id[full_class_name] = target_path
                
        def _resolve_type(type_name, imports, file_tree_node: RepoTree) -> str:
            """find the node name of the node representing this type, if any, or None"""
            # in imports
            matching_imports = [i for i in imports if i.endswith("." + type_name) or i == type_name]
            if len(matching_imports) > 1:
                raise Exception("Ambiguous import!")
            elif len(matching_imports) == 1:
                import_path = matching_imports[0]
                if import_path in full_class_name_to_id:
                    return full_class_name_to_id[import_path]
            # in same file
            classes = file_tree_node.get_descendants_of_type("class") + file_tree_node.get_descendants_of_type("interface")
            matching_classes = [c for c in classes if c.name == type_name]
            if len(matching_classes) > 1:
                raise Exception("Ambiguous class name!")
            elif len(matching_classes) == 1:
                return matching_classes[0].get_path()
            # in same package
            type_file_name = type_name + ".java"
            if file_tree_node.parent.has_child(type_file_name):
                class_file = file_tree_node.parent.find_node(type_file_name)
                class_node = class_file.find_node(type_name)
                if class_node is None:
                    return class_file.get_path()
                else:
                    return class_node.get_path()
            return None
        
        def _couple_type(type_text, coupling_path, coupling_strength, imports, file_tree_node):
            """Couple coupling_path to every resolvable type name mentioned in type_text."""
            # split at everything that is not a word character, e.g. "Map<String, Foo[]>" -> Map, String, Foo
            data_types = [dt.strip() for dt in re.split(r"[^\w]+", type_text)]
            data_types = [dt for dt in data_types if len(dt) > 0 and not dt[0].isdigit()]
            data_types = [dt for dt in data_types if dt not in ignored_types]
            for data_type in data_types:
                resolved_type = _resolve_type(data_type, imports, file_tree_node)
                if resolved_type is not None:
                    print("  Coupling " + coupling_path + " to " + resolved_type)
                    coupling_graph.add(coupling_path, resolved_type, coupling_strength)
                else:
                    print("  Cannot resolve type: " + data_type)

        for file in log_progress(files, desc="Extraction connections", smoothing=0.1):
            imports = _get_imports(file)
            for i in imports:
                if i in full_class_name_to_id:
                    coupling_graph.add(file.get_path(), full_class_name_to_id[i], STRENGTH_FILE_IMPORT)
        
            node = file.get_repo_tree_node()  # TODO unify with linguistic view code
            if node is None:
                continue  # TODO why / when does this happen?

            def _couple_type_node(type_node, coupling_path, coupling_strength):
                # a missing field (e.g. the return type of a constructor) yields None: nothing to couple
                if type_node is not None:
                    _couple_type(file.node_text(type_node), coupling_path, coupling_strength, imports, node)
                
            # TODO keep in sync with evolutionary and linguistic view as well as RepoFile class
            classes = node.get_descendants_of_type("class") + node.get_descendants_of_type("interface") + node.get_descendants_of_type("enum")
            for class_node in classes:
                fields = class_node.get_children_of_type("field")
                methods = class_node.get_children_of_type("method") + class_node.get_children_of_type("constructor")
                print("Class " + class_node.name + ": " + str(len(methods)) + " methods and " + str(len(fields)) + " fields")
                
                for field in fields:
                    _couple_type_node(field.ts_node.child_by_field_name("type"), field.get_path(), STRENGTH_MEMBER_CLASS)
                            
                for method in methods:
                    _couple_type_node(method.ts_node.child_by_field_name("type"), method.get_path(), STRENGTH_METHOD_RETURN_CLASS)
                    parameters_node = method.ts_node.child_by_field_name("parameters")
                    if parameters_node is None:
                        continue
                    for parameter in [p for p in parameters_node.children if p.type == 'formal_parameter']:
                        _couple_type_node(parameter.child_by_field_name("type"), method.get_path(), STRENGTH_METHOD_PARAM_CLASS)

        return coupling_graph
    
    
    def calculate_linguistic_connections(self) -> WeightGraph:
        r"""Couple fields and methods whose words (names, comments, code) cover similar topics.

        The text of every member is split into normalized words, a biterm topic
        model is trained on all members, and every pair of members is connected
        with a weight derived from the cosine similarity of their topic vectors.
        Edges below DOCUMENT_SIMILARITY_CUTOFF are removed at the end.

,--.   ,--.,--.  ,--. ,----.   ,--. ,--.,--. ,---. ,--------.,--. ,-----.
|  |   |  ||  ,'.|  |'  .-./   |  | |  ||  |'   .-''--.  .--'|  |'  .--./
|  |   |  ||  |' '  ||  | .---.|  | |  ||  |`.  `-.   |  |   |  ||  |
|  '--.|  ||  | `   |'  '--'  |'  '-'  '|  |.-'    |  |  |   |  |'  '--'\
`-----'`--'`--'  `--' `------'  `-----' `--'`-----'   `--'   `--' `-----'
        """
        
        #
        # TODO: https://pypi.org/project/biterm/
        #
        
        # constants
        MIN_WORD_LENGTH = 3
        MAX_WORD_LENGTH = 50
        # MIN_WORD_USAGES = 2  # any word used less often will be ignored
        MAX_DF = 0.95  # any terms that appear in a bigger proportion of the documents than this will be ignored (corpus-specific stop-words)
        MAX_FEATURES = 1500  # the size of the LDA thesaurus - amount of words to consider for topic learning
        TOPIC_COUNT = 20 # 40  # 100 according to paper
        BTM_ITERATIONS = 30  # 100 according to docs?
        # LDA_PASSES = 200  # how often to go through the corpus
        # LDA_RANDOM_SEED = 42
        DOCUMENT_SIMILARITY_EXP = 6 # higher = lower equality values, lower = equality values are all closer to 1
        DOCUMENT_SIMILARITY_CUTOFF = 0.1  # in range [0 .. 1]: everything below this is dropped
        WORDS_TO_SHOW_PER_TOPIC = 10
        
        # keywords from python, TS and Java
        custom_stop_words = ["abstract", "and", "any", "as", "assert", "async", "await", "boolean", "break", "byte", "case", "catch", "char", "class", "const", "constructor", "continue", "debugger", "declare", "def", "default", "del", "delete", "do", "double", "elif", "else", "enum", "except", "export", "extends", "false", "False", "final", "finally", "float", "for", "from", "function", "get", "global", "goto", "if", "implements", "import", "in", "instanceof", "int", "interface", "is", "lambda", "let", "long", "module", "new", "None", "nonlocal", "not", "null", "number", "of", "or", "package", "pass", "private", "protected", "public", "raise", "require", "return", "set", "short", "static", "strictfp", "string", "super", "switch", "symbol", "synchronized", "this", "throw", "throws", "transient", "true", "True", "try", "type", "typeof", "var", "void", "volatile", "while", "with", "yield"]
        stop_words = set(list(get_stop_words('en')) + list(stopwords.words('english')) + custom_stop_words)
        # splits at non-word characters / underscores and at camelCase boundaries
        splitter = r"(?:[\W_]+|(?<![A-Z])(?=[A-Z])|(?<!^)(?=[A-Z][a-z]))"
        lemma = WordNetLemmatizer()
        printable_characters = set(string.printable)
        
        def _normalize_word(word):
            """Lowercase the word and lemmatize it as a noun, then as a verb."""
            return lemma.lemmatize(lemma.lemmatize(word.lower(), pos = "n"), pos = "v")
        
        def _get_text(content_string):
            """Turn raw text into a list of normalized words without stop words."""
            # https://stackoverflow.com/questions/5486337/how-to-remove-stop-words-using-nltk-or-python
            # https://agailloty.rbind.io/en/project/nlp_clean-text/
            content_string = ''.join(c for c in content_string if c in printable_characters)
            words = re.split(splitter, content_string)
            # stop words are removed before and after normalization, since normalizing can create new ones
            words = [word for word in words if word not in stop_words]
            words = [_normalize_word(word) for word in words]
            words = [word for word in words if MIN_WORD_LENGTH <= len(word) <= MAX_WORD_LENGTH]
            words = [word for word in words if word not in stop_words]
            return words
        
        def array_similarity(a, b):
            """given two arrays of numbers, how equal are they?"""
            return math.pow(1. - distance.cosine(a, b), DOCUMENT_SIMILARITY_EXP)  # TODO check for alternative distance metrics?
        
        
        print("Extracting words...")
        
        files = self._get_all_files()
        
        node_words = []  # List of (RepoTree-Node,wordList) - tuples
        for file in log_progress(files, desc="Extraction language corpus", smoothing=0.1):
            node = file.get_repo_tree_node()  # TODO unify with structural view code
            if node is None:
                continue  # TODO why / when does this happen?
            
            # TODO keep in sync with evolutionary and structural view as well as RepoFile class
            classes = node.get_descendants_of_type("class") + node.get_descendants_of_type("interface") + node.get_descendants_of_type("enum")
            for class_node in classes:
                fields = class_node.get_children_of_type("field")
                methods = class_node.get_children_of_type("method") + class_node.get_children_of_type("constructor")
                
                for member in fields + methods:
                    text = member.get_comment_and_own_text(file)
                    # the path of the class is part of the text, so its words count for every member
                    words = list(_get_text(class_node.get_path() + " " + text))
                    # TODO: handle the empty list?!?
                    node_words.append((member, words))
        
        coupling_graph = WeightGraph("linguistic")
        if len(node_words) == 0:
            # nothing to vectorize: the vectorizer would fail on an empty corpus
            print("No fields or methods found, returning an empty graph.")
            return coupling_graph
        
        # https://pypi.org/project/biterm/
        print("Vectorizing words...")
        texts = [" ".join(words) for (node, words) in node_words]
        vec = CountVectorizer(max_df=MAX_DF, max_features=MAX_FEATURES)
        X = vec.fit_transform(texts).toarray()
        # get_feature_names() is gone in newer scikit-learn versions, its replacement is missing in older ones
        if hasattr(vec, "get_feature_names_out"):
            feature_names = vec.get_feature_names_out()
        else:
            feature_names = vec.get_feature_names()
        vocab = np.array(list(feature_names))
        
        print("Instantiating BTM...")
        btm = oBTM(num_topics=TOPIC_COUNT, V=vocab)
        biterms = vec_to_biterms(X)
        
        print("Training model...")
        # NOTE(review): no random seed is set, so results presumably differ between runs - confirm
        topics = btm.fit_transform(biterms, iterations=BTM_ITERATIONS)
        
        print("Generating topic output...")
        topic_summuary(btm.phi_wz.T, X, vocab, WORDS_TO_SHOW_PER_TOPIC)
        
        print("Generating coupling graph...")
        
        member_count = len(node_words)
        for n1 in range(member_count):
            t1 = topics[n1]
            if np.isnan(t1[0]):
                continue  # TODO filter out those earlier
            path1 = node_words[n1][0].get_path()
            # every unordered pair is visited once
            for n2 in range(n1 + 1, member_count):
                t2 = topics[n2]
                if np.isnan(t2[0]):
                    continue
                similarity = array_similarity(t1, t2)
                coupling_graph.add_sym(path1, node_words[n2][0].get_path(), similarity)
                
        print("Trimming graph...")
        
        coupling_graph.cutoff_edges(DOCUMENT_SIMILARITY_CUTOFF)
        
        return coupling_graph
            
    
    # -------------------------------------------------------------------------------------------
    
    def _get_all_files(self) -> List[RepoFile]:
        """Wrap every file object of the repository in a RepoFile."""
        return [RepoFile(self.repo, o) for o in self.repo.get_file_objects()]

In [None]:
class MetricManager:
    """Gives access to the coupling graphs of a repository, using the pickle files on disk as cache."""
    
    @staticmethod
    def clear(repo, name):
        """Delete the stored graph of the given view name, if there is one."""
        if not MetricManager._data_present(repo.name, name):
            return
        os.remove(WeightGraph.pickle_path(repo.name, name))
    
    @staticmethod
    def get(repo, name) -> WeightGraph:
        """Return the graph of the given view name: loaded from disk if stored, calculated and stored otherwise.

        A new graph is produced by MetricsGeneration.calculate_<name>_connections(),
        cleaned up and saved before it is returned.
        """
        if not MetricManager._data_present(repo.name, name):
            print("No precalculated " + name + " values found, starting calculations...")
            generator = MetricsGeneration(repo)
            calculate = getattr(generator, "calculate_" + name + "_connections")
            fresh_graph = calculate()
            fresh_graph.cleanup()
            print("Calculated " + name + " values, saving them now...")
            fresh_graph.save(repo.name)
            return fresh_graph
        print("Using precalculated " + name + " values")
        return WeightGraph.load(repo.name, name)
    
    @staticmethod
    def _data_present(repo_name, name):
        """True if a pickle file exists for this repo and view name."""
        pickle_file = WeightGraph.pickle_path(repo_name, name)
        return os.path.isfile(pickle_file)