In [1]:
%run util.ipynb
%run parsing.ipynb
%run Graph.ipynb
%run LocalRepo.ipynb
%run structural_parsing.ipynb
%run linguistic_parsing.ipynb
import networkx as nx
from networkx.readwrite import json_graph
import json
import matplotlib.pyplot as plt
import numpy as np
import math
import time
from typing import List
from stop_words import get_stop_words
from nltk.corpus import stopwords
import re
import regex  # the cooler "re"
from nltk import WordNetLemmatizer, FreqDist
import string
from random import shuffle
from gensim.corpora.dictionary import Dictionary
from gensim import similarities
from gensim.models import LdaModel, LdaMulticore
from multiprocessing import Pool, TimeoutError, Process, Manager, Lock
from functools import partial
import pdb

In [3]:
# Commits whose diff touches more than this many paths are skipped by get_commit_diff.
MAX_COMMIT_FILES = 50
from timeit import default_timer as timer

# needs to be separate so that multiprocessing lib can find it
def get_commit_diff(commit_hash, repo):
    """Return the paths changed by one commit, refined to sub-file level where possible.

    The function lives at module level so that it can be handed to worker
    processes by the multiprocessing machinery.

    Args:
        commit_hash: identifier passed to ``repo.get_commit``.
        repo: repository wrapper; ``get_commit`` and ``type_extension`` are used.

    Returns:
        A list of changed paths, or ``None`` when the commit is skipped: it does
        not have exactly one parent, or it touches more than
        ``MAX_COMMIT_FILES`` paths, or at most one path.
    """

    def walk_tree_cursor(cursor, prefix, content_bytes, node_handler):
        """Recursively visit named syntax nodes and report declarations.

        ``node_handler(logic_path, node)`` is called once per declared name;
        nested declarations are reported below the first name of their parent.
        """
        node = cursor.node
        if not node.is_named:
            return

        def node_text(ts_node):
            return decode(content_bytes[ts_node.start_byte:ts_node.end_byte])

        # cursor.current_field_name() is the role that this node has in its parent
        # TODO keep in sync with structural and linguistic view as well as RepoFile class
        tree_node_names = []
        if node.type in ("class_declaration", "interface_declaration", "enum_declaration", "method_declaration"):
            tree_node_names.append(node_text(node.child_by_field_name("name")))
        elif node.type == "field_declaration":
            # One field declaration may introduce several variables.
            declarators = [child for child in node.children if child.type == "variable_declarator"]
            tree_node_names += [node_text(d.child_by_field_name("name")) for d in declarators]
        elif node.type == "constructor_declaration":
            tree_node_names.append("constructor")

        for tree_node_name in tree_node_names:
            node_handler(prefix + "/" + tree_node_name, node)
        if tree_node_names:
            # Children are nested below the first declared name only.
            prefix = prefix + "/" + tree_node_names[0]

        if cursor.goto_first_child():
            walk_tree_cursor(cursor, prefix, content_bytes, node_handler)
            while cursor.goto_next_sibling():
                walk_tree_cursor(cursor, prefix, content_bytes, node_handler)
            cursor.goto_parent()

    def walk_tree(tree, content_bytes, base_path) -> RepoTree:
        """Collect every declaration of a parsed file into a fresh RepoTree.

        Returns ``None`` (after logging) when walking the tree raises, so the
        caller can fall back to reporting the whole file as changed.
        """
        try:
            found_nodes = RepoTree(None, "")

            def handle(logic_path, ts_node):
                found_nodes.register(logic_path, ts_node)

            walk_tree_cursor(tree.walk(), base_path, content_bytes, handle)
            return found_nodes
        except Exception as e:
            # Best effort: no debugger breakpoint here, this code runs inside
            # worker processes where an interactive prompt would just block.
            print("Failed to parse file:", base_path, "Error:", e)
            return None

    error_query = JA_LANGUAGE.query("(ERROR) @err")

    def _has_error(tree) -> bool:
        """Tell whether the parse tree contains more than one ERROR capture."""
        errors = error_query.captures(tree.root_node)
        # NOTE(review): a single ERROR capture is tolerated by this threshold;
        # confirm whether `> 0` was intended.
        return len(errors) > 1

    def blob_diff(diff) -> List[str]:
        """Map one diff entry to the list of changed paths it contributes."""
        # A missing blob means the file exists on only one side of the diff
        # (added or removed); report the path of the side that has it.
        if diff.a_blob is None:
            return [diff.b_path]
        elif diff.b_blob is None:
            return [diff.a_path]
        path = diff.a_path
        if not path.endswith("." + repo.type_extension()):
            return [path]  # not a source file of the analysed type: file-level change only
        a_content = diff.a_blob.data_stream.read()
        if should_skip_file(a_content):
            return []
        b_content = diff.b_blob.data_stream.read()
        if should_skip_file(b_content):
            return []
        a_tree = java_parser.parse(a_content)
        b_tree = java_parser.parse(b_content)
        if _has_error(a_tree) or _has_error(b_tree):
            return [path]  # I guess just the file changed, no more details available
        a_repo_tree = walk_tree(a_tree, a_content, path)
        if a_repo_tree is None:
            return [path]
        b_repo_tree = walk_tree(b_tree, b_content, path)
        if b_repo_tree is None:
            return [path]
        return a_repo_tree.calculate_diff_to(b_repo_tree, a_content, b_content)

    c1 = repo.get_commit(commit_hash)
    if len(c1.parents) != 1:
        # Only single-parent commits are analysed.
        # TODO how to do sub-file diffs for merge commits? One idea that was
        # tried: diff against both parents and intersect the two path sets.
        return None

    c2 = c1.parents[0]
    diff = c1.diff(c2)
    # Size check before the expensive parsing step (repeated below on the
    # refined list). Note that this also drops single-file commits before any
    # sub-file refinement happens.
    if len(diff) > MAX_COMMIT_FILES or len(diff) <= 1:
        return None

    diffs = [result for d in diff for result in blob_diff(d)]
    if len(diffs) > MAX_COMMIT_FILES or len(diffs) <= 1:
        return None
    return diffs

In [4]:
class MetricsGeneration:
    """Builds the coupling graphs (evolutionary, structural, linguistic) for one repository.

    Every metric ``<name>`` has a ``calculate_<name>_connections`` method that
    creates the graph and a ``post_<name>`` hook that is applied to a graph
    after it has been created or loaded.
    """

    # ascii art: http://patorjk.com/software/taag/#p=display&f=Soft&t=STRUCTURAL%0A.%0ALINGUISTIC%0A.%0AEVOLUTIONARY%0A.%0ADYNAMIC
    def __init__(self, repo):
        """Remember the repository all calculations operate on."""
        self.repo = repo

    def calculate_evolutionary_connections(self) -> CouplingGraph:
        r"""Couple nodes that were changed together in the same commits.

,------.,--.   ,--.,-----. ,--.   ,--. ,--.,--------.,--. ,-----. ,--.  ,--.  ,---.  ,------.,--.   ,--. 
|  .---' \  `.'  /'  .-.  '|  |   |  | |  |'--.  .--'|  |'  .-.  '|  ,'.|  | /  O  \ |  .--. '\  `.'  /  
|  `--,   \     / |  | |  ||  |   |  | |  |   |  |   |  ||  | |  ||  |' '  ||  .-.  ||  '--'.' '.    /   
|  `---.   \   /  '  '-'  '|  '--.'  '-'  '   |  |   |  |'  '-'  '|  | `   ||  | |  ||  |\  \    |  |    
`------'    `-'    `-----' `-----' `-----'    `--'   `--' `-----' `--'  `--'`--' `--'`--' '--'   `--'    
        """
        # MAX_COMMIT_FILES = 50  # Ignore too large commits. (constant moved)

        coupling_graph = ExplicitCouplingGraph("evolutionary")

        def processDiffs(diffs):
            """Add the co-change information of one commit to the graph."""
            # get_commit_diff returns None for commits it skips; ignore those
            # (and empty results) should they reach this handler.
            if not diffs:
                return
            # The weight is based on the full change count, before filtering.
            score = 2 / len(diffs)
            repo_tree = self.repo.get_tree()
            diffs = [d for d in diffs if repo_tree.has_node(d)]
            for f1, f2 in all_pairs(diffs):
                coupling_graph.add(f1, f2, score)
            for node in diffs:
                coupling_graph.add_support(node, 1)

        print("Discovering commits...")
        all_commits = list(self.repo.get_all_commits())
        # shuffle(all_commits)
        print("Done!")
        # Use this instance's repository rather than a notebook-global variable.
        # NOTE(review): presumably this builds the tree once before the
        # parallel phase starts - confirm.
        self.repo.get_tree()
        print("Commits to analyze: " + str(len(all_commits)))

        map_parallel(
            all_commits,
            partial(get_commit_diff, repo=self.repo),
            processDiffs,
            "Analyzing commits",
            force_non_parallel=False
        )

        coupling_graph.cutoff_edges(0.005)
        coupling_graph.cleanup(3)
        return coupling_graph

    def post_evolutionary(self, coupling_graph: CouplingGraph):
        """Post-processing hook for the evolutionary graph; currently a no-op."""
        pass

    def calculate_structural_connections(self) -> CouplingGraph:
        r"""Couple nodes by imports, inheritance and member content.

 ,---. ,--------.,------. ,--. ,--. ,-----.,--------.,--. ,--.,------.   ,---.  ,--.                     
'   .-''--.  .--'|  .--. '|  | |  |'  .--./'--.  .--'|  | |  ||  .--. ' /  O  \ |  |                     
`.  `-.   |  |   |  '--'.'|  | |  ||  |       |  |   |  | |  ||  '--'.'|  .-.  ||  |                     
.-'    |  |  |   |  |\  \ '  '-'  ''  '--'\   |  |   '  '-'  '|  |\  \ |  | |  ||  '--.                  
`-----'   `--'   `--' '--' `-----'  `-----'   `--'    `-----' `--' '--'`--' `--'`-----'   
        """
        coupling_graph = ExplicitCouplingGraph("structural")

        context = StructuralContext(self.repo)
        context.couple_files_by_import(coupling_graph)
        # Spelling follows the call target; rename only together with the method.
        context.couple_by_ineritance(coupling_graph)
        context.couple_members_by_content(coupling_graph)
        coupling_graph.cleanup(3)
        flush_unresolvable_vars()

        return coupling_graph

    def post_structural(self, coupling_graph: CouplingGraph):
        """Post-processing hook for the structural graph: dilate, then propagate down."""
        coupling_graph.dilate(1, 0.2)
        coupling_graph.propagate_down(2, 0.5)

    def calculate_linguistic_connections(self) -> CouplingGraph:
        r"""Couple nodes by the similarity of their topic-model representation.

,--.   ,--.,--.  ,--. ,----.   ,--. ,--.,--. ,---. ,--------.,--. ,-----.                                
|  |   |  ||  ,'.|  |'  .-./   |  | |  ||  |'   .-''--.  .--'|  |'  .--./                                
|  |   |  ||  |' '  ||  | .---.|  | |  ||  |`.  `-.   |  |   |  ||  |                                    
|  '--.|  ||  | `   |'  '--'  |'  '-'  '|  |.-'    |  |  |   |  |'  '--'\                                
`-----'`--'`--'  `--' `------'  `-----' `--'`-----'   `--'   `--' `-----'              
        """
        coupling_graph = SimilarityCouplingGraph("linguistic")

        node_words = extract_topic_model_documents(self.repo.get_all_interesting_files())
        topics = train_topic_model(node_words)
        couple_by_topic_similarity(node_words, topics, coupling_graph)

        return coupling_graph

    def post_linguistic(self, coupling_graph: CouplingGraph):
        """Post-processing hook for the linguistic graph; currently a no-op."""
        pass
            
    
    # -------------------------------------------------------------------------------------------

In [5]:
class MetricManager:
    """Access point for coupling graphs, backed by an in-memory cache and stored files."""

    # Maps cache_key(repo, name) to a graph that was already loaded or computed.
    graph_cache = {}

    @staticmethod
    def cache_key(repo, name):
        """Compose the cache identifier for a repository / metric-name pair."""
        return "-".join((repo.name, name))

    @staticmethod
    def clear(repo, name):
        """Forget the cached graph and delete its stored file, if there is one."""
        key = MetricManager.cache_key(repo, name)
        MetricManager.graph_cache.pop(key, None)
        if not MetricManager._data_present(repo.name, name):
            return
        os.remove(CouplingGraph.pickle_path(repo.name, name))

    @staticmethod
    def get(repo, name) -> CouplingGraph:
        """Return the coupling graph ``name`` for ``repo``.

        Lookup order: in-memory cache, stored file, fresh calculation (which is
        saved afterwards). Loaded and calculated graphs both pass through the
        matching ``post_<name>`` hook before they are cached. The
        "module_distance" graph is created anew on every call and never cached.
        """
        if name == "module_distance":
            return ModuleDistanceCouplingGraph()

        key = MetricManager.cache_key(repo, name)
        if key in MetricManager.graph_cache:
            return MetricManager.graph_cache[key]

        if MetricManager._data_present(repo.name, name):
            print("Using precalculated " + name + " values")
            graph = CouplingGraph.load(repo.name, name)
        else:
            print("No precalculated " + name + " values found, starting calculations...")
            calculate = getattr(MetricsGeneration(repo), "calculate_" + name + "_connections")
            graph = calculate()
            print("Calculated " + name + " values, saving them now...")
            graph.save(repo.name)

        # Shared tail of both branches: post-process, remember, hand out.
        post_process = getattr(MetricsGeneration(repo), "post_" + name)
        post_process(graph)
        MetricManager.graph_cache[key] = graph
        return graph

    @staticmethod
    def _data_present(repo_name, name):
        """Check whether a stored graph file exists for this repository and metric."""
        stored_path = CouplingGraph.pickle_path(repo_name, name)
        return os.path.isfile(stored_path)