In [None]:
import pdb
%matplotlib inline
%run LocalRepo.ipynb
%run repos.ipynb
%run parsing.ipynb
%run metrics.ipynb

In [None]:
class AnalysisGraph:
    """Hierarchy-aware view on a coupling graph whose node names are "/"-separated paths.

    The wrapped object ``g`` must provide ``g.g`` (the underlying graph, used via
    ``g.g.nodes``, ``node in g.g`` and ``g.g[node]``), ``g.get(a, b)``,
    ``g.get_support(node)`` and ``g.propagate_down(layers, weight_factor)``.
    """

    def __init__(self, g):
        """Wrap ``g`` and build the parent -> children index for all graph nodes."""
        self.g = g
        self.children = {}
        for n in self.g.g.nodes:
            # Walk up the path hierarchy and register every ancestor relation.
            # The walk ends once get_parent() returns None, so the topmost
            # node ends up registered as a child of the key None.
            while n is not None:
                p = self.get_parent(n)
                self.children.setdefault(p, set()).add(n)
                n = p
        self.total_relative_coupling_cache = {}
        self.median_maximum_support_cache = None

    def propagate_down(self, layers = 1, weight_factor = 0.2):
        """Forward to the wrapped graph's propagate_down()."""
        self.g.propagate_down(layers, weight_factor)

    def get_node_set(self):
        """Return the nodes of the underlying graph as a new set."""
        return set(self.g.g.nodes)

    def get_children(self, node):
        """Return the known direct children of node (empty list if there are none)."""
        return self.children.get(node, [])

    def get_parent(self, node):
        """Return the path of node without its last "/"-component.

        Nodes of length <= 1 (this includes the root "") have no parent: None.
        A longer path without any "/" has the root "" as parent.
        """
        if len(node) <= 1:
            return None
        return "/".join(node.split("/")[:-1])

    def get_directly_coupled(self, node):
        """Return the neighbours of node in the underlying graph (empty if node is unknown)."""
        if node in self.g.g:
            return list(self.g.g[node])
        return []

    def get_siblings(self, node):
        """Return all the other children of this.parent (excluding this)"""
        parent = self.get_parent(node)
        if parent is None:
            return []
        return [c for c in self.get_children(parent) if c != node]

    def get_coupling_candidates(self, node, add_predecessors = False):
        """Return all the nodes with which the given one could have a non-zero relative coupling.

        These are the directly coupled nodes of node and its descendants. With
        add_predecessors=True, all predecessors (ancestors) of those nodes are
        included as well. The root "" is never part of the result.
        """
        direct_coupling_candidates = []
        for n in self.get_self_and_descendants(node):
            direct_coupling_candidates.extend(self.get_directly_coupled(n))

        result = {""}  # root node, added to stop the predecessor iteration
        for other in direct_coupling_candidates:
            # set.add() always returns None, so membership has to be tested
            # explicitly; otherwise the predecessor walk would never run.
            while other is not None and other not in result:
                result.add(other)
                if not add_predecessors:
                    break
                other = self.get_parent(other)
        result.remove("")  # root node - removed, since not very interesting
        return result

    def get_self_and_descendants(self, node):
        """return the given node and all its descendants as a list"""
        result = []
        self.get_self_and_descendants_rec(node, result)
        return result

    def get_self_and_descendants_rec(self, node, result):
        """add the given node and all its descendants into the provided list"""
        result.append(node)
        for child in self.get_children(node):
            self.get_self_and_descendants_rec(child, result)

    def get_direct_coupling(self, a, b):
        """direct coupling between a and b"""
        return self.g.get(a, b)

    def get_direct_multi_coupling(self, a, others):
        """sum of direct coupling between a and all b"""
        return sum(self.get_direct_coupling(a, b) for b in others)

    def get_relative_coupling(self, a, b):
        """sum of direct coupling between a+descendants and b+descendants"""
        others = self.get_self_and_descendants(b)
        return self.get_relative_multi_direct_coupling(a, others)

    def get_relative_multi_coupling(self, a, others):
        """sum of direct coupling between a+descendants and others+descendants"""
        direct_others = []
        for other in others:
            self.get_self_and_descendants_rec(other, direct_others)
        return self.get_relative_multi_direct_coupling(a, direct_others)

    def get_relative_multi_direct_coupling(self, a, others):
        """sum of direct coupling between a+descendants and others"""
        if len(others) == 0:
            return 0
        result = self.get_direct_multi_coupling(a, others)
        for child in self.get_children(a):
            result += self.get_relative_multi_direct_coupling(child, others)
        return result

    def get_total_relative_coupling(self, a):
        """the sum of direct couplings that a (including its descendants) has, cached"""
        if a in self.total_relative_coupling_cache:
            return self.total_relative_coupling_cache[a]

        a_candidates = self.get_coupling_candidates(a, add_predecessors = False)
        total_coupling = self.get_relative_multi_direct_coupling(a, a_candidates)
        self.total_relative_coupling_cache[a] = total_coupling
        return total_coupling

    def get_normalized_coupling(self, a, b):
        """relative coupling between a and b, normalized by the sum of couplings that a has, in range [0, 1]

        Returns 0 if either node is not part of the underlying graph or if
        there is no coupling between them.
        """
        if a not in self.g.g or b not in self.g.g:
            return 0
        target_coupling = self.get_relative_coupling(a, b)
        if target_coupling == 0:
            return 0
        total_coupling = self.get_total_relative_coupling(a)
        return target_coupling / total_coupling

    def get_normalized_support(self, node):
        """
        on a scale of [0, 1], how much support do we have for coupling values with that node?
        This should depend on how much data (including children) we have for this node, relative to how much data is normal in this graph.
        Support up to the reference value maps linearly onto [0, 0.5], support
        between the reference value and the maximum maps linearly onto [0.5, 1].
        Degenerate distributions (reference value 0, or reference == maximum)
        return 0.0 / 1.0 instead of dividing by zero.
        """
        abs_supp = self.get_absolute_support(node)
        median, maximum = self.get_absolute_support_median_and_max()
        if abs_supp <= median:
            if median == 0:
                return 0.0  # nothing to compare against: no support at all
            return 0.5 * abs_supp / median
        if maximum == median:
            return 1.0  # above a reference value that already is the maximum
        return 0.5 + (0.5 * (abs_supp - median) / (maximum - median))

    def get_absolute_support(self, node):
        """Return the support of node plus the support of all its descendants."""
        result = self.get_absolute_self_support(node)
        for child in self.get_children(node):
            result += self.get_absolute_support(child)
        return result

    def get_absolute_self_support(self, node):
        """Return the support the wrapped graph reports for node itself."""
        return self.g.get_support(node)

    def get_absolute_support_median_and_max(self):
        """Return the cached (reference, maximum) absolute support over all graph nodes.

        Despite the name, the reference value currently is the mean, not the median.
        """
        if self.median_maximum_support_cache is None:
            supports = [self.get_absolute_support(node) for node in self.g.g.nodes]
            mean = np.mean(supports) # TODO mean seems to fit better?
            maximum = max(supports)
            self.median_maximum_support_cache = (mean, maximum)
        return self.median_maximum_support_cache
        

In [None]:
# public so other code can use it
def path_module_distance(a, b):
    """Return the number of hierarchy steps between two "/"-separated paths.

    This is the count of path components of both paths that lie behind their
    common leading components; identical paths have distance 0.
    """
    parts_a = a.split("/")
    parts_b = b.split("/")
    shared = 0
    for left, right in zip(parts_a, parts_b):
        if left != right:
            break
        shared += 1
    # every component that is not shared needs one step up or one step down
    return len(parts_a) + len(parts_b) - 2 * shared


class ModuleDistanceAnalysisGraph:
    """Analysis graph whose coupling is derived only from the path distance of two nodes.

    It has no node set of its own and reports full support for every node.
    """

    def get_node_set(self):
        """Return None, as this view does not hold any nodes itself."""
        return None

    def get_normalized_support(self, node):
        """Return the constant support 1 for every node."""
        return 1

    def propagate_down(self, layers = 1, weight_factor = 0.2):
        """Do nothing; there is no stored coupling data to propagate."""
        return None

    def get_normalized_coupling(self, a, b):
        """Return the coupling of a and b, decaying exponentially with their path distance."""
        # The base needs to be bigger than one.
        # Lower values = stronger coupling across bigger distances.
        # Higher values = faster decay of coupling across module distance.
        decay_base = 1.1
        return math.pow(decay_base, -path_module_distance(a, b))

In [None]:
def get_analysis_graph(r, metric_name):
    """Return the analysis graph for the given metric of repository r.

    The name "module_distance" yields a ModuleDistanceAnalysisGraph; every
    other name is resolved through MetricManager.get and wrapped into an
    AnalysisGraph.
    """
    if metric_name != "module_distance":
        return AnalysisGraph(MetricManager.get(r, metric_name))
    return ModuleDistanceAnalysisGraph()

In [None]:
def pattern_match(coupling_values, pattern, support_values):
    """how good does this node-pair fit to the given pattern? Range: [0, 1]

    coupling_values: one coupling value per view.
    pattern: target value per view; None means "this view does not matter".
        Entries beyond len(coupling_values) are ignored.
    support_values: one support value per view.

    Returns (match_score, support): match_score is 1 minus the mean squared
    error over all views the pattern constrains, support is the minimum
    support over those views (capped at 1). A pattern that constrains no view
    at all matches perfectly (1.0) instead of raising a ZeroDivisionError.
    """
    error_sum = 0
    values = 0
    support = 1
    for i, coupling_val in enumerate(coupling_values):
        target = pattern[i]
        if target is None:
            continue  # this view is irrelevant for the pattern
        error = abs(target - coupling_val)
        error_sum += error * error
        values += 1
        support = min(support, support_values[i])
    if values == 0:
        return 1., support
    match_score = 1. - (error_sum / values)
    return match_score, support


# as seen in:
# https://thelaziestprogrammer.com/python/a-multiprocessing-pool-pickle
# https://thelaziestprogrammer.com/python/multiprocessing-pool-a-global-solution
class StaticStuff:
    """Class-level holder for the inputs that analyze_pair reads as global state.

    analyze_disagreements assigns both attributes before the node pairs are
    mapped, so analyze_pair only needs the pair itself as its argument.
    """
    # the analysis graphs (one per view) that analyze_pair queries
    analysis_graphs = None
    # the target patterns that analyze_pair matches the coupling values against
    target_patterns = None

# Thresholds applied by analyze_pair: a node pair is only recorded for a pattern
# if its match score and its support both reach these values (>= comparison).
MIN_PATTERN_MATCH = 0  # how close the coupling values need to match the pattern to be a result
MIN_SUPPORT = 0.4  # how much relative support a result needs to not be discarded
def analyze_pair(pair): #, analysis_graphs, target_patterns):
    """Evaluate one node pair against all target patterns, in both directions.

    Reads the analysis graphs and target patterns from StaticStuff.

    pair: tuple (a, b) of "/"-separated node paths.

    Returns None if the nodes are identical or one is an ancestor of the other
    (the root "" counts as an ancestor of everything). Otherwise returns one
    list per target pattern; each entry is
    (per-view absolute errors, (a, b, normalized coupling values, support))
    for a direction (a, b) whose match score reaches MIN_PATTERN_MATCH and
    whose support reaches MIN_SUPPORT.
    """
    _a, _b = pair
    # Ignore nodes that are in a parent-child relation. The comparison has to
    # respect "/" boundaries: a plain startswith() would also discard siblings
    # that merely share a name prefix (e.g. "x/foo" and "x/foobar").
    if (not _a or not _b or _a == _b
            or _a.startswith(_b + "/") or _b.startswith(_a + "/")):
        return None
    graphs = StaticStuff.analysis_graphs
    patterns = StaticStuff.target_patterns
    # for each view: how much support do we have for this node pair (minimum of both node support values)
    supports_a = [g.get_normalized_support(_a) for g in graphs]
    supports_b = [g.get_normalized_support(_b) for g in graphs]
    support_values = [min(supp_a, supp_b) for supp_a, supp_b in zip(supports_a, supports_b)]
    result = [[] for _ in patterns]
    for a, b in [(_a, _b), (_b, _a)]:
        normalized_coupling_values = tuple(g.get_normalized_coupling(a, b) for g in graphs)
        for i, pattern in enumerate(patterns):
            # absolute error per constrained view; stored as the sort coordinates of the result
            pattern_match_score_data = tuple(abs(p - v) for p, v in zip(pattern, normalized_coupling_values) if p is not None)
            match_score, support = pattern_match(normalized_coupling_values, pattern, support_values)
            if match_score >= MIN_PATTERN_MATCH and support >= MIN_SUPPORT:
                result[i].append((pattern_match_score_data, (a, b, normalized_coupling_values, support)))
    return result

In [1]:
class BestResultsSet:
    """Collection of results that keeps memory bounded by discarding unimportant entries.

    Every entry is a pair of ([coordinates per dimension], user-data). Lower
    weighted coordinate sums rank better.
    """

    def __init__(self, dimension_count, result_keep_size):
        """Create an empty set for entries with dimension_count coordinates.

        result_keep_size is the number of best entries get_best() returns.
        """
        self.dimension_count = dimension_count
        self.result_keep_size = result_keep_size
        self.data = []  # pair of ([coordinates per dimension], user-data)
        self.total_amount = 0

    def add_all(self, new_data):
        """Append all entries of new_data, count them, and trim if the set grew too large."""
        self.data.extend(new_data)
        self.total_amount += len(new_data)
        self.trim_maybe()

    def get_best(self, dim_weights):
        """Sort the stored data in place by weighted coordinate sum and return the best entries."""
        def weighted_sum(entry):
            coordinates = entry[0]
            return sum(coordinates[index] * weight for index, weight in enumerate(dim_weights))

        self.data.sort(key=weighted_sum)
        return self.data[:self.result_keep_size]

    def trim_maybe(self):
        """Trim the data once it exceeds result_keep_size * 100 * dimension_count entries."""
        size_limit = self.result_keep_size * 100 * self.dimension_count
        if len(self.data) > size_limit:
            self.trim()

    def trim(self):
        """Reduce the data to entries that rank among the best for some sampled weighting."""
        keep_tolerance = 2  # higher = keep more, but better chance at not accidentally removing important stuff
        sampling_accuracy = 20  # higher = more runtime, but more accurately detecting required important data
        survivors = set()
        original_keep_size = self.result_keep_size
        # temporarily widen the window that get_best() returns
        self.result_keep_size = original_keep_size * keep_tolerance
        for weights in generate_one_distributions(self.dimension_count, sampling_accuracy):
            survivors.update(self.get_best(weights))
        self.result_keep_size = original_keep_size
        self.data = list(survivors)

    def export_to_csv(self, name):
        """only works for 2 dimensions!"""
        with open(name + ".csv.txt", "w") as f:
            rows = [str(entry[1][2][0]) + "," + str(entry[1][2][1]) for entry in self.data]
            f.write("x,y\n1,1\n" + "\n".join(rows))

In [None]:
SHOW_RESULTS_SIZE = 50  # number of results kept and displayed per pattern
def analyze_disagreements(repo, views, target_patterns, node_filter_func = None, node_pair_filter_func = None):
    """
    when views are [struct, evo, ling], the pattern [0, 1, None, "comment"] searches for nodes that are
    strongly coupled evolutionary, loosely coupled structurally, and the language does not matter

    repo: repository object; used for get_tree(), type_extension() and url_for().
    views: metric names, each resolved via get_analysis_graph(repo, name).
    target_patterns: one pattern per search, with at least one element per view.
    node_filter_func: optional predicate node -> bool to restrict the nodes.
    node_pair_filter_func: optional predicate (node_a, node_b) -> bool to restrict the pairs.

    Prints statistics and shows an interactive result table per pattern.
    Returns None. Nothing is analyzed if fewer than two views are given or if
    a pattern is shorter than the list of views.
    """
    if len(views) <= 1:
        return
    if not all(len(p) >= len(views) for p in target_patterns):
        print("Patterns need at least one element per graph!")
        return

    analysis_graphs = [get_analysis_graph(repo, g) for g in views]
    #for g in analysis_graphs:
    #    g.propagate_down(2, 0.2)
    analysis_graph_nodes = [g.get_node_set() for g in analysis_graphs]
    # only nodes known to every view that has a node set (None = view without own nodes)
    all_nodes = list(set.intersection(*[nodes for nodes in analysis_graph_nodes if nodes is not None]))
    tree = repo.get_tree()
    all_nodes = [n for n in all_nodes if tree.has_node(n)]
    print("Total node count:", len(all_nodes))
    # look up every node type once instead of once per printed statistic
    node_types = [tree.find_node(path).get_type() for path in all_nodes]
    for label, type_name in [
        ("Methods:", "method"),
        ("constructors:", "constructor"),
        ("fields:", "field"),
        ("classes:", "class"),
        ("interfaces:", "interface"),
        ("enums:", "enum"),
    ]:
        print(label, sum(t == type_name for t in node_types))
    print("without type:", sum(t is None for t in node_types))
    if node_filter_func is not None:
        all_nodes = [node for node in all_nodes if node_filter_func(node)]
    print("all filtered nodes:", len(all_nodes))

    all_node_pairs = list(all_pairs(all_nodes))
    if node_pair_filter_func is not None:
        prev_len = len(all_node_pairs)
        all_node_pairs = [pair for pair in all_node_pairs if node_pair_filter_func(pair[0], pair[1])]
        # avoid a ZeroDivisionError when there are no pairs at all
        percentage = len(all_node_pairs) / prev_len * 100 if prev_len else 0.0
        print("Amount of node pairs to check:", len(all_node_pairs), "of", prev_len, "(" + str(percentage) + "%)")

    print("Going parallel...")
    pattern_results = [
        BestResultsSet(sum(type(x)==int for x in p), SHOW_RESULTS_SIZE)
        for p in target_patterns]
    def handle_results(pattern_results_part):
        # analyze_pair returns None for skipped pairs; tolerate that here in
        # case map_parallel forwards such results (NOTE(review): confirm).
        if pattern_results_part is None:
            return
        for i, part in enumerate(pattern_results_part):
            pattern_results[i].add_all(part)
    StaticStuff.analysis_graphs = analysis_graphs
    StaticStuff.target_patterns = target_patterns
    map_parallel(
        all_node_pairs,
        analyze_pair, # partial(analyze_pair, analysis_graphs=analysis_graphs, target_patterns=target_patterns),
        handle_results,
        "Analyzing edges",
        force_non_parallel=True
    )

    def nice_path(path):
        """Shorten path to the part behind the first ".<type extension>" plus one separator character."""
        ending = "." + repo.type_extension()
        if ending in path:
            return path[path.index(ending) + len(ending) + 1:]
        return path

    def get_raw_i(i):
        """Build a getter for the raw normalized coupling value of view i."""
        def getter(d):
            return d[1][2][i]
        return getter

    def get_i(i):
        """Build a getter for the i-th sort coordinate (absolute error) of a result."""
        def getter(d):
            return d[0][i]
        return getter

    def make_show_data(dim, results):
        """Build the display callback for one pattern.

        results is passed in explicitly: reading the loop variable from the
        enclosing scope would bind late, and a callback invoked after the
        loop has advanced would report the numbers of a different pattern.
        """
        def show_data(multi_sorted_results):
            print(results.total_amount, "raw results,", len(multi_sorted_results), "final results")

            display_data = multi_sorted_results[:SHOW_RESULTS_SIZE]
            display_data = [
                ["{:1.4f}".format(raw_getter(datum)) for name, getter, raw_getter in dim] +
                ["{:1.4f}".format(datum[1][3])] +
                ['<a target="_blank" href="' + repo.url_for(path) + '" title="' + path + '">' + nice_path(path) + '</a>' for path in datum[1][0:2]]
                for datum in display_data]
            header = [name for name, *_ in dim] + ["support", "method 1", "method 2"]
            show_html_table([header] + display_data, len(dim) + 3)
        return show_data

    print("Results:")
    for i, (pattern, results) in enumerate(zip(target_patterns, pattern_results)):
        print("\nPattern " + str(i) + " (" + str(pattern) + "):")

        # one dimension per view that the pattern constrains:
        # (view name, getter for the sort coordinate, getter for the raw coupling value)
        name_and_raw_getters = [
            (name, get_raw_i(view_index))
            for view_index, name in enumerate(views)
            if pattern[view_index] is not None]
        dimensions = [
            (name, get_i(coordinate_index), get_raw)
            for coordinate_index, (name, get_raw) in enumerate(name_and_raw_getters)]
        #results.export_to_csv("full")
        results.trim()
        #results.export_to_csv("trimmed")
        interactive_multi_sort(results.data, dimensions, make_show_data(dimensions, results))

    # TODO trim node set of nodes to those they have in common?

In [None]:
if False:
    # Disabled example invocation. analyze_disagreements expects the repository
    # first and the views as metric names; it resolves the graphs itself.
    r = LocalRepo("ErikBrendel/LudumDare")
    analyze_disagreements(r, ["structural", "evolutionary"], [[0, 1], [None, 1], [1, None]])