In [None]:
import pdb
%matplotlib inline
%run LocalRepo.ipynb
%run repos.ipynb
%run parsing.ipynb
%run metrics.ipynb

In [None]:
class AnalysisGraph:
    """Hierarchy-aware view onto a coupling graph.

    Node names are interpreted as "/"-separated paths: the parent of "a/b/c" is "a/b",
    and the empty string "" is the root of the hierarchy.

    The wrapped object `g` is used through the following interface:
     * `g.g.nodes`, `node in g.g` and iterating `g.g[node]` (the directly coupled nodes)
     * `g.get(a, b)` for the direct coupling between two nodes
     * `g.get_support(node)` for the absolute support of a single node
    """

    def __init__(self, g):
        self.g = g
        # parent path -> set of direct child paths, built by walking from every graph node up to the root
        self.children = {}
        for n in self.g.g.nodes:
            while n is not None:
                p = self.get_parent(n)
                self.children.setdefault(p, set()).add(n)
                n = p
        self.total_relative_coupling_cache = {}  # node -> total relative coupling
        self.median_maximum_support_cache = None  # (median, maximum), computed lazily

    def get_children(self, node):
        """Return the direct children of the given node (empty if it has none)"""
        return self.children.get(node, [])

    def get_parent(self, node):
        """Return the parent path of the given node: "" for top-level nodes, None for the root ("")"""
        # Only the root has no parent. Testing for emptiness (instead of len <= 1) makes sure that
        # top-level nodes with a single-character name are attached to the root as well.
        if not node:
            return None
        return "/".join(node.split("/")[:-1])

    def get_directly_coupled(self, node):
        """Return the nodes the given node has a direct edge to (empty if it is not in the graph)"""
        if node in self.g.g:
            return list(self.g.g[node])
        return []

    def get_siblings(self, node):
        """Return all the other children of this.parent (excluding this)"""
        parent = self.get_parent(node)
        if parent is None:
            return []
        return [c for c in self.get_children(parent) if c != node]

    def get_coupling_candidates(self, node, add_predecessors = False):
        """Return all the nodes with which the given one could have a non-zero relative coupling.
         * := The direct coupling nodes of given+descendants, and (if add_predecessors) all their predecessors"""
        direct_coupling_candidates = []
        for n in self.get_self_and_descendants(node):
            direct_coupling_candidates.extend(self.get_directly_coupled(n))

        result = set()
        result.add("")  # root node, added to stop the predecessor iteration
        for other in direct_coupling_candidates:
            # set.add() always returns None, so membership has to be tested explicitly.
            # Once a node is already present, all of its predecessors have been handled as well.
            while other is not None and other not in result:
                result.add(other)
                if not add_predecessors:
                    break
                other = self.get_parent(other)
        result.discard("")  # root node - removed, since not very interesting
        return result

    def get_self_and_descendants(self, node):
        """return the given node and all its descendants as a list"""
        result = []
        self.get_self_and_descendants_rec(node, result)
        return result

    def get_self_and_descendants_rec(self, node, result):
        """add the given node and all its descendants into the provided list"""
        result.append(node)
        for child in self.get_children(node):
            self.get_self_and_descendants_rec(child, result)

    def get_direct_coupling(self, a, b):
        """direct coupling between a and b"""
        return self.g.get(a, b)

    def get_direct_multi_coupling(self, a, others):
        """sum of direct coupling between a and all b"""
        return sum(self.get_direct_coupling(a, b) for b in others)

    def get_relative_coupling(self, a, b):
        """sum of direct coupling between a+descendants and b+descendants"""
        others = self.get_self_and_descendants(b)
        return self.get_relative_multi_direct_coupling(a, others)

    def get_relative_multi_coupling(self, a, others):
        """sum of direct coupling between a+descendants and others+descendants"""
        direct_others = []
        for other in others:
            self.get_self_and_descendants_rec(other, direct_others)
        return self.get_relative_multi_direct_coupling(a, direct_others)

    def get_relative_multi_direct_coupling(self, a, others):
        """sum of direct coupling between a+descendants and others"""
        if len(others) == 0:
            return 0
        result = self.get_direct_multi_coupling(a, others)
        for child in self.get_children(a):
            result += self.get_relative_multi_direct_coupling(child, others)
        return result

    def get_total_relative_coupling(self, a):
        """the sum of direct couplings that a (including its descendants) has, cached"""
        if a in self.total_relative_coupling_cache:
            return self.total_relative_coupling_cache[a]

        a_candidates = self.get_coupling_candidates(a, add_predecessors = False)
        total_coupling = self.get_relative_multi_direct_coupling(a, a_candidates)
        self.total_relative_coupling_cache[a] = total_coupling
        return total_coupling

    def get_normalized_coupling(self, a, b):
        """relative coupling between a and b, normalized by the sum of couplings that a has, in range [0, 1]"""
        if a not in self.g.g or b not in self.g.g:
            return 0
        target_coupling = self.get_relative_coupling(a, b)
        if target_coupling == 0:
            return 0
        total_coupling = self.get_total_relative_coupling(a)
        if total_coupling == 0:
            # nothing to normalize by - avoid dividing by zero
            return 0
        return target_coupling / total_coupling

    def get_normalized_support(self, node):
        """
        on a scale of [0, 1], how much support do we have for coupling values with that node?
        This should depend on how much data (including children) we have for this node, relative to how much data is normal in this graph.
        It should also be outlier-stable, so that having median-much data maybe results in a support score of 0.5?

        Values up to the median are mapped linearly onto [0, 0.5], values between median and maximum
        onto [0.5, 1]. Anything above the maximum is capped at 1.
        """
        abs_supp = self.get_absolute_support(node)
        median, maximum = self.get_absolute_support_median_and_max()
        if abs_supp <= median:
            if median <= 0:
                # no usable reference data in this graph: report no support instead of dividing by zero
                return 0.0
            return 0.5 * abs_supp / median
        if maximum <= median:
            # the node exceeds the (degenerate) reference range, the upper half cannot be interpolated
            return 1.0
        return min(1.0, 0.5 + (0.5 * (abs_supp - median) / (maximum - median)))

    def get_absolute_support(self, node):
        """absolute support of the node plus the absolute support of all its descendants"""
        result = self.get_absolute_self_support(node)
        for child in self.get_children(node):
            result += self.get_absolute_support(child)
        return result

    def get_absolute_self_support(self, node):
        """absolute support of the node itself, as reported by the wrapped graph"""
        return self.g.get_support(node)

    def get_absolute_support_median_and_max(self):
        """(median, maximum) of the absolute support over all graph nodes, cached; (0, 0) for an empty graph"""
        if self.median_maximum_support_cache is None:
            supports = [self.get_absolute_support(node) for node in self.g.g.nodes]
            if supports:
                self.median_maximum_support_cache = (np.median(supports), max(supports))
            else:
                # max() of an empty list would raise, the median would be nan
                self.median_maximum_support_cache = (0, 0)
        return self.median_maximum_support_cache
        

In [None]:
def analyze_disagreements(repo, coupling_graphs, target_patterns):
    """
    Find node pairs whose normalized coupling values across the given views fit one of the target
    patterns, then print a summary and show a histogram of the match strengths per pattern.

    when views are [struct, evo, ling], the pattern [0, 1, None, "comment"] searches for nodes that are
    strongly coupled evolutionary, loosely coupled structurally, and the language does not matter

    repo: must provide get_tree().has_node(name); nodes unknown to the tree are ignored
    coupling_graphs: one graph per view; only nodes present in all of them are analyzed.
                     With fewer than two graphs there is nothing to compare and the function returns.
    target_patterns: one list per pattern, with at least one entry per graph. Entries are the desired
                     normalized coupling value for that view, or None if that view does not matter.
                     Entries beyond the number of graphs (like a comment) are ignored.
    Returns nothing; results are printed and plotted.
    """
    if len(coupling_graphs) <= 1:
        return
    if not all([len(p) >= len(coupling_graphs) for p in target_patterns]):
        print("Patterns need at least one element per graph!")
        return
    # a pattern without any constraint would lead to a division by zero when averaging its errors
    if not all(any(v is not None for v in p[:len(coupling_graphs)]) for p in target_patterns):
        print("Patterns need at least one value that is not None!")
        return

    MIN_PATTERN_MATCH = 0.66  # how close the coupling values need to match the pattern to be a result
    MIN_SUPPORT = 0.1  # how much relative support a result needs to not be discarded

    def pattern_match(coupling_values, pattern, support_values):
        """how good does this node-pair fit to the given pattern? Range: [0, 1]"""
        error_sum = 0; values = 0; support = 1
        for i, coupling_val in enumerate(coupling_values):
            if pattern[i] is not None:
                error = abs(pattern[i] - coupling_val)
                error_sum += error * error
                values += 1
                # only views that are part of the pattern limit the support
                support = min(support, support_values[i])
        # one minus the mean squared error
        match_score = 1. - (error_sum / values)
        return match_score, support

    def is_ancestor_or_same(ancestor, node):
        """is `node` equal to or located below `ancestor` in the "/"-separated hierarchy?"""
        # Comparing whole path segments: a plain startswith() would also treat
        # "src/foo" as an ancestor of its sibling "src/foobar".
        return ancestor == "" or node == ancestor or node.startswith(ancestor + "/")

    analysis_graphs = [AnalysisGraph(g) for g in coupling_graphs]
    all_nodes = list(set.intersection(*[set(g.g.nodes) for g in coupling_graphs]))
    all_nodes = [n for n in all_nodes if repo.get_tree().has_node(n)]
    print("Total node count:", len(all_nodes))

    pattern_results = [[] for p in target_patterns]
    for _a, _b in log_progress(list(all_pairs(all_nodes)), desc="Analyzing edges", smoothing=0.1):
        if is_ancestor_or_same(_b, _a) or is_ancestor_or_same(_a, _b):  # ignore nodes that are in a parent-child relation
            continue
        # for each view: how much support do we have for this node pair (minimum of both node support values)
        supports_a = [g.get_normalized_support(_a) for g in analysis_graphs]
        supports_b = [g.get_normalized_support(_b) for g in analysis_graphs]
        support_values = [min(supp_a, supp_b) for supp_a, supp_b in zip(supports_a, supports_b)]
        # normalized coupling is not symmetric, so both directions are checked
        for a, b in [(_a, _b), (_b, _a)]:
            normalized_coupling_values = [g.get_normalized_coupling(a, b) for g in analysis_graphs]
            for i, pattern in enumerate(target_patterns):
                match_score, support = pattern_match(normalized_coupling_values, pattern, support_values)
                if match_score >= MIN_PATTERN_MATCH and support >= MIN_SUPPORT:
                    pattern_results[i].append((a, b, match_score * support))

    print("Results:")
    total_edge_count = len(all_nodes) * len(all_nodes)
    for i, (pattern, results) in enumerate(zip(target_patterns, pattern_results)):
        print("\nPattern " + str(i) + " (" + str(pattern) + "):")
        sorted_results = sorted(results, key = lambda e: e[2], reverse = True)
        # without any nodes there are no edges to relate to
        percentage = round(100*len(sorted_results)/total_edge_count, 2) if total_edge_count > 0 else 0.0
        print("  Amount of disagreements:", len(sorted_results), "which is", str(percentage)+"%", "of all edges")

        values = [v for a, b, v in sorted_results]
        if values:  # an empty result list has neither a distribution nor a mean to show
            plt.hist(values, "auto", facecolor='g', alpha=0.75)
            plt.axvline(np.array(values).mean(), color='k', linestyle='dashed', linewidth=1)
            plt.xlabel('Pattern match')
            plt.ylabel('Amount of results')
            plt.title('Histogram of pattern matching strengths')
            plt.grid(True)
            plt.show()

        # NOTE(review): v = match_score * support never exceeds 1, so this filter currently keeps
        # every result - confirm which threshold was intended for "strong" matches.
        strong_matches = [(a, b, v) for a, b, v in sorted_results if v <= 1]
        print("  Strong nontrivial disagreements:", len(strong_matches))
        for a, b, v in  strong_matches[:10]:
            print("  ", a, "<>", b, " - ", v)

    # TODO trim node set of nodes to those they have in common?

In [None]:
if False:
    # Disabled example invocation, change the condition to run it.
    # analyze_disagreements takes the repo as its first argument, followed by the graphs and the patterns.
    r = LocalRepo("ErikBrendel/LudumDare")
    analyze_disagreements(r, [MetricManager.get(r, view) for view in ["structural", "evolutionary"]], [[0, 1], [None, 1], [1, None]])