In [None]:
import pdb
%matplotlib inline
%run LocalRepo.ipynb
%run repos.ipynb
%run parsing.ipynb
%run metrics.ipynb

In [None]:
class AnalysisGraph:
    """Hierarchy-aware view on a coupling graph.

    Node names are treated as "/"-separated paths: the parent of "a/b/c" is
    "a/b", the parent of a name without a slash is the root "" and names of
    length <= 1 (including the root) have no parent (None).

    The wrapped object ``g`` is expected to expose the underlying graph as
    ``g.g`` (with an iterable ``nodes``, membership test and neighbour lookup
    via ``g.g[node]``) and the coupling weight of an edge as ``g.get(a, b)``.
    """

    def __init__(self, g):
        """Store ``g`` and index the parent -> children relation of its nodes."""
        self.g = g
        # Maps a parent name to the set of its direct children. Names without
        # a parent are registered under the key None.
        self.children = {}
        for n in self.g.g.nodes:
            # Walk up to the top so that intermediate (directory-like) names
            # which are not graph nodes themselves are indexed as well.
            while n is not None:
                p = self.get_parent(n)
                self.children.setdefault(p, set()).add(n)
                n = p
        # node -> cached result of get_total_relative_coupling(node)
        self.total_relative_coupling_cache = {}

    def get_children(self, node):
        """Return the direct children of ``node`` (empty if it has none)."""
        return self.children.get(node, [])

    def get_parent(self, node):
        """Return the parent name of ``node``, or None if it has no parent."""
        if len(node) <= 1:
            return None
        # Everything before the last "/" ("" if there is no slash at all).
        return node.rpartition("/")[0]

    def get_directly_coupled(self, node):
        """Return the graph neighbours of ``node`` ([] if it is not a graph node)."""
        if node in self.g.g:
            return list(self.g.g[node])
        return []

    def get_siblings(self, node):
        """Return all the other children of the parent of ``node`` (excluding ``node``)."""
        parent = self.get_parent(node)
        if parent is None:
            return []
        return [c for c in self.get_children(parent) if c != node]

    def get_coupling_candidates(self, node, add_predecessors = False):
        """Return all the nodes with which the given one could have a non-zero relative coupling.

        These are the directly coupled nodes of ``node`` and of all its
        descendants. With ``add_predecessors`` the ancestors of those nodes
        are included as well; the root "" is never part of the result.
        """
        result = set()
        for n in self.get_self_and_descendants(node):
            for other in self.get_directly_coupled(n):
                # Stop at the root "" / at None (no parent), or as soon as a
                # name is already known: its ancestors were added before.
                # (set.add() returns None, so it cannot serve as loop condition.)
                while other and other not in result:
                    result.add(other)
                    if not add_predecessors:
                        break
                    other = self.get_parent(other)
        return result

    def get_self_and_descendants(self, node):
        """Return the given node and all its descendants as a list."""
        result = []
        self.get_self_and_descendants_rec(node, result)
        return result

    def get_self_and_descendants_rec(self, node, result):
        """Append the given node and all its descendants to the provided list."""
        result.append(node)
        for child in self.get_children(node):
            self.get_self_and_descendants_rec(child, result)

    def get_direct_coupling(self, a, b):
        """Direct coupling between a and b, as reported by the wrapped graph."""
        return self.g.get(a, b)

    def get_direct_multi_coupling(self, a, others):
        """Sum of the direct coupling between a and every node in others."""
        return sum(self.get_direct_coupling(a, b) for b in others)

    def get_relative_coupling(self, a, b):
        """Sum of direct coupling between a+descendants and b+descendants."""
        others = self.get_self_and_descendants(b)
        return self.get_relative_multi_direct_coupling(a, others)

    def get_relative_multi_coupling(self, a, others):
        """Sum of direct coupling between a+descendants and others+descendants."""
        direct_others = []
        for other in others:
            self.get_self_and_descendants_rec(other, direct_others)
        return self.get_relative_multi_direct_coupling(a, direct_others)

    def get_relative_multi_direct_coupling(self, a, others):
        """Sum of direct coupling between a+descendants and others (without their descendants)."""
        if not others:
            return 0
        result = self.get_direct_multi_coupling(a, others)
        for child in self.get_children(a):
            result += self.get_relative_multi_direct_coupling(child, others)
        return result

    def get_total_relative_coupling(self, a):
        """Sum of the direct couplings of a+descendants to all their coupled nodes, cached per node."""
        if a in self.total_relative_coupling_cache:
            return self.total_relative_coupling_cache[a]

        a_candidates = self.get_coupling_candidates(a, add_predecessors = False)
        total_coupling = self.get_relative_multi_direct_coupling(a, a_candidates)
        self.total_relative_coupling_cache[a] = total_coupling
        return total_coupling

    def get_normalized_coupling(self, a, b):
        """Relative coupling between a and b divided by the total coupling of a.

        Returns 0 if either name is not a node of the graph, if there is no
        coupling between them, or if a has no total coupling to normalize by.
        The value is intended to lie in [0, 1].
        """
        if a not in self.g.g or b not in self.g.g:
            return 0
        target_coupling = self.get_relative_coupling(a, b)
        if target_coupling == 0:
            return 0
        total_coupling = self.get_total_relative_coupling(a)
        if total_coupling == 0:
            # Nothing to normalize by; avoid a ZeroDivisionError.
            return 0
        return target_coupling / total_coupling
        

In [None]:
def analyze_disagreements(coupling_graphs):
    """Print and plot how strongly the given coupling graphs disagree.

    For every ordered pair of nodes (taken from the union of all graphs'
    nodes) that are not in an ancestor/descendant relation, the normalized
    coupling is computed in each graph. The disagreement of a pair is the
    difference between the largest and the smallest of these values.

    Prints summary statistics, shows a histogram of the disagreement values
    and lists up to ten pairs with a disagreement strictly between 0.5 and 1.
    Does nothing when fewer than two graphs are given. Returns None.
    """
    if len(coupling_graphs) <= 1:
        return
    MIN_DISAGREEMENT = 0.000001  # anything smaller than that will be discarded

    def is_same_or_below(node, ancestor):
        """True if node equals ancestor or lies below it in the "/"-separated hierarchy."""
        # Compare on the path-separator boundary: a plain startswith() would
        # also treat "src/foobar" as a child of "src/foo".
        return ancestor == "" or node == ancestor or node.startswith(ancestor + "/")

    analysis_graphs = [AnalysisGraph(g) for g in coupling_graphs]
    all_nodes = list(set.union(*[set(g.g.nodes) for g in coupling_graphs]))
    print("Total node count:", len(all_nodes))
    all_differences = []
    for _a, _b in log_progress(list(all_pairs(all_nodes)), desc="Analyzing edges", smoothing=0.1):
        if is_same_or_below(_a, _b) or is_same_or_below(_b, _a):  # ignore nodes that are in a parent-child relation
            continue
        # Normalized coupling is not symmetric, so look at both directions.
        for a, b in [(_a, _b), (_b, _a)]:
            normalized_coupling_values = [g.get_normalized_coupling(a, b) for g in analysis_graphs]
            disagreement = max(normalized_coupling_values) - min(normalized_coupling_values)  # TODO this is the most basic metric - find something better?
            if disagreement >= MIN_DISAGREEMENT:
                all_differences.append((a, b, disagreement))
    # Strongest disagreements first; ties keep their discovery order.
    all_differences = sorted(all_differences, key = lambda e: -e[2])
    possible_edge_count = len(all_nodes) * len(all_nodes)
    # Guard against graphs without any nodes.
    percentage = 100 * len(all_differences) / possible_edge_count if possible_edge_count else 0
    print("Amount of disagreements:", len(all_differences), "which is", str(round(percentage, 2))+"%", "of all edges")

    values = [v for a, b, v in all_differences]
    plt.hist(values, "auto", facecolor='g', alpha=0.75)
    if values:
        # Mark the mean disagreement; undefined when there are no values.
        plt.axvline(np.array(values).mean(), color='k', linestyle='dashed', linewidth=1)
    # plt.xscale("log")
    # plt.yscale("log")
    plt.xlabel('Disagreement')
    plt.ylabel('Amount')
    plt.title('Histogram of disagreement strengths')
    plt.grid(True)
    plt.show()

    strong_disagreements = [(a, b, v) for a, b, v in all_differences if 0.5 < v < 1]
    print("Strong nontrivial disagreements:", len(strong_disagreements))
    for a, b, v in strong_disagreements[:10]:
        print(a, "<>", b, " - ", v)

    # TODO trim node set of nodes to those they have in common?