## two step anonymization

Step 1: construct a new degree sequence dhat that is k-anonymous and minimizes the L1 distance ||dhat - d||_1, i.e. the total degree increase and hence the number of edge additions.
Step 2: construct a graph that realizes that degree sequence.

Use the recursive (dynamic-programming) method from pseudoNodeAnon for the clustering step.
Alternatively, use the greedy method, which is implemented somewhere else (PBCN approach?).

constructGraph is also implemented somewhere

In [2]:
import numpy as np
import networkx as nx

import random   ## for neighborhoodRandom

In [3]:
## Test fixtures: the DD199 graph read from an edge-list file, plus a small synthetic graph.
import pandas as pd    ## NOTE(review): pd is not referenced in the visible code -- confirm it is still needed
DD199 = nx.read_edgelist("Data/DD199/DD199.edges", nodetype=int)    ## node labels are parsed as int

BAG = nx.barabasi_albert_graph(20,2)    ## Barabasi-Albert graph with 20 nodes, 2 edges attached per new node

In [4]:
def Greedy():
    '''Placeholder for the greedy, degree-based partitioning of the vertex set.

    Not implemented: the body holds only this docstring and the outline
    below, so calling the function returns None.
    '''
    ## Intended outline:
    ##  - open a group with the first k highest-degree nodes and assign it the maximum degree
    ##  - for each following node, decide whether to merge it into the current group or open a new group

In [5]:
def _split_candidates(k, x):
    '''First positions i of the rightmost group [i, x] that are allowed when x >= 2k.

    The group [i, x] must hold between k and 2k-1 positions and the prefix
    [1, i-1] must keep at least k positions, i.e. max(k+1, x-2k+2) <= i <= x-k+1.
    Assumes k >= 1.
    '''
    return range(max(k + 1, x - 2*k + 2), x - k + 2)

def Start_unspec(degSeq,start,cost, k, x):
    '''Return the first position of the rightmost group in the partition of positions 1..x.

    The rightmost group is [Start(x), x]; the next one to the left is
    [Start(Start(x)-1), Start(x)-1], and so on.

    degSeq      : 1-indexed list, degSeq[0] is padding and
                  degSeq[i] = (node, degree) with degrees in non-increasing order
    start, cost : memo lists indexed by position (None = not computed yet), filled in place
    k           : minimum group size (k >= 1)
    x           : position in degSeq
    '''
    if start[x] is None:
        if x < 2*k:
            start[x] = 1    ## fewer than 2k positions cannot be split into two groups of size >= k
        else:
            start[x] = Pos_Split_unspec(degSeq,start,cost, k, x)
    return start[x]

def Pos_Split_unspec(degSeq,start,cost, k, x):
    '''argmin over i in [max(k+1, x-2k+2), x-k+1]  of  max(Cost(1,i-1), delta(i,x))

    Ties are resolved towards the largest i (the shortest rightmost group).
    NOTE: condition to enter is x >= 2k
    '''
    def split_cost(i):
        return max(Cost_unspecified(degSeq,start,cost, k, 1, i-1), delta_unspecified(degSeq, i, x))
    ## min() keeps the first minimal element, so scanning from the largest i
    ## downwards returns the maximal position at which the minimum is reached
    return min(reversed(_split_candidates(k, x)), key=split_cost)

def delta_unspecified(degSeq, x, y):
    '''Return the cost of putting positions x..y (inclusive, x <= y) into one group:
    sum over l in x..y of (d_x - d_l), i.e. every degree in the group is raised to d_x.

    x and y are positions in the degree sequence
    degSeq = [padding, (node,maxdeg),...,(node,mindeg)]
    '''
    ## the group holds y-x+1 positions, all of which are raised to degSeq[x][1]
    return degSeq[x][1]*(y-x+1) - sum(entry[1] for entry in degSeq[x:y+1])

def Cost_unspecified(degSeq,start,cost, k, _, x):
    '''Return the memoised cost of the best partition of positions 1..x:
    delta(1,x) if x < 2k, Cost_Split if x >= 2k.

    The fifth parameter is ignored (the prefix always starts at position 1).
    NOTE(review): groups are combined with max, so this minimises the largest
    single-group cost, not the L1 total -- confirm that this is the intended objective.
    '''
    if cost[x] is None:
        ## fill the longer prefixes left to right first, so Cost_Split_unspec only
        ## reads memoised values and the recursion depth does not grow with x
        for pos in range(2*k, x):
            if cost[pos] is None:
                cost[pos] = Cost_Split_unspec(degSeq,start,cost, k, pos)
        if x < 2*k:
            cost[x] = delta_unspecified(degSeq, 1, x)
        else:
            cost[x] = Cost_Split_unspec(degSeq,start,cost, k, x)
    return cost[x]

def Cost_Split_unspec(degSeq,start,cost, k, x):
    '''minimum over i in [max(k+1, x-2k+2), x-k+1]  of  max(Cost(1,i-1), delta(i,x))
    NOTE: condition for entering is x >= 2k
    '''
    return min(max(Cost_unspecified(degSeq,start,cost, k, 1, i-1), delta_unspecified(degSeq, i, x))
               for i in _split_candidates(k, x))


In [6]:
def degreeAnon(d, k:int):
    '''Take a degree sequence and return k-anonymous target degrees for its nodes.

    d = [(nodeID, deg), ...] sorted so that the degrees are non-increasing.
    Returns a dictionary nodeID: targetDegree, where every node receives the
    largest degree of the group it was placed in (degrees are only ever raised).

    If the sequence has fewer than 2k entries it cannot be split into two
    groups of size >= k, so all nodes form one group. An empty sequence
    gives an empty dictionary.
    '''
    if not d:
        return {}

    if len(d) < 2*k:
        top = d[0][1]    ## d is sorted in decreasing order, so the first entry has the maximum degree
        return {entry[0]: top for entry in d}

    n = len(d)
    padded = [0] + list(d)    ## padding element so that positions are 1-indexed
    DA = [None] * (n + 1)     ## memoised costs per position
    start = [None] * (n + 1)  ## memoised group starts per position

    d_hat = {}
    i = n
    ## walk the groups from right to left; i > 0 so that position 1 is kept
    ## even when the leftmost group holds a single node
    while i > 0:
        starti = Start_unspec(padded,start,DA,k,i)
        target = padded[starti][1]    ## largest degree in the group [starti, i]
        for entry in padded[starti:i+1]:
            d_hat[entry[0]] = target
        i = starti - 1

    return d_hat
    

In [None]:
## define the recursive procedure for determining the optimal node partition
## implemented similar as in pseudoNode Anon
## d is sorted largest deg to smallest deg, so d[1] >= d[2] etc
def get_I(d, i, j):
    '''Cost of placing positions i..j (inclusive, i <= j) in one anonymity group.

    d is a 1-indexed list of degrees (d[0] is padding) in non-increasing order.
    Returns sum over l in i..j of (d[i] - d[l]).
    '''
    ## the group holds j-i+1 positions, each raised to d[i]
    return d[i]*(j-i+1) - sum(d[i:j+1])

def get_DA(DA, d,_,i,k):
    '''Memoised degree-anonymisation cost DA(i) of the prefix d[1..i].

    DA(i) = I(1, i)                                            if i < 2k
    DA(i) = min over max(k, i-2k+1) <= t <= i-k of (DA(t) + I(t+1, i))   otherwise

    DA is the memo list (None = not computed yet) and is filled in place.
    The third parameter is ignored (the prefix always starts at position 1).
    '''
    if DA[i] is None:
        if i < 2*k:
            DA[i] = get_I(d,1,i)
        else: ## i >= 2k
            ## range end is i-k+1 so that t = i-k is included
            DA[i] = min(get_DA(DA,d,1,t,k) + get_I(d,t+1,i) for t in range(max(k, i - 2*k +1), i-k+1))
    return DA[i]

In [61]:
def constructGraph(d):
    '''Constructs a graph with nodes having degree sequence d = {node:degree}
    Returns boolean realizability and the constructed Graph i.e. bool, nx.Graph/None

    Repeatedly picks a random node v with open degree and connects it to the
    d(v) other nodes with the largest open degree. The attempt fails with
    (False, None) when the degree sum is odd, a degree is negative, or v cannot
    find enough partners. Because v is picked at random, a failure only means
    that this attempt did not succeed.
    The input dictionary is not modified.
    '''
    remaining = dict(d)    ## work on a copy so the caller's target degrees stay intact

    if sum(remaining.values()) % 2 == 1:  ## if sum of degrees is odd
        return False, None

    edges = []
    while True:
        if any(deg < 0 for deg in remaining.values()):
            return False, None
        open_nodes = [n for n, deg in remaining.items() if deg > 0]
        if not open_nodes:    ## no negative and no positive degree left: all are 0
            break
        v = random.choice(open_nodes)    ## random node with d(v) > 0
        need = remaining[v]
        partners = sorted(
            ((n, deg) for n, deg in remaining.items() if n != v and deg > 0),
            key=lambda item: item[1], reverse=True)[:need]
        if len(partners) < need:
            ## v cannot reach its target degree, so the result would not realize d
            return False, None
        remaining[v] = 0
        for w, _ in partners:
            edges.append((v, w))
            remaining[w] -= 1

    ## the graph is only built once the whole sequence has been satisfied
    G = nx.Graph()
    G.add_nodes_from(d.keys())  ## get the node set
    G.add_edges_from(edges)
    return True, G

In [7]:
def priority(d, Ggt: "nx.Graph"):
    '''
    Allows the construction of deg anon graphs with similar high edge intersection directly
    without using greedy_swap.
    Similar to constructGraph, but makes two passes over sorted deg seq.
    First pass, considers only nodes such that (v,vprime) in E.
    If not enough, does second pass such that (v,vprime) not in E

    d   : {node: targetDegree}; the dictionary is not modified
    Ggt : original graph; only Ggt.neighbors(v) is used
    Returns (True, nx.Graph) on success and (False, None) when the degree sum
    is odd, a degree is negative, or a node cannot find enough partners.
    Because v is picked at random, a failure only means that this attempt did
    not succeed.
    '''
    remaining = dict(d)    ## work on a copy so the caller's target degrees stay intact

    if sum(remaining.values()) % 2 == 1:  ## if sum of degrees is odd
        return False, None

    def by_degree(item):
        return item[1]

    edges = []
    while True:
        if any(deg < 0 for deg in remaining.values()):
            return False, None
        open_nodes = [n for n, deg in remaining.items() if deg > 0]
        if not open_nodes:    ## no negative and no positive degree left: all are 0
            break
        v = random.choice(open_nodes)    ## random node with d(v) > 0
        need = remaining[v]

        ## neighbours of v in the original graph; v itself is skipped so that a
        ## self-loop in Ggt cannot make v its own partner
        old_neighbours = [n for n in Ggt.neighbors(v) if n != v]
        ## first pass: vprime s.t. (v,vprime) in E
        partners = sorted(((n, remaining[n]) for n in old_neighbours if remaining[n] > 0),
                          key=by_degree, reverse=True)
        if len(partners) < need:
            ## second pass: vprime s.t. (v,vprime) not in E; set built once for O(1) membership tests
            neighbour_set = set(old_neighbours)
            partners += sorted(((n, deg) for n, deg in remaining.items()
                                if deg > 0 and n not in neighbour_set and n != v),
                               key=by_degree, reverse=True)
        partners = partners[:need]
        if len(partners) < need:
            ## v cannot reach its target degree, so the result would not realize d
            return False, None
        remaining[v] = 0
        for w, _ in partners:
            edges.append((v, w))
            remaining[w] -= 1

    ## the graph is only built once the whole sequence has been satisfied
    G = nx.Graph()
    G.add_nodes_from(d.keys())  ## get the node set
    G.add_edges_from(edges)
    return True, G

In [14]:
def probing(G:nx.Graph, k = 10):
    '''Anonymises G: returns a graph built by priority() from the target
    degrees that degreeAnon() computes for G's degree sequence.

    Whenever the target degrees cannot be realized, the degree of the last
    (lowest-degree) entry is raised by a random amount and the procedure is
    repeated.
    '''
    node_count = G.number_of_nodes()
    by_degree = lambda pair: pair[1]
    ## d = [(node, degree), ... ,] in decreasing degree order
    d = sorted(dict(G.degree()).items(), key=by_degree, reverse=True)

    while True:
        dhat = degreeAnon(d, k)    ## dict node:targetDeg
        realizable, Ghat = priority(dhat, G)
        if realizable:
            return Ghat
        ## uniform noise on the lowest-degree node, one node per iteration
        noise = random.choice(range(1, 5))    ## paper does not specify how much to increase
        low_node, low_deg = d[-1]
        d[-1] = (low_node, min(low_deg + noise, node_count - 1))    ## tuples are immutable, so replace the entry
        ## restore decreasing order before repartitioning
        d.sort(key=by_degree, reverse=True)

In [111]:
##BAG = nx.barabasi_albert_graph(10000,100)
## seems to work fine, about 0.2 s for n=1k h=10
## 10s for n=10k h=10, 11s for n=10k h=100