### Helper functions

In [1]:
import util
import math
from scipy import stats
from scipy.special import comb
import numpy as np

def edgedp_compute_global_sens(G, Gstat, query_type, k):
    """Return the global edge-DP sensitivity used for a subgraph-counting query.

    Only ``Gstat.num_nodes`` is read; ``G`` is accepted so that the signature
    matches the other sensitivity helpers but is not used here.

    Args:
        G: graph object (unused).
        Gstat: statistics object exposing ``num_nodes``.
        query_type: one of "triangle", "kstar", "kclique", "ktriangle".
        k: size parameter of the query (ignored for "triangle").

    Returns:
        The sensitivity bound for the query. For an unrecognised
        ``query_type`` a message is printed and -1.0 is returned.
    """
    n = Gstat.num_nodes

    if query_type == "triangle":
        return n - 2

    if query_type == "kstar":
        return 2 * comb(n - 2, k - 1)

    if query_type == "kclique":
        return comb(n - 2, k - 2)

    if query_type == "ktriangle":
        first_term = comb(n - 2, k)
        second_term = 2 * (n - 2) * comb(n - 3, k - 1)
        return first_term + second_term

    print(query_type, "is unspecified")
    return -1.0


In [2]:
def edgedp_compute_local_sens(G, Gstat, query_type, k):
    """Return the local edge-DP sensitivity of a subgraph-counting query on G.

    Nodes are assumed to be labelled 0 .. ``Gstat.num_nodes - 1`` (the loops
    below index ``G.degree`` and the ``Gstat`` helpers with those integers).

    Args:
        G: graph exposing ``has_edge(u, v)`` and ``degree[u]``.
        Gstat: statistics object exposing ``num_nodes``, ``max_num_common``,
            ``get_common_neighbors`` and ``get_num_common_neighbors``.
        query_type: "triangle", "kstar", "kclique" or "ktriangle".
        k: size parameter of the query.

    Returns:
        The local sensitivity. For an unrecognised ``query_type`` a message is
        printed and 0.0 is returned.
    """
    n = Gstat.num_nodes

    if query_type == "triangle":
        return Gstat.max_num_common

    if query_type == "kstar":
        # best_low[h]: largest "smaller adjusted degree" seen among node pairs
        # whose "larger adjusted degree" is h (-1 means no such pair).
        best_low = [-1] * n
        for u in range(n):
            for v in range(u + 1, n):
                edge = int(G.has_edge(u, v))
                high = max(G.degree[u], G.degree[v]) - edge
                low = min(G.degree[u], G.degree[v]) - edge
                if low > best_low[high]:
                    best_low[high] = low

        # Walk from the largest high-degree down, keeping only entries whose
        # low-degree strictly improves on the last one kept.
        frontier = []
        for high in range(n - 1, -1, -1):
            low = best_low[high]
            if low < 0:
                continue
            if not frontier or frontier[-1][1] < low:
                frontier.append((high, low))

        ls = 0.0
        for high, low in frontier:
            ls = max(ls, comb(high, k - 1) + comb(low, k - 1))
        return ls

    if query_type == "kclique":
        ls = 0.0
        for u in range(n):
            for v in range(u + 1, n):
                shared = Gstat.get_common_neighbors(u, v)
                ls = max(ls, util.count_clique(G, shared, k - 2))
        return ls

    if query_type == "ktriangle":
        ls = 0.0
        for u in range(n):
            for v in range(u + 1, n):
                shared = Gstat.get_common_neighbors(u, v)  # common neighbours of u and v
                edge = int(G.has_edge(u, v))  # 1 if edge uv exists, 0 otherwise

                pair_ls = comb(len(shared), k)  # k-triangles sharing edge uv
                for w in shared:  # w is adjacent to both u and v
                    a_uw = Gstat.get_num_common_neighbors(min(u, w), max(u, w))  # edge uw
                    a_vw = Gstat.get_num_common_neighbors(min(w, v), max(w, v))  # edge vw
                    pair_ls = pair_ls + comb(a_uw - edge, k - 1) + comb(a_vw - edge, k - 1)
                ls = max(ls, pair_ls)
        return ls

    print(query_type, "is unspecified")
    return 0.0



### Helpers for computing the ladder function per graph query

In [3]:
# The following ladder functions use the local sensitivity at distance t.

In [3]:
def lsd_triangle(G, Gstat, query_type, k):
    """Build the ladder (local sensitivity at distance t = 0, 1, ...) for triangles.

    For every node pair (u, v) with u < v the code reads
    ``Gstat.common_neighbors['u,v']`` and ``Gstat.connection_list``; nodes are
    therefore assumed to be labelled 0 .. ``Gstat.num_nodes - 1``.

    Returns:
        A list whose entry t is the ladder value at distance t. The list ends
        with the global sensitivity as soon as the ladder value reaches it.
    """
    n = Gstat.num_nodes

    # most_exclusive[a]: among pairs with exactly `a` common neighbours, the
    # largest number of nodes adjacent to exactly one endpoint (-1 = no pair).
    most_exclusive = [-1] * n
    for u in range(n):
        for v in range(u + 1, n):
            a_uv = len(Gstat.common_neighbors['{},{}'.format(u, v)])
            b_uv = (len(Gstat.connection_list[u]) + len(Gstat.connection_list[v])
                    - 2 * a_uv - 2 * int(G.has_edge(u, v)))
            most_exclusive[a_uv] = max(most_exclusive[a_uv], b_uv)

    # Keep only (a, b) pairs that can still dominate: scanning a downwards,
    # an entry is kept when 2a + b strictly exceeds that of the last kept one.
    candidates = []
    for a in reversed(range(n)):
        b = most_exclusive[a]
        if b < 0:
            continue
        if not candidates or (2 * a + b > 2 * candidates[-1][0] + candidates[-1][1]):
            candidates.append((a, b))

    gs = edgedp_compute_global_sens(G, Gstat, query_type, k)

    ladder = []
    t = 0
    while True:
        level = max([0] + [a + (t + min(t, b)) / 2 for a, b in candidates])
        t += 1
        if not level < gs:
            # Converged: cap at the global sensitivity and stop.
            ladder.append(gs)
            return ladder
        ladder.append(level)

In [9]:
def lsd_kstar(G,Gstat,query_type,k):
    """Build the ladder (local sensitivity at distance t = 0, 1, ...) for k-stars.

    Nodes are assumed to be labelled 0 .. ``Gstat.num_nodes - 1`` because
    ``G.degree`` and ``G.has_edge`` are indexed with those integers.

    Returns:
        A list of ladder values; the final entry is the global sensitivity,
        appended as soon as the ladder value is no longer below it.
    """
    n = Gstat.num_nodes

    # best_low[h]: largest "smaller adjusted degree" among pairs whose
    # "larger adjusted degree" is h (-1 means no such pair).
    best_low = [-1] * n
    for u in range(n):
        for v in range(u + 1, n):
            edge = int(G.has_edge(u, v))
            high = max(G.degree[u], G.degree[v]) - edge
            low = min(G.degree[u], G.degree[v]) - edge
            if low > best_low[high]:
                best_low[high] = low

    # Mutable [high, low] pairs; they are advanced in place on every step.
    frontier = []
    for high in range(n - 1, -1, -1):
        low = best_low[high]
        if low < 0:
            continue
        if not frontier or frontier[-1][1] < low:
            frontier.append([high, low])

    gs = edgedp_compute_global_sens(G, Gstat, query_type, k)
    cap = n - 2

    ladder = []
    while True:
        level = 0
        for pair in frontier:
            level = max(level, comb(pair[0], k - 1) + comb(pair[1], k - 1))

            # One step further away: grow the larger degree first, then the
            # smaller one, each capped at n - 2.
            if pair[0] < cap:
                pair[0] += 1
            elif pair[1] < cap:
                pair[1] += 1

        if not level < gs:
            # Converged: cap at the global sensitivity and stop.
            ladder.append(gs)
            return ladder
        ladder.append(level)
          

In [10]:
def lsd_kclique(G,Gstat,query_type,k):
    """Build the ladder for k-clique counting.

    Starts from the local sensitivity and, at step t, adds
    ``comb(Gstat.max_num_common + t, k - 3)`` until the value is no longer
    below the global sensitivity.

    Returns:
        A list of ladder values whose last entry is the global sensitivity.
    """
    gs = edgedp_compute_global_sens(G, Gstat, query_type, k)
    level = edgedp_compute_local_sens(G, Gstat, query_type, k)

    ladder = []
    step = 0
    while level < gs:  # loop until the ladder reaches gs
        ladder.append(level)
        level = level + comb(Gstat.max_num_common + step, k - 3)
        step += 1

    ladder.append(gs)
    return ladder
        

In [11]:
def lsd_ktriangle(G,Gstat,query_type,k):
    """Build the ladder for k-triangle counting.

    Starts from the local sensitivity and, at step t with
    ``m = Gstat.max_num_common + t``, adds
    ``3 * comb(m, k - 1) + m * comb(m, k - 2)`` until the value is no longer
    below the global sensitivity.

    Returns:
        A list of ladder values whose last entry is the global sensitivity.
    """
    gs = edgedp_compute_global_sens(G, Gstat, query_type, k)
    level = edgedp_compute_local_sens(G, Gstat, query_type, k)
    base_common = Gstat.max_num_common

    ladder = []
    step = 0
    while level < gs:  # loop until the ladder reaches gs
        ladder.append(level)
        m = base_common + step
        level = level + 3 * comb(m, k - 1) + m * comb(m, k - 2)
        step += 1

    ladder.append(gs)
    return ladder


In [12]:
def edgedp_ladder_function(G, Gstat, query_type, k):
    """Dispatch to the ladder builder that matches ``query_type``.

    Returns:
        The list produced by ``lsd_triangle``, ``lsd_kstar``, ``lsd_kclique``
        or ``lsd_ktriangle``. For an unrecognised ``query_type`` a message is
        printed and an empty list is returned.
    """
    if query_type == "triangle":
        return lsd_triangle(G, Gstat, query_type, k)
    if query_type == "kstar":
        return lsd_kstar(G, Gstat, query_type, k)
    if query_type == "kclique":
        return lsd_kclique(G, Gstat, query_type, k)
    if query_type == "ktriangle":
        return lsd_ktriangle(G, Gstat, query_type, k)

    print(query_type, "is unspecified")
    return []

### Ladder Mechanism

In [13]:
def edgedp_ladder_mechanism_noise_sample(G, Gstat, query_type, k, epsilon, ladders, true_count):
    """Draw one noisy answer from the ladder mechanism.

    The output space is split into a centre ``{true_count}`` and rungs on
    both sides of it. Rung t (1 <= t <= M) covers noise magnitudes in
    ``(sum(ladders[:t-1]), sum(ladders[:t])]`` and has weight
    ``2 * ladders[t-1] * exp(-epsilon/2 * t)``. Everything beyond rung M is
    folded into one tail entry whose rungs all have width ``ladders[-1]``.

    Fixes relative to the previous version:
      * rung t now has width ``ladders[t-1]`` and its lower boundary is added
        to the sampled offset (before, rung 1 had zero width, so it always
        returned ``true_count``, and the boundary was computed but unused);
      * rung weights use exponent ``-t`` for the 1-based rung t, which makes
        them consistent with the ``-(M+1)`` exponent of the tail;
      * the tail starts directly after rung M (``np.random.geometric`` has
        support {1, 2, ...}, so ``ext - 1`` extra rungs are skipped).

    Args:
        G, Gstat, query_type, k: unused; kept for a uniform mechanism signature.
        epsilon: privacy budget.
        ladders: non-empty sequence of ladder values (rung widths).
        true_count: exact query answer to be perturbed.

    Returns:
        ``true_count`` itself when the centre is sampled, otherwise
        ``true_count`` plus or minus the sampled offset.
    """
    # M: length of the ladder function
    M = len(ladders)
    half_eps = epsilon / 2.0

    weights = [1.0]   # weights[0] is the centre's weight
    rung_lows = []    # rung_lows[t-1]: inner (exclusive) boundary of rung t
    dst = 0.0         # running outer boundary of the rungs processed so far
    for t in range(1, M + 1):
        width = ladders[t - 1]
        weights.append(2 * width * np.exp(-half_eps * t))
        rung_lows.append(dst)
        dst = dst + width

    # Rung M+1 and beyond: geometric series of rungs of width ladders[-1].
    tail_width = ladders[-1]
    weights.append(2 * tail_width * np.exp(-half_eps * (M + 1)) / (1 - np.exp(-half_eps)))

    # The only part that involves randomness; everything above could be
    # cached when the mechanism is evaluated over multiple runs.
    t = int(util.sample_prob_list(weights))

    if t == 0:
        return true_count

    if t <= M:
        low = rung_lows[t - 1]
        width = ladders[t - 1]
    else:
        # Pick how many tail rungs to skip; ext >= 1, so ext - 1 >= 0.
        p = 1.0 - np.exp(-half_eps)
        ext = np.random.geometric(p)
        low = dst + (ext - 1) * tail_width
        width = tail_width

    flag = 1.0 if np.random.uniform() > 0.5 else -1.0  # add or subtract?
    # Offset falls in (low, low + width] (ceil of a uniform draw on the rung).
    delta = low + np.ceil(np.random.uniform() * width)
    return flag * delta + true_count

# end-to-end: ladder function paper: algorithm 1 
def edgedp_ladder_mechanism(G, Gstat, query_type, k, epsilon):
    """Run the ladder mechanism end to end and return one noisy count.

    Computes the exact count, evaluates the ladder function on ``G`` and
    samples noise from it.

    The exact count is obtained through ``util.count`` (as the smooth
    sensitivity mechanism does) rather than a bare ``count``, so the function
    no longer depends on a later ``from util import *`` cell having been run.
    """
    true_count = util.count(G, Gstat, query_type, k)

    # ladders: ladder function evaluated on G
    ladders = edgedp_ladder_function(G, Gstat, query_type, k)

    return edgedp_ladder_mechanism_noise_sample(G, Gstat, query_type, k, epsilon, ladders, true_count)

### Smooth Sensitivity Mechanism

In [14]:
from scipy.special import comb

def edgedp_smooth_sensitivity(lsd, beta):
    """Return ``max_t lsd[t] * exp(-beta * t)``, or 0.0 for an empty ``lsd``.

    Args:
        lsd: sequence of sensitivity values indexed by distance t.
        beta: exponential damping rate.
    """
    best = 0.0
    for distance, level in enumerate(lsd):
        damped = level * np.exp(beta * (-1.0) * distance)
        if damped > best:
            best = damped
    return best

def edgedp_smooth_sensitivity_mechanism_non_ladder(G, Gstat, query_type, k, epsilon):
    """Smooth-sensitivity mechanism that computes everything it needs itself.

    * "kstar": dedicated smooth sensitivity, then Cauchy noise.
    * "ktriangle": dedicated local sensitivity, then the (epsilon, delta)
      noise sampler for k-triangles.
    * anything else: smooth sensitivity derived from the ladder function with
      ``beta = epsilon / 6``, then Cauchy noise.

    The ladder is now obtained from ``edgedp_ladder_function``; the previous
    call to the undefined name ``edgeDP_LadderFunction`` raised NameError for
    every query routed through the final branch.

    Returns:
        The noisy count. On the Cauchy path this is ``true_count`` plus a
        length-1 NumPy array (``standard_cauchy(1)``).
    """
    # Used only for ktriangle queries. Set according to Vishesh et al.'s
    # evaluation parameters.
    delta = 0.01

    true_count = util.count(G, Gstat, query_type, k)

    if query_type == "kstar":
        ss = edgedp_smooth_sensitivity_kstar(G, Gstat, k, epsilon)
    elif query_type == "ktriangle":
        ss = edgedp_smooth_sensitivity_ktriangle(G, Gstat, k, epsilon)
    else:
        # ladders: ladder function (local sensitivity at distance t) on G
        ladders = edgedp_ladder_function(G, Gstat, query_type, k)
        ss = edgedp_smooth_sensitivity(ladders, epsilon / 6.0)

    if query_type == "ktriangle":
        return edgedp_smooth_sensitivity_ktriangle_noise_sample(
            true_count, ss, Gstat.max_num_common, k, epsilon, delta)

    return true_count + 6.0 / epsilon * ss * np.random.standard_cauchy(1)

# overloaded method with ladders and true_count as input
def edgedp_smooth_sensitivity_mechanism(G, Gstat, query_type, k, epsilon, ladders, true_count): 
    """Smooth-sensitivity mechanism given a precomputed ladder and true count.

    Derives the smooth sensitivity from ``ladders`` with ``beta = epsilon / 6``
    and adds Cauchy noise scaled by ``6 / epsilon`` times that sensitivity.
    ``G``, ``Gstat``, ``query_type`` and ``k`` are not used.

    Returns:
        ``true_count`` plus a length-1 NumPy array of noise.
    """
    smooth = edgedp_smooth_sensitivity(ladders, epsilon/6.0)
    cauchy_draw = np.random.standard_cauchy(1)
    return true_count + 6.0/epsilon * smooth * cauchy_draw

def edgedp_smooth_sensitivity_kstar(G, Gstat, k, epsilon):
    """Smooth sensitivity of k-star counting with ``beta = epsilon / 6``.

    A small set of candidate node pairs is selected from the highest-degree
    nodes; for each distance t the largest local sensitivity over those pairs
    is damped by ``exp(-beta * t)`` and the maximum over t is returned.

    Fixes relative to the previous version:
      * degrees are read from ``G.degree``; the old ``degreeList`` was
        created as all zeros and never filled, so every pair was evaluated
        as if both endpoints had degree 0;
      * candidate pairs that were never found (``(-1, -1)``) or that refer to
        nodes outside ``0 .. n-1`` are skipped instead of being indexed;
      * per-pair quantities are computed once, not once per t.

    Args:
        G: graph exposing ``degree[u]`` and ``neighbors(u)``; nodes are
            assumed to be labelled 0 .. ``Gstat.num_nodes - 1``.
        Gstat: statistics object exposing ``num_nodes``.
        k: star size.
        epsilon: privacy budget.

    Returns:
        The smooth sensitivity; 0.0 when the graph has fewer than two nodes.
    """
    n = Gstat.num_nodes
    beta = epsilon / 6
    if n < 2:
        return 0.0

    degrees = [G.degree[i] for i in range(n)]

    # Nodes sorted by degree, highest first (stable for ties).
    by_degree = sorted(((i, degrees[i]) for i in range(n)),
                       key=lambda rec: rec[1], reverse=True)

    # u1_nodes: all nodes with the highest degree.
    top_degree = by_degree[0][1]
    second_start = 0
    u1_nodes = []
    for idx, (node, deg) in enumerate(by_degree):
        if deg < top_degree:
            second_start = idx
            break
        u1_nodes.append(node)

    # u2_nodes: all nodes sharing the degree found at second_start.
    second_degree = by_degree[second_start][1]
    third_start = 0
    u2_nodes = []
    for idx in range(second_start, n):
        node, deg = by_degree[idx]
        if deg < second_degree:
            third_start = idx
            break
        u2_nodes.append(node)

    def _last_adjacent_pair(anchors, start):
        # Scan nodes from `start` in degree order; remember the most recent
        # node that is adjacent to one of `anchors`. The scan is not stopped
        # at the first hit, so the LAST match wins (unchanged behaviour).
        # NOTE(review): picking the first (highest-degree) match may have
        # been the intent - confirm against the reference algorithm.
        found = (-1, -1)
        for idx in range(start, n):
            node = by_degree[idx][0]
            nbrs = set(G.neighbors(node))
            for anchor in anchors:
                if anchor in nbrs:
                    found = (anchor, node)
                    break
        return found

    # NOTE(review): (1, 2) is a hard-coded candidate kept from the original
    # code; its purpose is not evident from this file.
    candidates = [(1, 2),
                  _last_adjacent_pair(u1_nodes, second_start),
                  _last_adjacent_pair(u2_nodes, third_start)]

    # Precompute (d'_i, d'_j, b_i, b_j) for every usable candidate pair.
    pair_stats = []
    for i, j in candidates:
        if not (0 <= i < n and 0 <= j < n) or i == j:
            continue
        x_ij = 1 if i in set(G.neighbors(j)) else 0
        dprime_i = degrees[i] - x_ij
        dprime_j = degrees[j] - x_ij
        pair_stats.append((dprime_i, dprime_j, n - 2 - dprime_i, n - 2 - dprime_j))

    full = comb(n - 2, k - 1)

    smooth_sens = 0
    for t in range(2 * n - 2):
        sens_t = 0
        for dprime_i, dprime_j, b_i, b_j in pair_stats:
            if t <= b_i:
                curr = comb(dprime_i + t, k - 1) + comb(dprime_j, k - 1)
            elif t < b_i + b_j:
                # d'_i + t - b_j equals d'_j + t - b_i since both d' + b = n - 2.
                curr = full + comb(dprime_i + t - b_j, k - 1)
            else:
                curr = 2 * full
            sens_t = max(sens_t, curr)

        smooth_sens = max(math.exp(-1 * t * beta) * sens_t, smooth_sens)

    return smooth_sens

def edgedp_smooth_sensitivity_ktriangle(G, Gstat, k, epsilon):
    """Return the largest per-pair local sensitivity for k-triangle counting.

    Note that the mechanism built on this value satisfies (epsilon, delta)
    differential privacy as specified in Vishesh et al.

    Fixes relative to the previous version:
      * node pairs range over 0 .. num_nodes-1 (the old ``num_nodes + 1``
        bound looked up pairs involving a non-existent node);
      * every ``Gstat.common_neighbors`` lookup uses the ``'i,j'`` key format
        with no space, and with the smaller node first, which is how the rest
        of this file addresses that dictionary;
      * edge membership uses ``G.has_edge`` like the other helpers.

    Args:
        G: graph exposing ``has_edge(u, v)``.
        Gstat: object exposing ``num_nodes`` and ``common_neighbors``.
        k: query size parameter.
        epsilon: unused; kept for a uniform signature.

    Returns:
        The maximum local sensitivity over all node pairs (0 if there are none).
    """
    n = Gstat.num_nodes
    common = Gstat.common_neighbors

    def _shared(u, v):
        # presumably keys exist only for u < v; ordering makes lookups safe
        return common['{},{}'.format(min(u, v), max(u, v))]

    ls_max = 0
    for i in range(n):
        for j in range(i + 1, n):
            shared_ij = _shared(i, j)
            x_ij = int(G.has_edge(i, j))

            ls = comb(len(shared_ij), k)
            for l in shared_ij:
                a_il = len(_shared(i, l))
                a_lj = len(_shared(l, j))
                ls += comb(a_il - x_ij, k - 1) + comb(a_lj - x_ij, k - 1)

            ls_max = max(ls_max, ls)

    return ls_max

def edgedp_smooth_sensitivity_ktriangle_noise_sample(true_count, local_sensitivity, amax, k, epsilon, delta):
    # Note that this algorithm satsifies (epsilon,delta) differential privacy as specified in Vishesh et. al.
    
    epsilonp = epsilon /3
    deltap = delta/3
    
    amax_noise = amax + stats.laplace.rvs(scale = 1/epsilonp) + (math.log(1/deltap) / epsilonp)
       
    B = (3 * comb(int(amax_noise), k-1)) + (amax_noise * comb(int(amax_noise), k-2))
    
    ls_noise = local_sensitivity + stats.laplace.rvs(scale = B / epsilonp) + ((B / epsilonp) * math.log(1/deltap))

    return true_count + stats.laplace.rvs(scale = ls_noise / epsilonp)






### Laplace Mechanism

In [15]:
def edgedp_laplace_mechanism(G, Gstat, query_type, k, epsilon, true_count):
    """Laplace mechanism calibrated to the query's global sensitivity.

    Returns:
        ``true_count`` plus a length-1 NumPy array holding one Laplace draw
        with scale ``global_sensitivity / epsilon``.
    """
    sensitivity = edgedp_compute_global_sens(G, Gstat, query_type, k)
    noise = np.random.laplace(loc=0.0, scale=1.0 * sensitivity / epsilon, size=1)
    return true_count + noise

### Recursive Mechanism

In [16]:
import pandas as pd
import numpy as np
import scipy.optimize
from tqdm import tqdm
import networkx as nx


def H_linprog(edges, n, i):
    """Return the smallest triangle count over sliding windows of edge rows.

    Computing Hi(P,M) = min q(M'(P')).

    For every start row j in ``range(n)`` with ``j + i < n + 1`` the rows
    ``edges.iloc[j : j + i + 1]`` are turned into a graph (columns 'f' and
    't') and its triangles are counted.

    Returns:
        The minimum of those counts (``np.min`` raises on an empty list).
    """
    counts = []
    for start in range(n):
        if start + i >= n + 1:
            continue
        window = edges.iloc[start:(start + i + 1)]
        window_graph = nx.from_pandas_edgelist(window, 'f', 't', None)
        counts.append(sum(nx.triangles(window_graph).values()) / 3)

    return np.min(counts)

def G_linprog(edges, n, i):
    """Return ``real_count - min_j (real_count - window_count_j)``.

    Computing Global Empirical Sensitivity Gi(P,M) = min GS(P', M').

    ``real_count`` is the triangle count of the graph built from all rows of
    ``edges``; ``window_count_j`` is the triangle count of the graph built
    from rows ``j .. j + i`` for every j in ``range(n)`` with ``j + i < n + 1``.
    """
    full_graph = nx.from_pandas_edgelist(edges, 'f', 't', None)
    real_count = sum(nx.triangles(full_graph).values()) / 3

    deficits = []
    for start in range(n):
        if start + i >= n + 1:
            continue
        window = edges.iloc[start:(start + i + 1)]
        window_graph = nx.from_pandas_edgelist(window, 'f', 't', None)
        window_count = sum(nx.triangles(window_graph).values()) / 3
        deficits.append(real_count - window_count)

    return real_count - np.min(deficits)


def recursive(n, edges, query, eps1, eps2, theta, beta, mu, logging=False):
    '''
    Recursive mechanism for a noisy triangle count.

    @n: n is the total number of nodes. We assume their indices are
    0, 1, ..., n-1
    @edges: edges is a dataframe with columns ['f', 't'].
    It is unlabeled, and edge is from small index to high index.
    @query: only 'triangle' is implemented; any other value raises
    NotImplementedError before any expensive work is done (previously the
    function ran the whole H/G computation and then failed with a
    NameError on the undefined triangle count).
    @eps1: privacy budget used for the noisy sensitivity bound delta_hat
    @eps2: privacy budget used for the final Laplace noise
    @theta: theta
    @beta: beta
    @mu: mu
    @logging: when True, show a progress bar and print delta, X and the
    relative error of the result

    Returns the noisy count X_hat.
    '''
    # This is the general implementation of the algorithm.
    # The efficiency of this algorithm can be further
    # improved by section 5.3

    if query != 'triangle':
        raise NotImplementedError(
            "recursive() only supports query='triangle', got {!r}".format(query))

    # 1. Compute the H and G sequences
    steps = range(n + 1)
    if logging:
        steps = tqdm(steps)
    H, G = [], []
    for i in steps:
        H.append(H_linprog(edges, n, i))
        G.append(G_linprog(edges, n, i))
    H = np.asarray(H)
    G = np.asarray(G)

    # 2. Compute delta: smallest theta * exp(i * beta) that dominates G[i]
    exp_series = np.exp(np.arange(n + 1) * beta) * theta
    K = exp_series[exp_series >= G]
    if K.size == 0:
        raise ValueError("no i satisfies theta * exp(i * beta) >= G[i]; "
                         "increase theta or beta")
    delta = np.min(K)

    if logging:
        print(f'delta is {delta}')

    # 3. Compute delta_hat
    Y1 = np.random.laplace(scale=beta / eps1)
    delta_hat = np.exp(mu + Y1) * delta

    # 4. Compute X
    X = np.min(H + (n - np.arange(n + 1)) * delta_hat)
    if logging:
        print(f'X is {X}')

    # 5. Compute X_hat
    Y2 = np.random.laplace(scale=delta_hat / eps2)
    X_hat = X + Y2

    if logging:
        # The exact count is needed only for this diagnostic.
        graph = nx.from_pandas_edgelist(edges, 'f', 't', None)
        triangles = sum(nx.triangles(graph).values()) / 3
        rel_errors = (X_hat - triangles) / triangles if triangles else float('nan')
        print("rel_errors:", rel_errors)

    return X_hat

## Evaluation

In [13]:
import sys
import timeit
from util import *

# Evaluation configuration: dataset location, the queries to run (with the k
# used for each) and the mechanisms to compare.
# NOTE(review): `constants` is not imported by name in the visible cells;
# presumably it comes from `from util import *` - confirm.
# NOTE(review): `data_dir`, `data_key` and `data_file` are not read by the
# evaluation loop below, which uses `dataDir` / `dataNames` instead - confirm
# which naming is current.
data_dir = "data/" # REPLACE WITH YOUR DATASET DIRECTORY
data_key = 2
data_file = data_dir + constants.DATASETS[data_key]


# queryTypeList[i] is evaluated with k = kList[i].
queryTypeList = ["triangle", "kstar", "kclique", "ktriangle"]
kList = [1, 3, 4, 2]

algos = [
            "edgedp_laplace", 
            "edgedp_smooth", 
            "edgedp_ladder",
#              "edgedp_recursive" # this is slow, run only on small datasets
]


# Main evaluation loop. For every dataset, query and mechanism, the mechanism
# is run `repeats` times per epsilon and the mean / standard deviation of the
# relative error is recorded:
#   all_errors[query][algo][epsilon_index], all_stds[query][algo][epsilon_index]
# (both lists are reset for every dataset, so only the last dataset's results
# survive the loop).
# NOTE(review): `dataNames`, `dataDir`, `epsList`, `repeats` and `os` are not
# defined in any visible cell; unless `from util import *` provides them this
# cell fails on a fresh kernel - confirm.
for dataName in dataNames:
    print("data: ", dataName)
    datafile = dataDir+dataName #"facebook_combined.txt"
    translated = datafile+"-map.txt"
    if not os.path.isfile(translated):
        # convert all node ids to 0 .. nodeNum
        translate(datafile, translated)
    else:
        print("file exists")

    G=nx.read_edgelist(translated, nodetype=int)
    G.remove_edges_from(nx.selfloop_edges(G))

    nodesNum = len(G.nodes()) #assume this is given
    maxDeg = nodesNum -1  #assume this is given

    Gstat = GraphStat(G)

    all_errors = []
    all_stds = []
    for queryKey in range(len(queryTypeList)):
        query_type = queryTypeList[queryKey]
        k = kList[queryKey]

        # Exact answer, timed as the non-private baseline.
        start_time = timeit.default_timer() 
        true_count = count(G,Gstat,query_type,k)
        baseline_time = timeit.default_timer() - start_time

        # The ladder is computed once per query and reused by every run below.
        print("computing ladder")
        start_time = timeit.default_timer(); 
        ladders = edgedp_ladder_function(G,Gstat,query_type,k)
        ladder_compute_time = timeit.default_timer() - start_time
        print("ladder compute time: ", ladder_compute_time)

        query_errors = []
        query_stds = []
        
        for algoKey in range(len(algos)):
            algo = algos[algoKey]
            print(algo)
            
            algo_errors = []
            algo_stds = []
            for epsilon in epsList:
                errors = []
#                 time = []
                
                for i in range(repeats):
                    noisy= 0.0
#                     start_time = timeit.default_timer(); 
                    if algo == "edgedp_ladder":
                        noisy = edgedp_ladder_mechanism_noise_sample(G, Gstat, query_type, k, epsilon, ladders, true_count)
                    elif algo == "edgedp_laplace":
                        noisy = edgedp_laplace_mechanism(G, Gstat, query_type, k, epsilon, true_count)
                    else:
                        # Any other algo name is treated as the smooth
                        # sensitivity mechanism; kstar uses its dedicated
                        # implementation, the rest reuse the ladder.
                        if (query_type =="kstar"):
                            noisy = edgedp_smooth_sensitivity_mechanism_non_ladder(G, Gstat, query_type, k, epsilon)
                        else:
                            noisy = edgedp_smooth_sensitivity_mechanism(G, Gstat, query_type, k, epsilon, ladders,
                                                                        true_count)
                            
######### to get time measurements, uncomment the following lines and also the 2 lines containing 'time' above ###################

#                     itr_time_end = timeit.default_timer()
#                     itr_time = itr_time_end - start_time

#                     if(algo == "edgedp_ladder"):
#                         total_runtime = baseline_time + ladder_compute_time + itr_time
#                     elif(algo == "edgedp_laplace"):
#                         total_runtime = baseline_time + itr_time
#                     elif(algo == "edgedp_smooth"):
#                         if(query_type == "kstar"):
#                             total_runtime = itr_time
#                         else:
#                             total_runtime = baseline_time + ladder_compute_time + itr_time
#                     time.append(total_runtime)

                    # NOTE(review): divides by true_count; a query with an
                    # exact count of 0 is not handled here.
                    relative_error = abs(noisy-true_count)/true_count
                    errors.append( relative_error )
                    
                algo_errors.append(np.mean(errors))
                algo_stds.append(np.std(errors))
            
            query_errors.append(algo_errors)
            query_stds.append(algo_stds)

        all_errors.append(query_errors)
        all_stds.append(query_stds)


In [None]:
########## plots #########
# Plot the mean relative error (with standard deviation as error bars)
# against epsilon, one line per mechanism, for a single query.
# NOTE(review): `plt` and `epsilon_list` are not defined in any visible cell
# (the evaluation loop iterates over `epsList`) - confirm they exist on a
# fresh kernel.

# marker and colour per mechanism
params = {'edgedp_laplace':['x','red'], 'edgedp_smooth': ['o','green'],'edgedp_ladder':['>','orange'],}

query_error_key = 0 # triangle 0 , kstar 1, kclique 2, ktriangle 3 to plot the error for this specific query
errors_to_plot = all_errors[query_error_key]
stds_to_plot = all_stds[query_error_key] 

fig = plt.figure()
# NOTE(review): the 3 is hard-coded to the three entries currently in `algos`;
# enabling "edgedp_recursive" would also need an entry in `params`.
for i in range(3):
    algo = algos[i]
    err = errors_to_plot[i]
    st_dev = stds_to_plot[i]
    plt.errorbar(x=epsilon_list, y=err, yerr=st_dev,label=algo,c=params[algo][1],marker=params[algo][0])
plt.legend(bbox_to_anchor=(1.1, 1))
plt.xlabel('Epsilon')
plt.ylabel('Error')
plt.ylim([0,2.5])

 