In [1]:
import numpy as np
import pandas as pd
from collections import Counter
from collections import defaultdict
from typing import List

In [2]:
from collections import namedtuple

# Enum-like constants: each namedtuple instance maps a member name to its
# integer position, e.g. QUERY_TYPES.COUNT == 3.

# Aggregations a Query can request.
QUERY_TYPES_LIST = ['MAX','MIN','MEAN','COUNT','COUNT_PER_ITEM']
query_type_named_tuple = namedtuple('QueryType', QUERY_TYPES_LIST)
QUERY_TYPES = query_type_named_tuple._make(range(len(QUERY_TYPES_LIST)))

# Comparison operators a Condition can use.
condition_type_named_tuple = namedtuple(
    'ConditionType',
    'LESS_THAN GREATER_THAN LESS_THAN_EQUAL GREATER_THAN_EQUAL EQUAL NOT_EQUAL',
)
CONDITION_TYPES = condition_type_named_tuple._make(
    range(len(condition_type_named_tuple._fields))
)

# Kinds of value an Operand can hold.
operand_type_named_tuple = namedtuple('OperandType', 'COLUMN SCALER_NUMBER')
OPERAND_TYPES = operand_type_named_tuple._make(
    range(len(operand_type_named_tuple._fields))
)

In [3]:
class Operand:
    """A leaf operand: either a dataset column or a literal scalar.

    Attributes:
        operand_type: one of the OPERAND_TYPES members.
        value: the column label when operand_type is OPERAND_TYPES.COLUMN,
            otherwise the value returned as-is by evaluate().
    """

    def __init__(self,operand_type,value):
        self.operand_type = operand_type
        self.value = value

    def evaluate(self, dataset):
        """Resolve this operand against `dataset`.

        Args:
            dataset: object indexable by column label (the notebook passes a
                pandas DataFrame).

        Returns:
            dataset[self.value] for a COLUMN operand, otherwise self.value.
        """
        if self.operand_type == OPERAND_TYPES.COLUMN:
            # Plain [] indexing selects a column by label; `.loc[label]` with a
            # single key would look the label up in the row index instead.
            return dataset[self.value]
        return self.value
    
class ComposedOperand(Operand):
    """An operand built by combining two other operands with a binary callable.

    Attributes:
        left_operand: operand evaluated first and passed as the first argument.
        right_operand: operand evaluated second and passed as the second argument.
        operation: callable taking (left_value, right_value).
    """

    def __init__(self,left_operand: Operand,operation,right_operand: Operand):
        self.left_operand = left_operand
        self.right_operand = right_operand
        self.operation = operation

    def evaluate(self, dataset):
        """Evaluate both sub-operands against `dataset` and combine them.

        Matches the Operand.evaluate(dataset) signature so composed and leaf
        operands can be used interchangeably.

        Returns:
            Whatever self.operation returns for the two evaluated operands.
        """
        left_value = self.left_operand.evaluate(dataset)
        right_value = self.right_operand.evaluate(dataset)
        return self.operation(left_value, right_value)
        
class Condition:
    """A comparison between two operands, stored for later use as a filter.

    Attributes:
        left_operand: operand on the left-hand side of the comparison.
        condition_type: the comparison kind, as passed by the caller.
        right_operand: operand on the right-hand side of the comparison.
    """

    def __init__(self,left_operand: Operand,condition_type,right_operand: Operand):
        self.condition_type = condition_type
        self.left_operand, self.right_operand = left_operand, right_operand

    def apply(self, dataset):
        """Return `dataset` unchanged; filtering is not implemented yet."""
        # TODO: filter dataset based on condition
        return dataset

class Query:
    """An aggregate query over one column of a dataset.

    Attributes:
        query_type: one of the QUERY_TYPES members.
        query_target: label of the column the aggregate is computed over.
        conditions: list of objects exposing apply(dataset) -> dataset, applied
            in order before aggregating.
    """

    def __init__(self,query_type,query_target,conditions=None):
        self.query_type = query_type
        # A `[]` default would be one list shared by every Query created
        # without conditions; build a fresh list per instance instead.
        self.conditions = [] if conditions is None else conditions
        self.query_target = query_target
        
    def __repr__(self):
        return f"""
========
query over {self.query_target}
query type: {self.query_type}
number of conditions: {len(self.conditions)}
========
        """
        
    def apply(self,dataset):
        """Apply the conditions, then compute the aggregate on the target column.

        Args:
            dataset: object indexable by column label (the notebook passes a
                pandas DataFrame).

        Returns:
            The aggregate value for MEAN, COUNT, MAX and MIN queries.

        Raises:
            NotImplementedError: for query types with no aggregation yet.
        """
        for condition in self.conditions:
            # The condition filters the dataset; `dataset.apply(condition)` would
            # instead ask pandas to call the condition object on each column.
            dataset = condition.apply(dataset)

        target_values = dataset[self.query_target]
        if self.query_type == QUERY_TYPES.MEAN:
            return np.mean(target_values)
        if self.query_type == QUERY_TYPES.COUNT:
            return len(target_values)
        if self.query_type == QUERY_TYPES.MAX:
            return np.max(target_values)
        if self.query_type == QUERY_TYPES.MIN:
            return np.min(target_values)
        # Fail loudly rather than returning None, which only surfaces later as
        # an unrelated TypeError when noise is added to the result.
        raise NotImplementedError(
            f'query type {self.query_type} is not supported yet'
        )
    
        


In [4]:
def simple_budget_allocate(queries: List["Query"],total_budget) -> List[float]:
    """Split `total_budget` evenly across `queries`.

    Args:
        queries: the queries that will share the budget; only its length is used.
        total_budget: total privacy budget to distribute.

    Returns:
        A list with one equal share per query, or an empty list when there are
        no queries (instead of dividing by zero).
    """
    query_count = len(queries)
    if query_count == 0:
        return []
    return [total_budget / query_count] * query_count

def genetic_alogirthm_budget_allocate(queries: List[Query],dataset,total_budget):
    """Placeholder for a genetic-algorithm (GA) heuristic budget allocation.

    Intended to search for the best split of `total_budget` across `queries`.
    Not implemented yet: it does no work and returns None.
    """
    return None

In [5]:
def apply_laplacian(queries,dataset, budget_allocated, rng=None):
    """Answer each query and add Laplace noise scaled by its budget share.

    Args:
        queries: Query objects to answer.
        dataset: object indexable by column label (the notebook passes a
            pandas DataFrame).
        budget_allocated: one positive budget per query, paired with `queries`
            by position.
        rng: optional object with a `laplace(loc, scale)` method, e.g.
            `np.random.default_rng(seed)`, for reproducible noise. Defaults to
            NumPy's global `np.random.laplace`.

    Returns:
        (results, original_results): the noised answers and the exact answers,
        in query order.

    Raises:
        ValueError: if a query's budget is not strictly positive.
        NotImplementedError: if a query type has no L1 sensitivity defined.
    """
    sample_laplace = np.random.laplace if rng is None else rng.laplace
    results,original_results = [],[]
    for query,budget in zip(queries,budget_allocated):
        if budget <= 0:
            raise ValueError(
                f'privacy budget must be positive, got {budget} for query over {query.query_target}'
            )

        # calculate L1 sensitivity
        if query.query_type == QUERY_TYPES.COUNT:
            l1_sense = 1
        elif query.query_type == QUERY_TYPES.MEAN:
            # NOTE(review): the value range is taken from the observed,
            # unfiltered data rather than from fixed public bounds; confirm
            # this is the intended sensitivity.
            target_values = dataset[query.query_target]
            l1_sense = (np.max(target_values) - np.min(target_values))/len(target_values)
        else:
            # TODO: handle other query types as well
            # Without a sensitivity the noise scale would be None/budget, which
            # fails with an uninformative TypeError.
            raise NotImplementedError(
                f'no L1 sensitivity defined for query type {query.query_type}'
            )

        # TODO: handle multiple output queries
        noise = sample_laplace(0,l1_sense/budget)
        result = query.apply(dataset)
        results.append(result + noise)
        original_results.append(result)
    return results,original_results
            
        

In [6]:
# Load the CSV that every query below is evaluated against.
dataset = pd.read_csv('anonymized_device_flow.csv') # decide database

In [7]:
# Total privacy budget to be shared by all queries.
total_budget = 10
# Queries are accumulated into this list by the cells that follow.
queries = []

In [8]:
# query 1: COUNT over the 'product_name' column
product_column_count_query = Query(QUERY_TYPES.COUNT, 'product_name')
# query 2: COUNT over the 'device_id' column
device_column_count_query = Query(QUERY_TYPES.COUNT, 'device_id')

# accumulate queries being considered, in the order they were defined
queries.extend([product_column_count_query, device_column_count_query])

In [11]:
# Split the total budget evenly: one equal share per accumulated query.
budget_allocated = simple_budget_allocate(queries,total_budget) # allocate budget

In [12]:
# `output` holds the Laplace-noised answers, `original_output` the exact ones,
# both in the same order as `queries`.
output,original_output = apply_laplacian(queries,dataset, budget_allocated) # apply DP!

In [13]:
# Show each query next to its exact answer and its noised (DP) answer.
for query, orig_out, out in zip(queries, original_output, output):
    print(f'for query: \n{query}\noriginal output was: {orig_out}, after applying DP: {out}')

for query: 

query over product_name
query type: 3
number of conditions: 0
        
original output was: 2442531, after applying DP: 2442531.483621194
for query: 

query over device_id
query type: 3
number of conditions: 0
        
original output was: 2442531, after applying DP: 2442530.7599204564
