In [1]:
%load_ext autoreload
%autoreload 2
import os
from time import time

import numpy as np
import pandas as pd
import pyarrow.parquet as pq

import syft as sy
from syft.core.adp.entity_list import EntityList
from syft.util import size_mb

Loaded constant2epsilon cache of size: (300000,)


In [2]:
# Time how long it takes to load the parquet sample from disk.
t0 = time()
# NOTE: pq.read_table returns a pyarrow Table, not a pandas DataFrame, despite the `df` name.
# NOTE(review): absolute, user-specific path — this cell only runs on a machine that has
# this exact file; consider a configurable data directory.
df = pq.read_table("/home/ruchi/1B_rows_dataset_sample.parquet")
tf = time() - t0

# df.shape[0] is the row count; it is reported in millions.
print(f"Time taken to read file with {df.shape[0]/1e6} million rows : {tf} seconds")

Time taken to read file with 1000.0 million rows : 13.745653867721558 seconds


In [3]:
# Build the two inputs needed for the private tensor: the values and their data subjects.
# `scale` is reused by the next cell to shrink min_val/max_val by the same factor.
scale = 1000  # This is put here to reduce the size of the cache :)
t0 = time()
# Floor-divide every impression count by `scale` (integer floor division, so the
# remainder is discarded).
impressions = df['impressions'].to_numpy()//scale
# Data subjects are derived from the `user_id` column.
# presumably one entity per row, aligned with `impressions` — TODO confirm
data_subjects = EntityList.from_series(df['user_id'])
tf = time() - t0

print(f"Time taken to create inputs for Syft Tensor: {tf} seconds")

Time taken to create inputs for Syft Tensor: 23.07019305229187 seconds


In [4]:
# Wrap the scaled impressions in a private Syft tensor and time the construction.
# min_val/max_val are divided by the same `scale` that was applied to `impressions`,
# so the declared bounds stay in the same units as the data.
t0 = time()
tweets_data = sy.Tensor(impressions).private(min_val=700_000/scale, max_val=20e6/scale, entities = data_subjects,ndept=True)  # Run this for 1 billion rows
# tweets_data = sy.Tensor(impressions).private(min_val=500/scale, max_val=25000/scale, entities = data_subjects,ndept=True)  # Run this for 1 million rows
tf = time() - t0

print(f"Time make Private Syft Tensor: {tf} seconds")

Time make Private Syft Tensor: 2.1737875938415527 seconds


In [5]:
# Log in to the domain node.
# The defaults are the stock PySyft admin credentials and port used by the original
# notebook; set SYFT_EMAIL / SYFT_PASSWORD / SYFT_PORT in the environment to override
# them so that real credentials never have to be written into the notebook.
domain_node = sy.login(
    email=os.environ.get("SYFT_EMAIL", "info@openmined.org"),
    password=os.environ.get("SYFT_PASSWORD", "changethis"),
    port=int(os.environ.get("SYFT_PORT", "8081")),
)


Anyone can login as an admin to your node right now because your password is still the default PySyft username and password!!!

Connecting to localhost... done! 	 Logging into cicada... done!


In [6]:
domain_node.privacy_budget

9999960.571065024

In [7]:
from syft.core.node.common.node_service.user_manager.user_messages import (
    UpdateUserMessage,
)

# Upgrade the admin's privacy budget.
# NOTE(review): user_id 1 is presumably the admin account logged in above — confirm.
content = {"user_id": 1, "budget": 9_999_999}
# `_perform_grid_request` is a private client method (leading underscore); it is used
# here to send the UpdateUserMessage directly.
domain_node._perform_grid_request(grid_msg=UpdateUserMessage, content=content)

# Display the budget after the update.
domain_node.privacy_budget

9999999.0

In [8]:
%%time
# The current timestamp is appended so that every run uploads under a distinct name.
# `name` is reused by a later cell to look the asset up again.
name = f"Tweets data- {time()}"

domain_node.load_dataset(
    assets={name: tweets_data},
    name=name,
    # NOTE(review): the description says "100M rows", but the read cell above reports
    # 1000 million rows for this file — confirm which is intended.
    description=" Tweets- 100M rows",
    use_blob_storage=True
)

Loading dataset... uploading...🚀                        

Uploading `Tweets data- 1648110880.9412913`: 100%|[32m████████████████████[0m| 8/8 [00:08<00:00,  1.12s/it][0m


Dataset is uploaded successfully !!! 🎉

Run `<your client variable>.datasets` to see your new dataset loaded into your machine!
CPU times: user 40.1 s, sys: 10.3 s, total: 50.4 s
Wall time: 1min 7s


In [9]:
domain_node.datasets

Idx,Name,Description,Assets,Id
[0],Tweets data- 1648108483.7777543,Tweets- 100M rows,"[""Tweets data- 1648108483.7777543""] ->",3f42dc06-5153-4cff-a220-1b166853d7a5
[1],Tweets data- 1648108515.882528,Tweets- 100M rows,"[""Tweets data- 1648108515.882528""] ->",288f2653-1808-4451-8360-55c2e09e412e
[2],Tweets data- 1648110880.9412913,Tweets- 100M rows,"[""Tweets data- 1648110880.9412913""] ->",aafc6b48-de35-4196-b22f-c7f58286a925


In [10]:
# Take the last dataset in the node's listing and fetch the asset keyed by `name`.
# Depends on `name` from the upload cell, so that cell must have run in this kernel session.
data = domain_node.datasets[-1][name]

In [11]:
%%time
# Request the sum on the remote dataset pointer.
sum_result = data.sum()
# presumably `.block` waits until the remote result exists before returning — TODO confirm
sum_result.block

CPU times: user 2.93 s, sys: 194 ms, total: 3.13 s
Wall time: 1min 44s


<TensorPointer -> cicada:ca31dc5609844a39b62f70d4f7b0d606>

In [12]:
sum_result.exists

True

In [15]:
# Publish the remote sum.
# NOTE(review): sigma is presumably the noise scale of the publish mechanism; 1e6 is a
# magic number here — confirm how it was chosen.
published_result = sum_result.publish(sigma=1e6)


Please wait we're computing your query ...Completed. 🎉

In [17]:
published_result.exists

False

In [None]:
published_result.get(delete_obj=False)

In [None]:
# Scratch array for the dtype checks in the following cells.
# NOTE: np.random.random draws floats in [0.0, 1.0), so the int64 cast truncates every
# element to 0; harmless if only the dtype is being inspected.
a = np.random.random(10).astype(np.int64)

In [None]:
a.dtype

In [None]:
a.dtype.name

In [None]:
np.dtype("int64")

In [None]:
# domain_node.requests

In [None]:
# domain_node.requests[-1].accept()

In [None]:
# domain_node.requests

In [None]:
# result_of_our_hard_labour = published_result.get()

# WE GOT AN OVERFLOW ERROR
- The error does not occur with 1M rows.
- `publish` had completed, though it took 602 seconds.

In [None]:
# result_of_our_hard_labour

In [None]:
# impressions.sum()

In [None]:
def percentage_error(true_value, noisy_value):
    """Return the signed error of `noisy_value` as a percentage of `true_value`.

    The result is positive when `noisy_value` is below `true_value` and negative
    when it is above. No guard is applied for `true_value == 0`.
    """
    deviation = true_value - noisy_value
    return deviation / true_value * 100

In [None]:
# percentage_error(true_value=impressions.sum(), noisy_value=result_of_our_hard_labour)

In [None]:
# (1644527104 - 11941995258)/11941995258 * 100

In [None]:
# 11941995258 /1e6

In [None]:
def calculate_bounds_for_mechanism(
     min_val_array, max_val_array
):
    """Return the L2 norms of the lower-bound and upper-bound values.

    Each norm is computed as sqrt(sum(x ** 2)) over all elements of the input.
    The values returned are the norms themselves, not their squares; a caller
    that needs the squared norm must square the result.

    Per the original notes: a privacy-budget spend computed from these
    worst-case bounds can be shown to the data scientist, whereas a spend
    computed from the actual data values cannot, because that would violate
    privacy.

    Returns:
        A tuple ``(l2_norm_min, l2_norm_max)``.
    """
    def _l2_norm(values):
        # Square element-wise, add everything up, then take the root.
        return np.sqrt(np.sum(np.square(values)))

    return _l2_norm(min_val_array), _l2_norm(max_val_array)

In [None]:
def _get_batch_rdp_constants(
    sigma, scale, min_val, max_val,  L=1
) -> tuple:
    """Compute the constants obtained from the lower and upper value bounds.

    Both bounds are first divided by `scale`; each constant is then

        L**2 * l2_norm(bound / scale)**2 / (2 * sigma**2)

    where the L2 norms come from `calculate_bounds_for_mechanism`.

    Args:
        sigma: Noise parameter; it is squared and used as a divisor, so it
            must be non-zero.
        scale: Divisor applied to `min_val` and `max_val` before the norms
            are taken.
        min_val: Lower bound(s) of the data, a scalar or an array.
        max_val: Upper bound(s) of the data, a scalar or an array.
        L: Multiplier that enters the constants squared; defaults to 1.
            NOTE(review): presumably a Lipschitz bound — confirm.

    Returns:
        A tuple ``(constant_min, constant_max)``.
    """
    scaled_min = min_val / scale
    scaled_max = max_val / scale
    l2_norm_min, l2_norm_max = calculate_bounds_for_mechanism(scaled_min, scaled_max)

    squared_L = L**2
    squared_sigma = sigma**2

    # NOTE(review): no per-entity aggregation is performed here; the constants
    # are computed directly from the bounds passed in.
    constant_min = squared_L * l2_norm_min**2 / (2 * squared_sigma)
    constant_max = squared_L * l2_norm_max**2 / (2 * squared_sigma)
    return constant_min, constant_max

In [None]:
#_get_batch_rdp_constants(sigma=100, scale=1000, min_val=700_000, max_val=20e6)