In [1]:
import os
from time import time

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import syft as sy
from syft.core.adp.data_subject_ledger import DataSubjectLedger
from syft.core.adp.entity import Entity
from syft.core.adp.entity_list import EntityList
from syft.core.adp.ledger_store import DictLedgerStore
from syft.util import size_mb

Loaded constant2epsilon cache of size: (300000,)


In [3]:
%%time
df = pq.read_table("/home/ruchi/1B_rows_dataset_sample.parquet")

CPU times: user 1min 40s, sys: 2min 5s, total: 3min 45s
Wall time: 17.5 s


In [5]:
# Every impression count is integer-divided by `scale` before it is wrapped in a
# tensor; the original author did this to reduce the size of the cache.
scale = 1000
t0 = time()
impressions = np.floor_divide(df['impressions'].to_numpy(), scale)
# One data subject per row, built from the `user_id` column.
data_subjects = EntityList.from_series(df['user_id'])
tf = time() - t0

print(f"Time taken to create inputs for Syft Tensor: {tf} seconds")

Time taken to create inputs for Syft Tensor: 25.046489477157593 seconds


In [6]:
t0 = time()
# The bounds below are for the 1-billion-row dataset and are divided by `scale`
# to match the scaled `impressions` values.
# For the 1-million-row dataset use min_val=500/scale and max_val=25000/scale instead.
tweets_data = sy.Tensor(impressions).private(
    min_val=700_000 / scale,
    max_val=20e6 / scale,
    entities=data_subjects,
    ndept=True,
)
tf = time() - t0

print(f"Time make Private Syft Tensor: {tf} seconds")

Time make Private Syft Tensor: 2.268967628479004 seconds


In [7]:
# Credentials and port are taken from the environment when set, so real accounts never
# have to be written into the notebook. The fallbacks are the default PySyft admin
# account and must only be used against a local test node.
domain_node = sy.login(
    email=os.environ.get("SYFT_EMAIL", "info@openmined.org"),
    password=os.environ.get("SYFT_PASSWORD", "changethis"),
    port=int(os.environ.get("SYFT_PORT", "8081")),
)


Anyone can login as an admin to your node right now because your password is still the default PySyft username and password!!!

Connecting to localhost... done! 	 Logging into festive_song... done!


In [8]:
# Privacy budget reported by the node before anything is published in this notebook.
domain_node.privacy_budget

8908149.351512348

In [None]:
# from syft.core.node.common.node_service.user_manager.user_messages import (
#     UpdateUserMessage,
# )

# # Upgrade the admin's privacy budget
# content = {"user_id": 1, "budget": 9_999_999}
# domain_node._perform_grid_request(grid_msg=UpdateUserMessage, content=content)

# domain_node.privacy_budget

In [9]:
%%time
name = f"Tweets data- {time()}"

domain_node.load_dataset(
    assets={name: tweets_data},
    name=name,
    description=" Tweets- 100M rows",
    use_blob_storage=True
)

Loading dataset... uploading...🚀                                                                                                                                             

Uploading `Tweets data- 1648146145.4022515`: 100%|[32m████████████████████[0m| 7/7 [00:08<00:00,  1.23s/it][0m

Dataset is uploaded successfully !!! 🎉

Run `<your client variable>.datasets` to see your new dataset loaded into your machine!
CPU times: user 40.6 s, sys: 9.85 s, total: 50.4 s
Wall time: 57.1 s





In [10]:
# List the datasets known to the node.
domain_node.datasets

Idx,Name,Description,Assets,Id
[0],Tweets data- 1648114820.056691,Tweets- 100M rows,"[""Tweets data- 1648114820.056691""] ->",cf40e48d-72f8-4536-8f9d-9383ca23acf7
[1],Tweets data- 1648123216.9175086,Tweets- 100M rows,"[""Tweets data- 1648123216.9175086""] ->",29d587af-4b62-47c9-a27d-71a026ec2265
[2],Tweets data- 1648124216.139227,Tweets- 100M rows,"[""Tweets data- 1648124216.139227""] ->",fd68f34a-818d-4611-9194-1b8d788f96cf
[3],Tweets data- 1648124958.1514506,Tweets- 100M rows,"[""Tweets data- 1648124958.1514506""] ->",7d03bc3a-901e-43a2-b25e-6c3c7d243aff
[4],Tweets data- 1648131125.3170142,Tweets- 100M rows,"[""Tweets data- 1648131125.3170142""] ->",077a5f8c-264d-410f-b143-495a4c7857a3
[5],Tweets data- 1648137952.8112204,Tweets- 100M rows,"[""Tweets data- 1648137952.8112204""] ->",26362575-fc9c-4f06-861f-69db3956c783
[6],Tweets data- 1648146145.4022515,Tweets- 100M rows,"[""Tweets data- 1648146145.4022515""] ->",56bf8625-964c-42b3-81ac-56a30d2d8cc6


In [11]:
# Take the last dataset in the listing (index -1) and pull out the asset stored under `name`.
# NOTE(review): assumes the dataset uploaded in this session is the last entry — confirm.
data = domain_node.datasets[-1][name]

In [None]:
# Display `data` (presumably a pointer to the remote asset rather than its values — confirm).
data

In [12]:
# Request the sum over the asset; the handle is kept in `sum_result` for later cells.
sum_result = data.sum()

In [12]:
%%time
# Time the access to `.block`; presumably this waits until the remote sum is ready — confirm.
sum_result.block

CPU times: user 2.61 s, sys: 97.1 ms, total: 2.71 s
Wall time: 1min 41s


<TensorPointer -> festive_song:9ae82df9da3149dbaea882a23fed5908>

In [13]:
# Check whether the object behind `sum_result` exists (presumably on the node — confirm).
sum_result.exists

True

In [14]:
# Publish the sum with sigma=1e6 (presumably the scale of the differential-privacy noise — confirm).
published_result = sum_result.publish(sigma=1e6)

CPU times: user 6 µs, sys: 5 µs, total: 11 µs
Wall time: 22.2 µs


<FloatPointer -> festive_song:36f2545de7354bfe9f1ea4c820a00c72>

In [None]:
%time
published_result.block

In [15]:
# Check whether the object behind `published_result` exists (presumably on the node — confirm).
published_result.exists

True

In [16]:
# Fetch the published value and print it.
# delete_obj=False presumably leaves the object on the node so it can be fetched again — confirm.
res = published_result.get(delete_obj=False)
print(res)

11128364194619.71


In [17]:
# Read the privacy budget again to compare it with the value shown before publishing.
domain_node.privacy_budget

8835359.374946505