In [6]:
import syft as sy
import os
from syft import ActionObject
from collections import defaultdict

Start the blob-storage stack with the command in the next cell (run it in a terminal before continuing):

In [7]:
# `docker compose --profile blob-storage --file docker-compose.multinode.yml --file docker-compose.dev.yml up`

In [8]:
# node = sy.orchestra.launch(name="test-domain-1", port="auto", dev_mode=True, reset=True, node_type="domain")

In [9]:
# client = node.login(email="info@openmined.org", password="changethis")

Launch the domain node with HAGrid (run in a terminal):

```
hagrid launch domain to docker:8080 --dev --verbose
```

In [10]:
# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook. The fallbacks
# are the local development values this cell originally hardcoded.
client = sy.login(
    url=os.environ.get("SYFT_URL", "http://localhost:8080"),
    email=os.environ.get("SYFT_EMAIL", "info@openmined.org"),
    password=os.environ.get("SYFT_PASSWORD", "changethis"),
)

Logged into <modest_smola: High side Domain> as <info@openmined.org>


# Mount the Helm Azure storage container

In [11]:
# HELM_STORAGE_ACCOUNT_KEY = ""

In [12]:
# Mount the Azure container `helm` from storage account `helmprojectstorage`
# under the bucket name `helmazurebucket`.
client.api.services.blob_storage.mount_azure(
    account_name='helmprojectstorage',
    container_name='helm',
    # The key is read from the environment so the secret is never stored in the
    # notebook; this raises KeyError if HELM_STORAGE_ACCOUNT_KEY is not set.
    account_key=os.environ["HELM_STORAGE_ACCOUNT_KEY"],
    # Alternative: pass a notebook variable instead -- account_key=HELM_STORAGE_ACCOUNT_KEY
    bucket_name='helmazurebucket',
)

In [13]:
# Fetch the file listing for the `helmazurebucket` bucket.
blob_files = client.api.services.blob_storage.get_files_from_bucket(bucket_name='helmazurebucket')

In [14]:
# Display the bucket file listing.
blob_files

# Start workers

In [15]:
# Ask the node to start three workers (n=3).
client.worker.start_workers(n=3)

In [16]:
# Show the workers currently known to the node.
client.worker.list()

In [18]:
# Inspect the node's user listing.
client.users

# Create Dataset

In [19]:
# Build an ActionObject from each local JSONL file, send it to the node, and
# keep the `syft_action_data` of the sent object for use as dataset assets.
train_file = sy.ActionObject.from_path("short_input.jsonl").send(client).syft_action_data
# `scenario_obj` is a second name bound to the same value as `scenario_file`.
scenario_file = scenario_obj = sy.ActionObject.from_path(path="scenario_data.jsonl").send(client).syft_action_data

In [20]:
# train_file = [f for f in blob_files if "train-00" in f.file_name][0]
# scenario_file = [f for f in blob_files if "scenario_data" in f.file_name][0]

In [21]:
# Dataset with one asset per file. Each asset wraps its file in a one-element
# list and uses an empty ActionObject as the mock.
helm_dataset = sy.Dataset(
    name="Helm Dataset",
    asset_list=[
        sy.Asset(
            name=asset_name,
            data=ActionObject.from_obj([asset_file]),
            mock=sy.ActionObject.empty(),
        )
        for asset_name, asset_file in (
            ("helm train data", train_file),
            ("helm test data", scenario_file),
        )
    ],
)

In [22]:
# Upload the dataset to the node; this asks for interactive confirmation.
client.upload_dataset(helm_dataset)

Would you like to proceed? [y/n]: y


  0%|                                                                                            | 0/2 [00:00<?, ?it/s]

Uploading: helm train data


 50%|██████████████████████████████████████████                                          | 1/2 [00:00<00:00,  3.65it/s]

Uploading: helm test data


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  3.64it/s]


In [23]:
# Look the uploaded dataset up by name and take handles to its two assets;
# these handles are what `main_function` is bound to and called with.
helm_ds = client.datasets["Helm Dataset"]
helm_train_files = helm_ds.assets["helm train data"]
helm_test_files = helm_ds.assets["helm test data"]

# Syft functions

In [24]:
@sy.syft_function()
def compute_document_data_overlap(scenario_file, input_files, n):
    """Find scenario instances that share at least one n-gram with the input documents.

    Text is lower-cased and split on runs of whitespace and/or ASCII
    punctuation before n-grams are formed.

    Args:
        scenario_file: Object exposing ``iter_lines(progress=True)`` (yielding
            ``(bytes_read, line)`` pairs) and ``file_size``. Every line is a
            JSON object with a ``scenario_key`` (containing ``scenario_spec``
            and ``split``) and a list of ``instances``; each instance has
            ``input``, ``references`` and ``id``.
        input_files: Iterable of objects with the same interface; every line
            is a JSON object with a ``text`` field.
        n: N-gram size used for both the scenario index and the document scan.

    Returns:
        A 3-tuple ``(stats_key_to_input_ids, stats_key_to_reference_ids,
        stats_key_counts)`` where a stats key is
        ``"<scenario_spec>_<split>_<n>"``:

        * ``stats_key_to_input_ids`` -- ``defaultdict(set)`` of instance ids
          whose input shares an n-gram with some document.
        * ``stats_key_to_reference_ids`` -- ``defaultdict(set)`` of instance
          ids with a reference sharing an n-gram with some document.
        * ``stats_key_counts`` -- ``defaultdict(int)`` giving the number of
          instances recorded for each stats key.
    """
    print("starting overlap computation")

    # Imports stay inside the body, as in the original cell; presumably the
    # function source is executed away from the notebook namespace (TODO confirm).
    from nltk import ngrams
    from collections import defaultdict
    from string import punctuation
    import json
    import re
    import time

    # Compiled once and shared by indexing and scanning so both tokenize alike.
    token_splitter = re.compile(r"[\s{}]+".format(re.escape(punctuation)))

    def tokenize(text):
        """Lower-case ``text`` and split it on whitespace/punctuation runs."""
        return token_splitter.split(text.lower())

    def create_ngram_index(light_scenarios, n_values, stats_key_counts):
        """Map each scenario n-gram to the ``(stats_key, id, part)`` entries using it.

        Also records, in ``stats_key_counts``, the number of instances of every
        scenario under its stats key. ``part`` is ``'input'`` or ``'references'``.
        """
        ngram_index = {size: {} for size in n_values}
        for i, scenario in enumerate(light_scenarios):
            if i % 20 == 0:
                print(f"n_gram indexing progress: {(i/len(light_scenarios))*100:.2f}%")
            for size in n_values:
                index = ngram_index[size]
                stats_key = scenario['scenario_key'] + '_' + str(size)
                stats_key_counts[stats_key] = len(scenario['instances'])
                for instance in scenario['instances']:
                    instance_id = instance['id']
                    # Entries are tuples rather than '+'-joined strings so that a
                    # '+' inside the scenario spec or the id cannot corrupt them.
                    for input_ngram in ngrams(tokenize(instance['input']), size):
                        index.setdefault(input_ngram, set()).add(
                            (stats_key, instance_id, 'input')
                        )

                    # compute reference ngrams
                    for reference in instance['references']:
                        for reference_ngram in ngrams(tokenize(reference), size):
                            index.setdefault(reference_ngram, set()).add(
                                (stats_key, instance_id, 'references')
                            )
        return ngram_index

    # SETUP
    print("preparing scenarios and creating indexes")
    start = time.time()
    light_scenarios = []
    for i, (bytes_read, light_scenario_json) in enumerate(scenario_file.iter_lines(progress=True)):
        if i % 20 == 0:
            print(f"scenario creation progress: {(bytes_read/scenario_file.file_size)*100:.2f}%")

        light_scenario_dict: dict = json.loads(light_scenario_json)

        light_scenario_key_dict: dict = light_scenario_dict["scenario_key"]
        scenario_spec = str(light_scenario_key_dict["scenario_spec"])

        light_scenario_key = scenario_spec + '_' + light_scenario_key_dict["split"]
        # Keep only the fields the overlap computation needs.
        light_instances = [
            {
                'input': instance_dict['input'],
                'references': instance_dict['references'],
                'id': instance_dict["id"]
            }
            for instance_dict in light_scenario_dict["instances"]
        ]
        light_scenarios.append({'scenario_key': light_scenario_key, 'instances': light_instances})
    print(f"Finished creating scenarios ({time.time()-start}s)")

    print("Creating indexes")

    start = time.time()
    stats_key_counts = defaultdict(int)
    ngram_index = create_ngram_index(
        light_scenarios=light_scenarios, n_values=[n], stats_key_counts=stats_key_counts
    )
    print(f"Finished creating indexes ({time.time()-start}s)")

    stats_key_to_input_ids = defaultdict(set)
    stats_key_to_reference_ids = defaultdict(set)
    print("computing overlap")
    start = time.time()

    for input_file in input_files:
        for i, (bytes_read, line) in enumerate(input_file.iter_lines(progress=True)):
            if i % 1000 == 0:
                print(f"computing overlap progress: {(bytes_read / input_file.file_size) * 100:.2f}%")
            document = json.loads(line)["text"]
            document_tokens = tokenize(document)
            for size, index in ngram_index.items():
                for document_ngram in ngrams(document_tokens, size):
                    # n-grams missing from the index contribute nothing.
                    for stats_key, instance_id, part in index.get(document_ngram, ()):
                        if part == "input":
                            stats_key_to_input_ids[stats_key].add(instance_id)
                        elif part == "references":
                            stats_key_to_reference_ids[stats_key].add(instance_id)
    print(f"Finished computing overlap ({time.time()-start}s)")
    print("done")

    return stats_key_to_input_ids, stats_key_to_reference_ids, stats_key_counts

In [25]:
# Submit the overlap function to the node; `main_function` below launches it as a job.
client.code.submit(compute_document_data_overlap)

In [26]:
@sy.syft_function_single_use(input_files=helm_train_files, scenario_files=helm_test_files)
def main_function(domain, input_files, scenario_files):
    """Launch one overlap job per scenario file for each selected n-gram size.

    Args:
        domain: Object providing ``launch_job``. The notebook never passes it
            explicitly -- presumably injected by Syft (TODO confirm).
        input_files: Input files forwarded unchanged to every launched job.
        scenario_files: Iterable of scenario files; each one gets its own job.

    Returns:
        None. The notebook reads results from the parent job's sub-jobs.
    """
    # Candidate n-gram sizes.
    N = [5, 9, 13]
    jobs = []
    # NOTE(review): N[:1] restricts the run to n=5 only; widen the slice to
    # also cover 9 and 13 -- confirm whether this limit is intentional.
    for n in N[:1]:
        for scenario_file in scenario_files:
            batch_job = domain.launch_job(
                compute_document_data_overlap,
                scenario_file=scenario_file,
                input_files=input_files,
                n=n
            )
            jobs.append(batch_job)

    # The handles collected in `jobs` are not returned.
    return None


In [27]:
# File a code-execution request for `main_function` with the node.
client.code.request_code_execution(main_function)

In [28]:
# Approve the last entry in `client.requests` -- presumably the request filed
# in the previous cell; this asks for interactive confirmation.
client.requests[-1].approve()

Would you like to proceed? [y/n]: y
Request approved for domain modest_smola


In [29]:
# Call the function with blocking=False and keep the returned `job` handle,
# which the following cells inspect.
job = client.code.main_function(input_files=helm_train_files,
                                scenario_files=helm_test_files,
                                blocking=False)

# Inspect Jobs and get results

In [30]:
# Display the parent job (status, result, logs).
job

```python
class Job:
    id: UID = d17d13b4cb8a4b34bd4b9061bca620ed
    status: completed
    has_parent: False
    result: ActionDataEmpty UID: b0476b6139d543e0adf00128a6bd424f <None>
    logs:

0 
JOB COMPLETED
    
```

In [31]:
# job.subjobs

In [32]:
# Show the parent job's sub-jobs.
job.subjobs

In [33]:
# job.wait().get()

In [34]:
# Display the sub-jobs again (e.g. to check for updated status).
job.subjobs

In [35]:
# Show the log output of the first sub-job.
job.subjobs[0].logs()

starting overlap computation
preparing scenarios and creating indexes
scenario creation progress: 0.90%
Finished creating scenarios (0.4057505130767822s)
Creating indexes
n_gram indexing progress: 0.00%
Finished creating indexes (0.061101436614990234s)
computing overlap
computing overlap progress: 3.83%
Finished computing overlap (0.05202674865722656s)
done



In [36]:
# Wait on every sub-job in turn and collect what each returns, in sub-job order.
results = [subjob.wait().get() for subjob in job.subjobs]

In [37]:
# Each entry is the tuple returned by one sub-job:
# (stats_key_to_input_ids, stats_key_to_reference_ids, stats_key_counts)
results

In [38]:
# Inspect the result tuple of the first sub-job.
results[0]

(defaultdict(set,
             {"{'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}_test_5": {'id328'},
              "{'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}_valid_5": {'id12'}}),
 defaultdict(set, {}),
 defaultdict(int,
             {"{'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}_train_5": 5,
              "{'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}_valid_5": 34,
              "{'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'philosophy'}}_test_5": 311,
              "{'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'anatomy'}}_train_5": 5,
              "{'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'anatomy'}}_valid_5": 1

# Aggregate

In [41]:
# Every sub-job returned (stats_key_to_input_ids, stats_key_to_reference_ids,
# stats_key_counts); transpose the list of results into three per-job tuples.
stats_key_to_input_ids, stats_key_to_reference_ids, stats_key_counts = zip(*results)


def _merge_id_sets(per_job_maps):
    """Union the id collections of several ``{stats_key: ids}`` mappings, key by key."""
    merged = defaultdict(set)
    for job_map in per_job_maps:
        for key, ids in job_map.items():
            # update() copies the ids, so the merged sets never alias a job's own sets.
            merged[key].update(ids)
    return merged


# Instance counts reported for the same stats key are summed across sub-jobs.
total_stats_key_counts = defaultdict(int)
for d in stats_key_counts:
    for key, val in d.items():
        total_stats_key_counts[key] += val

# Both id mappings are merged the same way (one shared helper instead of two
# hand-written loops that had drifted apart).
total_input_ids = _merge_id_sets(stats_key_to_input_ids)
total_reference_ids = _merge_id_sets(stats_key_to_reference_ids)

all_data_overlap_stats = []
for stats_key, count in total_stats_key_counts.items():
    # A stats key is "<scenario_spec>_<split>_<n>"; split from the right because
    # the scenario spec itself contains underscores.
    subject, split, n_str = stats_key.rsplit('_', 2)
    data_overlap_stats = {
        'data_overlap_stats_key': {
            'light_scenario_key': {'scenario_spec': subject, 'split': split},
            'overlap_protocol_spec': {'n': int(n_str)}
        },
        'num_instances': count,
        # Keys without any overlap yield empty lists (defaultdict supplies an empty set).
        'instance_ids_with_overlapping_input': sorted(total_input_ids[stats_key]),
        'instance_ids_with_overlapping_reference': sorted(total_reference_ids[stats_key]),
    }
    all_data_overlap_stats.append(data_overlap_stats)


In [42]:
# `stats_key` still holds the last key visited by the aggregation loop; shown
# here to illustrate the "<scenario_spec>_<split>_<n>" key format.
stats_key

"{'class_name': 'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', 'args': {'subject': 'anatomy'}}_test_5"

In [43]:
# Pretty-print the aggregated overlap statistics.
from pprint import pprint
pprint(all_data_overlap_stats)

[{'data_overlap_stats_key': {'light_scenario_key': {'scenario_spec': "{'class_name': "
                                                                     "'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', "
                                                                     "'args': "
                                                                     "{'subject': "
                                                                     "'philosophy'}}",
                                                    'split': 'train'},
                             'overlap_protocol_spec': {'n': 5}},
  'instance_ids_with_overlapping_input': [],
  'instance_ids_with_overlapping_reference': [],
  'num_instances': 5},
 {'data_overlap_stats_key': {'light_scenario_key': {'scenario_spec': "{'class_name': "
                                                                     "'helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', "
                                                                    