In [1]:
# stdlib
from collections import defaultdict
import glob
import json
import os
import re
from string import punctuation

# third party
from nltk import ngrams
import tqdm

In [2]:
# # stdlib
# import subprocess

# helm_process = subprocess.run(
#     [
#         "python",
#         "/home/teo/helm/scripts/data_overlap/compute_data_overlap_metrics.py",
#         "--scenario-data",
#         "/home/teo/helm/scripts/data_overlap/scenario_data.jsonl",
#         "--input-data",
#         "short_input.jsonl",
#         "--output-stats",
#         "/home/teo/helm/scripts/data_overlap/output_stats.jsonl",
#         "--input-format",
#         "the_pile",
#     ]
# )

In [3]:
# Labels for the two parts of a scenario instance. They are used both as the
# keys read from each instance dict and as the trailing component of every
# entry stored in the ngram index.
PART_INPUT: str = "input"
PART_REF: str = "references"

# Tokenizer: splits text on any run of whitespace and/or ASCII punctuation.
r = re.compile(r"[\s" + re.escape(punctuation) + r"]+")


def create_ngram_index(light_scenarios, n_values, stats_key_counts):
    """Build an inverted index from scenario ngrams to the instances containing them.

    Args:
        light_scenarios: iterable of dicts, each with a "scenario_key" string
            and an "instances" list. Every instance is a dict with a non-empty
            string "id", a string "input" and an iterable of string
            "references".
        n_values: re-iterable collection (e.g. list) of ngram sizes to index.
        stats_key_counts: mutable mapping, updated in place so that
            "<scenario_key>_<n>" maps to the number of instances of that
            scenario.

    Returns:
        A dict mapping each n to a dict of {ngram: set of entry strings}, where
        an entry string is "<stats_key>+<instance id>+<part>" and <part> is
        PART_INPUT or PART_REF.

    Raises:
        AssertionError: if an instance has an empty id.
    """
    ngram_index = {n: {} for n in n_values}
    for scenario in tqdm.tqdm(light_scenarios):
        instances = scenario["instances"]
        # One stats key per ngram size, i.e. "<scenario_key>_<n>".
        stats_keys = {n: scenario["scenario_key"] + "_" + str(n) for n in n_values}
        for stats_key in stats_keys.values():
            stats_key_counts[stats_key] = len(instances)

        for instance in instances:
            instance_id = instance["id"]
            assert instance_id

            # Tokenize once per instance: the token lists do not depend on n,
            # so there is no need to redo this work for every ngram size.
            input_tokens = r.split(instance["input"].lower())
            reference_token_lists = [
                r.split(reference.lower()) for reference in instance["references"]
            ]

            for n in n_values:
                index_for_n = ngram_index[n]
                entry_prefix = stats_keys[n] + "+" + instance_id + "+"

                input_entry = entry_prefix + PART_INPUT
                for input_ngram in ngrams(input_tokens, n):
                    index_for_n.setdefault(input_ngram, set()).add(input_entry)

                reference_entry = entry_prefix + PART_REF
                for reference_tokens in reference_token_lists:
                    for reference_ngram in ngrams(reference_tokens, n):
                        index_for_n.setdefault(reference_ngram, set()).add(
                            reference_entry
                        )
    return ngram_index

In [4]:
def compute_document_data_overlap(document, ngram_index):
    """Find the scenario instances that share at least one ngram with a document.

    Args:
        document: text of the document; it is lower-cased and tokenized with
            the module-level regex ``r``.
        ngram_index: dict mapping each ngram size n to a dict of
            {ngram: set of "<stats_key>+<instance id>+<part>" entries}.

    Returns:
        A tuple ``(stats_key_to_input_ids, stats_key_to_reference_ids)`` of
        defaultdicts mapping a stats key to the set of instance ids whose
        input (respectively references) overlap with the document.
    """
    stats_key_to_input_ids = defaultdict(set)
    stats_key_to_reference_ids = defaultdict(set)
    document_tokens = r.split(document.lower())
    for n, index_for_n in ngram_index.items():
        for document_ngram in ngrams(document_tokens, n):
            for entry_overlap_key in index_for_n.get(document_ngram, ()):
                # Split from the right: the stats key is derived from free-form
                # scenario text and may itself contain "+", whereas the id and
                # part are the last two "+"-separated components.
                stats_key, instance_id, part = entry_overlap_key.rsplit("+", 2)
                if part == PART_INPUT:
                    stats_key_to_input_ids[stats_key].add(instance_id)
                elif part == PART_REF:
                    stats_key_to_reference_ids[stats_key].add(instance_id)
    return stats_key_to_input_ids, stats_key_to_reference_ids

In [5]:
# Location of the scenario JSONL file. Set the SCENARIO_DATA_PATH environment
# variable to point at a different copy; otherwise the original local path is used.
scenario_data_path = os.environ.get(
    "SCENARIO_DATA_PATH", "/Users/koen/Downloads/filtered_scenario_data_new.jsonl"
)

In [8]:
# stdlib
import sys

In [10]:
# Read the whole scenario file into memory; the context manager closes the
# file handle as soon as the read completes.
with open(scenario_data_path) as scenario_file:
    all_lines = scenario_file.read()

In [16]:
# Size of the raw scenario text object in memory, in units of 10**6 bytes.
sys.getsizeof(all_lines) / 1_000_000

167.00667

In [18]:
%%time
# Parse the scenario JSONL file into lightweight dicts of the form
# {"scenario_key": str, "instances": [{"input", "references", "id"}, ...]}.
light_scenarios = []
with open(scenario_data_path) as scenario_file:
    # The context manager closes the file handle once all lines are read.
    light_scenario_jsons = scenario_file.readlines()

for light_scenario_json in light_scenario_jsons:
    # Tolerate blank lines (e.g. a trailing newline at the end of the file),
    # which json.loads would otherwise reject.
    if not light_scenario_json.strip():
        continue
    light_scenario_dict: dict = json.loads(light_scenario_json)

    light_scenario_key_dict: dict = light_scenario_dict["scenario_key"]
    # if the light_scenarios are exported from helm, they will have a scenario_spec field
    # subject_spec = light_scenario_key_dict["scenario_spec"]['args']['subject']
    # The whole spec is stringified so it can be embedded in the scenario key.
    scenario_spec = str(light_scenario_key_dict["scenario_spec"])
    light_scenario_key = scenario_spec + "_" + light_scenario_key_dict["split"]
    light_instances = [
        {
            "input": instance_dict[PART_INPUT],
            "references": instance_dict[PART_REF],
            "id": instance_dict["id"],
        }
        for instance_dict in light_scenario_dict["instances"]
    ]
    light_scenarios.append(
        {"scenario_key": light_scenario_key, "instances": light_instances}
    )

CPU times: user 663 ms, sys: 149 ms, total: 812 ms
Wall time: 812 ms


In [19]:
%%time
# Run configuration.
input_data_path = "short_input.jsonl"
output_path = "output2.jsonl"
normalization = "default"
N = [5, 9, 13]  # ngram sizes to index

# scenario_data_path comes from an earlier cell and is only echoed here.
# (Alternative location used previously:
#  /home/teo/helm/scripts/data_overlap/scenario_data.jsonl)
print(f"Loading scenario data from {scenario_data_path}")

# stats_key_counts is filled in by create_ngram_index with the number of
# instances per "<scenario_key>_<n>" key.
stats_key_counts = defaultdict(int)
ngram_index = create_ngram_index(light_scenarios, N, stats_key_counts)

The input data will be loaded from ['short_input.jsonl']
Loading scenario data from /Users/koen/Downloads/filtered_scenario_data_new.jsonl


100%|█████████████████████████████████████████████████| 241/241 [27:11<00:00,  6.77s/it]

CPU times: user 4min 48s, sys: 12min 52s, total: 17min 41s
Wall time: 27min 11s





In [None]:
# SETUP
# Resolve the input location into a flat list of files: a directory is walked
# recursively and only regular files are kept; anything else is treated as a
# single file path.
if os.path.isdir(input_data_path):
    recursive_pattern = os.path.join(input_data_path, "**/*")
    input_file_paths = list(
        filter(os.path.isfile, glob.iglob(recursive_pattern, recursive=True))
    )
else:
    input_file_paths = [input_data_path]
print(f"The input data will be loaded from {input_file_paths}")

In [16]:
# Per-document results: one {stats_key: set(instance ids)} dict per document.
stats_key_to_input_ids = []
stats_key_to_reference_ids = []

# BATCH PROCESSING
for input_file_path in tqdm.tqdm(
    input_file_paths,
    desc="Computing overlap stats for input files",
    disable=None,
):
    with open(input_file_path) as f:
        for line in f:
            # Skip blank lines (e.g. a trailing newline), which are not valid JSON.
            if not line.strip():
                continue
            document = json.loads(line)["text"]
            doc_input_ids, doc_ref_ids = compute_document_data_overlap(
                document=document,
                ngram_index=ngram_index,
            )
            stats_key_to_input_ids.append(doc_input_ids)
            stats_key_to_reference_ids.append(doc_ref_ids)

# AGGREGATION
# Merge the per-document id sets into one set per stats key. The sets are
# updated in place so the accumulated set is not copied for every document.
total_input_ids = defaultdict(set)
total_reference_ids = defaultdict(set)

for per_document_ids, totals in (
    (stats_key_to_input_ids, total_input_ids),
    (stats_key_to_reference_ids, total_reference_ids),
):
    for document_ids in per_document_ids:
        for stats_key, instance_ids in document_ids.items():
            totals[stats_key].update(instance_ids)

# One record per "<scenario_spec>_<split>_<n>" stats key, including keys with
# no overlap at all (their id lists are empty).
all_data_overlap_stats = []
for stats_key, count in stats_key_counts.items():
    # Split from the right: the scenario spec part may itself contain "_".
    subject, split, n_str = stats_key.rsplit("_", 2)
    all_data_overlap_stats.append(
        {
            "data_overlap_stats_key": {
                "light_scenario_key": {"scenario_spec": subject, "split": split},
                "overlap_protocol_spec": {"n": int(n_str)},
            },
            "num_instances": count,
            "instance_ids_with_overlapping_input": sorted(total_input_ids[stats_key]),
            "instance_ids_with_overlapping_reference": sorted(
                total_reference_ids[stats_key]
            ),
        }
    )

# Write the results as JSON Lines, one record per line.
with open(output_path, "w") as f:
    f.writelines(
        f"{json.dumps(data_overlap_stats)}\n"
        for data_overlap_stats in all_data_overlap_stats
    )
print(f"Written {len(all_data_overlap_stats)} results to {output_path}")

Written 723 results to output2.jsonl


In [6]:
# syft absolute

