In [1]:
%pip install datasets

Note: you may need to restart the kernel to use updated packages.


In [2]:
from datasets import load_dataset, DatasetDict, Dataset
from typing import Any

In [3]:
DATASET_NAME: str = "ai4privacy/open-pii-masking-500k-ai4privacy"

# Load only the splits we need (train and validation)
# The dataset will be cached locally after the first download
raw_dataset: DatasetDict = load_dataset(
    DATASET_NAME,
    split=None,  # Load all available splits, then we select what we need
)

# Display basic information about the downloaded dataset
print(f"Dataset: {DATASET_NAME}")
print(f"Available splits: {list(raw_dataset.keys())}")
print()

for split_name, split_data in raw_dataset.items():
    print(f"  {split_name}: {len(split_data):,} examples")

Dataset: ai4privacy/open-pii-masking-500k-ai4privacy
Available splits: ['train', 'validation']

  train: 464,150 examples
  validation: 116,077 examples


In [4]:
# Display the features (schema) of the dataset
print("Dataset Features (Schema):")
print(raw_dataset["train"].features)
print()

Dataset Features (Schema):
{'source_text': Value('string'), 'masked_text': Value('string'), 'privacy_mask': List({'label': Value('string'), 'start': Value('int64'), 'end': Value('int64'), 'value': Value('string'), 'label_index': Value('int64')}), 'split': Value('string'), 'uid': Value('int64'), 'language': Value('string'), 'region': Value('string'), 'script': Value('string'), 'mbert_tokens': List(Value('string')), 'mbert_token_classes': List(Value('string'))}



In [5]:
# Display a single example to understand the data format
print("Sample record from training set:")
sample_record: dict[str, Any] = raw_dataset["train"][0]
for key, value in sample_record.items():
    # Truncate long values for readability
    display_value = str(value)
    if len(display_value) > 200:
        display_value = display_value[:200] + "..."
    print(f"  {key}: {display_value}")

Sample record from training set:
  source_text: 20:10:26 Venanzius Höttermann Revés యొక్క వివాహం July/95 నాడు జరిగింది, Tadaలో Faizabad Road వద్ద.
  masked_text: [TIME_1] [GIVENNAME_1] [SURNAME_1] యొక్క వివాహం [DATE_1] నాడు జరిగింది, [CITY_1]లో [STREET_1] వద్ద.
  privacy_mask: [{'label': 'TIME', 'start': 0, 'end': 8, 'value': '20:10:26', 'label_index': 1}, {'label': 'GIVENNAME', 'start': 9, 'end': 18, 'value': 'Venanzius', 'label_index': 1}, {'label': 'SURNAME', 'start': 19...
  split: train
  uid: 5387382
  language: te
  region: IN
  script: Telu
  mbert_tokens: ['20', ':', '10', ':', '26', 'Ve', '##nan', '##ziu', '##s', 'H', '##ötter', '##mann', 'Rev', '##és', 'యొక్క', 'వి', '##వ', '##ా', '##హం', 'July', '/', '95', 'న', '##ాడు', 'జరిగింది', ',', 'Tada', '##ల...
  mbert_token_classes: ['B-TIME', 'I-TIME', 'I-TIME', 'I-TIME', 'I-TIME', 'B-GIVENNAME', 'I-GIVENNAME', 'I-GIVENNAME', 'I-GIVENNAME', 'B-SURNAME', 'I-SURNAME', 'I-SURNAME', 'I-SURNAME', 'I-SURNAME', 'O', 'O', 'O', 'O', 'O',..

In [6]:
# Show the distribution of languages in the training set
print("\nLanguage distribution in training set:")
languages: list[str] = raw_dataset["train"]["language"]
unique_languages: set[str] = set(languages)
print(f"  Unique languages: {sorted(unique_languages)}")


Language distribution in training set:
  Unique languages: ['de', 'en', 'es', 'fr', 'hi', 'it', 'nl', 'te']


In [7]:
# Define the target language identifiers we consider as "English"
ENGLISH_IDENTIFIERS: frozenset[str] = frozenset({"en", "english", "English"})


def is_english(example: dict[str, Any]) -> bool:
    """
    Determine if a dataset example is in English.
    
    Args:
        example: A single record from the dataset containing a 'language' field.
        
    Returns:
        True if the example's language is identified as English, False otherwise.
    """
    language: str | None = example.get("language")
    if language is None:
        return False
    return language in ENGLISH_IDENTIFIERS

In [8]:
# Apply the filter to each split
# The filter operation is lazy by default but we use batched=False for row-wise filtering
filtered_dataset: DatasetDict = DatasetDict()

for split_name in raw_dataset.keys():
    print(f"Filtering {split_name} split...")
    
    original_count: int = len(raw_dataset[split_name])
    
    # Filter the split for English-only examples
    filtered_split: Dataset = raw_dataset[split_name].filter(
        is_english,
        desc=f"Filtering {split_name} for English",
    )
    
    filtered_count: int = len(filtered_split)
    retained_pct: float = (filtered_count / original_count) * 100 if original_count > 0 else 0.0
    
    filtered_dataset[split_name] = filtered_split
    
    print(f"  Original: {original_count:,} → Filtered: {filtered_count:,} ({retained_pct:.1f}% retained)")

print("\nFiltering complete.")

Filtering train split...


Filtering train for English:   0%|          | 0/464150 [00:00<?, ? examples/s]

  Original: 464,150 → Filtered: 120,533 (26.0% retained)
Filtering validation split...


Filtering validation for English:   0%|          | 0/116077 [00:00<?, ? examples/s]

  Original: 116,077 → Filtered: 30,160 (26.0% retained)

Filtering complete.


In [9]:
print("Verification of filtered dataset:")
print("=" * 50)

for split_name, split_data in filtered_dataset.items():
    print(f"\n{split_name.upper()} split:")
    print(f"  Total examples: {len(split_data):,}")
    
    # Check language distribution (should only contain English)
    if len(split_data) > 0:
        languages_in_split: list[str] = split_data["language"]
        unique_langs: set[str] = set(languages_in_split)
        print(f"  Languages present: {sorted(unique_langs)}")
        
        # Sanity check: all should be English
        non_english = [lang for lang in unique_langs if lang not in ENGLISH_IDENTIFIERS]
        if non_english:
            print(f"  WARNING: Non-English languages found: {non_english}")
        else:
            print("  ✓ All examples are English")
    else:
        print("  WARNING: No examples in this split after filtering!")

# Display a sample from the filtered training data
print("\n" + "=" * 50)
print("Sample record from filtered training set:")
if len(filtered_dataset["train"]) > 0:
    sample: dict[str, Any] = filtered_dataset["train"][0]
    print(f"  Language: {sample['language']}")
    print(f"  Source text (truncated): {str(sample.get('source_text', 'N/A'))[:300]}...")

Verification of filtered dataset:

TRAIN split:
  Total examples: 120,533
  Languages present: ['en']
  ✓ All examples are English

VALIDATION split:
  Total examples: 30,160
  Languages present: ['en']
  ✓ All examples are English

Sample record from filtered training set:
  Language: en
  Source text (truncated): To-do list for 4th August 1942: meet with Brandy Haroon at 10:17 to discuss the volunteer service record of [ORGANISATIONPLACEHOLDER_14]....


In [10]:
OUTPUT_DIR: str = "./data/ai4privacy_english_only"

# Save the filtered dataset to disk in Arrow format (efficient for large datasets)
filtered_dataset.save_to_disk(OUTPUT_DIR)

print(f"Filtered dataset saved to: {OUTPUT_DIR}")
print("To reload later, use:")
print(f'  from datasets import load_from_disk')
print(f'  dataset = load_from_disk("{OUTPUT_DIR}")')

Saving the dataset (0/1 shards):   0%|          | 0/120533 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/30160 [00:00<?, ? examples/s]

Filtered dataset saved to: ./data/ai4privacy_english_only
To reload later, use:
  from datasets import load_from_disk
  dataset = load_from_disk("./data/ai4privacy_english_only")
