In [2]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
import torch
from datasets import load_dataset


# Zero-shot classification pipeline built on a multilingual mDeBERTa NLI
# checkpoint. Runs on the first CUDA device when one is available and on the
# CPU (device index -1) otherwise.
classifier = pipeline(
    task="zero-shot-classification",
    model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
    device=-1 if not torch.cuda.is_available() else 0,
)

Device set to use cuda:0


In [3]:
!pip install -U datasets
# Login using e.g. `huggingface-cli login` to access this dataset
df = load_dataset("ai4privacy/pii-masking-400k")["train"]
df



Dataset({
    features: ['source_text', 'locale', 'language', 'split', 'privacy_mask', 'uid', 'masked_text', 'mbert_tokens', 'mbert_token_classes'],
    num_rows: 325517
})

In [5]:
# Candidate labels offered to the zero-shot classifier for every record.
labels = [
    "development_logs",
    "application_logs",
    "corporate_logs",
    "other_logs",
    "message",
    "not_logs",
]
# A record is kept only when its first-ranked label is one of the first three
# candidates (development / application / corporate logs).
kept_labels = labels[0:3]

# Number of records to scan; put the desired number of records here.
N_RECORDS = 10000

texts = list()
masks = list()
# (row index, exception) for every record that could not be processed, so
# failures stay visible instead of being silently discarded.
skipped = []

total_rows = df.shape[0]
# Clamp to the dataset size so a too-large N_RECORDS cannot index past the end.
for i in range(min(N_RECORDS, total_rows)):
    try:
        # Fetch the row once; every `df[i]` access builds the row again.
        row = df[i]
        source_text = row["source_text"]
        result = classifier(source_text, labels, multi_label=False)
        # First entry of the returned label list (the transformers docs
        # describe this list as sorted by score, best first).
        top_label = result['labels'][0]
        if top_label in kept_labels:
            # Read the mask before appending anything so `texts` and `masks`
            # always stay the same length.
            privacy_mask = row['privacy_mask']
            print(f"{i:6}/{total_rows} {top_label:15} {source_text[:100]}...")
            texts.append(source_text)
            masks.append(privacy_mask)
    except Exception as exc:
        # Best-effort: one bad record must not stop the run. `Exception`
        # (not a bare `except`) lets KeyboardInterrupt stop the loop.
        skipped.append((i, exc))

if skipped:
    first_index, first_error = skipped[0]
    print(f"Skipped {len(skipped)} record(s) due to errors; "
          f"first failure at row {first_index}: {first_error!r}")

   188/325517 application_logs Application completeness check: <br> Name: Marah Peyrilles <br> Job Title: Therapist, drama <br> ZIP...
   253/325517 corporate_logs  Examination notes residence: 880 Pegglesworth, GL54. Records show 678 606 7954 and financial details...
   254/325517 corporate_logs  Doc details: 800 The Vintage Pair, ZIP GL54 2RJ. Age listed: 36. Social documentation 630.722.6794. ...
   315/325517 corporate_logs  Therapy check-in: Client Alfried Wohlgenannt, residing at White Springs, FL. Concerns over height (1...
   477/325517 development_logs The developmental evaluation of Niranjala Conejera focuses on their ability to engage socially and p...
   583/325517 development_logs Mrs Vlada's developmental delay noted. Residence at Kemble Close, CW2 6XN, proximity ([53.0822, -2.4...
   623/325517 development_logs <p>fatima.yavru09's medication plan overseen by Business Account. Therapy sessions begin 10/04/2054....
   683/325517 development_logs 2    Support Plan    Docume

In [9]:
# Assemble the collected texts and their privacy masks into one table,
# one row per kept record.
new_data = pd.DataFrame({
    "source_text": texts,
    "privacy_mask": masks
})
# index=False is the documented way to leave the row index out of the CSV
# (index=None only worked because None is falsy).
new_data.to_csv('logs_data.csv', index=False)
# The preview goes last: a notebook cell renders only its final expression,
# so a `head()` call in the middle of the cell displays nothing.
new_data.head(10)