<a href="https://colab.research.google.com/github/Arseny20/robustLLM/blob/dataset/get_logs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
!pip install -U datasets
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
import torch
import json
import re
from datasets import load_dataset


classifier = pipeline("zero-shot-classification",
                     model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
                     device=0 if torch.cuda.is_available() else -1)



Device set to use cuda:0


In [20]:
# Login using e.g. `huggingface-cli login` to access this dataset
df = load_dataset("ai4privacy/pii-masking-400k")["train"]
df

Dataset({
    features: ['source_text', 'locale', 'language', 'split', 'privacy_mask', 'uid', 'masked_text', 'mbert_tokens', 'mbert_token_classes'],
    num_rows: 325517
})

In [21]:
def edit_json(input: str):
    json_dict = input
    json_new = list()
    for item in json_dict:
        label_mapping = {
            "PHONE": "Phone",
            "PASSWORD": "password",
            "USERNAME": "login",
            "EMAIL": "email",
            "GIVENNAME": "FIO",
            "SURNAME": "FIO"
        }
        original_label = item["label"]
        item["label"] = label_mapping.get(original_label, original_label)
        if item["label"] in ["Phone", "password", "login", "email", "FIO"]:
            json_new.append(item)
    return json_new


def edit_text(text: str):
    # Regular expression to match HTML tags
    clean_text = re.sub(r'<[^>]+>', '', text)

    # Optional: Replace common HTML entities
    html_entities = {
        '&nbsp;': ' ',
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&apos;': "'"
    }

    for entity, replacement in html_entities.items():
        clean_text = clean_text.replace(entity, replacement)

    return clean_text.strip()
    return text

In [24]:
labels = [
    "development_logs",
    "application_logs",
    "corporate_logs",
    "other_logs",
    "message",
    "not_logs",
]

texts = list()
masks = list()
for i in range(1000): # здесь вставить кол-во записей
    result = classifier(df[i]["source_text"], labels, multi_label=False)
    if result['labels'][0] in labels[0:3]:
        print(f"{i:6}/{df.shape[0]} {result['labels'][0]:15} {df[i]['source_text'][:100]}...")
        json_to_insert = edit_json(df[i]['privacy_mask'])
        if json_to_insert:
            print(json_to_insert)
            text_to_insert = edit_text(df[i]['source_text'])
            texts.append(text_to_insert)
            masks.append(json_to_insert)

   188/325517 application_logs Application completeness check: <br> Name: Marah Peyrilles <br> Job Title: Therapist, drama <br> ZIP...
[{'label': 'FIO', 'start': 43, 'end': 48, 'value': 'Marah', 'label_index': 1}, {'label': 'FIO', 'start': 49, 'end': 58, 'value': 'Peyrilles', 'label_index': 1}]
   253/325517 corporate_logs  Examination notes residence: 880 Pegglesworth, GL54. Records show 678 606 7954 and financial details...
   254/325517 corporate_logs  Doc details: 800 The Vintage Pair, ZIP GL54 2RJ. Age listed: 36. Social documentation 630.722.6794. ...
   315/325517 corporate_logs  Therapy check-in: Client Alfried Wohlgenannt, residing at White Springs, FL. Concerns over height (1...
[{'label': 'FIO', 'start': 25, 'end': 32, 'value': 'Alfried', 'label_index': 1}, {'label': 'FIO', 'start': 33, 'end': 44, 'value': 'Wohlgenannt', 'label_index': 1}]
   477/325517 development_logs The developmental evaluation of Niranjala Conejera focuses on their ability to engage socially and p...
[{

In [25]:
new_data = pd.DataFrame({
    "source_text": texts,
    "privacy_mask": masks
})
new_data.to_csv('logs_data.csv', index=None)
new_data.head(10)

Unnamed: 0,source_text,privacy_mask
0,Application completeness check: Name: Marah P...,"[{'label': 'FIO', 'start': 43, 'end': 48, 'val..."
1,"Therapy check-in: Client Alfried Wohlgenannt, ...","[{'label': 'FIO', 'start': 25, 'end': 32, 'val..."
2,The developmental evaluation of Niranjala Cone...,"[{'label': 'FIO', 'start': 32, 'end': 41, 'val..."
3,Mrs Vlada's developmental delay noted. Residen...,"[{'label': 'FIO', 'start': 4, 'end': 9, 'value..."
4,fatima.yavru09's medication plan overseen by B...,"[{'label': 'login', 'start': 3, 'end': 17, 'va..."
5,2 Support Plan Document outlining indivi...,"[{'label': 'FIO', 'start': 67, 'end': 72, 'val..."
6,2. Export Compliance Report\nAssessment of the...,"[{'label': 'login', 'start': 112, 'end': 118, ..."
7,Fund Performance: User jhdzvmhiyonunahx273 wit...,"[{'label': 'login', 'start': 38, 'end': 57, 'v..."
8,Analysis of fund 1629729153 in Princeton: Perf...,"[{'label': 'login', 'start': 113, 'end': 122, ..."
9,2 Satellite Licensing Application Complete lic...,"[{'label': 'password', 'start': 238, 'end': 24..."
