<a href="https://colab.research.google.com/github/Arseny20/robustLLM/blob/dataset/get_logs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U datasets
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
import torch
import json
import re
from datasets import load_dataset


classifier = pipeline("zero-shot-classification",
                     model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
                     device=0 if torch.cuda.is_available() else -1)

In [None]:
# Login using e.g. `huggingface-cli login` to access this dataset
df = load_dataset("ai4privacy/pii-masking-400k")["train"]
df

In [3]:
def edit_json(input: str):
    json_dict = input
    json_new = list()
    for item in json_dict:
        label_mapping = {
            "PHONE": "Phone",
            "PASSWORD": "password",
            "USERNAME": "login",
            "EMAIL": "email",
            "GIVENNAME": "FIO",
            "SURNAME": "FIO"
        }
        original_label = item["label"]
        item["label"] = label_mapping.get(original_label, original_label)
        if item["label"] in ["Phone", "password", "login", "email", "FIO"]:
            json_new.append(item)
    return json_new


def edit_text(text: str):
    clean_text = re.sub(r'<[^>]+>', '', text) # <- Убираем HTML-теги

    html_entities = {
        '&nbsp;': ' ',
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&apos;': "'"
    } # <- Замена сущностей с HTML

    for entity, replacement in html_entities.items():
        clean_text = clean_text.replace(entity, replacement)

    return clean_text.strip()
    return text

def word_to_ner(text, label_data):
    text = text.split()
    idx = 0
    new_list = list()
    mapping = ["Phone", "password", "login", "email", "FIO"]
    for word in text:
        if idx < len(label_data) and word == label_data[idx]["value"]:
            new_list.append(mapping.index(label_data[idx]["label"])+1)
            idx += 1
        else:
            new_list.append(0)
    return new_list


def tokens_marking(tokens, token_classes):
    token_mapping = {
        "B-PHONE": "B-Phone",
        "B-PASSWORD": "B-password",
        "B-USERNAME": "B-login",
        "B-EMAIL": "B-email",
        "B-GIVENNAME": "B-FIO",
        "B-SURNAME": "B-FIO",
        "I-PHONE": "I-Phone",
        "I-PASSWORD": "I-password",
        "I-USERNAME": "I-login",
        "I-EMAIL": "I-email",
        "I-GIVENNAME": "I-FIO",
        "I-SURNAME": "I-FIO"
    }

    token_classes = list(map(lambda x: token_mapping.get(x, 'O'), token_classes))

    result = []
    for token, cls in zip(tokens, token_classes):
        result.append((token, cls))

    return result

In [None]:
labels = [
    "development_logs",
    "application_logs",
    "corporate_logs",
    "other_logs",
    "message",
    "not_logs",
]


texts = list()
masks = list()
indices = list()
tokens = list()

for i in range(3000): # <- здесь вставить кол-во записей
    result = classifier(df[i]["source_text"], labels, multi_label=False)
    if result['labels'][0] in labels[0:3]:
        json_to_insert = edit_json(df[i]['privacy_mask'])
        if json_to_insert:
            print(f"{i:6}/{df.shape[0]} {result['labels'][0]:15} {df[i]['source_text'][:100]}...")
            print(json_to_insert)
            text_to_insert = df[i]['source_text'] # edit_text(df[i]['source_text']) <- убранные слова остаются в BIO-разметках, так что не стоит
            ner_indices = word_to_ner(text_to_insert, json_to_insert)
            tokens_to_insert = tokens_marking(df[i]['mbert_tokens'], df[i]['mbert_token_classes'])
            texts.append(text_to_insert)
            masks.append(json_to_insert)
            indices.append(ner_indices)
            tokens.append(tokens_to_insert)

In [5]:
new_data = pd.DataFrame({
    "source_text": texts,
    "privacy_mask": masks,
    "ner_indices": indices,
    "tokens": tokens
})
new_data.to_csv('logs_data.csv', index=None)
new_data.head(10)

Unnamed: 0,source_text,privacy_mask,ner_indices,tokens
0,Application completeness check: <br> Name: Mar...,"[{'label': 'FIO', 'start': 43, 'end': 48, 'val...","[0, 0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(Application, O), (complete, O), (##ness, O),..."
1,"Therapy check-in: Client Alfried Wohlgenannt, ...","[{'label': 'FIO', 'start': 25, 'end': 32, 'val...","[0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(Therapy, O), (check, O), (-, O), (in, O), (:..."
2,The developmental evaluation of Niranjala Cone...,"[{'label': 'FIO', 'start': 32, 'end': 41, 'val...","[0, 0, 0, 0, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(The, O), (development, O), (##al, O), (evalu..."
3,Mrs Vlada's developmental delay noted. Residen...,"[{'label': 'FIO', 'start': 4, 'end': 9, 'value...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(Mrs, O), (Vlad, B-FIO), (##a, I-FIO), (', O)..."
4,<p>fatima.yavru09's medication plan overseen b...,"[{'label': 'login', 'start': 3, 'end': 17, 'va...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(<, O), (p, O), (>, O), (fat, B-login), (##im..."
5,2 Support Plan Document outlining indivi...,"[{'label': 'FIO', 'start': 67, 'end': 72, 'val...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(2, O), (Support, O), (Plan, O), (Document, O..."
6,2. Export Compliance Report\nAssessment of the...,"[{'label': 'login', 'start': 112, 'end': 118, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(2, O), (., O), (Export, O), (Com, O), (##pli..."
7,<html><body><p>Fund Performance: User jhdzvmhi...,"[{'label': 'login', 'start': 38, 'end': 57, 'v...","[0, 0, 0, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(<, O), (html, O), (>, O), (<, O), (body, O),..."
8,<html><body><p>Analysis of fund 1629729153 in ...,"[{'label': 'login', 'start': 113, 'end': 122, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(<, O), (html, O), (>, O), (<, O), (body, O),..."
9,<html><body>2 Satellite Licensing Application ...,"[{'label': 'password', 'start': 238, 'end': 24...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[(<, O), (html, O), (>, O), (<, O), (body, O),..."
