<a href="https://colab.research.google.com/github/Arseny20/robustLLM/blob/dataset/get_logs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U datasets
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
import torch
import json
import re
from datasets import load_dataset


# Zero-shot classification pipeline backed by the
# MoritzLaurer/mDeBERTa-v3-base-mnli-xnli checkpoint.
# device is 0 when CUDA is available and -1 otherwise
# (NOTE(review): -1 is the transformers convention for CPU — confirm
# against the installed transformers version).
classifier = pipeline("zero-shot-classification",
                     model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli",
                     device=0 if torch.cuda.is_available() else -1)

In [None]:
# Login using e.g. `huggingface-cli login` to access this dataset
# Load the dataset and keep only its "train" split.
# NOTE(review): despite the name, `df` is presumably a `datasets.Dataset`
# rather than a pandas DataFrame — confirm.
df = load_dataset("ai4privacy/pii-masking-400k")["train"]
# Bare expression so the notebook cell displays the loaded object.
df

In [None]:
def edit_json(input: list[dict]) -> list[dict]:
    """Filter a privacy mask down to the supported labels and rename them.

    Each entry's ``"label"`` is translated through a fixed table
    (``PHONE`` -> ``Phone``, ``PASSWORD`` -> ``password``,
    ``USERNAME`` -> ``login``, ``EMAIL`` -> ``email``,
    ``GIVENNAME``/``SURNAME`` -> ``FIO``). Entries whose label is not in the
    table and is not already one of the target names are dropped.

    Args:
        input: Iterable of dicts, each with at least a ``"label"`` key.
            (The name shadows the builtin; it is kept so existing keyword
            callers keep working.)

    Returns:
        A new list of relabelled copies of the kept entries, in their
        original order. The dicts passed in are left untouched.

    Raises:
        KeyError: If an entry has no ``"label"`` key.
    """
    # Built once per call rather than once per entry.
    label_mapping = {
        "PHONE": "Phone",
        "PASSWORD": "password",
        "USERNAME": "login",
        "EMAIL": "email",
        "GIVENNAME": "FIO",
        "SURNAME": "FIO",
    }
    kept_labels = set(label_mapping.values())

    json_new = []
    for item in input:
        original_label = item["label"]
        new_label = label_mapping.get(original_label, original_label)
        if new_label in kept_labels:
            # Copy instead of assigning into `item`, so the caller's data
            # is not silently rewritten.
            json_new.append({**item, "label": new_label})
    return json_new


def edit_text(text: str) -> str:
    """Strip HTML tags from *text* and decode a small set of HTML entities.

    Tags (anything matching ``<...>``) are removed first, then the entities
    ``&nbsp;``, ``&amp;``, ``&lt;``, ``&gt;``, ``&quot;`` and ``&apos;`` are
    replaced by their characters, and surrounding whitespace is trimmed.

    Args:
        text: Raw text that may contain HTML markup.

    Returns:
        The cleaned, stripped text.
    """
    html_entities = {
        '&nbsp;': ' ',
        '&amp;': '&',
        '&lt;': '<',
        '&gt;': '>',
        '&quot;': '"',
        '&apos;': "'",
    }

    # Regular expression to match HTML tags
    clean_text = re.sub(r'<[^>]+>', '', text)

    # Decode every entity in a single pass. Chained str.replace() calls
    # would decode twice: "&amp;lt;" -> "&lt;" -> "<", although the input
    # encodes the literal text "&lt;".
    entity_pattern = '|'.join(re.escape(entity) for entity in html_entities)
    clean_text = re.sub(
        entity_pattern,
        lambda match: html_entities[match.group(0)],
        clean_text,
    )

    return clean_text.strip()

def word_to_ner(text, label_data):
    """Produce one integer class id per whitespace-separated word of *text*.

    Entities in *label_data* are located in order: each one is searched for
    after the end of the previously located entity. A word belonging to a
    located entity gets ``index + 1`` of the entity's label in
    ``["Phone", "password", "login", "email", "FIO"]``; all other words
    get ``0``.

    An entity value may span several words (e.g. ``"555 1234"``). An entity
    that cannot be found is skipped, so it does not prevent later entities
    from being tagged. Matching is on whole words, so a value with
    punctuation glued to it in the text (``"john@x.com,"``) is not matched.

    Args:
        text: The text to tag.
        label_data: Sequence of dicts with string ``"value"`` and
            ``"label"`` keys, expected in order of appearance in *text*.

    Returns:
        A list of ints with the same length as ``text.split()``.

    Raises:
        ValueError: If a located entity's label is not one of the five
            supported names.
    """
    words = text.split()
    mapping = ["Phone", "password", "login", "email", "FIO"]
    tags = [0] * len(words)

    cursor = 0  # first word index not yet consumed by a located entity
    for entity in label_data:
        value_words = entity["value"].split()
        span = len(value_words)
        if span == 0:
            continue
        for start in range(cursor, len(words) - span + 1):
            if words[start:start + span] == value_words:
                tag = mapping.index(entity["label"]) + 1
                tags[start:start + span] = [tag] * span
                cursor = start + span
                break
    return tags


def tokens_marking(tokens, token_classes):
    """Pair every token with its renamed BIO tag.

    Tags of the form ``B-<NAME>`` / ``I-<NAME>`` for PHONE, PASSWORD,
    USERNAME, EMAIL, GIVENNAME and SURNAME are renamed to the prefixed forms
    of Phone, password, login, email, FIO and FIO respectively. Any other
    tag becomes ``'O'``.

    Args:
        tokens: Iterable of tokens.
        token_classes: Iterable of tag strings, aligned with *tokens*.

    Returns:
        A list of ``(token, tag)`` tuples, as long as the shorter input.
    """
    renamed_entities = {
        "PHONE": "Phone",
        "PASSWORD": "password",
        "USERNAME": "login",
        "EMAIL": "email",
        "GIVENNAME": "FIO",
        "SURNAME": "FIO",
    }
    # Expand the entity table to both BIO prefixes.
    tag_lookup = {
        f"{prefix}-{source}": f"{prefix}-{target}"
        for prefix in ("B", "I")
        for source, target in renamed_entities.items()
    }

    # Translate all tags first, then pair them up with the tokens.
    translated = [tag_lookup.get(tag, 'O') for tag in token_classes]
    return list(zip(tokens, translated))

In [None]:
# Candidate labels offered to the zero-shot classifier.
labels = [
    "development_logs",
    "application_logs",
    "corporate_logs",
    "other_logs",
    "message",
    "not_logs",
]
# Only records whose top-scoring label is one of the first three are kept.
log_labels = labels[0:3]

# Parallel lists, one entry per kept record.
texts = list()
masks = list()
indices = list()
tokens = list()

# Insert the number of records to process here; capped at the dataset size
# so a shorter dataset does not raise an IndexError.
num_records = min(3000, df.shape[0])

for i in range(num_records):
    # Fetch the row once instead of re-indexing the dataset for every field.
    row = df[i]
    source_text = row["source_text"]

    result = classifier(source_text, labels, multi_label=False)
    top_label = result['labels'][0]
    if top_label not in log_labels:
        continue

    print(f"{i:6}/{df.shape[0]} {top_label:15} {source_text[:100]}...")
    json_to_insert = edit_json(row['privacy_mask'])
    if not json_to_insert:
        # No supported PII entries in this record; nothing to store.
        continue

    print(json_to_insert)
    text_to_insert = edit_text(source_text)
    ner_indices = word_to_ner(text_to_insert, json_to_insert)
    tokens_to_insert = tokens_marking(row['mbert_tokens'], row['mbert_token_classes'])
    texts.append(text_to_insert)
    masks.append(json_to_insert)
    indices.append(ner_indices)
    tokens.append(tokens_to_insert)

In [None]:
# Assemble the collected per-record lists into one table; each list
# becomes a column.
new_data = pd.DataFrame({
    "source_text": texts,
    "privacy_mask": masks,
    "ner_indices": indices,
    "tokens": tokens,
})
# `index` is documented as a bool; False leaves the row index out of the
# file, which is what the previous `index=None` achieved only through
# truthiness.
new_data.to_csv('logs_data.csv', index=False)
# Bare expression so the notebook cell displays the first ten rows.
new_data.head(10)