In [None]:
import pandas as pd

In [None]:
# Directory containing the pre-split token/label text files
# (train-ready.txt, dev-ready.txt, test-ready.txt) loaded by this notebook.
DATASET_PATH = "/workspace/resources/data/restricted/anonymization/data-splits-2.0"

In [None]:
def _read_split(filename):
    """Load one split file from DATASET_PATH as a two-column DataFrame.

    Each line is split on a single space into the columns ``token`` and
    ``label``. Blank lines are kept (``skip_blank_lines=False``), so they show
    up as rows of missing values instead of being dropped.
    """
    return pd.read_csv(
        f"{DATASET_PATH}/{filename}",
        sep=" ",
        names=["token", "label"],
        skip_blank_lines=False,
    )


train = _read_split("train-ready.txt")
val = _read_split("dev-ready.txt")
test = _read_split("test-ready.txt")

# Row counts per split (these include the rows produced by blank lines).
for caption, frame in (("train:", train), ("val:", val), ("test:", test)):
    print(caption, len(frame))

In [None]:
import re
import numpy as np
from more_itertools import collapse, unique_everseen

# Distinct label strings seen in the training split, in sorted order.
# `dropna()` removes the missing values explicitly (blank lines are loaded as
# NaN rows) rather than relying on `np.nan` being the very same object when it
# is subtracted from a set -- a NaN left behind would make `sorted` fail.
raw_labels = sorted(set(train["label"].dropna()) - {"O"})

# Entity names in order of first appearance among the sorted raw labels.
# The pattern is anchored so only a leading "B-"/"I-" prefix is stripped; an
# unanchored pattern would also eat those two characters inside an entity name.
# `dict.fromkeys` de-duplicates while preserving insertion order.
entity_types = dict.fromkeys(re.sub(r"^[BI]-", "", label) for label in raw_labels)

# Final vocabulary: "O" first, then a B-/I- pair for every entity, so both
# variants always exist even if only one of them occurs in the training data.
labels = ["O"]
for entity in entity_types:
    labels.extend((f"B-{entity}", f"I-{entity}"))
print(labels)

# Bidirectional mapping between integer codes and label strings.
code2label = dict(enumerate(labels))
label2code = {label: code for code, label in code2label.items()}

# The three counts must agree; a mismatch would reveal duplicated labels.
print("nlabels:", len(labels))
print("nlabels:", len(code2label))
print("nlabels:", len(label2code))

In [None]:
# Show the label -> integer code mapping for a visual check.
label2code

In [None]:
from aymurai.datasets.utils import pandas_to_dataset
from datasets import DatasetDict


# Pair every split name used in the DatasetDict with its source DataFrame.
split_frames = {"train": train, "validation": val, "test": test}

# Convert each frame with `pandas_to_dataset`, passing the shared label mapping,
# and collect the results under their split names.
dataset = DatasetDict(
    {
        split_name: pandas_to_dataset(split_frame, label2code)
        for split_name, split_frame in split_frames.items()
    }
)
dataset

### drop duplicates

In [None]:
from joblib import hash

# Convert every split to a pandas DataFrame so paragraphs can be compared row by row.
df_train = dataset["train"].to_pandas()
df_dev = dataset["validation"].to_pandas()
df_test = dataset["test"].to_pandas()

for split_df in (df_train, df_dev, df_test):
    # Fingerprint each row: join its tokens with single spaces and hash the
    # resulting text (`hash` is the joblib function imported in this cell,
    # not the builtin), which makes duplicate detection cheap.
    split_df["hash"] = split_df["tokens"].str.join(" ").apply(hash)
    # Keep only the first row for every fingerprint within the split.
    split_df.drop_duplicates(subset="hash", inplace=True)

# Fingerprint sets of the de-duplicated train and validation splits.
train_hash, dev_hash = set(df_train["hash"]), set(df_dev["hash"])

In [None]:
from aymurai.utils.display.pandas import pandas_context

# Pandas display options applied only inside the `with` block below.
options = {"display.max_colwidth": 0}

with pandas_context(**options):
    # Train rows whose fingerprint also appears in the validation split.
    # The extra column is added with `.assign`, which returns a new frame;
    # assigning a column directly into a `query` result triggers pandas'
    # SettingWithCopyWarning.
    aux = df_train.query("hash in @train_hash and hash in @dev_hash").assign(
        ntags=lambda frame: frame["tags"].apply(lambda tags: np.sum(tags))
    )
    # Show only the shared rows whose tag values sum to more than zero
    # (presumably rows with at least one non-"O" tag, since "O" is code 0 in
    # label2code -- confirm that `tags` holds those codes).
    display(aux.query("ntags > 0"))

In [None]:
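# Drop paragraphs shared between datasets.
# NOTE(review): both filters below are currently disabled, so paragraphs that
# appear in more than one split are kept in the saved dataset even though its
# name says "pruned" -- confirm this is intentional.
# df_dev.query("hash not in @train_hash", inplace=True)
# df_test.query("hash not in @train_hash and hash not in @dev_hash", inplace=True)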

In [None]:
# Only `DatasetDict` is imported in the visible cells of this notebook, so
# `Dataset` must be brought into scope here or this cell raises NameError on a
# fresh kernel.
from datasets import Dataset

# Rebuild every split from its de-duplicated DataFrame.
# preserve_index=False: `drop_duplicates` leaves a non-default index behind,
# which `from_pandas` would otherwise store as an extra `__index_level_0__`
# column in the saved dataset.
dataset["train"] = Dataset.from_pandas(df_train, preserve_index=False)
dataset["validation"] = Dataset.from_pandas(df_dev, preserve_index=False)
dataset["test"] = Dataset.from_pandas(df_test, preserve_index=False)

In [None]:
# Fingerprints present in both the train and validation splits;
# an empty set means the two splits share no paragraphs.
set(dataset["train"]["hash"]) & set(dataset["validation"]["hash"])

In [None]:
# Show the DatasetDict summary after the splits were rebuilt from the pandas frames.
dataset

## save dataset

In [None]:
import srsly

# Output directory for the processed dataset.
# NOTE(review): this path is rooted at /resources while DATASET_PATH is rooted
# at /workspace/resources -- confirm the difference is intentional.
DATASET_NAME = (
    "/resources/data/restricted/anonymization/annonimization-dataset-pruned-2023-09-06"
)

# Persist all splits under DATASET_NAME.
dataset.save_to_disk(DATASET_NAME)

# Store the label -> code mapping next to the dataset so the integer tags can
# be decoded later. The encoding is explicit so the file does not depend on the
# platform's default locale, and the serialized text is written directly
# instead of being bound to a variable named `json`, which would shadow the
# standard-library module name in the notebook namespace.
with open(f"{DATASET_NAME}/label_mapping.json", "w", encoding="utf-8") as file:
    file.write(srsly.json_dumps(label2code))