# Faker

In [None]:
import re
import locale
import random

import numpy as np
import pandas as pd

from datetime import datetime

from faker import Faker
from faker.providers import DynamicProvider

In [None]:
# Instantiate Faker with `locale="es_AR"`; every demo cell below draws from this generator.
faker = Faker(locale="es_AR")

## Nombres

In [None]:
# Full names: print ten samples.
for _ in range(10):
    print(faker.name())

In [None]:
# Male first names: print ten samples.
for _ in range(10):
    print(faker.first_name_male())

In [None]:
# Female first names: print ten samples.
for _ in range(10):
    print(faker.first_name_female())

In [None]:
# Non-binary first names (does it work?): print ten samples.
for _ in range(10):
    print(faker.first_name_nonbinary())

In [None]:
# Last names: print ten samples.
for _ in range(10):
    print(faker.last_name())

## Direcciones

In [None]:
# Street addresses: print ten samples.
for _ in range(10):
    print(faker.street_address())

In [None]:
# Secondary addresses: print ten samples.
for _ in range(10):
    print(faker.secondary_address())

In [None]:
# Addresses produced by faker.address(): print ten samples.
# NOTE(review): this cell was originally labelled "streets", but the call is the
# general address generator -- confirm which method was intended.
for _ in range(10):
    print(faker.address())

In [None]:
# Provinces: print ten samples.
for _ in range(10):
    print(faker.province())

In [None]:
# Cities: print ten samples.
for _ in range(10):
    print(faker.city())

In [None]:
# Municipalities: print ten samples.
for _ in range(10):
    print(faker.municipality())

## Patentes

In [None]:
# Vehicle license plates: print ten samples.
for _ in range(10):
    print(faker.license_plate())

In [None]:
# Mercosur-format license plates: print ten samples.
for _ in range(10):
    print(faker.license_plate_mercosur())

In [None]:
# Old-format license plates: print ten samples.
for _ in range(10):
    print(faker.license_plate_old())

## Bancos

In [None]:
# Banks: print ten samples.
for _ in range(10):
    print(faker.bank())

## Nacionalidades

In [None]:
# List of nationalities in Spanish: read every HTML table on the page and
# keep the last one.
nationality_tables = pd.read_html(
    "https://www.spanish.cl/Vocabulary/Notes/Nacionalidades.htm"
)
countries = nationality_tables[-1]
countries

In [None]:
# countries.to_csv("nacionalidades.csv", index=False)

In [None]:
# Custom Faker provider for nationalities.
# Every "Nacionalidad" cell is split on whitespace and the resulting lists are
# concatenated into one flat list of candidate words.
nationality_words = countries["Nacionalidad"].apply(str.split).sum()
nationality_provider = DynamicProvider(
    provider_name="nationality", elements=nationality_words
)

# Re-create the generator and register the new provider on it.
faker = Faker("es_AR")
faker.add_provider(nationality_provider)

for _ in range(10):
    print(faker.nationality())

## Fechas

In [None]:
# Days of the week: print ten samples.
for _ in range(10):
    print(faker.day_of_week())

In [None]:
# Days as day-of-month numbers: print ten samples.
for _ in range(10):
    print(faker.day_of_month())

In [None]:
# Months: print ten samples.
for _ in range(10):
    print(faker.month())

In [None]:
# Month names: print ten samples.
for _ in range(10):
    print(faker.month_name())

In [None]:
# Years: print ten samples.
for _ in range(10):
    print(faker.year())

In [None]:
# NOTE https://stackoverflow.com/questions/2090840/python-datetime-localization
# Set the process-wide LC_TIME category so that strftime's %A / %B directives
# (used in the cells below) follow the "es_AR.UTF-8" locale. The call fails if
# that locale is not available on the host.
locale.setlocale(locale.LC_TIME, "es_AR.UTF-8")

In [None]:
# Combined date strings: render ten random dates, each with one of four
# long-date layouts chosen at random.
date_layouts = [
    "%A %d de %B del %Y",
    "%d de %B del %Y",
    "%d de %B de %Y",
    "%d de %B del '%y",
]
for _ in range(10):
    dt = datetime.strptime(faker.date(), "%Y-%m-%d")
    print(dt.strftime(random.choice(date_layouts)))

In [None]:
def fake_date():
    """Return a random date written out in one of four long-date layouts.

    A date string is drawn from ``faker.date()``, parsed as ``%Y-%m-%d`` and
    re-rendered with a layout picked at random. Day and month names come from
    ``strftime`` and therefore follow the active ``LC_TIME`` locale.
    """
    layouts = [
        "%A %d de %B del %Y",
        "%d de %B del %Y",
        "%d de %B de %Y",
        "%d de %B del '%y",
    ]
    parsed = datetime.strptime(faker.date(), "%Y-%m-%d")
    chosen_layout = random.choice(layouts)
    return parsed.strftime(chosen_layout)

## Internet

In [None]:
# E-mail addresses: print ten samples.
for _ in range(10):
    print(faker.ascii_email())

In [None]:
# Free e-mail addresses: print ten samples.
for _ in range(10):
    print(faker.ascii_free_email())

In [None]:
# Company e-mail addresses: print ten samples.
for _ in range(10):
    print(faker.ascii_company_email())

In [None]:
# Domain names: print ten samples.
for _ in range(10):
    print(faker.domain_name())

In [None]:
# URLs: print ten samples.
for _ in range(10):
    print(faker.url())

In [None]:
# User names: print ten samples.
for _ in range(10):
    print(faker.user_name())

## Teléfonos

In [None]:
# Phone numbers: print ten samples.
for _ in range(10):
    print(faker.phone_number())

# Data augmentation

In [None]:
from datasets import load_from_disk
import srsly

# Directory that holds the annotated anonymization data.
ANNOT_DIR = "/resources/data/restricted/anonymization"

# Load the dataset stored on disk under ANNOT_DIR.
dataset = load_from_disk(f"{ANNOT_DIR}/hg_dataset")

# Read the label -> code mapping. The encoding is given explicitly because
# open() otherwise falls back to a locale-dependent default, which can
# mis-decode non-ASCII label names.
with open(f"{ANNOT_DIR}/hg_dataset/label_mapping.json", encoding="utf-8") as file:
    label2code = srsly.json_loads(file.read())

# Inverse mapping (code -> label), used to decode the tag sequences.
code2label = {v: k for k, v in label2code.items()}

print(dataset)
print("nlabels:", len(code2label))

In [None]:
# Select the "train" split of the loaded dataset and display it.
train = dataset["train"]
train

In [None]:
# Keep only the training documents whose first "n_labels" entry is positive.
train_labeled = list(filter(lambda doc: doc["n_labels"][0] > 0, train))

In [None]:
# Number of training documents kept by the filter above.
len(train_labeled)

In [None]:
# Pick one labelled document to experiment with (index 16 is noted as an alternative).
sample = train_labeled[10]  # train_labeled[16]
sample

In [None]:
# train_labeled[16] =>

# sample["tags"][28] = 1
# sample["tags"][29] = 2
# sample["tags"][30] = 2
# sample["tags"][31] = 8

In [None]:
# Print every token of the sample next to its tag code.
for token, tag in zip(sample["tokens"], sample["tags"]):
    print(token, tag)

In [None]:
def get_entity_counts(tags: list, code2label: dict = code2label) -> dict:
    """Count the ``B-`` labelled tags of each entity type in a tag sequence.

    Only tags whose label starts with ``"B-"`` are counted; presumably the
    labels follow a BIO scheme, so this counts one per entity span.

    Args:
        tags: Tag codes of one document. Code ``0`` is skipped outright.
        code2label: Mapping from tag code to label string.

    Returns:
        A dict mapping entity type (the label without its ``"B-"`` prefix) to
        the number of matching tags. Keys are in sorted order and counts are
        plain ``int`` values.
    """
    from collections import Counter

    counts = Counter()
    for tag in tags:
        if tag == 0:
            continue
        label = code2label.get(tag)
        # Codes missing from the mapping are skipped; the previous version
        # crashed here with ``None.startswith``.
        if label is not None and label.startswith("B-"):
            counts[label[2:]] += 1

    # Sorting keeps the key order the earlier np.unique-based version produced.
    return dict(sorted(counts.items()))

In [None]:
# Count the entities found in the sample's tags, per entity type.
entity_counts = get_entity_counts(sample["tags"])
entity_counts

In [None]:
def fake_dni() -> str:
    """Return a made-up DNI-style number such as ``"23.456.789"``.

    NOTE(review): assumes DNIs are written as eight digits with dot-separated
    thousands -- adjust the pattern to match how they appear in the corpus.
    """
    # In numerify patterns "%" is a non-zero digit and "#" is any digit, so
    # the generated number never starts with 0.
    return faker.numerify("%#.###.###")


# Generator called to synthesize a replacement text for each entity type.
# "DNI" was previously mapped to faker.license_plate, which produced vehicle
# plates instead of identity-document numbers.
entity_augmentation_functions = {
    "PER": faker.name,
    "DIRECCION": faker.street_address,
    "DNI": fake_dni,
    "FECHA": fake_date,
}

In [None]:
def get_tokens_and_tags(text: str, entity: str) -> tuple[list[str], list[str]]:
    """Split a text into tokens and label them as a single entity span.

    Args:
        text: Text of the entity; it is split on whitespace.
        entity: Entity type used to build the tag labels.

    Returns:
        A ``(tokens, tags)`` pair of equal length. The first tag is
        ``"B-<entity>"`` and every following tag is ``"I-<entity>"``. Empty or
        whitespace-only text yields two empty lists.
    """
    tokens = text.split()
    if not tokens:
        # Nothing to tag; the previous version raised IndexError on tags[0].
        return [], []
    tags = [f"B-{entity}"] + [f"I-{entity}"] * (len(tokens) - 1)
    return tokens, tags

In [None]:
# Generate one synthetic (tokens, tags) pair for every entity occurrence
# counted in the sample.
tokens_tags = []
for entity, count in entity_counts.items():
    for _ in range(count):
        fake_text = entity_augmentation_functions[entity]()
        tokens_tags.append(get_tokens_and_tags(fake_text, entity))

tokens_tags

In [None]:
from itertools import chain, islice
from more_itertools import split_before

In [None]:
def find_labels_and_indices_to_replace(tags: list[int], code2label: dict = code2label):
    """Locate the labelled spans of a tag sequence.

    Args:
        tags: Tag codes of one document; positions with code ``0`` are not
            part of any span.
        code2label: Mapping from tag code to label string.

    Returns:
        A ``(labels_to_replace, indices_to_replace)`` pair.
        ``labels_to_replace`` is a list with, for each span, the label at the
        span's first position with a leading ``"B-"`` removed.
        ``indices_to_replace`` is an object-dtype NumPy array with, for each
        span, the positions it covers.
    """
    labeled_tags = [code2label.get(tag) for tag in tags]

    # Read the positions from the `tags` argument: the previous version used
    # the notebook-global `sample` here, so any other document got the spans
    # of that sample.
    labeled_positions = np.where(np.array(tags) != 0)[0]

    # Positions whose label opens a span ("B-" prefix, as in get_entity_counts).
    span_starts = {
        position
        for position, label in enumerate(labeled_tags)
        if label.startswith("B-")
    }

    # Cut the run of labelled positions into one group per span.
    spans = list(
        split_before(labeled_positions, lambda position: position in span_starts)
    )

    # Positions are ascending, so the first one of each span is its minimum.
    labels_to_replace = [
        re.sub(r"^B-", "", labeled_tags[span[0]]) for span in spans
    ]
    indices_to_replace = np.array(
        [np.array(span) for span in spans], dtype="object"
    )

    return labels_to_replace, indices_to_replace

In [None]:
# Compute the labels and positions to replace for the sample and display them.
labels_to_replace, indices_to_replace = find_labels_and_indices_to_replace(
    sample["tags"]
)
labels_to_replace, indices_to_replace

In [None]:
# Step-by-step walkthrough: replace every labelled span of the sample with
# freshly generated text, printing the intermediate state along the way.
# `sample_tokens` / `sample_tags` are rebound to new lists after each replacement.
sample_tokens = sample["tokens"]
sample_tags = sample["tags"]
print("Original tokens:", sample_tokens)
print("Original tags:", sample_tags)
print()

# Labels and positions of the spans to replace.
labels_to_replace, indices_to_replace = find_labels_and_indices_to_replace(
    sample["tags"]
)
labels_to_replace, indices_to_replace
print(labels_to_replace, indices_to_replace)

# Replace list
for label_to_replace, idx_to_replace in zip(labels_to_replace, indices_to_replace):
    # Half-open range [start_idx, end_idx) covered by the current span.
    start_idx = min(idx_to_replace)
    end_idx = max(idx_to_replace) + 1

    # Strip a leading "B-" (if any) to get the key used in
    # entity_augmentation_functions.
    entity = re.sub(r"^B-", "", label_to_replace)

    # Tokens currently occupying the span, and their generated replacement.
    original_tokens = np.array(sample_tokens)[start_idx:end_idx].tolist()
    replacement_tokens, replacement_tags = get_tokens_and_tags(
        entity_augmentation_functions.get(entity)(), entity
    )
    # Convert the replacement's string labels to tag codes.
    replacement_tags = [label2code.get(tag) for tag in replacement_tags]

    print(original_tokens, replacement_tokens)
    print()
    # The replacement may be longer or shorter than the original span, so all
    # stored positions are shifted in place by the difference; this keeps the
    # spans that are still to be processed aligned with the edited lists.
    len_diff = len(replacement_tokens) - len(original_tokens)
    print(len_diff)
    print(indices_to_replace)
    indices_to_replace += len_diff
    print(indices_to_replace)
    print()

    # Replace sublist with other in list
    # Using itertools.islice()
    replaced_tokens = chain(
        islice(sample_tokens, 0, start_idx),
        replacement_tokens,
        islice(sample_tokens, end_idx, len(sample_tokens)),
    )

    # convert the chain object back to a list
    replaced_tokens = list(replaced_tokens)

    # Replace sublist with other in list
    # Using itertools.islice()
    replaced_tags = chain(
        islice(sample_tags, 0, start_idx),
        replacement_tags,
        islice(sample_tags, end_idx, len(sample_tags)),
    )

    # convert the chain object back to a list
    replaced_tags = list(replaced_tags)

    # printing result
    print("Replaced tokens:", replaced_tokens)
    print("Replaced tags:", replaced_tags)
    print()
    # Carry the edited lists over to the next iteration.
    sample_tokens = replaced_tokens
    sample_tags = replaced_tags

In [None]:
# Print the current `sample_tokens` next to their tag codes.
for token, tag in zip(sample_tokens, sample_tags):
    print(token, tag)

In [None]:
def augment_data(sample: dict, label2code: dict = label2code) -> dict:
    sample_tokens = sample["tokens"].copy()
    sample_tags = sample["tags"].copy()

    labels_to_replace, indices_to_replace = find_labels_and_indices_to_replace(
        sample_tags
    )

    for label_to_replace, idx_to_replace in zip(labels_to_replace, indices_to_replace):
        start_idx = min(idx_to_replace)
        end_idx = max(idx_to_replace) + 1

        entity = re.sub(r"^B-", "", label_to_replace)

        original_tokens = np.array(sample_tokens)[start_idx:end_idx].tolist()
        original_tags = np.array(sample_tags)[start_idx:end_idx].tolist()
        replacement_tokens, replacement_tags = get_tokens_and_tags(
            entity_augmentation_functions.get(entity)(), entity
        )
        replacement_tags = [label2code.get(tag) for tag in replacement_tags]

        len_diff = len(replacement_tokens) - len(original_tokens)
        indices_to_replace += len_diff

        replaced_tokens = chain(
            islice(original_tokens, 0, start_idx),
            replacement_tokens,
            islice(original_tokens, end_idx, len(sample_tokens)),
        )
        replaced_tokens = list(replaced_tokens)

        replaced_tags = chain(
            islice(sample_tags, 0, start_idx),
            replacement_tags,
            islice(sample_tags, end_idx, len(sample_tags)),
        )
        replaced_tags = list(replaced_tags)

        sample_tokens = replaced_tokens
        sample_tags = replaced_tags