# Faker

In [None]:
import locale
import random

import numpy as np
import pandas as pd

from datetime import datetime

from faker import Faker
from faker.providers import DynamicProvider

In [None]:
# Create the Faker instance for the Argentinian Spanish locale ("es_AR").
faker = Faker("es_AR")

## Nombres

In [None]:
# Print ten fake full names.
print(*(faker.name() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake male first names.
print(*(faker.first_name_male() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake female first names.
print(*(faker.first_name_female() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake non-binary first names (open question from the author: does
# this work for this locale?).
print(*(faker.first_name_nonbinary() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake last names.
print(*(faker.last_name() for _ in range(10)), sep="\n")

## Direcciones

In [None]:
# Print ten values produced by faker.street_address().
print(*(faker.street_address() for _ in range(10)), sep="\n")

In [None]:
# Print ten values produced by faker.address().
print(*(faker.address() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake provinces.
print(*(faker.province() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake cities.
print(*(faker.city() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake municipalities.
print(*(faker.municipality() for _ in range(10)), sep="\n")

## Patentes

In [None]:
# Print ten fake license plates.
print(*(faker.license_plate() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake license plates from faker.license_plate_mercosur().
print(*(faker.license_plate_mercosur() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake license plates from faker.license_plate_old().
print(*(faker.license_plate_old() for _ in range(10)), sep="\n")

## Bancos

In [None]:
# Print ten fake bank names.
print(*(faker.bank() for _ in range(10)), sep="\n")

## Nacionalidades

In [None]:
# Scrape the list of nationalities in Spanish: read every HTML table on the
# page and keep the last one.
nationality_tables = pd.read_html(
    "https://www.spanish.cl/Vocabulary/Notes/Nacionalidades.htm"
)
countries = nationality_tables[-1]
countries

In [None]:
# countries.to_csv("nacionalidades.csv", index=False)

In [None]:
# Faker provider that samples nationalities from the scraped `countries` table.
# Every cell of the "Nacionalidad" column is split on whitespace and each word
# becomes one candidate element. The flattening is done with a comprehension
# (linear) instead of summing lists, and missing cells are skipped so a NaN row
# in the scraped table does not abort the cell.
nationality_words = [
    word
    for entry in countries["Nacionalidad"].dropna()
    for word in entry.split()
]
nationality_provider = DynamicProvider(
    provider_name="nationality",
    elements=nationality_words,
)

# A fresh Faker instance is created and the provider registered on it, which
# makes `faker.nationality()` available.
faker = Faker("es_AR")
faker.add_provider(nationality_provider)

# Print ten sampled nationalities.
print(*(faker.nationality() for _ in range(10)), sep="\n")

## Fechas

In [None]:
# Print ten fake weekday names.
print(*(faker.day_of_week() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake day-of-month numbers.
print(*(faker.day_of_month() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake months.
print(*(faker.month() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake month names.
print(*(faker.month_name() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake years.
print(*(faker.year() for _ in range(10)), sep="\n")

In [None]:
# NOTE: switch the time-formatting locale to Argentinian Spanish; background in
# https://stackoverflow.com/questions/2090840/python-datetime-localization
locale.setlocale(locale.LC_TIME, locale="es_AR.UTF-8")

In [None]:
# Print ten fake dates, each rendered with a randomly chosen long-form layout.
date_layouts = (
    "%A %d de %B del %Y",
    "%d de %B del %Y",
    "%d de %B de %Y",
    "%d de %B del '%y",
)
for i in range(10):
    # faker.date() is parsed here as a "%Y-%m-%d" string.
    dt = datetime.strptime(faker.date(), "%Y-%m-%d")
    layout = random.choice(date_layouts)
    print(dt.strftime(layout))

## Internet

In [None]:
# Print ten fake ASCII e-mail addresses.
print(*(faker.ascii_email() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake e-mail addresses from faker.ascii_free_email().
print(*(faker.ascii_free_email() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake e-mail addresses from faker.ascii_company_email().
print(*(faker.ascii_company_email() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake domain names.
print(*(faker.domain_name() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake URLs.
print(*(faker.url() for _ in range(10)), sep="\n")

In [None]:
# Print ten fake user names.
print(*(faker.user_name() for _ in range(10)), sep="\n")

## Teléfonos

In [None]:
# Print ten fake phone numbers.
print(*(faker.phone_number() for _ in range(10)), sep="\n")

# Data augmentation

Generamos datos sintéticos a partir de los archivos CSV que contienen los párrafos etiquetados.

In [None]:
import os
import random
import re

In [None]:
from unidecode import unidecode

# Collect the entity names mentioned in NER.md and wrap them as "<NAME>"
# placeholders.
#
# For each line, the first run of capital letters that has a single space on
# both sides is taken as a label. This is what the previous pattern
# "^| ([A-Z]+) .*" effectively matched: its unescaped "|" made it an
# alternation whose left branch only produced empty strings.
# NOTE(review): the pattern was probably meant to be r"^\| ([A-Z]+) " (anchored
# on a markdown table row) — confirm against NER.md before tightening it.
label_pattern = re.compile(r" ([A-Z]+) ")

found_labels = set()
# The encoding is explicit so decoding of accented text does not depend on the
# platform default.
with open(
    "/notebooks/experiments/anonymization/NER.md", "r", encoding="utf-8"
) as file:
    for line in file:
        # unidecode transliterates accented letters so they can match [A-Z].
        match = label_pattern.search(unidecode(line))
        if match:
            found_labels.add(match.group(1))

# Sorted so the resulting list is the same on every run.
labels = [f"<{label}>" for label in sorted(found_labels)]


labels

In [None]:
# Build the full path of every entry in the output directory and show how many
# there are.
path = "/notebooks/experiments/anonymization/output/"
entry_names = os.listdir(path)
files = [os.path.join(path, entry_name) for entry_name in entry_names]
len(files)

In [None]:
# Load one randomly chosen file from `files` as a DataFrame and display it.
chosen_file = random.choice(files)
sample = pd.read_csv(chosen_file)
sample

In [None]:
# Distinct raw values of the "label" column of the sampled file.
sample["label"].unique()

In [None]:
# Distinct values of "label" after removing a leading "B-" / "I-" prefix from
# string entries; non-string entries are passed through unchanged.
sample["label"].map(
    lambda tag: tag if not isinstance(tag, str) else re.sub(r"^[BI]-", "", tag)
).unique()

In [None]:
# Distinct values of "ia2_label" with commas and periods removed. Only strings
# starting with "<" are kept; everything else maps to None.
sample["ia2_label"].map(
    lambda tag: None
    if not (isinstance(tag, str) and tag.startswith("<"))
    else re.sub(r"[,.]", "", tag)
).unique()

In [None]:
# Distinct values of "ia2_norm_label" with commas and periods removed. Every
# string is kept (no "<" prefix filter here); non-strings map to None.
sample["ia2_norm_label"].map(
    lambda tag: None if not isinstance(tag, str) else re.sub(r"[,.]", "", tag)
).unique()

In [None]:
def dates():
    """Return a fake date rendered as text in a randomly chosen layout.

    The date string from ``faker.date()`` is parsed as ``%Y-%m-%d`` and then
    formatted with one of four long-form layouts picked by ``random.choice``.
    Weekday and month names come from ``strftime`` and therefore follow the
    active ``LC_TIME`` locale.
    """
    layouts = [
        "%A %d de %B del %Y",
        "%d de %B del %Y",
        "%d de %B de %Y",
        "%d de %B del '%y",
    ]
    parsed = datetime.strptime(faker.date(), "%Y-%m-%d")
    chosen_layout = random.choice(layouts)
    return parsed.strftime(chosen_layout)


# Maps each "<TAG>" placeholder to a zero-argument callable that produces a fake
# replacement value. A value of None marks a tag that has no generator yet, so
# callers must check for None before calling.
fn_mapping = {
    "<DIRECCION>": faker.street_address,
    "<PER>": faker.name,
    "<BANCO>": faker.bank,
    "<CBU>": None,
    "<LEY>": None,
    "<NUM>": None,
    "<CUIT>": None,
    "<FISCAL>": None,
    "<FECHA>": dates,
    "<CUIJ>": None,
    "<USUARIX>": faker.user_name,
    "<SECRETARIX>": None,
    "<JUEZX>": None,
    "<PERIODO>": None,
    "<PASAPORTE>": None,
    "<DNI>": None,
    "<EDAD>": None,
    "<ESTUDIOS>": None,
    "<DEFENSORX>": None,
    "<NACIONALIDAD>": faker.nationality,
    "<ARTICULO>": None,
    "<LINK>": None,
    "<IP>": None,
    # The choice is wrapped in a lambda so the kind of location (province, city
    # or municipality) is re-drawn on every call. Calling random.choice directly
    # here would pick one generator once, when the dict is built, and reuse it
    # for every replacement.
    "<LOC>": lambda: random.choice(
        [faker.province, faker.city, faker.municipality]
    )(),
}

In [None]:
# Smoke test: call the generator registered under "<PER>" and display the result.
fn_mapping["<PER>"]()

1. Recorrer el documento y extraer los párrafos
2. Por cada párrafo, detectar las distintas entidades válidas y sus respectivos índices
3. Por cada entidad, generar un reemplazo e insertar en el lugar que corresponda
4. Retornar tantas alteraciones del párrafo como se desee

In [None]:
# 1. Walk the document and extract its paragraphs.
# Missing values in "original" are replaced by "\n", all values are joined with
# single spaces, and the result is split on line breaks; each piece is stripped.
joined_text = " ".join(sample["original"].fillna("\n").values)
paragraphs = list(map(str.strip, joined_text.splitlines()))
paragraphs

In [None]:
# 2. For each paragraph, detect the distinct valid entities and their indices.
# `idx` holds the cumulative whitespace-token count of the paragraphs with a
# leading 0, so consecutive pairs of `idx` delimit one paragraph each.
tokens_per_paragraph = [len(paragraph.split()) for paragraph in paragraphs]
idx = np.concatenate(
    [np.zeros(1, dtype=int), np.cumsum(tokens_per_paragraph)]
)
idx

In [None]:
# Cut `labels` into one sub-list per paragraph, using consecutive pairs of
# `idx` as slice bounds.
# NOTE(review): at this point `labels` appears to still be the "<TAG>" list
# built from NER.md, while the cells below expect per-token "B-"/"I-" tags.
# Presumably it should first be rebound to the sample's label column — confirm.
# Re-running this cell also re-slices the already nested list.
labels = [list(labels[lo:hi]) for lo, hi in zip(idx[:-1], idx[1:])]
labels

In [None]:
# Show every paragraph followed by its label list and a blank separator line.
for paragraph, label in zip(paragraphs, labels):
    print(paragraph, label, "", sep="\n")

In [None]:
# Inspect the label list stored at index 2.
labels[2]

In [None]:
def detect_beginning_inside_indices(labels: list):
    """Locate the "B-"/"I-" tags of each entity type in a tag sequence.

    Args:
        labels: Sequence of tags. Entries that are not strings (for example
            NaN for missing annotations) are ignored.

    Returns:
        A list with one tuple per distinct tag starting with ``"B-"``, in
        order of first appearance: ``(b_tag, beginning_positions,
        inside_positions)``. ``beginning_positions`` are the indices where
        ``b_tag`` occurs and ``inside_positions`` the indices of the matching
        ``"I-"`` tag. Positions are pooled per entity type, not per mention.
        ``"I-"`` tags without a corresponding ``"B-"`` tag are not reported.
    """
    beginning_positions = {}
    inside_positions = {}

    # A single pass over the tags. Dicts keep insertion order, so each entity
    # type is reported once, in order of first appearance, even when its "B-"
    # tag occurs several times.
    for position, label in enumerate(labels):
        if not isinstance(label, str):
            continue
        if label.startswith("B-"):
            beginning_positions.setdefault(label, []).append(position)
        elif label.startswith("I-"):
            # Key inside positions by the corresponding "B-" tag.
            inside_positions.setdefault("B-" + label[2:], []).append(position)

    return [
        (entity, positions, inside_positions.get(entity, []))
        for entity, positions in beginning_positions.items()
    ]

In [None]:
# Try the detector on the label list stored at index 2.
detect_beginning_inside_indices(labels[2])

In [None]:
# Positions, within the detection result for labels[2], of the tuples whose
# entity tag is "B-NOMBRE".
ent_idx = detect_beginning_inside_indices(labels[2])
[position for position, ent in enumerate(ent_idx) if ent[0] == "B-NOMBRE"]

In [None]:
def replace_entity(paragraph: str, annots: list):
    """Draft of the name-replacement step for one annotated paragraph.

    Finds where the ``B-NOMBRE`` / ``I-NOMBRE`` tags sit in ``annots`` and
    draws a fake person name together with matching BIO tags. Splicing the
    fake tokens back into the paragraph is not implemented yet, so the
    function always returns ``None``.

    Args:
        paragraph: Text of the paragraph; it is split on whitespace.
        annots: Tags of the paragraph — presumably one per whitespace token of
            ``paragraph``; TODO confirm against the data.

    Returns:
        None.
    """
    tokens = paragraph.split()

    entities_idx = detect_beginning_inside_indices(annots)
    entities = [ent[0] for ent in entities_idx]

    if "B-NOMBRE" not in entities:
        return None

    # Select the name rows from the detection result computed above, so the
    # function depends only on its arguments and not on a notebook-level
    # variable. Plain list selection also works when the index lists have
    # different lengths.
    name_rows = [ent for ent in entities_idx if ent[0] == "B-NOMBRE"]
    name_beginning_idx = [ent[1] for ent in name_rows]
    name_inside_idx = [ent[2] for ent in name_rows]

    # One "B-" tag for the first token of the fake name, "I-" for the rest.
    fake_name_list = fn_mapping["<PER>"]().split()
    fake_name_labels = ["B-NOMBRE"] + ["I-NOMBRE"] * (len(fake_name_list) - 1)

    # TODO: splice `fake_name_list` / `fake_name_labels` into `tokens` / `annots`
    # at `name_beginning_idx` / `name_inside_idx` and return the result.
    return None

In [None]:
# Re-run the detector on the label list stored at index 2.
detect_beginning_inside_indices(labels[2])

In [None]:
# Run the replacement draft on the paragraph at index 2 and its labels.
# `paragraphs` (the list) is indexed here; `paragraph` is only the leftover
# loop variable of the printing cell, so `paragraph[2]` would be one character.
replace_entity(paragraphs[2], labels[2])

In [None]:
# https://stackoverflow.com/questions/67118768/replace-sublist-with-different-length-sublist-in-python-like-string-replace
def list_find(haystack, needle):
    """Return the index of the first occurrence of ``needle`` in ``haystack``.

    ``needle`` is matched as a contiguous sub-list. Returns ``-1`` when it
    does not occur, mirroring ``str.find``.
    """
    width = len(needle)
    # The "+ 1" makes the last candidate start position (where the needle ends
    # exactly at the end of the haystack) part of the search.
    for start in range(len(haystack) - width + 1):
        if haystack[start : start + width] == needle:
            return start
    return -1


def list_replace(old, new, haystack):
    """Replace the first occurrence of sub-list ``old`` in ``haystack`` by ``new``.

    Returns a new list when ``old`` is found (``new`` may have a different
    length than ``old``); otherwise returns ``haystack`` itself, unchanged.
    """
    start = list_find(haystack, old)
    if start >= 0:
        stop = start + len(old)
        return haystack[:start] + new + haystack[stop:]
    return haystack


# Demo: replace the first ["b", "c"] run by three new elements.
l = list("abcd abcd")
l1 = list_replace(old=["b", "c"], new=["X", "Y", "Z"], haystack=l)
print(l1)