In [None]:
%load_ext aymurai.devtools.magic
%load_ext autoreload
%autoreload 2

# Build dataset

In [None]:
import locale

from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Switch every locale category to Argentinian Spanish for this kernel.
# NOTE(review): setlocale raises locale.Error when es_AR.UTF-8 is not installed
# on the host — confirm it is available in the target environment.
locale.setlocale(locale.LC_ALL, 'es_AR.UTF-8')
# Document renderer helper; not referenced again in the cells visible here.
render = DocRender()

In [None]:
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)

# Hold out 20% of the documents for test, then 20% of what is left for
# validation. Both splits use the same fixed seed so they are reproducible.
train, test = train_test_split(private, test_size=0.2, random_state=22)
train, val = train_test_split(train, test_size=0.2, random_state=22)

# Report the size of each split.
for split_name, split in (('train', train), ('test', test), ('val', val)):
    print(f'{split_name}:', len(split))

# Build names database

In [None]:
# !gdown --fuzzy --continue --output /resources/data/facebook-names/full.tar.bz2 https://drive.google.com/file/d/1wRQfw5EYpzulvRfHCGIUWB2am5JUYVGk/view
# !cd /resources/data/facebook-names && tar xvf full.tar.bz2 && cd - 

In [None]:
%%export aymurai.utils.facebook_names

import os
import re
import hashlib
import tarfile
import subprocess
from glob import glob
from pathlib import Path

import srsly
import pandas as pd

from aymurai.logging import get_logger

logger = get_logger(__name__)

# Remote location of the names archive (a Google Drive share link consumed by
# `gdown --fuzzy`). Override with the FACEBOOK_NAMES_URI environment variable.
FACEBOOK_NAMES_URI = os.getenv(
    "FACEBOOK_NAMES_URI",
    "https://drive.google.com/file/d/1wRQfw5EYpzulvRfHCGIUWB2am5JUYVGk/view",
)
# Local directory where the archive, the extracted CSV files and the counts
# cache live. Override with the FACEBOOK_NAMES_PATH environment variable.
# (This used to read FACEBOOK_NAMES_URI by mistake, so overriding the URI also
# replaced the local path with a URL.)
FACEBOOK_NAMES_PATH = os.getenv("FACEBOOK_NAMES_PATH", "/resources/data/facebook-names/")


def download_fb_names_db() -> str:
    """
    Download the facebook names database (from the massive 2021 leak) with the
    `gdown` command line tool.

    The archive is fetched from `FACEBOOK_NAMES_URI` and stored as
    `full.tar.bz2` inside `FACEBOOK_NAMES_PATH`. `gdown` is invoked with
    `--continue`, so an interrupted download can be resumed.

    Returns:
        str: path of the downloaded archive

    Raises:
        subprocess.CalledProcessError: if `gdown` exits with a non-zero status.
        FileNotFoundError: if the `gdown` executable cannot be found.
    """
    logger.info(f'getting facebook names database from {FACEBOOK_NAMES_URI}')
    fname = "full.tar.bz2"
    fpath = os.path.join(FACEBOOK_NAMES_PATH, fname)

    # make sure the destination directory exists before handing the path over
    os.makedirs(os.path.dirname(fpath) or ".", exist_ok=True)

    # pass an argument list instead of splitting a formatted string, so a path
    # or URI containing whitespace is not broken into several arguments
    cmd = ["gdown", "--fuzzy", "--continue", "--output", fpath, FACEBOOK_NAMES_URI]
    subprocess.check_output(cmd)

    return fpath


def extract_fb_names(country_codes: list = ["AR"], use_cache: bool = True):
    """
    Extract the per-country CSV files of the facebook names archive.

    The archive is obtained through `download_fb_names_db()` and the members
    `curate/<code>.csv` are extracted next to it.

    Args:
        country_codes (list, optional): extract names from <code> countries.
            Defaults to ["AR"].
        use_cache (bool, optional): skip the codes whose CSV file is already
            extracted. When False every requested code is extracted again.
            Defaults to True.

    Returns:
        list[str]: paths of the CSV files, one per requested country code
            (same order as `country_codes`).

    Raises:
        KeyError: if the archive has no `curate/<code>.csv` member for one of
            the requested codes.
    """
    tar_path = download_fb_names_db()
    basepath = os.path.dirname(tar_path)

    # country codes that are already on disk (ignored when use_cache is False)
    if use_cache:
        extracted_codes = {
            os.path.basename(path).split(".")[0]
            for path in glob(f"{basepath}/curate/*.csv")
        }
    else:
        extracted_codes = set()

    tar_members = [
        f"curate/{code}.csv" for code in country_codes if code not in extracted_codes
    ]

    # only open the (large, compressed) archive when there is something to extract
    if tar_members:
        logger.info(f'extracting {tar_members}')
        with tarfile.open(tar_path, "r:*") as tar:
            # extractall() expects TarInfo objects, not member names
            members = [tar.getmember(member) for member in tar_members]
            tar.extractall(path=basepath, members=members)

    return [f"{basepath}/curate/{code}.csv" for code in country_codes]


def norm(name: str) -> list[str]:
    """
    Normalize and split a string holding several first/last names.
    FIXME: normalization was designed with argentinian names in mind.
    Not a general solution.

    The string is split on whitespace, except that
      - a particle ("De", "Da", "Di", "D'", 'D"') that itself follows a
        non-word character stays attached to the word after it
        ("Juan De Dios" -> ["Juan", "De Dios"]);
      - the prefixes "San", "Del", "la" and "las" are re-attached to the word
        after them ("Maria Del Carmen" -> ["Maria", "Del Carmen"]).
    Matching is case-insensitive.

    Args:
        name (str): string containing multiple first/last names

    Returns:
        list[str]: list of names. Values that are not strings (e.g. NaN for a
            missing cell) are returned unchanged.
    """
    if not isinstance(name, str):
        return name
    # splitting on whitespace except on compound names
    name = re.sub(r"(?i)((?<!\W(De|Da|Di|D\'|D\"))\s)", "|", name)
    # fixing some prefixes: undo the split right after San/Del/la/las
    # (raw string: "\g" is not a valid escape sequence in a normal literal)
    name = re.sub(r"(?i)(?<!\w)(San|Del|las?)(\|)", r"\g<1> ", name)

    # splitting names
    return name.split("|")


def load_database(country_codes=["AR"], use_cache: bool = True) -> pd.DataFrame:
    """
    Load a database of facebook names by country code.
    FIXME: normalization was designed with argentinian names in mind.
     Not a general solution.

    Every CSV row (first_name, last_name, gender, loc) is expanded to one row
    per individual name found in its first and last name fields; the
    individual name is stored in the new `name` column.

    Rows where only one of the two name fields is filled still contribute the
    names of the filled field. (Adding a list to NaN used to yield NaN, which
    silently dropped such rows.)

    Args:
        country_codes (list, optional): country codes to use. Defaults to ["AR"].
        use_cache (bool, optional): use cache. Defaults to True.

    Returns:
        pd.DataFrame: names, gender and origin
    """
    logger.info(f'loading facebook names database for {country_codes}')
    paths = extract_fb_names(country_codes=country_codes, use_cache=use_cache)
    frames = [
        pd.read_csv(
            path,
            names=["first_name", "last_name", "gender", "loc"],
        )
        for path in paths
    ]
    db = pd.concat(frames, ignore_index=True)

    # element-wise list concatenation: first name parts followed by last name parts
    fname = db["first_name"].apply(_name_parts)
    lname = db["last_name"].apply(_name_parts)
    db["name"] = fname + lname

    # one row per individual name; rows without any name explode to NaN and
    # are removed
    db = db.explode("name").dropna(subset=["name"])

    return db


def _name_parts(value) -> list:
    """Split one raw name cell with `norm`; non-string cells (e.g. NaN) give []."""
    parts = norm(value)
    return parts if isinstance(parts, list) else []


def load_counts(
    country_codes: list = ["AR"],
    min_freq: int = 100,
    min_name_length: int = 4,
    use_cache: bool = True,
) -> pd.DataFrame:
    """
    Load facebook name counts for a list of country codes.

    The unfiltered counts are cached as a CSV file under
    `FACEBOOK_NAMES_PATH/counts/`, keyed by the sorted list of country codes.
    `min_freq` and `min_name_length` are applied after loading, so they do not
    affect the cache. On a cache hit the names database is not loaded at all.

    Args:
        country_codes (list, optional): list of country codes to use. Defaults to ['AR'].
        min_freq (int, optional): keep names repeated more than `min_freq`
            times (strictly greater). Defaults to 100.
        min_name_length (int, optional): minimum name length (inclusive).
            Defaults to 4.
        use_cache (bool, optional): use cache. Defaults to True.

    Returns:
        pd.DataFrame: columns `name`, `counts` (number of occurrences) and
            `len` (length of the name), restricted to the rows passing both
            filters.
    """
    cache_key = hashlib.md5(
        srsly.json_dumps(sorted(country_codes)).encode()
    ).hexdigest()
    cache_path = f"{FACEBOOK_NAMES_PATH}/counts/{cache_key}.csv"

    if use_cache and Path(cache_path).exists():
        logger.info(f'loading name counts from cache: {cache_path}')
        # keep_default_na=False: names such as "NULL" or "None" must stay
        # strings instead of being parsed as missing values
        counts = pd.read_csv(cache_path, keep_default_na=False)

    else:
        logger.info('building names database')
        # only pay for loading/normalizing the full database on a cache miss
        db = load_database(country_codes=country_codes, use_cache=use_cache)
        # NOTE(review): "count" only counts rows with a non-null `loc` — confirm
        # that `loc` is always filled in the source files.
        counts = (
            db.groupby("name")
            .agg({"loc": "count"})
            .sort_values(by=["loc"], ascending=False)
            .reset_index()
            .rename(columns={"loc": "counts"})
        )
        counts["len"] = counts["name"].apply(len)

        # store the freshly built (unfiltered) counts for the next call
        if use_cache:
            os.makedirs(os.path.dirname(cache_path), exist_ok=True)
            counts.to_csv(cache_path, index=False)

    return counts.query(f"counts > {min_freq} and len >= {min_name_length}")


# Define pipeline components

In [None]:
%%export aymurai.spacy.components.names

import unicodedata
from itertools import chain

from unidecode import unidecode
from spacy.language import Language
from spacy.pipeline import EntityRuler

from aymurai.logging import get_logger
from aymurai.utils.facebook_names import load_counts
from aymurai.devtools import resolve_package_path
from more_itertools import unique_everseen

logger = get_logger(__name__)

# Location of the extra (hand-curated) names list inside the package data.
EXTRANAMES_BASEPATH = resolve_package_path("aymurai.data.spanish.names")
EXTRANAMES_FILENAME = f"{EXTRANAMES_BASEPATH}/extra_names.txt"
# export: start hide
# Notebook-only step: copy the local `extra_names.txt` (first "|"-separated
# column, duplicates removed, original order kept) into the package data.
import os

import pandas as pd  # imported here so this cell does not depend on earlier cells

os.makedirs(EXTRANAMES_BASEPATH, exist_ok=True)
extra_names_db = pd.read_csv("extra_names.txt", header=None, sep="|")
extra_names_db = pd.Series(list(unique_everseen(extra_names_db[0].values)))
extra_names_db.to_csv(EXTRANAMES_FILENAME, index=False, header=None)
# export: end hide

@Language.factory(name="name_lookup_ruler")
def name_lookup(
    nlp,
    name,
    country_codes=["AR"],
    overwrite_ents: bool = True,
    min_freq: int = 100,
    min_name_length: int = 4,
    unicode_norm: str = "NFKC",
):
    """
    Build an EntityRuler that tags single tokens found in a names list as "PER".

    The names list is the union of the facebook name counts (`load_counts`)
    and the extra names file (`EXTRANAMES_FILENAME`, title-cased). Each name
    is matched verbatim (`ORTH`) in four surface forms: unicode-normalized,
    ASCII-transliterated, and the upper-cased version of both.

    Args:
        nlp: spaCy pipeline the ruler is attached to.
        name: name of the component instance.
        country_codes (list, optional): country codes passed to `load_counts`.
            Defaults to ["AR"].
        overwrite_ents (bool, optional): passed to the EntityRuler.
            Defaults to True.
        min_freq (int, optional): passed to `load_counts`. Defaults to 100.
        min_name_length (int, optional): passed to `load_counts`. Defaults to 4.
        unicode_norm (str, optional): unicode normalization form applied to
            every name. Defaults to "NFKC".

    Returns:
        EntityRuler: ruler loaded with one pattern per distinct surface form.
    """
    db_names = load_counts(
        country_codes=country_codes,
        min_freq=min_freq,
        min_name_length=min_name_length,
    )['name']

    # the file is written as UTF-8 (pandas default); read it back explicitly as
    # UTF-8 so accented names do not depend on the platform's default encoding
    with open(EXTRANAMES_FILENAME, "r", encoding="utf-8") as file:
        extra_names = [line.strip().title() for line in file]
    # blank lines would otherwise become empty patterns
    extra_names = [extra for extra in extra_names if extra]

    norm_names = tuple(
        unicodedata.normalize(unicode_norm, raw) for raw in chain(db_names, extra_names)
    )
    ascii_names = tuple(unidecode(normalized) for normalized in norm_names)

    cased_names = norm_names + ascii_names
    # dict.fromkeys removes duplicates while keeping order (names without
    # accents are equal to their ASCII form, upper-case names to their upper form)
    surface_forms = dict.fromkeys(chain(cased_names, map(str.upper, cased_names)))

    ruler = EntityRuler(nlp, name=name, overwrite_ents=overwrite_ents)
    ruler.add_patterns(
        [
            {
                "label": "PER",
                "id": "PER",
                "pattern": [{"ORTH": surface_form}],
            }
            for surface_form in surface_forms
            if surface_form
        ]
    )
    return ruler


@Language.factory(name='join_consecutive_names')
def join_consecutive_name_entities(nlp, name):
    """
    Build an EntityRuler that merges neighbouring "PER" tokens into one entity.

    It is meant to run after a component that already tagged single tokens as
    "PER" (the patterns match on `ENT_TYPE`). Three cases are merged:
      - two or more consecutive PER tokens;
      - PER tokens separated by one punctuation token;
      - PER tokens separated by one abbreviation token (a single capital
        letter, optionally followed by ".", "," or a whitespace character).

    Args:
        nlp: spaCy pipeline the ruler is attached to.
        name: name of the component instance.

    Returns:
        EntityRuler: ruler with the joining patterns (overwrites entities).
    """
    ruler = EntityRuler(nlp, name=name, overwrite_ents=True)
    ruler.add_patterns(
        [
            # explicit consecutive names
            {
                "label": "PER",
                "id": "PER",
                "pattern": [{"ENT_TYPE": "PER", "OP": "{2,}"}],
            },
            # comma/dot separated
            {
                "label": "PER",
                "id": "PER",
                "pattern": [
                    {"ENT_TYPE": "PER", "OP": "+"},
                    {"IS_PUNCT": True},
                    {"ENT_TYPE": "PER", "OP": "+"},
                ],
            },
            # names with abbreviations within
            {
                "label": "PER",
                "id": "PER",
                "pattern": [
                    {"ENT_TYPE": "PER", "OP": "+"},
                    # anchored on purpose: the matcher evaluates REGEX with
                    # search semantics, so without ^...$ any token containing
                    # an uppercase letter would count as an abbreviation
                    {"TEXT": {"REGEX": r"^[A-Z][\.\s\,]?$"}},
                    {"ENT_TYPE": "PER", "OP": "+"},
                ],
            },
        ]
    )
    return ruler


# Pipeline definition

In [None]:
from aymurai.spacy.ner import SpacyNER
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.pipeline.pipeline import AymurAIPipeline

# Step 1: full-text extraction from the PDF files.
fulltext_step = (
    FulltextExtract,
    {
        "extension": "pdf",
        "method": "tesseract",
        "language": "spa",
        "errors": "ignore",
        "use_cache": True,
    },
)

# Step 2: text normalization with its default options.
normalize_step = (TextNormalize, {})

# Step 3: spaCy rulers — tag the names, then join consecutive name tokens.
ruler_steps = [
    ("name_lookup_ruler", {"country_codes": ["AR"]}),
    ("join_consecutive_names", {}),
]
names_step = (SpacyRulerPipeline, {"base": "es", "steps": ruler_steps})

# Preprocessing-only pipeline: no models and no postprocessing.
config = {
    "preprocess": [fulltext_step, normalize_step, names_step],
    "models": [],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": False,
    # 'log_level': 'debug'
}

pipeline = AymurAIPipeline(config)

In [None]:
# Run the pipeline's preprocessing on the training split.
# NOTE(review): presumably this applies the "preprocess" steps of `config`
# (text extraction, normalization, name rulers) — confirm against AymurAIPipeline.
preprocessed_train = pipeline.preprocess(train)

# Try out the name rulers on a sample document

In [None]:
import spacy

# Blank Spanish pipeline with the two custom components registered above,
# added through their factory names.
nlp = spacy.blank("es")
nlp.add_pipe("name_lookup_ruler")
nlp.add_pipe("join_consecutive_names")
# The factories can also be called directly, e.g. name_lookup(nlp, '<name>').


In [None]:
# Build stand-alone instances of both components by calling the factory
# functions directly. These are not added to `nlp`; they are applied by hand.
# The second argument is only the component name ('asd' is a placeholder).
ruler = name_lookup(nlp, 'names_lookup')
join_entities = join_consecutive_name_entities(nlp, 'asd')

In [None]:
import srsly
from spacy import displacy

# Pick one preprocessed training document and show it in full as YAML.
# NOTE(review): this prints the whole item, including the document text —
# check the output for personal data before sharing the notebook.
item = preprocessed_train[53]
text = item['data']['doc.text']
print(srsly.yaml_dumps(item))
# NOTE(review): if the cell that adds both pipes to `nlp` was run, `nlp(text)`
# already applies them; the two explicit calls below then re-apply the
# stand-alone instances on the same doc — confirm this is intended.
doc = nlp(text)
doc = ruler(doc)
doc = join_entities(doc)

# Highlight the detected entities inline.
displacy.render(doc, 'ent')