In [None]:
import os
from glob import glob

# Index every curated CSV by the part of its file name that precedes the
# first dot (e.g. ".../AR.csv" -> "AR") so a database can be looked up by key.
databases = {
    os.path.basename(csv_path).split('.')[0]: csv_path
    for csv_path in glob('/resources/data/facebook-names/curate/*.csv')
}

In [None]:
import pandas as pd


def join_names(row):
    """Build the ``'<first_name> <last_name>'`` string for one record.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by ``'first_name'`` and ``'last_name'``
        (a DataFrame row when used through ``DataFrame.apply(axis=1)``).

    Returns
    -------
    str
        Both parts separated by a single space. A part that is missing
        (``None``, NaN, ``pd.NA``) or empty is rendered as an empty string.
    """
    def _text(value):
        # Missing CSV cells arrive as float NaN, which is truthy: a plain
        # `value or ''` would let the literal text "nan" into the result.
        if pd.isna(value) or not value:
            return ''
        return value

    fname = _text(row['first_name'])
    lname = _text(row['last_name'])

    return f'{fname} {lname}'

# Load the 'AR' names database; column names are supplied explicitly here
# (presumably the CSV has no header row -- confirm against the file).
ar = pd.read_csv(
    databases['AR'],
    names=['first_name', 'last_name', 'gender', 'loc'],
)

# One "first last" string per record, computed row by row.
ar['full_name'] = ar.loc[:, ['first_name', 'last_name']].apply(join_names, axis='columns')

In [None]:
import re


def norm(name: str):
    """Split a raw name field into its individual names.

    Whitespace separates names, except that a standalone particle
    ``De``/``Da``/``Di`` stays attached to the word that follows it, and the
    prefixes ``San``/``Del``/``la``/``las`` are re-attached to the next word
    (all matched case-insensitively), e.g.
    ``'Juan De la Cruz' -> ['Juan', 'De la Cruz']``.

    Parameters
    ----------
    name : str
        Raw field value. Anything that is not a ``str`` (e.g. NaN for a
        missing cell) is returned unchanged.

    Returns
    -------
    list[str]
        The names found in ``name``; the input itself when it is not a string.
    """
    if not isinstance(name, str):
        return name

    # Trim and collapse whitespace runs first, otherwise padded or
    # double-spaced input would produce empty tokens after the split.
    name = ' '.join(name.split())

    # Split on spaces except right after a standalone De/Da/Di. `\b` (rather
    # than requiring a preceding non-word character) also covers a particle at
    # the very start of the field, e.g. a last name such as 'De Marco'.
    name = re.sub(r'(?i)(?<!\b(?:De|Da|Di))\s', '|', name)
    # Undo the split after the prefixes that belong to the following word.
    name = re.sub(r'(?i)(?<!\w)(San|Del|las?)\|', r'\g<1> ', name)

    return name.split('|')


# Work on a copy so the raw frame `ar` stays untouched.
ar_ = ar.copy()
ar_['first_name'] = ar_['first_name'].apply(norm)
ar_['last_name'] = ar_['last_name'].apply(norm)

# All names of a record = first-name tokens followed by last-name tokens.
# norm() passes missing values through unchanged, and list + NaN yields NaN,
# which would discard the whole record even when one of the two parts is
# present. Treat anything that is not a token list as "no tokens" instead.
ar_['name'] = (
    ar_['first_name'].apply(lambda tokens: tokens if isinstance(tokens, list) else [])
    + ar_['last_name'].apply(lambda tokens: tokens if isinstance(tokens, list) else [])
)

In [None]:
# One row per individual name: explode the token lists, discard the columns
# that are no longer needed and remove rows left without a name.
# Written as a single chain without inplace mutation, and with
# errors='ignore' on the drop, so re-executing this cell does not fail on the
# columns that a previous run already removed.
ar_ = (
    ar_.explode('name')
    .drop(columns=['first_name', 'last_name'], errors='ignore')
    .dropna(subset=['name'])
)

In [None]:
# Frequency table: number of records per individual name, most frequent
# first, with the grouping key turned back into a regular 'name' column.
counts = (
    ar_.groupby('name')
    .agg({'full_name': 'count'})
    .sort_values(by=['full_name'], ascending=False)
    .reset_index()
    .rename(columns={'full_name': 'counts'})
)
# Character length of each name, used later to filter out very short ones.
counts['len'] = counts['name'].apply(len)
counts

# try in pipeline

In [None]:
import locale

from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Switch every locale category to 'es_AR.UTF-8'. setlocale raises
# locale.Error when that locale is not installed on the host.
locale.setlocale(locale.LC_ALL, 'es_AR.UTF-8')
# NOTE(review): `render` is not referenced again in the cells visible here.
render = DocRender()

In [None]:
# Load the 'private' dataset (with use_cache=True) and split it in two steps
# with a fixed random_state so the partition is reproducible.
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)
# Step 1: hold out 20% of all items as the test set.
train, test = train_test_split(private, test_size=0.2, random_state=22)
# Step 2: hold out 20% of the remaining items for validation; `train` is
# rebound to what is left (roughly 64% / 20% / 16% train / test / val overall).
train, val = train_test_split(train, test_size=0.2, random_state=22)
# Report the size of each split.
print('train:', len(train))
print('test:', len(test))
print('val:', len(val))

In [None]:
%load_ext aymurai.devtools.magic

# ruler definition

In [None]:
%%export aymurai.spacy.components.names

import unicodedata
from itertools import chain

from unidecode import unidecode
from spacy.language import Language
from spacy.pipeline import EntityRuler


@Language.factory(name="name_lookup_ruler")
def name_lookup(
    nlp,
    name,
    country_codes=["AR"],
    overwrite_ents: bool = True,
    min_freq: int = 100,
    min_name_length: int = 4,
    unicode_norm: str = "NFKC",
):
    """Create an ``EntityRuler`` that tags known person names as ``PER``.

    The lookup list is the union of the names in the ``counts`` frequency
    table that pass the thresholds and the entries of ``extra_names.txt``
    (one name per line, title-cased). Every name is registered in four
    spellings: Unicode-normalized, ASCII-transliterated, and the upper-case
    version of both.

    Parameters
    ----------
    nlp : spacy.language.Language
        Pipeline the ruler is created for.
    name : str
        Component name given to the ruler.
    country_codes : list[str]
        Accepted for configuration compatibility; not used by this function.
    overwrite_ents : bool
        Forwarded to ``EntityRuler``.
    min_freq : int
        Only names whose ``counts`` value is strictly greater are kept.
    min_name_length : int
        Only names with at least this many characters are kept.
    unicode_norm : str
        Normalization form passed to ``unicodedata.normalize``.

    Returns
    -------
    spacy.pipeline.EntityRuler
        The ruler with all name patterns added.

    Raises
    ------
    FileNotFoundError
        If ``extra_names.txt`` is not present in the working directory.
    """
    # NOTE(review): `counts` is not defined in this cell; it is expected to
    # exist as a global DataFrame with 'name', 'counts' and 'len' columns.
    # That holds inside the notebook only -- confirm how the exported module
    # is supposed to obtain it.
    db_names = counts.query(f"counts > {min_freq} and len >= {min_name_length}")["name"]

    # Explicit encoding: the list contains accented names and must not depend
    # on the platform default. Blank lines are skipped.
    with open('extra_names.txt', 'r', encoding='utf-8') as file:
        extra_names = [line.strip().title() for line in file]
    extra_names = [entry for entry in extra_names if entry]

    norm_names = [
        unicodedata.normalize(unicode_norm, entry)
        for entry in chain(db_names, extra_names)
    ]
    ascii_names = [unidecode(entry) for entry in norm_names]
    cased = list(chain(norm_names, ascii_names))

    # dict.fromkeys removes duplicates while keeping order: for names without
    # accents the ASCII spelling is identical to the normalized one, which
    # would otherwise register every such pattern twice.
    variants = dict.fromkeys(chain(cased, map(str.upper, cased)))

    ruler = EntityRuler(nlp, name=name, overwrite_ents=overwrite_ents)
    ruler.add_patterns(
        [
            {
                "label": "PER",
                "id": "PER",
                # One token spec per whitespace-separated word: a single ORTH
                # containing a space can never equal a token, so compound
                # names such as "De la Cruz" would otherwise never match.
                "pattern": [{"ORTH": word} for word in entry.split()],
            }
            for entry in variants
            if entry.split()  # a pattern needs at least one token
        ]
    )
    return ruler

@Language.factory('join_consecutive_names')
def join_consecutive_name_entities(nlp, name):
    """Create an ``EntityRuler`` that merges neighbouring ``PER`` tokens.

    The ruler matches on the entity type already assigned to the tokens, so
    it has to be applied after a component that labels names as ``PER``.
    Three shapes are merged into a single ``PER`` span:

    * two or more consecutive ``PER`` tokens;
    * ``PER`` tokens, one punctuation token, ``PER`` tokens;
    * ``PER`` tokens, one initial (a single capital letter optionally
      followed by ``.``, ``,`` or whitespace), ``PER`` tokens.

    Parameters
    ----------
    nlp : spacy.language.Language
        Pipeline the ruler is created for.
    name : str
        Component name given to the ruler.

    Returns
    -------
    spacy.pipeline.EntityRuler
        The ruler with the merge patterns added (``overwrite_ents=True``).
    """
    ruler = EntityRuler(nlp, name=name, overwrite_ents=True)
    ruler.add_patterns(
        [
            # explicit consecutive names
            {
                "label": "PER",
                "id": "PER",
                "pattern": [{"ENT_TYPE": "PER", "OP": "{2,}"}],
            },
            # comma/dot separated
            {
                "label": "PER",
                "id": "PER",
                "pattern": [
                    {"ENT_TYPE": "PER", "OP": "+"},
                    {"IS_PUNCT": True},
                    {"ENT_TYPE": "PER", "OP": "+"},
                ],
            },
            # names with an abbreviation (initial) in between
            {
                "label": "PER",
                "id": "PER",
                "pattern": [
                    {"ENT_TYPE": "PER", "OP": "+"},
                    # Anchored on purpose: the REGEX predicate searches inside
                    # the token text, so without ^...$ any token containing a
                    # capital letter would bridge two names.
                    {"TEXT": {"REGEX": r"^[A-Z][\.\s\,]?$"}},
                    {"ENT_TYPE": "PER", "OP": "+"},
                ],
            },
        ]
    )
    return ruler

In [None]:
# NOTE(review): installs `unidecode` unpinned through the system pip with
# sudo, and only after an earlier cell has already imported it. Consider a
# pinned `%pip install unidecode==<version>` in a setup cell at the top so the
# package is installed into the kernel's own environment before first use.
!sudo pip install unidecode

In [None]:
from spacy.language import Language
from spacy.pipeline import EntityRuler



In [None]:
# Names seen more than 100 times and at least 3 characters long.
# NOTE(review): name_lookup defaults to min_name_length=4, so this selection
# is slightly broader than what the ruler uses with its default settings.
names = counts.query('counts > 100 and len >= 3')['name']

In [None]:
# Peek at the first 20 selected names (ordered as in `counts`).
names.values[:20]

In [None]:
from aymurai.spacy.ner import SpacyNER
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract
from aymurai.pipeline.pipeline import AymurAIPipeline

# Pipeline configuration: only preprocessing steps are enabled; the model and
# postprocess stages are left empty.
config = {
    "preprocess": [
        # Full-text extraction, configured for pdf input with the tesseract
        # method and the 'spa' language.
        (
            FulltextExtract,
            {
                "extension": "pdf",
                "method": "tesseract",
                "language": "spa",
                "errors": "ignore",
                "use_cache": True,
            },
        ),
        # Text normalization with its default options.
        (TextNormalize, {}),
        # Disabled: running the name lookup ruler as a pipeline step. The
        # rulers are applied by hand further down instead.
        # (
        #     SpacyRulerPipeline,
        #     {
        #         "base": "es",
        #         "steps": [
        #             ("name_lookup_ruler", {'country_codes': ['AR']})
        #         ],
        #     },
        # ),
    ],
    "models": [],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": False,
    # 'log_level': 'debug'
}

# Build the pipeline object from the configuration above.
pipeline = AymurAIPipeline(config)

In [None]:
# Run the configured preprocess steps over the training split.
preprocessed_train = pipeline.preprocess(train)

# tryout

In [None]:
import spacy

# Blank Spanish pipeline; no components are registered on it here (the
# add_pipe calls below are disabled).
nlp = spacy.blank('es')
# nlp.add_pipe('name_lookup_ruler')
# nlp.add_pipe('merge_entities')
# ruler = name_lookup(nlp, 'asd')

In [None]:
# Build both rulers by calling the factory functions directly instead of
# nlp.add_pipe; they are applied to documents by hand in the next cell.
ruler = name_lookup(nlp, 'names_lookup')
# NOTE(review): 'asd' looks like a placeholder component name.
join_entities = join_consecutive_name_entities(nlp, 'asd')

In [None]:
from spacy import displacy
import srsly

# Pick one preprocessed document and run the two rulers on it by hand.
# NOTE(review): 906 is a hard-coded sample index; it must exist in
# preprocessed_train.
item = preprocessed_train[906]
text = item['data']['doc.text']
# NOTE(review): this prints the whole item, which comes from the 'private'
# dataset -- clear the cell output before sharing the notebook.
print(srsly.yaml_dumps(item))
# Tokenize with the blank pipeline, tag known names, then merge adjacent names.
doc = nlp(text)
doc = ruler(doc)
doc = join_entities(doc)

# Visualize the resulting entity spans.
displacy.render(doc, 'ent')