In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
# Assigned before the flair / aymurai imports in the following cells.
# Presumably silences TensorFlow's native (C++) log output — TODO confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"

In [None]:
import flair
import logging


# Only let ERROR (and more severe) records through flair's logger;
# presumably to cut down on notebook output noise.
flair.logger.setLevel(logging.ERROR)

In [None]:
# Subcategory names for PERSONA_ACUSADA_NO_DETERMINADA entities, kept in
# alphabetical order. Presumably these are the values found in the
# `aymurai_label_subclass` annotation attribute — TODO confirm.
# NOTE(review): this list is not referenced by any other visible cell, and the
# `patterns` dict defined further down has no regex for `manifestantes`,
# `organo_jurisdiccional`, `usuario_de_chatstep`, `usuario_de_imgur`,
# `usuario_de_skout` or `usuario_de_tiktok`.
persona_acusada_no_determinada_categories = [
    "manifestantes",
    "organo_jurisdiccional",
    "pagina_web",
    "persona_juridica",
    "personal_policial",
    "usuario_de_chatstep",
    "usuario_de_cuenta_de_google",
    "usuario_de_facebook",
    "usuario_de_imgur",
    "usuario_de_instagram",
    "usuario_de_mercado_libre",
    "usuario_de_outlook",
    "usuario_de_skout",
    "usuario_de_skype",
    "usuario_de_tiktok",
    "usuario_de_twitter",
    "usuario_de_whatsapp",
    "usuario_de_youtube",
    "usuario_microsoft",
]


In [None]:
from aymurai.utils.display import DocRender
from aymurai.datasets.ar_juz_pcyf_10.annotations import (
    ArgentinaJuzgadoPCyF10LabelStudioAnnotations,
)
from aymurai.utils.misc import get_element

# Notebook-wide helpers: a document renderer and the annotated documents.
render = DocRender()

annotations_path = "/resources/data/restricted/annotations/20221130-bis/"
docs = ArgentinaJuzgadoPCyF10LabelStudioAnnotations(annotations_path).data

def has_persona_acusada_no_determinada(item):
    """Tell whether a document carries a PERSONA_ACUSADA_NO_DETERMINADA entity.

    Args:
        item: document whose entities are looked up under
            ``annotations -> entities`` via ``get_element``.

    Returns:
        True when at least one entity has that label; False otherwise,
        including when the lookup yields nothing.
    """
    entities = get_element(item, ['annotations', 'entities'])
    if not entities:
        return False

    # Read the label of every entity (no short-circuit) before testing membership.
    labels = [entity['label'] for entity in entities]
    return 'PERSONA_ACUSADA_NO_DETERMINADA' in labels

# Documents that contain at least one PERSONA_ACUSADA_NO_DETERMINADA entity.
filtered_docs = list(filter(has_persona_acusada_no_determinada, docs))
# Alternative source: docs = ArgentinaJuzgadoPCyF10DocsDataset().data
sample = docs[:10]

# Corpus size vs. number of documents kept by the filter.
print(len(docs), len(filtered_docs))

In [None]:
from more_itertools import collapse
import pandas as pd
from aymurai.utils.misc import get_element
import re
import unidecode


def normalize_text(text: str) -> str:
    """Lower-case ``text``, transliterate it with ``unidecode`` and drop separators.

    Runs of the characters ``_ - , ; :`` are deleted outright (not replaced
    by a space), so the pieces on either side end up joined.
    """
    ascii_lowered = unidecode.unidecode(text.lower())
    return re.sub(r'[_\-,;:]+', '', ascii_lowered)

# Gather the annotated entities of every document. The lookup goes through
# `get_element(...) or []`, the same defensive form used by
# `has_persona_acusada_no_determinada`, so a document without annotations
# contributes nothing instead of raising.
all_ents = (get_element(doc, ['annotations', 'entities']) or [] for doc in docs)
# Flatten the per-document lists into one stream of entity dicts.
all_ents = collapse(all_ents, base_type=dict)
# Materialised as a list so it can be inspected or iterated more than once.
filtered_ents = [
    ent for ent in all_ents if ent['label'] == 'PERSONA_ACUSADA_NO_DETERMINADA'
]

def extract_data(ent):
    """Build a ``(text, context, subclass)`` triple from one entity dict.

    ``context`` is the entity text surrounded by the last line of
    ``context_pre`` and the first line of ``context_post``, joined with
    single spaces. ``subclass`` is whatever ``get_element`` returns for
    ``attrs -> aymurai_label_subclass -> 0``.
    """
    entity_text = ent['text']
    subclass = get_element(ent, ['attrs','aymurai_label_subclass', 0])

    # Only the line the entity sits on: tail of what precedes, head of what follows.
    line_before = ent['context_pre'].rsplit('\n', 1)[-1]
    line_after = ent['context_post'].split('\n', 1)[0]

    return entity_text, f'{line_before} {entity_text} {line_after}', subclass

# One row per PERSONA_ACUSADA_NO_DETERMINADA entity.
data = pd.DataFrame(
    [extract_data(ent) for ent in filtered_ents],
    columns=['text', 'context', 'subcat'],
)
data

In [None]:
# Number of rows (entities) per annotated subcategory.
data.value_counts(subset='subcat')

In [None]:
# Regex heuristics, one per subcategory. They are searched against text that
# has already gone through `normalize_text` (lower-cased, transliterated to
# ASCII, with `_ - , ; :` removed), so every literal must be lower-case.
patterns = {
    # Could be improved using a company-name NER.
    # Lower-case literals: the searched text is lower-cased, so `S.R.L.` /
    # `S.A.` written in upper case could never match.
    # Lookarounds instead of `\W...\W`: a hit at the very start or end of the
    # text has no neighbouring character to consume but must still count.
    "persona_juridica": r"(?<!\w)(s\.?r\.?l\.?|s\.?a\.?|sociedad anonima|consorcio|laboratorio|asociacion civil)(?!\w)",
    "personal_policial": r"policia",
    # All user categories should somehow be looked up explicitly.
    "usuario_de_facebook": r"(facebook)",
    "usuario_de_cuenta_de_google": r"(gmail)",
    "usuario_de_instagram": r"(instagram)",
    "usuario_de_twitter": r"(twitter)",
    "usuario_de_outlook": r"(hotmail)",
    "usuario_de_skype": r"(skype)",
    "usuario_microsoft": r"(microsoft|hotmail|one drive)",
    "usuario_de_whatsapp": r"(whatsapp|telefonia|celular)",
    "usuario_de_youtube": r"(you\s?tube)",
    "usuario_de_mercado_libre": r"(mercado\s?libre)",
    # General URL pattern (https://stackoverflow.com/a/3809435).
    "pagina_web": r"(http(s)?:\/\/.)?(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)",
}


def find_persona_acusada_no_determinada(context: str) -> list:
    """Suggest subcategories whose regex matches the normalized context.

    Args:
        context: raw text to inspect; it is passed through
            ``normalize_text`` before any pattern is tried.

    Returns:
        The keys of ``patterns`` with at least one match, listed in the
        insertion order of ``patterns`` so the result is stable across runs.
        Empty list when nothing matches.
    """
    normalized_context = normalize_text(context)

    # Dict keys are unique, so no de-duplication is needed; a list keeps a
    # deterministic order where a set would not.
    # IGNORECASE: the text has been lower-cased, so a pattern written with
    # upper-case literals would otherwise never match.
    return [
        category
        for category, pattern in patterns.items()
        if re.search(pattern, normalized_context, flags=re.IGNORECASE)
    ]


In [None]:
# Eyeball the normalized "text || context" of every mercado libre sample.
subset = data.query('subcat == "usuario_de_mercado_libre"')
for entity_text, entity_context in zip(subset['text'], subset['context']):
    print(normalize_text(f"{entity_text} || {entity_context}"))

In [None]:
# Regex-based suggestions for every row, computed from its context line.
predicted_subcats = data['context'].apply(find_persona_acusada_no_determinada)
data['pred_subcat'] = predicted_subcats
data

In [None]:
# Print the raw text and context of every persona_juridica sample.
juridica_rows = data.query('subcat == "persona_juridica"')
for entity_text, entity_context in zip(juridica_rows['text'], juridica_rows['context']):
    print(entity_text, '|||', entity_context)

In [None]:
# Subcategories that currently have a regex.
list(patterns)

In [None]:

# import warnings
# import numpy as np

# from sklearn.preprocessing import OneHotEncoder, LabelEncoder
# from sklearn.metrics import top_k_accuracy_score

# warnings.filterwarnings("ignore")
# # le = LabelEncoder()
# le = LabelEncoder()
# S = le.fit_transform(data['subcat'])
# # print(S.reshape(-1, 1))

# le = OneHotEncoder(sparse=False, handle_unknown='ignore')
# le.fit_transform(S.reshape(-1,1))

# # le.fit(data[['subcat']])
# y_true = data['subcat'].fillna('')
# # y_true = le.transform(data['subcat'].fillna(''))
# # y_pred = [np.sum([le.transform(c) for c in cat]) for cat in data['pred_subcat']]
# # y_pred = [onehot_initialization()]

# y_pred = []
# for cat in data["pred_subcat"]:
#     a = np.sum([le.transform([[c]]) for c in cat], axis=0)
#     a = np.zeros(10, dtype=int) if not len(cat) else a.flatten().astype(int)
#     y_pred.append(a)
# y_pred = np.array(y_pred)
# y_pred


In [None]:

# topk_acc = top_k_accuracy_score(y_true, y_pred, k=1)
# topk_acc

# test pipeline

In [None]:
from aymurai.pipeline import AymurAIPipeline
from aymurai.models.flair.core import FlairModel
from aymurai.models.flair.utils import FlairTextNormalize
from aymurai.transforms.entity_subcategories.regex import RegexSubcategorizer


# Pipeline configuration. Each stage ("preprocess", "models", "postprocess")
# is a list of (class, kwargs) pairs; presumably AymurAIPipeline instantiates
# every class with its kwargs — TODO confirm against the pipeline code.
config = {
    "preprocess": [
        (FlairTextNormalize, {}),
    ],
    "models": [
        (
            FlairModel,
            {
                "basepath": "/resources/pipelines/examples/flair-simple/FlairModel",
                "split_doc": True,
                "device": "cpu",
            },
        )
    ],
    "postprocess": [
        # The regex subcategorizer under test, run with no extra arguments.
        (RegexSubcategorizer, {}),
    ],
    # "multiprocessing": {},
    "use_cache": False,
    # 'log_level': 'debug'
}

pipeline = AymurAIPipeline(config)

In [None]:
# A one-element slice: `doc` is a list holding the first filtered document
# (or an empty list when `filtered_docs` is empty), not a bare document.
doc = filtered_docs[:1]

# Run the three stages one by one, keeping each intermediate result in its
# own variable.
preprocessed = pipeline.preprocess(doc)
predicted = pipeline.predict(preprocessed)
postprocessed = pipeline.postprocess(predicted)

In [None]:
item = postprocessed[0]

# `or []`: tolerate a document for which the lookup yields nothing, the same
# defensive form used in `has_persona_acusada_no_determinada`; without it the
# comprehension below would raise a TypeError on None.
ents = get_element(item, ['predictions', 'entities']) or []
ents = [ent for ent in ents if ent['label'] == 'PERSONA_ACUSADA_NO_DETERMINADA']
print(ents)
print('---')
render(item)
