In [None]:
!pip install tensorflow_hub tensorflow-gpu tensorflow_text tensorflow-addons scikit-multilearn iterative-stratification

In [None]:
# Load the autoreload extension; it is switched to mode 2 below.
%load_ext autoreload
# Project-provided IPython extension.
# NOTE(review): what it registers is not visible from this notebook — confirm it is still needed.
%load_ext aymurai.devtools.magic
# Mode 2: reload all imported modules before executing each cell.
%autoreload 2

In [None]:
import locale

from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Switch every locale category to Argentine Spanish (UTF-8).
# Raises locale.Error if that locale is not installed on the host.
locale.setlocale(locale.LC_ALL, 'es_AR.UTF-8')
# Shared DocRender instance; it is not used by any of the cells visible in this notebook.
render = DocRender()

In [None]:
# Private corpus, keeping only the documents whose 'objeto_de_la_resolucion'
# metadata does not contain 'admisibilidad_prueba'.
docs = [
    doc
    for doc in ArgentinaJuzgadoPCyF10Dataset('private-docs', use_cache=True)
    if 'admisibilidad_prueba' not in doc['metadata']['objeto_de_la_resolucion']
]
# Public corpus, loaded unfiltered.
public = ArgentinaJuzgadoPCyF10Dataset('latest', use_cache=True)

# Random (non-stratified) split: 20% held out for test, then 20% of the
# remainder held out for validation. Both splits use the same seed.
train, test = train_test_split(docs, random_state=22, test_size=0.2)
train, val = train_test_split(train, random_state=22, test_size=0.2)

# Corpus sizes first, then the size of each split.
print('docs', len(docs))
print('public', len(public))
print('---')
print('train:', len(train))
print('test:', len(test))
print('val:', len(val))

# Pipeline definition

In [None]:
from copy import deepcopy

import pandas as pd

from aymurai.meta.types import DataItem
from aymurai.meta.pipeline_interfaces import Transform

# Annotation columns that ViolenceDocCategoryParser collapses into document-level
# labels. The commented-out assignments are alternative label sets; only the
# uncommented one is active.
# CATEGORIES = ['v_fisica', 'v_econ', 'v_psic', 'v_sex', 'v_soc', 'v_amb', 'v_simb', 'v_polit']
# CATEGORIES = ['violencia_de_genero', 'v_fisica', 'v_econ', 'v_psic', 'v_sex', 'v_soc', 'v_amb', 'v_simb', 'v_polit']
CATEGORIES = ['violencia_de_genero', 'v_fisica', 'v_econ', 'v_psic', 'v_sex', 'v_soc', 'v_amb', 'v_simb']
# CATEGORIES = ['v_fisica']

class ViolenceDocCategoryParser(Transform):
    """Derive document-level category flags from an item's annotations.

    The annotations of the item are loaded into a DataFrame, the columns named in
    `categories` are selected, and each column is reduced with `any()`. The
    resulting flags are stored as integers under `item['data']['doc-cats']`,
    keyed by category name.
    """

    # Category columns to read from the annotations, in output order.
    categories = CATEGORIES

    def __call__(self, item: DataItem) -> DataItem:
        """Return a deep copy of `item` with `data['doc-cats']` filled in.

        The input item is not modified.
        """
        result = deepcopy(item)

        # One flag per category: does any annotation row set it?
        frame = pd.DataFrame(result['annotations'])
        flags = frame[self.categories].any().to_list()

        doc_cats = {}
        for name, flag in zip(self.categories, flags):
            doc_cats[str(name)] = int(flag)

        result['data']['doc-cats'] = doc_cats
        return result

In [None]:
from aymurai.spacy.models.core import SpacyModel
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract

# Preprocess-only pipeline configuration: three preprocess steps are listed
# (FulltextExtract, TextNormalize, ViolenceDocCategoryParser) and no models or
# postprocess steps are configured.
config = dict(
    preprocess=[
        (FulltextExtract, dict(errors="ignore", use_cache=True)),
        (TextNormalize, dict()),
        (ViolenceDocCategoryParser, dict()),
    ],
    models=[],
    postprocess=[],
    multiprocessing=dict(),
    use_cache=False,
    # log_level='debug',
)

pipeline = AymurAIPipeline(config)

# Build train dataset

In [None]:
from itertools import chain

from sklearn.utils.validation import _num_samples
from sklearn.utils import indexable, _safe_indexing
from sklearn.model_selection._split import _validate_shuffle_split
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit


def multilabel_train_test_split(*arrays,
                                test_size=None,
                                train_size=None,
                                random_state=None,
                                shuffle=True,
                                stratify=None):
    """
    Train/test split for multilabel classification.

    Without `stratify` this delegates to scikit-learn's `train_test_split`.
    With `stratify` it draws a single split from `MultilabelStratifiedShuffleSplit`,
    which implements the algorithm from:
    'Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of Multi-Label Data'.

    Args:
        *arrays: Indexable sequences of equal length to split.
        test_size: Test size, as accepted by `train_test_split`. When both sizes
            are None the stratified path falls back to 0.25.
        train_size: Train size, as accepted by `train_test_split`.
        random_state: Seed (or random state) controlling the split in both the
            stratified and the non-stratified path.
        shuffle: Must be truthy when `stratify` is given.
        stratify: Label matrix passed as `y` to the stratified splitter, or None
            for a plain split.

    Returns:
        A list `[a_train, a_test, b_train, b_test, ...]` with one train/test pair
        per input array.

    Raises:
        AssertionError: If `stratify` is given together with `shuffle=False`.
        ValueError: If `stratify` is given and no arrays are passed.
    """
    if stratify is None:
        return train_test_split(*arrays, test_size=test_size, train_size=train_size,
                                random_state=random_state, stratify=None, shuffle=shuffle)

    assert shuffle, "Stratified train/test split is not implemented for shuffle=False"

    # Fail with a clear message instead of an IndexError on arrays[0] below.
    if len(arrays) == 0:
        raise ValueError("At least one array required as input")

    arrays = indexable(*arrays)
    n_samples = _num_samples(arrays[0])
    # Resolve fractional / None sizes into absolute sample counts.
    n_train, n_test = _validate_shuffle_split(
        n_samples, test_size, train_size, default_test_size=0.25
    )
    # Use the caller-supplied seed so `random_state` controls the stratified split too.
    cv = MultilabelStratifiedShuffleSplit(
        test_size=n_test, train_size=n_train, random_state=random_state
    )
    train, test = next(cv.split(X=arrays[0], y=stratify))

    # Interleave the train/test slices of every input array, as train_test_split does.
    return list(
        chain.from_iterable(
            (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays
        )
    )

## Dataset from private data

In [None]:
import numpy as np

dataset = docs
preprocessed = pipeline.preprocess(dataset)

# Texts and multi-hot label rows for the whole corpus; `y` is what the
# stratified split below is balanced on.
x = np.array([item['data']['doc.text'] for item in preprocessed])
y = np.array([list(item['data']['doc-cats'].values()) for item in preprocessed])

# Label-stratified 80/20 train/test split, followed by a plain (non-stratified)
# 80/20 train/validation split of the training portion.
train, test = multilabel_train_test_split(dataset, stratify=y, test_size=0.2, random_state=22)
train, val = multilabel_train_test_split(train, test_size=0.2, random_state=22)

print('train:', len(train))
print('test:', len(test))
print('val:', len(val))

# Each split is run through the preprocessing pipeline on its own.
preprocessed_train = pipeline.preprocess(train)
preprocessed_val = pipeline.preprocess(val)
preprocessed_test = pipeline.preprocess(test)

import numpy as np

# Model inputs (document text) and targets (one 0/1 flag per category) per split.
x_train = np.array([item['data']['doc.text'] for item in preprocessed_train])
y_train = np.array([list(item['data']['doc-cats'].values()) for item in preprocessed_train])

x_val = np.array([item['data']['doc.text'] for item in preprocessed_val])
y_val = np.array([list(item['data']['doc-cats'].values()) for item in preprocessed_val])

x_test = np.array([item['data']['doc.text'] for item in preprocessed_test])
y_test = np.array([list(item['data']['doc-cats'].values()) for item in preprocessed_test])

# Training

## build vocab

In [None]:
from itertools import chain

from more_itertools import collapse

# Set of lower-cased, whitespace-delimited tokens found in any of the three splits.
vocab = {
    token
    for document in chain(x_train, x_val, x_test)
    for token in document.lower().split()
}
len(vocab)

## class weights

In [None]:


# Weight per label index: one minus the fraction of training documents that
# carry the label, so labels that occur less often get a larger weight.
class_weights = dict(enumerate(1 - y_train.sum(axis=0) / len(y_train)))
class_weights

# class_weights = {1: y_train.sum()/len(y_train)}
# class_weights[0] = 1 - class_weights[1]

## training routine

In [None]:
from typing import Iterator

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
from tqdm.auto import tqdm
import tensorflow_text as text
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
from more_itertools import flatten
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input, TextVectorization
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Number of output labels, read from the last axis of the training targets.
num_classes = y_train.shape[-1]
# The targets are multi-hot and both models end in a sigmoid layer, so every label
# has to be thresholded on its own. With the default `threshold=None`, F1Score only
# marks the argmax of each prediction as positive, which scores the task as if it
# were single-label. 0.5 matches the cut-off used in the evaluation cells.
f1_score = tfa.metrics.F1Score(
    num_classes=num_classes, average="macro", threshold=0.5, name="f1_score"
)
# Optimizer instance shared by the model builders below.
adamw = tf.keras.optimizers.experimental.AdamW()


# TF-IDF vectorizer over n-grams up to length 2, with the vocabulary capped at the
# number of distinct tokens collected in `vocab`.
text_vectorizer = TextVectorization(
    max_tokens=len(vocab),
    ngrams=2,
    output_mode="tf_idf",
)
# Fit the vectorizer on the training texts only; the adaptation is pinned to the CPU.
with tf.device("/CPU:0"):
    text_vectorizer.adapt(x_train)

def get_model_tfidf(encoder_trainable: bool = False) -> Model:
    """Build and compile a dense classifier on top of the TF-IDF vectorizer.

    Raw strings are vectorized by the module-level `text_vectorizer`, passed
    through three ReLU dense layers (1024, 512 and 128 units) and a sigmoid output
    layer with `num_classes` units.

    Args:
        encoder_trainable: Not used by this builder; accepted so it can be called
            with the same arguments as `get_model`.

    Returns:
        The compiled Keras model (binary cross-entropy loss, module-level `adamw`
        optimizer, `categorical_accuracy` and `f1_score` metrics).
    """
    input_ = Input(shape=[], dtype=tf.string)
    x = text_vectorizer(input_)
    x = Dense(1024, activation="relu")(x)
    x = Dense(512, activation="relu")(x)
    x = Dense(128, activation="relu")(x)

    # One independent sigmoid per label (multi-label output), not a softmax.
    output = Dense(num_classes, activation="sigmoid")(x)
    model = Model(inputs=[input_], outputs=output)

    # NOTE(review): "categorical_accuracy" compares the argmax of targets and
    # predictions; for multi-hot targets "binary_accuracy" may be the metric that
    # was intended — confirm before relying on the reported accuracy.
    model.compile(
        loss="binary_crossentropy",
        optimizer=adamw,
        metrics=["categorical_accuracy", f1_score],
    )
    return model

def get_model(encoder_trainable: bool = False) -> Model:
    """Build and compile a classifier on top of a TF Hub text encoder.

    Raw strings are embedded with the `nnlm-es-dim128` TF Hub module, passed
    through two ReLU dense layers as wide as the embedding, and mapped to
    `num_classes` sigmoid outputs.

    Args:
        encoder_trainable: Forwarded as `trainable` to the hub layer.

    Returns:
        The compiled Keras model (binary cross-entropy loss, module-level `adamw`
        optimizer, `categorical_accuracy` and `f1_score` metrics).
    """
    text_input = Input(shape=[], dtype=tf.string)

    # Alternative TF Hub encoders, left here for reference:
    #   https://tfhub.dev/google/sentence-t5/st5-base/1
    #   https://tfhub.dev/google/universal-sentence-encoder-large/5
    #   https://tfhub.dev/google/universal-sentence-encoder/4
    #   https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
    encoder = hub.KerasLayer(
        "https://tfhub.dev/google/nnlm-es-dim128/2",
        trainable=encoder_trainable,
    )
    hidden = encoder(text_input)

    # Two hidden layers sized after the embedding dimension.
    width = hidden.shape[1]
    for _ in range(2):
        hidden = Dense(width, activation="relu")(hidden)

    # Independent sigmoid per label; a single-label variant would use a softmax
    # output with a categorical_crossentropy loss instead.
    probabilities = Dense(num_classes, activation="sigmoid")(hidden)
    classifier = Model(inputs=[text_input], outputs=probabilities)
    classifier.compile(
        optimizer=adamw,
        loss="binary_crossentropy",
        metrics=["categorical_accuracy", f1_score],
    )
    return classifier


# Alternative builder: model = get_model_tfidf(encoder_trainable=True)
model = get_model(encoder_trainable=True)
model.summary()

# Stop once val_loss has not improved for 15 epochs and restore the best weights.
callbacks = [
    # ModelCheckpoint("glove_embeddings_sequence_model.keras", save_best_only=True, save_format='tf'),
    EarlyStopping(monitor="val_loss", patience=15, restore_best_weights=True),
]

# NOTE(review): Keras documents `class_weight` for one-hot or sparse targets; the
# targets here are multi-hot, so confirm the weights are applied per label as intended.
model.fit(
    x=x_train,
    y=y_train,
    epochs=50,
    validation_data=(x_val, y_val),
    class_weight=class_weights,
    callbacks=callbacks,
)


# Evaluation

## Private dataset

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, multilabel_confusion_matrix

print('TRAIN')

# Per-label scores predicted for the train split and the matching targets.
hypothesis = model.predict(x_train)
reference = y_train
# Binarize once so the confusion matrices and the report share the same decisions.
predicted = hypothesis > 0.5

fig, subplot = plt.subplots(3, 4, figsize=(15, 8))
axes = subplot.flatten()

# One 2x2 confusion matrix per label, drawn in its own panel.
confusion = multilabel_confusion_matrix(reference, predicted)
for ax, matrix, cat in zip(axes, confusion, CATEGORIES):
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_xlabel("hypothesis")
    ax.set_ylabel("reference")
    ax.set_xticklabels(["false", "true"])
    ax.set_yticklabels(["false", "true"])
    ax.set_title(cat)

# The grid has more panels than there are labels; hide the ones left empty.
for ax in axes[min(len(confusion), len(CATEGORIES)):]:
    ax.set_visible(False)

plt.tight_layout()

# Per-label precision / recall / F1 as a table (one row per label and average).
report = classification_report(reference, predicted, output_dict=True, target_names=CATEGORIES)
pd.DataFrame(report).T

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, multilabel_confusion_matrix

print('VALIDATION')

# Per-label scores predicted for the validation split and the matching targets.
hypothesis = model.predict(x_val)
reference = y_val
# Binarize once so the confusion matrices and the report share the same decisions.
predicted = hypothesis > 0.5

fig, subplot = plt.subplots(3, 4, figsize=(15, 8))
axes = subplot.flatten()

# One 2x2 confusion matrix per label, drawn in its own panel.
confusion = multilabel_confusion_matrix(reference, predicted)
for ax, matrix, cat in zip(axes, confusion, CATEGORIES):
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_xlabel("hypothesis")
    ax.set_ylabel("reference")
    ax.set_xticklabels(["false", "true"])
    ax.set_yticklabels(["false", "true"])
    ax.set_title(cat)

# The grid has more panels than there are labels; hide the ones left empty.
for ax in axes[min(len(confusion), len(CATEGORIES)):]:
    ax.set_visible(False)

plt.tight_layout()

# Per-label precision / recall / F1 as a table (one row per label and average).
report = classification_report(reference, predicted, output_dict=True, target_names=CATEGORIES)
pd.DataFrame(report).T

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, multilabel_confusion_matrix

print('TEST')

# Per-label scores predicted for the test split and the matching targets.
hypothesis = model.predict(x_test)
reference = y_test
# Binarize once so the confusion matrices and the report share the same decisions.
predicted = hypothesis > 0.5

fig, subplot = plt.subplots(3, 4, figsize=(15, 8))
axes = subplot.flatten()

# One 2x2 confusion matrix per label, drawn in its own panel.
confusion = multilabel_confusion_matrix(reference, predicted)
for ax, matrix, cat in zip(axes, confusion, CATEGORIES):
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_xlabel("hypothesis")
    ax.set_ylabel("reference")
    ax.set_xticklabels(["false", "true"])
    ax.set_yticklabels(["false", "true"])
    ax.set_title(cat)

# The grid has more panels than there are labels; hide the ones left empty.
for ax in axes[min(len(confusion), len(CATEGORIES)):]:
    ax.set_visible(False)

plt.tight_layout()

# Per-label precision / recall / F1 as a table (one row per label and average).
report = classification_report(reference, predicted, output_dict=True, target_names=CATEGORIES)
pd.DataFrame(report).T



## Public dataset

In [None]:
import numpy as np

dataset = public
preprocessed = pipeline.preprocess(dataset)

# Texts and multi-hot label rows for the whole public corpus; `y` is what the
# stratified split below is balanced on.
x = np.array([item['data']['doc.text'] for item in preprocessed])
y = np.array([list(item['data']['doc-cats'].values()) for item in preprocessed])

# Label-stratified 80/20 train/test split, followed by a plain (non-stratified)
# 80/20 train/validation split of the training portion.
train, test = multilabel_train_test_split(dataset, stratify=y, test_size=0.2, random_state=22)
train, val = multilabel_train_test_split(train, test_size=0.2, random_state=22)

print('train:', len(train))
print('test:', len(test))
print('val:', len(val))

# Each split is run through the preprocessing pipeline on its own.
preprocessed_train = pipeline.preprocess(train)
preprocessed_val = pipeline.preprocess(val)
preprocessed_test = pipeline.preprocess(test)

import numpy as np

# Note: these assignments replace the x_*/y_* arrays built from the private corpus.
x_train = np.array([item['data']['doc.text'] for item in preprocessed_train])
y_train = np.array([list(item['data']['doc-cats'].values()) for item in preprocessed_train])

x_val = np.array([item['data']['doc.text'] for item in preprocessed_val])
y_val = np.array([list(item['data']['doc-cats'].values()) for item in preprocessed_val])

x_test = np.array([item['data']['doc.text'] for item in preprocessed_test])
y_test = np.array([list(item['data']['doc-cats'].values()) for item in preprocessed_test])

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, multilabel_confusion_matrix

print('TRAIN')

# Per-label scores predicted for the train split and the matching targets.
hypothesis = model.predict(x_train)
reference = y_train
# Binarize once so the confusion matrices and the report share the same decisions.
predicted = hypothesis > 0.5

fig, subplot = plt.subplots(3, 4, figsize=(15, 8))
axes = subplot.flatten()

# One 2x2 confusion matrix per label, drawn in its own panel.
confusion = multilabel_confusion_matrix(reference, predicted)
for ax, matrix, cat in zip(axes, confusion, CATEGORIES):
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_xlabel("hypothesis")
    ax.set_ylabel("reference")
    ax.set_xticklabels(["false", "true"])
    ax.set_yticklabels(["false", "true"])
    ax.set_title(cat)

# The grid has more panels than there are labels; hide the ones left empty.
for ax in axes[min(len(confusion), len(CATEGORIES)):]:
    ax.set_visible(False)

plt.tight_layout()

# Per-label precision / recall / F1 as a table (one row per label and average).
report = classification_report(reference, predicted, output_dict=True, target_names=CATEGORIES)
pd.DataFrame(report).T

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, multilabel_confusion_matrix

print('VALIDATION')

# Per-label scores predicted for the validation split and the matching targets.
hypothesis = model.predict(x_val)
reference = y_val
# Binarize once so the confusion matrices and the report share the same decisions.
predicted = hypothesis > 0.5

fig, subplot = plt.subplots(3, 4, figsize=(15, 8))
axes = subplot.flatten()

# One 2x2 confusion matrix per label, drawn in its own panel.
confusion = multilabel_confusion_matrix(reference, predicted)
for ax, matrix, cat in zip(axes, confusion, CATEGORIES):
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_xlabel("hypothesis")
    ax.set_ylabel("reference")
    ax.set_xticklabels(["false", "true"])
    ax.set_yticklabels(["false", "true"])
    ax.set_title(cat)

# The grid has more panels than there are labels; hide the ones left empty.
for ax in axes[min(len(confusion), len(CATEGORIES)):]:
    ax.set_visible(False)

plt.tight_layout()

# Per-label precision / recall / F1 as a table (one row per label and average).
report = classification_report(reference, predicted, output_dict=True, target_names=CATEGORIES)
pd.DataFrame(report).T

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, multilabel_confusion_matrix

print('TEST')

# Per-label scores predicted for the test split and the matching targets.
hypothesis = model.predict(x_test)
reference = y_test
# Binarize once so the confusion matrices and the report share the same decisions.
predicted = hypothesis > 0.5

fig, subplot = plt.subplots(3, 4, figsize=(15, 8))
axes = subplot.flatten()

# One 2x2 confusion matrix per label, drawn in its own panel.
confusion = multilabel_confusion_matrix(reference, predicted)
for ax, matrix, cat in zip(axes, confusion, CATEGORIES):
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_xlabel("hypothesis")
    ax.set_ylabel("reference")
    ax.set_xticklabels(["false", "true"])
    ax.set_yticklabels(["false", "true"])
    ax.set_title(cat)

# The grid has more panels than there are labels; hide the ones left empty.
for ax in axes[min(len(confusion), len(CATEGORIES)):]:
    ax.set_visible(False)

plt.tight_layout()

# Per-label precision / recall / F1 as a table (one row per label and average).
report = classification_report(reference, predicted, output_dict=True, target_names=CATEGORIES)
pd.DataFrame(report).T

