In [None]:
!pip install tensorflow_hub tensorflow-gpu tensorflow_text tensorflow-addons tf-sentencepiece

In [None]:
# Enable autoreload so edited modules are re-imported before each cell runs.
%load_ext autoreload
# Project-provided IPython extension (presumably registers aymurai dev magics; TODO confirm).
%load_ext aymurai.devtools.magic
# Mode 2: reload all modules, not only those explicitly marked for reload.
%autoreload 2

In [None]:
import locale

from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Switch every locale category to Argentinian Spanish (UTF-8).
# NOTE: raises locale.Error if 'es_AR.UTF-8' is not installed on the host.
locale.setlocale(locale.LC_ALL, 'es_AR.UTF-8')
# Document renderer from aymurai.spacy.display; not used by the cells visible below.
render = DocRender()

In [None]:
# Load the private dataset and carve it into train / validation / test subsets.
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)

# Hold out 20% for testing first, then 20% of what remains for validation.
# Both splits use the same fixed random_state so they are repeatable.
train_and_val, test = train_test_split(private, test_size=0.2, random_state=22)
train, val = train_test_split(train_and_val, test_size=0.2, random_state=22)

# Report the size of each subset.
for label, subset in (('train:', train), ('test:', test), ('val:', val)):
    print(label, len(subset))

# Pipeline definition

In [None]:
from copy import deepcopy

import pandas as pd

from aymurai.meta.types import DataItem
from aymurai.meta.pipeline_interfaces import Transform


class ViolenceDocCategoryParser(Transform):
    """Derive document-level category flags from an item's annotations.

    For every name in ``categories`` the flag is 1 when at least one
    annotation row holds a truthy value in the column of that name, else 0.
    The flags are stored under ``item['data']['doc-cats']``.
    """

    # Annotation columns that are collapsed into document-level flags.
    categories = ['violencia_de_genero']

    def __call__(self, item: DataItem) -> DataItem:
        """Return a deep copy of ``item`` with ``data['doc-cats']`` filled in.

        Args:
            item: data item exposing ``item['annotations']`` (anything
                ``pandas.DataFrame`` accepts) and a ``item['data']`` mapping.

        Returns:
            A copy of ``item``; the input object is left untouched.

        Categories whose column is absent from the annotations (including an
        item without any annotation rows) are reported as 0 instead of
        raising ``KeyError``.
        """
        item = deepcopy(item)
        annotations = pd.DataFrame(item['annotations'])

        # Aggregate only the columns that actually exist; selecting a missing
        # column (e.g. on an empty annotation table) would raise KeyError.
        present = [cat for cat in self.categories if cat in annotations.columns]
        flags = dict(zip(present, annotations[present].any().to_list()))

        item['data']['doc-cats'] = {
            cat: int(flags.get(cat, False)) for cat in self.categories
        }
        return item

In [None]:
from aymurai.spacy.models.core import SpacyModel
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract

# Pipeline configuration handed to AymurAIPipeline below.
# Each stage entry is a (class, keyword-arguments) pair.
config = {
    # Preprocessing stages (presumably applied in the listed order; TODO confirm).
    "preprocess": [
        (
            # Full-text extraction; the option names suggest OCR of PDF files
            # with Tesseract in Spanish (semantics live in FulltextExtract).
            FulltextExtract,
            {
                "extension": "pdf",
                "method": "tesseract",
                "language": "spa",
                "errors": "ignore",
                "use_cache": True,
            },
        ),
        (TextNormalize, {}),
        # Adds the document-level 'doc-cats' flags consumed by the training cells.
        (ViolenceDocCategoryParser, {}),
    ],
    # No pipeline-level models: the classifier is built with Keras further down.
    "models": [
    ],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": False,
    # Optional verbose logging, left disabled:
    # 'log_level': 'debug'
}

pipeline = AymurAIPipeline(config)

# Training

In [None]:
# Run every split through the pipeline's preprocessing stage,
# in train -> val -> test order.
preprocessed_train, preprocessed_val, preprocessed_test = (
    pipeline.preprocess(split) for split in (train, val, test)
)


In [None]:
import numpy as np

def split_features_and_labels(items, category='violencia_de_genero'):
    """Turn preprocessed items into parallel text / label arrays.

    Args:
        items: re-iterable collection of preprocessed items; each one must
            expose ``item['data']['doc.text']`` and
            ``item['data']['doc-cats'][category]``.
        category: key of the document-level flag used as the label.

    Returns:
        ``(texts, labels)`` as two NumPy arrays with one entry per item.
    """
    texts = np.array([item['data']['doc.text'] for item in items])
    labels = np.array([item['data']['doc-cats'][category] for item in items])
    return texts, labels


# One helper call per split replaces six copy-pasted map/lambda pipelines.
x_train, y_train = split_features_and_labels(preprocessed_train)
x_val, y_val = split_features_and_labels(preprocessed_val)
x_test, y_test = split_features_and_labels(preprocessed_test)

In [None]:
# Loss weights passed to `model.fit(class_weight=...)`.
# Each class is weighted by the frequency of the *other* class, so the rarer
# class contributes more to the loss. The previous version weighted each class
# by its own frequency, which amplified the imbalance instead of offsetting it.
positive_rate = float(y_train.mean())
class_weights = {0: positive_rate, 1: 1.0 - positive_rate}

In [None]:
from typing import Iterator

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
from tqdm.auto import tqdm
import tensorflow_text as text
import matplotlib.pyplot as plt
import tensorflow_addons as tfa
from more_itertools import flatten
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import f1_score, confusion_matrix, classification_report

# F1 metric for the single sigmoid output. An explicit threshold is required:
# with threshold=None the metric marks the argmax of each row as positive, and
# with one output column that is always the case, so F1 would be computed as
# if every document were predicted positive.
# NOTE: this name shadows sklearn's `f1_score` imported above.
f1_score = tfa.metrics.F1Score(
    num_classes=1, average="macro", threshold=0.5, name="f1_score"
)
# Optimizer instance used by get_model() when compiling the classifier.
adamw = tf.keras.optimizers.experimental.AdamW()

def get_model(encoder_trainable: bool = False) -> Model:
    """Assemble and compile the binary document classifier.

    Architecture: raw string input -> TF-Hub text embedding -> two ReLU dense
    layers as wide as the embedding -> a single sigmoid unit.

    Args:
        encoder_trainable: whether the TF-Hub embedding layer is fine-tuned
            together with the dense head.

    Returns:
        The model compiled with binary cross-entropy, the module-level
        ``adamw`` optimizer and the accuracy / ``f1_score`` metrics.
    """
    text_input = Input(shape=[], dtype=tf.string)

    # Alternative encoders left disabled by the author:
    #   https://tfhub.dev/google/sentence-t5/st5-base/1
    #   https://tfhub.dev/google/universal-sentence-encoder-large/5
    #   https://tfhub.dev/google/universal-sentence-encoder/4
    #   https://tfhub.dev/google/universal-sentence-encoder-multilingual/3
    #   https://tfhub.dev/google/universal-sentence-encoder-cmlm/multilingual-base/1
    encoder = hub.KerasLayer(
        "https://tfhub.dev/google/nnlm-es-dim128/2",
        trainable=encoder_trainable,
    )
    hidden = encoder(text_input)

    # Two hidden layers, each matching the embedding width.
    width = hidden.shape[1]
    for _ in range(2):
        hidden = Dense(width, activation="relu")(hidden)

    probability = Dense(1, activation="sigmoid")(hidden)

    classifier = Model(inputs=[text_input], outputs=probability)
    classifier.compile(
        loss="binary_crossentropy",
        optimizer=adamw,
        metrics=["accuracy", f1_score],
    )
    return classifier


# Seed Python, NumPy and TensorFlow before any weights are created so that
# layer initialisation and batch shuffling are repeatable between runs
# (same value as the random_state used for the dataset split; GPU kernels may
# still introduce small non-determinism).
tf.keras.utils.set_random_seed(22)

model = get_model(encoder_trainable=True)
model.summary()

callbacks = [
    # Keep only the best model seen so far (ModelCheckpoint's default monitor is val_loss).
    ModelCheckpoint("glove_embeddings_sequence_model.keras", save_best_only=True),
    # Stop after 15 epochs without val_loss improvement and roll back to the best weights.
    EarlyStopping(patience=15, monitor="val_loss", restore_best_weights=True),
]
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=50,
    callbacks=callbacks,
    class_weight=class_weights,
)


# Evaluation

## private

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def _plot_split_report(matrix_ax, report_ax, features, labels):
    """Draw the confusion matrix and classification report of one split.

    Args:
        matrix_ax: axes receiving the confusion-matrix heatmap.
        report_ax: axes receiving the textual classification report.
        features: input texts fed to the global ``model``.
        labels: reference 0/1 labels aligned with ``features``.
    """
    # Threshold the sigmoid scores at 0.5 and flatten (n, 1) -> (n,).
    predicted = (model.predict(features) > 0.5).astype(int).ravel()
    report = classification_report(labels, predicted)
    # labels=[0, 1] keeps the matrix 2x2 even when a split lacks one class,
    # so the two tick labels set below always match the heatmap ticks.
    confusion = confusion_matrix(labels, predicted, labels=[0, 1])
    sns.heatmap(confusion, ax=matrix_ax, annot=True, fmt="d")
    report_ax.text(0, 0.5, report, fontfamily='monospace')


# Top row: confusion matrices; bottom row: classification reports.
fig, subplot = plt.subplots(2, 3, figsize=(20, 10))

# (progress label, plot title, features, labels), one entry per column.
splits = [
    ("TRAIN", "TRAIN", x_train, y_train),
    ("val", "VALIDATION", x_val, y_val),
    ("TEST", "TEST", x_test, y_test),
]
for column, (log_label, title, features, labels) in enumerate(splits):
    print(log_label)
    _plot_split_report(subplot[0, column], subplot[1, column], features, labels)
    subplot[0, column].set_title(title)

for ax in subplot[0]:
    ax.set_xlabel("hypothesis")
    ax.set_ylabel("reference")
    ax.set_xticklabels(["false", "true"])
    ax.set_yticklabels(["false", "true"])

# The report panels only hold text, so hide their axes.
for ax in subplot[1]:
    ax.axis('off')

## public

In [None]:
public = ArgentinaJuzgadoPCyF10Dataset('latest', use_cache=True)
preprocessed = pipeline.preprocess(public)

x_train = map(lambda x: x['data']['doc.text'], preprocessed)
x_train = np.array(list(x_train))
y_train = map(lambda x: x['data']['doc-cats']['violencia_de_genero'], preprocessed)
y_train = np.array(list(y_train))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

fig, subplot = plt.subplots(2, 3, figsize=(20, 10))

print("PUBLIC")
hypothesis = model.predict(x_train)
reference = y_train
report = classification_report(reference, hypothesis > 0.5)
confusion = confusion_matrix(reference, hypothesis > 0.5)
sns.heatmap(confusion, ax=subplot[0, 0], annot=True, fmt="d")
subplot[1, 0].text(0, 0.5, report, fontfamily='monospace')
# print(report)


subplot[0, 0].set_title("PUBLIC")
for ax in subplot[0]:
    ax.set_xlabel("hypothesis")
    ax.set_ylabel("reference")
    ax.set_xticklabels(["false", "true"])
    ax.set_yticklabels(["false", "true"])

for ax in subplot[1]:
    ax.axis('off')

In [None]:
# Persist the trained classifier at a fixed absolute path.
# NOTE(review): the path is hard-coded and named "usem-..." although get_model()
# currently loads the nnlm-es-dim128 encoder; confirm the intended name.
model.save('/resources/checkpoints/usem-gender-violence-binary-cat')

In [None]:
# Reload the classifier from the path written by the save cell, replacing the
# in-memory `model`.
# NOTE(review): the model contains a TF-Hub layer and a tensorflow_addons
# metric; loading may need those packages imported first (TODO confirm).
model = tf.keras.models.load_model('/resources/checkpoints/usem-gender-violence-binary-cat')