In [None]:
!pip install tensorflow_hub tensorflow-gpu tensorflow_text tensorflow-addons scikit-multilearn iterative-stratification simpletransformers

In [None]:
# Load IPython's autoreload extension.
%load_ext autoreload
# NOTE(review): project-specific extension; what it registers is not visible in this notebook.
%load_ext aymurai.devtools.magic
# Autoreload mode 2: re-import edited modules before each cell is executed.
%autoreload 2

In [None]:
import locale

from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Switch every locale category of this process to Argentine Spanish (UTF-8).
locale.setlocale(category=locale.LC_ALL, locale='es_AR.UTF-8')
# Renderer instance kept at module level.
# NOTE(review): `render` is not used by any cell visible in this notebook.
render = DocRender()

In [None]:
# Load the two dataset variants ('private' and 'latest').
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)
public = ArgentinaJuzgadoPCyF10Dataset('latest', use_cache=True)

# Hold out 20% of the public set for testing, then 20% of what is left for
# validation. Note that train/test/val are reassigned further down, when the
# private dataset is split.
split_options = dict(test_size=0.2, random_state=22)
train, test = train_test_split(public, **split_options)
train, val = train_test_split(train, **split_options)

# Report the size of every collection.
print('private', len(private))
print('public', len(public))
print('---')
for split_label, split_items in (('train:', train), ('test:', test), ('val:', val)):
    print(split_label, len(split_items))

# Pipeline definition

In [None]:
from copy import deepcopy

import pandas as pd

from aymurai.meta.types import DataItem
from aymurai.meta.pipeline_interfaces import Transform

# Annotation columns used as document-level labels. ViolenceDocCategoryParser
# selects exactly these columns, so their order is the order of the values
# written to 'doc-cats'. The commented-out assignments are alternative label
# sets; the active one omits 'v_polit'.
# CATEGORIES = ['v_fisica', 'v_econ', 'v_psic', 'v_sex', 'v_soc', 'v_amb', 'v_simb', 'v_polit']
# CATEGORIES = ['violencia_de_genero', 'v_fisica', 'v_econ', 'v_psic', 'v_sex', 'v_soc', 'v_amb', 'v_simb', 'v_polit']
CATEGORIES = ['violencia_de_genero', 'v_fisica', 'v_econ', 'v_psic', 'v_sex', 'v_soc', 'v_amb', 'v_simb']
# CATEGORIES = ['v_fisica']

class ViolenceDocCategoryParser(Transform):
    """Derive document-level category flags from an item's annotations.

    The annotations are loaded into a DataFrame; for every name in
    `categories` the corresponding column is reduced with `any()`, and the
    result is stored as 0/1 under ``item['data']['doc-cats']``.
    """

    # Annotation columns to aggregate, in output order.
    categories = CATEGORIES

    def __call__(self, item: DataItem) -> DataItem:
        """Return a deep copy of `item` with 'doc-cats' added to its 'data'."""
        result = deepcopy(item)
        frame = pd.DataFrame(result['annotations'])
        # Column-wise any(): one boolean per category.
        flags = frame[self.categories].any().to_list()

        doc_cats = {}
        for name, flag in zip(self.categories, flags):
            doc_cats[f'{name}'] = int(flag)
        result['data']['doc-cats'] = doc_cats
        return result

In [None]:
from aymurai.spacy.models.core import SpacyModel
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract

# Options handed to the FulltextExtract step (pdf input, 'tesseract' method,
# Spanish language data, errors ignored, step-level cache enabled).
fulltext_extract_options = {
    "extension": "pdf",
    "method": "tesseract",
    "language": "spa",
    "errors": "ignore",
    "use_cache": True,
}

# Preprocessing steps in execution order, each as (transform class, kwargs).
preprocess_steps = [
    (FulltextExtract, fulltext_extract_options),
    (TextNormalize, {}),
    (ViolenceDocCategoryParser, {}),
]

# Preprocess-only pipeline: no models and no postprocessing are configured.
config = {
    "preprocess": preprocess_steps,
    "models": [],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": False,
    # 'log_level': 'debug'
}

pipeline = AymurAIPipeline(config)

# Build train dataset

In [None]:
from itertools import chain

from sklearn.utils.validation import _num_samples
from sklearn.utils import indexable, _safe_indexing
from sklearn.model_selection._split import _validate_shuffle_split
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit


def multilabel_train_test_split(*arrays,
                                test_size=None,
                                train_size=None,
                                random_state=None,
                                shuffle=True,
                                stratify=None):
    """
    Train test split for multilabel classification. Uses the algorithm from:
    'Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of Multi-Label Data'.

    Args:
        *arrays: indexables of equal length to split.
        test_size: forwarded to the split-size validation (default share 0.25
            when both sizes are None).
        train_size: forwarded to the split-size validation.
        random_state: seed of the split; honoured on both the stratified and
            the non-stratified path.
        shuffle: must be truthy when `stratify` is given.
        stratify: multilabel indicator matrix used for stratification. When
            None, the call is delegated to `train_test_split`.

    Returns:
        list: for every input array its train part followed by its test part,
        i.e. ``[a_train, a_test, b_train, b_test, ...]``.

    Raises:
        AssertionError: if `stratify` is given together with ``shuffle=False``.
        ValueError: if `stratify` is given but no arrays are passed.
    """
    if stratify is None:
        return train_test_split(*arrays, test_size=test_size, train_size=train_size,
                                random_state=random_state, stratify=None, shuffle=shuffle)

    assert shuffle, "Stratified train/test split is not implemented for shuffle=False"

    # Fail with a clear message instead of an IndexError on arrays[0] below.
    if len(arrays) == 0:
        raise ValueError("At least one array required as input")

    arrays = indexable(*arrays)
    n_samples = _num_samples(arrays[0])
    n_train, n_test = _validate_shuffle_split(
        n_samples, test_size, train_size, default_test_size=0.25
    )
    # Use the caller's seed; it used to be hard-coded, which silently ignored
    # the `random_state` argument.
    cv = MultilabelStratifiedShuffleSplit(
        test_size=n_test, train_size=n_train, random_state=random_state
    )
    train, test = next(cv.split(X=arrays[0], y=stratify))

    return list(
        chain.from_iterable(
            (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays
        )
    )

## Dataset from private data

In [None]:
import numpy as np

def _texts_and_labels(items):
    """Collect the text and the label vector of every preprocessed item.

    Args:
        items: re-iterable collection of preprocessed items, each exposing
            ``item['data']['doc.text']`` and ``item['data']['doc-cats']``.

    Returns:
        tuple: ``(texts, labels)`` as numpy arrays; each row of `labels` holds
        the values of the item's 'doc-cats' mapping in insertion order.
    """
    texts = np.array([item['data']['doc.text'] for item in items])
    labels = np.array([list(item['data']['doc-cats'].values()) for item in items])
    return texts, labels


dataset = private
preprocessed = pipeline.preprocess(dataset)

# Labels of the complete dataset; they drive the stratified splits below.
x, y = _texts_and_labels(preprocessed)

# Split the label matrix together with the items so that the train/validation
# split can be stratified too (it used to be a plain random split).
train, test, y_train_val, _y_holdout = multilabel_train_test_split(
    dataset, y, test_size=0.2, random_state=22, stratify=y
)
train, val = multilabel_train_test_split(
    train, test_size=0.2, random_state=22, stratify=y_train_val
)

print('train:', len(train))
print('test:', len(test))
print('val:', len(val))

preprocessed_train = pipeline.preprocess(train)
preprocessed_val = pipeline.preprocess(val)
preprocessed_test = pipeline.preprocess(test)

x_train, y_train = _texts_and_labels(preprocessed_train)
x_val, y_val = _texts_and_labels(preprocessed_val)
x_test, y_test = _texts_and_labels(preprocessed_test)

# Training

In [None]:
import logging

import pandas as pd
from simpletransformers.classification import ClassificationArgs, ClassificationModel

# Keep the "transformers" logger at WARNING so that only the rest of the
# INFO-level output configured below gets through.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

In [None]:
# One row per document: its text plus its 0/1 label vector as a plain Python list.
train_data = pd.DataFrame({'text': x_train, 'labels': y_train.tolist()})
eval_data = pd.DataFrame({'text': x_val, 'labels': y_val.tolist()})
# Preparing train data
# train_data = [
#     ["Aragorn", [1, 0, 0]],
#     ["Frodo", [0, 1, 1]],
#     ["Gimli", [1, 0, 1]],
# ]
# train_df = pd.DataFrame(train_data)
# train_df.columns = ["text", "labels"]

# # Preparing eval data
# eval_data = [
#     ["Legolas", [1, 0, 0]],
#     ["Merry", [0, 0, 1]],
#     ["Eomer", [1, 0, 0]],
# ]
# eval_df = pd.DataFrame(eval_data)
# eval_df.columns = ["text", "labels"]


In [None]:
from simpletransformers.classification import (
    MultiLabelClassificationArgs,
    MultiLabelClassificationModel,
)

from pathlib import Path

# Optional model configuration
model_args = MultiLabelClassificationArgs(num_train_epochs=1, sliding_window=True)
model_args.use_early_stopping = True
model_args.early_stopping_delta = 0.01
# model_args.early_stopping_metric = "eval_loss"
model_args.early_stopping_metric = "LRAP"
model_args.early_stopping_metric_minimize = False
model_args.early_stopping_patience = 5
model_args.evaluate_during_training_steps = 1000
# NOTE(review): `evaluate_during_training` is never enabled in this cell, so the
# early-stopping settings above presumably have no effect — confirm against the
# Simple Transformers documentation before relying on them.
model_args.regression = False
# model_args.overwrite_output_dir = True
model_args.output_dir = 'beto-01'


# class weights
# The first label keeps weight 1. Every other label is weighted by
# n_documents / n_positives, rescaled so the smallest of those weights is 1.
# A label without any positive example counts as one positive to avoid an
# infinite weight.
positives_per_label = y_train[:, 1:].sum(axis=0)
class_weights = len(y_train) / np.maximum(positives_per_label, 1)
if class_weights.size:
    class_weights /= class_weights.min()
class_weights = [1] + list(class_weights)

# Resume from the local checkpoint when it is present; otherwise start from the
# pretrained Spanish BERT so the notebook also runs on a fresh checkout.
PRETRAINED_MODEL = 'dccuchile/bert-base-spanish-wwm-uncased'
CHECKPOINT_DIR = 'beto-01/checkpoint-3231-epoch-1'
model_name = CHECKPOINT_DIR if Path(CHECKPOINT_DIR).is_dir() else PRETRAINED_MODEL
print('loading model weights from:', model_name)

# Create a MultiLabelClassificationModel
model = MultiLabelClassificationModel(
    'bert',
    model_name,
    # One output per category; keeps the model in sync with CATEGORIES.
    num_labels=len(CATEGORIES),
    args=model_args,
    pos_weight=class_weights,
)

In [None]:
# Train the model
# Fits `model` on train_data; eval_data is handed over as the evaluation frame.
# NOTE(review): output_dir is 'beto-01' and overwrite_output_dir is left commented
# out in the configuration cell — confirm that training into an already populated
# directory behaves as intended.
model.train_model(train_data, eval_df=eval_data)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, multilabel_confusion_matrix

print('TRAIN')

reference = y_train
predictions, raw_outputs = model.predict(train_data['text'])
# Every entry of raw_outputs is reduced with a max over its first axis, giving
# one score per label and document.
hypothesis = np.array([np.max(p, axis=0) for p in raw_outputs])
# Binarise once and reuse for both the confusion matrices and the report.
predicted = hypothesis > 0.5

# Size the grid from the number of categories instead of a fixed 3x4 layout,
# which left empty panels for 8 categories.
n_cols = 4
n_rows = max(1, -(-len(CATEGORIES) // n_cols))  # ceiling division
fig, subplot = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
axes = subplot.flatten()

confusion = multilabel_confusion_matrix(reference, predicted)
for ax, matrix, cat in zip(axes, confusion, CATEGORIES):
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_xlabel("hypothesis")
    ax.set_ylabel("reference")
    ax.set_xticklabels(["false", "true"])
    ax.set_yticklabels(["false", "true"])
    ax.set_title(cat)

# Hide any panel that did not receive a heatmap.
for ax in axes[min(len(confusion), len(CATEGORIES)):]:
    ax.set_visible(False)

plt.tight_layout()

report = classification_report(reference, predicted, output_dict=True, target_names=CATEGORIES)
pd.DataFrame(report).T

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, multilabel_confusion_matrix

print('eval')

reference = y_val
predictions, raw_outputs = model.predict(eval_data['text'])
# Every entry of raw_outputs is reduced with a max over its first axis, giving
# one score per label and document.
hypothesis = np.array([np.max(p, axis=0) for p in raw_outputs])
# Binarise once and reuse for both the confusion matrices and the report.
predicted = hypothesis > 0.5

# Size the grid from the number of categories instead of a fixed 3x4 layout,
# which left empty panels for 8 categories.
n_cols = 4
n_rows = max(1, -(-len(CATEGORIES) // n_cols))  # ceiling division
fig, subplot = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
axes = subplot.flatten()

confusion = multilabel_confusion_matrix(reference, predicted)
for ax, matrix, cat in zip(axes, confusion, CATEGORIES):
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_xlabel("hypothesis")
    ax.set_ylabel("reference")
    ax.set_xticklabels(["false", "true"])
    ax.set_yticklabels(["false", "true"])
    ax.set_title(cat)

# Hide any panel that did not receive a heatmap.
for ax in axes[min(len(confusion), len(CATEGORIES)):]:
    ax.set_visible(False)

plt.tight_layout()

report = classification_report(reference, predicted, output_dict=True, target_names=CATEGORIES)
pd.DataFrame(report).T

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, multilabel_confusion_matrix

print('test')

reference = y_test
predictions, raw_outputs = model.predict(x_test)
# Every entry of raw_outputs is reduced with a max over its first axis, giving
# one score per label and document.
hypothesis = np.array([np.max(p, axis=0) for p in raw_outputs])
# Binarise once and reuse for both the confusion matrices and the report.
predicted = hypothesis > 0.5

# Size the grid from the number of categories instead of a fixed 3x4 layout,
# which left empty panels for 8 categories.
n_cols = 4
n_rows = max(1, -(-len(CATEGORIES) // n_cols))  # ceiling division
fig, subplot = plt.subplots(n_rows, n_cols, figsize=(15, 4 * n_rows), squeeze=False)
axes = subplot.flatten()

confusion = multilabel_confusion_matrix(reference, predicted)
for ax, matrix, cat in zip(axes, confusion, CATEGORIES):
    sns.heatmap(matrix, annot=True, fmt='d', ax=ax)
    ax.set_xlabel("hypothesis")
    ax.set_ylabel("reference")
    ax.set_xticklabels(["false", "true"])
    ax.set_yticklabels(["false", "true"])
    ax.set_title(cat)

# Hide any panel that did not receive a heatmap.
for ax in axes[min(len(confusion), len(CATEGORIES)):]:
    ax.set_visible(False)

plt.tight_layout()

report = classification_report(reference, predicted, output_dict=True, target_names=CATEGORIES)
pd.DataFrame(report).T