In [None]:
!pip install  transformers==4.20.1
!pip install --no-deps git+https://github.com/Shivanandroy/simpleT5.git@4c1afee10bf822ab711660cf7fe20595f1f368e1

In [None]:
# List installed packages whose freeze line contains "transformers" or "simple",
# to confirm which versions are actually present.
!pip freeze | grep -e transformers -e simple

In [None]:
# Enable the IPython autoreload extension.
%load_ext autoreload
# Project-provided IPython extension.
# NOTE(review): what it registers is not visible from this notebook — confirm.
%load_ext aymurai.devtools
# Mode 2: reload all modules before executing each cell.
%autoreload 2

In [None]:
import locale

from sklearn.model_selection import train_test_split

from aymurai.spacy.display import DocRender
from aymurai.pipeline import AymurAIPipeline
from aymurai.datasets.ar_juz_pcyf_10 import ArgentinaJuzgadoPCyF10Dataset

# Switch every locale category to Argentine Spanish (UTF-8).
# setlocale raises locale.Error when this locale is not installed on the host.
locale.setlocale(locale.LC_ALL, 'es_AR.UTF-8')
# Document renderer instance from aymurai.spacy.display.
render = DocRender()

In [None]:
# Instantiate the dataset with the 'private' argument and caching enabled.
private = ArgentinaJuzgadoPCyF10Dataset('private', use_cache=True)

# Hold out 20% for testing, then 20% of the remainder for validation.
# Both splits use the same fixed seed.
train, test = train_test_split(private, test_size=0.2, random_state=22)
train, val = train_test_split(train, test_size=0.2, random_state=22)

# Report the size of every partition.
for split_name, split_items in (('train', train), ('test', test), ('val', val)):
    print(f'{split_name}:', len(split_items))

In [None]:
import aymurai.spacy.components
from aymurai.spacy.models.core import SpacyModel
from aymurai.text.normalize import TextNormalize
from aymurai.spacy.ruler import SpacyRulerPipeline
from aymurai.text.extraction import FulltextExtract

# Each preprocessing step is a (component class, keyword options) pair.
extract_step = (
    FulltextExtract,
    dict(
        extension="pdf",
        method="tesseract",
        language="spa",
        errors="ignore",
        use_cache=True,
    ),
)
normalize_step = (TextNormalize, {})
ruler_step = (
    SpacyRulerPipeline,
    dict(
        base="es",
        steps=[("aymurai_violence_quotes_ruler", {})],
    ),
)

# Only the preprocessing stage is populated; the model and postprocess
# stages are left empty.
config = {
    "preprocess": [extract_step, normalize_step, ruler_step],
    "models": [],
    "postprocess": [],
    "multiprocessing": {},
    "use_cache": True,
    # 'log_level': 'debug'
}

pipeline = AymurAIPipeline(config)

In [None]:
# Run the preprocessing steps configured on the pipeline over the whole dataset.
# NOTE(review): the structure of the returned items is not visible here; later cells
# index them with 'data', 'metadata' and 'annotations' keys.
preprocessed = pipeline.preprocess(private)

In [None]:
def have_quotes_entities(item) -> bool:
    """Tell whether an item has annotated aggression phrases and a detected quote entity.

    Args:
        item: mapping describing one preprocessed document. The keys read are
            ``item['data']['entities']`` (an iterable of mappings carrying a
            ``'label'`` key) and ``item['metadata']['frases_agresion']``.

    Returns:
        True only when ``frases_agresion`` is truthy and at least one entity is
        labelled ``'AYMURAI_VIOLENCE_QUOTE'``. False otherwise, including when
        any of the keys above is missing.
    """
    if 'data' not in item or 'entities' not in item['data']:
        return False

    # A record without metadata is treated like one without annotated phrases,
    # consistent with the missing 'data' / 'entities' guards above (previously
    # this raised KeyError).
    metadata = item.get('metadata') or {}
    if not metadata.get('frases_agresion'):
        return False

    # Tolerate an explicit None for entities and entities lacking a 'label'.
    entities = item['data']['entities'] or []
    return any(
        entity.get('label') == 'AYMURAI_VIOLENCE_QUOTE' for entity in entities
    )
    
# Keep only the preprocessed items accepted by have_quotes_entities.
with_quotes = [item for item in preprocessed if have_quotes_entities(item)]

In [None]:
# Number of items that passed the have_quotes_entities filter.
len(with_quotes)

In [None]:
from more_itertools import unique_everseen

# Text of every selected item, read from its "doc.text" field.
texts = [item["data"]["doc.text"] for item in with_quotes]

# For every selected item: collect the non-empty "frases_agresion" values of
# its annotations, drop repeats while keeping first-seen order, and join them
# into a single comma-separated target string.
quotes = [
    ', '.join(
        unique_everseen(
            phrase
            for phrase in (
                annotation["frases_agresion"] for annotation in item["annotations"]
            )
            if phrase
        )
    )
    for item in with_quotes
]



In [None]:
import pandas as pd

# Model inputs: a fixed question followed by the document text as context.
prompts = [
    f'question: cuales son las frases de violencia? context: {text} </s>'
    for text in texts
]
# Expected outputs: the joined phrases, each followed by the same ' </s>' suffix.
answers = [f'{target} </s>' for target in quotes]

data = pd.DataFrame({'source_text': prompts, 'target_text': answers})
data

In [None]:
from simplet5 import SimpleT5
from transformers import AutoTokenizer

# Instantiate the simpleT5 wrapper.
model = SimpleT5()

# Load the pretrained checkpoint "google/long-t5-local-base" using the "longt5" model type.
# NOTE(review): the original note listed t5, mt5 and byT5 as supported types; confirm
# that the pinned simpleT5 commit also accepts "longt5".
model.from_pretrained("longt5","google/long-t5-local-base")
# Alternative kept for reference: replace the tokenizer with the Hugging Face AutoTokenizer.
# model.tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")

In [None]:
# Tokenize every prompt and keep only its token ids.
# Built as a list rather than a generator: a generator is exhausted after one
# pass, so re-running any cell that consumes `tokens` would silently see no data.
tokens = [model.tokenizer(doc)['input_ids'] for doc in data['source_text']]

In [None]:
import numpy as np

# Length (number of token ids) of every tokenized source text.
lens = np.array(list(map(len, tokens)))

In [None]:
import seaborn as sns

# Distribution of source lengths in tokens.
# `distplot` is deprecated since seaborn 0.11; `histplot` with a KDE overlay on
# a density scale is the documented replacement (requires seaborn >= 0.11).
ax = sns.histplot(lens, kde=True, stat="density")
ax.set(
    title="Token length of source texts",
    xlabel="tokens per source text",
    ylabel="density",
);

In [None]:
from sklearn.model_selection import train_test_split

# Same 80/20 proportions and seed for both splits.
# Note: this rebinds `train`, `test` and `val`, which previously held the
# splits of the raw dataset; from here on they are DataFrame partitions.
split_options = dict(test_size=0.2, random_state=22)

train, test = train_test_split(data, **split_options)
train, val = train_test_split(train, **split_options)

In [None]:
# Fine-tuning settings gathered in one mapping so they can be inspected or
# tweaked before launching the run.
training_options = dict(
    # pandas dataframes with 2 columns: source_text & target_text
    train_df=train,
    eval_df=val,
    # Token budgets for the input and the generated answer.
    source_max_token_len=20480,
    target_max_token_len=128,
    batch_size=1,
    max_epochs=5,
    use_gpu=True,
    outputdir="outputs",
    # NOTE(review): presumably 0 disables early stopping — confirm against simpleT5.
    early_stopping_patience_epochs=0,
    precision=32,
)

model.train(**training_options)
