In [None]:
# Install the third-party packages this notebook relies on; -q keeps pip's output short.
# NOTE(review): no versions are pinned, so a fresh environment may resolve different releases.
!pip install ohmeow-blurr -q
!pip install bert-score -q
!pip install sacremoses

import pandas as pd
import torch
from transformers import *
from fastai.text.all import *
from blurr.text.data.all import *
from blurr.text.modeling.all import *
import nltk
# Download the NLTK 'punkt' resource (presumably needed for sentence splitting by the metrics - TODO confirm).
nltk.download('punkt')

In [None]:
# Load the parallel corpus of source sentences and their reference simplifications.
# `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0; `on_bad_lines`
# is its replacement. 'warn' still skips malformed rows but reports each one, so
# training data is never dropped silently.
df = pd.read_csv('/content/datos_modifiedComa.csv', on_bad_lines='warn', sep=',')

# Keep only the sentence id, the original sentence and its simplification.
df = df[['snt_id', 'source_snt', 'simplified_snt']]

# Use the first 638 rows for fine-tuning.
articles = df.head(638)

# Number of distinct source sentences in the selected rows.
n_labels = len(articles["source_snt"].unique())
print(n_labels)
articles

In [None]:
# Load the pretrained checkpoint: get_hf_objects returns the architecture name,
# config, tokenizer and model for the requested model class.
pretrained_model_name = "facebook/bart-large-cnn"
hf_arch, hf_config, hf_tokenizer, hf_model = get_hf_objects(
    pretrained_model_name,
    model_cls=BartForConditionalGeneration,
)

# Show the architecture name followed by the concrete classes that were loaded.
(hf_arch, *map(type, (hf_config, hf_tokenizer, hf_model)))

In [None]:
# For BART/T5, start from the checkpoint's "summarization" task parameters and
# override the output length bounds.
text_gen_kwargs = {}
if hf_arch in ("bart", "t5"):
    text_gen_kwargs = {**hf_config.task_specific_params["summarization"], "max_length": 50, "min_length": 10}

# Not all "summarization" parameters are arguments of model.generate; keep only those that are.
generate_func_args = list(inspect.signature(hf_model.generate).parameters.keys())
text_gen_kwargs = {name: value for name, value in text_gen_kwargs.items() if name in generate_func_args}

# mBART-only settings: start decoding from the "en_XX" token id and tag both the
# source and target language as "en_XX" for the tokenizer.
tok_kwargs = {}
if hf_arch == "mbart":
    text_gen_kwargs["decoder_start_token_id"] = hf_tokenizer.get_vocab()["en_XX"]
    tok_kwargs["src_lang"], tok_kwargs["tgt_lang"] = "en_XX", "en_XX"

In [None]:
# Batch-level tokenization transform: inputs are limited to max_length=256 and
# targets to max_target_length=130; the tokenizer and generation kwargs prepared
# earlier are passed through.
batch_tokenize_tfm = Seq2SeqBatchTokenizeTransform(
    hf_arch,
    hf_config,
    hf_tokenizer,
    hf_model,
    max_length=256,
    max_target_length=130,
    tok_kwargs=tok_kwargs,
    text_gen_kwargs=text_gen_kwargs,
)

# The input goes through the seq2seq text block; the target is passed through untouched (noop).
blocks = (Seq2SeqTextBlock(batch_tokenize_tfm=batch_tokenize_tfm), noop)

dblock = DataBlock(
    blocks=blocks,
    get_x=ColReader("source_snt"),
    get_y=ColReader("simplified_snt"),
    # A fixed seed makes the train/validation split identical on every run, so the
    # metrics reported during training are comparable between runs.
    splitter=RandomSplitter(seed=42),
)
dls = dblock.dataloaders(articles, bs=2)

In [None]:
# Preview a batch (max_n=2) to sanity-check the inputs and targets before training.
dls.show_batch(dataloaders=dls, max_n=2)

In [None]:
# Metrics configuration handed to Seq2SeqMetricsCallback: for each metric, the
# keyword arguments used when computing it and the result fields to report.
seq2seq_metrics = {
    "rouge": dict(
        compute_kwargs=dict(
            rouge_types=["rouge1", "rouge2", "rougeL", "rougeLsum"],
            use_stemmer=True,
        ),
        returns=["rouge1", "rouge2", "rougeL", "rougeLsum"],
    ),
    "bertscore": dict(
        compute_kwargs=dict(lang="en"),
        returns=["precision", "recall", "f1"],
    ),
}

In [None]:
# Wrap the Hugging Face model so it can be driven by a fastai Learner.
model = BaseModelWrapper(hf_model)
# Callback attached to the Learner for its whole lifetime.
learn_cbs = [BaseModelCallback]
# Metrics callback kept separate so it can be passed only to the fit call.
fit_cbs = [Seq2SeqMetricsCallback(custom_metrics=seq2seq_metrics)]

learn = Learner(
    dls,
    model,
    opt_func=partial(Adam),
    loss_func=CrossEntropyLossFlat(),
    cbs=learn_cbs,
    # Parameter-group splitter chosen for this architecture (used by freeze/unfreeze).
    splitter=partial(blurr_seq2seq_splitter, arch=hf_arch),
)

# Mixed-precision training is currently disabled:
# learn = learn.to_native_fp16() #.to_fp16()
# NOTE(review): presumably leaves only the last parameter group trainable - confirm against the splitter.
learn.freeze()
#learn.summary()

In [None]:
# Run the learning-rate finder and request suggestions from the four listed heuristics.
learn.lr_find(suggest_funcs=[minimum, steep, valley, slide])

In [None]:
# Train for 5 epochs with the one-cycle policy; the metrics callback is attached for this fit only.
# NOTE(review): lr_max is hard-coded, presumably copied from an earlier lr_find run - re-check if data or model change.
learn.fit_one_cycle(5, lr_max=3.311311302240938e-05, cbs=fit_cbs)

In [None]:
# Show sample predictions next to their targets; inputs are truncated at 500 and targets at 250
# (presumably characters - TODO confirm against blurr's show_results).
learn.show_results(learner=learn, input_trunc_at=500, target_trunc_at=250)

In [None]:
# Load the test data that the simplifications will be generated for, and add the
# bookkeeping columns in one step.
df = pd.read_csv('/content/simpletext_task3_test.csv', sep=',').assign(
    # Placeholder; replaced once the simplifications have been generated.
    simplified_snt=" ",
    # Modify the run id for different runs.
    run_id='HULAT-UC3M11',
    # The simplifications are generated automatically, not by hand.
    manual=0,
)

In [None]:
# The model cannot process all 100,000+ test entries in one go, so a subset of the
# test data must be chosen for each execution. It does not have to be the head of
# the dataframe; rows 2000-3999 could be chosen instead, for example.
# .copy() detaches the subset from the full frame so that assigning the generated
# column later does not trigger pandas' SettingWithCopyWarning.
df = df.head(100).copy()
df

In [None]:
pip install swifter

In [None]:
import swifter

In [None]:
#generate simplifications
def simplify(snt):
  """Generate a simplification of `snt` with the fine-tuned learner.

  Requests a single returned sequence from `learn.blurr_generate` and returns
  the 'generated_texts' entry of the first result.
  """
  generations = learn.blurr_generate(snt, num_return_sequences=1)
  return generations[0]['generated_texts']


# Run the model row by row over the selected test sentences and store each
# generated simplification alongside its source sentence.
df['simplified_snt'] = df.swifter.apply(
    lambda record: simplify(record['source_snt']),
    axis=1,
)


In [None]:
# Keep only the columns that go into the run file, in this order.
df = df.loc[:, ['run_id', 'manual', 'snt_id', 'simplified_snt']]
df

In [None]:
# Write the run as a tab-separated file without the dataframe index.
df.to_csv('runA.csv', sep='\t', index=False)