In [None]:
import os
# Expose only GPU index 1 to CUDA. This is set here, before the deep-learning
# libraries are imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [None]:
import datasets
import pandas as pd
from fastai.text.all import *
from transformers import *

from blurr.data.all import *
from blurr.modeling.all import *

In [None]:
from transformers import AutoModelWithLMHead, AutoTokenizer, AutoModelForSeq2SeqLM, BertTokenizer, EncoderDecoderModel, BlenderbotSmallTokenizer

# Which pretrained summarizer to load for the quick illustration.
model_choice="bert"

# Other models and usage instructions: https://huggingface.co/models?filter=summarization
# Each supported choice maps to (model loader class, checkpoint name).
summarizer_checkpoints = {
  "t5": (AutoModelWithLMHead, "t5-base"),
  "bert": (AutoModelForSeq2SeqLM, "patrickvonplaten/bert2bert_cnn_daily_mail"),
  "pegasus": (AutoModelForSeq2SeqLM, "google/pegasus-cnn_dailymail"),  # This is pretty intense on memory
  "bart": (AutoModelForSeq2SeqLM, "facebook/bart-large-cnn"),
  # Does not work, since this checkpoint does not seem pre-trained for summarization
  "prophetnet": (AutoModelForSeq2SeqLM, "microsoft/prophetnet-large-uncased"),
}

# Fail loudly on a typo instead of silently leaving `model`/`tokenizer` unset
# (or bound to whatever an earlier run left in the kernel).
if model_choice not in summarizer_checkpoints:
  raise ValueError(
    f"Unsupported model_choice {model_choice!r}; expected one of: "
    + ", ".join(summarizer_checkpoints)
  )

loader_cls, checkpoint_name = summarizer_checkpoints[model_choice]
# NOTE(review): `model` is reassigned in the Learner setup cell further down and
# `tokenizer` is not referenced again in the visible cells — confirm this load is needed.
model = loader_cls.from_pretrained(checkpoint_name)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)


Next we do a basic summarization task, for illustration...

In [None]:
# Disabled alternative: the first 1000 training examples of the 'sep' configuration of wikihow.
#raw_data = datasets.load_dataset('wikihow', 'sep', split='train[:1000]')
# Load the train split of the 'long' configuration of reddit_tifu.
raw_data = datasets.load_dataset('reddit_tifu', 'long', split='train')

In [None]:
# Materialise the loaded dataset as a pandas DataFrame.
df = pd.DataFrame(raw_data)

# Disabled alternative: read a local CSV, concatenate several of its columns into a
# single ' ; '-separated text column 'x', and drop the source columns.
# df = pd.read_csv("../trainDataPoints_1.csv",sep=',')

# df['x'] = df.apply(lambda row: str(row.tname) + " ; " + str(row.qType) + " ; " + str(row.col) + " ; " + 
#                    str(row.row) + " ; " + str(row.data) + " ; " + str(row.stat) + " ; " , axis = 1)


# df = df.drop(['tname', 'qType','col','row','data','stat'], axis=1)
# Display the first row as a quick look at the columns.
df.head(1)

Select the pretrained model and the matching model class for summarization (the `...ForConditionalGeneration` classes, or `EncoderDecoderModel` for BERT).

In [None]:
model_choice="bert"

# Working models:
# - Bert works.. it comes already trained for summarization and we add on that...
# - Bart works.. (but this might not be RXF), similarly, it seems to be already trained for summarization, so we might just add on that
# - t5 apparently working, but it seems to be already trained for summarization, so we might just add on that
# - pegasus working, but it seems it is precisely trained for summarization, so we might just add on that
# - blenderbot seems to be working and results in something like a summarizer with a strong personality, with a lot of extractive behavior
# - prophetnet seems to be working after some adaptation... overall good results

# Pick the checkpoint name and model class for the chosen architecture.
# An if/elif chain is used on purpose: only the selected class name is evaluated,
# so a class missing from the installed transformers version does not break other choices.
if model_choice=="t5":
  pretrained_model_name = "t5-base"
  m_cls= T5ForConditionalGeneration
elif model_choice=="bert":
  # this is the only model we have that really is structured as an encoder_decoder in HF
  pretrained_model_name = "patrickvonplaten/bert2bert_cnn_daily_mail"
  m_cls=EncoderDecoderModel
elif model_choice=="pegasus":
  pretrained_model_name = "google/pegasus-large" #large pegasus really uses a lot of RAM
  m_cls=PegasusForConditionalGeneration
elif model_choice=="bart":
  pretrained_model_name = "facebook/bart-base"
  m_cls=BartForConditionalGeneration
elif model_choice=="prophetnet":
  pretrained_model_name = "microsoft/prophetnet-large-uncased-cnndm"
  m_cls=ProphetNetForConditionalGeneration
elif model_choice=="blenderbot":
  pretrained_model_name = "facebook/blenderbot-90M"
  m_cls=BlenderbotForConditionalGeneration
else:
  # Without this branch an unknown choice surfaced later as a NameError on
  # `pretrained_model_name` (or silently reused values from a previous run).
  raise ValueError(
    f"Unsupported model_choice {model_choice!r}; expected one of: "
    "t5, bert, pegasus, bart, prophetnet, blenderbot"
  )

hf_arch, hf_config, hf_tokenizer, hf_model = BLURR_MODEL_HELPER.get_hf_objects(pretrained_model_name, model_cls=m_cls)

# Override the architecture key returned by blurr for the two special cases.
if model_choice=="blenderbot":#we benefit from the similar code structure in Hugging Face
  hf_arch="bart"
if model_choice=="bert":
  # Key understood by the notebook's own `sum_split` parameter splitter.
  hf_arch="bert_encoder_decoder"

# Display what was loaded.
hf_arch, type(hf_config), type(hf_tokenizer), type(hf_model)

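Build data blocks:
Building the DataBlock only requires a before-batch transform instance. According to these notes, `HF_SummarizationBeforeBatchTransform` should be replaced by `HF_Seq2SeqBeforeBatchTransform`:
since the last blurr update on 12/31/2020, `HF_SummarizationBeforeBatchTransform` raises a not-defined error. (Note that the code cell below still calls `HF_SummarizationBeforeBatchTransform`; switch it to the commented-out `HF_Seq2SeqBeforeBatchTransform` variant if your blurr version no longer defines it.)
Reference: https://ohmeow.com/posts/2020/05/23/text-generation-with-blurr.html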

The text generation task is configured by passing in `text_gen_kwargs`.

In [None]:
# Start the generation settings. Pegasus, blenderbot and bert get explicit summary
# length bounds; every other choice starts from an empty dict so that the following
# cell can add its settings instead of failing with a NameError on `text_gen_kwargs`.
if model_choice in ("pegasus", "blenderbot", "bert"):
  text_gen_kwargs = {'max_length': 130, 'min_length': 30}
else:
  text_gen_kwargs = {}

In [None]:
# Decoding settings added on top of the `text_gen_kwargs` dict created in the previous cell.
# Disabled alternative starting point:
#   text_gen_kwargs = default_text_gen_kwargs(hf_config, hf_model, task='summarization')
text_gen_kwargs.update({
  'do_sample': False,
  'temperature': 1.0,
  'early_stopping': True,
  'repetition_penalty': 1.0,
  'length_penalty': 1.2,
})

# Optional knobs that are currently left disabled:
#   no_repeat_ngram_size = 3, num_beams = 1, one beam group, diversity 0.0
#   (the Hugging Face argument names are `num_beam_groups` and `diversity_penalty`),
#   and explicit length overrides max_length = 250 / min_length = 30.

# Display the final settings.
text_gen_kwargs

In [None]:
# Build the before-batch transform, the block tuple and the DataBlock for summarization.
# Disabled alternatives below use the HF_Seq2Seq* API instead.
# NOTE(review): the notebook's markdown says HF_SummarizationBeforeBatchTransform is no longer
# defined after a blurr update, yet it is the class used here — confirm against the installed blurr version.
#text_gen_kwargs = default_text_gen_kwargs(hf_config, hf_model, task='summarization');
#hf_batch_tfm = HF_Seq2SeqBeforeBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model, max_length=130, min_length=30, text_gen_kwargs=text_gen_kwargs)
# max_length=[256, 130]: presumably the [input, target] token limits — TODO confirm.
hf_batch_tfm = HF_SummarizationBeforeBatchTransform(hf_arch, hf_tokenizer, max_length=[256, 130],text_gen_kwargs=text_gen_kwargs)
#hf_batch_tfm = HF_Seq2SeqBeforeBatchTransform(hf_arch, hf_config, hf_tokenizer, hf_model, 
#                                              max_length=256, max_tgt_length=130, text_gen_kwargs=text_gen_kwargs)

# Input block carrying the transform above; the target side is passed through `noop`.
blocks = (HF_TextBlock(before_batch_tfms=hf_batch_tfm, input_return_type=HF_SummarizationInput), noop)
#blocks = (HF_Seq2SeqBlock(before_batch_tfm=hf_batch_tfm), noop)

#blocks = (HF_Seq2SeqBlock(before_batch_tfm=hf_batch_tfm), noop)
# Inputs are read from the 'documents' column and targets from 'tldr'; rows are split with RandomSplitter().
dblock = DataBlock(blocks=blocks, # get_x=ColReader('text'), get_y=ColReader('sectionLabel'), 
                  get_x=ColReader('documents'), get_y=ColReader('tldr'), 
                  splitter=RandomSplitter())

In [None]:
# hf_batch_tfm

In [None]:
# Create the DataLoaders from the DataFrame with a batch size of 8.
dls = dblock.dataloaders(df, bs=8)

In [None]:
# Number of items in the training and validation splits.
len(dls.train.items), len(dls.valid.items)

In [None]:
# Pull one batch and inspect it: element count, shape of the input ids, shape of the targets.
b = dls.one_batch()
len(b), b[0]['input_ids'].shape, b[1].shape

In [None]:
# Display at most two examples from a batch.
dls.show_batch(dataloaders=dls, max_n=2)

`HF_SummarizationModelCallback` no longer works due to changes in the library. Instead, `HF_BaseModelCallback` should be used, without `text_gen_kwargs`.

In [None]:
# NOTE(review): `torch` is imported mid-notebook and is not referenced by name in this cell.
import torch
# Wrap the Hugging Face model for the Learner built below. This rebinds `model`,
# which the model-selection cell near the top of the notebook also assigned.
model = HF_BaseModelWrapper(hf_model)
# Base callback list; it is only referenced from commented-out `cbs=` lines in the Learner setup.
learn_cbs = [HF_BaseModelCallback]
#learn_cbs = [HF_SummarizationModelCallback]
# Summarization callback carrying the generation settings; used for every choice except bert/prophetnet.
model_cb = HF_SummarizationModelCallback(text_gen_kwargs=text_gen_kwargs)
# Disabled alternative: custom seq2seq metrics (ROUGE and BERTScore) passed to the callback.
#fit_cbs = [HF_SummarizationModelCallback(custom_metrics=seq2seq_metrics)]
#seq2seq_metrics = {
#        'rouge': {
#            'compute_kwargs': { 'rouge_types': ["rouge1", "rouge2", "rougeL"], 'use_stemmer': True },
#            'returns': ["rouge1", "rouge2", "rougeL"]
#        },
#        'bertscore': {
#            'compute_kwargs': { 'lang': 'en' },
#            'returns': ["precision", "recall", "f1"]
#        }
#    }
# Callbacks for the bert/prophetnet Learner: a summarization callback configured for
# rouge1/rouge2/rougeL, plus CSVLogger.
# NOTE(review): unlike `model_cb`, this callback is not given `text_gen_kwargs` — confirm that is intended.
fit_cbs = [HF_SummarizationModelCallback(rouge_metrics = ("rouge1", "rouge2", "rougeL")), CSVLogger]
def sum_split(m, arch):
    """Custom param splitter for summarization models.

    Args:
        m: The Hugging Face model, or a wrapper that exposes it as ``hf_model``.
        arch: Architecture key. ``'bert_encoder_decoder'`` and ``'prophetnet'``
            are supported.

    Returns:
        An ``L`` of parameter lists, one per layer group, ordered from the input
        embeddings towards the output. Each parameter appears in exactly one
        group (the earliest that contains it); empty groups are dropped.

    Raises:
        ValueError: If ``arch`` is not a supported key.
    """
    model = m.hf_model if hasattr(m, 'hf_model') else m

    # Modules listed from input to output. The previous grouping wrapped the whole
    # encoder (and the LM head / decoder) inside the first "embeds" group as well as
    # listing them again as their own groups, so the same parameters showed up in
    # several groups; for BERT the decoder body was not listed at all.
    if arch in ['bert_encoder_decoder']:
        stages = [
            model.encoder.embeddings.word_embeddings,
            model.encoder,
            # assumes a BERT LM-head decoder exposing its body as `.bert` — TODO confirm
            model.decoder.bert,
            model.decoder.cls,
        ]
    elif arch in ['prophetnet']:
        stages = [
            model.prophetnet.word_embeddings,
            model.prophetnet.encoder,
            model.prophetnet.decoder,
        ]
    else:
        raise ValueError('Invalid architecture')

    # Assign every parameter to the first stage that contains it. This keeps shared or
    # tied weights (e.g. word embeddings reused by later modules) from being listed twice.
    claimed_ids = set()
    param_groups = []
    for stage in stages:
        fresh = [p for p in params(stage) if id(p) not in claimed_ids]
        claimed_ids.update(id(p) for p in fresh)
        param_groups.append(fresh)

    return L(param_groups).filter(lambda el: len(el) > 0)

# bert and prophetnet are trained with the notebook's own `sum_split` splitter and the
# `fit_cbs` callbacks; every other choice uses the imported `summarization_splitter`
# together with `model_cb`.
uses_custom_split = model_choice in ("bert", "prophetnet")

learn = Learner(
  dls,
  model,
  opt_func=ranger,
  loss_func=CrossEntropyLossFlat(),
  cbs=fit_cbs if uses_custom_split else [model_cb],
  splitter=partial(sum_split if uses_custom_split else summarization_splitter, arch=hf_arch),
).to_fp16()

# Build the optimizer explicitly, freeze the learner, then print the model summary.
learn.create_opt()
learn.freeze()
print("New summary")
print(learn.blurr_summary())

# Debug aid (disabled): print(sum_split(hf_model, hf_arch))

In [None]:
# Run the learning-rate finder and show its suggested values.
learn.lr_find(suggestions=True)

In [None]:
# Sanity check: push the inputs of one batch through the wrapped model and inspect the result.
b = dls.one_batch()
preds = learn.model(b[0])
# NOTE(review): assumes `preds` is a sequence whose second element has a `.shape` —
# confirm for the installed blurr/transformers versions.
len(preds),preds[0], preds[1].shape

In [None]:
# Show up to five predictions before any training has been run.
learn.show_results(learner=learn, max_n=5)

Going forward, perhaps in future projects, it would be ideal to scale up model training to multiple processors.

In [None]:
# Train for 10 epochs with the one-cycle schedule.
# NOTE(review): `10e-2` evaluates to 0.1. The value is hard-coded rather than taken from
# the `lr_find` cell above — confirm this peak learning rate is intended (e.g. `1e-2` may have been meant).
learn.fit_one_cycle(10, lr_max=10e-2)

In [None]:
# Show up to two predictions after training.
learn.show_results(learner=learn, max_n=2)


In [None]:
# Plot the losses recorded during training.
learn.recorder.plot_loss()



In [None]:
# Checkpoint the learner under the name 'stage01_tmp'.
learn.save('stage01_tmp')

In [None]:
# `test_article` is not assigned in any earlier cell, so on a fresh kernel this cell
# failed with a NameError. Use the first document of the loaded frame as the sample text.
test_article = df['documents'].iloc[0]

# Generate a single summary for the sample text with one beam.
outputs = learn.blurr_summarize(test_article, early_stopping=True, num_beams=1, num_return_sequences=1)
# Disabled alternative: three candidates from a 4-beam search.
#outputs = learn.blurr_generate(test_article, early_stopping=True, num_beams=4, num_return_sequences=3)

# Print each returned summary under a numbered header.
for idx, o in enumerate(outputs):
    print(f'=== Prediction {idx+1} ===\n{o}\n')