From 47690d127db14c5c99301d7fda993ebf8b7c69d1 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Fri, 1 Jan 2021 02:38:07 -0500 Subject: [PATCH 001/155] QA training and new architecture --- data/sample-qa-training-eval-data.csv | 3 + happytransformer/happy_bert.py | 142 +++++++------ happytransformer/happy_transformer.py | 11 +- happytransformer/qa_util.py | 111 ---------- happytransformer/runners/__init__.py | 0 .../runners/runner_answer_question.py | 172 ++++++++++++++++ happytransformer/trainer.py | 93 +++++++++ happytransformer/trainers/__init__.py | 0 .../trainers/default_args/default_args_qa.py | 8 + happytransformer/trainers/trainer_qa.py | 192 ++++++++++++++++++ tests/test_qa_multi.py | 3 +- tests/test_qa_training.py | 40 ++++ tests/test_qa_util.py | 17 +- 13 files changed, 606 insertions(+), 186 deletions(-) create mode 100644 data/sample-qa-training-eval-data.csv delete mode 100644 happytransformer/qa_util.py create mode 100644 happytransformer/runners/__init__.py create mode 100644 happytransformer/runners/runner_answer_question.py create mode 100644 happytransformer/trainer.py create mode 100644 happytransformer/trainers/__init__.py create mode 100644 happytransformer/trainers/default_args/default_args_qa.py create mode 100644 happytransformer/trainers/trainer_qa.py create mode 100644 tests/test_qa_training.py diff --git a/data/sample-qa-training-eval-data.csv b/data/sample-qa-training-eval-data.csv new file mode 100644 index 00000000..ffa3c9dc --- /dev/null +++ b/data/sample-qa-training-eval-data.csv @@ -0,0 +1,3 @@ +context,question,answer_text,answer_start +October 31st is the date,what is the date?,October 31st,0 +The date is November 23rd ,what is the date?,November 23rd,12 diff --git a/happytransformer/happy_bert.py b/happytransformer/happy_bert.py index 0e353bf6..4ab5b059 100644 --- a/happytransformer/happy_bert.py +++ b/happytransformer/happy_bert.py @@ -3,7 +3,6 @@ """ -from collections import namedtuple # disable pylint TODO warning # pylint: disable=W0511 import re @@ -11,14 +10,15 @@ BertForMaskedLM, BertForNextSentencePrediction, BertForQuestionAnswering, - BertTokenizer + BertTokenizerFast + ) import torch -import numpy as np +from happytransformer.runners.runner_answer_question import AnswerQuestionRunner from happytransformer.happy_transformer import HappyTransformer -from happytransformer.qa_util import qa_probabilities +from happytransformer.trainers.trainer_qa import QATrainer class HappyBERT(HappyTransformer): """ @@ -39,15 +39,25 @@ class HappyBERT(HappyTransformer): """ def __init__(self, model='bert-base-uncased'): + # todo remove model parameter. 
Each model will have its own super().__init__(model, "BERT") self.mlm = None # Masked Language Model self.nsp = None # Next Sentence Prediction - self.qa = None # Question Answering - self.tokenizer = BertTokenizer.from_pretrained(model) + + #todo separate tokenizer for each model + self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") self.masked_token = self.tokenizer.mask_token self.sep_token = self.tokenizer.sep_token self.cls_token = self.tokenizer.cls_token + # ------------------------ QA + self.__qa_model = None # Question Answering + self.__qa_tokenizer = None + + self.__qa_init = False + self.__qa_trainer = None + self.__qa_runner = None + def _get_masked_language_model(self): """ Initializes the BertForMaskedLM transformer @@ -62,13 +72,8 @@ def _get_next_sentence_prediction(self): self.nsp = BertForNextSentencePrediction.from_pretrained(self.model) self.nsp.eval() - def _get_question_answering(self): - """ - Initializes the BertForQuestionAnswering transformer - NOTE: This uses the bert-large-uncased-whole-word-masking-finetuned-squad pretraining for best results. - """ - self.qa = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad') - self.qa.eval() + + def predict_next_sentence(self, sentence_a, sentence_b, use_probability=False): """ @@ -131,63 +136,72 @@ def __is_one_sentence(self, text): sentence_found = True break return True +#-------------------------------------------------------# + + # QUESTION ANSWERING # +#-------------------------------------------------------# + + def init_qa(self, model='bert-large-uncased-whole-word-masking-finetuned-squad'): + """ + Initializes the BertForQuestionAnswering transformer + NOTE: This uses the bert-large-uncased-whole-word-masking-finetuned-squad pretraining for best results. + """ + self.__qa_model = BertForQuestionAnswering.from_pretrained(model) + self.__qa_tokenizer = BertTokenizerFast.from_pretrained(model) + self.__qa_model.eval() + + if self.gpu_support == 'cuda': + self.__qa_model.to('cuda') + + self.__qa_runner = AnswerQuestionRunner(self._model_name, self.__qa_model, self.__qa_tokenizer) + self.__qa_init = True + + def answers_to_question(self, question, context, k=3): + if self.__qa_init: + return self.__qa_runner.run_answers_to_question(question, context, k=k) + else: + self._init_model_first_warning("question answering", "init_qa(model_name)") + def answer_question(self, question, text): """ Using the given text, find the answer to the given question and return it. 
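
        Example (an illustrative sketch, mirroring tests/test_qa_training.py;
        assumes init_qa() has been called first):

            happy_bert = HappyBERT()
            happy_bert.init_qa()
            happy_bert.answer_question("What is the date?", "October 31st is the date")
            # -> "october 31st" (decoded tokens are lower-cased)
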
:param question: The question to be answered + #todo breaking change: change text to context :param text: The text containing the answer to the question :return: The answer to the given question, as a string """ - return self.answers_to_question(question, text, 1)[0]["text"] - - def _tokenize_qa(self, question, context): - input_text = ' '.join([ - question, - self.sep_token, - context - ]) - input_ids = self.tokenizer.encode(input_text) - return input_ids - - def _run_qa_model(self, input_ids): - if self.qa is None: - self._get_question_answering() - sep_id_index = input_ids.index(self.tokenizer.sep_token_id) - before_after_ids = [ - 0 if idx <= sep_id_index else 1 - for idx, _ in enumerate(input_ids) - ] - with torch.no_grad(): - return self.qa( - input_ids=torch.tensor([input_ids]), - token_type_ids=torch.tensor([before_after_ids]) - ) - - def answers_to_question(self, question, context, k=3): - input_ids = self._tokenize_qa(question, context) - qa_output = self._run_qa_model(input_ids) - sep_id_index = input_ids.index(self.tokenizer.sep_token_id) - probabilities = qa_probabilities( - # only consider logits from the context part of the embedding. - # that is, between the middle [SEP] token - # and the final [SEP] token - qa_output.start_logits[0][sep_id_index+1:-1], - qa_output.end_logits[0][sep_id_index+1:-1], - k - ) - # qa probabilities use indices relative to context. - # tokens use indices relative to overall question [SEP] context embedding. - # need offset to resolve this difference - token_offset = sep_id_index + 1 - - return [ - {"text": self.tokenizer.decode( - # grab ids from start to end (inclusive) and decode to text - input_ids[token_offset+answer.start_idx : token_offset+answer.end_idx+1] - ), - "softmax": answer.probability} - - for answer in probabilities - ] \ No newline at end of file + if self.__qa_init: + return self.__qa_runner.run_answer_question(question, text) + else: + self._init_model_first_warning("question answering", "init_qa(model_name)") + + def train_qa(self, filepath, args=None): + if self.__qa_init: + if self.__qa_trainer==None: + # model, model_name, tokenizer, args, model_type, device, runne + self.__qa_trainer = QATrainer(self.__qa_model, "bert", self.tokenizer, self.gpu_support, self.__qa_runner, self.logger) + + self.__qa_trainer.train(filepath, args) + else: + self._init_model_first_warning("question answering", "init_qa(model_name)") + + def test_qa(self, filepath, args=None): + if self.__qa_init: + if self.qa_trainer == None: + self.qa_trainer = QATrainer(self.__qa_model, "bert", self.tokenizer, self.gpu_support, self.__qa_runner, self.logger) + + self.__qa_trainer.train(filepath, args) + else: + self._init_model_first_warning("question answering", "init_qa(model_name)") + + def eval_qa(self, filepath, output_filepath=None, args=None): + if self.__qa_init: + if self.__qa_trainer == None: + self.__qa_trainer = QATrainer(self.__qa_model, "bert", self.tokenizer, self.gpu_support, self.__qa_runner, self.logger) + + return self.__qa_trainer.eval(filepath, args, output_filepath) + else: + self._init_model_first_warning("question answering", "init_qa(model_name)") + return - 1 diff --git a/happytransformer/happy_transformer.py b/happytransformer/happy_transformer.py index 85ba33d8..3537773d 100644 --- a/happytransformer/happy_transformer.py +++ b/happytransformer/happy_transformer.py @@ -7,7 +7,6 @@ easier to use. 
""" -from collections import namedtuple import string import re import os @@ -45,7 +44,7 @@ class HappyTransformer: def __init__(self, model, model_name): # Transformer and tokenizer set in child class self.model = model - self.model_name = model_name + self._model_name = model_name self.mlm = None # Masked Language Model self.seq = None # Sequence Classification self.qa = None # Question Answering @@ -63,6 +62,9 @@ def __init__(self, model, model_name): else "cpu" ) + self.device = True if torch.cuda.is_available() else False + + # show only happytransformer logs handler = logging.StreamHandler() handler.addFilter(logging.Filter('happytransformer')) @@ -572,3 +574,8 @@ def eval_mwp(self, eval_path: str, batch_size: int = 2): results = self.mwp_trainer.evaluate(eval_path, batch_size) return results + + def _init_model_first_warning(self, model_type, method_name): + + # todo make this a logger message + print("First initialize the", model_type, "using the", method_name, "method") \ No newline at end of file diff --git a/happytransformer/qa_util.py b/happytransformer/qa_util.py deleted file mode 100644 index 7156d2e9..00000000 --- a/happytransformer/qa_util.py +++ /dev/null @@ -1,111 +0,0 @@ -""" -Contains named tuples and functions used by questions answering methods -""" - -from collections import namedtuple -import torch - -SumPair = namedtuple('SumPair', ['idx1', 'idx2', 'sum']) - -def biggest_sums(items_a, items_b): - ''' - compute biggest sums from two descending ordered lists, - labeled by indices - :param items_a: list of numeric values, sorted descendingly - :param items_b: list of numeric values, sorted descendingly - :returns: list of namedtuples of the form (idx1,idx2,sum), - sorted by descending sum - ''' - a_index = b_index = 0 - while a_index < len(items_a) and b_index < len(items_b): - yield SumPair( - a_index, b_index, - sum=items_a[a_index] + items_b[b_index] - ) - # increment in whichever direction has smaller gain - # fallback to -inf at end of list. - # this will always be taken last. 
- next_from_a = items_a[a_index+1] if a_index + 1 < len(items_a) else float('-inf') - next_from_b = items_b[b_index+1] if b_index + 1 < len(items_b) else float('-inf') - - diff_a = items_a[a_index] - next_from_a - diff_b = items_b[b_index] - next_from_b - - if diff_a >= diff_b: - b_index += 1 - else: - a_index += 1 - -QAAnswerLogit = namedtuple('QaAnswerLogit', [ - 'start_idx', 'end_idx', 'logit' -]) - -def qa_logits(start_logits, end_logits): - """ - Compute the logits for top qa pairs - :param start_logits: tensor from qa model output - :param end_logits: tensor from qa model output - :returns: generator of namedtuples of the form - (start_idx, end_idx, logit), sorted in descending order - by score - """ - - sorted_starts_tensors = torch.sort(start_logits, descending=True) - sorted_ends_tensors = torch.sort(end_logits, descending=True) - # start logits sorted in descending order INDEPENDENTLY - sorted_start_scores = sorted_starts_tensors.values.tolist() - sorted_start_indices = sorted_starts_tensors.indices.tolist() - # end logits sorted in descending order INDEPENDENTLY - sorted_end_scores = sorted_ends_tensors.values.tolist() - sorted_end_indices = sorted_ends_tensors.indices.tolist() - # start logit + end logit pairs sorted in descending order - # of their sum TOGETHER - all_answers = ( - QAAnswerLogit( - start_idx=sorted_start_indices[sum_pair.idx1], - end_idx=sorted_end_indices[sum_pair.idx2], - logit=sum_pair.sum - ) - for sum_pair in - biggest_sums(sorted_start_scores, sorted_end_scores) - ) - # filter for only answers which have end at or after start - legit_answers = ( - answer - for answer in all_answers - if answer.end_idx >= answer.start_idx - ) - return legit_answers - - -QAProbability = namedtuple('QaProbability', [ - 'start_idx', 'end_idx', 'probability' -]) - - -def qa_probabilities(start_logits, end_logits, k): - """ - Computes the top k qa probabilities, in terms of indices. 
- :param start_logits: tensor from qa model output - :param end_logits: tensor from qa model output - :param k: number of results to return - :returns: list of namedtuples of the form (text,probability) - """ - top_answers = [ - qa_logit - for qa_logit, _ in zip(qa_logits(start_logits, end_logits), range(k)) - ] - logit_scores = torch.tensor([ - answer.logit - for answer in top_answers - ]) - - probabilities = torch.nn.Softmax(dim=0)(logit_scores).tolist() - return [ - QAProbability( - start_idx=answer.start_idx, - end_idx=answer.end_idx, - probability=probability - ) - for answer, probability in zip(top_answers, probabilities) - ] diff --git a/happytransformer/runners/__init__.py b/happytransformer/runners/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/happytransformer/runners/runner_answer_question.py b/happytransformer/runners/runner_answer_question.py new file mode 100644 index 00000000..707e42f3 --- /dev/null +++ b/happytransformer/runners/runner_answer_question.py @@ -0,0 +1,172 @@ +import torch + +from collections import namedtuple + + +class AnswerQuestionRunner(): + SumPair = namedtuple('SumPair', ['idx1', 'idx2', 'sum']) + + def __init__(self, type, model, tokenizer): + self.type = type # BERT, ROBERTA, ALBERT etc + self.model = model + self.tokenizer = tokenizer + + def run_answers_to_question(self, question, context, k=3): + input_ids = self._tokenize_qa(question, context) + qa_output = self._run_qa_model(input_ids) + sep_id_index = input_ids.index(self.tokenizer.sep_token_id) + probabilities = self.qa_probabilities( + # only consider logits from the context part of the embedding. + # that is, between the middle [SEP] token + # and the final [SEP] token + qa_output.start_logits[0][sep_id_index+1:-1], + qa_output.end_logits[0][sep_id_index+1:-1], + k + ) + # qa probabilities use indices relative to context. + # tokens use indices relative to overall question [SEP] context embedding. + # need offset to resolve this difference + token_offset = sep_id_index + 1 + return [ + {"text": self.tokenizer.decode( + # grab ids from start to end (inclusive) and decode to text + input_ids[token_offset+answer.start_idx : token_offset+answer.end_idx+1] + ), + "softmax": answer.probability} + + for answer in probabilities + ] + + def run_answer_question(self, question, text): + """ + Using the given text, find the answer to the given question and return it. 
+ + :param question: The question to be answered + :param text: The text containing the answer to the question + :return: The answer to the given question, as a string + """ + return self.run_answers_to_question(question, text, 1)[0]["text"] + + def _tokenize_qa(self, question, context): + input_text = ' '.join([ + question, + self.tokenizer.sep_token, + context + ]) + input_ids = self.tokenizer.encode(input_text) + return input_ids + + + def _run_qa_model(self, input_ids): + + sep_id_index = input_ids.index(self.tokenizer.sep_token_id) + before_after_ids = [ + 0 if idx <= sep_id_index else 1 + for idx, _ in enumerate(input_ids) + ] + with torch.no_grad(): + return self.model( + input_ids=torch.tensor([input_ids]), + token_type_ids=torch.tensor([before_after_ids]) + ) + + def biggest_sums(self, items_a, items_b): + ''' + compute biggest sums from two descending ordered lists, + labeled by indices + :param items_a: list of numeric values, sorted descendingly + :param items_b: list of numeric values, sorted descendingly + :returns: list of namedtuples of the form (idx1,idx2,sum), + sorted by descending sum + ''' + a_index = b_index = 0 + while a_index < len(items_a) and b_index < len(items_b): + yield self.SumPair( + a_index, b_index, + sum=items_a[a_index] + items_b[b_index] + ) + # increment in whichever direction has smaller gain + # fallback to -inf at end of list. + # this will always be taken last. + next_from_a = items_a[a_index + 1] if a_index + 1 < len(items_a) else float('-inf') + next_from_b = items_b[b_index + 1] if b_index + 1 < len(items_b) else float('-inf') + + diff_a = items_a[a_index] - next_from_a + diff_b = items_b[b_index] - next_from_b + + if diff_a >= diff_b: + b_index += 1 + else: + a_index += 1 + + QAAnswerLogit = namedtuple('QaAnswerLogit', [ + 'start_idx', 'end_idx', 'logit' + ]) + + def qa_logits(self, start_logits, end_logits): + """ + Compute the logits for top qa pairs + :param start_logits: tensor from qa model output + :param end_logits: tensor from qa model output + :returns: generator of namedtuples of the form + (start_idx, end_idx, logit), sorted in descending order + by score + """ + + sorted_starts_tensors = torch.sort(start_logits, descending=True) + sorted_ends_tensors = torch.sort(end_logits, descending=True) + # start logits sorted in descending order INDEPENDENTLY + sorted_start_scores = sorted_starts_tensors.values.tolist() + sorted_start_indices = sorted_starts_tensors.indices.tolist() + # end logits sorted in descending order INDEPENDENTLY + sorted_end_scores = sorted_ends_tensors.values.tolist() + sorted_end_indices = sorted_ends_tensors.indices.tolist() + # start logit + end logit pairs sorted in descending order + # of their sum TOGETHER + all_answers = ( + self.QAAnswerLogit( + start_idx=sorted_start_indices[sum_pair.idx1], + end_idx=sorted_end_indices[sum_pair.idx2], + logit=sum_pair.sum + ) + for sum_pair in + self.biggest_sums(sorted_start_scores, sorted_end_scores) + ) + # filter for only answers which have end at or after start + legit_answers = ( + answer + for answer in all_answers + if answer.end_idx >= answer.start_idx + ) + return legit_answers + + QAProbability = namedtuple('QaProbability', [ + 'start_idx', 'end_idx', 'probability' + ]) + + def qa_probabilities(self, start_logits, end_logits, k): + """ + Computes the top k qa probabilities, in terms of indices. 
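+
+        Worked example (illustrative, reusing the values from test_qa_util.py):
+        for descending scores [7, 4, 3] and [7, 6, 4], biggest_sums yields the
+        index pairs with sums 14, 13, 11, 8, 7; the top k of these logit sums
+        are then softmaxed, so the k returned probabilities sum to 1.
+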
+ :param start_logits: tensor from qa model output + :param end_logits: tensor from qa model output + :param k: number of results to return + :returns: list of namedtuples of the form (text,probability) + """ + top_answers = [ + qa_logit + for qa_logit, _ in zip(self.qa_logits(start_logits, end_logits), range(k)) + ] + logit_scores = torch.tensor([ + answer.logit + for answer in top_answers + ]) + + probabilities = torch.nn.Softmax(dim=0)(logit_scores).tolist() + return [ + self.QAProbability( + start_idx=answer.start_idx, + end_idx=answer.end_idx, + probability=probability + ) + for answer, probability in zip(top_answers, probabilities) + ] diff --git a/happytransformer/trainer.py b/happytransformer/trainer.py new file mode 100644 index 00000000..c8c2085c --- /dev/null +++ b/happytransformer/trainer.py @@ -0,0 +1,93 @@ +import time +import datetime +import math +from csv import DictWriter + + +class Trainer: + def __init__(self, model, model_type, tokenizer, device, runner, logger): + self.model = model + self.model_type = model_type + self.tokenizer = tokenizer + self.device = device + self.runner = runner + self.logger = logger + + + def train(self, filepath, args): + pass + + def test(self, filepath, args, output_filepath): + pass + + def eval(self, filepath, args, output_filepath): + pass + + def _get_train_eval_data(self, filepath): + """ + Used for parsing data for training and evaluating (both contain labels) + :param filepath: a string that contains the location of the data + :return: + """ + pass + + def _get_test_data(self, filepath): + pass + + def _format_time(self, time): + """ + elapsed: time in seconds + return: time outputted in hh:mm:ss format + """ + time_rounded = int(round((time))) + + # Format as hh:mm:ss + return str(datetime.timedelta(seconds=time_rounded)) + + + def _get_update_interval(self, count): + """ + Determines how often to print status, given the number of cases. + + First determines how often to update for exactly 50 updates. 
Then, rounds down to the nearest power of ten (10, 100, 1000, etc.)
+
+
+        :param count:
+        :return:
+        """
+
+        x = count / 50
+        order = math.floor(math.log(x, 10))
+
+        update_interval = order ** 10
+        if update_interval == 0:
+            return 1
+        return update_interval
+
+    def _print_status(self, init_time, count, total, update_interval, percentage = None):
+        if count % update_interval == 0 and count != 0:
+            current_time = time.time()
+            elapsed_time_string = self._format_time(current_time - init_time)
+
+            avg_ex = (current_time - init_time) / count
+            rem_time_int = avg_ex * (total - count)
+            rem_time_string = self._format_time(rem_time_int)
+            ending = ""
+            if percentage is not None:
+                ending = "Correct: " + str(round(percentage * 100, 2)) + "%"
+            status_output = "Done: " + str(count) + "/" + str(
+                total) + " ---- Elapsed: " + elapsed_time_string + " Estimated Remaining: " + rem_time_string + " " + ending
+
+
+            self.logger.info(status_output)
+
+    def _output_result_to_csv(self, output_filepath, fieldnames, results):
+        with open(output_filepath, 'w') as csv_file:
+            csv_writer = DictWriter(csv_file, fieldnames=fieldnames)
+            csv_writer.writeheader()
+            for result in results:
+                csv_writer.writerow(
+                    result
+                )
+            csv_file.close()
diff --git a/happytransformer/trainers/__init__.py b/happytransformer/trainers/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/happytransformer/trainers/default_args/default_args_qa.py b/happytransformer/trainers/default_args/default_args_qa.py
new file mode 100644
index 00000000..c967fb56
--- /dev/null
+++ b/happytransformer/trainers/default_args/default_args_qa.py
@@ -0,0 +1,8 @@
+ARGS_QA_TRAINING = {
+    'max_length': 300,
+    'batch_size': 16,
+    'learning_rate': 5e-5,
+    'epochs': 2,
+
+
+}
\ No newline at end of file
diff --git a/happytransformer/trainers/trainer_qa.py b/happytransformer/trainers/trainer_qa.py
new file mode 100644
index 00000000..5a18da2b
--- /dev/null
+++ b/happytransformer/trainers/trainer_qa.py
@@ -0,0 +1,192 @@
+"""
+This code is a modified version of the official documentation for the Transformers library by Hugging Face,
+which can be found below.
+
+We prioritized following the official documentation as closely as possible to ensure we're using
+robust methods, and to improve maintainability as the documentation is updated.
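+
+Training and evaluation data is a csv file with the header
+context,question,answer_text,answer_start. For example, a row from
+data/sample-qa-training-eval-data.csv:
+
+    October 31st is the date,what is the date?,October 31st,0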
+ +https://huggingface.co/transformers/custom_datasets.html#question-answering-with-squad-2-0 +""" + +import torch +from happytransformer.trainer import Trainer +import csv +from torch.utils.data import DataLoader +from torch.utils.data import DataLoader +from transformers import AdamW +import time +from happytransformer.trainers.default_args.default_args_qa import ARGS_QA_TRAINING + +class QATrainer(Trainer): + + def __init__(self, model, model_type, tokenizer, device, runner, logger): + super(QATrainer, self).__init__(model, model_type, tokenizer, device, runner, logger) + + def train(self, filepath, args=ARGS_QA_TRAINING): + """ + #todo: add time elapsed and test time remaining similar to what is within eval + + + :param filepath: + :param args: + :return: + """ + + if args == None: + args = ARGS_QA_TRAINING + + contexts, questions, answers = self.__get_train_eval_data(filepath) + + self.__add_end_idx(contexts, answers) + encodings = self.tokenizer(contexts, questions, truncation=True, padding=True) + self.__add_token_positions(encodings, answers) + dataset = QuestionAnsweringDataset(encodings) + self.model.train() + + train_loader = DataLoader(dataset, batch_size=args['batch_size'], shuffle=True) + + optim = AdamW(self.model.parameters(), lr=args['learning_rate']) + + for epoch in range(args['epochs']): + epoch_output = "Epoch: " + str(epoch) + self.logger.info(epoch_output) + batch_num = 0 + for batch in train_loader: + optim.zero_grad() + input_ids = batch['input_ids'].to(self.device) + attention_mask = batch['attention_mask'].to(self.device) + start_positions = batch['start_positions'].to(self.device) + end_positions = batch['end_positions'].to(self.device) + outputs = self.model(input_ids, attention_mask=attention_mask, start_positions=start_positions, + end_positions=end_positions) + loss = outputs[0] + loss.backward() + optim.step() + batch_logger_output = "Batch: " + str(batch_num) + " loss: " + str(round(loss.item(), 6)) + self.logger.info(batch_logger_output) + batch_num += 1 + self.model.eval() + + def eval(self, filepath, args, output_filepath=None): + + contexts, questions, answers = self.__get_train_eval_data(filepath) + init_time = time.time() + correct = 0 + count = 0 + total = len(contexts) + update_interval = self._get_update_interval(total) + + results = list() + + for case in zip(contexts, questions, answers): + context = case[0] + question = case[1] + answer = case[2] + output = self.runner.run_answer_question(question, context) + + # todo modify the qa functionality to output with correct capitalization + + compare_answer = answer["answer_text"].lower() + + if output == compare_answer: + correct += 1 + count += 1 + + if output_filepath != None: + results.append( + { + "contexts": context, + "questions": question, + "answer": answer["answer_text"].lower(), + "outputs": output, + "correct": output == compare_answer + } + ) + + self._print_status(init_time, count, total, update_interval, correct/count) + + score = correct/total + ending = str(round(score, 2) * 100) + "%" + + result_output = "Evaluating Result: " + str(correct) + "/" + str(total) + " -- " + ending + self.logger.info(result_output) + + if output_filepath != None: + fieldnames = ["contexts", "questions", "answer", "outputs", "correct"] + self._output_result_to_csv(output_filepath, fieldnames, results) + + return score + + + def test(self, filepath, args, output_filepath): + #todo + pass + + def __get_train_eval_data(self, filepath): + """ + Used for parsing data for training and evaluating (both contain 
labels) + :param filepath: a string that contains the location of the data + :return: + """ + + contexts = [] + questions = [] + answers = [] + with open(filepath, newline='') as csv_file: + reader = csv.DictReader(csv_file) + for row in reader: + contexts.append(row['context']) + questions.append(row['question']) + answer = {} + answer["answer_text"] = row['answer_text'] + answer["answer_start"] = int(row['answer_start']) + answers.append(answer) + csv_file.close() + + return contexts, questions, answers + + + def __add_end_idx(self, contexts, answers): + for answer, context in zip(answers, contexts): + + gold_text = answer['answer_text'] + start_idx = answer['answer_start'] + end_idx = start_idx + len(gold_text) + + # todo (maybe): strip answer['text'] (remove white space from start and end) + # sometimes squad answers are off by a character or two – fix this + if context[start_idx:end_idx] == gold_text: + answer['answer_end'] = end_idx + elif context[start_idx - 1:end_idx - 1] == gold_text: + answer['answer_start'] = start_idx - 1 + answer['answer_end'] = end_idx - 1 # When the gold label is off by one character + elif context[start_idx - 2:end_idx - 2] == gold_text: + answer['answer_start'] = start_idx - 2 + answer['answer_end'] = end_idx - 2 # When the gold label is off by two characters + else: + print("error: implement skipping training answer") + + def __add_token_positions(self, encodings, answers): + start_positions = [] + end_positions = [] + for i in range(len(answers)): + start_positions.append(encodings.char_to_token(i, answers[i]['answer_start'])) + end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1)) + # if None, the answer passage has been truncated + if start_positions[-1] is None: + start_positions[-1] = self.tokenizer.model_max_length + if end_positions[-1] is None: + end_positions[-1] = self.tokenizer.model_max_length + encodings.update({'start_positions': start_positions, 'end_positions': end_positions}) + + +class QuestionAnsweringDataset(torch.utils.data.Dataset): + + def __init__(self, encodings): + self.encodings = encodings + + def __getitem__(self, idx): + return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} + + def __len__(self): + return len(self.encodings.input_ids) \ No newline at end of file diff --git a/tests/test_qa_multi.py b/tests/test_qa_multi.py index e5d4960a..a4b3696a 100644 --- a/tests/test_qa_multi.py +++ b/tests/test_qa_multi.py @@ -4,7 +4,8 @@ from happytransformer import HappyBERT -happy_bert = HappyBERT('bert-large-uncased-whole-word-masking-finetuned-squad') +happy_bert = HappyBERT() +happy_bert.init_qa() PARAGRAPH = ( 'McGill is a university located in Montreal. 
' diff --git a/tests/test_qa_training.py b/tests/test_qa_training.py new file mode 100644 index 00000000..ef4c4eb5 --- /dev/null +++ b/tests/test_qa_training.py @@ -0,0 +1,40 @@ +from happytransformer import HappyBERT + + +def test_qa_training(): + + happy_bert = HappyBERT() + happy_bert.init_qa() + start_answers = happy_bert.answers_to_question("What is the date?", "October 31st is the date") + happy_bert.train_qa("../data/sample-qa-training-eval-data.csv") + end_answers = happy_bert.answers_to_question("What is the date?", "October 31st is the date") + + assert start_answers[0]["text"] == "october 31st" + assert end_answers[0]["text"] == "october 31st" + assert end_answers[0]["softmax"] > start_answers[0]["softmax"] + +def test_qa_eval(): + happy_bert = HappyBERT() + happy_bert.init_qa() + before = happy_bert.eval_qa("../data/sample-qa-training-eval-data.csv") + happy_bert.train_qa("../data/sample-qa-training-eval-data.csv") + after = happy_bert.eval_qa("../data/sample-qa-training-eval-data.csv") + + #todo assert by making sure the output csv makes sense + # todo, also, perhaps use a different dataset for training and eval + + + assert after >= before + # todo get a larger dataset + # however, we do not want to commit a large dataset to the repo, + # so we'll have to find a way to download it from the web when the code runs + + # also, use separate data for test and eval + # assert after > before + + + + +# def test_qa_test(): +#todo + diff --git a/tests/test_qa_util.py b/tests/test_qa_util.py index 82f7af1f..4f610b68 100644 --- a/tests/test_qa_util.py +++ b/tests/test_qa_util.py @@ -2,22 +2,23 @@ Contains tests for functions found within qa_util.py """ -from happytransformer.qa_util import biggest_sums, SumPair - +from happytransformer.runners.runner_answer_question import AnswerQuestionRunner def test_biggest_sums(): """ Tests the biggest_sums function """ items_a = [7, 4, 3] items_b = [7, 6, 4] + qa_runner = AnswerQuestionRunner("bert", None, None) + expected_pairs = [ - SumPair(idx1=0, idx2=0, sum=14), # 7+7 - SumPair(idx1=0, idx2=1, sum=13), # 7+6 - SumPair(idx1=0, idx2=2, sum=11), # 7+4 - SumPair(idx1=1, idx2=2, sum=8), # 4+4 - SumPair(idx1=2, idx2=2, sum=7) # 3+4 + qa_runner.SumPair(idx1=0, idx2=0, sum=14), # 7+7 + qa_runner.SumPair(idx1=0, idx2=1, sum=13), # 7+6 + qa_runner.SumPair(idx1=0, idx2=2, sum=11), # 7+4 + qa_runner.SumPair(idx1=1, idx2=2, sum=8), # 4+4 + qa_runner.SumPair(idx1=2, idx2=2, sum=7) # 3+4 ] - computed_pairs = biggest_sums(items_a, items_b) + computed_pairs = qa_runner.biggest_sums(items_a, items_b) assert all( expected_pair == computed_pair for expected_pair, computed_pair in From ee92150ec51e6534f13bfab80e2601a6764b5f49 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Fri, 1 Jan 2021 23:19:40 -0500 Subject: [PATCH 002/155] Added suggestions from Ted --- happytransformer/happy_bert.py | 6 +-- .../runners/runner_answer_question.py | 37 ++----------------- happytransformer/runners/runner_util.py | 34 +++++++++++++++++ happytransformer/trainer.py | 15 ++++---- happytransformer/trainers/trainer_qa.py | 6 ++- tests/test_qa_util.py | 15 ++++---- 6 files changed, 58 insertions(+), 55 deletions(-) create mode 100644 happytransformer/runners/runner_util.py diff --git a/happytransformer/happy_bert.py b/happytransformer/happy_bert.py index 4ab5b059..55729acf 100644 --- a/happytransformer/happy_bert.py +++ b/happytransformer/happy_bert.py @@ -15,7 +15,7 @@ ) import torch -from happytransformer.runners.runner_answer_question import AnswerQuestionRunner +from 
happytransformer.runners.runner_answer_question import QuestionAnswering from happytransformer.happy_transformer import HappyTransformer from happytransformer.trainers.trainer_qa import QATrainer @@ -45,7 +45,7 @@ def __init__(self, model='bert-base-uncased'): self.nsp = None # Next Sentence Prediction #todo separate tokenizer for each model - self.tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased") + self.tokenizer = BertTokenizerFast.from_pretrained(model) self.masked_token = self.tokenizer.mask_token self.sep_token = self.tokenizer.sep_token self.cls_token = self.tokenizer.cls_token @@ -153,7 +153,7 @@ def init_qa(self, model='bert-large-uncased-whole-word-masking-finetuned-squad') if self.gpu_support == 'cuda': self.__qa_model.to('cuda') - self.__qa_runner = AnswerQuestionRunner(self._model_name, self.__qa_model, self.__qa_tokenizer) + self.__qa_runner = QuestionAnswering(self._model_name, self.__qa_model, self.__qa_tokenizer) self.__qa_init = True def answers_to_question(self, question, context, k=3): diff --git a/happytransformer/runners/runner_answer_question.py b/happytransformer/runners/runner_answer_question.py index 707e42f3..e3f13ddb 100644 --- a/happytransformer/runners/runner_answer_question.py +++ b/happytransformer/runners/runner_answer_question.py @@ -1,11 +1,9 @@ import torch +from happytransformer.runners.runner_util import biggest_sums from collections import namedtuple - -class AnswerQuestionRunner(): - SumPair = namedtuple('SumPair', ['idx1', 'idx2', 'sum']) - +class QuestionAnswering(): def __init__(self, type, model, tokenizer): self.type = type # BERT, ROBERTA, ALBERT etc self.model = model @@ -70,35 +68,6 @@ def _run_qa_model(self, input_ids): token_type_ids=torch.tensor([before_after_ids]) ) - def biggest_sums(self, items_a, items_b): - ''' - compute biggest sums from two descending ordered lists, - labeled by indices - :param items_a: list of numeric values, sorted descendingly - :param items_b: list of numeric values, sorted descendingly - :returns: list of namedtuples of the form (idx1,idx2,sum), - sorted by descending sum - ''' - a_index = b_index = 0 - while a_index < len(items_a) and b_index < len(items_b): - yield self.SumPair( - a_index, b_index, - sum=items_a[a_index] + items_b[b_index] - ) - # increment in whichever direction has smaller gain - # fallback to -inf at end of list. - # this will always be taken last. 
- next_from_a = items_a[a_index + 1] if a_index + 1 < len(items_a) else float('-inf') - next_from_b = items_b[b_index + 1] if b_index + 1 < len(items_b) else float('-inf') - - diff_a = items_a[a_index] - next_from_a - diff_b = items_b[b_index] - next_from_b - - if diff_a >= diff_b: - b_index += 1 - else: - a_index += 1 - QAAnswerLogit = namedtuple('QaAnswerLogit', [ 'start_idx', 'end_idx', 'logit' ]) @@ -130,7 +99,7 @@ def qa_logits(self, start_logits, end_logits): logit=sum_pair.sum ) for sum_pair in - self.biggest_sums(sorted_start_scores, sorted_end_scores) + biggest_sums(sorted_start_scores, sorted_end_scores) ) # filter for only answers which have end at or after start legit_answers = ( diff --git a/happytransformer/runners/runner_util.py b/happytransformer/runners/runner_util.py new file mode 100644 index 00000000..c19f9a8d --- /dev/null +++ b/happytransformer/runners/runner_util.py @@ -0,0 +1,34 @@ +from collections import namedtuple + + +SumPair = namedtuple('SumPair', ['idx1', 'idx2', 'sum']) + + +def biggest_sums(items_a, items_b): + ''' + compute biggest sums from two descending ordered lists, + labeled by indices + :param items_a: list of numeric values, sorted descendingly + :param items_b: list of numeric values, sorted descendingly + :returns: list of namedtuples of the form (idx1,idx2,sum), + sorted by descending sum + ''' + a_index = b_index = 0 + while a_index < len(items_a) and b_index < len(items_b): + yield SumPair( + a_index, b_index, + sum=items_a[a_index] + items_b[b_index] + ) + # increment in whichever direction has smaller gain + # fallback to -inf at end of list. + # this will always be taken last. + next_from_a = items_a[a_index + 1] if a_index + 1 < len(items_a) else float('-inf') + next_from_b = items_b[b_index + 1] if b_index + 1 < len(items_b) else float('-inf') + + diff_a = items_a[a_index] - next_from_a + diff_b = items_b[b_index] - next_from_b + + if diff_a >= diff_b: + b_index += 1 + else: + a_index += 1 \ No newline at end of file diff --git a/happytransformer/trainer.py b/happytransformer/trainer.py index c8c2085c..c8366c30 100644 --- a/happytransformer/trainer.py +++ b/happytransformer/trainer.py @@ -83,11 +83,10 @@ def _print_status(self, init_time, count, total, update_interval, percentage = N self.logger.info(status_output) def _output_result_to_csv(self, output_filepath, fieldnames, results): - with open(output_filepath, 'w') as csv_file: - csv_writer = DictWriter(csv_file, fieldnames=fieldnames) - csv_writer.writeheader() - for result in results: - csv_writer.writerow( - result - ) - csv_file.close() + with open(output_filepath, 'w') as csv_file: + csv_writer = DictWriter(csv_file, fieldnames=fieldnames) + csv_writer.writeheader() + for result in results: + csv_writer.writerow( + result + ) diff --git a/happytransformer/trainers/trainer_qa.py b/happytransformer/trainers/trainer_qa.py index 5a18da2b..6ce5200b 100644 --- a/happytransformer/trainers/trainer_qa.py +++ b/happytransformer/trainers/trainer_qa.py @@ -122,7 +122,8 @@ def test(self, filepath, args, output_filepath): #todo pass - def __get_train_eval_data(self, filepath): + @staticmethod + def __get_train_eval_data(filepath): """ Used for parsing data for training and evaluating (both contain labels) :param filepath: a string that contains the location of the data @@ -145,8 +146,9 @@ def __get_train_eval_data(self, filepath): return contexts, questions, answers + @staticmethod - def __add_end_idx(self, contexts, answers): + def __add_end_idx(contexts, answers): for answer, context in 
zip(answers, contexts): gold_text = answer['answer_text'] diff --git a/tests/test_qa_util.py b/tests/test_qa_util.py index 4f610b68..1e7bc501 100644 --- a/tests/test_qa_util.py +++ b/tests/test_qa_util.py @@ -2,23 +2,22 @@ Contains tests for functions found within qa_util.py """ -from happytransformer.runners.runner_answer_question import AnswerQuestionRunner +from happytransformer.runners.runner_util import SumPair, biggest_sums def test_biggest_sums(): """ Tests the biggest_sums function """ items_a = [7, 4, 3] items_b = [7, 6, 4] - qa_runner = AnswerQuestionRunner("bert", None, None) expected_pairs = [ - qa_runner.SumPair(idx1=0, idx2=0, sum=14), # 7+7 - qa_runner.SumPair(idx1=0, idx2=1, sum=13), # 7+6 - qa_runner.SumPair(idx1=0, idx2=2, sum=11), # 7+4 - qa_runner.SumPair(idx1=1, idx2=2, sum=8), # 4+4 - qa_runner.SumPair(idx1=2, idx2=2, sum=7) # 3+4 + SumPair(idx1=0, idx2=0, sum=14), # 7+7 + SumPair(idx1=0, idx2=1, sum=13), # 7+6 + SumPair(idx1=0, idx2=2, sum=11), # 7+4 + SumPair(idx1=1, idx2=2, sum=8), # 4+4 + SumPair(idx1=2, idx2=2, sum=7) # 3+4 ] - computed_pairs = qa_runner.biggest_sums(items_a, items_b) + computed_pairs = biggest_sums(items_a, items_b) assert all( expected_pair == computed_pair for expected_pair, computed_pair in From db8d5373360ddbb89a1e035f5af6f47cd6f491c6 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 2 Jan 2021 01:20:59 -0500 Subject: [PATCH 003/155] Added suggestions from Ted --- happytransformer/happy_bert.py | 74 ++++++++----------- happytransformer/happy_transformer.py | 12 ++- .../runners/runner_answer_question.py | 6 +- happytransformer/trainer.py | 5 +- happytransformer/trainers/trainer_qa.py | 3 +- 5 files changed, 48 insertions(+), 52 deletions(-) diff --git a/happytransformer/happy_bert.py b/happytransformer/happy_bert.py index 55729acf..c6711964 100644 --- a/happytransformer/happy_bert.py +++ b/happytransformer/happy_bert.py @@ -50,13 +50,6 @@ def __init__(self, model='bert-base-uncased'): self.sep_token = self.tokenizer.sep_token self.cls_token = self.tokenizer.cls_token - # ------------------------ QA - self.__qa_model = None # Question Answering - self.__qa_tokenizer = None - - self.__qa_init = False - self.__qa_trainer = None - self.__qa_runner = None def _get_masked_language_model(self): """ @@ -146,21 +139,31 @@ def init_qa(self, model='bert-large-uncased-whole-word-masking-finetuned-squad') Initializes the BertForQuestionAnswering transformer NOTE: This uses the bert-large-uncased-whole-word-masking-finetuned-squad pretraining for best results. 
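
        Example (illustrative):
            happy_bert = HappyBERT()
            happy_bert.init_qa()  # or pass another BERT QA checkpoint name
            happy_bert.answer_question("What is the date?", "October 31st is the date")
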
""" - self.__qa_model = BertForQuestionAnswering.from_pretrained(model) - self.__qa_tokenizer = BertTokenizerFast.from_pretrained(model) - self.__qa_model.eval() + self._qa_model = BertForQuestionAnswering.from_pretrained(model) + self._qa_tokenizer = BertTokenizerFast.from_pretrained(model) + self._qa_model.eval() if self.gpu_support == 'cuda': - self.__qa_model.to('cuda') - - self.__qa_runner = QuestionAnswering(self._model_name, self.__qa_model, self.__qa_tokenizer) - self.__qa_init = True - - def answers_to_question(self, question, context, k=3): - if self.__qa_init: - return self.__qa_runner.run_answers_to_question(question, context, k=k) + self._qa_model.to('cuda') + + self._qa_runner = QuestionAnswering(self.model_name, self._qa_model, self._qa_tokenizer) + self._qa_init = True + + def __check_if_init(self, check_trainer=False): + if self._qa_init: + if check_trainer: + if self._qa_trainer == None: + self._qa_trainer = QATrainer(self._qa_model, "bert", self.tokenizer, self.gpu_support, + self._qa_runner, + self.logger) + return True else: self._init_model_first_warning("question answering", "init_qa(model_name)") + return False + + def answers_to_question(self, question, context, k=3): + if self.__check_if_init(): + return self._qa_runner.answers_to_question(question, context, k=k) def answer_question(self, question, text): @@ -172,36 +175,21 @@ def answer_question(self, question, text): :param text: The text containing the answer to the question :return: The answer to the given question, as a string """ - if self.__qa_init: - return self.__qa_runner.run_answer_question(question, text) - else: - self._init_model_first_warning("question answering", "init_qa(model_name)") + + if self.__check_if_init(): + return self._qa_runner.answer_question(question, text) + def train_qa(self, filepath, args=None): - if self.__qa_init: - if self.__qa_trainer==None: - # model, model_name, tokenizer, args, model_type, device, runne - self.__qa_trainer = QATrainer(self.__qa_model, "bert", self.tokenizer, self.gpu_support, self.__qa_runner, self.logger) + if self.__check_if_init(True): + self._qa_trainer.train(filepath, args) - self.__qa_trainer.train(filepath, args) - else: - self._init_model_first_warning("question answering", "init_qa(model_name)") def test_qa(self, filepath, args=None): - if self.__qa_init: - if self.qa_trainer == None: - self.qa_trainer = QATrainer(self.__qa_model, "bert", self.tokenizer, self.gpu_support, self.__qa_runner, self.logger) - - self.__qa_trainer.train(filepath, args) - else: - self._init_model_first_warning("question answering", "init_qa(model_name)") + if self.__check_if_init(True): + raise NotImplementedError() def eval_qa(self, filepath, output_filepath=None, args=None): - if self.__qa_init: - if self.__qa_trainer == None: - self.__qa_trainer = QATrainer(self.__qa_model, "bert", self.tokenizer, self.gpu_support, self.__qa_runner, self.logger) + if self.__check_if_init(True): + return self._qa_trainer.eval(filepath, args, output_filepath) - return self.__qa_trainer.eval(filepath, args, output_filepath) - else: - self._init_model_first_warning("question answering", "init_qa(model_name)") - return - 1 diff --git a/happytransformer/happy_transformer.py b/happytransformer/happy_transformer.py index 3537773d..d540c1cd 100644 --- a/happytransformer/happy_transformer.py +++ b/happytransformer/happy_transformer.py @@ -44,7 +44,7 @@ class HappyTransformer: def __init__(self, model, model_name): # Transformer and tokenizer set in child class self.model = model - 
self._model_name = model_name + self.model_name = model_name self.mlm = None # Masked Language Model self.seq = None # Sequence Classification self.qa = None # Question Answering @@ -62,8 +62,6 @@ def __init__(self, model, model_name): else "cpu" ) - self.device = True if torch.cuda.is_available() else False - # show only happytransformer logs handler = logging.StreamHandler() @@ -81,6 +79,14 @@ def __init__(self, model, model_name): self.mwp_trainer = None self.mwp_trained = False + # ------------------------ QA + self._qa_model = None # Question Answering + self._qa_tokenizer = None + + self._qa_init = False + self._qa_trainer = None + self._qa_runner = None + def _get_masked_language_model(self): pass diff --git a/happytransformer/runners/runner_answer_question.py b/happytransformer/runners/runner_answer_question.py index e3f13ddb..fd6decc1 100644 --- a/happytransformer/runners/runner_answer_question.py +++ b/happytransformer/runners/runner_answer_question.py @@ -9,7 +9,7 @@ def __init__(self, type, model, tokenizer): self.model = model self.tokenizer = tokenizer - def run_answers_to_question(self, question, context, k=3): + def answers_to_question(self, question, context, k=3): input_ids = self._tokenize_qa(question, context) qa_output = self._run_qa_model(input_ids) sep_id_index = input_ids.index(self.tokenizer.sep_token_id) @@ -35,7 +35,7 @@ def run_answers_to_question(self, question, context, k=3): for answer in probabilities ] - def run_answer_question(self, question, text): + def answer_question(self, question, text): """ Using the given text, find the answer to the given question and return it. @@ -43,7 +43,7 @@ def run_answer_question(self, question, text): :param text: The text containing the answer to the question :return: The answer to the given question, as a string """ - return self.run_answers_to_question(question, text, 1)[0]["text"] + return self.answers_to_question(question, text, 1)[0]["text"] def _tokenize_qa(self, question, context): input_text = ' '.join([ diff --git a/happytransformer/trainer.py b/happytransformer/trainer.py index c8366c30..0a513105 100644 --- a/happytransformer/trainer.py +++ b/happytransformer/trainer.py @@ -15,12 +15,15 @@ def __init__(self, model, model_type, tokenizer, device, runner, logger): def train(self, filepath, args): + raise NotImplementedError() pass def test(self, filepath, args, output_filepath): + raise NotImplementedError() pass def eval(self, filepath, args, output_filepath): + raise NotImplementedError() pass def _get_train_eval_data(self, filepath): @@ -60,7 +63,7 @@ def _get_update_interval(self, count): x = count / 50 order = math.floor(math.log(x, 10)) - update_interval = order ** 10 + update_interval = 10 ** order if update_interval == 0: return 1 return update_interval diff --git a/happytransformer/trainers/trainer_qa.py b/happytransformer/trainers/trainer_qa.py index 6ce5200b..34bb4289 100644 --- a/happytransformer/trainers/trainer_qa.py +++ b/happytransformer/trainers/trainer_qa.py @@ -12,7 +12,6 @@ from happytransformer.trainer import Trainer import csv from torch.utils.data import DataLoader -from torch.utils.data import DataLoader from transformers import AdamW import time from happytransformer.trainers.default_args.default_args_qa import ARGS_QA_TRAINING @@ -82,7 +81,7 @@ def eval(self, filepath, args, output_filepath=None): context = case[0] question = case[1] answer = case[2] - output = self.runner.run_answer_question(question, context) + output = self.runner.answer_question(question, context) # todo modify the qa 
functionality to output with correct capitalization From 5057b5d344afc2a43087cb6e523a1d03263cd1d7 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 2 Jan 2021 01:22:43 -0500 Subject: [PATCH 004/155] Added NotImplementedError() and removed pass --- happytransformer/trainer.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/happytransformer/trainer.py b/happytransformer/trainer.py index 0a513105..cc9f34f4 100644 --- a/happytransformer/trainer.py +++ b/happytransformer/trainer.py @@ -16,15 +16,12 @@ def __init__(self, model, model_type, tokenizer, device, runner, logger): def train(self, filepath, args): raise NotImplementedError() - pass def test(self, filepath, args, output_filepath): raise NotImplementedError() - pass def eval(self, filepath, args, output_filepath): raise NotImplementedError() - pass def _get_train_eval_data(self, filepath): """ @@ -32,10 +29,10 @@ def _get_train_eval_data(self, filepath): :param filepath: a string that contains the location of the data :return: """ - pass + raise NotImplementedError() def _get_test_data(self, filepath): - pass + raise NotImplementedError() def _format_time(self, time): """ From b9761c32cf089d09855148033b0333b181aa5a84 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Wed, 6 Jan 2021 16:44:12 -0500 Subject: [PATCH 005/155] Rearranged to follow new arch, and finished QA training --- .../sample-qa-training-eval-data.csv | 0 happytransformer/__init__.py | 8 +- happytransformer/happy_question_answering.py | 142 +++++ happytransformer/happy_transformer.py | 600 ++---------------- happytransformer/happy_word_prediction.py | 44 ++ .../{runners => happytasks}/__init__.py | 0 .../happytasks/happy_mwp/default_args_mwp.py | 19 + .../happytasks/happy_mwp/trainer_mwp.py | 17 + .../happy_qa}/__init__.py | 0 .../happytasks/happy_qa/default_args_qa.py | 17 + .../happytasks/happy_qa/qa_util.py | 75 +++ .../happy_qa}/trainer_qa.py | 153 +++-- .../tasks_util.py} | 2 +- .../runners/runner_answer_question.py | 141 ---- happytransformer/to_delete/__init__.py | 0 .../{ => to_delete}/classifier_args.py | 0 .../{ => to_delete}/classifier_utils.py | 0 .../{ => to_delete}/happy_bert.py | 67 +- .../{ => to_delete}/happy_roberta.py | 2 +- .../to_delete/happy_transformer.py | 587 +++++++++++++++++ .../{ => to_delete}/happy_xlnet.py | 2 +- happytransformer/{ => to_delete}/mlm_utils.py | 0 .../{ => to_delete}/sequence_classifier.py | 2 +- happytransformer/trainer.py | 18 +- .../trainers/default_args/default_args_qa.py | 8 - tests/test_predict.py | 4 +- tests/test_qa_multi.py | 11 +- tests/test_qa_trainer.py | 55 ++ tests/test_qa_training.py | 40 -- tests/{test_qa_util.py => test_task_util.py} | 2 +- 30 files changed, 1125 insertions(+), 891 deletions(-) rename data/{ => test_qa_trainer}/sample-qa-training-eval-data.csv (100%) create mode 100644 happytransformer/happy_question_answering.py create mode 100644 happytransformer/happy_word_prediction.py rename happytransformer/{runners => happytasks}/__init__.py (100%) create mode 100644 happytransformer/happytasks/happy_mwp/default_args_mwp.py create mode 100644 happytransformer/happytasks/happy_mwp/trainer_mwp.py rename happytransformer/{trainers => happytasks/happy_qa}/__init__.py (100%) create mode 100644 happytransformer/happytasks/happy_qa/default_args_qa.py create mode 100644 happytransformer/happytasks/happy_qa/qa_util.py rename happytransformer/{trainers => happytasks/happy_qa}/trainer_qa.py (59%) rename happytransformer/{runners/runner_util.py => happytasks/tasks_util.py} (97%) delete 
mode 100644 happytransformer/runners/runner_answer_question.py
 create mode 100644 happytransformer/to_delete/__init__.py
 rename happytransformer/{ => to_delete}/classifier_args.py (100%)
 rename happytransformer/{ => to_delete}/classifier_utils.py (100%)
 rename happytransformer/{ => to_delete}/happy_bert.py (62%)
 rename happytransformer/{ => to_delete}/happy_roberta.py (93%)
 create mode 100644 happytransformer/to_delete/happy_transformer.py
 rename happytransformer/{ => to_delete}/happy_xlnet.py (93%)
 rename happytransformer/{ => to_delete}/mlm_utils.py (100%)
 rename happytransformer/{ => to_delete}/sequence_classifier.py (99%)
 delete mode 100644 happytransformer/trainers/default_args/default_args_qa.py
 create mode 100644 tests/test_qa_trainer.py
 delete mode 100644 tests/test_qa_training.py
 rename tests/{test_qa_util.py => test_task_util.py} (89%)

diff --git a/data/sample-qa-training-eval-data.csv b/data/test_qa_trainer/sample-qa-training-eval-data.csv
similarity index 100%
rename from data/sample-qa-training-eval-data.csv
rename to data/test_qa_trainer/sample-qa-training-eval-data.csv
diff --git a/happytransformer/__init__.py b/happytransformer/__init__.py
index b8904fb4..e8a705c9 100644
--- a/happytransformer/__init__.py
+++ b/happytransformer/__init__.py
@@ -1,6 +1,6 @@
-from happytransformer.happy_roberta import HappyROBERTA
-from happytransformer.happy_xlnet import HappyXLNET
-from happytransformer.happy_bert import HappyBERT
-from happytransformer.classifier_args import classifier_args
+from happytransformer.to_delete.happy_roberta import HappyROBERTA
+from happytransformer.to_delete.happy_xlnet import HappyXLNET
+from happytransformer.to_delete.happy_bert import HappyBERT
+from happytransformer.to_delete.classifier_args import classifier_args
 
 name = "happytransformer"
diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py
new file mode 100644
index 00000000..a3f664f9
--- /dev/null
+++ b/happytransformer/happy_question_answering.py
@@ -0,0 +1,142 @@
+"""
+Contains the HappyQuestionAnswering class.
+
+"""
+
+import torch
+from happytransformer.happy_transformer import HappyTransformer
+from happytransformer.happytasks.happy_qa.qa_util import qa_probabilities
+from happytransformer.happytasks.happy_qa.trainer_qa import QATrainer
+from happytransformer.happytasks.happy_qa.default_args_qa \
+    import ARGS_QA_EVAL, ARGS_QA_TEST, ARGS_QA_TRAIN
+from transformers import (
+    BertForQuestionAnswering,
+    BertTokenizerFast
+
+)
+
+
+class HappyQuestionAnswering(HappyTransformer):
+    """
+    This class is a user-facing class that allows users to solve question answering problems using
+    transformer QA models. These models are able to answer a question given context for the
+    question by selecting a span within the context that answers the question.
+
+    The purpose of this class is to be lightweight and easy
+    to understand and to offload complex tasks to
+    other classes.
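+
+    Example usage (an illustrative sketch, using the bundled sample data):
+
+        happy_qa = HappyQuestionAnswering()
+        happy_qa.answer_question("What is the date?", "October 31st is the date")
+        happy_qa.train("data/test_qa_trainer/sample-qa-training-eval-data.csv")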
+ """ + def __init__(self, model_type="BERT", + model_name="bert-large-uncased-whole-word-masking-finetuned-squad", device=None): + model = BertForQuestionAnswering.from_pretrained(model_name) + tokenizer = BertTokenizerFast.from_pretrained(model_name) + + super().__init__(model_type, model_name, model, tokenizer, device) + + self._trainer = QATrainer(model, model_type, tokenizer, self._device, + self.logger) + + def answers_to_question(self, question, context, k=3): + input_ids = self._tokenize_qa(question, context) + qa_output = self._run_qa_model(input_ids) + sep_id_index = input_ids.index(self._tokenizer.sep_token_id) + probabilities = qa_probabilities( + # only consider logits from the context part of the embedding. + # that is, between the middle [SEP] token + # and the final [SEP] token + start_logits=qa_output.start_logits[0][sep_id_index+1:-1], + end_logits=qa_output.end_logits[0][sep_id_index+1:-1], + k=k + ) + # qa probabilities use indices relative to context. + # tokens use indices relative to overall question [SEP] context embedding. + # need offset to resolve this difference + token_offset = sep_id_index + 1 + return [ + # grab ids from start to end (inclusive) and decode to text + {"text": self._tokenizer.decode( + input_ids[token_offset+answer.start_idx: token_offset+answer.end_idx+1] + ), + "softmax": answer.probability} + + for answer in probabilities + ] + + def answer_question(self, question, text): + """ + Using the given text, find the answer to the given question and return it. + + :param question: The question to be answered + :param text: The text containing the answer to the question + :return: The answer to the given question, as a string + """ + return self.answers_to_question(question, text, 1)[0]["text"] + + def _run_qa_model(self, input_ids): + sep_id_index = input_ids.index(self._tokenizer.sep_token_id) + before_after_ids = [ + 0 if idx <= sep_id_index else 1 + for idx, _ in enumerate(input_ids) + ] + with torch.no_grad(): + return self._model( + input_ids=torch.tensor([input_ids]), + token_type_ids=torch.tensor([before_after_ids]) + ) + + def _tokenize_qa(self, question, context): + input_text = ' '.join([ + question, + self._tokenizer.sep_token, + context + ]) + input_ids = self._tokenizer.encode(input_text) + return input_ids + + def train(self, input_filepath, args=ARGS_QA_TRAIN): + """ + Trains the question answering model + + input_filepath: a string that contains the location of a csv file + for training. Contains the following header values: context, + question, answer_text, answer_start + args: a dictionary that contains settings found under + happytransformer.happytasks.happy_qa.default_args_qa.py + return: None + """ + self._trainer.train(input_filepath=input_filepath, args=args) + + def eval(self, input_filepath, output_filepath, args=ARGS_QA_EVAL): + """ + Trains the question answering model + + input_filepath: a string that contains the location of a csv file + for training. Contains the following header values: + context, question, answer_text, answer_start + args: a dictionary that contains settings found under + happytransformer.happytasks.happy_qa.default_args_qa.py + output_filepath: a path to a csv file to output the results. 
+ This file contains the following header values: contexts, + questions, answer, outputs, correct, softmax + return: correct ration (correct/total) + """ + return self._trainer.eval(input_filepath=input_filepath, + solve=self.answers_to_question, args=args, + output_filepath=output_filepath) + + def test(self, input_filepath, output_filepath, args=ARGS_QA_TEST): + """ + Tests the question answering model. Used to obtain results + + input_filepath: a string that contains the location of a csv file + for training. Contains the following header values: + context, question + args: a dictionary that contains settings found under + happytransformer.happytasks.happy_qa.default_args_qa.py + output_filepath: a path to a csv file to output the results. + This file contains the following header values: contexts, questions, outputs, softmax + return: None + """ + self._trainer.test(input_filepath=input_filepath, + solve=self.answers_to_question, args=args, + output_filepath=output_filepath) diff --git a/happytransformer/happy_transformer.py b/happytransformer/happy_transformer.py index d540c1cd..3ddb8f94 100644 --- a/happytransformer/happy_transformer.py +++ b/happytransformer/happy_transformer.py @@ -1,69 +1,20 @@ -# disable pylint TODO warning -# pylint: disable=W0511 -# pylint: disable=C0301 - -""" -HappyTransformer is a wrapper over pytorch_transformers to make it -easier to use. -""" - -import string -import re -import os -import sys -import csv -import logging -import logging.config -import numpy as np import torch -import pandas as pd - -from happytransformer.classifier_args import classifier_args -from happytransformer.sequence_classifier import SequenceClassifier -from happytransformer.mlm_utils import FinetuneMlm, word_prediction_args - -def _indices_where(items, predicate): - return [ - idx - for idx,item in enumerate(items) - if predicate(item) - ] - +import logging -_POSSIBLE_MASK_TOKENS = ['', '', '[MASK]'] -class HappyTransformer: - """ - Initializes pytroch's transformer models and provided methods for - their basic functionality. - Philosophy: Automatically make decisions for the user so that they don't - have to have any understanding of PyTorch or transformer - models to be able to utilize their capabilities. 
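To make the [SEP] bookkeeping in answers_to_question() and _run_qa_model() above concrete, here is a toy walk-through (token ids invented; 101 stands in for BERT's [CLS] id and 102 for [SEP]):

    input_ids = [101, 2054, 3058, 102, 3058, 2003, 2281, 102]  # [CLS] q q [SEP] c c c [SEP]
    sep_id_index = input_ids.index(102)                        # 3, the middle [SEP]
    token_type_ids = [0 if i <= sep_id_index else 1 for i in range(len(input_ids))]
    # [0, 0, 0, 0, 1, 1, 1, 1]: question segment vs. context segment
    token_offset = sep_id_index + 1                            # 4, first context token
    # the start/end logits are then sliced as logits[0][sep_id_index + 1:-1],
    # keeping only positions strictly between the middle and the final [SEP]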
- """ +class HappyTransformer(): - def __init__(self, model, model_name): - # Transformer and tokenizer set in child class - self.model = model + def __init__(self, model_type, model_name, model, tokenizer, device): + self.model_type = model_type # BERT, ROBERTA, ALBERT etc self.model_name = model_name - self.mlm = None # Masked Language Model - self.seq = None # Sequence Classification - self.qa = None # Question Answering - self.mlm_args = None # Mask Language Model Finetuning - - # the following variables are declared in the child class: - self.tokenizer = None - - # Child class sets to indicate which model is being used - self.tag_one_transformers = ['BERT', "ROBERTA", 'XLNET'] - - # GPU support - self.gpu_support = torch.device( - "cuda" if torch.cuda.is_available() - else "cpu" - ) + self._model = model + self._tokenizer = tokenizer + self._model.eval() + self._trainer = None + # todo change logging system + self.logger = logging.getLogger(__name__) - # show only happytransformer logs handler = logging.StreamHandler() handler.addFilter(logging.Filter('happytransformer')) logging.basicConfig( @@ -72,516 +23,53 @@ def __init__(self, model, model_name): level=logging.INFO, handlers=[handler] ) - self.logger = logging.getLogger(__name__) - - self.logger.info("Using model: %s", self.gpu_support) - self.seq_trained = False - self.mwp_trainer = None - self.mwp_trained = False - - # ------------------------ QA - self._qa_model = None # Question Answering - self._qa_tokenizer = None - - self._qa_init = False - self._qa_trainer = None - self._qa_runner = None - - def _get_masked_language_model(self): - pass - - def _standardize_mask_tokens(self, text): - ''' - convert mask tokens to mask token preferred by tokenizer - ''' - for possible_mask_token in _POSSIBLE_MASK_TOKENS: - text = text.replace(possible_mask_token, self.tokenizer.mask_token) - return text - - def _prepare_mlm(self): - if self.mlm is None: - self._get_masked_language_model() - if self.gpu_support=='cuda': - self.mlm.to('cuda') - - def _masked_predictions_at_index_any(self, softmax, index, k): - ''' - return top predictions for a mask token from all embeddings - ''' - scores_tensor, token_ids_tensor = torch.topk(softmax[0, index], k) - scores = scores_tensor.tolist() - token_ids = token_ids_tensor.tolist() - tokens = self.tokenizer.convert_ids_to_tokens(token_ids) - options = [ - self._postprocess_option(token) - for token in tokens - ] - return [ - {"word": option, "softmax": score} - for option, score in zip(options, scores) - ] - - def _masked_predictions_at_index_options(self, softmax, index, options): - ''' - return top predictions for a mask token from a list of options - ''' - option_ids = [ - self.tokenizer.encode(option) - for option in options - ] - scores = [ - self.soft_sum(option_id, softmax[0], index) - for option_id in option_ids - ] - return [ - {"word": option, "softmax": score} - for option,score in zip(options,scores) - ] - - def _postprocess_option(self, text: str): - ''' - modifies option text as seen by predict_masks() output. - override in subclass to filter out weird characters. - :param text: original text of prediction option - :returns text: processed text of prediction option - ''' - return text - - def predict_masks(self, text: str, options=None, num_results=1): - ''' - Predict multiple [MASK] tokens in some text. 
- :param text: text containing the mask tokens - :param masks_options: list of lists of options as strings - :param num_results: number of results to return per mask token - num_results is ignored if options are supplied. - :returns: A list of list of namedtuples of the form (text,probability), - where predictions are ordered descendingly by likelihood - ''' - self._prepare_mlm() - text = self._standardize_mask_tokens(text) - - self._text_verification(text) - - text_tokens = ( - self._get_tokenized_text(text) - ) - softmax = self._get_prediction_softmax(text_tokens) - - masked_indices = _indices_where( - text_tokens, - lambda text: text == self.tokenizer.mask_token - ) - - if options is None: - return [ - self._masked_predictions_at_index_any( - softmax, masked_index, num_results - ) - for masked_index in masked_indices - ] - else: - return [ - self._masked_predictions_at_index_options( - softmax, masked_index, mask_options - ) - for masked_index, mask_options in zip(masked_indices, options) - ] - - def predict_mask(self, text: str, options=None, num_results=1): - ''' - Predict a single [MASK] token in some text. - :param text: text containing the mask token - :param options: list of options as strings - :param num_results: number of predictions to return if no options supplied - :returns: list of dictionaries with keys 'word' and 'softmax' - ''' - masks_options = None if options is None else [options] - predictions = self.predict_masks(text, masks_options, num_results) - return self.__format_option_scores(predictions[0]) - - def _get_tokenized_text(self, text): - """ - Formats a sentence so that it can be tokenized by a transformer. - :param text: a 1-2 sentence text that contains [MASK] - :return: A string with the same sentence that contains the required - tokens for the transformer - """ - - # Create a spacing around each punctuation character. eg "!" -> " ! " - # TODO: easy: find a cleaner way to do punctuation spacing - text = re.sub('([.,!?()])', r' \1 ', text) - # text = re.sub('\s{2,}', ' ', text) - - split_text = text.split() - new_text = list() - new_text.append(self.tokenizer.cls_token) - - for i, char in enumerate(split_text): - new_text.append(char.lower()) - if char not in string.punctuation: - pass - # must be a punctuation symbol - elif i + 1 >= len(split_text): - # is the last punctuation so simply add to the new_text - pass - else: - if split_text[i + 1] in string.punctuation: - pass - else: - new_text.append(self.tokenizer.sep_token) - # if self.model_name == "ROBERTA": - # # ROBERTA requires two "" tokens to separate sentences - # new_text.append(self.sep_token) - # must be a middle punctuation - new_text.append(self.tokenizer.sep_token) - - text = " ".join(new_text).replace('[mask]', self.tokenizer.mask_token) - text = self.tokenizer.tokenize(text) - return text - - def _get_prediction_softmax(self, text): - """ - Gets the softmaxes of the predictions for each index in the the given - input string. - Returned tensor will be in shape: - [1, , ] - :param text: a tokenized string to be used by the transformer. 
- :return: a tensor of the softmaxes of the predictions of the - transformer - - """ - - indexed_tokens = self.tokenizer.convert_tokens_to_ids(text) - # Convert inputs to PyTorch tensors - tokens_tensor = torch.tensor([indexed_tokens]) - - if self.gpu_support == "cuda": - tokens_tensor = tokens_tensor.to('cuda') - - with torch.no_grad(): - - if self.model_name != "ROBERTA": - segments_ids = self._get_segment_ids(text) - segments_tensors = torch.tensor([segments_ids]) - if self.gpu_support == "cuda": - segments_tensors = segments_tensors.to('cuda') - outputs = self.mlm(tokens_tensor, token_type_ids=segments_tensors) - else: - outputs = self.mlm(tokens_tensor) - - predictions = outputs[0] - - softmax = self._softmax(predictions) - return softmax - - def __format_option_scores(self, tupled_predicitons: list): - """ - Formats the given list of tuples containing the option and its - corresponding softtmax into a user friendly list of dictionaries where - the first element in the list is the option with the highest softmax. - Dictionary will be in the form: - {'word': , 'softmax': } - :param: ranked_scores: list of tuples to be converted into user - friendly dicitonary - :return: formatted_ranked_scores: list of dictionaries of the ranked - scores - """ - ranked_scores = sorted(tupled_predicitons, key=lambda x: x["softmax"], - reverse=True) - formatted_ranked_scores = list() - for dic in ranked_scores: - - formatted_ranked_scores.append({'word': dic["word"], 'softmax': dic["softmax"]}) - return formatted_ranked_scores - - def _softmax(self, value): - # TODO: make it an external function - return value.exp() / (value.exp().sum(-1)).unsqueeze(-1) - - def _get_segment_ids(self, tokenized_text: list): - """ - Converts a list of tokens into segment_ids. The segment id is a array - representation of the location for each character in the - first and second sentence. This method only words with 1-2 sentences. - Example: - tokenized_text = ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', - 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', - '[SEP]'] - segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] - returns segments_ids - """ - split_location = tokenized_text.index(self.tokenizer.sep_token) - - segment_ids = [ - 0 if idx <= split_location else 1 - for idx in range(len(tokenized_text)) - ] - # add exception case for XLNet - - return segment_ids - - def _text_verification(self, text: str): - - # TODO, Add cases for the other masked tokens used in common transformer models - valid = True - if '[MASK]' not in text: - self.logger.error("[MASK] was not found in your string. Change the word you want to predict to [MASK]") - valid = False - if '' in text or '' in text: - self.logger.info('Instead of using or , use [MASK] please as it is the convention') - valid = True - if '[CLS]' in text: - self.logger.error("[CLS] was found in your string. Remove it as it will be automatically added later") - valid = False - if '[SEP]' in text: - self.logger.error("[SEP] was found in your string. Remove it as it will be automatically added later") - valid = False - if not valid: - exit() - - @staticmethod - def soft_sum(option: list, softed, mask_id: int): - # TODO: Better logic. - """ - Adds the softmax of a single option - XLNET tokenizer sometimes splits words in to pieces. 
- Ex: The councilmen -> ['the', 'council', 'men'] - Pretty sure that this is mathematically wrong - :param option: Id of tokens in one option - :param softed: softmax of the output - :param mask: Index of masked word - :return: float Tensor - """ - # Collects the softmax of all tokens in list - return np.sum([softed[mask_id][op] for op in option]) - - def init_sequence_classifier(self): - """ - Initializes a binary sequence classifier model with default settings - """ - - # TODO Test the sequence classifier with other models - args = classifier_args.copy() - self.seq = SequenceClassifier(args, self.tokenizer, self.logger, self.gpu_support, self.model, self.model_name) - - self.logger.info("A binary sequence classifier for %s has been initialized", self.model_name) - - def custom_init_sequence_classifier(self, args): - """ - Initializes a binary sequence classifier model with custom settings. - The default settings args dictionary can be found happy_transformer/sequence_classification/classifier_args. - This dictionary can then be modified and then used as the only input for this method. - - """ - self.seq = SequenceClassifier(args, self.tokenizer, self.logger, self.gpu_support, self.model, self.model_name) - self.logger.info("A binary sequence classifier for %s has been initialized", self.model_name) - - def train_sequence_classifier(self, train_csv_path): - """ - Trains the HappyTransformer's sequence classifier - - :param train_csv_path: A path to the csv evaluation file. - Each test is contained within a row. - The first column is for the the correct answers, either 0 or 1 as an int or a string . - The second column is for the text. - """ - self.logger.info("***** Running Training *****") - - train_df = self.__process_classifier_data(train_csv_path) - - if self.seq is None: - self.logger.error("Initialize the sequence classifier before training") - exit() - - sys.stdout = open(os.devnull, - 'w') # Disable printing to stop external libraries from printing - train_df = train_df.astype("str") - self.seq.train_list_data = train_df.values.tolist() - del train_df # done with train_df - self.seq.train_model() - self.seq_trained = True - sys.stdout = sys.__stdout__ # Enable printing - - def eval_sequence_classifier(self, eval_csv_path): - """ - Evaluates the trained sequence classifier against a testing set. - - :param csv_path: A path to the csv evaluation file. - Each test is contained within a row. - The first column is for the the correct answers, either 0 or 1 as an int or a string . - The second column is for the text. - - :return: A dictionary evaluation matrix - """ - - self.logger.info("***** Running evaluation *****") - - sys.stdout = open(os.devnull, 'w') # Disable printing - - eval_df = self.__process_classifier_data(eval_csv_path) - - if not self.seq_trained: - self.logger.error("Train the sequence classifier before evaluation") - exit() - - eval_df = eval_df.astype("str") - self.seq.eval_list_data = eval_df.values.tolist() - - results = self.seq.evaluate() - sys.stdout = sys.__stdout__ # Enable printing - - return results - - def test_sequence_classifier(self, test_csv_path): - """ - - :param test_csv_path: a path to the csv evaluation file. - Each test is contained within a row. - The first column is for the the correct answers, either 0 or 1 as an int or a string . - The second column is for the text. 
- :return: A list of predictions where each prediction index is the same as the corresponding test's index - """ - self.logger.info("***** Running Testing *****") - sys.stdout = open(os.devnull, 'w') # Disable printing - - test_df = self.__process_classifier_data(test_csv_path, for_test_data=True) - - # todo finish - if not self.seq_trained: - self.logger.error("Train the sequence classifier before testing") - exit() - - test_df = test_df.astype("str") - self.seq.test_list_data = test_df.values.tolist() - del test_df # done with test_df - - results = self.seq.test() - - sys.stdout = sys.__stdout__ # Enable printing - - return results - - def __process_classifier_data(self, csv_path, for_test_data=False): - """ - Credit: This code was modified from this repository - https://github.com/ThilinaRajapakse/pytorch-transformers-classification - :param csv_path: Path to csv file that must be processed - :return: A Panda dataframe with the proper information for classification tasks - """ - - if for_test_data: - with open(csv_path, 'r') as test_file: - reader = csv.reader(test_file) - text_list = list(reader) - # Blank values are required for the first column value the testing data to increase - # reusability of preprocessing methods between the tasks - blank_values = ["0"] * len(text_list) - data_frame = pd.DataFrame([*zip(blank_values, text_list)]) - del blank_values # done with blank_values + if device is None: + self._device = torch.device( + "cuda" if torch.cuda.is_available() + else "cpu" + ) else: - data_frame = pd.read_csv(csv_path, header=None) + self._device = device - data_frame[0] = data_frame[0].astype("int") - data_frame = pd.DataFrame({ - 'id': range(len(data_frame)), - 'label': data_frame[0], - 'alpha': ['a'] * data_frame.shape[0], - 'text': data_frame[1].replace(r'\n', ' ', regex=True) - }) + if self._device == 'cuda': + self._model.to(self._device) + self.logger.info("Using model: %s", self._device) - return data_frame - def init_train_mwp(self, args=None): + def train(self, input_filepath, args): """ - Initializes the MLM for fine-tuning on masked word prediction. - If args are not supplied the following hyperparameters are used: - batch size = 1 - Number of epochs = 1 - Learning rate = 5e-5 - Adam epsilon = 1e-8 - + Trains a model + :param input_filepath: a string that contains a path to a csv file + that contains testing data + :param args: settings in the form of a dictionary + :return: None """ - if not args: - self.mlm_args = word_prediction_args - else: - self.mlm_args = args - - # TODO Test the sequence classifier with other models + raise NotImplementedError() - if self.model_name != "XLNET": - - # current implementation: - if not self.mlm: - self._get_masked_language_model() # if already has self.mlm - # don't call this - self.mwp_trainer = FinetuneMlm(self.mlm, self.mlm_args, - self.tokenizer, self.logger) - - self.logger.info( - "You can now train a masked word prediction model using %s", - self.model_name) - - else: - self.logger.error( - "Masked language model training is not available for XLNET") - sys.exit() - - def train_mwp(self, train_path: str): + def eval(self, input_filepath, output_filepath, args): """ - Trains the model with masked language modeling loss. + Evaluates the model. Determines how well the model performs on a given dataset - train_path: Path to the training file, expected to be a .txt or of - similar form. 
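One fragile spot in the new __init__ above: self._device can be a torch.device, yet it is compared to the string 'cuda' before the model is moved. Whether torch.device('cuda') == 'cuda' holds depends on the PyTorch version, so comparing the device type is the safer idiom (a suggested fix, not part of the patch):

    import torch

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if device.type == "cuda":   # compare the type, not the device object to a str
        pass                    # model.to(device) would go here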
+        :param input_filepath: a string that contains a path to a
+        csv file that contains evaluating data
+        :param output_filepath: a string that contains a path to a
+        csv file that will be created to store the results
+        :param args: settings in the form of a dictionary
+        :return: correct percentage
         """
+        raise NotImplementedError()
 
-        if torch.cuda.is_available():
-            if self.mwp_trained and self.mwp_trainer:  # If model is trained
-                self.logger.warning("Training on the already fine-tuned model")
-                self.mwp_trainer.train(train_path)
-
-            elif self.mwp_trainer and not self.mwp_trained:  # If trainer
-                # exists but isn't trained
-                self.mlm, self.tokenizer = self.mwp_trainer.train(train_path)
-                self.mwp_trained = True
-
-            elif not self.mwp_trainer:  # If trainer doesn't exist
-                self.logger.error(
-                    "The model is not loaded, you should run init_train_mwp.")
-                sys.exit()
-
-        else:  # If the user doesn't have a gpu.
-            self.logger.error(
-                "You are using %s, you must use a GPU to train a MLM",
-                self.gpu_support)
-            sys.exit()
-
-    def eval_mwp(self, eval_path: str, batch_size: int = 2):
+    def test(self, input_filepath, output_filepath, args):
         """
-        Evaluates the masked language model and returns the perplexity and
-        the evaluation loss.
-
-        eval_path: Path to the evaluation file, expected to be a .txt or
-        similar.
-        batch_size: Depending on the gpu the user may increase or decrease
-        batch size.
+        Used to generate predictions for a given dataset.
+        The dataset may not be labelled.
+        :param input_filepath: a string that contains a path to
+        a csv file that contains testing data
+        :param output_filepath: a string that contains a path to
+        a csv file that will be created to store the results
+        :param args: settings in the form of a dictionary
         """
-        if not self.mwp_trainer:
-            self.logger.error(
-                "The model is not loaded, you should run init_train_mwp.")
-            sys.exit()
-
-        if not self.mwp_trained:
-            self.logger.warning(
-                "You are evaluating on the pretrained model, not the fine-tuned model.")
-
-        results = self.mwp_trainer.evaluate(eval_path, batch_size)
-
-        return results
-
-    def _init_model_first_warning(self, model_type, method_name):
-
-        # todo make this a logger message
-        print("First initialize the", model_type, "using the", method_name, "method")
\ No newline at end of file
+        raise NotImplementedError()
diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py
new file mode 100644
index 00000000..590959f0
--- /dev/null
+++ b/happytransformer/happy_word_prediction.py
@@ -0,0 +1,44 @@
+from happytransformer.happytasks.happy_qa.trainer_qa import QATrainer
+
+from transformers import (
+    BertForMaskedLM,
+    BertTokenizerFast,
+    RobertaForMaskedLM,
+    RobertaTokenizerFast
+)
+
+from happytransformer.happy_transformer import HappyTransformer
+
+
+class HappyWordPrediction(HappyTransformer):
+    def __init__(self, model_type="BERT",
+                 model_name="bert-base-uncased", device=None):
+        model = None
+        tokenizer = None
+
+        if model_type == "BERT":
+            model = BertForMaskedLM.from_pretrained(model_name)
+            tokenizer = BertTokenizerFast.from_pretrained(model_name)
+
+        elif model_type == "ROBERTA":
+            model = RobertaForMaskedLM.from_pretrained(model_name)
+            tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
+
+        super().__init__(model_type, model_name, model, tokenizer, device)
+
+        # todo: use a dedicated word-prediction trainer; QATrainer is a placeholder
+        self._trainer = QATrainer(model,
+                                  model_type, tokenizer, self._device, self.logger)
+
+    def predict_masks(self):
+        raise NotImplementedError()
+
+    def 
predict_mask(self): + raise NotImplementedError() + + def train(self, input_filepath, args): + raise NotImplementedError() + + def test(self, input_filepath, output_filepath, args): + raise NotImplementedError() + + def eval(self, input_filepath, output_filepath, args): + raise NotImplementedError() diff --git a/happytransformer/runners/__init__.py b/happytransformer/happytasks/__init__.py similarity index 100% rename from happytransformer/runners/__init__.py rename to happytransformer/happytasks/__init__.py diff --git a/happytransformer/happytasks/happy_mwp/default_args_mwp.py b/happytransformer/happytasks/happy_mwp/default_args_mwp.py new file mode 100644 index 00000000..c5c1d089 --- /dev/null +++ b/happytransformer/happytasks/happy_mwp/default_args_mwp.py @@ -0,0 +1,19 @@ +ARGS_MWP_TRAIN = { + 'max_length': 300, + 'batch_size': 16, + 'learning_rate': 5e-5, + 'epochs': 2, + + +} + +ARGS_MWP_TEST = { + # eventually we'll add settings + +} + +ARGS_MWP_EVAL = { + + # eventually we'll add settings + +} \ No newline at end of file diff --git a/happytransformer/happytasks/happy_mwp/trainer_mwp.py b/happytransformer/happytasks/happy_mwp/trainer_mwp.py new file mode 100644 index 00000000..ef9c67c9 --- /dev/null +++ b/happytransformer/happytasks/happy_mwp/trainer_mwp.py @@ -0,0 +1,17 @@ +from happytransformer.trainer import Trainer +from happytransformer.happytasks.happy_mwp.default_args_mwp import ARGS_MWP_EVAL, ARGS_MWP_TEST, ARGS_MWP_TRAIN + + +class QATrainer(Trainer): + + def __init__(self, model, model_type, tokenizer, device, logger): + super(QATrainer, self).__init__(model, model_type, tokenizer, device, logger) + + def train(self, input_filepath, args=ARGS_MWP_TRAIN): + raise NotImplementedError() + + def test(self, input_filepath, answers_to_question, output_filepath, args=ARGS_MWP_TEST): + raise NotImplementedError() + + def eval(self, input_filepath, answers_to_question, args=ARGS_MWP_EVAL, output_filepath=None): + raise NotImplementedError() \ No newline at end of file diff --git a/happytransformer/trainers/__init__.py b/happytransformer/happytasks/happy_qa/__init__.py similarity index 100% rename from happytransformer/trainers/__init__.py rename to happytransformer/happytasks/happy_qa/__init__.py diff --git a/happytransformer/happytasks/happy_qa/default_args_qa.py b/happytransformer/happytasks/happy_qa/default_args_qa.py new file mode 100644 index 00000000..5877ce79 --- /dev/null +++ b/happytransformer/happytasks/happy_qa/default_args_qa.py @@ -0,0 +1,17 @@ +ARGS_QA_TRAIN = { + 'max_length': 300, + 'batch_size': 16, + 'learning_rate': 5e-5, + 'epochs': 2, + + +} + +ARGS_QA_TEST = { + # eventually we'll add settings + +} + +ARGS_QA_EVAL = { + # eventually we'll add settings +} diff --git a/happytransformer/happytasks/happy_qa/qa_util.py b/happytransformer/happytasks/happy_qa/qa_util.py new file mode 100644 index 00000000..be576491 --- /dev/null +++ b/happytransformer/happytasks/happy_qa/qa_util.py @@ -0,0 +1,75 @@ +from collections import namedtuple +import torch +from happytransformer.happytasks.tasks_util import biggest_sums + +QAAnswerLogit = namedtuple('QaAnswerLogit', [ + 'start_idx', 'end_idx', 'logit' +]) + +def qa_logits(start_logits, end_logits): + """ + Compute the logits for top qa pairs + :param start_logits: tensor from qa model output + :param end_logits: tensor from qa model output + :returns: generator of namedtuples of the form + (start_idx, end_idx, logit), sorted in descending order + by score + """ + + sorted_starts_tensors = torch.sort(start_logits, 
descending=True) + sorted_ends_tensors = torch.sort(end_logits, descending=True) + # start logits sorted in descending order INDEPENDENTLY + sorted_start_scores = sorted_starts_tensors.values.tolist() + sorted_start_indices = sorted_starts_tensors.indices.tolist() + # end logits sorted in descending order INDEPENDENTLY + sorted_end_scores = sorted_ends_tensors.values.tolist() + sorted_end_indices = sorted_ends_tensors.indices.tolist() + # start logit + end logit pairs sorted in descending order + # of their sum TOGETHER + all_answers = ( + QAAnswerLogit( + start_idx=sorted_start_indices[sum_pair.idx1], + end_idx=sorted_end_indices[sum_pair.idx2], + logit=sum_pair.sum + ) + for sum_pair in + biggest_sums(sorted_start_scores, sorted_end_scores) + ) + # filter for only answers which have end at or after start + legit_answers = ( + answer + for answer in all_answers + if answer.end_idx >= answer.start_idx + ) + return legit_answers + +QAProbability = namedtuple('QaProbability', [ + 'start_idx', 'end_idx', 'probability' +]) + +def qa_probabilities(start_logits, end_logits, k): + """ + Computes the top k qa probabilities, in terms of indices. + :param start_logits: tensor from qa model output + :param end_logits: tensor from qa model output + :param k: number of results to return + :returns: list of namedtuples of the form (text,probability) + """ + top_answers = [ + qa_logit + for qa_logit, _ in zip(qa_logits(start_logits, end_logits), range(k)) + ] + logit_scores = torch.tensor([ + answer.logit + for answer in top_answers + ]) + + probabilities = torch.nn.Softmax(dim=0)(logit_scores).tolist() + return [ + QAProbability( + start_idx=answer.start_idx, + end_idx=answer.end_idx, + probability=probability + ) + for answer, probability in zip(top_answers, probabilities) + ] diff --git a/happytransformer/trainers/trainer_qa.py b/happytransformer/happytasks/happy_qa/trainer_qa.py similarity index 59% rename from happytransformer/trainers/trainer_qa.py rename to happytransformer/happytasks/happy_qa/trainer_qa.py index 34bb4289..16cb7007 100644 --- a/happytransformer/trainers/trainer_qa.py +++ b/happytransformer/happytasks/happy_qa/trainer_qa.py @@ -1,41 +1,32 @@ """ -This code is a modified version of the official documentation for the transformer library by Hugging Face -which can be found below. +This code is a modified version of the official documentation for the +transformer library by Hugging Face which can be found below. We prioritized following the official documentation as close as possible to ensure we're using robust methods. And also, to improve maintainability as they update the documentation. 
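qa_util.py above pairs the independently sorted start and end logits with biggest_sums, keeps the k best spans, then softmaxes only those k logit sums. A toy numeric check of that last step (values invented):

    import torch

    logit_sums = torch.tensor([9.1, 7.4, 3.2])    # top-3 start+end sums, toy values
    probs = torch.nn.Softmax(dim=0)(logit_sums)
    print([round(p, 3) for p in probs.tolist()])  # [0.844, 0.154, 0.002]
    # note: these probabilities are relative to the k best spans, not to all spans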
https://huggingface.co/transformers/custom_datasets.html#question-answering-with-squad-2-0 """ - -import torch -from happytransformer.trainer import Trainer +import time import csv +import torch from torch.utils.data import DataLoader from transformers import AdamW -import time -from happytransformer.trainers.default_args.default_args_qa import ARGS_QA_TRAINING + +from happytransformer.trainer import Trainer class QATrainer(Trainer): - def __init__(self, model, model_type, tokenizer, device, runner, logger): - super(QATrainer, self).__init__(model, model_type, tokenizer, device, runner, logger) + def __init__(self, model, model_type, tokenizer, device, logger): + super().__init__(model, model_type, tokenizer, device, logger) - def train(self, filepath, args=ARGS_QA_TRAINING): + def train(self, input_filepath, args): """ - #todo: add time elapsed and test time remaining similar to what is within eval - - - :param filepath: - :param args: - :return: + See docstring in HappyQuestionAnswering.train() """ + #todo: add time elapsed and test time remaining similar to what is within eval - if args == None: - args = ARGS_QA_TRAINING - - contexts, questions, answers = self.__get_train_eval_data(filepath) - + contexts, questions, answers = self.__get_data(input_filepath) self.__add_end_idx(contexts, answers) encodings = self.tokenizer(contexts, questions, truncation=True, padding=True) self.__add_token_positions(encodings, answers) @@ -47,28 +38,37 @@ def train(self, filepath, args=ARGS_QA_TRAINING): optim = AdamW(self.model.parameters(), lr=args['learning_rate']) for epoch in range(args['epochs']): - epoch_output = "Epoch: " + str(epoch) + epoch_output = "Epoch: " + str(epoch) + "\n\n" self.logger.info(epoch_output) batch_num = 0 for batch in train_loader: + batch_output = "Batch: " + str(batch_num) + self.logger.info(batch_output) optim.zero_grad() input_ids = batch['input_ids'].to(self.device) attention_mask = batch['attention_mask'].to(self.device) start_positions = batch['start_positions'].to(self.device) end_positions = batch['end_positions'].to(self.device) - outputs = self.model(input_ids, attention_mask=attention_mask, start_positions=start_positions, + outputs = self.model(input_ids, attention_mask=attention_mask, + start_positions=start_positions, end_positions=end_positions) loss = outputs[0] loss.backward() optim.step() - batch_logger_output = "Batch: " + str(batch_num) + " loss: " + str(round(loss.item(), 6)) + batch_logger_output = "Batch: " + str(batch_num)\ + + " loss: " + str(round(loss.item(), 6)) self.logger.info(batch_logger_output) batch_num += 1 self.model.eval() - def eval(self, filepath, args, output_filepath=None): + def eval(self, input_filepath, solve, output_filepath, args): + """ + See docstring in HappyQuestionAnswering.eval() + + solve: HappyQuestionAnswering.answers_to_question() + """ - contexts, questions, answers = self.__get_train_eval_data(filepath) + contexts, questions, answers = self.__get_data(input_filepath) init_time = time.time() correct = 0 count = 0 @@ -81,26 +81,30 @@ def eval(self, filepath, args, output_filepath=None): context = case[0] question = case[1] answer = case[2] - output = self.runner.answer_question(question, context) + + result = solve(question, context, k=1000)[0] + output_text = result["text"] + output_softmax = result["softmax"] # todo modify the qa functionality to output with correct capitalization compare_answer = answer["answer_text"].lower() - if output == compare_answer: + if output_text == compare_answer: correct += 1 - count += 
1
-            if output_filepath != None:
-                results.append(
-                    {
-                        "contexts": context,
-                        "questions": question,
-                        "answer": answer["answer_text"].lower(),
-                        "outputs": output,
-                        "correct": output == compare_answer
-                    }
+            results.append(
+                {
+                    "contexts": context,
+                    "questions": question,
+                    "answer": answer["answer_text"].lower(),
+                    "outputs": output_text,
+                    "correct": output_text == compare_answer,
+                    "softmax": output_softmax
+                }
             )
+            count += 1
 
             self._print_status(init_time, count, total, update_interval, correct/count)
 
@@ -110,25 +114,62 @@
         result_output = "Evaluating Result: " + str(correct) + "/" + str(total) + " -- " + ending
         self.logger.info(result_output)
 
-        if output_filepath != None:
-            fieldnames = ["contexts", "questions", "answer", "outputs", "correct"]
-            self._output_result_to_csv(output_filepath, fieldnames, results)
+        fieldnames = ["contexts", "questions", "answer", "outputs", "correct", "softmax"]
+        self._output_result_to_csv(output_filepath, fieldnames, results)
 
         return score
 
+    def test(self, input_filepath, solve, output_filepath, args):
+        """
+        See docstring in HappyQuestionAnswering.test()
 
-    def test(self, filepath, args, output_filepath):
-        #todo
-        pass
+        solve: HappyQuestionAnswering.answers_to_question()
+        """
+        contexts, questions = self.__get_data(input_filepath, test_data=True)
+        init_time = time.time()
+        total = len(contexts)
+        count = 0
+        update_interval = self._get_update_interval(total)
+
+        results = list()
+
+        for case in zip(contexts, questions):
+            context = case[0]
+            question = case[1]
+
+            result = solve(question, context, k=1000)[0]
+            output_text = result["text"]
+            output_softmax = result["softmax"]
+
+            # todo modify the qa functionality to output with correct capitalization
+            results.append(
+                {
+                    "contexts": context,
+                    "questions": question,
+                    "outputs": output_text,
+                    "softmax": output_softmax
+                }
+            )
+
+            self._print_status(init_time, count, total, update_interval, None)
+            count += 1
+
+        fieldnames = ["contexts", "questions", "outputs", "softmax"]
+        self._output_result_to_csv(output_filepath, fieldnames, results)
+
+        result_output = "Output saved to: " + output_filepath
+        self.logger.info(result_output)
 
     @staticmethod
-    def __get_train_eval_data(filepath):
+    def __get_data(filepath, test_data=False):
         """
-        Used for parsing data for training and evaluating (both contain labels)
+        Used for parsing training, evaluating and testing data
         :param filepath: a string that contains the location of the data
-        :return:
+        :return: contexts and questions, plus answers unless test_data is True
         """
-
         contexts = []
         questions = []
         answers = []
         with open(filepath, newline='') as csv_file:
             reader = csv.DictReader(csv_file)
             for row in reader:
                 contexts.append(row['context'])
                 questions.append(row['question'])
-                answer = {}
-                answer["answer_text"] = row['answer_text']
-                answer["answer_start"] = int(row['answer_start'])
-                answers.append(answer)
+                if not test_data:
+                    answer = {}
+                    answer["answer_text"] = row['answer_text']
+                    answer["answer_start"] = int(row['answer_start'])
+                    answers.append(answer)
         csv_file.close()
-        return contexts, questions, answers
+        if not test_data:
+            return contexts, questions, answers
+        return contexts, questions
 
     @staticmethod
-
     def __add_end_idx(contexts, answers):
         for answer, context in zip(answers, contexts):
 
@@ -182,7 +225,11 @@ def __add_token_positions(self, encodings, answers):
 
 class QuestionAnsweringDataset(torch.utils.data.Dataset):
-
+    """
+    A class used to iterate through the training data.
+    It is used to create a torch DataLoader object, so that the training data can be
+    iterated through in batches easily.
+ """ def __init__(self, encodings): self.encodings = encodings diff --git a/happytransformer/runners/runner_util.py b/happytransformer/happytasks/tasks_util.py similarity index 97% rename from happytransformer/runners/runner_util.py rename to happytransformer/happytasks/tasks_util.py index c19f9a8d..784a652b 100644 --- a/happytransformer/runners/runner_util.py +++ b/happytransformer/happytasks/tasks_util.py @@ -31,4 +31,4 @@ def biggest_sums(items_a, items_b): if diff_a >= diff_b: b_index += 1 else: - a_index += 1 \ No newline at end of file + a_index += 1 diff --git a/happytransformer/runners/runner_answer_question.py b/happytransformer/runners/runner_answer_question.py deleted file mode 100644 index fd6decc1..00000000 --- a/happytransformer/runners/runner_answer_question.py +++ /dev/null @@ -1,141 +0,0 @@ -import torch - -from happytransformer.runners.runner_util import biggest_sums -from collections import namedtuple - -class QuestionAnswering(): - def __init__(self, type, model, tokenizer): - self.type = type # BERT, ROBERTA, ALBERT etc - self.model = model - self.tokenizer = tokenizer - - def answers_to_question(self, question, context, k=3): - input_ids = self._tokenize_qa(question, context) - qa_output = self._run_qa_model(input_ids) - sep_id_index = input_ids.index(self.tokenizer.sep_token_id) - probabilities = self.qa_probabilities( - # only consider logits from the context part of the embedding. - # that is, between the middle [SEP] token - # and the final [SEP] token - qa_output.start_logits[0][sep_id_index+1:-1], - qa_output.end_logits[0][sep_id_index+1:-1], - k - ) - # qa probabilities use indices relative to context. - # tokens use indices relative to overall question [SEP] context embedding. - # need offset to resolve this difference - token_offset = sep_id_index + 1 - return [ - {"text": self.tokenizer.decode( - # grab ids from start to end (inclusive) and decode to text - input_ids[token_offset+answer.start_idx : token_offset+answer.end_idx+1] - ), - "softmax": answer.probability} - - for answer in probabilities - ] - - def answer_question(self, question, text): - """ - Using the given text, find the answer to the given question and return it. 
- - :param question: The question to be answered - :param text: The text containing the answer to the question - :return: The answer to the given question, as a string - """ - return self.answers_to_question(question, text, 1)[0]["text"] - - def _tokenize_qa(self, question, context): - input_text = ' '.join([ - question, - self.tokenizer.sep_token, - context - ]) - input_ids = self.tokenizer.encode(input_text) - return input_ids - - - def _run_qa_model(self, input_ids): - - sep_id_index = input_ids.index(self.tokenizer.sep_token_id) - before_after_ids = [ - 0 if idx <= sep_id_index else 1 - for idx, _ in enumerate(input_ids) - ] - with torch.no_grad(): - return self.model( - input_ids=torch.tensor([input_ids]), - token_type_ids=torch.tensor([before_after_ids]) - ) - - QAAnswerLogit = namedtuple('QaAnswerLogit', [ - 'start_idx', 'end_idx', 'logit' - ]) - - def qa_logits(self, start_logits, end_logits): - """ - Compute the logits for top qa pairs - :param start_logits: tensor from qa model output - :param end_logits: tensor from qa model output - :returns: generator of namedtuples of the form - (start_idx, end_idx, logit), sorted in descending order - by score - """ - - sorted_starts_tensors = torch.sort(start_logits, descending=True) - sorted_ends_tensors = torch.sort(end_logits, descending=True) - # start logits sorted in descending order INDEPENDENTLY - sorted_start_scores = sorted_starts_tensors.values.tolist() - sorted_start_indices = sorted_starts_tensors.indices.tolist() - # end logits sorted in descending order INDEPENDENTLY - sorted_end_scores = sorted_ends_tensors.values.tolist() - sorted_end_indices = sorted_ends_tensors.indices.tolist() - # start logit + end logit pairs sorted in descending order - # of their sum TOGETHER - all_answers = ( - self.QAAnswerLogit( - start_idx=sorted_start_indices[sum_pair.idx1], - end_idx=sorted_end_indices[sum_pair.idx2], - logit=sum_pair.sum - ) - for sum_pair in - biggest_sums(sorted_start_scores, sorted_end_scores) - ) - # filter for only answers which have end at or after start - legit_answers = ( - answer - for answer in all_answers - if answer.end_idx >= answer.start_idx - ) - return legit_answers - - QAProbability = namedtuple('QaProbability', [ - 'start_idx', 'end_idx', 'probability' - ]) - - def qa_probabilities(self, start_logits, end_logits, k): - """ - Computes the top k qa probabilities, in terms of indices. 
- :param start_logits: tensor from qa model output - :param end_logits: tensor from qa model output - :param k: number of results to return - :returns: list of namedtuples of the form (text,probability) - """ - top_answers = [ - qa_logit - for qa_logit, _ in zip(self.qa_logits(start_logits, end_logits), range(k)) - ] - logit_scores = torch.tensor([ - answer.logit - for answer in top_answers - ]) - - probabilities = torch.nn.Softmax(dim=0)(logit_scores).tolist() - return [ - self.QAProbability( - start_idx=answer.start_idx, - end_idx=answer.end_idx, - probability=probability - ) - for answer, probability in zip(top_answers, probabilities) - ] diff --git a/happytransformer/to_delete/__init__.py b/happytransformer/to_delete/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/happytransformer/classifier_args.py b/happytransformer/to_delete/classifier_args.py similarity index 100% rename from happytransformer/classifier_args.py rename to happytransformer/to_delete/classifier_args.py diff --git a/happytransformer/classifier_utils.py b/happytransformer/to_delete/classifier_utils.py similarity index 100% rename from happytransformer/classifier_utils.py rename to happytransformer/to_delete/classifier_utils.py diff --git a/happytransformer/happy_bert.py b/happytransformer/to_delete/happy_bert.py similarity index 62% rename from happytransformer/happy_bert.py rename to happytransformer/to_delete/happy_bert.py index c6711964..21193c07 100644 --- a/happytransformer/happy_bert.py +++ b/happytransformer/to_delete/happy_bert.py @@ -9,16 +9,13 @@ from transformers import ( BertForMaskedLM, BertForNextSentencePrediction, - BertForQuestionAnswering, BertTokenizerFast ) import torch -from happytransformer.runners.runner_answer_question import QuestionAnswering -from happytransformer.happy_transformer import HappyTransformer -from happytransformer.trainers.trainer_qa import QATrainer +from happytransformer.to_delete.happy_transformer import HappyTransformer class HappyBERT(HappyTransformer): """ @@ -67,7 +64,6 @@ def _get_next_sentence_prediction(self): - def predict_next_sentence(self, sentence_a, sentence_b, use_probability=False): """ Determines if sentence B is likely to be a continuation after sentence @@ -129,67 +125,6 @@ def __is_one_sentence(self, text): sentence_found = True break return True -#-------------------------------------------------------# - - # QUESTION ANSWERING # -#-------------------------------------------------------# - - def init_qa(self, model='bert-large-uncased-whole-word-masking-finetuned-squad'): - """ - Initializes the BertForQuestionAnswering transformer - NOTE: This uses the bert-large-uncased-whole-word-masking-finetuned-squad pretraining for best results. 
- """ - self._qa_model = BertForQuestionAnswering.from_pretrained(model) - self._qa_tokenizer = BertTokenizerFast.from_pretrained(model) - self._qa_model.eval() - - if self.gpu_support == 'cuda': - self._qa_model.to('cuda') - - self._qa_runner = QuestionAnswering(self.model_name, self._qa_model, self._qa_tokenizer) - self._qa_init = True - - def __check_if_init(self, check_trainer=False): - if self._qa_init: - if check_trainer: - if self._qa_trainer == None: - self._qa_trainer = QATrainer(self._qa_model, "bert", self.tokenizer, self.gpu_support, - self._qa_runner, - self.logger) - return True - else: - self._init_model_first_warning("question answering", "init_qa(model_name)") - return False - - def answers_to_question(self, question, context, k=3): - if self.__check_if_init(): - return self._qa_runner.answers_to_question(question, context, k=k) - - - def answer_question(self, question, text): - """ - Using the given text, find the answer to the given question and return it. - - :param question: The question to be answered - #todo breaking change: change text to context - :param text: The text containing the answer to the question - :return: The answer to the given question, as a string - """ - - if self.__check_if_init(): - return self._qa_runner.answer_question(question, text) - - - def train_qa(self, filepath, args=None): - if self.__check_if_init(True): - self._qa_trainer.train(filepath, args) - - def test_qa(self, filepath, args=None): - if self.__check_if_init(True): - raise NotImplementedError() - def eval_qa(self, filepath, output_filepath=None, args=None): - if self.__check_if_init(True): - return self._qa_trainer.eval(filepath, args, output_filepath) diff --git a/happytransformer/happy_roberta.py b/happytransformer/to_delete/happy_roberta.py similarity index 93% rename from happytransformer/happy_roberta.py rename to happytransformer/to_delete/happy_roberta.py index d4ff6cef..bee4732b 100644 --- a/happytransformer/happy_roberta.py +++ b/happytransformer/to_delete/happy_roberta.py @@ -4,7 +4,7 @@ # disable pylint TODO warning # pylint: disable=W0511 -from happytransformer.happy_transformer import HappyTransformer +from happytransformer.to_delete.happy_transformer import HappyTransformer from transformers import RobertaForMaskedLM, RobertaTokenizer diff --git a/happytransformer/to_delete/happy_transformer.py b/happytransformer/to_delete/happy_transformer.py new file mode 100644 index 00000000..84c53839 --- /dev/null +++ b/happytransformer/to_delete/happy_transformer.py @@ -0,0 +1,587 @@ +# disable pylint TODO warning +# pylint: disable=W0511 +# pylint: disable=C0301 + +""" +HappyTransformer is a wrapper over pytorch_transformers to make it +easier to use. +""" + +import string +import re +import os +import sys +import csv +import logging +import logging.config +import numpy as np +import torch +import pandas as pd + +from happytransformer.to_delete.classifier_args import classifier_args +from happytransformer.to_delete.sequence_classifier import SequenceClassifier +from happytransformer.to_delete.mlm_utils import FinetuneMlm, word_prediction_args + +def _indices_where(items, predicate): + return [ + idx + for idx,item in enumerate(items) + if predicate(item) + ] + + +_POSSIBLE_MASK_TOKENS = ['', '', '[MASK]'] + +class HappyTransformer: + """ + Initializes pytroch's transformer models and provided methods for + their basic functionality. 
+ Philosophy: Automatically make decisions for the user so that they don't + have to have any understanding of PyTorch or transformer + models to be able to utilize their capabilities. + """ + + def __init__(self, model, model_name): + # Transformer and tokenizer set in child class + self.model = model + self.model_name = model_name + self.mlm = None # Masked Language Model + self.seq = None # Sequence Classification + self.qa = None # Question Answering + self.mlm_args = None # Mask Language Model Finetuning + + # the following variables are declared in the child class: + self.tokenizer = None + + # Child class sets to indicate which model is being used + self.tag_one_transformers = ['BERT', "ROBERTA", 'XLNET'] + + # GPU support + self.gpu_support = torch.device( + "cuda" if torch.cuda.is_available() + else "cpu" + ) + + + # show only happytransformer logs + handler = logging.StreamHandler() + handler.addFilter(logging.Filter('happytransformer')) + logging.basicConfig( + format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', + datefmt='%m/%d/%Y %H:%M:%S', + level=logging.INFO, + handlers=[handler] + ) + self.logger = logging.getLogger(__name__) + + self.logger.info("Using model: %s", self.gpu_support) + self.seq_trained = False + self.mwp_trainer = None + self.mwp_trained = False + + # ------------------------ QA + self._qa_model = None # Question Answering + self._qa_tokenizer = None + + self._qa_init = False + self._qa_trainer = None + self._qa_runner = None + + def _get_masked_language_model(self): + pass + + def _standardize_mask_tokens(self, text): + ''' + convert mask tokens to mask token preferred by tokenizer + ''' + for possible_mask_token in _POSSIBLE_MASK_TOKENS: + text = text.replace(possible_mask_token, self.tokenizer.mask_token) + return text + + def _prepare_mlm(self): + if self.mlm is None: + self._get_masked_language_model() + if self.gpu_support=='cuda': + self.mlm.to('cuda') + + def _masked_predictions_at_index_any(self, softmax, index, k): + ''' + return top predictions for a mask token from all embeddings + ''' + scores_tensor, token_ids_tensor = torch.topk(softmax[0, index], k) + scores = scores_tensor.tolist() + token_ids = token_ids_tensor.tolist() + tokens = self.tokenizer.convert_ids_to_tokens(token_ids) + options = [ + self._postprocess_option(token) + for token in tokens + ] + return [ + {"word": option, "softmax": score} + for option, score in zip(options, scores) + ] + + def _masked_predictions_at_index_options(self, softmax, index, options): + ''' + return top predictions for a mask token from a list of options + ''' + option_ids = [ + self.tokenizer.encode(option) + for option in options + ] + scores = [ + self.soft_sum(option_id, softmax[0], index) + for option_id in option_ids + ] + return [ + {"word": option, "softmax": score} + for option,score in zip(options,scores) + ] + + def _postprocess_option(self, text: str): + ''' + modifies option text as seen by predict_masks() output. + override in subclass to filter out weird characters. + :param text: original text of prediction option + :returns text: processed text of prediction option + ''' + return text + + def predict_masks(self, text: str, options=None, num_results=1): + ''' + Predict multiple [MASK] tokens in some text. + :param text: text containing the mask tokens + :param masks_options: list of lists of options as strings + :param num_results: number of results to return per mask token + num_results is ignored if options are supplied. 
+ :returns: A list of list of namedtuples of the form (text,probability), + where predictions are ordered descendingly by likelihood + ''' + self._prepare_mlm() + text = self._standardize_mask_tokens(text) + + self._text_verification(text) + + text_tokens = ( + self._get_tokenized_text(text) + ) + softmax = self._get_prediction_softmax(text_tokens) + + masked_indices = _indices_where( + text_tokens, + lambda text: text == self.tokenizer.mask_token + ) + + if options is None: + return [ + self._masked_predictions_at_index_any( + softmax, masked_index, num_results + ) + for masked_index in masked_indices + ] + else: + return [ + self._masked_predictions_at_index_options( + softmax, masked_index, mask_options + ) + for masked_index, mask_options in zip(masked_indices, options) + ] + + def predict_mask(self, text: str, options=None, num_results=1): + ''' + Predict a single [MASK] token in some text. + :param text: text containing the mask token + :param options: list of options as strings + :param num_results: number of predictions to return if no options supplied + :returns: list of dictionaries with keys 'word' and 'softmax' + ''' + masks_options = None if options is None else [options] + predictions = self.predict_masks(text, masks_options, num_results) + return self.__format_option_scores(predictions[0]) + + def _get_tokenized_text(self, text): + """ + Formats a sentence so that it can be tokenized by a transformer. + :param text: a 1-2 sentence text that contains [MASK] + :return: A string with the same sentence that contains the required + tokens for the transformer + """ + + # Create a spacing around each punctuation character. eg "!" -> " ! " + # TODO: easy: find a cleaner way to do punctuation spacing + text = re.sub('([.,!?()])', r' \1 ', text) + # text = re.sub('\s{2,}', ' ', text) + + split_text = text.split() + new_text = list() + new_text.append(self.tokenizer.cls_token) + + for i, char in enumerate(split_text): + new_text.append(char.lower()) + if char not in string.punctuation: + pass + # must be a punctuation symbol + elif i + 1 >= len(split_text): + # is the last punctuation so simply add to the new_text + pass + else: + if split_text[i + 1] in string.punctuation: + pass + else: + new_text.append(self.tokenizer.sep_token) + # if self.model_name == "ROBERTA": + # # ROBERTA requires two "" tokens to separate sentences + # new_text.append(self.sep_token) + # must be a middle punctuation + new_text.append(self.tokenizer.sep_token) + + text = " ".join(new_text).replace('[mask]', self.tokenizer.mask_token) + text = self.tokenizer.tokenize(text) + return text + + def _get_prediction_softmax(self, text): + """ + Gets the softmaxes of the predictions for each index in the the given + input string. + Returned tensor will be in shape: + [1, , ] + :param text: a tokenized string to be used by the transformer. 
+ :return: a tensor of the softmaxes of the predictions of the + transformer + + """ + + indexed_tokens = self.tokenizer.convert_tokens_to_ids(text) + # Convert inputs to PyTorch tensors + tokens_tensor = torch.tensor([indexed_tokens]) + + if self.gpu_support == "cuda": + tokens_tensor = tokens_tensor.to('cuda') + + with torch.no_grad(): + + if self.model_name != "ROBERTA": + segments_ids = self._get_segment_ids(text) + segments_tensors = torch.tensor([segments_ids]) + if self.gpu_support == "cuda": + segments_tensors = segments_tensors.to('cuda') + outputs = self.mlm(tokens_tensor, token_type_ids=segments_tensors) + else: + outputs = self.mlm(tokens_tensor) + + predictions = outputs[0] + + softmax = self._softmax(predictions) + return softmax + + def __format_option_scores(self, tupled_predicitons: list): + """ + Formats the given list of tuples containing the option and its + corresponding softtmax into a user friendly list of dictionaries where + the first element in the list is the option with the highest softmax. + Dictionary will be in the form: + {'word': , 'softmax': } + :param: ranked_scores: list of tuples to be converted into user + friendly dicitonary + :return: formatted_ranked_scores: list of dictionaries of the ranked + scores + """ + ranked_scores = sorted(tupled_predicitons, key=lambda x: x["softmax"], + reverse=True) + formatted_ranked_scores = list() + for dic in ranked_scores: + + formatted_ranked_scores.append({'word': dic["word"], 'softmax': dic["softmax"]}) + return formatted_ranked_scores + + def _softmax(self, value): + # TODO: make it an external function + return value.exp() / (value.exp().sum(-1)).unsqueeze(-1) + + def _get_segment_ids(self, tokenized_text: list): + """ + Converts a list of tokens into segment_ids. The segment id is a array + representation of the location for each character in the + first and second sentence. This method only words with 1-2 sentences. + Example: + tokenized_text = ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', + 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', + '[SEP]'] + segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] + returns segments_ids + """ + split_location = tokenized_text.index(self.tokenizer.sep_token) + + segment_ids = [ + 0 if idx <= split_location else 1 + for idx in range(len(tokenized_text)) + ] + # add exception case for XLNet + + return segment_ids + + def _text_verification(self, text: str): + + # TODO, Add cases for the other masked tokens used in common transformer models + valid = True + if '[MASK]' not in text: + self.logger.error("[MASK] was not found in your string. Change the word you want to predict to [MASK]") + valid = False + if '' in text or '' in text: + self.logger.info('Instead of using or , use [MASK] please as it is the convention') + valid = True + if '[CLS]' in text: + self.logger.error("[CLS] was found in your string. Remove it as it will be automatically added later") + valid = False + if '[SEP]' in text: + self.logger.error("[SEP] was found in your string. Remove it as it will be automatically added later") + valid = False + if not valid: + exit() + + @staticmethod + def soft_sum(option: list, softed, mask_id: int): + # TODO: Better logic. + """ + Adds the softmax of a single option + XLNET tokenizer sometimes splits words in to pieces. 
+ Ex: The councilmen -> ['the', 'council', 'men'] + Pretty sure that this is mathematically wrong + :param option: Id of tokens in one option + :param softed: softmax of the output + :param mask: Index of masked word + :return: float Tensor + """ + # Collects the softmax of all tokens in list + return np.sum([softed[mask_id][op] for op in option]) + + def init_sequence_classifier(self): + """ + Initializes a binary sequence classifier model with default settings + """ + + # TODO Test the sequence classifier with other models + args = classifier_args.copy() + self.seq = SequenceClassifier(args, self.tokenizer, self.logger, self.gpu_support, self.model, self.model_name) + + self.logger.info("A binary sequence classifier for %s has been initialized", self.model_name) + + def custom_init_sequence_classifier(self, args): + """ + Initializes a binary sequence classifier model with custom settings. + The default settings args dictionary can be found happy_transformer/sequence_classification/classifier_args. + This dictionary can then be modified and then used as the only input for this method. + + """ + self.seq = SequenceClassifier(args, self.tokenizer, self.logger, self.gpu_support, self.model, self.model_name) + self.logger.info("A binary sequence classifier for %s has been initialized", self.model_name) + + def train_sequence_classifier(self, train_csv_path): + """ + Trains the HappyTransformer's sequence classifier + + :param train_csv_path: A path to the csv evaluation file. + Each test is contained within a row. + The first column is for the the correct answers, either 0 or 1 as an int or a string . + The second column is for the text. + """ + self.logger.info("***** Running Training *****") + + train_df = self.__process_classifier_data(train_csv_path) + + if self.seq is None: + self.logger.error("Initialize the sequence classifier before training") + exit() + + sys.stdout = open(os.devnull, + 'w') # Disable printing to stop external libraries from printing + train_df = train_df.astype("str") + self.seq.train_list_data = train_df.values.tolist() + del train_df # done with train_df + self.seq.train_model() + self.seq_trained = True + sys.stdout = sys.__stdout__ # Enable printing + + def eval_sequence_classifier(self, eval_csv_path): + """ + Evaluates the trained sequence classifier against a testing set. + + :param csv_path: A path to the csv evaluation file. + Each test is contained within a row. + The first column is for the the correct answers, either 0 or 1 as an int or a string . + The second column is for the text. + + :return: A dictionary evaluation matrix + """ + + self.logger.info("***** Running evaluation *****") + + sys.stdout = open(os.devnull, 'w') # Disable printing + + eval_df = self.__process_classifier_data(eval_csv_path) + + if not self.seq_trained: + self.logger.error("Train the sequence classifier before evaluation") + exit() + + eval_df = eval_df.astype("str") + self.seq.eval_list_data = eval_df.values.tolist() + + results = self.seq.evaluate() + sys.stdout = sys.__stdout__ # Enable printing + + return results + + def test_sequence_classifier(self, test_csv_path): + """ + + :param test_csv_path: a path to the csv evaluation file. + Each test is contained within a row. + The first column is for the the correct answers, either 0 or 1 as an int or a string . + The second column is for the text. 
+    def test_sequence_classifier(self, test_csv_path):
+        """
+
+        :param test_csv_path: a path to the csv testing file.
+        Each test is contained within a row.
+        The first column is for the correct answers, either 0 or 1 as an int or a string.
+        The second column is for the text.
+        :return: A list of predictions where each prediction index is the same as the corresponding test's index
+        """
+        self.logger.info("***** Running Testing *****")
+        sys.stdout = open(os.devnull, 'w')  # Disable printing
+
+        test_df = self.__process_classifier_data(test_csv_path, for_test_data=True)
+
+        # todo finish
+        if not self.seq_trained:
+            self.logger.error("Train the sequence classifier before testing")
+            exit()
+
+        test_df = test_df.astype("str")
+        self.seq.test_list_data = test_df.values.tolist()
+        del test_df  # done with test_df
+
+        results = self.seq.test()
+
+        sys.stdout = sys.__stdout__  # Enable printing
+
+        return results
+
+    def __process_classifier_data(self, csv_path, for_test_data=False):
+        """
+        Credit: This code was modified from this repository
+        https://github.com/ThilinaRajapakse/pytorch-transformers-classification
+        :param csv_path: Path to csv file that must be processed
+        :return: A pandas DataFrame with the proper information for classification tasks
+        """
+
+        if for_test_data:
+            with open(csv_path, 'r') as test_file:
+                reader = csv.reader(test_file)
+                text_list = list(reader)
+            # Blank values are required for the first column of the testing data to increase
+            # reusability of preprocessing methods between the tasks
+            blank_values = ["0"] * len(text_list)
+            data_frame = pd.DataFrame([*zip(blank_values, text_list)])
+            del blank_values  # done with blank_values
+
+        else:
+            data_frame = pd.read_csv(csv_path, header=None)
+
+        data_frame[0] = data_frame[0].astype("int")
+        data_frame = pd.DataFrame({
+            'id': range(len(data_frame)),
+            'label': data_frame[0],
+            'alpha': ['a'] * data_frame.shape[0],
+            'text': data_frame[1].replace(r'\n', ' ', regex=True)
+        })
+
+        return data_frame
+
+    def init_train_mwp(self, args=None):
+        """
+        Initializes the MLM for fine-tuning on masked word prediction.
+        If args are not supplied the following hyperparameters are used:
+        batch size = 1
+        Number of epochs = 1
+        Learning rate = 5e-5
+        Adam epsilon = 1e-8
+
+        """
+        if not args:
+            self.mlm_args = word_prediction_args
+        else:
+            self.mlm_args = args
+
+        # TODO Test the sequence classifier with other models
+
+        if self.model_name != "XLNET":
+
+            # current implementation:
+            if not self.mlm:
+                self._get_masked_language_model()  # if already has self.mlm
+                # don't call this
+            self.mwp_trainer = FinetuneMlm(self.mlm, self.mlm_args,
+                                           self.tokenizer, self.logger)
+
+            self.logger.info(
+                "You can now train a masked word prediction model using %s",
+                self.model_name)
+
+        else:
+            self.logger.error(
+                "Masked language model training is not available for XLNET")
+            sys.exit()
+
+    def train_mwp(self, train_path: str):
+        """
+        Trains the model with masked language modeling loss.
+
+        train_path: Path to the training file, expected to be a .txt or of
+        similar form.
+
+        """
+
+        if torch.cuda.is_available():
+            if self.mwp_trained and self.mwp_trainer:  # If model is trained
+                self.logger.warning("Training on the already fine-tuned model")
+                self.mwp_trainer.train(train_path)
+
+            elif self.mwp_trainer and not self.mwp_trained:  # If trainer
+                # exists but isn't trained
+                self.mlm, self.tokenizer = self.mwp_trainer.train(train_path)
+                self.mwp_trained = True
+
+            elif not self.mwp_trainer:  # If trainer doesn't exist
+                self.logger.error(
+                    "The model is not loaded, you should run init_train_mwp.")
+                sys.exit()
+
+        else:  # If the user doesn't have a gpu.
+ self.logger.error( + "You are using %s, you must use a GPU to train a MLM", + self.gpu_support) + sys.exit() + + def eval_mwp(self, eval_path: str, batch_size: int = 2): + """ + Evaluates the masked language model and returns the perplexity and + the evaluation loss. + + eval_path: Path to the evaluation file, expected to be a .txt or + similar. + batch_size: Depending on the gpu the user may increase or decrease + batch size. + + """ + if not self.mwp_trainer: + self.logger.error( + "The model is not loaded, you should run init_train_mwp.") + sys.exit() + + if not self.mwp_trained: + self.logger.warning( + "You are evaluating on the pretrained model, not the fine-tuned model.") + + results = self.mwp_trainer.evaluate(eval_path, batch_size) + + return results + + def _init_model_first_warning(self, model_type, method_name): + + # todo make this a logger message + print("First initialize the", model_type, "using the", method_name, "method") \ No newline at end of file diff --git a/happytransformer/happy_xlnet.py b/happytransformer/to_delete/happy_xlnet.py similarity index 93% rename from happytransformer/happy_xlnet.py rename to happytransformer/to_delete/happy_xlnet.py index 457ff82c..8ce5d761 100644 --- a/happytransformer/happy_xlnet.py +++ b/happytransformer/to_delete/happy_xlnet.py @@ -7,7 +7,7 @@ XLNetTokenizer ) -from happytransformer.happy_transformer import HappyTransformer +from happytransformer.to_delete.happy_transformer import HappyTransformer class HappyXLNET(HappyTransformer): diff --git a/happytransformer/mlm_utils.py b/happytransformer/to_delete/mlm_utils.py similarity index 100% rename from happytransformer/mlm_utils.py rename to happytransformer/to_delete/mlm_utils.py diff --git a/happytransformer/sequence_classifier.py b/happytransformer/to_delete/sequence_classifier.py similarity index 99% rename from happytransformer/sequence_classifier.py rename to happytransformer/to_delete/sequence_classifier.py index 71367dfb..911d0959 100644 --- a/happytransformer/sequence_classifier.py +++ b/happytransformer/to_delete/sequence_classifier.py @@ -30,7 +30,7 @@ from transformers import AdamW, get_linear_schedule_with_warmup -from happytransformer.classifier_utils import ( +from happytransformer.to_delete.classifier_utils import ( convert_examples_to_features, output_modes, processors diff --git a/happytransformer/trainer.py b/happytransformer/trainer.py index cc9f34f4..259b4e83 100644 --- a/happytransformer/trainer.py +++ b/happytransformer/trainer.py @@ -5,22 +5,21 @@ class Trainer: - def __init__(self, model, model_type, tokenizer, device, runner, logger): + def __init__(self, model, model_type, tokenizer, device, logger): self.model = model self.model_type = model_type self.tokenizer = tokenizer self.device = device - self.runner = runner self.logger = logger - def train(self, filepath, args): + def train(self, input_filepath, args): raise NotImplementedError() - def test(self, filepath, args, output_filepath): + def test(self, input_filepath, solve, output_filepath, args): raise NotImplementedError() - def eval(self, filepath, args, output_filepath): + def eval(self, input_filepath, solve, output_filepath, args): raise NotImplementedError() def _get_train_eval_data(self, filepath): @@ -65,7 +64,7 @@ def _get_update_interval(self, count): return 1 return update_interval - def _print_status(self, init_time, count, total, update_interval, percentage = None): + def _print_status(self, init_time, count, total, update_interval, percentage=None): if count % update_interval and not 
count == 0: current_time = time.time() elapsed_time_string = self._format_time(current_time - init_time) @@ -74,12 +73,11 @@ def _print_status(self, init_time, count, total, update_interval, percentage = N rem_time_int = avg_ex * (total - count) rem_time_string = self._format_time(rem_time_int) ending = "" - if percentage != None: + if percentage is not None: ending = "Correct: " + str(round(percentage, 2)*100) + "%" status_output = "Done: ", str(count) + "/" + str( - total) + " ---- Elapsed: " + elapsed_time_string + " Estimated Remaining: " + rem_time_string +" " + ending - - + total) + " ---- Elapsed: " + elapsed_time_string +\ + " Estimated Remaining: " + rem_time_string +" " + ending self.logger.info(status_output) def _output_result_to_csv(self, output_filepath, fieldnames, results): diff --git a/happytransformer/trainers/default_args/default_args_qa.py b/happytransformer/trainers/default_args/default_args_qa.py deleted file mode 100644 index c967fb56..00000000 --- a/happytransformer/trainers/default_args/default_args_qa.py +++ /dev/null @@ -1,8 +0,0 @@ -ARGS_QA_TRAINING = { - 'max_length': 300, - 'batch_size': 16, - 'learning_rate': 5e-5, - 'epochs': 2, - - -} \ No newline at end of file diff --git a/tests/test_predict.py b/tests/test_predict.py index 0c0a35cb..2f81e7c7 100644 --- a/tests/test_predict.py +++ b/tests/test_predict.py @@ -1,4 +1,4 @@ -from happytransformer.happy_bert import HappyBERT +from happytransformer.to_delete.happy_bert import HappyBERT happy = HappyBERT() @@ -9,7 +9,7 @@ def test_prediction_options(): ''' predictions = happy.predict_mask( 'I want crackers and [MASK]', - options=['death','cheese'], + options=['death', 'cheese'], num_results=1 ) print(predictions) diff --git a/tests/test_qa_multi.py b/tests/test_qa_multi.py index a4b3696a..3bf8107e 100644 --- a/tests/test_qa_multi.py +++ b/tests/test_qa_multi.py @@ -2,10 +2,8 @@ Tests for the "answers_to_question" method that can be accessed through a HappyBERT object """ -from happytransformer import HappyBERT - -happy_bert = HappyBERT() -happy_bert.init_qa() +from happytransformer.happy_question_answering import HappyQuestionAnswering +happy_qa = HappyQuestionAnswering() PARAGRAPH = ( 'McGill is a university located in Montreal. 
' @@ -23,8 +21,9 @@ def test_qa_multi(): for question, expected_answer in QA_PAIRS: - computed_answers = happy_bert.answers_to_question(question, PARAGRAPH, k=10) - computed_answer = happy_bert.answer_question(question, PARAGRAPH) + + computed_answers = happy_qa.answers_to_question(question, PARAGRAPH, k=10) + computed_answer = happy_qa.answer_question(question, PARAGRAPH) # k is being respected assert len(computed_answers) == 10 # both answering methods yield correct result diff --git a/tests/test_qa_trainer.py b/tests/test_qa_trainer.py new file mode 100644 index 00000000..b8fef0d2 --- /dev/null +++ b/tests/test_qa_trainer.py @@ -0,0 +1,55 @@ +""" +Tests for the question answering training, evaluating and testing functionality +""" + +from happytransformer.happy_question_answering import HappyQuestionAnswering + + +def test_qa_train_eval(): + """ + Tests + HappyQuestionAnswering.eval() + HappyQuestionAnswering.train() + + """ + happy_qa = HappyQuestionAnswering() + # Test 1 + start_answers = happy_qa.answers_to_question("What is the date?", "October 31st is the date") + + # Test 2 + before = happy_qa.eval("../data/test_qa_trainer/sample-qa-training-eval-data.csv", + output_filepath="../data/test_qa_trainer/results/output-eval-before-qa.csv") + + happy_qa.train("../data/test_qa_trainer/sample-qa-training-eval-data.csv") + + + # Test 1 + end_answers = happy_qa.answers_to_question("What is the date?", "October 31st is the date") + assert start_answers[0]["text"] == "october 31st" + assert end_answers[0]["text"] == "october 31st" + assert end_answers[0]["softmax"] > start_answers[0]["softmax"] + + # Test 2 + after = happy_qa.eval("../data/test_qa_trainer/sample-qa-training-eval-data.csv", + output_filepath="../data/test_qa_trainer/results/output-eval-after-qa.csv") + assert after >= before + + # Test 3: + #todo ensure the output csv file makes sense for eval + + +def test_qa_testing(): + """ + tests: + + HappyQuestionAnswering.test() + + """ + + happy_qa = HappyQuestionAnswering() + happy_qa.test("../data/test_qa_trainer/sample-qa-test-data.csv", + output_filepath="../data/test_qa_trainer/results/output-test-output.csv") + + #todo ensure the output csv file makes sense + + diff --git a/tests/test_qa_training.py b/tests/test_qa_training.py deleted file mode 100644 index ef4c4eb5..00000000 --- a/tests/test_qa_training.py +++ /dev/null @@ -1,40 +0,0 @@ -from happytransformer import HappyBERT - - -def test_qa_training(): - - happy_bert = HappyBERT() - happy_bert.init_qa() - start_answers = happy_bert.answers_to_question("What is the date?", "October 31st is the date") - happy_bert.train_qa("../data/sample-qa-training-eval-data.csv") - end_answers = happy_bert.answers_to_question("What is the date?", "October 31st is the date") - - assert start_answers[0]["text"] == "october 31st" - assert end_answers[0]["text"] == "october 31st" - assert end_answers[0]["softmax"] > start_answers[0]["softmax"] - -def test_qa_eval(): - happy_bert = HappyBERT() - happy_bert.init_qa() - before = happy_bert.eval_qa("../data/sample-qa-training-eval-data.csv") - happy_bert.train_qa("../data/sample-qa-training-eval-data.csv") - after = happy_bert.eval_qa("../data/sample-qa-training-eval-data.csv") - - #todo assert by making sure the output csv makes sense - # todo, also, perhaps use a different dataset for training and eval - - - assert after >= before - # todo get a larger dataset - # however, we do not want to commit a large dataset to the repo, - # so we'll have to find a way to download it from the web when the code 
runs - - # also, use separate data for test and eval - # assert after > before - - - - -# def test_qa_test(): -#todo - diff --git a/tests/test_qa_util.py b/tests/test_task_util.py similarity index 89% rename from tests/test_qa_util.py rename to tests/test_task_util.py index 1e7bc501..f7e6cc42 100644 --- a/tests/test_qa_util.py +++ b/tests/test_task_util.py @@ -2,7 +2,7 @@ Contains tests for functions found within qa_util.py """ -from happytransformer.runners.runner_util import SumPair, biggest_sums +from happytransformer.happytasks.tasks_util import SumPair, biggest_sums def test_biggest_sums(): """ Tests the biggest_sums function From 5daa9c2666b4fbe2c16ec1a1b3e2f9d9cff373e5 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Wed, 6 Jan 2021 16:46:21 -0500 Subject: [PATCH 006/155] Added testing data --- data/test_qa_trainer/sample-qa-test-data.csv | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 data/test_qa_trainer/sample-qa-test-data.csv diff --git a/data/test_qa_trainer/sample-qa-test-data.csv b/data/test_qa_trainer/sample-qa-test-data.csv new file mode 100644 index 00000000..da0dbee8 --- /dev/null +++ b/data/test_qa_trainer/sample-qa-test-data.csv @@ -0,0 +1,3 @@ +context,question +October 31st is the date,what is the date? +The date is November 23rd ,what is the date? From d6654954bbe23760e5e4d4ba99f1220879233da9 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Wed, 6 Jan 2021 16:52:30 -0500 Subject: [PATCH 007/155] Added news to readme --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 24bd7359..59d64cda 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,10 @@ ## News: +### January x, 2021 +Introducing Version 2.0.0! +... + ### November 23rd, 2020 Last month, Happy Transformer was presented at a conference called C-Search, and the presentation won the Best Presentation Award. C-Search is the Queen's University Student Research Conference and had Turing Award Winner Professor Bengio as the Keynote Speaker this year. The video for the presentation can be found [here](https://www.youtube.com/watch?v=nNdFkq-y8Ng&t=12s). 
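To tie the new pieces together, here is a condensed sketch of the version 2.0.0 question answering workflow that the tests above exercise (the class, method names, CSV paths, and the "text"/"softmax" result keys are taken from the tests; the output file names are illustrative):

    from happytransformer.happy_question_answering import HappyQuestionAnswering

    happy_qa = HappyQuestionAnswering()

    # Inference: top k candidate answers, each a dict with "text" and "softmax" keys
    answers = happy_qa.answers_to_question("What is the date?",
                                           "October 31st is the date", k=3)

    # Training and evaluation on a labeled CSV, with eval runs before and after
    before = happy_qa.eval("../data/test_qa_trainer/sample-qa-training-eval-data.csv",
                           output_filepath="eval-before.csv")
    happy_qa.train("../data/test_qa_trainer/sample-qa-training-eval-data.csv")
    after = happy_qa.eval("../data/test_qa_trainer/sample-qa-training-eval-data.csv",
                          output_filepath="eval-after.csv")

    # Testing: answer an unlabeled context,question CSV and write the results
    happy_qa.test("../data/test_qa_trainer/sample-qa-test-data.csv",
                  output_filepath="test-output.csv")
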
From 732ceea8992cebb13b8121866aefa88b1637d46a Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Wed, 6 Jan 2021 18:13:21 -0500 Subject: [PATCH 008/155] deleted unneeded files --- happytransformer/to_delete/__init__.py | 0 happytransformer/to_delete/classifier_args.py | 19 - .../to_delete/classifier_utils.py | 266 -------- happytransformer/to_delete/happy_bert.py | 130 ---- happytransformer/to_delete/happy_roberta.py | 47 -- .../to_delete/happy_transformer.py | 587 ------------------ happytransformer/to_delete/happy_xlnet.py | 45 -- happytransformer/to_delete/mlm_utils.py | 297 --------- .../to_delete/sequence_classifier.py | 310 --------- 9 files changed, 1701 deletions(-) delete mode 100644 happytransformer/to_delete/__init__.py delete mode 100644 happytransformer/to_delete/classifier_args.py delete mode 100644 happytransformer/to_delete/classifier_utils.py delete mode 100644 happytransformer/to_delete/happy_bert.py delete mode 100644 happytransformer/to_delete/happy_roberta.py delete mode 100644 happytransformer/to_delete/happy_transformer.py delete mode 100644 happytransformer/to_delete/happy_xlnet.py delete mode 100644 happytransformer/to_delete/mlm_utils.py delete mode 100644 happytransformer/to_delete/sequence_classifier.py diff --git a/happytransformer/to_delete/__init__.py b/happytransformer/to_delete/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/happytransformer/to_delete/classifier_args.py b/happytransformer/to_delete/classifier_args.py deleted file mode 100644 index 5372c356..00000000 --- a/happytransformer/to_delete/classifier_args.py +++ /dev/null @@ -1,19 +0,0 @@ -classifier_args = { - # Basic fine tuning parameters - 'learning_rate': 1e-5, - 'num_epochs': 2, - 'batch_size': 8, - - # More advanced fine tuning parameters - 'max_seq_length': 128, # The maximum tokens allowed in each input. Max value = 512. Increasing it significantly increases memory usage - 'adam_epsilon': 1e-5, - 'gradient_accumulation_steps': 1, - 'weight_decay': 0, - 'warmup_ratio': 0.06, - 'warmup_steps': 0, - 'max_grad_norm': 1.0, - - # More modes will become available in future releases - 'task_mode': 'binary', - -} \ No newline at end of file diff --git a/happytransformer/to_delete/classifier_utils.py b/happytransformer/to_delete/classifier_utils.py deleted file mode 100644 index 346d9ea7..00000000 --- a/happytransformer/to_delete/classifier_utils.py +++ /dev/null @@ -1,266 +0,0 @@ - -""" -BERT classification fine-tuning: utilities to work with GLUE tasks - -Credit: The code below is from this file - https://github.com/ThilinaRajapakse/pytorch-transformers-classification/blob/master/utils.py - -""" - -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import absolute_import, division, print_function - -import csv -import logging -import sys -from io import open -from multiprocessing import Pool, cpu_count -from tqdm import tqdm - -logger = logging.getLogger(__name__) -csv.field_size_limit(2147483647) - - -class InputExample(object): - """A single training/test example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, input_ids, input_mask, segment_ids, label_id): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with open(input_file, "r", encoding="utf-8-sig") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - if sys.version_info[0] == 2: - line = list(unicode(cell, 'utf-8') for cell in line) - lines.append(line) - return lines - - -class BinaryProcessor(DataProcessor): - """Processor for the binary data sets""" - - def get_train_examples(self, data): - """See base class.""" - - - return self._create_examples(data , "train") - - - def get_dev_examples(self, data): - """See base class.""" - return self._create_examples(data, "dev") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - guid = "%s-%s" % (set_type, i) - text_a = line[3] - label = line[1] - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - -def convert_example_to_feature(example_row, pad_token=0, - sequence_a_segment_id=0, sequence_b_segment_id=1, - cls_token_segment_id=1, pad_token_segment_id=0, - mask_padding_with_zero=True, sep_token_extra=False): - example, label_map, max_seq_length, tokenizer, output_mode, cls_token_at_end, cls_token, sep_token, cls_token_segment_id, pad_on_left, pad_token_segment_id, sep_token_extra = example_row - - tokens_a = tokenizer.tokenize(example.text_a) - - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. 
- # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa. - special_tokens_count = 4 if sep_token_extra else 3 - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count) - else: - # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa. - special_tokens_count = 3 if sep_token_extra else 2 - if len(tokens_a) > max_seq_length - special_tokens_count: - tokens_a = tokens_a[:(max_seq_length - special_tokens_count)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. - tokens = tokens_a + [sep_token] - segment_ids = [sequence_a_segment_id] * len(tokens) - - if tokens_b: - tokens += tokens_b + [sep_token] - segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1) - - if cls_token_at_end: - tokens = tokens + [cls_token] - segment_ids = segment_ids + [cls_token_segment_id] - else: - tokens = [cls_token] + tokens - segment_ids = [cls_token_segment_id] + segment_ids - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. 
- padding_length = max_seq_length - len(input_ids) - if pad_on_left: - input_ids = ([pad_token] * padding_length) + input_ids - input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask - segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids - else: - input_ids = input_ids + ([pad_token] * padding_length) - input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - if output_mode == "classification": - label_id = label_map[example.label] - elif output_mode == "regression": - label_id = float(example.label) - else: - raise KeyError(output_mode) - - return InputFeatures(input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id) - - -def convert_examples_to_features(examples, label_list, max_seq_length, - tokenizer, output_mode, - cls_token_at_end=False, sep_token_extra=False, pad_on_left=False, - cls_token='[CLS]', sep_token='[SEP]', pad_token=0, - sequence_a_segment_id=0, sequence_b_segment_id=1, - cls_token_segment_id=1, pad_token_segment_id=0, - mask_padding_with_zero=True, - process_count=max(cpu_count() - 2,1)): - """ Loads a data file into a list of `InputBatch`s - `cls_token_at_end` define the location of the CLS token: - - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] - - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] - `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - """ - - label_map = {label: i for i, label in enumerate(label_list)} - - examples = [(example, label_map, max_seq_length, tokenizer, output_mode, cls_token_at_end, cls_token, sep_token, - cls_token_segment_id, pad_on_left, pad_token_segment_id, sep_token_extra) for example in examples] - - with Pool(process_count) as p: - features = list(tqdm(p.imap(convert_example_to_feature, examples, chunksize=500), total=len(examples))) - - return features - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. - while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - - -processors = { - "binary": BinaryProcessor -} - -output_modes = { - "binary": "classification" -} - - diff --git a/happytransformer/to_delete/happy_bert.py b/happytransformer/to_delete/happy_bert.py deleted file mode 100644 index 21193c07..00000000 --- a/happytransformer/to_delete/happy_bert.py +++ /dev/null @@ -1,130 +0,0 @@ -""" -HappyBERT: a wrapper over PyTorch's BERT implementation - -""" - -# disable pylint TODO warning -# pylint: disable=W0511 -import re -from transformers import ( - BertForMaskedLM, - BertForNextSentencePrediction, - BertTokenizerFast - -) - -import torch - -from happytransformer.to_delete.happy_transformer import HappyTransformer - -class HappyBERT(HappyTransformer): - """ - Currently available public methods: - BertForMaskedLM: - 1. 
predict_mask(text: str, options=None, k=1) - BertForSequenceClassification: - 1. init_sequence_classifier() - 2. advanced_init_sequence_classifier() - 3. train_sequence_classifier(train_csv_path) - 4. eval_sequence_classifier(eval_csv_path) - 5. test_sequence_classifier(test_csv_path) - BertForNextSentencePrediction: - 1. predict_next_sentence(sentence_a, sentence_b) - BertForQuestionAnswering: - 1. answer_question(question, text) - - """ - - def __init__(self, model='bert-base-uncased'): - # todo remove model parameter. Each model will have its own - super().__init__(model, "BERT") - self.mlm = None # Masked Language Model - self.nsp = None # Next Sentence Prediction - - #todo separate tokenizer for each model - self.tokenizer = BertTokenizerFast.from_pretrained(model) - self.masked_token = self.tokenizer.mask_token - self.sep_token = self.tokenizer.sep_token - self.cls_token = self.tokenizer.cls_token - - - def _get_masked_language_model(self): - """ - Initializes the BertForMaskedLM transformer - """ - self.mlm = BertForMaskedLM.from_pretrained(self.model) - self.mlm.eval() - - def _get_next_sentence_prediction(self): - """ - Initializes the BertForNextSentencePrediction transformer - """ - self.nsp = BertForNextSentencePrediction.from_pretrained(self.model) - self.nsp.eval() - - - - def predict_next_sentence(self, sentence_a, sentence_b, use_probability=False): - """ - Determines if sentence B is likely to be a continuation after sentence - A. - :param sentence_a: First sentence - :param sentence_b: Second sentence to test if it comes after the first - :param use_probability: Toggle outputting probability instead of boolean - :return Result of whether sentence B follows sentence A, - as either a probability or a boolean - """ - - if not self.__is_one_sentence(sentence_a) or not self.__is_one_sentence(sentence_b): - self.logger.error('Each inputted text variable for the "predict_next_sentence" method must contain a single sentence') - exit() - - if self.nsp is None: - self._get_next_sentence_prediction() - - if self.gpu_support == 'cuda': - self.nsp.to('cuda') - - connected = sentence_a + ' ' + sentence_b - tokenized_text = self._get_tokenized_text(connected) - indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text) - segments_ids = self._get_segment_ids(tokenized_text) - # Convert inputs to PyTorch tensors - tokens_tensor = torch.tensor([indexed_tokens]) - segments_tensors = torch.tensor([segments_ids]) - with torch.no_grad(): - predictions = self.nsp(tokens_tensor, token_type_ids=segments_tensors)[0] - - probabilities = torch.nn.Softmax(dim=1)(predictions) - # probability that sentence B follows sentence A - correct_probability = probabilities[0][0].item() - - if self.gpu_support == 'cuda': - torch.cuda.empty_cache() - - return ( - correct_probability if use_probability else - correct_probability >= 0.5 - ) - - def __is_one_sentence(self, text): - """ - Used to verify the proper input requirements for sentence_relation. - The text must contain no more than a single sentence. - Casual use of punctuation is accepted, such as using multiple exclamation marks. 
-        :param text: A body of text
-        :return: True if the body of text contains a single sentence, else False
-        """
-        split_text = re.split('[?.!]', text)
-        sentence_found = False
-        for possible_sentence in split_text:
-            for char in possible_sentence:
-                if char.isalpha():
-                    if sentence_found:
-                        return False
-                    sentence_found = True
-                    break
-        return True
-
-
diff --git a/happytransformer/to_delete/happy_roberta.py b/happytransformer/to_delete/happy_roberta.py
deleted file mode 100644
index bee4732b..00000000
--- a/happytransformer/to_delete/happy_roberta.py
+++ /dev/null
@@ -1,47 +0,0 @@
-"""
-HappyROBERTA: a wrapper over PyTorch's RoBERTa implementation
-"""
-# disable pylint TODO warning
-# pylint: disable=W0511
-
-from happytransformer.to_delete.happy_transformer import HappyTransformer
-
-from transformers import RobertaForMaskedLM, RobertaTokenizer
-
-class HappyROBERTA(HappyTransformer):
-    """
-    Currently available public methods:
-        RobertaForMaskedLM:
-            1. predict_mask(text: str, options=None, k=1)
-        RobertaForSequenceClassification:
-            1. init_sequence_classifier()
-            2. advanced_init_sequence_classifier()
-            3. train_sequence_classifier(train_csv_path)
-            4. eval_sequence_classifier(eval_csv_path)
-            5. test_sequence_classifier(test_csv_path)
-
-    """
-
-    def __init__(self, model='roberta-base'):
-        super().__init__(model, "ROBERTA")
-
-        self.mlm = None  # Masked Language Model
-        self.nsp = None  # Next Sentence Prediction
-        self.tokenizer = RobertaTokenizer.from_pretrained(model)
-        self.masked_token = self.tokenizer.mask_token
-        self.sep_token = self.tokenizer.sep_token
-        self.cls_token = self.tokenizer.cls_token
-
-    def _get_masked_language_model(self):
-        """
-        Initializes the RoBERTaForMaskedLM transformer
-        """
-        self.mlm = RobertaForMaskedLM.from_pretrained(self.model)
-        self.mlm.eval()
-
-    def _postprocess_option(self, text):
-        if text.startswith("Ġ"):
-            return text[1:]
-        if text == '':
-            return '.'
-        return text
\ No newline at end of file
diff --git a/happytransformer/to_delete/happy_transformer.py b/happytransformer/to_delete/happy_transformer.py
deleted file mode 100644
index 84c53839..00000000
--- a/happytransformer/to_delete/happy_transformer.py
+++ /dev/null
@@ -1,587 +0,0 @@
-# disable pylint TODO warning
-# pylint: disable=W0511
-# pylint: disable=C0301
-
-"""
-HappyTransformer is a wrapper over pytorch_transformers to make it
-easier to use.
-"""
-
-import string
-import re
-import os
-import sys
-import csv
-import logging
-import logging.config
-import numpy as np
-import torch
-import pandas as pd
-
-from happytransformer.to_delete.classifier_args import classifier_args
-from happytransformer.to_delete.sequence_classifier import SequenceClassifier
-from happytransformer.to_delete.mlm_utils import FinetuneMlm, word_prediction_args
-
-def _indices_where(items, predicate):
-    return [
-        idx
-        for idx,item in enumerate(items)
-        if predicate(item)
-    ]
-
-
-_POSSIBLE_MASK_TOKENS = ['<mask>', '<MASK>', '[MASK]']
-
-class HappyTransformer:
-    """
-    Initializes PyTorch's transformer models and provides methods for
-    their basic functionality.
-    Philosophy: Automatically make decisions for the user so that they don't
-    have to have any understanding of PyTorch or transformer
-    models to be able to utilize their capabilities.
- """ - - def __init__(self, model, model_name): - # Transformer and tokenizer set in child class - self.model = model - self.model_name = model_name - self.mlm = None # Masked Language Model - self.seq = None # Sequence Classification - self.qa = None # Question Answering - self.mlm_args = None # Mask Language Model Finetuning - - # the following variables are declared in the child class: - self.tokenizer = None - - # Child class sets to indicate which model is being used - self.tag_one_transformers = ['BERT', "ROBERTA", 'XLNET'] - - # GPU support - self.gpu_support = torch.device( - "cuda" if torch.cuda.is_available() - else "cpu" - ) - - - # show only happytransformer logs - handler = logging.StreamHandler() - handler.addFilter(logging.Filter('happytransformer')) - logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt='%m/%d/%Y %H:%M:%S', - level=logging.INFO, - handlers=[handler] - ) - self.logger = logging.getLogger(__name__) - - self.logger.info("Using model: %s", self.gpu_support) - self.seq_trained = False - self.mwp_trainer = None - self.mwp_trained = False - - # ------------------------ QA - self._qa_model = None # Question Answering - self._qa_tokenizer = None - - self._qa_init = False - self._qa_trainer = None - self._qa_runner = None - - def _get_masked_language_model(self): - pass - - def _standardize_mask_tokens(self, text): - ''' - convert mask tokens to mask token preferred by tokenizer - ''' - for possible_mask_token in _POSSIBLE_MASK_TOKENS: - text = text.replace(possible_mask_token, self.tokenizer.mask_token) - return text - - def _prepare_mlm(self): - if self.mlm is None: - self._get_masked_language_model() - if self.gpu_support=='cuda': - self.mlm.to('cuda') - - def _masked_predictions_at_index_any(self, softmax, index, k): - ''' - return top predictions for a mask token from all embeddings - ''' - scores_tensor, token_ids_tensor = torch.topk(softmax[0, index], k) - scores = scores_tensor.tolist() - token_ids = token_ids_tensor.tolist() - tokens = self.tokenizer.convert_ids_to_tokens(token_ids) - options = [ - self._postprocess_option(token) - for token in tokens - ] - return [ - {"word": option, "softmax": score} - for option, score in zip(options, scores) - ] - - def _masked_predictions_at_index_options(self, softmax, index, options): - ''' - return top predictions for a mask token from a list of options - ''' - option_ids = [ - self.tokenizer.encode(option) - for option in options - ] - scores = [ - self.soft_sum(option_id, softmax[0], index) - for option_id in option_ids - ] - return [ - {"word": option, "softmax": score} - for option,score in zip(options,scores) - ] - - def _postprocess_option(self, text: str): - ''' - modifies option text as seen by predict_masks() output. - override in subclass to filter out weird characters. - :param text: original text of prediction option - :returns text: processed text of prediction option - ''' - return text - - def predict_masks(self, text: str, options=None, num_results=1): - ''' - Predict multiple [MASK] tokens in some text. - :param text: text containing the mask tokens - :param masks_options: list of lists of options as strings - :param num_results: number of results to return per mask token - num_results is ignored if options are supplied. 
-    :returns: A list of list of namedtuples of the form (text,probability),
-    where predictions are ordered descendingly by likelihood
-        '''
-        self._prepare_mlm()
-        text = self._standardize_mask_tokens(text)
-
-        self._text_verification(text)
-
-        text_tokens = (
-            self._get_tokenized_text(text)
-        )
-        softmax = self._get_prediction_softmax(text_tokens)
-
-        masked_indices = _indices_where(
-            text_tokens,
-            lambda text: text == self.tokenizer.mask_token
-        )
-
-        if options is None:
-            return [
-                self._masked_predictions_at_index_any(
-                    softmax, masked_index, num_results
-                )
-                for masked_index in masked_indices
-            ]
-        else:
-            return [
-                self._masked_predictions_at_index_options(
-                    softmax, masked_index, mask_options
-                )
-                for masked_index, mask_options in zip(masked_indices, options)
-            ]
-
-    def predict_mask(self, text: str, options=None, num_results=1):
-        '''
-        Predict a single [MASK] token in some text.
-        :param text: text containing the mask token
-        :param options: list of options as strings
-        :param num_results: number of predictions to return if no options supplied
-        :returns: list of dictionaries with keys 'word' and 'softmax'
-        '''
-        masks_options = None if options is None else [options]
-        predictions = self.predict_masks(text, masks_options, num_results)
-        return self.__format_option_scores(predictions[0])
-
-    def _get_tokenized_text(self, text):
-        """
-        Formats a sentence so that it can be tokenized by a transformer.
-        :param text: a 1-2 sentence text that contains [MASK]
-        :return: A string with the same sentence that contains the required
-                 tokens for the transformer
-        """
-
-        # Create a spacing around each punctuation character. eg "!" -> " ! "
-        # TODO: easy: find a cleaner way to do punctuation spacing
-        text = re.sub('([.,!?()])', r' \1 ', text)
-        # text = re.sub('\s{2,}', ' ', text)
-
-        split_text = text.split()
-        new_text = list()
-        new_text.append(self.tokenizer.cls_token)
-
-        for i, char in enumerate(split_text):
-            new_text.append(char.lower())
-            if char not in string.punctuation:
-                pass
-            # must be a punctuation symbol
-            elif i + 1 >= len(split_text):
-                # is the last punctuation so simply add to the new_text
-                pass
-            else:
-                if split_text[i + 1] in string.punctuation:
-                    pass
-                else:
-                    new_text.append(self.tokenizer.sep_token)
-                # if self.model_name == "ROBERTA":
-                #     # ROBERTA requires two "</s>" tokens to separate sentences
-                #     new_text.append(self.sep_token)
-            # must be a middle punctuation
-        new_text.append(self.tokenizer.sep_token)
-
-        text = " ".join(new_text).replace('[mask]', self.tokenizer.mask_token)
-        text = self.tokenizer.tokenize(text)
-        return text
-
-    def _get_prediction_softmax(self, text):
-        """
-        Gets the softmaxes of the predictions for each index in the given
-        input string.
-        Returned tensor will be in shape:
-            [1, <sequence length>, <vocabulary size>]
-        :param text: a tokenized string to be used by the transformer.
-        :return: a tensor of the softmaxes of the predictions of the
-            transformer
-
-        """
-
-        indexed_tokens = self.tokenizer.convert_tokens_to_ids(text)
-        # Convert inputs to PyTorch tensors
-        tokens_tensor = torch.tensor([indexed_tokens])
-
-        if self.gpu_support == "cuda":
-            tokens_tensor = tokens_tensor.to('cuda')
-
-        with torch.no_grad():
-
-            if self.model_name != "ROBERTA":
-                segments_ids = self._get_segment_ids(text)
-                segments_tensors = torch.tensor([segments_ids])
-                if self.gpu_support == "cuda":
-                    segments_tensors = segments_tensors.to('cuda')
-                outputs = self.mlm(tokens_tensor, token_type_ids=segments_tensors)
-            else:
-                outputs = self.mlm(tokens_tensor)
-
-            predictions = outputs[0]
-
-        softmax = self._softmax(predictions)
-        return softmax
-
-    def __format_option_scores(self, tupled_predictions: list):
-        """
-        Formats the given list of tuples containing the option and its
-        corresponding softmax into a user friendly list of dictionaries where
-        the first element in the list is the option with the highest softmax.
-        Dictionary will be in the form:
-        {'word': <option>, 'softmax': <softmax score>}
-        :param tupled_predictions: list of tuples to be converted into a user
-        friendly dictionary
-        :return: formatted_ranked_scores: list of dictionaries of the ranked
-        scores
-        """
-        ranked_scores = sorted(tupled_predictions, key=lambda x: x["softmax"],
-                               reverse=True)
-        formatted_ranked_scores = list()
-        for dic in ranked_scores:
-
-            formatted_ranked_scores.append({'word': dic["word"], 'softmax': dic["softmax"]})
-        return formatted_ranked_scores
-
-    def _softmax(self, value):
-        # TODO: make it an external function
-        return value.exp() / (value.exp().sum(-1)).unsqueeze(-1)
-
-    def _get_segment_ids(self, tokenized_text: list):
-        """
-        Converts a list of tokens into segment_ids. The segment id is an array
-        representation of the location of each token in the
-        first and second sentence. This method only works with 1-2 sentences.
-        Example:
-        tokenized_text = ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]',
-                          'jim', '[MASK]', 'was', 'a', 'puppet', '##eer',
-                          '[SEP]']
-        segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
-        returns segments_ids
-        """
-        split_location = tokenized_text.index(self.tokenizer.sep_token)
-
-        segment_ids = [
-            0 if idx <= split_location else 1
-            for idx in range(len(tokenized_text))
-        ]
-        # add exception case for XLNet
-
-        return segment_ids
-
-    def _text_verification(self, text: str):
-
-        # TODO, Add cases for the other masked tokens used in common transformer models
-        valid = True
-        if '[MASK]' not in text:
-            self.logger.error("[MASK] was not found in your string. Change the word you want to predict to [MASK]")
-            valid = False
-        if '<mask>' in text or '<MASK>' in text:
-            self.logger.info('Instead of using <mask> or <MASK>, please use [MASK] as it is the convention')
-            valid = True
-        if '[CLS]' in text:
-            self.logger.error("[CLS] was found in your string. Remove it as it will be automatically added later")
-            valid = False
-        if '[SEP]' in text:
-            self.logger.error("[SEP] was found in your string. Remove it as it will be automatically added later")
-            valid = False
-        if not valid:
-            exit()
-
-    @staticmethod
-    def soft_sum(option: list, softed, mask_id: int):
-        # TODO: Better logic.
-        """
-        Adds the softmax of a single option
-        XLNET tokenizer sometimes splits words into pieces.
-            Ex: The councilmen -> ['the', 'council', 'men']
-        Pretty sure that this is mathematically wrong
-        :param option: Id of tokens in one option
-        :param softed: softmax of the output
-        :param mask_id: Index of masked word
-        :return: float Tensor
-        """
-        # Collects the softmax of all tokens in list
-        return np.sum([softed[mask_id][op] for op in option])
-
-    def init_sequence_classifier(self):
-        """
-        Initializes a binary sequence classifier model with default settings
-        """
-
-        # TODO Test the sequence classifier with other models
-        args = classifier_args.copy()
-        self.seq = SequenceClassifier(args, self.tokenizer, self.logger, self.gpu_support, self.model, self.model_name)
-
-        self.logger.info("A binary sequence classifier for %s has been initialized", self.model_name)
-
-    def custom_init_sequence_classifier(self, args):
-        """
-        Initializes a binary sequence classifier model with custom settings.
-        The default settings args dictionary can be found in happy_transformer/sequence_classification/classifier_args.
-        This dictionary can then be modified and used as the only input for this method.
-
-        """
-        self.seq = SequenceClassifier(args, self.tokenizer, self.logger, self.gpu_support, self.model, self.model_name)
-        self.logger.info("A binary sequence classifier for %s has been initialized", self.model_name)
-
-    def train_sequence_classifier(self, train_csv_path):
-        """
-        Trains the HappyTransformer's sequence classifier
-
-        :param train_csv_path: A path to the csv training file.
-        Each training example is contained within a row.
-        The first column is for the correct answers, either 0 or 1 as an int or a string.
-        The second column is for the text.
-        """
-        self.logger.info("***** Running Training *****")
-
-        train_df = self.__process_classifier_data(train_csv_path)
-
-        if self.seq is None:
-            self.logger.error("Initialize the sequence classifier before training")
-            exit()
-
-        sys.stdout = open(os.devnull,
-                          'w')  # Disable printing to stop external libraries from printing
-        train_df = train_df.astype("str")
-        self.seq.train_list_data = train_df.values.tolist()
-        del train_df  # done with train_df
-        self.seq.train_model()
-        self.seq_trained = True
-        sys.stdout = sys.__stdout__  # Enable printing
-
-    def eval_sequence_classifier(self, eval_csv_path):
-        """
-        Evaluates the trained sequence classifier against a testing set.
-
-        :param eval_csv_path: A path to the csv evaluation file.
-        Each test is contained within a row.
-        The first column is for the correct answers, either 0 or 1 as an int or a string.
-        The second column is for the text.
-
-        :return: A dictionary of evaluation results
-        """
-
-        self.logger.info("***** Running evaluation *****")
-
-        sys.stdout = open(os.devnull, 'w')  # Disable printing
-
-        eval_df = self.__process_classifier_data(eval_csv_path)
-
-        if not self.seq_trained:
-            self.logger.error("Train the sequence classifier before evaluation")
-            exit()
-
-        eval_df = eval_df.astype("str")
-        self.seq.eval_list_data = eval_df.values.tolist()
-
-        results = self.seq.evaluate()
-        sys.stdout = sys.__stdout__  # Enable printing
-
-        return results
-
-    def test_sequence_classifier(self, test_csv_path):
-        """
-
-        :param test_csv_path: a path to the csv testing file.
-        Each test is contained within a row.
-        The first column is for the correct answers, either 0 or 1 as an int or a string.
-        The second column is for the text.
-        :return: A list of predictions where each prediction index is the same as the corresponding test's index
-        """
-        self.logger.info("***** Running Testing *****")
-        sys.stdout = open(os.devnull, 'w')  # Disable printing
-
-        test_df = self.__process_classifier_data(test_csv_path, for_test_data=True)
-
-        # todo finish
-        if not self.seq_trained:
-            self.logger.error("Train the sequence classifier before testing")
-            exit()
-
-        test_df = test_df.astype("str")
-        self.seq.test_list_data = test_df.values.tolist()
-        del test_df  # done with test_df
-
-        results = self.seq.test()
-
-        sys.stdout = sys.__stdout__  # Enable printing
-
-        return results
-
-    def __process_classifier_data(self, csv_path, for_test_data=False):
-        """
-        Credit: This code was modified from this repository
-        https://github.com/ThilinaRajapakse/pytorch-transformers-classification
-        :param csv_path: Path to csv file that must be processed
-        :return: A pandas DataFrame with the proper information for classification tasks
-        """
-
-        if for_test_data:
-            with open(csv_path, 'r') as test_file:
-                reader = csv.reader(test_file)
-                text_list = list(reader)
-            # Blank values are required for the first column of the testing data to increase
-            # reusability of preprocessing methods between the tasks
-            blank_values = ["0"] * len(text_list)
-            data_frame = pd.DataFrame([*zip(blank_values, text_list)])
-            del blank_values  # done with blank_values
-
-        else:
-            data_frame = pd.read_csv(csv_path, header=None)
-
-        data_frame[0] = data_frame[0].astype("int")
-        data_frame = pd.DataFrame({
-            'id': range(len(data_frame)),
-            'label': data_frame[0],
-            'alpha': ['a'] * data_frame.shape[0],
-            'text': data_frame[1].replace(r'\n', ' ', regex=True)
-        })
-
-        return data_frame
-
-    def init_train_mwp(self, args=None):
-        """
-        Initializes the MLM for fine-tuning on masked word prediction.
-        If args are not supplied the following hyperparameters are used:
-        batch size = 1
-        Number of epochs = 1
-        Learning rate = 5e-5
-        Adam epsilon = 1e-8
-
-        """
-        if not args:
-            self.mlm_args = word_prediction_args
-        else:
-            self.mlm_args = args
-
-        # TODO Test the sequence classifier with other models
-
-        if self.model_name != "XLNET":
-
-            # current implementation:
-            if not self.mlm:
-                self._get_masked_language_model()  # if already has self.mlm
-                # don't call this
-            self.mwp_trainer = FinetuneMlm(self.mlm, self.mlm_args,
-                                           self.tokenizer, self.logger)
-
-            self.logger.info(
-                "You can now train a masked word prediction model using %s",
-                self.model_name)
-
-        else:
-            self.logger.error(
-                "Masked language model training is not available for XLNET")
-            sys.exit()
-
-    def train_mwp(self, train_path: str):
-        """
-        Trains the model with masked language modeling loss.
-
-        train_path: Path to the training file, expected to be a .txt or of
-        similar form.
-
-        """
-
-        if torch.cuda.is_available():
-            if self.mwp_trained and self.mwp_trainer:  # If model is trained
-                self.logger.warning("Training on the already fine-tuned model")
-                self.mwp_trainer.train(train_path)
-
-            elif self.mwp_trainer and not self.mwp_trained:  # If trainer
-                # exists but isn't trained
-                self.mlm, self.tokenizer = self.mwp_trainer.train(train_path)
-                self.mwp_trained = True
-
-            elif not self.mwp_trainer:  # If trainer doesn't exist
-                self.logger.error(
-                    "The model is not loaded, you should run init_train_mwp.")
-                sys.exit()
-
-        else:  # If the user doesn't have a gpu.
- self.logger.error( - "You are using %s, you must use a GPU to train a MLM", - self.gpu_support) - sys.exit() - - def eval_mwp(self, eval_path: str, batch_size: int = 2): - """ - Evaluates the masked language model and returns the perplexity and - the evaluation loss. - - eval_path: Path to the evaluation file, expected to be a .txt or - similar. - batch_size: Depending on the gpu the user may increase or decrease - batch size. - - """ - if not self.mwp_trainer: - self.logger.error( - "The model is not loaded, you should run init_train_mwp.") - sys.exit() - - if not self.mwp_trained: - self.logger.warning( - "You are evaluating on the pretrained model, not the fine-tuned model.") - - results = self.mwp_trainer.evaluate(eval_path, batch_size) - - return results - - def _init_model_first_warning(self, model_type, method_name): - - # todo make this a logger message - print("First initialize the", model_type, "using the", method_name, "method") \ No newline at end of file diff --git a/happytransformer/to_delete/happy_xlnet.py b/happytransformer/to_delete/happy_xlnet.py deleted file mode 100644 index 8ce5d761..00000000 --- a/happytransformer/to_delete/happy_xlnet.py +++ /dev/null @@ -1,45 +0,0 @@ -""" -HappyXLNET: a wrapper over PyTorch's XLNet implementation -""" - -from transformers import ( - XLNetLMHeadModel, - XLNetTokenizer -) - -from happytransformer.to_delete.happy_transformer import HappyTransformer - - -class HappyXLNET(HappyTransformer): - """ - Currently available public methods: - XLNetLMHeadModel: - 1. predict_mask(text: str, options=None, k=1) - XLNetForSequenceClassification: - 1. init_sequence_classifier() - 2. advanced_init_sequence_classifier() - 3. train_sequence_classifier(train_csv_path) - 4. eval_sequence_classifier(eval_csv_path) - 5. 
test_sequence_classifier(test_csv_path) - - """ - - def __init__(self, model='xlnet-base-cased'): - super().__init__(model, "XLNET") - self.mlm = None - self.tokenizer = XLNetTokenizer.from_pretrained(model) - self.masked_token = self.tokenizer.mask_token - self.sep_token = self.tokenizer.sep_token - self.cls_token = self.tokenizer.cls_token - - def _get_masked_language_model(self): - """ - Initializes the XLNetLMHeadModel transformer - """ - self.mlm = XLNetLMHeadModel.from_pretrained(self.model) - self.mlm.eval() - - def _postprocess_option(self, text): - if text.startswith('▁'): - text = text[1:] - return text \ No newline at end of file diff --git a/happytransformer/to_delete/mlm_utils.py b/happytransformer/to_delete/mlm_utils.py deleted file mode 100644 index 8ce77ab1..00000000 --- a/happytransformer/to_delete/mlm_utils.py +++ /dev/null @@ -1,297 +0,0 @@ -""" -BERT and ROBERTA masked language model fine-tuning: - -Credit: This code is a modified version of the code found in this repository -under - https://github.com/huggingface/transformers/blob/master/examples - /run_lm_finetuning.py - -""" - -import logging -import os -import random - -import numpy as np -import torch -from torch.utils.data import (DataLoader, Dataset, RandomSampler, - SequentialSampler) -from tqdm import trange -from tqdm.notebook import tqdm_notebook -from transformers import (AdamW) - -try: - from transformers import get_linear_schedule_with_warmup -except ImportError: - from transformers import WarmupLinearSchedule \ - as get_linear_schedule_with_warmup - -logger = logging.getLogger(__name__) - -class TextDataset(Dataset): - """ - Used to turn .txt file into a suitable dataset object - """ - - def __init__(self, tokenizer, file_path, block_size=512): - assert os.path.isfile(file_path) - with open(file_path, encoding="utf-8") as f: - text = f.read() - lines = text.split("\n") - self.examples = [] - for line in lines: - tokenized_text = tokenizer.encode(line, max_length=block_size, - add_special_tokens=True, pad_to_max_length=True) # Get ids from text - self.examples.append(tokenized_text) - - def __len__(self): - return len(self.examples) - - def __getitem__(self, item): - return torch.tensor(self.examples[item]) - - -def set_seed(seed=42): - """ - Sets seed for all random number generators available. - """ - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - try: - torch.cuda.manual_seed_all(seed) - except: - print('Cuda manual seed is not set') - - -def mask_tokens(inputs, tokenizer): - """ Prepare masked tokens inputs/labels for masked language modeling: - 80% MASK, 10% random, 10% original. 
- * The standard implementation from Huggingface Transformers library * - """ - labels = inputs.clone() - # We sample a few tokens in each sequence for masked-LM training - # (with probability args.mlm_probability defaults to 0.15 in Bert/RoBERTa) - # MLM Prob is 0.15 in examples - probability_matrix = torch.full(labels.shape, 0.15) - special_tokens_mask = [ - tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) - for val in - labels.tolist()] - probability_matrix.masked_fill_(torch.tensor( - special_tokens_mask, dtype=torch.bool), value=0.0) - masked_indices = torch.bernoulli(probability_matrix).bool() - labels[~masked_indices] = -100 # We only compute loss on masked tokens - - # 80% of the time, we replace masked input tokens with - # tokenizer.mask_token ([MASK]) - indices_replaced = torch.bernoulli(torch.full( - labels.shape, 0.8)).bool() & masked_indices - inputs[indices_replaced] = tokenizer.convert_tokens_to_ids( - tokenizer.mask_token) - - # 10% of the time, we replace masked input tokens with random word - indices_random = torch.bernoulli(torch.full( - labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced - random_words = torch.randint( - len(tokenizer), labels.shape, dtype=torch.long) - inputs[indices_random] = random_words[indices_random] - - # The rest of the time (10% of the time) we keep the masked input tokens - # unchanged - return inputs, labels - - -def train(model, tokenizer, train_dataset, batch_size, lr, adam_epsilon, - epochs): - """ - - :param model: Bert Model to train - :param tokenizer: Bert Tokenizer to train - :param train_dataset: - :param batch_size: Stick to 1 if not using using a high end GPU - :param lr: Suggested learning rate from paper is 5e-5 - :param adam_epsilon: Used for weight decay fixed suggested parameter is - 1e-8 - :param epochs: Usually a single pass through the entire dataset is - satisfactory - :return: Loss - """ - - train_sampler = RandomSampler(train_dataset) - train_dataloader = DataLoader( - train_dataset, sampler=train_sampler, batch_size=batch_size) - - t_total = len(train_dataloader) // batch_size # Total Steps - - # Prepare optimizer and schedule (linear warmup and decay) - no_decay = ['bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in model.named_parameters() if - not any(nd in n for nd in no_decay)], - 'weight_decay': 0.01}, - {'params': [p for n, p in model.named_parameters() if any( - nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=adam_epsilon) - scheduler = get_linear_schedule_with_warmup( - optimizer, 0, t_total) - - # ToDo Case for fp16 - - # Start of training loop - logger.info("***** Running training *****") - logger.info(" Num examples = %d", len(train_dataset)) - logger.info(" Batch size = %d", batch_size) - - model.train() - global_step = 0 - tr_loss, logging_loss = 0.0, 0.0 - model.resize_token_embeddings(len(tokenizer)) - model.zero_grad() - train_iterator = trange(int(epochs), desc="Epoch") - for _ in train_iterator: - epoch_iterator = tqdm_notebook(train_dataloader, desc="Iteration") - for batch in epoch_iterator: - inputs, labels = mask_tokens(batch, tokenizer) - inputs = inputs.to('cuda') # Don't bother if you don't have a gpu - labels = labels.to('cuda') - - outputs = model(inputs, masked_lm_labels=labels) - # model outputs are always tuple in transformers (see doc) - loss = outputs[0] - - loss.backward() - tr_loss += loss.item() - - # if (step + 1) % 1 == 0: # 1 here is a 
placeholder for gradient - # accumulation steps - torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1) - optimizer.step() - scheduler.step() - model.zero_grad() - global_step += 1 - - logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) - - return model, tokenizer - - -def create_dataset(tokenizer, file_path, block_size=512): - """ - Creates a dataset object from file path. - :param tokenizer: Bert tokenizer to create dataset - :param file_path: Path where data is stored - :param block_size: Should be in range of [0,512], viable choices are 64, - 128, 256, 512 - :return: The dataset - """ - dataset = TextDataset(tokenizer, file_path=file_path, - block_size=block_size) - return dataset - - -def evaluate(model, tokenizer, eval_dataset, batch_size): - """ - - :param model: Newly trained Bert model - :param tokenizer:Newly trained Bert tokenizer - :param eval_dataset: - :param batch_size: More flexible than training, the user can get away - with picking a higher batch_size - :return: The perplexity of the dataset - """ - eval_sampler = SequentialSampler(eval_dataset) # Same order samplinng - eval_dataloader = DataLoader( - eval_dataset, sampler=eval_sampler, batch_size=batch_size) - - # Eval! - logger.info("***** Running evaluation *****") - logger.info(" Num examples = %d", len(eval_dataset)) - logger.info(" Batch size = %d", batch_size) - eval_loss = 0.0 - nb_eval_steps = 0 - model.eval() - - # Evaluation loop - - for batch in tqdm_notebook(eval_dataloader, desc='Evaluating'): - inputs, labels = mask_tokens(batch, tokenizer) - inputs = inputs.to('cuda') - labels = labels.to('cuda') - - with torch.no_grad(): - outputs = model(inputs, masked_lm_labels=labels) - lm_loss = outputs[0] - eval_loss += lm_loss.mean().item() - nb_eval_steps += 1 - - eval_loss = eval_loss / nb_eval_steps - perplexity = torch.exp(torch.tensor(eval_loss)).item() - - result = { - 'perplexity': perplexity, - 'eval_loss': eval_loss - } - - return result - - -word_prediction_args = { - "batch_size": 1, - "epochs": 1, - "lr": 5e-5, - "adam_epsilon": 1e-8 - -} - - -class FinetuneMlm(): - """ - - :param train_path: Path to the training file, expected to be a .txt or - similar - :param test_path: Path to the testing file, expected to be a .txt or - similar - - Default parameters for effortless finetuning - batch size = 1 - Number of epochs = 1 - Learning rate = 5e-5 - Adam epsilon = 1e-8 - model_name = Must be in default transformer name. 
ie: 'bert-base-uncased' - - """ - - def __init__(self, mlm, args, tokenizer, logger): - self.mlm = mlm - self.tokenizer = tokenizer - self.args = args - self.logger = logger - - def train(self, train_path): - self.mlm.resize_token_embeddings(len(self.tokenizer)) - # Start Train - self.mlm.cuda() - train_dataset = create_dataset( - self.tokenizer, file_path=train_path) - self.mlm, self.tokenizer = train(self.mlm, self.tokenizer, - train_dataset, - batch_size=self.args["batch_size"], - epochs=self.args["epochs"], - lr=self.args["lr"], - adam_epsilon=self.args[ - "adam_epsilon"]) - - del train_dataset - self.mlm.cpu() - return self.mlm, self.tokenizer - - def evaluate(self, test_path, batch_size): - self.mlm.cuda() - test_dataset = create_dataset(self.tokenizer, file_path=test_path) - result = evaluate(self.mlm, self.tokenizer, test_dataset, - batch_size=batch_size) - del test_dataset - self.mlm.cpu() - return result diff --git a/happytransformer/to_delete/sequence_classifier.py b/happytransformer/to_delete/sequence_classifier.py deleted file mode 100644 index 911d0959..00000000 --- a/happytransformer/to_delete/sequence_classifier.py +++ /dev/null @@ -1,310 +0,0 @@ -""" -Binary Sequence Classifier for BERT, XLNET and RoBERTa that has fine tuning capabilities. - -Credit: This code is a modified version of the code found in this repository under "run_model.ipynb" - https://github.com/ThilinaRajapakse/pytorch-transformers-classification - # Licensed under the Apache License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0 - - -""" - -# pylint: disable=C0301 - -from __future__ import absolute_import, division, print_function -import math -import numpy as np -from tqdm import tqdm, trange -from sklearn.metrics import confusion_matrix -import torch -from torch.utils.data import ( - DataLoader, RandomSampler, SequentialSampler, - TensorDataset -) - -from transformers import ( - BertForSequenceClassification, - XLNetForSequenceClassification, - RobertaForSequenceClassification -) - -from transformers import AdamW, get_linear_schedule_with_warmup - -from happytransformer.to_delete.classifier_utils import ( - convert_examples_to_features, - output_modes, - processors -) - -class SequenceClassifier(): - """ - Sequence Classifier with fine tuning capabilities - """ - - def __init__(self, args, tokenizer, logger, gpu_support, model, model_name): - self.args = args - self.processor = None - self.train_dataset = None - self.eval_dataset = None - self.model_classes = { - 'BERT': BertForSequenceClassification, - 'XLNET': XLNetForSequenceClassification, - 'ROBERTA': RobertaForSequenceClassification - } - self.train_list_data = None - self.eval_list_data = None - self.test_list_data = None - self.tokenizer = tokenizer - self.logger = logger - self.gpu_support = gpu_support - self.model_name = model_name - - self.model_class = self.model_classes[model_name] - - self.model = self.model_class.from_pretrained(model) - self.model.to(self.gpu_support) - - - def check_task(self): - "Checks to make sure the task is valid. Currently only \"Binary\" is accepted" - task = self.args['task_mode'] - - if task in processors.keys() and task in output_modes.keys(): - self.processor = processors[task]() - else: - raise KeyError(f'{task} is not available') - - - def train_model(self): - """ - Does the proper checks and initializations before training self.model. 
Then, saves the model - :return: - """ - self.check_task() - - self.train_dataset = self.__load_and_cache_examples("train") - self.__train() - - # Takes care of distributed/parallel training - model_to_save = self.model.module if hasattr(self.model, 'module') else self.model - - self.model = model_to_save # new - del self.train_dataset - - - def __train(self): - """ - Trains the binary sequence classifier - """ - sampler = RandomSampler(self.train_dataset) - train_dataloader = DataLoader(self.train_dataset, - sampler=sampler, - batch_size=self.args['batch_size']) - - t_total = len(train_dataloader) \ - // self.args['gradient_accumulation_steps'] * \ - self.args['num_epochs'] - - no_decay = ['bias', 'LayerNorm.weight'] - optimizer_grouped_parameters = [ - {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)], - 'weight_decay': self.args['weight_decay']}, - {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] - - warmup_steps = math.ceil(t_total * self.args['warmup_ratio']) - self.args['warmup_steps'] = warmup_steps if self.args['warmup_steps'] == 0 else self.args['warmup_steps'] - - optimizer = AdamW(optimizer_grouped_parameters, lr=self.args['learning_rate'], eps=self.args['adam_epsilon']) - scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=self.args['warmup_steps'], num_training_steps=t_total) - - global_step = 0 - tr_loss, logging_loss = 0.0, 0.0 - self.model.zero_grad() - train_iterator = trange(int(self.args['num_epochs']), desc="Epoch") - - for _ in train_iterator: - epoch_iterator = tqdm(train_dataloader, desc="Iteration") - for step, batch in enumerate(epoch_iterator): - self.model.train() - batch = tuple(t.to(self.gpu_support) for t in batch) - - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'token_type_ids': batch[2], - 'labels': batch[3]} - outputs = self.model(**inputs) - loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc) - # print("\r%f" % loss, end='') - - if self.args['gradient_accumulation_steps'] > 1: - loss = loss / self.args['gradient_accumulation_steps'] - - else: - loss.backward() - torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.args['max_grad_norm']) - - tr_loss += loss.item() - if (step + 1) % self.args['gradient_accumulation_steps'] == 0: - optimizer.step() - scheduler.step() # Update learning rate schedule - self.model.zero_grad() - global_step += 1 - - def __get_eval_report(self, labels, preds): - """ - :param labels: Correct answers - :param preds: predictions - :return: a confusion matrix - """ - assert len(preds) == len(labels) - - true_negative, false_positive, false_negative, true_positive = confusion_matrix(labels, preds).ravel() - return { - "true_positive": true_positive, - "true_negative": true_negative, - "false_positive": false_positive, - "false_negative": false_negative - } - - def evaluate(self): - """ - Evaluates the model against a set of questions to determine accuracy - :return: a dictionary confusion martrix - """ - # Loop to handle MNLI double evaluation (matched, mis-matched) - self.check_task() - - self.eval_dataset = self.__load_and_cache_examples("eval") - - results = {} - - eval_sampler = SequentialSampler(self.eval_dataset) - eval_dataloader = DataLoader(self.eval_dataset, sampler=eval_sampler, batch_size=self.args['batch_size']) - - # Eval! 
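The tn/fp/fn/tp unpacking in __get_eval_report() above works because scikit-learn flattens the 2x2 confusion matrix row by row. A standalone sanity check (toy labels for illustration; assumes only scikit-learn):

    from sklearn.metrics import confusion_matrix

    labels = [0, 0, 1, 1]
    preds = [0, 1, 1, 1]
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    print(tn, fp, fn, tp)  # 1 1 0 2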
- eval_loss = 0.0 - nb_eval_steps = 0 - preds = None - out_label_ids = None - for batch in tqdm(eval_dataloader, desc="Evaluating"): - self.model.eval() - batch = tuple(t.to(self.gpu_support) for t in batch) - - with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'token_type_ids': batch[2], - 'labels': batch[3]} - outputs = self.model(**inputs) - tmp_eval_loss, logits = outputs[:2] - - eval_loss += tmp_eval_loss.mean().item() - nb_eval_steps += 1 - if preds is None: - preds = logits.detach().cpu().numpy() - out_label_ids = inputs['labels'].detach().cpu().numpy() - else: - preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0) - - - preds = np.argmax(preds, axis=1) - - - result = self.__get_eval_report(out_label_ids, preds) - - results.update(result) - del self.eval_dataset - return results - - def test(self): - """ - Generates answers for an input - - :return: a list of answers where each index contains the answer 1 or 0 - for the corresponding test question with the same index - """ - # Loop to handle MNLI double evaluation (matched, mis-matched) - self.check_task() - - self.eval_dataset = self.__load_and_cache_examples("test") - - eval_sampler = SequentialSampler(self.eval_dataset) - eval_dataloader = DataLoader(self.eval_dataset, sampler=eval_sampler, batch_size=self.args['batch_size']) - - # Eval! - eval_loss = 0.0 - nb_eval_steps = 0 - preds = None - for batch in tqdm(eval_dataloader, desc="Evaluating"): - self.model.eval() - batch = tuple(t.to(self.gpu_support) for t in batch) - - with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'token_type_ids': batch[2], - 'labels': batch[3]} - outputs = self.model(**inputs) - tmp_eval_loss, logits = outputs[:2] - - eval_loss += tmp_eval_loss.mean().item() - nb_eval_steps += 1 - if preds is None: - preds = logits.detach().cpu().numpy() - else: - preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) - - preds = np.argmax(preds, axis=1) - - - return preds.tolist() - - def __load_and_cache_examples(self, task): - """ - Converts the proper list_data variable to a TensorDataset for the current task - :return: a TensorDataset for the requested task - """ - self.processor = processors[self.args["task_mode"]]() - output_mode = "classification" - - label_list = self.processor.get_labels() - - if task == 'eval': - examples = self.processor.get_dev_examples(self.eval_list_data) - del self.eval_list_data - elif task == 'train': - examples = self.processor.get_train_examples(self.train_list_data) - del self.train_list_data - else: - examples = self.processor.get_dev_examples(self.test_list_data) - del self.test_list_data - - - features = convert_examples_to_features(examples, label_list, self.args['max_seq_length'], self.tokenizer, - output_mode, - cls_token_at_end=bool(self.model_name in ['XLNET']), - # xlnet has a cls token at the end - cls_token=self.tokenizer.cls_token, - cls_token_segment_id=2 if self.model_name in [ - 'XLNET'] else 0, - sep_token=self.tokenizer.sep_token, - sep_token_extra=bool(self.model_name in ['ROBERTA']), - # roberta uses an extra separator b/w pairs of sentences, cf. 
github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805 - pad_on_left=bool(self.model_name in ['XLNET']), - # pad on the left for xlnet - pad_token=self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0], - pad_token_segment_id=4 if self.model_name in [ - 'XLNET'] else 0) - - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) - all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long) - - - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) - del all_input_ids, all_input_mask, all_segment_ids, all_label_ids - - return dataset From 078ce4bfe42fd4d0855e63208f4b3072a54e6ea6 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Wed, 6 Jan 2021 18:18:12 -0500 Subject: [PATCH 009/155] Deleted unneeded __init__.py, updated required one --- happytransformer/__init__.py | 7 ++----- happytransformer/happy_word_prediction.py | 2 +- happytransformer/happytasks/__init__.py | 0 happytransformer/happytasks/happy_qa/__init__.py | 0 tests/test_qa_trainer.py | 2 +- 5 files changed, 4 insertions(+), 7 deletions(-) delete mode 100644 happytransformer/happytasks/__init__.py delete mode 100644 happytransformer/happytasks/happy_qa/__init__.py diff --git a/happytransformer/__init__.py b/happytransformer/__init__.py index e8a705c9..2ac707aa 100644 --- a/happytransformer/__init__.py +++ b/happytransformer/__init__.py @@ -1,6 +1,3 @@ -from happytransformer.to_delete.happy_roberta import HappyROBERTA -from happytransformer.to_delete.happy_xlnet import HappyXLNET -from happytransformer.to_delete.happy_bert import HappyBERT -from happytransformer.to_delete.classifier_args import classifier_args - +from happytransformer.happy_question_answering import HappyQuestionAnswering +from happytransformer.happy_word_prediction import HappyWordPrediction name = "happytransformer" diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index 590959f0..9f2af18a 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -10,7 +10,7 @@ from happytransformer.happy_transformer import HappyTransformer -class HappyWorkPrediction(HappyTransformer): +class HappyWordPrediction(HappyTransformer): def __init__(self, model_type="BERT", model_name="bert-large-uncased-whole-word-masking-finetuned-squad", device=None): model = None diff --git a/happytransformer/happytasks/__init__.py b/happytransformer/happytasks/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/happytransformer/happytasks/happy_qa/__init__.py b/happytransformer/happytasks/happy_qa/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/test_qa_trainer.py b/tests/test_qa_trainer.py index b8fef0d2..4962b4d0 100644 --- a/tests/test_qa_trainer.py +++ b/tests/test_qa_trainer.py @@ -2,7 +2,7 @@ Tests for the question answering training, evaluating and testing functionality """ -from happytransformer.happy_question_answering import HappyQuestionAnswering +from happytransformer.happy_question_answering import HappyQuestionAnswering def test_qa_train_eval(): From 9ef13857f8548d61b75ab27b1c3341626585b16a Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Wed, 6 Jan 2021 20:05:06 -0500 Subject: [PATCH 010/155] Applied Ted's suggestion on organizing directories --- 
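After this reorganization each task lives in a flat per-task package rather than under nested happytasks directories, so imports take roughly this shape (hypothetical usage; module paths taken from the renames in the diff below):

    from happytransformer.qa.trainer import QATrainer
    from happytransformer.qa.default_args import ARGS_QA_TRAIN
    from happytransformer.tasks_util import biggest_sums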
happytransformer/happy_question_answering.py | 12 ++++++------ happytransformer/happy_word_prediction.py | 2 +- .../default_args_mwp.py => mwp/default_args.py} | 0 .../happy_mwp/trainer_mwp.py => mwp/trainer.py} | 2 +- happytransformer/mwp/util.py | 0 .../default_args_qa.py => qa/default_args.py} | 0 .../happy_qa/trainer_qa.py => qa/trainer.py} | 0 .../{happytasks/happy_qa/qa_util.py => qa/util.py} | 2 +- happytransformer/{happytasks => }/tasks_util.py | 0 tests/test_task_util.py | 4 ++-- 10 files changed, 11 insertions(+), 11 deletions(-) rename happytransformer/{happytasks/happy_mwp/default_args_mwp.py => mwp/default_args.py} (100%) rename happytransformer/{happytasks/happy_mwp/trainer_mwp.py => mwp/trainer.py} (79%) create mode 100644 happytransformer/mwp/util.py rename happytransformer/{happytasks/happy_qa/default_args_qa.py => qa/default_args.py} (100%) rename happytransformer/{happytasks/happy_qa/trainer_qa.py => qa/trainer.py} (100%) rename happytransformer/{happytasks/happy_qa/qa_util.py => qa/util.py} (97%) rename happytransformer/{happytasks => }/tasks_util.py (100%) diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index a3f664f9..57e581ee 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -5,9 +5,9 @@ import torch from happytransformer.happy_transformer import HappyTransformer -from happytransformer.happytasks.happy_qa.qa_util import qa_probabilities -from happytransformer.happytasks.happy_qa.trainer_qa import QATrainer -from happytransformer.happytasks.happy_qa.default_args_qa \ +from happytransformer.qa.util import qa_probabilities +from happytransformer.qa.trainer import QATrainer +from happytransformer.qa.default_args \ import ARGS_QA_EVAL, ARGS_QA_TEST, ARGS_QA_TRAIN from transformers import ( BertForQuestionAnswering, @@ -101,7 +101,7 @@ def train(self, input_filepath, args=ARGS_QA_TRAIN): for training. Contains the following header values: context, question, answer_text, answer_start args: a dictionary that contains settings found under - happytransformer.happytasks.happy_qa.default_args_qa.py + happytransformer.happytasks.happy_qa.default_args.py return: None """ self._trainer.train(input_filepath=input_filepath, args=args) @@ -114,7 +114,7 @@ def eval(self, input_filepath, output_filepath, args=ARGS_QA_EVAL): for training. Contains the following header values: context, question, answer_text, answer_start args: a dictionary that contains settings found under - happytransformer.happytasks.happy_qa.default_args_qa.py + happytransformer.happytasks.happy_qa.default_args.py output_filepath: a path to a csv file to output the results. This file contains the following header values: contexts, questions, answer, outputs, correct, softmax @@ -132,7 +132,7 @@ def test(self, input_filepath, output_filepath, args=ARGS_QA_TEST): for training. Contains the following header values: context, question args: a dictionary that contains settings found under - happytransformer.happytasks.happy_qa.default_args_qa.py + happytransformer.happytasks.happy_qa.default_args.py output_filepath: a path to a csv file to output the results. 
This file contains the following header values: contexts, questions, outputs, softmax return: None diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index 9f2af18a..7f368abb 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -1,4 +1,4 @@ -from happytransformer.happytasks.happy_qa.trainer_qa import QATrainer +from happytransformer.qa.trainer import QATrainer from transformers import ( BertForMaskedLM, diff --git a/happytransformer/happytasks/happy_mwp/default_args_mwp.py b/happytransformer/mwp/default_args.py similarity index 100% rename from happytransformer/happytasks/happy_mwp/default_args_mwp.py rename to happytransformer/mwp/default_args.py diff --git a/happytransformer/happytasks/happy_mwp/trainer_mwp.py b/happytransformer/mwp/trainer.py similarity index 79% rename from happytransformer/happytasks/happy_mwp/trainer_mwp.py rename to happytransformer/mwp/trainer.py index ef9c67c9..711eb9e2 100644 --- a/happytransformer/happytasks/happy_mwp/trainer_mwp.py +++ b/happytransformer/mwp/trainer.py @@ -1,5 +1,5 @@ from happytransformer.trainer import Trainer -from happytransformer.happytasks.happy_mwp.default_args_mwp import ARGS_MWP_EVAL, ARGS_MWP_TEST, ARGS_MWP_TRAIN +from happytransformer.mwp.default_args import ARGS_MWP_EVAL, ARGS_MWP_TEST, ARGS_MWP_TRAIN class QATrainer(Trainer): diff --git a/happytransformer/mwp/util.py b/happytransformer/mwp/util.py new file mode 100644 index 00000000..e69de29b diff --git a/happytransformer/happytasks/happy_qa/default_args_qa.py b/happytransformer/qa/default_args.py similarity index 100% rename from happytransformer/happytasks/happy_qa/default_args_qa.py rename to happytransformer/qa/default_args.py diff --git a/happytransformer/happytasks/happy_qa/trainer_qa.py b/happytransformer/qa/trainer.py similarity index 100% rename from happytransformer/happytasks/happy_qa/trainer_qa.py rename to happytransformer/qa/trainer.py diff --git a/happytransformer/happytasks/happy_qa/qa_util.py b/happytransformer/qa/util.py similarity index 97% rename from happytransformer/happytasks/happy_qa/qa_util.py rename to happytransformer/qa/util.py index be576491..fa81ef79 100644 --- a/happytransformer/happytasks/happy_qa/qa_util.py +++ b/happytransformer/qa/util.py @@ -1,6 +1,6 @@ from collections import namedtuple import torch -from happytransformer.happytasks.tasks_util import biggest_sums +from happytransformer.tasks_util import biggest_sums QAAnswerLogit = namedtuple('QaAnswerLogit', [ 'start_idx', 'end_idx', 'logit' diff --git a/happytransformer/happytasks/tasks_util.py b/happytransformer/tasks_util.py similarity index 100% rename from happytransformer/happytasks/tasks_util.py rename to happytransformer/tasks_util.py diff --git a/tests/test_task_util.py b/tests/test_task_util.py index f7e6cc42..72a73b59 100644 --- a/tests/test_task_util.py +++ b/tests/test_task_util.py @@ -1,8 +1,8 @@ """ -Contains tests for functions found within qa_util.py +Contains tests for functions found within util.py """ -from happytransformer.happytasks.tasks_util import SumPair, biggest_sums +from happytransformer.tasks_util import SumPair, biggest_sums def test_biggest_sums(): """ Tests the biggest_sums function From b81126e7f6993a72dd5a222754f334fdf4c0eff5 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Thu, 7 Jan 2021 00:18:11 -0500 Subject: [PATCH 011/155] Added text classification files --- happytransformer/happy_text_classification.py | 0 happytransformer/tc/default_args.py | 0 
happytransformer/tc/trainer.py | 0 happytransformer/tc/util.py | 0 tests/test_text_classification.py | 0 5 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 happytransformer/happy_text_classification.py create mode 100644 happytransformer/tc/default_args.py create mode 100644 happytransformer/tc/trainer.py create mode 100644 happytransformer/tc/util.py create mode 100644 tests/test_text_classification.py diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py new file mode 100644 index 00000000..e69de29b diff --git a/happytransformer/tc/default_args.py b/happytransformer/tc/default_args.py new file mode 100644 index 00000000..e69de29b diff --git a/happytransformer/tc/trainer.py b/happytransformer/tc/trainer.py new file mode 100644 index 00000000..e69de29b diff --git a/happytransformer/tc/util.py b/happytransformer/tc/util.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_text_classification.py b/tests/test_text_classification.py new file mode 100644 index 00000000..e69de29b From 37652b46e6257a50cd514115061533822f26625a Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Thu, 7 Jan 2021 11:52:35 -0500 Subject: [PATCH 012/155] text classification starter code --- happytransformer/happy_text_classification.py | 78 ++++++++++++++ happytransformer/tc/default_args.py | 17 +++ happytransformer/tc/trainer.py | 102 ++++++++++++++++++ 3 files changed, 197 insertions(+) diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index e69de29b..08eb8648 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -0,0 +1,78 @@ +from happytransformer.tc.trainer import TCTrainer + +from transformers import ( + BertForSequenceClassification, + BertTokenizerFast, + + +) + +from happytransformer.happy_transformer import HappyTransformer +from happytransformer.tc.default_args import ARGS_TC_EVAL, ARGS_TC_TEST, ARGS_TC_TRAIN + +class HappyTextClassification(HappyTransformer): + def __init__(self, model_type="BERT", + model_name="'bert-large-uncased-whole-word-masking-finetuned-squad'", device=None): + model = None + tokenizer = None + + if model_type == "BERT": + model = BertForSequenceClassification.from_pretrained(model_name) + tokenizer = BertTokenizerFast.from_pretrained(model_name) + + + super().__init__(model_type, model_name, model, tokenizer, device) + + self._trainer = TCTrainer(model, + model_type, tokenizer, self._device, self.logger) + + def predict_text(self, text): + raise NotImplementedError() + + def train(self, input_filepath, args=ARGS_TC_TRAIN): + """ + Trains the question answering model + + input_filepath: a string that contains the location of a csv file + for training. Contains the following header values: text, + label + args: a dictionary that contains settings found under + happytransformer.happytasks.happy_qa.default_args.py + return: None + """ + self._trainer.train(input_filepath=input_filepath, args=args) + + def eval(self, input_filepath, output_filepath, args=ARGS_TC_EVAL): + """ + Trains the question answering model + + input_filepath: a string that contains the location of a csv file + for training. Contains the following header values: + text, label + args: a dictionary that contains settings found under + happytransformer.happytasks.happy_qa.default_args.py + output_filepath: a path to a csv file to output the results. 
+        This file contains the following header values: text,
+        label, output, correct, softmax
+        return: correct ratio (correct/total)
+        """
+        return self._trainer.eval(input_filepath=input_filepath,
+                                  solve=self.predict_text, args=args,
+                                  output_filepath=output_filepath)
+
+    def test(self, input_filepath, output_filepath, args=ARGS_TC_TEST):
+        """
+        Tests the text classification model. Used to obtain results.
+
+        input_filepath: a string that contains the location of a csv file
+        for testing. Contains the following header value:
+        text
+        args: a dictionary that contains settings found under
+        happytransformer.tc.default_args.py
+        output_filepath: a path to a csv file to output the results.
+        This file contains the following header values: text, output, softmax
+        return: None
+        """
+        self._trainer.test(input_filepath=input_filepath,
+                           solve=self.predict_text, args=args,
+                           output_filepath=output_filepath) diff --git a/happytransformer/tc/default_args.py b/happytransformer/tc/default_args.py index e69de29b..40c78509 100644 --- a/happytransformer/tc/default_args.py +++ b/happytransformer/tc/default_args.py @@ -0,0 +1,17 @@ +ARGS_TC_TRAIN = { + 'max_length': 300, + 'batch_size': 16, + 'learning_rate': 5e-5, + 'epochs': 2, + + +} + +ARGS_TC_TEST = { + # eventually we'll add settings + +} + +ARGS_TC_EVAL = { + # eventually we'll add settings +} diff --git a/happytransformer/tc/trainer.py b/happytransformer/tc/trainer.py index e69de29b..a5a0f4aa 100644 --- a/happytransformer/tc/trainer.py +++ b/happytransformer/tc/trainer.py @@ -0,0 +1,102 @@ +""" +This code is a modified version of code from the official documentation of the +Transformers library by Hugging Face, which can be found below. + +We prioritized following the official documentation as closely as possible to ensure we're using +robust methods, and to improve maintainability as they update the documentation. 
+ +https://huggingface.co/transformers/custom_datasets.html#sequence-classification-with-imdb-reviews""" + +import csv +import torch +from torch.utils.data import DataLoader + +from transformers import Trainer, AdamW + + +class TCTrainer(Trainer): + + def __init__(self, model, model_type, tokenizer, device, logger): + super().__init__(model, model_type, tokenizer, device, logger) + + def train(self, input_filepath, args): + contexts, labels = self.__get_data(input_filepath) + train_encodings = self.tokenizer(contexts, truncation=True, padding=True) + train_dataset = TextClassificationDataset(train_encodings, labels) + + train_loader = DataLoader(train_dataset, batch_size=args['batch_size'], shuffle=True) + + optim = AdamW(self.model.parameters(), lr=args['learning_rate']) + self.model.train() + + for epoch in range(args['epochs']): + epoch_output = "Epoch: " + str(epoch) + "\n\n" + self.logger.info(epoch_output) + batch_num = 1 + for batch in train_loader: + + batch_output = "Batch: " + str(batch_num) + self.logger.info(batch_output) + optim.zero_grad() + input_ids = batch['input_ids'].to(self.device) + attention_mask = batch['attention_mask'].to(self.device) + labels = batch['labels'].to(self.device) + outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels) + loss = outputs[0] + loss.backward() + optim.step() + batch_num += 1 + + self.model.eval() + + + + + def eval(self, input_filepath, solve, output_filepath, args): + contexts, labels = self.__get_data(input_filepath) + eval_encodings = self.tokenizer(contexts, truncation=True, padding=True) + eval_dataset = TextClassificationDataset(eval_encodings, labels) + + + + + + def test(self, input_filepath, solve, output_filepath, args): + contexts = self.__get_data(input_filepath, True) + test_encodings = self.tokenizer(contexts, truncation=True, padding=True) + + + @staticmethod + def __get_data(filepath, test_data=False): + """ + Used for parsing data for training and evaluating (both contain labels) + :param filepath: a string that contains the location of the data + :return: + """ + contexts = [] + labels = [] + with open(filepath, newline='') as csv_file: + reader = csv.DictReader(csv_file) + for row in reader: + contexts.append(row['text']) + if not test_data: + labels.append(row['label']) + csv_file.close() + + if not test_data: + return contexts, labels + return contexts + + +class TextClassificationDataset(torch.utils.data.Dataset): + def __init__(self, encodings, labels): + self.encodings = encodings + self.labels = labels + + def __getitem__(self, idx): + item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} + item['labels'] = torch.tensor(self.labels[idx]) + return item + + def __len__(self): + return len(self.labels) \ No newline at end of file From c49d9757cdda97aef4ae91cf8e10ee68aba20b7b Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Thu, 7 Jan 2021 16:56:24 -0500 Subject: [PATCH 013/155] Used HF Trainer for TC --- data/test_tc_trainer/sample-tc-test-data.csv | 5 ++ .../sample-tc-training-eval-data.csv | 5 ++ happytransformer/happy_text_classification.py | 20 ++--- .../{trainer.py => happy_trainer.py} | 34 +++++++- happytransformer/happy_transformer.py | 14 ++-- happytransformer/tc/default_args.py | 34 +++++--- happytransformer/tc/trainer.py | 78 +++++++++++-------- tests/test_tc_trainer.py | 40 ++++++++++ 8 files changed, 160 insertions(+), 70 deletions(-) create mode 100644 data/test_tc_trainer/sample-tc-test-data.csv create mode 100644 
data/test_tc_trainer/sample-tc-training-eval-data.csv rename happytransformer/{trainer.py => happy_trainer.py} (76%) create mode 100644 tests/test_tc_trainer.py diff --git a/data/test_tc_trainer/sample-tc-test-data.csv b/data/test_tc_trainer/sample-tc-test-data.csv new file mode 100644 index 00000000..8e35b279 --- /dev/null +++ b/data/test_tc_trainer/sample-tc-test-data.csv @@ -0,0 +1,5 @@ +text +"Wow I loved the food so much" +"Awful restaurant" +"Sooooo good" +"eeewwwww not coming here again" \ No newline at end of file diff --git a/data/test_tc_trainer/sample-tc-training-eval-data.csv b/data/test_tc_trainer/sample-tc-training-eval-data.csv new file mode 100644 index 00000000..a9b58762 --- /dev/null +++ b/data/test_tc_trainer/sample-tc-training-eval-data.csv @@ -0,0 +1,5 @@ +text,label +"Wow I loved the food so much", 1 +"Awful restaurant", 0 +"Sooooo good", 1 +"eeewwwww not coming here again", 0 diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index 08eb8648..8559cb45 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -8,11 +8,11 @@ ) from happytransformer.happy_transformer import HappyTransformer -from happytransformer.tc.default_args import ARGS_TC_EVAL, ARGS_TC_TEST, ARGS_TC_TRAIN +from happytransformer.tc.default_args import ARGS_TC_TRAIN class HappyTextClassification(HappyTransformer): def __init__(self, model_type="BERT", - model_name="'bert-large-uncased-whole-word-masking-finetuned-squad'", device=None): + model_name="bert-large-uncased-whole-word-masking-finetuned-squad", device=None): model = None tokenizer = None @@ -29,7 +29,7 @@ def __init__(self, model_type="BERT", def predict_text(self, text): raise NotImplementedError() - def train(self, input_filepath, args=ARGS_TC_TRAIN): + def train(self, input_filepath, output_path, args=ARGS_TC_TRAIN): """ Trains the question answering model @@ -40,9 +40,9 @@ def train(self, input_filepath, args=ARGS_TC_TRAIN): happytransformer.happytasks.happy_qa.default_args.py return: None """ - self._trainer.train(input_filepath=input_filepath, args=args) + self._trainer.train(input_filepath=input_filepath, args=args, output_path=output_path) - def eval(self, input_filepath, output_filepath, args=ARGS_TC_EVAL): + def eval(self, input_filepath, output_path): """ Trains the question answering model @@ -56,11 +56,9 @@ def eval(self, input_filepath, output_filepath, args=ARGS_TC_EVAL): label, output, correct, softmax return: correct ration (correct/total) """ - return self._trainer.eval(input_filepath=input_filepath, - solve=self.predict_text, args=args, - output_filepath=output_filepath) + return self._trainer.eval(input_filepath=input_filepath, output_path=output_path) - def test(self, input_filepath, output_filepath, args=ARGS_TC_TEST): + def test(self, input_filepath, output_path): """ Tests the text classification model. 
Used to obtain results @@ -73,6 +71,4 @@ def test(self, input_filepath, output_filepath, args=ARGS_TC_TEST): This file contains the following header values: text, output, softmax return: None """ - self._trainer.test(input_filepath=input_filepath, - solve=self.predict_text, args=args, - output_filepath=output_filepath) + self._trainer.test(input_filepath=input_filepath, output_path=output_path) diff --git a/happytransformer/trainer.py b/happytransformer/happy_trainer.py similarity index 76% rename from happytransformer/trainer.py rename to happytransformer/happy_trainer.py index 259b4e83..62199aad 100644 --- a/happytransformer/trainer.py +++ b/happytransformer/happy_trainer.py @@ -3,8 +3,9 @@ import math from csv import DictWriter +from transformers import TrainingArguments -class Trainer: +class HappyTrainer: def __init__(self, model, model_type, tokenizer, device, logger): self.model = model self.model_type = model_type @@ -13,15 +14,40 @@ def __init__(self, model, model_type, tokenizer, device, logger): self.logger = logger - def train(self, input_filepath, args): + def train(self, input_filepath, output_path, args): raise NotImplementedError() - def test(self, input_filepath, solve, output_filepath, args): + def test(self, input_filepath, output_path): raise NotImplementedError() - def eval(self, input_filepath, solve, output_filepath, args): + def eval(self, input_filepath, output_path): raise NotImplementedError() + @staticmethod + def _get_training_args(args, output_path): + return TrainingArguments( + output_dir=output_path, + learning_rate=args["learning_rate"], + weight_decay=args["weight_decay"], + adam_beta1=args["adam_beta1"], + adam_beta2=args["adam_beta2"], + adam_epsilon=args["adam_epsilon"], + max_grad_norm=args["max_grad_norm"], + num_train_epochs=args["num_train_epochs"], + + ) + + + @staticmethod + def _get_test_eval_args(output_path): + return TrainingArguments( + output_dir=output_path, + seed=42 + + ) + + + def _get_train_eval_data(self, filepath): """ Used for parsing data for training and evaluating (both contain labels) diff --git a/happytransformer/happy_transformer.py b/happytransformer/happy_transformer.py index 3ddb8f94..94ec068e 100644 --- a/happytransformer/happy_transformer.py +++ b/happytransformer/happy_transformer.py @@ -37,7 +37,7 @@ def __init__(self, model_type, model_name, model, tokenizer, device): self.logger.info("Using model: %s", self._device) - def train(self, input_filepath, args): + def train(self, input_filepath, output_path, args): """ Trains a model :param input_filepath: a string that contains a path to a csv file @@ -47,29 +47,25 @@ def train(self, input_filepath, args): """ raise NotImplementedError() - def eval(self, input_filepath, output_filepath, args): + def eval(self, input_filepath, output_path): """ Evaluates the model. Determines how well the model performs on a given dataset :param input_filepath: a string that contains a path to a csv file that contains evaluating data - :param output_filepath: a string that contains a path to a - csv file that will be created to store the results - :param args: settings in the form of a dictionary + :return: correct percentage """ raise NotImplementedError() - def test(self, input_filepath, output_filepath, args): + def test(self, input_filepath, output_path): """ Used to generate predictions for a given dataset. The dataset may not be labelled. 
:param input_filepath: a string that contains a path to a csv file that contains testing data - :param output_filepath: a string that contains a path to - a csv file that will be created to store the results - :param args: settings in the form of a dictionary + """ raise NotImplementedError() diff --git a/happytransformer/tc/default_args.py b/happytransformer/tc/default_args.py index 40c78509..aa210623 100644 --- a/happytransformer/tc/default_args.py +++ b/happytransformer/tc/default_args.py @@ -1,17 +1,27 @@ ARGS_TC_TRAIN = { - 'max_length': 300, - 'batch_size': 16, 'learning_rate': 5e-5, - 'epochs': 2, - + 'weight_decay': 0, + 'adam_beta1': 0.9, + 'adam_beta2': 0.999, + 'adam_epsilon': 1e-8, + 'max_grad_norm': 1.0, + 'num_train_epochs': 3.0, } -ARGS_TC_TEST = { - # eventually we'll add settings - -} - -ARGS_TC_EVAL = { - # eventually we'll add settings -} +# maybe implement later +# 'max_steps': -1, +# 'warmup_steps': 0, +# # 'logging_dir': #todo +# 'logging_first_step': False, +# 'logging_steps': 500, +# 'save_steps': 500, +# #'save_total_limit': +# 'no_cuda': False, +# 'seed': 42, +# 'fp16': False, +# 'fp16_opt_level': "O1", +# 'local_rank': -1, +# #'tpu_num_cores':, +# "debug": False, +# diff --git a/happytransformer/tc/trainer.py b/happytransformer/tc/trainer.py index a5a0f4aa..f35c25d6 100644 --- a/happytransformer/tc/trainer.py +++ b/happytransformer/tc/trainer.py @@ -11,60 +11,59 @@ import torch from torch.utils.data import DataLoader -from transformers import Trainer, AdamW +from transformers import Trainer +from happytransformer.happy_trainer import HappyTrainer -class TCTrainer(Trainer): +class TCTrainer(HappyTrainer): def __init__(self, model, model_type, tokenizer, device, logger): super().__init__(model, model_type, tokenizer, device, logger) - def train(self, input_filepath, args): + def train(self, input_filepath, output_path, args): contexts, labels = self.__get_data(input_filepath) train_encodings = self.tokenizer(contexts, truncation=True, padding=True) train_dataset = TextClassificationDataset(train_encodings, labels) - train_loader = DataLoader(train_dataset, batch_size=args['batch_size'], shuffle=True) + training_args = self._get_training_args(args, output_path) - optim = AdamW(self.model.parameters(), lr=args['learning_rate']) - self.model.train() + trainer = Trainer( + model=self.model, # the instantiated 🤗 Transformers model to be trained + args=training_args, # training arguments, defined above + train_dataset=train_dataset, # training dataset + ) + trainer.train() - for epoch in range(args['epochs']): - epoch_output = "Epoch: " + str(epoch) + "\n\n" - self.logger.info(epoch_output) - batch_num = 1 - for batch in train_loader: - - batch_output = "Batch: " + str(batch_num) - self.logger.info(batch_output) - optim.zero_grad() - input_ids = batch['input_ids'].to(self.device) - attention_mask = batch['attention_mask'].to(self.device) - labels = batch['labels'].to(self.device) - outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels) - loss = outputs[0] - loss.backward() - optim.step() - batch_num += 1 - - self.model.eval() - - - - - def eval(self, input_filepath, solve, output_filepath, args): + def eval(self, input_filepath, output_path): contexts, labels = self.__get_data(input_filepath) eval_encodings = self.tokenizer(contexts, truncation=True, padding=True) + eval_dataset = TextClassificationDataset(eval_encodings, labels) + eval_args = self._get_test_eval_args(output_path) + trainer = Trainer( + model=self.model, # the instantiated 🤗 
Transformers model to be trained + args=eval_args, + eval_dataset=eval_dataset, # training dataset + ) + return trainer.evaluate() - def test(self, input_filepath, solve, output_filepath, args): + def test(self, input_filepath, output_path): contexts = self.__get_data(input_filepath, True) test_encodings = self.tokenizer(contexts, truncation=True, padding=True) + test_dataset = TextClassificationDatasetTest(test_encodings, len(contexts)) + test_args = self._get_test_eval_args(output_path) + + trainer = Trainer( + model=self.model, # the instantiated 🤗 Transformers model to be trained + args=test_args + ) + print(trainer.predict(test_dataset)) + @staticmethod def __get_data(filepath, test_data=False): @@ -80,7 +79,7 @@ def __get_data(filepath, test_data=False): for row in reader: contexts.append(row['text']) if not test_data: - labels.append(row['label']) + labels.append(int(row['label'])) csv_file.close() if not test_data: @@ -88,6 +87,7 @@ def __get_data(filepath, test_data=False): return contexts + class TextClassificationDataset(torch.utils.data.Dataset): def __init__(self, encodings, labels): self.encodings = encodings @@ -99,4 +99,16 @@ def __getitem__(self, idx): return item def __len__(self): - return len(self.labels) \ No newline at end of file + return len(self.labels) + +class TextClassificationDatasetTest(torch.utils.data.Dataset): + def __init__(self, encodings, length): + self.encodings = encodings + self.length = length + + def __getitem__(self, idx): + item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} + return item + + def __len__(self): + return self.length \ No newline at end of file diff --git a/tests/test_tc_trainer.py b/tests/test_tc_trainer.py new file mode 100644 index 00000000..322570f9 --- /dev/null +++ b/tests/test_tc_trainer.py @@ -0,0 +1,40 @@ +""" +Tests for the question answering training, evaluating and testing functionality +""" + +from happytransformer.happy_text_classification import HappyTextClassification + + +def test_qa_train(): + """ + Tests + HappyQuestionAnswering.eval() + HappyQuestionAnswering.train() + + """ + happy_tc = HappyTextClassification() + # Test 1 + + # Test 2 + happy_tc.train("../data/test_tc_trainer/sample-tc-training-eval-data.csv", "../data/test_tc_trainer/results/test_qa_train") + + +def test_qa_eval(): + """ + Tests + HappyQuestionAnswering.eval() + HappyQuestionAnswering.train() + + """ + happy_tc = HappyTextClassification() + results = happy_tc.eval("../data/test_tc_trainer/sample-tc-training-eval-data.csv", "../data/test_tc_trainer/results/test_qa_eval") + print(results) + # happy_tc.train("../data/test_tc_trainer/sample-tc-training-eval-data.csv", "../data/test_tc_trainer/results/test_qa_train") + #results = happy_tc.eval("../data/test_tc_trainer/sample-tc-training-eval-data.csv", "../data/test_tc_trainer/results/test_qa_eval") + + + +def test_qa_test(): + happy_tc = HappyTextClassification() + + happy_tc.test("../data/test_tc_trainer/sample-tc-test-data.csv", "../data/test_tc_trainer/results/test_qa_test") From 1c48602994545bc4778cd094cf2b578af3fb0933 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Thu, 7 Jan 2021 17:53:26 -0500 Subject: [PATCH 014/155] updated Trainer to HappyTrainer' --- happytransformer/qa/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/happytransformer/qa/trainer.py b/happytransformer/qa/trainer.py index 16cb7007..d102b742 100644 --- a/happytransformer/qa/trainer.py +++ b/happytransformer/qa/trainer.py @@ -13,9 +13,9 @@ from torch.utils.data 
import DataLoader from transformers import AdamW -from happytransformer.trainer import Trainer +from happytransformer.happy_trainer import HappyTrainer -class QATrainer(Trainer): +class QATrainer(HappyTrainer): def __init__(self, model, model_type, tokenizer, device, logger): super().__init__(model, model_type, tokenizer, device, logger) From b6f3423d53d54ec533d5f3042505d50ffe718c45 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Fri, 8 Jan 2021 17:15:08 -0500 Subject: [PATCH 015/155] Finished Text Classification --- data/tc/test.csv | 5 ++ data/tc/train-eval.csv | 5 ++ data/test_tc_trainer/sample-tc-test-data.csv | 5 -- .../sample-tc-training-eval-data.csv | 5 -- happytransformer/happy_text_classification.py | 71 +++++++++------ happytransformer/happy_trainer.py | 42 ++++----- happytransformer/happy_transformer.py | 21 +++-- happytransformer/tc/trainer.py | 87 +++++++++++-------- happytransformer/{tasks_util.py => util.py} | 18 +++- tests/test_task_util.py | 2 +- tests/test_tc.py | 62 +++++++++++++ tests/test_tc_trainer.py | 40 --------- tests/test_text_classification.py | 0 13 files changed, 215 insertions(+), 148 deletions(-) create mode 100644 data/tc/test.csv create mode 100644 data/tc/train-eval.csv delete mode 100644 data/test_tc_trainer/sample-tc-test-data.csv delete mode 100644 data/test_tc_trainer/sample-tc-training-eval-data.csv rename happytransformer/{tasks_util.py => util.py} (77%) create mode 100644 tests/test_tc.py delete mode 100644 tests/test_tc_trainer.py delete mode 100644 tests/test_text_classification.py diff --git a/data/tc/test.csv b/data/tc/test.csv new file mode 100644 index 00000000..1b1ef800 --- /dev/null +++ b/data/tc/test.csv @@ -0,0 +1,5 @@ +text +Wow what a great place to eat +Horrible food +Terrible service +yum yum I'm coming here again \ No newline at end of file diff --git a/data/tc/train-eval.csv b/data/tc/train-eval.csv new file mode 100644 index 00000000..b4a0a214 --- /dev/null +++ b/data/tc/train-eval.csv @@ -0,0 +1,5 @@ +text,label +Wow what a great place to eat,1 +Horrible food,0 +Terrible service,0 +I'm coming here again,1 \ No newline at end of file diff --git a/data/test_tc_trainer/sample-tc-test-data.csv b/data/test_tc_trainer/sample-tc-test-data.csv deleted file mode 100644 index 8e35b279..00000000 --- a/data/test_tc_trainer/sample-tc-test-data.csv +++ /dev/null @@ -1,5 +0,0 @@ -text -"Wow I loved the food so much" -"Awful restaurant" -"Sooooo good" -"eeewwwww not coming here again" \ No newline at end of file diff --git a/data/test_tc_trainer/sample-tc-training-eval-data.csv b/data/test_tc_trainer/sample-tc-training-eval-data.csv deleted file mode 100644 index a9b58762..00000000 --- a/data/test_tc_trainer/sample-tc-training-eval-data.csv +++ /dev/null @@ -1,5 +0,0 @@ -text,label -"Wow I loved the food so much", 1 -"Awful restaurant", 0 -"Sooooo good", 1 -"eeewwwww not coming here again", 0 diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index 8559cb45..85217358 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -3,72 +3,87 @@ from transformers import ( BertForSequenceClassification, BertTokenizerFast, - - + DistilBertForSequenceClassification, + DistilBertTokenizerFast ) from happytransformer.happy_transformer import HappyTransformer from happytransformer.tc.default_args import ARGS_TC_TRAIN +import numpy as np + +from happytransformer.util import softmax_of_matrix + class HappyTextClassification(HappyTransformer): - def 
__init__(self, model_type="BERT",
-                 model_name="bert-large-uncased-whole-word-masking-finetuned-squad", device=None):
+
+    def __init__(self, model_type="DISTILBERT",
+                 model_name="distilbert-base-uncased-finetuned-sst-2-english", device=None):
         model = None
         tokenizer = None
 
         if model_type == "BERT":
             model = BertForSequenceClassification.from_pretrained(model_name)
             tokenizer = BertTokenizerFast.from_pretrained(model_name)
-
+        elif model_type == "DISTILBERT":
+            model = DistilBertForSequenceClassification.from_pretrained(model_name)
+            tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
 
         super().__init__(model_type, model_name, model, tokenizer, device)
 
-        self._trainer = TCTrainer(model,
-                                  model_type, tokenizer, self._device, self.logger)
+        self._trainer = TCTrainer(self._model,
+                                  self.model_type, self._tokenizer, self._device, self.logger)
 
-    def predict_text(self, text):
-        raise NotImplementedError()
+    def classify_text(self, text):
+        """
+        :param text: A text string to be classified
+        :return: a dictionary with an "answer" label and a "softmax" distribution
+        """
+
+        inputs = self._tokenizer(text, return_tensors="pt")
+        output = self._model(**inputs)
+        logits = output.logits
+        scores = logits.detach().cpu()
+        softmax = softmax_of_matrix(scores)[0]
+        preds = np.argmax(scores.numpy(), axis=1)
+        return {
+            "answer": preds[0],
+            'softmax': softmax
+        }
+
-    def train(self, input_filepath, output_path, args=ARGS_TC_TRAIN):
+    def train(self, input_filepath, args=ARGS_TC_TRAIN):
         """
         Trains the text classification model
 
         input_filepath: a string that contains the location of a csv file
         for training. Contains the following header values: text,
         label
+
         args: a dictionary that contains settings found under
-        happytransformer.happytasks.happy_qa.default_args.py
+
         return: None
+
         """
-        self._trainer.train(input_filepath=input_filepath, args=args, output_path=output_path)
+        self._trainer.train(input_filepath=input_filepath, args=args)
 
-    def eval(self, input_filepath, output_path):
+    def eval(self, input_filepath):
         """
-        Trains the question answering model
+        Evaluates the text classification model
 
         input_filepath: a string that contains the location of a csv file
         for training. Contains the following header values:
         text, label
-        args: a dictionary that contains settings found under
-        happytransformer.happytasks.happy_qa.default_args.py
-        output_filepath: a path to a csv file to output the results. 
- This file contains the following header values: text, output, softmax - return: None + return: #todo """ - self._trainer.test(input_filepath=input_filepath, output_path=output_path) + return self._trainer.test(input_filepath=input_filepath) diff --git a/happytransformer/happy_trainer.py b/happytransformer/happy_trainer.py index 62199aad..3132ff69 100644 --- a/happytransformer/happy_trainer.py +++ b/happytransformer/happy_trainer.py @@ -1,8 +1,11 @@ +""" +Parent class for training classes, such as TCTrainer and QATrainer +""" + import time import datetime import math from csv import DictWriter - from transformers import TrainingArguments class HappyTrainer: @@ -14,17 +17,26 @@ def __init__(self, model, model_type, tokenizer, device, logger): self.logger = logger - def train(self, input_filepath, output_path, args): + def train(self, input_filepath, args): + raise NotImplementedError() + + def test(self, input_filepath): raise NotImplementedError() - def test(self, input_filepath, output_path): + def eval(self, input_filepath): raise NotImplementedError() - def eval(self, input_filepath, output_path): + @staticmethod + def _get_data(filepath, test_data=False): raise NotImplementedError() @staticmethod def _get_training_args(args, output_path): + """ + :param args: a dictionary of arguments for training + :param output_path: A string to a temporary directory + :return: A TrainingArguments object + """ return TrainingArguments( output_dir=output_path, learning_rate=args["learning_rate"], @@ -37,35 +49,25 @@ def _get_training_args(args, output_path): ) - @staticmethod def _get_test_eval_args(output_path): + """ + + :param output_path: A string to a temporary directory + :return: A TrainingArguments object + """ return TrainingArguments( output_dir=output_path, seed=42 ) - - - def _get_train_eval_data(self, filepath): - """ - Used for parsing data for training and evaluating (both contain labels) - :param filepath: a string that contains the location of the data - :return: - """ - raise NotImplementedError() - - def _get_test_data(self, filepath): - raise NotImplementedError() - def _format_time(self, time): """ elapsed: time in seconds return: time outputted in hh:mm:ss format """ time_rounded = int(round((time))) - # Format as hh:mm:ss return str(datetime.timedelta(seconds=time_rounded)) @@ -77,7 +79,6 @@ def _get_update_interval(self, count): First determines how often to update for exactly 50 updates. Then, rounds to the nearest power of ten (10, 100, 1000 etc) - :param count: :return: """ @@ -114,3 +115,4 @@ def _output_result_to_csv(self, output_filepath, fieldnames, results): csv_writer.writerow( result ) + diff --git a/happytransformer/happy_transformer.py b/happytransformer/happy_transformer.py index 94ec068e..2ec90401 100644 --- a/happytransformer/happy_transformer.py +++ b/happytransformer/happy_transformer.py @@ -1,16 +1,22 @@ +""" +Parent class to HappyTextClassification, HappyWordPrediction, HappyQuestionAnswering +and HappyNextSentencePrediction. + +Contains shared variables and methods for these classes. 
+""" + import torch import logging - class HappyTransformer(): def __init__(self, model_type, model_name, model, tokenizer, device): - self.model_type = model_type # BERT, ROBERTA, ALBERT etc + self.model_type = model_type # BERT, #DISTILBERT, ROBERTA, ALBERT etc self.model_name = model_name self._model = model self._tokenizer = tokenizer self._model.eval() - self._trainer = None + self._trainer = None # initialized in child class # todo change logging system self.logger = logging.getLogger(__name__) @@ -37,7 +43,7 @@ def __init__(self, model_type, model_name, model, tokenizer, device): self.logger.info("Using model: %s", self._device) - def train(self, input_filepath, output_path, args): + def train(self, input_filepath, args): """ Trains a model :param input_filepath: a string that contains a path to a csv file @@ -47,19 +53,16 @@ def train(self, input_filepath, output_path, args): """ raise NotImplementedError() - def eval(self, input_filepath, output_path): + def eval(self, input_filepath): """ Evaluates the model. Determines how well the model performs on a given dataset - :param input_filepath: a string that contains a path to a csv file that contains evaluating data - - :return: correct percentage """ raise NotImplementedError() - def test(self, input_filepath, output_path): + def test(self, input_filepath): """ Used to generate predictions for a given dataset. The dataset may not be labelled. diff --git a/happytransformer/tc/trainer.py b/happytransformer/tc/trainer.py index f35c25d6..e2d5d732 100644 --- a/happytransformer/tc/trainer.py +++ b/happytransformer/tc/trainer.py @@ -8,65 +8,68 @@ https://huggingface.co/transformers/custom_datasets.html#sequence-classification-with-imdb-reviews""" import csv -import torch -from torch.utils.data import DataLoader +import tempfile +import torch from transformers import Trainer - from happytransformer.happy_trainer import HappyTrainer +from happytransformer.util import softmax_of_matrix + class TCTrainer(HappyTrainer): + """ + A class for training text classification functionality + """ - def __init__(self, model, model_type, tokenizer, device, logger): - super().__init__(model, model_type, tokenizer, device, logger) + def train(self, input_filepath, args): - def train(self, input_filepath, output_path, args): - contexts, labels = self.__get_data(input_filepath) + contexts, labels = self._get_data(input_filepath) train_encodings = self.tokenizer(contexts, truncation=True, padding=True) train_dataset = TextClassificationDataset(train_encodings, labels) - - training_args = self._get_training_args(args, output_path) - - trainer = Trainer( - model=self.model, # the instantiated 🤗 Transformers model to be trained - args=training_args, # training arguments, defined above - train_dataset=train_dataset, # training dataset - ) - trainer.train() - - def eval(self, input_filepath, output_path): - contexts, labels = self.__get_data(input_filepath) + with tempfile.TemporaryDirectory() as tmp_dir_name: + training_args = self._get_training_args(args, tmp_dir_name) + trainer = Trainer( + model=self.model, # the instantiated 🤗 Transformers model to be trained + args=training_args, # training arguments, defined above + train_dataset=train_dataset, # training dataset + ) + trainer.train() + + def eval(self, input_filepath): + contexts, labels = self._get_data(input_filepath) eval_encodings = self.tokenizer(contexts, truncation=True, padding=True) eval_dataset = TextClassificationDataset(eval_encodings, labels) - eval_args = self._get_test_eval_args(output_path) 
+ with tempfile.TemporaryDirectory() as tmp_dir_name: + eval_args = self._get_test_eval_args(tmp_dir_name) - trainer = Trainer( - model=self.model, # the instantiated 🤗 Transformers model to be trained - args=eval_args, - eval_dataset=eval_dataset, # training dataset + trainer = Trainer( + model=self.model, # the instantiated 🤗 Transformers model to be trained + args=eval_args, + eval_dataset=eval_dataset, # training dataset - ) + ) - return trainer.evaluate() + return trainer.evaluate() - - def test(self, input_filepath, output_path): - contexts = self.__get_data(input_filepath, True) + def test(self, input_filepath): + contexts = self._get_data(input_filepath, True) test_encodings = self.tokenizer(contexts, truncation=True, padding=True) test_dataset = TextClassificationDatasetTest(test_encodings, len(contexts)) - test_args = self._get_test_eval_args(output_path) - - trainer = Trainer( - model=self.model, # the instantiated 🤗 Transformers model to be trained - args=test_args - ) - print(trainer.predict(test_dataset)) + with tempfile.TemporaryDirectory() as tmp_dir_name: + test_args = self._get_test_eval_args(tmp_dir_name) + trainer = Trainer( + model=self.model, # the instantiated 🤗 Transformers model to be trained + args=test_args + ) + result_logits = trainer.predict(test_dataset).predictions + result_softmax = softmax_of_matrix(result_logits.tolist()) + return result_softmax @staticmethod - def __get_data(filepath, test_data=False): + def _get_data(filepath, test_data=False): """ Used for parsing data for training and evaluating (both contain labels) :param filepath: a string that contains the location of the data @@ -89,6 +92,10 @@ def __get_data(filepath, test_data=False): class TextClassificationDataset(torch.utils.data.Dataset): + """ + A class to allow the training and testing data to be used by + a transformers.Trainer object + """ def __init__(self, encodings, labels): self.encodings = encodings self.labels = labels @@ -102,6 +109,10 @@ def __len__(self): return len(self.labels) class TextClassificationDatasetTest(torch.utils.data.Dataset): + """ + A class to allow the testing data to be used by + a transformers.Trainer object + """ def __init__(self, encodings, length): self.encodings = encodings self.length = length @@ -111,4 +122,4 @@ def __getitem__(self, idx): return item def __len__(self): - return self.length \ No newline at end of file + return self.length diff --git a/happytransformer/tasks_util.py b/happytransformer/util.py similarity index 77% rename from happytransformer/tasks_util.py rename to happytransformer/util.py index 784a652b..753a1544 100644 --- a/happytransformer/tasks_util.py +++ b/happytransformer/util.py @@ -1,6 +1,5 @@ from collections import namedtuple - - +from numpy import exp SumPair = namedtuple('SumPair', ['idx1', 'idx2', 'sum']) @@ -32,3 +31,18 @@ def biggest_sums(items_a, items_b): b_index += 1 else: a_index += 1 + + +def softmax_of_matrix(matrix): + """ + + :param matrix: A list of lists of logits + :return: a list of lists of softmax values + """ + result = list() + + for logits in matrix: + e_logits = exp(logits) + softmax = e_logits/e_logits.sum() + result.append(softmax.tolist()) + return result diff --git a/tests/test_task_util.py b/tests/test_task_util.py index 72a73b59..993f591c 100644 --- a/tests/test_task_util.py +++ b/tests/test_task_util.py @@ -2,7 +2,7 @@ Contains tests for functions found within util.py """ -from happytransformer.tasks_util import SumPair, biggest_sums +from happytransformer.util import SumPair, biggest_sums def 
test_biggest_sums(): """ Tests the biggest_sums function diff --git a/tests/test_tc.py b/tests/test_tc.py new file mode 100644 index 00000000..522d156c --- /dev/null +++ b/tests/test_tc.py @@ -0,0 +1,62 @@ +""" +Tests for Text Classification Functionality +""" + +from happytransformer.happy_text_classification import HappyTextClassification + +def test_classify_text(): + """ + Tests + HappyQuestionAnswering.classify_text() + + """ + happy_tc = HappyTextClassification() + result = happy_tc.classify_text("What a great movie") + assert result["answer"] == 1 and result["softmax"][1] == 0.9998726844787598 + +def test_qa_train(): + """ + Tests + HappyQuestionAnswering.train() + + """ + happy_tc = HappyTextClassification() + + happy_tc.train("../data/tc/train-eval.csv") + + +def test_qa_eval(): + """ + Tests + HappyQuestionAnswering.eval() + """ + happy_tc = HappyTextClassification() + results = happy_tc.eval("../data/tc/train-eval.csv") + assert results["eval_loss"] == 0.007262040860950947 + + +def test_qa_test(): + """ + Tests + HappyQuestionAnswering.test() + """ + happy_tc = HappyTextClassification() + + result = happy_tc.test("../data/tc/test.csv") + expected_result = [[0.00015978473364387978, 0.9998402152663561], [0.9772132247336673, 0.022786775266332746], [0.9966067733093962, 0.0033932266906038368], [0.020770484301764973, 0.979229515698235]] + + assert result == expected_result + + +def test_qa_train_effectiveness(): + """ + Tests + Ensures that HappyQuestionAnswering.train() results in + lowering the loss as determined by HappyQuestionAnswering.eval() + """ + + happy_tc = HappyTextClassification() + before_loss = happy_tc.eval("../data/tc/train-eval.csv")["eval_loss"] + happy_tc.train("../data/tc/train-eval.csv") + after_loss = happy_tc.eval("../data/tc/train-eval.csv")["eval_loss"] + assert after_loss < before_loss diff --git a/tests/test_tc_trainer.py b/tests/test_tc_trainer.py deleted file mode 100644 index 322570f9..00000000 --- a/tests/test_tc_trainer.py +++ /dev/null @@ -1,40 +0,0 @@ -""" -Tests for the question answering training, evaluating and testing functionality -""" - -from happytransformer.happy_text_classification import HappyTextClassification - - -def test_qa_train(): - """ - Tests - HappyQuestionAnswering.eval() - HappyQuestionAnswering.train() - - """ - happy_tc = HappyTextClassification() - # Test 1 - - # Test 2 - happy_tc.train("../data/test_tc_trainer/sample-tc-training-eval-data.csv", "../data/test_tc_trainer/results/test_qa_train") - - -def test_qa_eval(): - """ - Tests - HappyQuestionAnswering.eval() - HappyQuestionAnswering.train() - - """ - happy_tc = HappyTextClassification() - results = happy_tc.eval("../data/test_tc_trainer/sample-tc-training-eval-data.csv", "../data/test_tc_trainer/results/test_qa_eval") - print(results) - # happy_tc.train("../data/test_tc_trainer/sample-tc-training-eval-data.csv", "../data/test_tc_trainer/results/test_qa_train") - #results = happy_tc.eval("../data/test_tc_trainer/sample-tc-training-eval-data.csv", "../data/test_tc_trainer/results/test_qa_eval") - - - -def test_qa_test(): - happy_tc = HappyTextClassification() - - happy_tc.test("../data/test_tc_trainer/sample-tc-test-data.csv", "../data/test_tc_trainer/results/test_qa_test") diff --git a/tests/test_text_classification.py b/tests/test_text_classification.py deleted file mode 100644 index e69de29b..00000000 From 27e4c92bae1c985980843f23e0c02e78dc2e9d34 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Fri, 8 Jan 2021 17:20:22 -0500 Subject: [PATCH 016/155] added _run_train() 
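[Editor's note] The tests added above pin model outputs to exact floats (e.g. 0.9998726844787598), which tends to break across transformers versions and hardware. A tolerance-based variant is usually more robust; a sketch of the same check using pytest.approx (hypothetical test, not part of the patch):

from pytest import approx
from happytransformer.happy_text_classification import HappyTextClassification

def test_classify_text_tolerant():
    happy_tc = HappyTextClassification()
    result = happy_tc.classify_text("What a great movie")
    assert result["answer"] == 1
    # Allow small numeric drift instead of demanding bit-exact scores.
    assert result["softmax"][1] == approx(0.9999, abs=1e-3)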
--- happytransformer/happy_trainer.py | 16 +++++++++++++++- happytransformer/tc/trainer.py | 9 +-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/happytransformer/happy_trainer.py b/happytransformer/happy_trainer.py index 3132ff69..f1594dbb 100644 --- a/happytransformer/happy_trainer.py +++ b/happytransformer/happy_trainer.py @@ -5,8 +5,10 @@ import time import datetime import math +import tempfile from csv import DictWriter -from transformers import TrainingArguments +from transformers import TrainingArguments, Trainer + class HappyTrainer: def __init__(self, model, model_type, tokenizer, device, logger): @@ -49,6 +51,18 @@ def _get_training_args(args, output_path): ) + def _run_train(self, dataset, args): + with tempfile.TemporaryDirectory() as tmp_dir_name: + training_args = self._get_training_args(args, tmp_dir_name) + trainer = Trainer( + model=self.model, # the instantiated 🤗 Transformers model to be trained + args=training_args, # training arguments, defined above + train_dataset=dataset, # training dataset + ) + trainer.train() + + + @staticmethod def _get_test_eval_args(output_path): """ diff --git a/happytransformer/tc/trainer.py b/happytransformer/tc/trainer.py index e2d5d732..4d5cb6b1 100644 --- a/happytransformer/tc/trainer.py +++ b/happytransformer/tc/trainer.py @@ -26,14 +26,7 @@ def train(self, input_filepath, args): contexts, labels = self._get_data(input_filepath) train_encodings = self.tokenizer(contexts, truncation=True, padding=True) train_dataset = TextClassificationDataset(train_encodings, labels) - with tempfile.TemporaryDirectory() as tmp_dir_name: - training_args = self._get_training_args(args, tmp_dir_name) - trainer = Trainer( - model=self.model, # the instantiated 🤗 Transformers model to be trained - args=training_args, # training arguments, defined above - train_dataset=train_dataset, # training dataset - ) - trainer.train() + self._run_train(train_dataset, args) def eval(self, input_filepath): contexts, labels = self._get_data(input_filepath) From 11e096c5213afbe3e1c14da4f4eb69c00282ec38 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Fri, 8 Jan 2021 17:23:54 -0500 Subject: [PATCH 017/155] added _run_eval() --- happytransformer/happy_trainer.py | 13 +++++++++++++ happytransformer/tc/trainer.py | 10 +--------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/happytransformer/happy_trainer.py b/happytransformer/happy_trainer.py index f1594dbb..da997ce6 100644 --- a/happytransformer/happy_trainer.py +++ b/happytransformer/happy_trainer.py @@ -61,6 +61,19 @@ def _run_train(self, dataset, args): ) trainer.train() + def _run_eval(self, dataset): + with tempfile.TemporaryDirectory() as tmp_dir_name: + eval_args = self._get_test_eval_args(tmp_dir_name) + + trainer = Trainer( + model=self.model, # the instantiated 🤗 Transformers model to be trained + args=eval_args, + eval_dataset=dataset, # training dataset + + ) + + return trainer.evaluate() + @staticmethod diff --git a/happytransformer/tc/trainer.py b/happytransformer/tc/trainer.py index 4d5cb6b1..90871bf4 100644 --- a/happytransformer/tc/trainer.py +++ b/happytransformer/tc/trainer.py @@ -33,17 +33,9 @@ def eval(self, input_filepath): eval_encodings = self.tokenizer(contexts, truncation=True, padding=True) eval_dataset = TextClassificationDataset(eval_encodings, labels) - with tempfile.TemporaryDirectory() as tmp_dir_name: - eval_args = self._get_test_eval_args(tmp_dir_name) - trainer = Trainer( - model=self.model, # the instantiated 🤗 Transformers model to be trained - 
args=eval_args, - eval_dataset=eval_dataset, # training dataset - - ) + return self._run_eval(eval_dataset) - return trainer.evaluate() def test(self, input_filepath): contexts = self._get_data(input_filepath, True) From 8d4163c7d21505b65671575e204a385377dc0666 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Fri, 8 Jan 2021 17:27:01 -0500 Subject: [PATCH 018/155] added _run_test() --- happytransformer/happy_trainer.py | 11 +++++++++++ happytransformer/tc/trainer.py | 21 ++++----------------- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/happytransformer/happy_trainer.py b/happytransformer/happy_trainer.py index da997ce6..dfed4e7d 100644 --- a/happytransformer/happy_trainer.py +++ b/happytransformer/happy_trainer.py @@ -8,6 +8,7 @@ import tempfile from csv import DictWriter from transformers import TrainingArguments, Trainer +from happytransformer.util import softmax_of_matrix class HappyTrainer: @@ -74,7 +75,17 @@ def _run_eval(self, dataset): return trainer.evaluate() + def _run_test(self, dataset): + with tempfile.TemporaryDirectory() as tmp_dir_name: + test_args = self._get_test_eval_args(tmp_dir_name) + trainer = Trainer( + model=self.model, # the instantiated 🤗 Transformers model to be trained + args=test_args + ) + result_logits = trainer.predict(dataset).predictions + result_softmax = softmax_of_matrix(result_logits.tolist()) + return result_softmax @staticmethod def _get_test_eval_args(output_path): diff --git a/happytransformer/tc/trainer.py b/happytransformer/tc/trainer.py index 90871bf4..6eb41939 100644 --- a/happytransformer/tc/trainer.py +++ b/happytransformer/tc/trainer.py @@ -8,12 +8,8 @@ https://huggingface.co/transformers/custom_datasets.html#sequence-classification-with-imdb-reviews""" import csv -import tempfile - import torch -from transformers import Trainer from happytransformer.happy_trainer import HappyTrainer -from happytransformer.util import softmax_of_matrix class TCTrainer(HappyTrainer): @@ -22,16 +18,15 @@ class TCTrainer(HappyTrainer): """ def train(self, input_filepath, args): - contexts, labels = self._get_data(input_filepath) train_encodings = self.tokenizer(contexts, truncation=True, padding=True) train_dataset = TextClassificationDataset(train_encodings, labels) + self._run_train(train_dataset, args) def eval(self, input_filepath): contexts, labels = self._get_data(input_filepath) eval_encodings = self.tokenizer(contexts, truncation=True, padding=True) - eval_dataset = TextClassificationDataset(eval_encodings, labels) return self._run_eval(eval_dataset) @@ -40,18 +35,10 @@ def eval(self, input_filepath): def test(self, input_filepath): contexts = self._get_data(input_filepath, True) test_encodings = self.tokenizer(contexts, truncation=True, padding=True) - test_dataset = TextClassificationDatasetTest(test_encodings, len(contexts)) - with tempfile.TemporaryDirectory() as tmp_dir_name: - test_args = self._get_test_eval_args(tmp_dir_name) - trainer = Trainer( - model=self.model, # the instantiated 🤗 Transformers model to be trained - args=test_args - ) - result_logits = trainer.predict(test_dataset).predictions - - result_softmax = softmax_of_matrix(result_logits.tolist()) - return result_softmax + + return self._run_test(test_dataset) + @staticmethod def _get_data(filepath, test_data=False): From e2cf547019196faaf7c42f554f4a54435cc3d5cf Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Fri, 8 Jan 2021 20:38:49 -0500 Subject: [PATCH 019/155] Changed TC, QA and MWP to use pipelines and finished TC and QA --- .../sample-qa-test-data.csv => 
qa/test.csv} | 0 .../train-eval.csv} | 0 happytransformer/happy_question_answering.py | 131 ++++++--------- happytransformer/happy_text_classification.py | 48 +++--- happytransformer/happy_trainer.py | 126 +++++---------- happytransformer/happy_transformer.py | 25 +-- happytransformer/happy_word_prediction.py | 22 ++- happytransformer/mwp/default_args.py | 21 +-- happytransformer/mwp/trainer.py | 13 +- happytransformer/mwp/util.py | 0 happytransformer/qa/default_args.py | 34 ++-- happytransformer/qa/trainer.py | 153 ++++-------------- happytransformer/qa/util.py | 75 --------- happytransformer/tc/trainer.py | 18 ++- happytransformer/tc/util.py | 0 happytransformer/util.py | 48 ------ tests/test_qa.py | 51 ++++++ tests/test_qa_trainer.py | 55 ------- tests/test_sequence.csv | 2 - tests/test_task_util.py | 25 --- tests/test_tc.py | 21 ++- 21 files changed, 284 insertions(+), 584 deletions(-) rename data/{test_qa_trainer/sample-qa-test-data.csv => qa/test.csv} (100%) rename data/{test_qa_trainer/sample-qa-training-eval-data.csv => qa/train-eval.csv} (100%) delete mode 100644 happytransformer/mwp/util.py delete mode 100644 happytransformer/qa/util.py delete mode 100644 happytransformer/tc/util.py delete mode 100644 happytransformer/util.py create mode 100644 tests/test_qa.py delete mode 100644 tests/test_qa_trainer.py delete mode 100644 tests/test_sequence.csv delete mode 100644 tests/test_task_util.py diff --git a/data/test_qa_trainer/sample-qa-test-data.csv b/data/qa/test.csv similarity index 100% rename from data/test_qa_trainer/sample-qa-test-data.csv rename to data/qa/test.csv diff --git a/data/test_qa_trainer/sample-qa-training-eval-data.csv b/data/qa/train-eval.csv similarity index 100% rename from data/test_qa_trainer/sample-qa-training-eval-data.csv rename to data/qa/train-eval.csv diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index 57e581ee..c6a5ce50 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -5,17 +5,19 @@ import torch from happytransformer.happy_transformer import HappyTransformer -from happytransformer.qa.util import qa_probabilities from happytransformer.qa.trainer import QATrainer from happytransformer.qa.default_args \ - import ARGS_QA_EVAL, ARGS_QA_TEST, ARGS_QA_TRAIN + import ARGS_QA_TRAIN from transformers import ( BertForQuestionAnswering, - BertTokenizerFast - + BertTokenizerFast, + DistilBertForQuestionAnswering, + DistilBertTokenizerFast, + QuestionAnsweringPipeline, ) + class HappyQuestionAnswering(HappyTransformer): """ This class is a user facing class that allows users to solve question answering problems using @@ -26,72 +28,40 @@ class HappyQuestionAnswering(HappyTransformer): to understand and to offload complex tasks to other classes. 
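[Editor's note] Context for this rewrite: HappyQuestionAnswering now delegates inference to the QuestionAnsweringPipeline imported above (constructed in the __init__ below) instead of hand-rolled logit handling. A rough standalone equivalent of what gets wrapped, using the same default checkpoint as the diff (device=-1 selects CPU):

from transformers import (
    DistilBertForQuestionAnswering,
    DistilBertTokenizerFast,
    QuestionAnsweringPipeline,
)

name = "distilbert-base-cased-distilled-squad"
qa = QuestionAnsweringPipeline(
    model=DistilBertForQuestionAnswering.from_pretrained(name),
    tokenizer=DistilBertTokenizerFast.from_pretrained(name),
    device=-1,
)
result = qa(context="Today's date is January 8th 2021", question="What is the date?")
# Expected shape: {'score': ..., 'start': 16, 'end': 32, 'answer': 'January 8th 2021'}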
""" - def __init__(self, model_type="BERT", - model_name="bert-large-uncased-whole-word-masking-finetuned-squad", device=None): - model = BertForQuestionAnswering.from_pretrained(model_name) - tokenizer = BertTokenizerFast.from_pretrained(model_name) + def __init__(self, model_type="DISTILBERT", + model_name="distilbert-base-cased-distilled-squad"): + model = None + tokenizer = None + if model_type == "BERT": + model = BertForQuestionAnswering.from_pretrained(model_name) + tokenizer = BertTokenizerFast.from_pretrained(model_name) + elif model_type == "DISTILBERT": + model = DistilBertForQuestionAnswering.from_pretrained(model_name) + tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) + else: + raise ValueError("model_type must be BERT or DISTILBERT") + + super().__init__(model_type, model_name, model, tokenizer) + device_number = 1 if torch.cuda.is_available() else -1 + # from documentation " a positive will run the model on the associated CUDA device id." + # todo: get device ID if torch.cuda.is_available() + + self._pipeline = QuestionAnsweringPipeline(model, tokenizer, device=device_number) - super().__init__(model_type, model_name, model, tokenizer, device) self._trainer = QATrainer(model, model_type, tokenizer, self._device, self.logger) - def answers_to_question(self, question, context, k=3): - input_ids = self._tokenize_qa(question, context) - qa_output = self._run_qa_model(input_ids) - sep_id_index = input_ids.index(self._tokenizer.sep_token_id) - probabilities = qa_probabilities( - # only consider logits from the context part of the embedding. - # that is, between the middle [SEP] token - # and the final [SEP] token - start_logits=qa_output.start_logits[0][sep_id_index+1:-1], - end_logits=qa_output.end_logits[0][sep_id_index+1:-1], - k=k - ) - # qa probabilities use indices relative to context. - # tokens use indices relative to overall question [SEP] context embedding. - # need offset to resolve this difference - token_offset = sep_id_index + 1 - return [ - # grab ids from start to end (inclusive) and decode to text - {"text": self._tokenizer.decode( - input_ids[token_offset+answer.start_idx: token_offset+answer.end_idx+1] - ), - "softmax": answer.probability} - - for answer in probabilities - ] - - def answer_question(self, question, text): + def answer_question(self, context, question, topk=1): """ - Using the given text, find the answer to the given question and return it. 
- :param question: The question to be answered - :param text: The text containing the answer to the question - :return: The answer to the given question, as a string + :param context: background information to answer the question (string) + :param question: A question that can be answered with the given context (string) + :param topk: how many results + :return: if topk =1, a dictionary that contains the keys: score, start, end and answer + if topk >1, a list of dictionaries described above """ - return self.answers_to_question(question, text, 1)[0]["text"] - - def _run_qa_model(self, input_ids): - sep_id_index = input_ids.index(self._tokenizer.sep_token_id) - before_after_ids = [ - 0 if idx <= sep_id_index else 1 - for idx, _ in enumerate(input_ids) - ] - with torch.no_grad(): - return self._model( - input_ids=torch.tensor([input_ids]), - token_type_ids=torch.tensor([before_after_ids]) - ) - - def _tokenize_qa(self, question, context): - input_text = ' '.join([ - question, - self._tokenizer.sep_token, - context - ]) - input_ids = self._tokenizer.encode(input_text) - return input_ids + return self._pipeline(context=context, question=question, topk=topk) def train(self, input_filepath, args=ARGS_QA_TRAIN): """ @@ -99,44 +69,37 @@ def train(self, input_filepath, args=ARGS_QA_TRAIN): input_filepath: a string that contains the location of a csv file for training. Contains the following header values: context, - question, answer_text, answer_start + question, answer_text, answer_start + args: a dictionary that contains settings found under happytransformer.happytasks.happy_qa.default_args.py + return: None """ self._trainer.train(input_filepath=input_filepath, args=args) - def eval(self, input_filepath, output_filepath, args=ARGS_QA_EVAL): + def eval(self, input_filepath): """ Trains the question answering model input_filepath: a string that contains the location of a csv file for training. Contains the following header values: - context, question, answer_text, answer_start - args: a dictionary that contains settings found under - happytransformer.happytasks.happy_qa.default_args.py - output_filepath: a path to a csv file to output the results. - This file contains the following header values: contexts, - questions, answer, outputs, correct, softmax - return: correct ration (correct/total) + context, question, answer_text, answer_start + + return: A dictionary that contains a key called "eval_loss" + """ - return self._trainer.eval(input_filepath=input_filepath, - solve=self.answers_to_question, args=args, - output_filepath=output_filepath) + return self._trainer.eval(input_filepath=input_filepath,) - def test(self, input_filepath, output_filepath, args=ARGS_QA_TEST): + def test(self, input_filepath): """ Tests the question answering model. Used to obtain results input_filepath: a string that contains the location of a csv file for training. Contains the following header values: - context, question - args: a dictionary that contains settings found under - happytransformer.happytasks.happy_qa.default_args.py - output_filepath: a path to a csv file to output the results. - This file contains the following header values: contexts, questions, outputs, softmax - return: None + context, question + + return: A list of dictionaries. 
Each dictionary + contains the keys: "score", "start", "end" and "answer" """ - self._trainer.test(input_filepath=input_filepath, - solve=self.answers_to_question, args=args, - output_filepath=output_filepath) + return self._trainer.test(input_filepath=input_filepath, pipeline=self._pipeline) diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index 85217358..6f86bdfb 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -1,23 +1,29 @@ -from happytransformer.tc.trainer import TCTrainer +""" +Contains a class called HappyTextClassification that performs text classification +""" + +import torch from transformers import ( BertForSequenceClassification, BertTokenizerFast, DistilBertForSequenceClassification, - DistilBertTokenizerFast + DistilBertTokenizerFast, + TextClassificationPipeline ) +from happytransformer.tc.trainer import TCTrainer from happytransformer.happy_transformer import HappyTransformer from happytransformer.tc.default_args import ARGS_TC_TRAIN -import numpy as np - -from happytransformer.util import softmax_of_matrix class HappyTextClassification(HappyTransformer): + """ + A user facing class for Text Classification + """ def __init__(self, model_type="DISTILBERT", - model_name="distilbert-base-uncased-finetuned-sst-2-english", device=None): + model_name="distilbert-base-uncased-finetuned-sst-2-english"): model = None tokenizer = None @@ -27,28 +33,30 @@ def __init__(self, model_type="DISTILBERT", elif model_type == "DISTILBERT": model = DistilBertForSequenceClassification.from_pretrained(model_name) tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) + else: + raise ValueError("model_type must be BERT or DISTILBERT") + + super().__init__(model_type, model_name, model, tokenizer) + + device_number = 1 if torch.cuda.is_available() else -1 + # from documentation " a positive will run the model on the associated CUDA device id." 
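[Editor's note] For reference, a standalone version of the TextClassificationPipeline constructed just below, with the same default checkpoint as the diff (illustrative only):

from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    TextClassificationPipeline,
)

name = "distilbert-base-uncased-finetuned-sst-2-english"
clf = TextClassificationPipeline(
    model=DistilBertForSequenceClassification.from_pretrained(name),
    tokenizer=DistilBertTokenizerFast.from_pretrained(name),
    device=-1,  # CPU
)
print(clf("What a great movie"))  # e.g. [{'label': 'POSITIVE', 'score': 0.9998...}]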
+ # todo: get device ID if torch.cuda.is_available() + + self._pipeline = TextClassificationPipeline(model=model, + tokenizer=tokenizer, device=device_number) - super().__init__(model_type, model_name, model, tokenizer, device) self._trainer = TCTrainer(self._model, self.model_type, self._tokenizer, self._device, self.logger) def classify_text(self, text): """ - :param text: A text string to be classified - :return: + :param text: A text string to be classified, or a list of strings + :return: either a single dictionary with keys: label and score, + or a list of these dictionaries with the same keys """ + return self._pipeline(text) - inputs = self._tokenizer(text, return_tensors="pt") - output = self._model(**inputs) - logits = output.logits - scores = logits.detach().cpu() - softmax = softmax_of_matrix(scores)[0] - preds = np.argmax(scores.numpy(), axis=1) - return { - "answer": preds[0], - 'softmax': softmax - } def train(self, input_filepath, args=ARGS_TC_TRAIN): """ @@ -86,4 +94,4 @@ def test(self, input_filepath): text return: #todo """ - return self._trainer.test(input_filepath=input_filepath) + return self._trainer.test(input_filepath=input_filepath, pipeline=self._pipeline) diff --git a/happytransformer/happy_trainer.py b/happytransformer/happy_trainer.py index dfed4e7d..7e3d4029 100644 --- a/happytransformer/happy_trainer.py +++ b/happytransformer/happy_trainer.py @@ -2,13 +2,8 @@ Parent class for training classes, such as TCTrainer and QATrainer """ -import time -import datetime -import math import tempfile -from csv import DictWriter from transformers import TrainingArguments, Trainer -from happytransformer.util import softmax_of_matrix class HappyTrainer: @@ -19,18 +14,40 @@ def __init__(self, model, model_type, tokenizer, device, logger): self.device = device self.logger = logger - def train(self, input_filepath, args): + """ + + :param input_filepath: A string to file location + :param args: a dictionary that contains settings + :return: + """ raise NotImplementedError() - def test(self, input_filepath): + def test(self, input_filepath, pipeline): + """ + + :param input_filepath: A string to file location + :param pipeline: an initialized transformer pipeline for the given task + :return: test results + """ raise NotImplementedError() def eval(self, input_filepath): + """ + :param input_filepath: A string to file location + :return: a dictionary that contains a key called "eval_loss" that holds the loss + for the given eval dataset. 
May add more metrics later + """ raise NotImplementedError() @staticmethod def _get_data(filepath, test_data=False): + """ + + :param filepath: A string to file location + :param test_data: False for train and eval, True for test + :return: varies for each task + """ raise NotImplementedError() @staticmethod @@ -53,104 +70,43 @@ def _get_training_args(args, output_path): ) def _run_train(self, dataset, args): + """ + + :param dataset: a child of torch.utils.data.Dataset + :param args: a dictionary that contains settings + :return: None + """ with tempfile.TemporaryDirectory() as tmp_dir_name: training_args = self._get_training_args(args, tmp_dir_name) trainer = Trainer( - model=self.model, # the instantiated 🤗 Transformers model to be trained - args=training_args, # training arguments, defined above - train_dataset=dataset, # training dataset + model=self.model, + args=training_args, + train_dataset=dataset, ) trainer.train() def _run_eval(self, dataset): + """ + :param dataset: a child of torch.utils.data.Dataset + :return: None + """ with tempfile.TemporaryDirectory() as tmp_dir_name: - eval_args = self._get_test_eval_args(tmp_dir_name) - + eval_args = self._get_eval_args(tmp_dir_name) trainer = Trainer( - model=self.model, # the instantiated 🤗 Transformers model to be trained + model=self.model, args=eval_args, - eval_dataset=dataset, # training dataset + eval_dataset=dataset, ) - return trainer.evaluate() - def _run_test(self, dataset): - with tempfile.TemporaryDirectory() as tmp_dir_name: - test_args = self._get_test_eval_args(tmp_dir_name) - trainer = Trainer( - model=self.model, # the instantiated 🤗 Transformers model to be trained - args=test_args - ) - result_logits = trainer.predict(dataset).predictions - - result_softmax = softmax_of_matrix(result_logits.tolist()) - return result_softmax - @staticmethod - def _get_test_eval_args(output_path): + def _get_eval_args(output_path): """ - :param output_path: A string to a temporary directory :return: A TrainingArguments object """ return TrainingArguments( output_dir=output_path, seed=42 - ) - - def _format_time(self, time): - """ - elapsed: time in seconds - return: time outputted in hh:mm:ss format - """ - time_rounded = int(round((time))) - # Format as hh:mm:ss - return str(datetime.timedelta(seconds=time_rounded)) - - - def _get_update_interval(self, count): - """ - Determines how often to print status, given the number of cases. - - First determines how often to update for exactly 50 updates. 
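[Editor's note] The progress-reporting helpers deleted here (continuing below) are superseded by tqdm in the task trainers, but the interval heuristic, roughly 50 status updates with the interval rounded down to a power of ten, is easy to reconstruct if ever needed. A sketch with the count < 50 edge case handled explicitly (not part of the patch):

import math

def update_interval(count: int) -> int:
    # Aim for about 50 status updates, using a power-of-ten interval (>= 1).
    if count < 50:
        return 1
    return 10 ** math.floor(math.log10(count / 50))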
- Then, rounds to the nearest power of ten (10, 100, 1000 etc) - - :param count: - :return: - """ - - x = count / 50 - order = math.floor(math.log(x, 10)) - - update_interval = 10 ** order - if update_interval == 0: - return 1 - return update_interval - - def _print_status(self, init_time, count, total, update_interval, percentage=None): - if count % update_interval and not count == 0: - current_time = time.time() - elapsed_time_string = self._format_time(current_time - init_time) - - avg_ex = (current_time - init_time) / count - rem_time_int = avg_ex * (total - count) - rem_time_string = self._format_time(rem_time_int) - ending = "" - if percentage is not None: - ending = "Correct: " + str(round(percentage, 2)*100) + "%" - status_output = "Done: ", str(count) + "/" + str( - total) + " ---- Elapsed: " + elapsed_time_string +\ - " Estimated Remaining: " + rem_time_string +" " + ending - self.logger.info(status_output) - - def _output_result_to_csv(self, output_filepath, fieldnames, results): - with open(output_filepath, 'w') as csv_file: - csv_writer = DictWriter(csv_file, fieldnames=fieldnames) - csv_writer.writeheader() - for result in results: - csv_writer.writerow( - result - ) - diff --git a/happytransformer/happy_transformer.py b/happytransformer/happy_transformer.py index 2ec90401..8414586c 100644 --- a/happytransformer/happy_transformer.py +++ b/happytransformer/happy_transformer.py @@ -1,16 +1,20 @@ """ -Parent class to HappyTextClassification, HappyWordPrediction, HappyQuestionAnswering -and HappyNextSentencePrediction. +Contains the parent class to HappyTextClassification, HappyWordPrediction, HappyQuestionAnswering +and HappyNextSentencePrediction called HappyTransformer Contains shared variables and methods for these classes. """ - -import torch import logging +import torch class HappyTransformer(): + """ + Parent class to HappyTextClassification, HappyWordPrediction, HappyQuestionAnswering + and HappyNextSentencePrediction. 
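[Editor's note] A note on the device handling in the __init__ below: self._device is a torch.device, and equality between a torch.device and the plain string 'cuda' has varied across torch releases, so the guarded .to() call may never fire. Comparing the .type attribute, which is always a plain string, is unambiguous; a sketch:

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":  # device.type is 'cuda' or 'cpu'
    pass  # e.g. model.to(device)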
- def __init__(self, model_type, model_name, model, tokenizer, device): + """ + + def __init__(self, model_type, model_name, model, tokenizer): self.model_type = model_type # BERT, #DISTILBERT, ROBERTA, ALBERT etc self.model_name = model_name self._model = model @@ -30,13 +34,10 @@ def __init__(self, model_type, model_name, model, tokenizer, device): handlers=[handler] ) - if device is None: - self._device = torch.device( - "cuda" if torch.cuda.is_available() - else "cpu" - ) - else: - self._device = device + self._device = torch.device( + "cuda" if torch.cuda.is_available() + else "cpu" + ) if self._device == 'cuda': self._model.to(self._device) diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index 7f368abb..dfc0991b 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -4,10 +4,14 @@ BertForMaskedLM, BertTokenizerFast, RobertaForMaskedLM, - RobertaTokenizerFast + RobertaTokenizerFast, + FillMaskPipeline, + ) +import torch from happytransformer.happy_transformer import HappyTransformer +from happytransformer.mwp.trainer import MWPTrainer class HappyWordPrediction(HappyTransformer): @@ -25,8 +29,10 @@ def __init__(self, model_type="BERT", tokenizer = RobertaTokenizerFast.from_pretrained(model_name) super().__init__(model_type, model_name, model, tokenizer, device) - - self._trainer = QATrainer(model, + device_number = 1 if torch.cuda.is_available() else -1 + self._pipeline = FillMaskPipeline(model=model, + tokenizer=tokenizer, device=device_number) + self._trainer = MWPTrainer(model, model_type, tokenizer, self._device, self.logger) def predict_masks(self): raise NotImplementedError() @@ -35,10 +41,10 @@ def predict_mask(self): raise NotImplementedError() def train(self, input_filepath, args): - raise NotImplementedError() + self._trainer.train(input_filepath=input_filepath, args=args) - def test(self, input_filepath, output_filepath, args): - raise NotImplementedError() + def eval(self, input_filepath): + return self._trainer.eval(input_filepath=input_filepath) - def eval(self, input_filepath, output_filepath, args): - raise NotImplementedError() + def test(self, input_filepath): + return self._trainer.test(input_filepath=input_filepath,) diff --git a/happytransformer/mwp/default_args.py b/happytransformer/mwp/default_args.py index c5c1d089..4f84f99f 100644 --- a/happytransformer/mwp/default_args.py +++ b/happytransformer/mwp/default_args.py @@ -1,19 +1,10 @@ ARGS_MWP_TRAIN = { - 'max_length': 300, - 'batch_size': 16, 'learning_rate': 5e-5, - 'epochs': 2, - - -} - -ARGS_MWP_TEST = { - # eventually we'll add settings - -} - -ARGS_MWP_EVAL = { - - # eventually we'll add settings + 'weight_decay': 0, + 'adam_beta1': 0.9, + 'adam_beta2': 0.999, + 'adam_epsilon': 1e-8, + 'max_grad_norm': 1.0, + 'num_train_epochs': 3.0, } \ No newline at end of file diff --git a/happytransformer/mwp/trainer.py b/happytransformer/mwp/trainer.py index 711eb9e2..8e8090e4 100644 --- a/happytransformer/mwp/trainer.py +++ b/happytransformer/mwp/trainer.py @@ -1,17 +1,14 @@ -from happytransformer.trainer import Trainer -from happytransformer.mwp.default_args import ARGS_MWP_EVAL, ARGS_MWP_TEST, ARGS_MWP_TRAIN +from happytransformer.happy_trainer import HappyTrainer +from happytransformer.mwp.default_args import ARGS_MWP_TRAIN -class QATrainer(Trainer): - - def __init__(self, model, model_type, tokenizer, device, logger): - super(QATrainer, self).__init__(model, model_type, tokenizer, device, logger) +class 
MWPTrainer(HappyTrainer): def train(self, input_filepath, args=ARGS_MWP_TRAIN): raise NotImplementedError() - def test(self, input_filepath, answers_to_question, output_filepath, args=ARGS_MWP_TEST): + def eval(self, input_filepath): raise NotImplementedError() - def eval(self, input_filepath, answers_to_question, args=ARGS_MWP_EVAL, output_filepath=None): + def test(self, input_filepath, pipeline): raise NotImplementedError() \ No newline at end of file diff --git a/happytransformer/mwp/util.py b/happytransformer/mwp/util.py deleted file mode 100644 index e69de29b..00000000 diff --git a/happytransformer/qa/default_args.py b/happytransformer/qa/default_args.py index 5877ce79..576987af 100644 --- a/happytransformer/qa/default_args.py +++ b/happytransformer/qa/default_args.py @@ -1,17 +1,27 @@ ARGS_QA_TRAIN = { - 'max_length': 300, - 'batch_size': 16, 'learning_rate': 5e-5, - 'epochs': 2, - + 'weight_decay': 0, + 'adam_beta1': 0.9, + 'adam_beta2': 0.999, + 'adam_epsilon': 1e-8, + 'max_grad_norm': 1.0, + 'num_train_epochs': 3.0, } -ARGS_QA_TEST = { - # eventually we'll add settings - -} - -ARGS_QA_EVAL = { - # eventually we'll add settings -} +# maybe implement later +# 'max_steps': -1, +# 'warmup_steps': 0, +# # 'logging_dir': #todo +# 'logging_first_step': False, +# 'logging_steps': 500, +# 'save_steps': 500, +# #'save_total_limit': +# 'no_cuda': False, +# 'seed': 42, +# 'fp16': False, +# 'fp16_opt_level': "O1", +# 'local_rank': -1, +# #'tpu_num_cores':, +# "debug": False, +# diff --git a/happytransformer/qa/trainer.py b/happytransformer/qa/trainer.py index d102b742..182fc030 100644 --- a/happytransformer/qa/trainer.py +++ b/happytransformer/qa/trainer.py @@ -7,18 +7,15 @@ https://huggingface.co/transformers/custom_datasets.html#question-answering-with-squad-2-0 """ -import time import csv +from tqdm import tqdm import torch -from torch.utils.data import DataLoader -from transformers import AdamW - from happytransformer.happy_trainer import HappyTrainer class QATrainer(HappyTrainer): - - def __init__(self, model, model_type, tokenizer, device, logger): - super().__init__(model, model_type, tokenizer, device, logger) + """ + Trainer class for HappyTextClassification + """ def train(self, input_filepath, args): """ @@ -26,149 +23,56 @@ def train(self, input_filepath, args): """ #todo: add time elapsed and test time remaining similar to what is within eval - contexts, questions, answers = self.__get_data(input_filepath) + contexts, questions, answers = self._get_data(input_filepath) self.__add_end_idx(contexts, answers) encodings = self.tokenizer(contexts, questions, truncation=True, padding=True) self.__add_token_positions(encodings, answers) dataset = QuestionAnsweringDataset(encodings) - self.model.train() - - train_loader = DataLoader(dataset, batch_size=args['batch_size'], shuffle=True) - - optim = AdamW(self.model.parameters(), lr=args['learning_rate']) - - for epoch in range(args['epochs']): - epoch_output = "Epoch: " + str(epoch) + "\n\n" - self.logger.info(epoch_output) - batch_num = 0 - for batch in train_loader: - batch_output = "Batch: " + str(batch_num) - self.logger.info(batch_output) - optim.zero_grad() - input_ids = batch['input_ids'].to(self.device) - attention_mask = batch['attention_mask'].to(self.device) - start_positions = batch['start_positions'].to(self.device) - end_positions = batch['end_positions'].to(self.device) - outputs = self.model(input_ids, attention_mask=attention_mask, - start_positions=start_positions, - end_positions=end_positions) - loss = outputs[0] - 
loss.backward() - optim.step() - batch_logger_output = "Batch: " + str(batch_num)\ - + " loss: " + str(round(loss.item(), 6)) - self.logger.info(batch_logger_output) - batch_num += 1 - self.model.eval() - - def eval(self, input_filepath, solve, output_filepath, args): + self._run_train(dataset, args) + + + def eval(self, input_filepath): """ See docstring in HappyQuestionAnswering.eval() - solve: HappyQuestionAnswering.answers_to_question() """ - contexts, questions, answers = self.__get_data(input_filepath) - init_time = time.time() - correct = 0 - count = 0 - total = len(contexts) - update_interval = self._get_update_interval(total) - - results = list() - - for case in zip(contexts, questions, answers): - context = case[0] - question = case[1] - answer = case[2] - - result = solve(question, context, k=1000)[0] - output_text = result["text"] - output_softmax = result["softmax"] - - # todo modify the qa functionality to output with correct capitalization - - compare_answer = answer["answer_text"].lower() + contexts, questions, answers = self._get_data(input_filepath) - if output_text == compare_answer: - correct += 1 - - results.append( - { - "contexts": context, - "questions": question, - "answer": answer["answer_text"].lower(), - "outputs": output_text, - "correct": output_text == compare_answer, - "softmax": output_softmax - - } - ) - count += 1 - - self._print_status(init_time, count, total, update_interval, correct/count) - - score = correct/total - ending = str(round(score, 2) * 100) + "%" - - result_output = "Evaluating Result: " + str(correct) + "/" + str(total) + " -- " + ending - self.logger.info(result_output) - - fieldnames = ["contexts", "questions", "answer", "outputs", "correct", "softmax"] - self._output_result_to_csv(output_filepath, fieldnames, results) + self.__add_end_idx(contexts, answers) + encodings = self.tokenizer(contexts, questions, truncation=True, padding=True) + self.__add_token_positions(encodings, answers) + dataset = QuestionAnsweringDataset(encodings) + return self._run_eval(dataset) - return score - def test(self, input_filepath, solve, output_filepath, args): + def test(self, input_filepath, pipeline): """ See docstring in HappyQuestionAnswering.test() - solve: HappyQuestionAnswering.answers_to_question() - """ - contexts, questions = self.__get_data(input_filepath, test_data=True) - init_time = time.time() - total = len(contexts) - count = 0 - update_interval = self._get_update_interval(total) + contexts, questions = self._get_data(input_filepath, test_data=True) results = list() - for case in zip(contexts, questions): + for case in tqdm(zip(contexts, questions)): context = case[0] question = case[1] + result = pipeline(question, context) - result = solve(question, context, k=1000)[0] - output_text = result["text"] - output_softmax = result["softmax"] - - # todo modify the qa functionality to output with correct capitalization - results.append( - { - "contexts": context, - "questions": question, - "outputs": output_text, - "softmax": output_softmax - } - ) - - self._print_status(init_time, count, total, update_interval, None) - count += 1 + results.append(result) - fieldnames = ["contexts", "questions", "outputs", "softmax"] - self._output_result_to_csv(output_filepath, fieldnames, results) + return results - result_output = "Output saved to: " + output_filepath - count += 1 - self.logger.info(result_output) @staticmethod - def __get_data(filepath, test_data=False): + def _get_data(filepath, test_data=False): """ - Used for parsing data for training and 
evaluating (both contain labels) + Used to collect :param filepath: a string that contains the location of the data - :return: + :return: if test_data = False contexts, questions, answers (all strings) + else: contexts, questions """ contexts = [] questions = [] @@ -203,10 +107,10 @@ def __add_end_idx(contexts, answers): answer['answer_end'] = end_idx elif context[start_idx - 1:end_idx - 1] == gold_text: answer['answer_start'] = start_idx - 1 - answer['answer_end'] = end_idx - 1 # When the gold label is off by one character + answer['answer_end'] = end_idx - 1 elif context[start_idx - 2:end_idx - 2] == gold_text: answer['answer_start'] = start_idx - 2 - answer['answer_end'] = end_idx - 2 # When the gold label is off by two characters + answer['answer_end'] = end_idx - 2 else: print("error: implement skipping training answer") @@ -216,7 +120,6 @@ def __add_token_positions(self, encodings, answers): for i in range(len(answers)): start_positions.append(encodings.char_to_token(i, answers[i]['answer_start'])) end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1)) - # if None, the answer passage has been truncated if start_positions[-1] is None: start_positions[-1] = self.tokenizer.model_max_length if end_positions[-1] is None: @@ -237,4 +140,4 @@ def __getitem__(self, idx): return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} def __len__(self): - return len(self.encodings.input_ids) \ No newline at end of file + return len(self.encodings.input_ids) diff --git a/happytransformer/qa/util.py b/happytransformer/qa/util.py deleted file mode 100644 index fa81ef79..00000000 --- a/happytransformer/qa/util.py +++ /dev/null @@ -1,75 +0,0 @@ -from collections import namedtuple -import torch -from happytransformer.tasks_util import biggest_sums - -QAAnswerLogit = namedtuple('QaAnswerLogit', [ - 'start_idx', 'end_idx', 'logit' -]) - -def qa_logits(start_logits, end_logits): - """ - Compute the logits for top qa pairs - :param start_logits: tensor from qa model output - :param end_logits: tensor from qa model output - :returns: generator of namedtuples of the form - (start_idx, end_idx, logit), sorted in descending order - by score - """ - - sorted_starts_tensors = torch.sort(start_logits, descending=True) - sorted_ends_tensors = torch.sort(end_logits, descending=True) - # start logits sorted in descending order INDEPENDENTLY - sorted_start_scores = sorted_starts_tensors.values.tolist() - sorted_start_indices = sorted_starts_tensors.indices.tolist() - # end logits sorted in descending order INDEPENDENTLY - sorted_end_scores = sorted_ends_tensors.values.tolist() - sorted_end_indices = sorted_ends_tensors.indices.tolist() - # start logit + end logit pairs sorted in descending order - # of their sum TOGETHER - all_answers = ( - QAAnswerLogit( - start_idx=sorted_start_indices[sum_pair.idx1], - end_idx=sorted_end_indices[sum_pair.idx2], - logit=sum_pair.sum - ) - for sum_pair in - biggest_sums(sorted_start_scores, sorted_end_scores) - ) - # filter for only answers which have end at or after start - legit_answers = ( - answer - for answer in all_answers - if answer.end_idx >= answer.start_idx - ) - return legit_answers - -QAProbability = namedtuple('QaProbability', [ - 'start_idx', 'end_idx', 'probability' -]) - -def qa_probabilities(start_logits, end_logits, k): - """ - Computes the top k qa probabilities, in terms of indices. 
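[Editor's note] Background on the __add_token_positions helper above: char_to_token is provided by fast-tokenizer BatchEncoding objects and maps a character offset in the original text to a token index, which is exactly what SQuAD-style start/end labels need. A small illustration using the first row of data/qa/train-eval.csv:

from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
enc = tokenizer(["October 31st is the date"], ["what is the date?"],
                truncation=True, padding=True)
# The answer "October 31st" spans characters [0, 12) of the context
# (sequence 0 in each pair, which is the tokenizer's default).
start_token = enc.char_to_token(0, 0)     # token index of character 0
end_token = enc.char_to_token(0, 12 - 1)  # token index of the last character
print(start_token, end_token)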
- :param start_logits: tensor from qa model output - :param end_logits: tensor from qa model output - :param k: number of results to return - :returns: list of namedtuples of the form (text,probability) - """ - top_answers = [ - qa_logit - for qa_logit, _ in zip(qa_logits(start_logits, end_logits), range(k)) - ] - logit_scores = torch.tensor([ - answer.logit - for answer in top_answers - ]) - - probabilities = torch.nn.Softmax(dim=0)(logit_scores).tolist() - return [ - QAProbability( - start_idx=answer.start_idx, - end_idx=answer.end_idx, - probability=probability - ) - for answer, probability in zip(top_answers, probabilities) - ] diff --git a/happytransformer/tc/trainer.py b/happytransformer/tc/trainer.py index 6eb41939..09206bd7 100644 --- a/happytransformer/tc/trainer.py +++ b/happytransformer/tc/trainer.py @@ -10,6 +10,7 @@ import csv import torch from happytransformer.happy_trainer import HappyTrainer +from tqdm import tqdm class TCTrainer(HappyTrainer): @@ -31,14 +32,20 @@ def eval(self, input_filepath): return self._run_eval(eval_dataset) + def test(self, input_filepath, pipeline): + """ + See docstring in HappyQuestionAnswering.test() + solve: HappyQuestionAnswering.answers_to_question() + """ + contexts = self._get_data(input_filepath, test_data=True) - def test(self, input_filepath): - contexts = self._get_data(input_filepath, True) - test_encodings = self.tokenizer(contexts, truncation=True, padding=True) - test_dataset = TextClassificationDatasetTest(test_encodings, len(contexts)) + results = list() - return self._run_test(test_dataset) + for context in tqdm(contexts): + result = pipeline(context) + results.append(result) + return results @staticmethod def _get_data(filepath, test_data=False): @@ -62,7 +69,6 @@ def _get_data(filepath, test_data=False): return contexts - class TextClassificationDataset(torch.utils.data.Dataset): """ A class to allow the training and testing data to be used by diff --git a/happytransformer/tc/util.py b/happytransformer/tc/util.py deleted file mode 100644 index e69de29b..00000000 diff --git a/happytransformer/util.py b/happytransformer/util.py deleted file mode 100644 index 753a1544..00000000 --- a/happytransformer/util.py +++ /dev/null @@ -1,48 +0,0 @@ -from collections import namedtuple -from numpy import exp -SumPair = namedtuple('SumPair', ['idx1', 'idx2', 'sum']) - - -def biggest_sums(items_a, items_b): - ''' - compute biggest sums from two descending ordered lists, - labeled by indices - :param items_a: list of numeric values, sorted descendingly - :param items_b: list of numeric values, sorted descendingly - :returns: list of namedtuples of the form (idx1,idx2,sum), - sorted by descending sum - ''' - a_index = b_index = 0 - while a_index < len(items_a) and b_index < len(items_b): - yield SumPair( - a_index, b_index, - sum=items_a[a_index] + items_b[b_index] - ) - # increment in whichever direction has smaller gain - # fallback to -inf at end of list. - # this will always be taken last. 
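[Editor's note] Aside on qa_probabilities, deleted above: the scores it produced were simply a softmax over the top-k summed start/end logits, which QuestionAnsweringPipeline now computes internally. The arithmetic in two lines of torch (example values are illustrative):

import torch

logit_sums = torch.tensor([14.0, 13.0, 11.0])  # e.g. the biggest start+end sums
probs = torch.nn.functional.softmax(logit_sums, dim=0)
print(probs.tolist())  # ~[0.705, 0.259, 0.035]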
- next_from_a = items_a[a_index + 1] if a_index + 1 < len(items_a) else float('-inf') - next_from_b = items_b[b_index + 1] if b_index + 1 < len(items_b) else float('-inf') - - diff_a = items_a[a_index] - next_from_a - diff_b = items_b[b_index] - next_from_b - - if diff_a >= diff_b: - b_index += 1 - else: - a_index += 1 - - -def softmax_of_matrix(matrix): - """ - - :param matrix: A list of lists of logits - :return: a list of lists of softmax values - """ - result = list() - - for logits in matrix: - e_logits = exp(logits) - softmax = e_logits/e_logits.sum() - result.append(softmax.tolist()) - return result diff --git a/tests/test_qa.py b/tests/test_qa.py new file mode 100644 index 00000000..020e6171 --- /dev/null +++ b/tests/test_qa.py @@ -0,0 +1,51 @@ +""" +Tests for the question answering training, evaluating and testing functionality +""" + +from happytransformer.happy_question_answering import HappyQuestionAnswering + + +def test_qa_answer_question(): + happy_qa = HappyQuestionAnswering() + result = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?") + answer = {'score': 0.9696964621543884, 'start': 16, 'end': 32, 'answer': 'January 8th 2021'} + assert result == answer + + +def test_qa_answer_question_top_k(): + happy_qa = HappyQuestionAnswering() + result = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?", topk=3) + answer = [{'score': 0.9696964621543884, 'start': 16, 'end': 32, 'answer': 'January 8th 2021'}, {'score': 0.02050216868519783, 'start': 16, 'end': 27, 'answer': 'January 8th'}, {'score': 0.005092293489724398, 'start': 16, 'end': 23, 'answer': 'January'}] + assert result == answer + +def test_qa_train(): + happy_qa = HappyQuestionAnswering() + happy_qa.train("../data/qa/train-eval.csv") + + +def test_qa_eval(): + happy_qa = HappyQuestionAnswering() + result = happy_qa.eval("../data/qa/train-eval.csv") + assert result["eval_loss"] == 0.11738169193267822 + + +def test_qa_test(): + happy_qa = HappyQuestionAnswering() + result = happy_qa.test("../data/qa/test.csv") + answer = [{'score': 0.9939756989479065, 'start': 0, 'end': 12, 'answer': 'October 31st'}, {'score': 0.967872679233551, 'start': 12, 'end': 25, 'answer': 'November 23rd'}] + assert result == answer + + +def test_qa_train_effectiveness(): + """ + Ensures that HappyQuestionAnswering.train() results in + lowering the loss as determined by HappyQuestionAnswering.eval() + """ + + happy_qa = HappyQuestionAnswering() + before_loss = happy_qa.eval("../data/qa/train-eval.csv")["eval_loss"] + happy_qa.train("../data/qa/train-eval.csv") + after_loss = happy_qa.eval("../data/qa/train-eval.csv")["eval_loss"] + + assert after_loss < before_loss + diff --git a/tests/test_qa_trainer.py b/tests/test_qa_trainer.py deleted file mode 100644 index 4962b4d0..00000000 --- a/tests/test_qa_trainer.py +++ /dev/null @@ -1,55 +0,0 @@ -""" -Tests for the question answering training, evaluating and testing functionality -""" - -from happytransformer.happy_question_answering import HappyQuestionAnswering - - -def test_qa_train_eval(): - """ - Tests - HappyQuestionAnswering.eval() - HappyQuestionAnswering.train() - - """ - happy_qa = HappyQuestionAnswering() - # Test 1 - start_answers = happy_qa.answers_to_question("What is the date?", "October 31st is the date") - - # Test 2 - before = happy_qa.eval("../data/test_qa_trainer/sample-qa-training-eval-data.csv", - output_filepath="../data/test_qa_trainer/results/output-eval-before-qa.csv") - - 
happy_qa.train("../data/test_qa_trainer/sample-qa-training-eval-data.csv") - - - # Test 1 - end_answers = happy_qa.answers_to_question("What is the date?", "October 31st is the date") - assert start_answers[0]["text"] == "october 31st" - assert end_answers[0]["text"] == "october 31st" - assert end_answers[0]["softmax"] > start_answers[0]["softmax"] - - # Test 2 - after = happy_qa.eval("../data/test_qa_trainer/sample-qa-training-eval-data.csv", - output_filepath="../data/test_qa_trainer/results/output-eval-after-qa.csv") - assert after >= before - - # Test 3: - #todo ensure the output csv file makes sense for eval - - -def test_qa_testing(): - """ - tests: - - HappyQuestionAnswering.test() - - """ - - happy_qa = HappyQuestionAnswering() - happy_qa.test("../data/test_qa_trainer/sample-qa-test-data.csv", - output_filepath="../data/test_qa_trainer/results/output-test-output.csv") - - #todo ensure the output csv file makes sense - - diff --git a/tests/test_sequence.csv b/tests/test_sequence.csv deleted file mode 100644 index 94cf4b3a..00000000 --- a/tests/test_sequence.csv +++ /dev/null @@ -1,2 +0,0 @@ -0,"I hate you" -1,"I love you" \ No newline at end of file diff --git a/tests/test_task_util.py b/tests/test_task_util.py deleted file mode 100644 index 993f591c..00000000 --- a/tests/test_task_util.py +++ /dev/null @@ -1,25 +0,0 @@ -""" -Contains tests for functions found within util.py -""" - -from happytransformer.util import SumPair, biggest_sums -def test_biggest_sums(): - """ - Tests the biggest_sums function - """ - items_a = [7, 4, 3] - items_b = [7, 6, 4] - - expected_pairs = [ - SumPair(idx1=0, idx2=0, sum=14), # 7+7 - SumPair(idx1=0, idx2=1, sum=13), # 7+6 - SumPair(idx1=0, idx2=2, sum=11), # 7+4 - SumPair(idx1=1, idx2=2, sum=8), # 4+4 - SumPair(idx1=2, idx2=2, sum=7) # 3+4 - ] - computed_pairs = biggest_sums(items_a, items_b) - assert all( - expected_pair == computed_pair - for expected_pair, computed_pair in - zip(expected_pairs, computed_pairs) - ) diff --git a/tests/test_tc.py b/tests/test_tc.py index 522d156c..4bbf054c 100644 --- a/tests/test_tc.py +++ b/tests/test_tc.py @@ -12,7 +12,21 @@ def test_classify_text(): """ happy_tc = HappyTextClassification() result = happy_tc.classify_text("What a great movie") - assert result["answer"] == 1 and result["softmax"][1] == 0.9998726844787598 + print(result) + answer = [{'label': 'POSITIVE', 'score': 0.9998726844787598}] + assert result == answer + +def test_classify_texts(): + """ + Tests + HappyQuestionAnswering.classify_text() + + """ + happy_tc = HappyTextClassification() + input = ["What a great movie", "Horrible movie", "Bad restaurant"] + result = happy_tc.classify_text(input) + answer = [{'label': 'POSITIVE', 'score': 0.9998726844787598}, {'label': 'NEGATIVE', 'score': 0.9997945427894592}, {'label': 'NEGATIVE', 'score': 0.9997393488883972}] + assert result == answer def test_qa_train(): """ @@ -43,9 +57,8 @@ def test_qa_test(): happy_tc = HappyTextClassification() result = happy_tc.test("../data/tc/test.csv") - expected_result = [[0.00015978473364387978, 0.9998402152663561], [0.9772132247336673, 0.022786775266332746], [0.9966067733093962, 0.0033932266906038368], [0.020770484301764973, 0.979229515698235]] - - assert result == expected_result + answer = [[{'label': 'POSITIVE', 'score': 0.9998401999473572}], [{'label': 'NEGATIVE', 'score': 0.9772131443023682}], [{'label': 'NEGATIVE', 'score': 0.9966067671775818}], [{'label': 'POSITIVE', 'score': 0.9792295098304749}]] + assert result == answer def test_qa_train_effectiveness(): From 
1093c1b56cf5699a2492c7f0b60b763e0c71cadb Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Fri, 8 Jan 2021 21:22:11 -0500 Subject: [PATCH 020/155] Added maintainers --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 59d64cda..88758487 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ * [Masked Word Prediction Fine-Tuning](#Masked-Word-Prediction-Fine-Tuning) * [Tech](#Tech) * [Call For Contributors](#Call-For-Contributors) +* [Maintainers](#Maintainers) ## News: @@ -704,3 +705,7 @@ print(results) # prints: [{'word': 'mathematics', 'softmax': 0.16551}, {'word': ### Call for contributors Happy Transformer is a new and growing API. We're seeking more contributors to help accomplish our mission of making state-of-the-art AI easier to use. + +### Maintainers +- [Eric Fillion](https://github.com/ericfillion) Lead Maintainer +- [Ted Brownlow](https://github.com/ted537) Maintainer From 696b20cd6fe8ab18bd31dff4ef022f3a2376f3bb Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Fri, 8 Jan 2021 23:29:53 -0500 Subject: [PATCH 021/155] Added TC to __init__.py --- happytransformer/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/happytransformer/__init__.py b/happytransformer/__init__.py index 2ac707aa..5a213275 100644 --- a/happytransformer/__init__.py +++ b/happytransformer/__init__.py @@ -1,3 +1,5 @@ from happytransformer.happy_question_answering import HappyQuestionAnswering from happytransformer.happy_word_prediction import HappyWordPrediction +from happytransformer.happy_text_classification import HappyTextClassification + name = "happytransformer" From 58c7d5731d782e78c0f497042ae7a5976bd40e7c Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 00:07:58 -0500 Subject: [PATCH 022/155] predict_mask() to HappyWordPrediction --- happytransformer/happy_word_prediction.py | 44 ++++++++++++++++------- tests/test_multi_mask.py | 42 ---------------------- tests/test_mwp.py | 35 ++++++++++++++++++ 3 files changed, 66 insertions(+), 55 deletions(-) delete mode 100644 tests/test_multi_mask.py create mode 100644 tests/test_mwp.py diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index dfc0991b..7459db13 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -1,10 +1,8 @@ -from happytransformer.qa.trainer import QATrainer - from transformers import ( BertForMaskedLM, BertTokenizerFast, - RobertaForMaskedLM, - RobertaTokenizerFast, + DistilBertForMaskedLM, + DistilBertTokenizerFast, FillMaskPipeline, ) @@ -15,8 +13,8 @@ class HappyWordPrediction(HappyTransformer): - def __init__(self, model_type="BERT", - model_name="bert-large-uncased-whole-word-masking-finetuned-squad", device=None): + def __init__(self, model_type="DISTILBERT", + model_name="distilbert-base-uncased"): model = None tokenizer = None @@ -24,18 +22,38 @@ def __init__(self, model_type="BERT", model = BertForMaskedLM.from_pretrained(model_name) tokenizer = BertTokenizerFast.from_pretrained(model_name) - elif model_type == "ROBERTA": - model = RobertaForMaskedLM.from_pretrained(model_name) - tokenizer = RobertaTokenizerFast.from_pretrained(model_name) + elif model_type == "DISTILBERT": + model = DistilBertForMaskedLM.from_pretrained(model_name) + tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) - super().__init__(model_type, model_name, model, tokenizer, device) + super().__init__(model_type, model_name, model, tokenizer) device_number = 1 if 
torch.cuda.is_available() else -1 self._pipeline = FillMaskPipeline(model=model, tokenizer=tokenizer, device=device_number) self._trainer = MWPTrainer(model, model_type, tokenizer, self._device, self.logger) - def predict_masks(self): - raise NotImplementedError() + def predict_masks(self, text, targets=None, top_k =1): + """ + + :param text: Either a single string, or strings that contains masks + :param targets: Optional. A list of strings of potential answers. All other answers will be ignored + :param top_k: number of results. Default is 1 + :return: If top_k ==1: a dictionary with the keys "score" and "token_str" + if top_k >1: a list of dictionaries described above in order by score + """ + result = self._pipeline(text, targets=targets, top_k=top_k) + + if top_k ==1: + result = result[0] + del result['sequence'] + del result['token'] + else: + for answer in result: + del answer['sequence'] + del answer['token'] + return result + + def predict_mask(self): raise NotImplementedError() @@ -47,4 +65,4 @@ def eval(self, input_filepath): return self._trainer.eval(input_filepath=input_filepath) def test(self, input_filepath): - return self._trainer.test(input_filepath=input_filepath,) + return self._trainer.test(input_filepath=input_filepath,pipeline=self._pipeline) diff --git a/tests/test_multi_mask.py b/tests/test_multi_mask.py deleted file mode 100644 index 50aa52f3..00000000 --- a/tests/test_multi_mask.py +++ /dev/null @@ -1,42 +0,0 @@ -from happytransformer import HappyBERT - -happy = HappyBERT() - -def test_multi_mask(): - # should give something like - # "I have a great dog and I love him so much" - all_predictions = happy.predict_masks( - "[MASK] have a [MASK] dog and I love [MASK] so much", - num_results=2 - ) - assert len(all_predictions) == 3 - assert all( - len(specific_predictions) == 2 - for specific_predictions in all_predictions - ) - assert all_predictions[0][0]["word"] == 'i' - assert all_predictions[0][0]["softmax"] > 0.5 - - assert all_predictions[2][0]["word"] == 'him' - -def test_multi_mask_options(): - MASKS_OPTIONS = [ - ['I', 'You'], - ['big', 'small'], - ['him', 'her'] - ] - options_set = set( - option - for mask in MASKS_OPTIONS - for option in mask - ) - all_predictions = happy.predict_masks( - "[MASK] have a [MASK] dog and I love [MASK] so much", - options=MASKS_OPTIONS - ) - assert len(all_predictions) == 3 - assert all( - prediction["word"] in options_set - for mask_predictions in all_predictions - for prediction in mask_predictions - ) \ No newline at end of file diff --git a/tests/test_mwp.py b/tests/test_mwp.py new file mode 100644 index 00000000..4ab2aa6d --- /dev/null +++ b/tests/test_mwp.py @@ -0,0 +1,35 @@ +from happytransformer import HappyWordPrediction + + +def test_mwp_basic(): + happy_mwp = HappyWordPrediction() + result = happy_mwp.predict_masks( + "Please pass the salt and [MASK]", + ) + answer = {'score': 0.2664579749107361, 'token_str': 'pepper'} + assert result == answer + + +def test_mwp_top_k(): + happy_mwp = HappyWordPrediction() + result = happy_mwp.predict_masks( + "Please pass the salt and [MASK]", + top_k=2 + ) + answer = [{'score': 0.2664579749107361, 'token_str': 'pepper'}, {'score': 0.08760260790586472, 'token_str': 'vinegar'}] + + assert result == answer + +def test_mwp_targets(): + happy_mwp = HappyWordPrediction() + result = happy_mwp.predict_masks( + "Please pass the salt and [MASK]", + targets=["water", "spices"] + ) + answer = {'score': 0.014856964349746704, 'token_str': 'water'} + assert result == answer + + +def 
test_mwp_train():
+    happy_mwp = HappyWordPrediction()
+    happy_mwp.train()

From 3cea815d3a74f2a4c24a18df06f9d71ca4281f5f Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Sat, 9 Jan 2021 00:42:07 -0500
Subject: [PATCH 023/155] cleaned MWP

---
 happytransformer/happy_word_prediction.py | 36 +++++++++++------------
 tests/test_mwp.py                         |  5 ----
 2 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py
index 7459db13..8573584f 100644
--- a/happytransformer/happy_word_prediction.py
+++ b/happytransformer/happy_word_prediction.py
@@ -13,6 +13,9 @@
 
 
 class HappyWordPrediction(HappyTransformer):
+    """
+    A user-facing class for word prediction
+    """
     def __init__(self, model_type="DISTILBERT",
                  model_name="distilbert-base-uncased"):
         model = None
@@ -28,22 +31,23 @@ def __init__(self, model_type="DISTILBERT",
 
         super().__init__(model_type, model_name, model, tokenizer)
         device_number = 1 if torch.cuda.is_available() else -1
-        self._pipeline = FillMaskPipeline(model=model,
-                                          tokenizer=tokenizer, device=device_number)
-        self._trainer = MWPTrainer(model,
-                                   model_type, tokenizer, self._device, self.logger)
-
-    def predict_masks(self, text, targets=None, top_k =1):
-        """
+        self._pipeline = FillMaskPipeline(model=model, tokenizer=tokenizer, device=device_number)
+        self._trainer = MWPTrainer(model, model_type, tokenizer, self._device, self.logger)
-
-        :param text: Either a single string, or strings that contains masks
-        :param targets: Optional. A list of strings of potential answers. All other answers will be ignored
+
+    def predict_masks(self, text, targets=None, top_k=1):
+        """
+        :param text: A string that contains the model's mask token
+        :param targets: Optional. A list of strings of potential answers.
+        All other answers will be ignored
         :param top_k: number of results. 
Default is 1 :return: If top_k ==1: a dictionary with the keys "score" and "token_str" if top_k >1: a list of dictionaries described above in order by score """ - result = self._pipeline(text, targets=targets, top_k=top_k) + if isinstance(text, list): + raise ValueError("the \"text\" argument must be a single string") - if top_k ==1: + result = self._pipeline(text, targets=targets, top_k=top_k) + if top_k == 1: result = result[0] del result['sequence'] del result['token'] @@ -53,16 +57,12 @@ def predict_masks(self, text, targets=None, top_k =1): del answer['token'] return result - - - def predict_mask(self): - raise NotImplementedError() - def train(self, input_filepath, args): - self._trainer.train(input_filepath=input_filepath, args=args) + raise NotImplementedError("train() is currently not available") def eval(self, input_filepath): - return self._trainer.eval(input_filepath=input_filepath) + raise NotImplementedError("eval() is currently not available") def test(self, input_filepath): - return self._trainer.test(input_filepath=input_filepath,pipeline=self._pipeline) + # self.logger.error("test() is currently not available") + raise NotImplementedError("test() is currently not available") diff --git a/tests/test_mwp.py b/tests/test_mwp.py index 4ab2aa6d..632b7189 100644 --- a/tests/test_mwp.py +++ b/tests/test_mwp.py @@ -28,8 +28,3 @@ def test_mwp_targets(): ) answer = {'score': 0.014856964349746704, 'token_str': 'water'} assert result == answer - - -def test_mwp_train(): - happy_mwp = HappyWordPrediction() - happy_mwp.train() From c41ee95b59f8e26ed1bc269249b860f89c4ce01f Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 00:56:06 -0500 Subject: [PATCH 024/155] Changed predict_masks() to predict_mask() --- happytransformer/happy_word_prediction.py | 2 +- tests/test_mwp.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index 8573584f..d07d3892 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -34,7 +34,7 @@ def __init__(self, model_type="DISTILBERT", self._pipeline = FillMaskPipeline(model=model, tokenizer=tokenizer, device=device_number) self._trainer = MWPTrainer(model, model_type, tokenizer, self._device, self.logger) - def predict_masks(self, text, targets=None, top_k=1): + def predict_mask(self, text, targets=None, top_k=1): """ :param text: A string that contains the model's mask token :param targets: Optional. A list of strings of potential answers. 
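
Taken together, patches 022-024 leave HappyWordPrediction with a single public
inference method, predict_mask(). A minimal usage sketch based on the tests in
tests/test_mwp.py below (defaults are model_type="DISTILBERT" with
distilbert-base-uncased, and the printed values are illustrative, not
guaranteed):

    from happytransformer import HappyWordPrediction

    happy_wp = HappyWordPrediction()  # DISTILBERT / distilbert-base-uncased by default

    # top_k == 1 (the default) returns a single dict with "score" and "token_str"
    result = happy_wp.predict_mask("Please pass the salt and [MASK]")
    print(result["token_str"])  # e.g. 'pepper'

    # top_k > 1 returns a list of such dicts, ordered by score
    results = happy_wp.predict_mask("Please pass the salt and [MASK]", top_k=2)

    # targets restricts scoring to the supplied candidate strings
    result = happy_wp.predict_mask("Please pass the salt and [MASK]",
                                   targets=["water", "spices"])

Note that passing a list of strings raises a ValueError, and that train(),
eval() and test() all raise NotImplementedError as of patch 023.
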
diff --git a/tests/test_mwp.py b/tests/test_mwp.py index 632b7189..41739899 100644 --- a/tests/test_mwp.py +++ b/tests/test_mwp.py @@ -3,7 +3,7 @@ def test_mwp_basic(): happy_mwp = HappyWordPrediction() - result = happy_mwp.predict_masks( + result = happy_mwp.predict_mask( "Please pass the salt and [MASK]", ) answer = {'score': 0.2664579749107361, 'token_str': 'pepper'} @@ -12,7 +12,7 @@ def test_mwp_basic(): def test_mwp_top_k(): happy_mwp = HappyWordPrediction() - result = happy_mwp.predict_masks( + result = happy_mwp.predict_mask( "Please pass the salt and [MASK]", top_k=2 ) @@ -22,7 +22,7 @@ def test_mwp_top_k(): def test_mwp_targets(): happy_mwp = HappyWordPrediction() - result = happy_mwp.predict_masks( + result = happy_mwp.predict_mask( "Please pass the salt and [MASK]", targets=["water", "spices"] ) From a581368c23d9bdc3e8ab281d5797f96ffddd24f5 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 01:13:46 -0500 Subject: [PATCH 025/155] deleted old files --- README.md | 1 - examples/Finetuning HappyROBERTA.ipynb | 259 ------------------------- examples/teaching_transformer.py | 125 ------------ img/HappyTransformer.png | Bin 68043 -> 0 bytes tests/load_transformer.py | 7 - tests/test_multi_mask.py | 42 ---- tests/test_next_sentence_probs.py | 52 ----- tests/test_predict.py | 17 -- tests/test_qa_multi.py | 34 ---- tests/test_sequence.py | 12 -- 10 files changed, 549 deletions(-) delete mode 100644 examples/Finetuning HappyROBERTA.ipynb delete mode 100644 examples/teaching_transformer.py delete mode 100644 img/HappyTransformer.png delete mode 100644 tests/load_transformer.py delete mode 100644 tests/test_multi_mask.py delete mode 100644 tests/test_next_sentence_probs.py delete mode 100644 tests/test_predict.py delete mode 100644 tests/test_qa_multi.py delete mode 100644 tests/test_sequence.py diff --git a/README.md b/README.md index 88758487..6652497a 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,6 @@ Last month, Happy Transformer was presented at a conference called C-Search, and ### June 9th, 2020 We're happy to announce that we won a Best Paper Award at the Canadian Undergraduate Conference for AI. We also received the highest score overall. The paper can be found [here](https://qmind.ca/wp-content/uploads/2020/05/Proceedings-of-CUCAI-2020.pdf) on page 67. -![HappyTransformer](img/HappyTransformer.png) Happy Transformer is an API built on top of [Hugging Face's transformer library](https://huggingface.co/transformers/) that makes it easy to utilize state-of-the-art NLP models. 
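
The notebook and example script removed below were written against the
pre-rewrite HappyBERT/HappyROBERTA interface (init_train_mwp(), predict_mask()
with num_results, and so on). Under the new architecture each task gets its own
class. A rough sketch of the replacement workflow, inferred from the test
suites earlier in this series; the CSV paths are the test fixtures, and eval()
is assumed to return a single comparable score, as the "assert after >= before"
check in the QA tests suggests:

    from happytransformer import HappyQuestionAnswering, HappyTextClassification

    # Question answering: evaluate, fine-tune, then re-evaluate on the same CSV
    happy_qa = HappyQuestionAnswering()
    before = happy_qa.eval("../data/test_qa_trainer/sample-qa-training-eval-data.csv")
    happy_qa.train("../data/test_qa_trainer/sample-qa-training-eval-data.csv")
    after = happy_qa.eval("../data/test_qa_trainer/sample-qa-training-eval-data.csv")
    assert after >= before  # fine-tuning should not hurt the eval score

    # Text classification: classify_text() returns a list of label/score dicts
    happy_tc = HappyTextClassification()
    print(happy_tc.classify_text("What a great movie"))
    # e.g. [{'label': 'POSITIVE', 'score': 0.9998...}]
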
diff --git a/examples/Finetuning HappyROBERTA.ipynb b/examples/Finetuning HappyROBERTA.ipynb deleted file mode 100644 index 9a3e23f4..00000000 --- a/examples/Finetuning HappyROBERTA.ipynb +++ /dev/null @@ -1,259 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Installing HappyTransformer " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "!pip install happytransformer" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Instantiating a HappyROBERTA object" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "from happytransformer import HappyROBERTA" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "02/17/2020 22:13:53 - INFO - happytransformer.happy_transformer - Using model: cuda\n", - "02/17/2020 22:13:54 - INFO - transformers.tokenization_utils - loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json from cache at /root/.cache/torch/transformers/d0c5776499adc1ded22493fae699da0971c1ee4c2587111707a4d177d20257a2.ef00af9e673c7160b4d41cfda1f48c5f4cba57d5142754525572a846a1ab1b9b\n", - "02/17/2020 22:13:54 - INFO - transformers.tokenization_utils - loading file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt from cache at /root/.cache/torch/transformers/b35e7cd126cd4229a746b5d5c29a749e8e84438b14bcdb575950584fe33207e8.70bec105b4158ed9a1747fea67a43f5dee97855c64d62b6ec3742f4cfdb5feda\n" - ] - } - ], - "source": [ - "roberta = HappyROBERTA()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Loading the model for Masked Language Modelling" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "02/17/2020 22:13:54 - INFO - transformers.configuration_utils - loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json from cache at /root/.cache/torch/transformers/e1a2a406b5a05063c31f4dfdee7608986ba7c6393f7f79db5e69dcd197208534.a7ab0e5de2d8321d6d6a15b199110f2c99be72976b7d151423cb8d8c261a13b6\n", - "02/17/2020 22:13:54 - INFO - transformers.configuration_utils - Model config {\n", - " \"architectures\": [\n", - " \"RobertaForMaskedLM\"\n", - " ],\n", - " \"attention_probs_dropout_prob\": 0.1,\n", - " \"finetuning_task\": null,\n", - " \"hidden_act\": \"gelu\",\n", - " \"hidden_dropout_prob\": 0.1,\n", - " \"hidden_size\": 768,\n", - " \"id2label\": {\n", - " \"0\": \"LABEL_0\",\n", - " \"1\": \"LABEL_1\"\n", - " },\n", - " \"initializer_range\": 0.02,\n", - " \"intermediate_size\": 3072,\n", - " \"is_decoder\": false,\n", - " \"label2id\": {\n", - " \"LABEL_0\": 0,\n", - " \"LABEL_1\": 1\n", - " },\n", - " \"layer_norm_eps\": 1e-05,\n", - " \"max_position_embeddings\": 514,\n", - " \"num_attention_heads\": 12,\n", - " \"num_hidden_layers\": 12,\n", - " \"num_labels\": 2,\n", - " \"output_attentions\": false,\n", - " \"output_hidden_states\": false,\n", - " \"output_past\": true,\n", - " \"pruned_heads\": {},\n", - " \"torchscript\": false,\n", - " \"type_vocab_size\": 1,\n", - " \"use_bfloat16\": false,\n", - " \"vocab_size\": 50265\n", - "}\n", - "\n", - "02/17/2020 22:13:55 - INFO - transformers.modeling_utils - loading 
weights file https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin from cache at /root/.cache/torch/transformers/228756ed15b6d200d7cb45aaef08c087e2706f54cb912863d2efe07c89584eb7.49b88ba7ec2c26a7558dda98ca3884c3b80fa31cf43a1b1f23aef3ff81ba344e\n", - "02/17/2020 22:14:00 - INFO - happytransformer.happy_transformer - You can now train a masked word prediction model using ROBERTA\n" - ] - } - ], - "source": [ - "roberta.init_train_mwp()" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "train_path = 'train/wiki.train.raw' # Path to training dataset\n", - "test_path = 'test/wiki.test.raw'# Path to testing dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "02/17/2020 22:14:16 - WARNING - transformers.tokenization_utils - Token indices sequence length is longer than the specified maximum sequence length for this model (2417780 > 512). Running this sequence through the model will result in indexing errors\n", - "02/17/2020 22:14:16 - INFO - happytransformer.mlm_utils - ***** Running training *****\n", - "02/17/2020 22:14:16 - INFO - happytransformer.mlm_utils - Num examples = 4722\n", - "02/17/2020 22:14:16 - INFO - happytransformer.mlm_utils - Batch size = 1\n", - "Epoch: 0%| | 0/1 [00:00 512). Running this sequence through the model will result in indexing errors\n", - "02/17/2020 22:28:44 - INFO - happytransformer.mlm_utils - ***** Running evaluation *****\n", - "02/17/2020 22:28:44 - INFO - happytransformer.mlm_utils - Num examples = 559\n", - "02/17/2020 22:28:44 - INFO - happytransformer.mlm_utils - Batch size = 2\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3da1fc53584f4600bb6af792b9d8fba0", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=280.0, style=ProgressStyle(description_w…" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "data": { - "text/plain": [ - "{'perplexity': 3.8661696910858154, 'eval_loss': 1.3522642510277885}" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "roberta.eval_mwp(test_path) # Evaluate on the testing dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.8" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/examples/teaching_transformer.py b/examples/teaching_transformer.py deleted file mode 100644 index c2819435..00000000 --- a/examples/teaching_transformer.py +++ /dev/null @@ -1,125 +0,0 @@ -""" -An example of how to use a HappyTRANSFORMER to create a -that gives instant feedback on written sentences -""" - -import random -from happytransformer import HappyROBERTA - -class TeachingTransformer: - """ - A class that organizes and runs a program that makes a game out of learning English - """ - - no_space_characters = ['.', 
',', '?', '!'] # characters that do not require a space before being used - - def __init__(self, transformer): - """ - :param transformer: A HappyTransformer object - """ - self.transformer = transformer - - def __predict_last_word(self, text: str): - """ - :param text: An incomplete sentence - :return: 1. A list with str word predictions - 2. A list with the corresponding softmax for each prediction. - Both lists share the same indexes for each prediction. - """ - text = text.strip() - text = text + " [MASK]" - predictions = self.transformer.predict_mask(text, num_results=400) - - word_predictions = list() - score_predictions = list() - special_characters = [".", ",", "?", "!", "$", "*"] # TODO add more special characters - - count = 0 - for prediction in predictions: - - word = prediction["word"] - word = word.strip() - - if word.isalpha() == True or word in special_characters: - word_predictions.append(word) - score_predictions.append(prediction["softmax"]) - count += 1 - if count == 200: - break - - return word_predictions, score_predictions - - def teach(self): - """ - Runs a terminal program for Teaching Transformer. - The user must enter one word at a time. - If the user enters the word "$clear," then the program - generates a new starting phrase. - If the user enters the word "$exit," then the terminal program - exits. - """ - sample_starts = ["To solve world hunger we must", "Humans are for", "I think therefore I"] - text = random.choice(sample_starts) - - while True: - word_predictions, score_predictions = self.__predict_last_word(text) - new_word = input(text + " ") - new_word = new_word.strip() - - # TODO check for valid input - if new_word == "$clear": - text = random.choice(sample_starts) - elif new_word == "$exit": - print("Exiting Teaching Transformer") - break - - else: - if new_word in self.no_space_characters: - text = text + new_word - - else: - text = text + " " + new_word - - top_word_predictions = word_predictions[:5] - prediction_index = 201 # default prediction index - - if new_word in word_predictions: - prediction_index = word_predictions.index(new_word) - prediction_score = score_predictions[prediction_index] - else: - prediction_score = 0 # new_word is not in the top 200 predictions - - if prediction_index == 0: - feedback = "PERFECT" - elif prediction_index < 5: - feedback = "amazing" - elif prediction_index < 8: - feedback = "great" - elif prediction_score > 0.05: - feedback = "good" - elif prediction_score > 0.03: - feedback = "okay" - elif prediction_score > 0.02: - feedback = 'Okay, but could be better' - elif prediction_score > 0.01: - feedback = "So so, there is room for improvement" - else: - feedback = "Nice try, look at the examples for better options" - - print(feedback) - print("Here are some suggestions: ", end="") - print(top_word_predictions) - print("\n\n") - - -def main(): - happy_roberta = HappyROBERTA('roberta-large') - - teching_transformer = TeachingTransformer(happy_roberta) - - teching_transformer.teach() - - -if __name__ == "__main__": - main() - diff --git a/img/HappyTransformer.png b/img/HappyTransformer.png deleted file mode 100644 index a521a95a6425df8ae8d1873d0b5c3366e4210c92..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 68043 zcmb?@1y@|X*6;v>ySqEZ-QBIY%b-OI6n6$Mu7#q5yL*ui#odZSfl?fbyL~+Ox%d4A z-&rdwnXH^lcCvl%oX;BS3g{>#C;$KeT}e?^3jl!8ek=S)2yZ=WpWJfaDj0Vy1!+L- zB-z2+4<8!?C0jK$0Lxn&2>^s40l@z=0O-%8<)eRRN8b0Y`p>_CPnlr=2d-^n z?g)TG?~bmIZL99XCpY4I=Ra?L2(R9XcJ*}ryT+h<99Ln}BC)(I?fLnS-}^L8k394i 
zGa^P1Q^}d~_cGUR3oz(d0-1-%R3`?#0Aov+hLiyJHwOAfX6ho5Cu-jEj6`rR<w8e6Zzz#CBw_iy=PlOZNa_(;Z3STpF+az*DKQ!Y~lWadYU>T7A3 zxwH$?Y`RmHqlYwA|M`9z!I0y5wV>1NU`%kqf7Y_w)J`Gnd~p#SED`MRJqKUtkI>e} zTd?Uwrp4i7gXN?HjwE11ps>{H%ioU{pLrvn+5~E$qsO(b%j$&^UR}tsKfB!5{qBWe zL_5uM9S`s@FmBhbC%$}jb-(w0ju>TqK3?dxUliX&SmO(GD*nNh!JL#**f;QP?-Tu< z=$I2e0w$S&tB#)6E>W9+K+M^{TDS4WMdOvu-P^?vtNfR|2F<4(mzRS%PNq-fKmPt+ zfD%?R^)xhWrKp*uS{Q-HT0HVLw;bJJ(re=A3Npe~MH1dT)Dgl*9eml)(tgL)08~kJ zV0w|4-FCiwexZ$-jk4Ua@_o?gOOxO0({Xj|%SD*T8|7#++xxR5jW-?cgmHt-{BjKs zd%vm)3j(^OKr)3zwImdZEE{Gf>u87SL#a%DuiH>Bie0FZwC&)9!|Ll(=T{Zm()VW~ z<0bC>q4?fw=wvzbc;1V($O6;uPjIj}k9-lXkrWAjDvV;LGVg*iymUM&{o!_(=jF2H z*?NGa@5^76U-Syeg0YZ~0j;)+3*&&9vHi0@ynMC;B$VP*1eTm)#ja7xm6E0Ap=zkm z$f%F1(jB$QhYDyaeEvv#w8%T>M?v}^0p*hU{NoqQ)aR?-Ex{6QskIz#-G3R|_Xwfz zwkm*^Evtp~R9_hBu_ofzFzN=X)!}9`$4;Yy{`#fM`<<9kN~kwN%!YouuZ*%mM{ah8 zOq_cE+#_nU>fJ4BHO>=JO%w@bC`6_TSy;HxRWS~#K{|*BAj6}8SBI*lyUSPLbK+l@gd zt;of{lGZB(8d{*!i$ViHGDL!1f-f8P*AiWK#H?#NI}D2u)TJr&kJYqZ{_oS$rFM@T zD|;atD8uM$#5|EP5IU()-H|?({SP=f+D#Wh>gQZMlBJoZF#Ss;Tv!9;=X&t|EOWIT zZoI5Gi!OUt=;+NMS7FacFnaQ#rs3Tv4s3#u^=N)T1@zuQR|0hSgN}m$nm}S=Fg^VI z;PQ9midFwcSk%jKcSj$W4W2X+! zOBYNVo?ql{$|`IkFm`hgvw!LiGYzGa{HpR*0;p+C`+ze9MA_xe9=P3M($k$h6rw#_ zT+keqeV%y+R{{;-!qTdK0t^fcXvNgBe~J7;r}xq9eC7VDb5`W~DFhj&yR0!F04J5^ zkDTKdHF-m*Am%gR-rF~VQ156mgAN}me&)%`FETkyMWKuyr)W5|W(Up(x7O(j7D zaV>(L@CzT;d(DfI0lHSr++0MB1;VCCgCid*6$L?hXTn~N*z2Kir7Z5aIs&x7MXc&_ zy9M|tb$pJ1K@EtfOVs}H~&*;<)ci7X{Tjej6|$YFZzgXS~2P%|});WD^p6 zer32I)EXHlyOLO@)<%vIF}cG)phG}UO()9ZW@JaTDEKfKT~93eS@h`h2Rv2lZn~9W zhnRB(m$v?_QEd~`K}rq4IK1!ww7O^Iq8W7z8|&)D69xtvMbRrdI}aH(myB} zzD&-2K(#_8PAe<^sh5I}03mRP5t6A@!cq=i3zr1gaZv!!p5b@JJ<^ZoUJpM%^C{Pr z>#9?5_N#6bLNC3y`b;Vuo*c~RHz|HGolC{CVM?qYHm(RYEZUL;(p91f@*j&f7h81E zv+a!jsa*2JW}&xujp0kcD8X=zFM`>R6Y@nP_Ef!g5=Bv2=AR9Z?J?6$JZK=Z_TV(l zO){fNm%;n~HoPS9ci9umh&%`i(1h=Qw#E6dffp%6Sf*#WGn!@&kG@bjWnRI4%)zgj3!Lw0xy&S*>;`e~fkvPaq+;@TtY$m+|9da}}7Jyu6%NiCV3RhCA6f#cR~&P@?n zQc$~e9Ixv-TAlr(wGAbYG#0nFF9r(U(L@+3$q_$T#;q`7`PTWseuF`8tPLP^OTMWw zjne32BT$N-eL8PDj={jdXf(ICKee2nTV%U^ei>(#%(1u<{+rWtH^1NDL-^+f9cyASGgoHwZA8%0p8 z?Pp0x`W0i{*W{D2nc}q(@(W9*{{fp9sZASJrF0y)X6gEG_Em<%)0~Nn z%<-#LhW9~U*vP1O-o3WArXQwPL2|dodO46yrfGI5h>7l>S;m&|coXy;tZ|JACPV00 zyNaWfJFZ7=7U50^@9a42cl6)>aIZB$Q%g8HGXLA?IdI`_BIa;d^KYF2A2S-Ph-_V{NawAs29VF+x+{EVq zj;oFiPF~&ugMY_%bl%@~f&LED@+2$dX~kJ_x_0vY^#=c?dY-z7LkA%2B!W`q8G@rgDwO`A zn`YSzpl`yHc+P@(Yr&OcWA~D+z$In3%+uO)acqLsYUuHl`$ctR{(<2eTmCRG7s2)RlgJ9c^oxS4eL$~^6> zOvi_ZE*wn4O=YVxL(|~iPZ~>RBba9ipu>J)jku|?a8YY9cu|L>2oFTYJVHvC>>@@m ztU>I*5-)t3XRe?e$B9hqMyTz*4^Gj#w((ja-<09R2dOSOV7~gHs}eVE-q;+zL?8$B zo(hDLS(Io3MORct5kaFCcH`sqWVD+#ZW|C761}dfD*rMJXTVNK9UfmRwqm@frX--U zfzl0x*9eH?hj*1unmhRNBQqa{qQt0t7R7f$ijEl1&Yp+ih1NkZ%g==$Yh znX6^u=%#v{A(vYUY^$q$_TCpA=A*w)SB$L(Hi_yt43|RzIToO|PT_0|Xc#1h2JAy( z=hQ6qTtJIxv!XZBMMS?@jl1(eaJR|7QdJmgTT6^wTStT_mh4)x>09}|UFQ(E>`yi)vl=lWdOthkQv%K2 zep2o;dz7nFC6h=Dt3ojb{WWk*lpBGwRS|Q9swM_~;N;^XvcZcYPl#e(T#|fM!kX$B zA5&mLX6X(^S~xe-G-yjWEfw6FIixoe83Fjztpi(vi-oOAEw4K8rmMiu!lzIA{_NM8 zzPD##P|5_45(KB=tBc&1sQe3e%%gxZSua){(t%%4$o>I}y`3T$K_9Vk$x4UF1m?7P z__&FQNl5GZx71R(`(fPw=459w1ai{y%fQl}Dw^9^ereTq%Orz7eywmtMR$Loj3qGd z`lCW3k9#tiVk(PC`o{%&H)}95*-3OfO{mSR1*|8fIHh5s zIh6>|TEi~D;&RjLWQbFuYU;^|D)*VjFDz(_RuW6ImS<+}G0bAN zs5ltcdjfsF*6C$L&)PXh@{GbJm*vvJs|J-U9)CexnNbE=u)effQdbPgC^P^XP^0yDPhZt# zgCyA6F=F}uzNoOLmKtbkf0XodYNCv^e&@NKx;dQ939jV6hQMLDc3q7y<9f|* zIHR1kDB~~wI<8G`%mWoe1G*AQJu3-nAznE9KQR|Z0A2BsT$s~$lcw)xLqf{qdGJ9e z>b5&chj^`{hA}2GC)@e>3Yi7AHA!xK6MpmAD@RS{9PqYg(TiWL4{dYg zMuF|cYKay1tS0ARHu@Mb42N~z1$t0)Ap$UJr!OKwZF{``h@ 
z1{%9j6z7schSwNTettGj*^B+(chdgt|J+H??A{5I3@Rdfujpi}GrZuB7P95yfq~ys z4Y5?h>s7sB_iV5OJ!W#zx`Fh88iD2S1*=#07rN_*ZpG? z##GYaA&jKwio;lfDfSNls8!Ge1_pMfF7Br!!?zd;;s*g1pyBt&f8jTV%EDud5fJBx!JQ`|eBteU>4&OMYCaaX#nn?ifpe-urh7LgK z2mis8O$(`g=YVG)U;b19Ys3vGjD=ywa}xR8w0!bQDgCsTx`lhJZ6yirdv`I*fDYML#;Ey&*Sl&^61yS%b;?Rj8r*RiA4WBJ7^G60r6 z>P*Dxptpu-e13899UZ?2V?OB4jGR)hUv%N^+HwUnK3ArNIlxfNpg&4)JSR*q>EPg~7vB#V^e|QJV)t7~5TlXH-3X_HvOUlFj7bLJV^} zZ=$#?glpYK7o^5y+dR%c(1hw1$;6Psf&`{XV;)td`%TrLsB@Esbi0br$*HRW*92EM z0OBb0mJ?g;?{>^X#5bj|5pk6J?uV)kzoESTkW%e?X!d!w`1E~;!I5wM^s0C}ypK0D z?5W9asixbhxIksZ4hmdS`2Gtf{HQOpB7WvSI8$5;^QXy`OYXUTSwE#@@OG)suFTSo zFf&Uz22`gSi(sIBRJgglAF?YD21ck-``xES#S+5o?wWhemPMh%j%!Y_rbuXpmP>g! z#uAeei^?D1Gz^C)F|%P?W_uMcb=p7E8Z|+Jq)VbY}kZ{PQHg z?Q|HA!@mbo0iY}2-;8CxRYsHi^~^Iz2wm$t)IszL!N(n{oT2OEL&Ethf`rR5qOOi= zE91$WMg5dVeuJftq3BSvdpq1h%{lOe>yuIeao#$k4NM{09vlAi-8lhU=M7E1g86rE z{HMEmp)900v+|i!)DB%Eqf1g)%~+{pIdKWDF^^=}keox0JAq;ki9nH5*f_OhD2}+3 ziC=%qH}PsE|0sVLf=2k=&-kmuanq*;+I}CFujWosu54NYAN#aB$*NwEQvm) z$7ZU~9J~H~?ec$0Q*mwZm@{;npRfPKY>t$g{4r^)^dlI$n1EwsAxG~}0FLo%F!LYaH%zG9{72q+nSR8~cMZ6JG) zQD5o>vl1oT36b&K9fPbcNF%;YdCc5!b5xjWI5lM&<4 z_R?}fxIC4_c7id=4~|Z-Cq>=`&mvii#so$Tb|^V(ygx}=>NL+iRk!Z^DRoD_s=X^j z@M3K9peo`{?{$%%oB#X^Z5_{)?ydh)@x>!tlJ)2s2@$8J4a4w^+NrudQO=gRuHqz3 zB*34c5HmqzPfLm44+v@=faZ^xV$kG%c7`J2F=fD33eR4Z$otE+IZ(KIkr|s6OZZ02 zxy1Dg0s5@R!v(b(4iiytunAe(49U`HnNT-4(eL+^2NlQqX0J@&nNfS+m|i`-Q^E&x z7|V!%8hls`g0DB#l0H6Rp6fnrwf1c5XF`7$LjU=9G65kLDJdykgXi}gzZ#wXb{L(W z8f9#j-CCv1J{LC6_>2K6==QBI2SJC{m#GZTO2y%!icSpPKF@vAO$DaQb=Nz>CAT-q z0=_NecZCE!$M=yg$)5gL=u40_^U)tsipQL6V7?D~Id~tM*IF(* z1TqRg81=W;trP55W4r7%{1UA}2MMtV9{3>hlpC%7LH=o)61j3nXq=-1Mu5w2Yb$r~ zo+Z(lThKO&zqVYV{S%G^ZMMJNc0Ql2GUk95OrK0;bzBA6-9l?@p;>P zvE#gzT$wH)fRpGoT{y8AwpTCN0T8>J>c3y~#sW-IA_#~ou}MlVsY#Rel*ZuP(0R(B zK?3&~i%e9#19)`0@qLxu!&z{9@33f|^ZcS?Q*P^6URB{C*UjLNMEX)7W6%#{b$aOI7}h)! 
zwWnt~-4l{eqX_h8LVH!hmCCUi7@MPPDG^{@~rn_ z6bVM`raqWIl`{kg&A7!=N@)gZ0{EihUl>{j*`V`iX3foRt}P=YOaS!;Jc6x8p?0eC ze?LV({y4nt>rhp6TA$(6VKHh#2Owc4paQ>ul!LOi=)U=~WjZu#vU0HDw;D8xUSm}N zKC~zmOXbl6lC*Dr(f~F1r5+h4^YSByUxx^|icqT+;uzS!P{TTHd&7Ht5uJzO0EoHW zo$%{ggiJpg{rR&hV%TJt7B2kR%~wFT@$JXl!d|5}D@``e`b9IPq33}43KNWk)`$Gx3?C@&c z+Q(M8nOMUjeY_f!Dn4mZ_`zW-b@o_sAwD$_t>0a}>}Mr&g5YvVGZYR^&M{5q8nY|y%)_04P>nmxY@f5D zXm1mfZ&qRJN0la+QI7Cq-G-9O$%RyOIF7p_5sJOIL3 zPKKfrP38G+wYqF6m0bc4gHl?VVKY?2`6fY_fXt~E4Or2s!gZ0^^f)wgp^|%eBj=F@ z^jzQM!S64#o9u>~?X|9Ab8(OdUx2N`n6>rCjppT_&m++vy1LAQr*CZ4RgM~AX60TV ziqAIAyWUq)=6JbZN>6otR%5c-B@K>oI2_jm12@K3~2o+UNB$>VvZ@-Z#@&3PhuhDc)ie~IsQ zJ0m?f;G}ji$i3uvIPfj+8Y(m6fB_4{5*uy+m^@{Fwl!%8pnwH?Vgm(eo}O{kL}1gb zDVBzWOp%He0V=7oIM~;zOyN>-W7{P(6oHAvC{#OM`Xv3IVGq z)^}l_RAHF4pGgQHSh7#b{k1A_S^(rJIw#X_YAa^3wVzrQLYgwK47{kc$QbXJ5n(iQ z!AzM72DA7B5><22xw9LVyPxqR>V?SBI4>5;u`AdN88IoX3Bo=(=MEuHid+O6l(- z1PCc+_X*GIV>SkRPBSsLk#Y+K7cl#A1M8=Vni=(hH^bCFyeBCU{+5S|Qx7-ReQm4I z*8CZF!6rs>r02K!u)LuhfOgifRIPPq`KE}8ivPzn(F|e%*2sZP`(BP1pK3Q=Zwccx z0%NFo092)87!*H?ZOXd&!=M_^H3u0oV_*`TFzaVY+Qk9@kUQig#WXj{ns?K4(?sG8 zG+CqB;AQF2w2om@nWlC>&;2O+X}ZH0!$uY}WUJG&nzR73W4Em1rbSoi z2tngp1~JNgL5YM$h=-}-m$**`S}rd2pL7X9igS^NUy2t2ntiUx&B~%t{iXoMt(YJY z3~;}wJZ9W&dhgC~{|f2@KJpduGn6lI2i&ci+LHh=iN!wexF>2-(kLs|1K>BeGbAJVY*@Lp!rD|Wyo{eDarCg?`qcYM7KxBB5fgnLityd z(%Edgp-r{)pXrbguVM_{qTEl~{a+zuv_!B4$Z@V&fvL{wH%Ro!=;=M%rIzn@%ow+Wb{FlqM> zv^`DaO(c4Kc2%CPrY842M0mHfqe+kQwHz-ejoV@6t>>#2a$~DL`@E~T3s^g_^DoX} zbW!kGCJ%}@JXa7^w*q4rKDM*LxveUFZryK0Q-|Y_L1`GUN!nLW*5=6P1ktkC#DPbi z8C}ZlnSEKn0Ea%}6fIg8{FMwfzy+wd|E&l+#gM5T7?yx>sXFRpd;VuH-FD0_UNY@x zzhKLM+j14RSsep-s5%-cw=n=@mhG|76U(1}MDTSVlGOmvVdxiz?{T77^{@P6V-*}w z@38m~R4lC1Y?-ko;1?rYwYYtJ`&lnXoE_g=f`b@!yyM@}o3e*Srihlze{Z&8nj@%7%{eo<#1(5OyK#r^^w&xg}1Sf zlGNpbvyJO9x|3~<1~FTz7yEDPa07HpA{{DL4$az(o437RYE5%fBDf5bbv~A=yLKN6 zO@6DvXPc?g)LLY5k44wYl>@(~akc_YiXRh?y_6oOLwgmB$|QS`Fq)(9(zt)d1_wv> zZ*f{M9v|5E(VPmwln!Y!@e2{1NDSBF=C)vu>E67RRelE4eNwm-ZocL8KbrJog_m>A zq8Ne>=TAEM8WMk?!uTybIKk*ohnr054<$sjs^=xkmz393Et~8VC)_2TLb-DcD|i`O z+eQsm=@k|A%zWH@b_IGkGb^KDDtVGZNEA!?9y%95D2PBTD#VZ__V6>m!C9gYeMyDA z*s?ZU@!EnP;o`BG;V~x#xa=qF$VdbI6Y;ZYrN#>|j)21D(_Ju=(<-`Q7Nm*Pku!-d z!-kkl=|B!jWD*#rSem0`4Hl8XzR>&Ol#WNIOUMTQ36=$oKu&PLx(n=+udK9#QUzd2 zUxuq8R+AqwZae@$MxDGhaS!n#V6@;0^zqox7N^LVt0%Wvo(vrnq;x1`BV%hZVtf$cJRN`7RyO8ZEt1;>P!cRTb*K3jy`FTvZl|v=*EkoDIvOUB z;R0dH8c!wT??pP37jZu)cH23ep~j3_V|gAb?$TGfO!tD4;-7pWB^2E3JQ#>Z!bac!F`#hKuQswlXKuxFrlOe3`#H>r20>%uZ zq#w^|I=EccqoYKPr@+ZTlK`fAg z&00A-TuOBlvz|m0Fr4WmdRV8!P~l$L?hBOtGc?>EOQo_i5jDy|#S#m_iBo85$;She zri}(vybV166|>$+)J^|EW4#4Kic+Z(wX1dGMQbd zGRh|qVX~DIO=>3FJ(%%MK?)o?bf{u@IM!@)Cn#C;&!V6w&hNj z({jM}lcq(w5HO`Oh;!y5090He6;W(ibRd7?se);torFduD?k7iYLaA#KB2FnE|aDm zDSiOb2zCMue-Z`?gZN44MXsvDnByPJ<C<_f2-2y}`0_)sRAl!1+}GVS@lZKfu!u zZq^O%mJ1NivN>D`Xp*NVxkoPKeow@?5(ufqQednc1h@k=8Kaw~a@kA@r^nDDj><4B z3WjmQAk3WlU@6s`OvXCxHb)zB>~$t%u}-*{b|D1c(8`r7W9E@gEehy&YHHn;xtr6P z0?#t?l+)?0s;Y9uV!433%;j`sGL^tUr`zv$q+-b`04Nmac9F$8lTHM%^4e`yB#};( z!xez$PBw?bO1K>EKBO!`%xJhNfdNG})IorFj1B}YmlcjiGvQP!%>4vJ5^-qaqZ>AC zFw^wY`VBNUZzy0WU?^ZHU?^ZHU?^ZHU?^ZHU?^ZHU?^ZHFv%(K{{g;tTuG%sH`@RJ N002ovPDHLkV1i0L{5t>u diff --git a/tests/load_transformer.py b/tests/load_transformer.py deleted file mode 100644 index dd19094c..00000000 --- a/tests/load_transformer.py +++ /dev/null @@ -1,7 +0,0 @@ -# This test is here to see if we can -# minimize logging -from happytransformer import HappyBERT - -transformer = HappyBERT() -predictions = transformer.predict_mask("Dogs make me [MASK] to eat",num_results=20) -# when runnning this, logs should be minimal 
\ No newline at end of file diff --git a/tests/test_multi_mask.py b/tests/test_multi_mask.py deleted file mode 100644 index 50aa52f3..00000000 --- a/tests/test_multi_mask.py +++ /dev/null @@ -1,42 +0,0 @@ -from happytransformer import HappyBERT - -happy = HappyBERT() - -def test_multi_mask(): - # should give something like - # "I have a great dog and I love him so much" - all_predictions = happy.predict_masks( - "[MASK] have a [MASK] dog and I love [MASK] so much", - num_results=2 - ) - assert len(all_predictions) == 3 - assert all( - len(specific_predictions) == 2 - for specific_predictions in all_predictions - ) - assert all_predictions[0][0]["word"] == 'i' - assert all_predictions[0][0]["softmax"] > 0.5 - - assert all_predictions[2][0]["word"] == 'him' - -def test_multi_mask_options(): - MASKS_OPTIONS = [ - ['I', 'You'], - ['big', 'small'], - ['him', 'her'] - ] - options_set = set( - option - for mask in MASKS_OPTIONS - for option in mask - ) - all_predictions = happy.predict_masks( - "[MASK] have a [MASK] dog and I love [MASK] so much", - options=MASKS_OPTIONS - ) - assert len(all_predictions) == 3 - assert all( - prediction["word"] in options_set - for mask_predictions in all_predictions - for prediction in mask_predictions - ) \ No newline at end of file diff --git a/tests/test_next_sentence_probs.py b/tests/test_next_sentence_probs.py deleted file mode 100644 index 6adc29cc..00000000 --- a/tests/test_next_sentence_probs.py +++ /dev/null @@ -1,52 +0,0 @@ -''' -tests next sentence prediction capabilities -''' - -from happytransformer import HappyBERT - -def percent(x): - '''formats a float as a percentage''' - return f"{x*100:.2f}%" - -def eq_ish(x, y, epsilon): - '''soft similarity check between two numbers''' - return abs(y-x) < epsilon - -def errors(func): - '''determines whether function errors''' - try: - func() - except: - return True - return False - -happy = HappyBERT() -SENTENCE_PAIRS = [ - ["How old are you?", "The Eiffel Tower is in Paris", False], - ["How old are you?", "I am 40 years old", True] -] - -def test_argument_errors(): - ''' - tests that the nsp module correctly rejects - multi-sentence inputs - ''' - two_sentences = "This is the first sentence. This is the second sentence" - one_sentence = "This is one sentence." 
- assert errors(lambda: happy.predict_next_sentence(two_sentences, one_sentence)) - assert errors(lambda: happy.predict_next_sentence(one_sentence, two_sentences)) - -def test_nsp(): - ''' - tests that the nsp module returns expected results - for the given sentence pairs - ''' - for a, b, follows in SENTENCE_PAIRS: - print('==============================') - print(a) - print(b) - predict = happy.predict_next_sentence(a, b) - probability = happy.predict_next_sentence(a, b, use_probability=True) - assert 0 <= probability <= 1 - assert predict == follows - print(predict, probability) \ No newline at end of file diff --git a/tests/test_predict.py b/tests/test_predict.py deleted file mode 100644 index 2f81e7c7..00000000 --- a/tests/test_predict.py +++ /dev/null @@ -1,17 +0,0 @@ -from happytransformer.to_delete.happy_bert import HappyBERT - -happy = HappyBERT() - -def test_prediction_options(): - ''' - asserts that trimmed options are sorted by - likelihood and not order in list - ''' - predictions = happy.predict_mask( - 'I want crackers and [MASK]', - options=['death', 'cheese'], - num_results=1 - ) - print(predictions) - # top prediction should be cheese and not death - assert predictions[0]['word'] == 'cheese' \ No newline at end of file diff --git a/tests/test_qa_multi.py b/tests/test_qa_multi.py deleted file mode 100644 index 3bf8107e..00000000 --- a/tests/test_qa_multi.py +++ /dev/null @@ -1,34 +0,0 @@ -""" -Tests for the "answers_to_question" method that can be accessed through a HappyBERT object -""" - -from happytransformer.happy_question_answering import HappyQuestionAnswering -happy_qa = HappyQuestionAnswering() - -PARAGRAPH = ( - 'McGill is a university located in Montreal. ' - 'It was founded in 1821, making it the eight oldest university in Canada. 
' - 'It is currently ranked 31st worldwide according to the QS Global World Ranking ' - -) - -QA_PAIRS = [ - ('When was McGill founded?', '1821'), - ('Where is McGill located?', 'Montreal'), - ('What is McGill\'s worldwide ranking?', '31st'), - -] - -def test_qa_multi(): - for question, expected_answer in QA_PAIRS: - - computed_answers = happy_qa.answers_to_question(question, PARAGRAPH, k=10) - computed_answer = happy_qa.answer_question(question, PARAGRAPH) - # k is being respected - assert len(computed_answers) == 10 - # both answering methods yield correct result - assert computed_answers[0]["text"].lower() == expected_answer.lower() - assert computed_answer.lower() == expected_answer.lower() - total_p = sum(answer["softmax"] for answer in computed_answers) - # probabilties for answers_to_question() add up to 1 ish - assert abs(total_p-1) < 0.01 diff --git a/tests/test_sequence.py b/tests/test_sequence.py deleted file mode 100644 index 137ed300..00000000 --- a/tests/test_sequence.py +++ /dev/null @@ -1,12 +0,0 @@ -from happytransformer import HappyBERT - -happy = HappyBERT() - -def test_sequence_runs(): - ''' - run sequence classification end to end - only checks that no errors are raised - ''' - happy.init_sequence_classifier() - happy.train_sequence_classifier('tests/test_sequence.csv') - happy.eval_sequence_classifier('tests/test_sequence.csv') \ No newline at end of file From fad67da01a06abf40e036468dcf4aeb0f7ba1038 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 01:39:49 -0500 Subject: [PATCH 026/155] Added ALBERT for QA --- happytransformer/happy_question_answering.py | 5 +++++ tests/test_qa.py | 19 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index c6a5ce50..d90d7313 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -13,6 +13,8 @@ BertTokenizerFast, DistilBertForQuestionAnswering, DistilBertTokenizerFast, + AlbertForQuestionAnswering, + AlbertTokenizerFast, QuestionAnsweringPipeline, ) @@ -38,6 +40,9 @@ def __init__(self, model_type="DISTILBERT", elif model_type == "DISTILBERT": model = DistilBertForQuestionAnswering.from_pretrained(model_name) tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) + elif model_type == "ALBERT": + model = AlbertForQuestionAnswering.from_pretrained(model_name) + tokenizer = AlbertTokenizerFast.from_pretrained(model_name) else: raise ValueError("model_type must be BERT or DISTILBERT") diff --git a/tests/test_qa.py b/tests/test_qa.py index 020e6171..c8621f43 100644 --- a/tests/test_qa.py +++ b/tests/test_qa.py @@ -49,3 +49,22 @@ def test_qa_train_effectiveness(): assert after_loss < before_loss +def test_qa_train_effectiveness_albert(): + """ + Ensures that HappyQuestionAnswering.train() results in + lowering the loss as determined by HappyQuestionAnswering.eval() + """ + + happy_qa = HappyQuestionAnswering("ALBERT", "twmkn9/albert-base-v2-squad2") + before_loss = happy_qa.eval("../data/qa/train-eval.csv")["eval_loss"] + happy_qa.train("../data/qa/train-eval.csv") + after_loss = happy_qa.eval("../data/qa/train-eval.csv")["eval_loss"] + + assert after_loss < before_loss + +def test_qa_test_albert(): + happy_qa = HappyQuestionAnswering("ALBERT", "twmkn9/albert-base-v2-squad2") + result = happy_qa.test("../data/qa/test.csv") + print(result) + answer = [{'score': 0.988578736782074, 'start': 0, 'end': 12, 'answer': 'October 31st'}, {'score': 
0.9833534359931946, 'start': 12, 'end': 25, 'answer': 'November 23rd'}] + assert result == answer \ No newline at end of file From 055b5ac0de50770754bc80b895544edbf19b5563 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 01:41:50 -0500 Subject: [PATCH 027/155] Updated warning for model_type to include ALBERT --- happytransformer/happy_question_answering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index d90d7313..65f4a16e 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -44,7 +44,7 @@ def __init__(self, model_type="DISTILBERT", model = AlbertForQuestionAnswering.from_pretrained(model_name) tokenizer = AlbertTokenizerFast.from_pretrained(model_name) else: - raise ValueError("model_type must be BERT or DISTILBERT") + raise ValueError("model_type must be BERT, DISTILBERT or ALBERT") super().__init__(model_type, model_name, model, tokenizer) device_number = 1 if torch.cuda.is_available() else -1 From 60483306bb6d366d1c904f17482f44f5e8cd3f6d Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 01:48:43 -0500 Subject: [PATCH 028/155] Added albert for TC --- happytransformer/happy_text_classification.py | 9 ++++++- tests/test_tc.py | 25 +++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index 6f86bdfb..4e04c51e 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -9,6 +9,10 @@ BertTokenizerFast, DistilBertForSequenceClassification, DistilBertTokenizerFast, + AlbertForSequenceClassification, + AlbertTokenizerFast, + + TextClassificationPipeline ) from happytransformer.tc.trainer import TCTrainer @@ -33,8 +37,11 @@ def __init__(self, model_type="DISTILBERT", elif model_type == "DISTILBERT": model = DistilBertForSequenceClassification.from_pretrained(model_name) tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) + elif model_type == "ALBERT": + model = AlbertForSequenceClassification.from_pretrained(model_name) + tokenizer = AlbertTokenizerFast.from_pretrained(model_name) else: - raise ValueError("model_type must be BERT or DISTILBERT") + raise ValueError("model_type must be BERT, DISTILBERT or ALBERT") super().__init__(model_type, model_name, model, tokenizer) diff --git a/tests/test_tc.py b/tests/test_tc.py index 4bbf054c..92425738 100644 --- a/tests/test_tc.py +++ b/tests/test_tc.py @@ -73,3 +73,28 @@ def test_qa_train_effectiveness(): happy_tc.train("../data/tc/train-eval.csv") after_loss = happy_tc.eval("../data/tc/train-eval.csv")["eval_loss"] assert after_loss < before_loss + +def test_qa_test_albert(): + """ + Tests + HappyQuestionAnswering.test() + """ + happy_tc = HappyTextClassification(model_type="ALBERT", model_name="textattack/albert-base-v2-SST-2") + + result = happy_tc.test("../data/tc/test.csv") + answer = [[{'label': 'LABEL_1', 'score': 0.9990348815917969}], [{'label': 'LABEL_0', 'score': 0.9947203397750854}], [{'label': 'LABEL_0', 'score': 0.9958302974700928}], [{'label': 'LABEL_1', 'score': 0.9986426830291748}]] + assert result == answer + + +def test_qa_train_effectiveness_albert(): + """ + Tests + Ensures that HappyQuestionAnswering.train() results in + lowering the loss as determined by HappyQuestionAnswering.eval() + """ + + happy_tc = 
HappyTextClassification(model_type="ALBERT", model_name="textattack/albert-base-v2-SST-2") + before_loss = happy_tc.eval("../data/tc/train-eval.csv")["eval_loss"] + happy_tc.train("../data/tc/train-eval.csv") + after_loss = happy_tc.eval("../data/tc/train-eval.csv")["eval_loss"] + assert after_loss < before_loss From 7a63f747deeae58caba8a3e231f33ea034c7b61c Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 14:26:46 -0500 Subject: [PATCH 029/155] Added Ted's feedback --- happytransformer/happy_word_prediction.py | 24 +++++++++++------------ tests/test_mwp.py | 10 ++++++---- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index d07d3892..73d8e3ba 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -7,9 +7,11 @@ ) import torch - +import collections from happytransformer.happy_transformer import HappyTransformer -from happytransformer.mwp.trainer import MWPTrainer +from happytransformer.mwp.trainer import MWPTrainer + +WPOutput = collections.namedtuple("WPOutput", ["token_str", "score"]) class HappyWordPrediction(HappyTransformer): @@ -43,19 +45,16 @@ def predict_mask(self, text, targets=None, top_k=1): :return: If top_k ==1: a dictionary with the keys "score" and "token_str" if top_k >1: a list of dictionaries described above in order by score """ - if isinstance(text, list): + if not isinstance(text, str): raise ValueError("the \"text\" argument must be a single string") result = self._pipeline(text, targets=targets, top_k=top_k) - if top_k == 1: - result = result[0] - del result['sequence'] - del result['token'] - else: - for answer in result: - del answer['sequence'] - del answer['token'] - return result + results = list() + for answer in result: + result = WPOutput(token_str=answer["token_str"], score=answer["score"]) + results.append(result) + + return results def train(self, input_filepath, args): raise NotImplementedError("train() is currently not available") @@ -64,5 +63,4 @@ def eval(self, input_filepath): raise NotImplementedError("eval() is currently not available") def test(self, input_filepath): - # self.logger.error("test() is currently not available") raise NotImplementedError("test() is currently not available") diff --git a/tests/test_mwp.py b/tests/test_mwp.py index 41739899..99c8ebf4 100644 --- a/tests/test_mwp.py +++ b/tests/test_mwp.py @@ -1,12 +1,12 @@ from happytransformer import HappyWordPrediction - +from happytransformer.happy_word_prediction import WPOutput def test_mwp_basic(): happy_mwp = HappyWordPrediction() result = happy_mwp.predict_mask( "Please pass the salt and [MASK]", ) - answer = {'score': 0.2664579749107361, 'token_str': 'pepper'} + answer = [WPOutput(token_str="pepper", score=0.2664579749107361)] assert result == answer @@ -16,7 +16,9 @@ def test_mwp_top_k(): "Please pass the salt and [MASK]", top_k=2 ) - answer = [{'score': 0.2664579749107361, 'token_str': 'pepper'}, {'score': 0.08760260790586472, 'token_str': 'vinegar'}] + print(result) + + answer = [WPOutput(token_str='pepper', score=0.2664579749107361), WPOutput(token_str='vinegar', score=0.08760260790586472)] assert result == answer @@ -26,5 +28,5 @@ def test_mwp_targets(): "Please pass the salt and [MASK]", targets=["water", "spices"] ) - answer = {'score': 0.014856964349746704, 'token_str': 'water'} + answer = [WPOutput(token_str='water', score=0.014856964349746704), WPOutput(token_str='spices', score=0.009040987119078636)] 
assert result == answer From 860a4dd4540f84d6f81d3dfa569fe42ddecec860 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 14:28:15 -0500 Subject: [PATCH 030/155] Remove mask in mwp --- happytransformer/happy_word_prediction.py | 4 ++-- happytransformer/mwp/trainer.py | 2 +- tests/{test_mwp.py => test_wp.py} | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename tests/{test_mwp.py => test_wp.py} (100%) diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index 73d8e3ba..19b9d9db 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -9,7 +9,7 @@ import torch import collections from happytransformer.happy_transformer import HappyTransformer -from happytransformer.mwp.trainer import MWPTrainer +from happytransformer.mwp.trainer import WPTrainer WPOutput = collections.namedtuple("WPOutput", ["token_str", "score"]) @@ -34,7 +34,7 @@ def __init__(self, model_type="DISTILBERT", super().__init__(model_type, model_name, model, tokenizer) device_number = 1 if torch.cuda.is_available() else -1 self._pipeline = FillMaskPipeline(model=model, tokenizer=tokenizer, device=device_number) - self._trainer = MWPTrainer(model, model_type, tokenizer, self._device, self.logger) + self._trainer = WPTrainer(model, model_type, tokenizer, self._device, self.logger) def predict_mask(self, text, targets=None, top_k=1): """ diff --git a/happytransformer/mwp/trainer.py b/happytransformer/mwp/trainer.py index 8e8090e4..d68c866b 100644 --- a/happytransformer/mwp/trainer.py +++ b/happytransformer/mwp/trainer.py @@ -2,7 +2,7 @@ from happytransformer.mwp.default_args import ARGS_MWP_TRAIN -class MWPTrainer(HappyTrainer): +class WPTrainer(HappyTrainer): def train(self, input_filepath, args=ARGS_MWP_TRAIN): raise NotImplementedError() diff --git a/tests/test_mwp.py b/tests/test_wp.py similarity index 100% rename from tests/test_mwp.py rename to tests/test_wp.py From 8b8ae8e6456f6b5ab4fba13c82e6375786d2d0d5 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 15:41:59 -0500 Subject: [PATCH 031/155] Added Albert and DistilBERT to MWP --- happytransformer/happy_word_prediction.py | 24 ++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index dfc0991b..7da19535 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -3,8 +3,10 @@ from transformers import ( BertForMaskedLM, BertTokenizerFast, - RobertaForMaskedLM, - RobertaTokenizerFast, + AlbertForMaskedLM, + AlbertTokenizerFast, + DistilBertForMaskedLM, + DistilBertTokenizerFast, FillMaskPipeline, ) @@ -20,15 +22,23 @@ def __init__(self, model_type="BERT", model = None tokenizer = None - if model_type == "BERT": + if model_type == "ALBERT": + model = AlbertForMaskedLM.from_pretrained(model_name) + tokenizer = AlbertTokenizerFast.from_pretrained(model_name) + + elif model_type == "BERT": model = BertForMaskedLM.from_pretrained(model_name) tokenizer = BertTokenizerFast.from_pretrained(model_name) - elif model_type == "ROBERTA": - model = RobertaForMaskedLM.from_pretrained(model_name) - tokenizer = RobertaTokenizerFast.from_pretrained(model_name) + elif model_type == "DISTILBERT": + model = DistilBertForMaskedLM.from_pretrained(model_name) + tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) + + else: + raise ValueError("model_type must be BERT, DISTILBERT or ALBERT") + - 
super().__init__(model_type, model_name, model, tokenizer, device) + super().__init__(model_type, model_name, model, tokenizer) device_number = 1 if torch.cuda.is_available() else -1 self._pipeline = FillMaskPipeline(model=model, tokenizer=tokenizer, device=device_number) From 6878350eabd3f440acb27bc3f7c496169ae48fcf Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 15:49:08 -0500 Subject: [PATCH 032/155] Added Ted's feedback --- happytransformer/happy_word_prediction.py | 10 +++------- tests/test_wp.py | 10 ++++------ 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index 19b9d9db..aa1eaee8 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -11,7 +11,7 @@ from happytransformer.happy_transformer import HappyTransformer from happytransformer.mwp.trainer import WPTrainer -WPOutput = collections.namedtuple("WPOutput", ["token_str", "score"]) +WordPredictionResult = collections.namedtuple("WordPredictionResult", ["token_str", "score"]) class HappyWordPrediction(HappyTransformer): @@ -42,17 +42,13 @@ def predict_mask(self, text, targets=None, top_k=1): :param targets: Optional. A list of strings of potential answers. All other answers will be ignored :param top_k: number of results. Default is 1 - :return: If top_k ==1: a dictionary with the keys "score" and "token_str" - if top_k >1: a list of dictionaries described above in order by score + :return: A named WordPredictionResult Named Tuple with the following keys: token_str and score """ if not isinstance(text, str): raise ValueError("the \"text\" argument must be a single string") result = self._pipeline(text, targets=targets, top_k=top_k) - results = list() - for answer in result: - result = WPOutput(token_str=answer["token_str"], score=answer["score"]) - results.append(result) + results = [WordPredictionResult(token_str=answer["token_str"], score=answer["score"]) for answer in result] return results diff --git a/tests/test_wp.py b/tests/test_wp.py index 99c8ebf4..5d8461db 100644 --- a/tests/test_wp.py +++ b/tests/test_wp.py @@ -1,12 +1,12 @@ from happytransformer import HappyWordPrediction -from happytransformer.happy_word_prediction import WPOutput +from happytransformer.happy_word_prediction import WordPredictionResult def test_mwp_basic(): happy_mwp = HappyWordPrediction() result = happy_mwp.predict_mask( "Please pass the salt and [MASK]", ) - answer = [WPOutput(token_str="pepper", score=0.2664579749107361)] + answer = [WordPredictionResult(token_str="pepper", score=0.2664579749107361)] assert result == answer @@ -16,9 +16,7 @@ def test_mwp_top_k(): "Please pass the salt and [MASK]", top_k=2 ) - print(result) - - answer = [WPOutput(token_str='pepper', score=0.2664579749107361), WPOutput(token_str='vinegar', score=0.08760260790586472)] + answer = [WordPredictionResult(token_str='pepper', score=0.2664579749107361), WordPredictionResult(token_str='vinegar', score=0.08760260790586472)] assert result == answer @@ -28,5 +26,5 @@ def test_mwp_targets(): "Please pass the salt and [MASK]", targets=["water", "spices"] ) - answer = [WPOutput(token_str='water', score=0.014856964349746704), WPOutput(token_str='spices', score=0.009040987119078636)] + answer = [WordPredictionResult(token_str='water', score=0.014856964349746704), WordPredictionResult(token_str='spices', score=0.009040987119078636)] assert result == answer From b936d3d4a76d19633a6567c6e94ccea050dc7642 Mon Sep 17 
00:00:00 2001 From: Ted Brownlow Date: Sat, 9 Jan 2021 15:54:41 -0500 Subject: [PATCH 033/155] adjusted formatting --- happytransformer/happy_word_prediction.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index aa1eaee8..7a4e160b 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -7,11 +7,11 @@ ) import torch -import collections +from collections import namedtuple from happytransformer.happy_transformer import HappyTransformer from happytransformer.mwp.trainer import WPTrainer -WordPredictionResult = collections.namedtuple("WordPredictionResult", ["token_str", "score"]) +WordPredictionResult = namedtuple("WordPredictionResult", ["token_str", "score"]) class HappyWordPrediction(HappyTransformer): @@ -48,7 +48,13 @@ def predict_mask(self, text, targets=None, top_k=1): raise ValueError("the \"text\" argument must be a single string") result = self._pipeline(text, targets=targets, top_k=top_k) - results = [WordPredictionResult(token_str=answer["token_str"], score=answer["score"]) for answer in result] + results = [ + WordPredictionResult( + token_str=answer["token_str"], + score=answer["score"] + ) + for answer in result + ] return results From 8a7b81f06504c71ac797f4aa62b88cf681fa7ab0 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 16:06:10 -0500 Subject: [PATCH 034/155] Addded WP test cases for distilbert and albert --- tests/test_wp.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test_wp.py b/tests/test_wp.py index 5d8461db..e14802de 100644 --- a/tests/test_wp.py +++ b/tests/test_wp.py @@ -28,3 +28,20 @@ def test_mwp_targets(): ) answer = [WordPredictionResult(token_str='water', score=0.014856964349746704), WordPredictionResult(token_str='spices', score=0.009040987119078636)] assert result == answer + + +def test_mwp_basic_albert(): + happy_mwp = HappyWordPrediction("ALBERT", "albert-base-v2") + result = happy_mwp.predict_mask( + "Please pass the salt and [MASK]", + ) + answer = [WordPredictionResult(token_str='garlic', score=0.036625903099775314)] + assert result == answer + +def test_mwp_basic_bert(): + happy_mwp = HappyWordPrediction("BERT", "bert-base-uncased") + result = happy_mwp.predict_mask( + "Please pass the salt and [MASK]", + ) + answer = [WordPredictionResult(token_str='.', score=0.8466101884841919)] + assert result == answer \ No newline at end of file From a811d3c81a5d4bcb5fcaa5d9e5d581fd53eb2237 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 16:08:00 -0500 Subject: [PATCH 035/155] changed order of models --- happytransformer/happy_question_answering.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index 65f4a16e..c0fbafa5 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -34,15 +34,17 @@ def __init__(self, model_type="DISTILBERT", model_name="distilbert-base-cased-distilled-squad"): model = None tokenizer = None - if model_type == "BERT": + + if model_type == "ALBERT": + model = AlbertForQuestionAnswering.from_pretrained(model_name) + tokenizer = AlbertTokenizerFast.from_pretrained(model_name) + elif model_type == "BERT": model = BertForQuestionAnswering.from_pretrained(model_name) tokenizer = BertTokenizerFast.from_pretrained(model_name) elif model_type 
== "DISTILBERT": model = DistilBertForQuestionAnswering.from_pretrained(model_name) tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) - elif model_type == "ALBERT": - model = AlbertForQuestionAnswering.from_pretrained(model_name) - tokenizer = AlbertTokenizerFast.from_pretrained(model_name) + else: raise ValueError("model_type must be BERT, DISTILBERT or ALBERT") From d80b09e2d522d39848734e20f46fc6ed6304f92b Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 16:12:11 -0500 Subject: [PATCH 036/155] Added BERT test cases --- tests/test_qa.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/tests/test_qa.py b/tests/test_qa.py index c8621f43..58ce1a25 100644 --- a/tests/test_qa.py +++ b/tests/test_qa.py @@ -65,6 +65,25 @@ def test_qa_train_effectiveness_albert(): def test_qa_test_albert(): happy_qa = HappyQuestionAnswering("ALBERT", "twmkn9/albert-base-v2-squad2") result = happy_qa.test("../data/qa/test.csv") - print(result) answer = [{'score': 0.988578736782074, 'start': 0, 'end': 12, 'answer': 'October 31st'}, {'score': 0.9833534359931946, 'start': 12, 'end': 25, 'answer': 'November 23rd'}] + assert result == answer + + +def test_qa_train_effectiveness_bert(): + """ + Ensures that HappyQuestionAnswering.train() results in + lowering the loss as determined by HappyQuestionAnswering.eval() + """ + + happy_qa = HappyQuestionAnswering("BERT", "mrm8488/bert-tiny-5-finetuned-squadv2") + before_loss = happy_qa.eval("../data/qa/train-eval.csv")["eval_loss"] + happy_qa.train("../data/qa/train-eval.csv") + after_loss = happy_qa.eval("../data/qa/train-eval.csv")["eval_loss"] + + assert after_loss < before_loss + +def test_qa_test_bert(): + happy_qa = HappyQuestionAnswering("BERT", "mrm8488/bert-tiny-5-finetuned-squadv2") + result = happy_qa.test("../data/qa/test.csv") + answer = [{'score': 0.9352769255638123, 'start': 0, 'end': 12, 'answer': 'October 31st'}, {'score': 0.9180678129196167, 'start': 12, 'end': 25, 'answer': 'November 23rd'}] assert result == answer \ No newline at end of file From 651c65cae630000ae35dc75cfd8cc7db68077d28 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 16:16:01 -0500 Subject: [PATCH 037/155] Added model_type_error value --- happytransformer/happy_question_answering.py | 2 +- happytransformer/happy_text_classification.py | 2 +- happytransformer/happy_transformer.py | 2 ++ happytransformer/happy_word_prediction.py | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index c0fbafa5..f0a93833 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -46,7 +46,7 @@ def __init__(self, model_type="DISTILBERT", tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) else: - raise ValueError("model_type must be BERT, DISTILBERT or ALBERT") + raise ValueError(self.model_type_error) super().__init__(model_type, model_name, model, tokenizer) device_number = 1 if torch.cuda.is_available() else -1 diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index 4e04c51e..9e674d86 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -41,7 +41,7 @@ def __init__(self, model_type="DISTILBERT", model = AlbertForSequenceClassification.from_pretrained(model_name) tokenizer = AlbertTokenizerFast.from_pretrained(model_name) else: - raise 
ValueError("model_type must be BERT, DISTILBERT or ALBERT") + raise ValueError(self.model_type_error) super().__init__(model_type, model_name, model, tokenizer) diff --git a/happytransformer/happy_transformer.py b/happytransformer/happy_transformer.py index 8414586c..f842b600 100644 --- a/happytransformer/happy_transformer.py +++ b/happytransformer/happy_transformer.py @@ -14,6 +14,8 @@ class HappyTransformer(): """ + model_type_error = "model_type must be ALBERT, BERT or DISTILBERT" + def __init__(self, model_type, model_name, model, tokenizer): self.model_type = model_type # BERT, #DISTILBERT, ROBERTA, ALBERT etc self.model_name = model_name diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index 0d0015a8..f41ef97d 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -37,7 +37,7 @@ def __init__(self, model_type="DISTILBERT", model = DistilBertForMaskedLM.from_pretrained(model_name) tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) else: - raise ValueError("model_type must be BERT, DISTILBERT or ALBERT") + raise ValueError(self.model_type_error) super().__init__(model_type, model_name, model, tokenizer) device_number = 1 if torch.cuda.is_available() else -1 self._pipeline = FillMaskPipeline(model=model, tokenizer=tokenizer, device=device_number) From fb56554512e756a0dbd4e9ae7dbbbf120fecdc86 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 16:19:30 -0500 Subject: [PATCH 038/155] changed the order of models --- happytransformer/happy_text_classification.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index 9e674d86..62148ce4 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -31,15 +31,16 @@ def __init__(self, model_type="DISTILBERT", model = None tokenizer = None - if model_type == "BERT": + if model_type == "ALBERT": + model = AlbertForSequenceClassification.from_pretrained(model_name) + tokenizer = AlbertTokenizerFast.from_pretrained(model_name) + elif model_type == "BERT": model = BertForSequenceClassification.from_pretrained(model_name) tokenizer = BertTokenizerFast.from_pretrained(model_name) elif model_type == "DISTILBERT": model = DistilBertForSequenceClassification.from_pretrained(model_name) tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) - elif model_type == "ALBERT": - model = AlbertForSequenceClassification.from_pretrained(model_name) - tokenizer = AlbertTokenizerFast.from_pretrained(model_name) + else: raise ValueError(self.model_type_error) From 2e413ee248e5dd306ebc5328b0aa1e24e42a29c0 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 16:21:34 -0500 Subject: [PATCH 039/155] Added bert test cases --- tests/test_tc.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/test_tc.py b/tests/test_tc.py index 92425738..71669619 100644 --- a/tests/test_tc.py +++ b/tests/test_tc.py @@ -98,3 +98,29 @@ def test_qa_train_effectiveness_albert(): happy_tc.train("../data/tc/train-eval.csv") after_loss = happy_tc.eval("../data/tc/train-eval.csv")["eval_loss"] assert after_loss < before_loss + + +def test_qa_test_bert(): + """ + Tests + HappyQuestionAnswering.test() + """ + happy_tc = HappyTextClassification(model_type="BERT", model_name="textattack/bert-base-uncased-SST-2") + + result = 
happy_tc.test("../data/tc/test.csv") + answer = [[{'label': 'LABEL_1', 'score': 0.9995690584182739}], [{'label': 'LABEL_0', 'score': 0.9981549382209778}], [{'label': 'LABEL_0', 'score': 0.9965545535087585}], [{'label': 'LABEL_1', 'score': 0.9978235363960266}]] + assert result == answer + + +def test_qa_train_effectiveness_bert(): + """ + Tests + Ensures that HappyQuestionAnswering.train() results in + lowering the loss as determined by HappyQuestionAnswering.eval() + """ + + happy_tc = HappyTextClassification(model_type="BERT", model_name="textattack/bert-base-uncased-SST-2") + before_loss = happy_tc.eval("../data/tc/train-eval.csv")["eval_loss"] + happy_tc.train("../data/tc/train-eval.csv") + after_loss = happy_tc.eval("../data/tc/train-eval.csv")["eval_loss"] + assert after_loss < before_loss From 5dbf394628802341397e31ed246ec63ca25a3749 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 20:12:29 -0500 Subject: [PATCH 040/155] Added WP to readme --- README.md | 413 ++++---------------- examples/word_prediction/readme_examples.py | 44 +++ 2 files changed, 111 insertions(+), 346 deletions(-) create mode 100644 examples/word_prediction/readme_examples.py diff --git a/README.md b/README.md index 6652497a..bcd8038d 100644 --- a/README.md +++ b/README.md @@ -6,12 +6,10 @@ * [News](#News) * [Key Features](#Key-Features) * [Installation](#Installation) -* [Initialization](#Initialization) * [Word Prediction](#Word-Prediction) -* [Binary Sequence Classification](#Binary-Sequence-Classification) +* [Text Classification](#Binary-Sequence-Classification) * [Next Sentence Prediction](#Next-Sentence-Prediction) * [Question Answering](#Question-Answering) -* [Masked Word Prediction Fine-Tuning](#Masked-Word-Prediction-Fine-Tuning) * [Tech](#Tech) * [Call For Contributors](#Call-For-Contributors) * [Maintainers](#Maintainers) @@ -32,229 +30,112 @@ Last month, Happy Transformer was presented at a conference called C-Search, and We're happy to announce that we won a Best Paper Award at the Canadian Undergraduate Conference for AI. We also received the highest score overall. The paper can be found [here](https://qmind.ca/wp-content/uploads/2020/05/Proceedings-of-CUCAI-2020.pdf) on page 67. -Happy Transformer is an API built on top of [Hugging Face's transformer library](https://huggingface.co/transformers/) that makes it easy to utilize state-of-the-art NLP models. +Happy Transformer is an package built on top of [Hugging Face's transformer library](https://huggingface.co/transformers/) that makes it easy to utilize state-of-the-art NLP models. -## Key Features - - **New: Finetuning Masked Language Models** - - Available language models: XLNET, BERT and ROBERTA. - - Predict a masked word within a sentence. - - Fine tune binary sequence classification models to solve problems like sentiment analysis. - - Predict the likelihood that sentence B follows sentence A within a paragraph. 
-| Public Methods | HappyROBERTA | HappyXLNET | HappyBERT | +| Public Methods | Basic Usage | Training | +|------------------------------------|--------------|------------| +| Text Classification | ✔ | ✔ | +| Question Answering | ✔ | ✔ | +| Word Prediction | ✔ | | +| Next Sentence Prediction | ✔ | | + +| Public Methods | ALBERT | BERT |DISTILBERT | |------------------------------------|--------------|------------|-----------| -| Masked Word Prediction | ✔ | ✔ | ✔ | -| Sequence Classification | ✔ | ✔ | ✔ | -| Next Sentence Prediction | | | ✔ | -| Question Answering | | | ✔ | -| Masked Word Prediction Finetuning | ✔ | | ✔ | +| Text Classification | ✔ | ✔ | ✔ | +| Question Answering | ✔ | ✔ | ✔ | +| Word Prediction | ✔ | ✔ | ✔ | +| Next Sentence Prediction | ✔ | ✔ | ✔ | ## Installation ```sh pip install happytransformer ``` -## Initialization -By default base models are used. They are smaller, faster and require significantly less training time -to obtain decent results. -Large models are recommended for tasks that do not require fine tuning such as some word prediction tasks. -Base models are recommended for tasks that require fine tuning with limited available training data. - -Uncased models do not differentiate between cased and uncased words. For example, the words -"empire" and "Empire" would be reduced to the same token. In comparison, cased models do differentiate between cased and uncased words. - -#### HappyXLNET: - -```sh -from happytransformer import HappyXLNET -#--------------------------------------# -xl_base_cased = HappyXLNET("xlnet-base-cased") -xl_large_cased = HappyXLNET("xlnet-large-cased") -``` -#### HappyROBERTA: -```sh -from happytransformer import HappyROBERTA -#--------------------------------------# -happy_roberta_base = HappyROBERTA("roberta-base") -happy_roberta_large = HappyROBERTA("roberta-large") - -``` -#### HappyBERT : -```sh -from happytransformer import HappyBERT -#--------------------------------------# -bert_base_uncased = HappyBERT("bert-base-uncased") -bert_base_cased = HappyBERT("bert-base-cased") -bert_large_uncased = HappyBERT("bert-large-uncased") -bert_large_cased = HappyBERT("bert-large-cased") -``` ## Word Prediction -It is recommended that you use HappyROBERTA("roberta-large") for masked word prediction. -Avoid using HappyBERT for masked word prediction. -If you do decide to use HappyXLNET or HappyBERT, then also use their corresponding "large cased model'. - -For all Happy Transformers, the masked token is **"[MASK]"** - -### Single Mask - -Each Happy Transformer has a public method called "predict_mask(text, options, num_results)" with the following input arguments. -1. Text: the text you wish to predict including a single masked token. -2. options (default = every word): A limited set of words the model can return. -3. num_results (default = 1): The number of returned predictions. - -returns: a list of dictionaries, where each dictionary contains a "word" and "softmax" key - +Initialize a HappyWordPrediction() object to perform word prediction. +Initialization Arguments: + 1. model_type (string): either "ALBERT", "BERT" or "DISTILBERT." The default is "DISTILBERT" + 2. model_name(string): below is a URL that contains potential models. 
+ [MODELS](https://huggingface.co/models?filter=masked-lm)
+
+For all Transformers, the masked token is **"[MASK]"**
 
-#### Example 1 :
-```sh
-from happytransformer import HappyROBERTA
-#--------------------------------------#
-happy_roberta = HappyROBERTA("roberta-large")
-text = "I think therefore I [MASK]"
-results = happy_roberta.predict_mask(text)
-
-print(type(results)) # prints: <class 'list'>
-print(results) # prints: [{'word': 'am', 'softmax': 0.24738965928554535}]
-
-print(type(results[0])) # prints: <class 'dict'>
-print(results[0]) # prints: {'word': 'am', 'softmax': 0.24738965928554535}
-
-
-```
-
-#### Example 2 :
-```sh
-from happytransformer import HappyROBERTA
-#--------------------------------------#
-happy_roberta = HappyROBERTA("roberta-large")
-text = "To solve world poverty we must invest in [MASK]"
-results = happy_roberta.predict_mask(text, num_results = 2)
-
-print(type(results)) # prints: <class 'list'>
-print(results) # prints: [{'word': 'education', 'softmax': 0.34365904331207275}, {'word': 'children', 'softmax': 0.03996562585234642}]
-
-print(type(results[0])) # prints: <class 'dict'>
-print(results[0]) # prints: {'word': 'education', 'softmax': 0.34365904331207275}
-
-
-```
-
-#### Example 3 :
-```sh
-from happytransformer import HappyXLNET
-#--------------------------------------#
-happy_xlnet = HappyXLNET("xlnet-large-cased")
-text = "Can you please pass the [MASK] "
-options = ["pizza", "rice", "tofu", 'eggs', 'milk']
-results = happy_xlnet.predict_mask(text, options=options, num_results=3)
-
-print(type(results)) # prints: <class 'list'>
-print(results) # prints: [{'word': 'tofu', 'softmax': 0.007073382}, {'word': 'pizza', 'softmax': 0.00017212195}, {'word': 'rice', 'softmax': 2.843065e-07}]
-
-print(type(results[1])) # prints: <class 'dict'>
-print(results[1]) # prints: {'word': 'pizza', 'softmax': 0.00017212195}
-
-```
+### Initialization
+
+We recommend using "HappyWordPrediction("ALBERT", "albert-xxlarge-v2")" for the best performance.
+
+#### Example 1.0:
+```python
+from happytransformer import HappyWordPrediction
+# --------------------------------------#
+happy_wp_distilbert = HappyWordPrediction() # default
+happy_wp_albert = HappyWordPrediction("ALBERT", "albert-base-v2")
+happy_wp_bert = HappyWordPrediction("BERT", "bert-base-uncased")
+```
-
-### Single Mask
-
-Each Happy Transformer has a public method called "predict_mask(text, options, num_results)" with the following input arguments.
-1. Text: the text you wish to predict including a single masked token.
-2. options (default = every word): A limited set of words the model can return.
-3. num_results (default = 1): The number of returned predictions.
-
-returns: a list of dictionaries, where each dictionary contains a "word" and "softmax" key
-
+### predict_mask()
+The method predict_mask() contains 3 arguments:
+1. text (string): a body of text that contains a single masked token
+2. targets (list of strings): a list of potential answers. All other answers will be ignored
+3. top_k (int): the number of results that will be returned
+
+Returns:
+A list of named tuples with the fields "token_str" and "score"
+
+Note: if targets are provided, then top_k will be ignored and a score for each target will be returned.
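+
+Since the return value is an ordinary Python list of named tuples, it can be iterated or unpacked
+directly. The snippet below is a minimal sketch of doing so (the sentence is reused from this
+package's tests, and the printed scores are illustrative rather than taken from a real run):
+
+```python
+from happytransformer import HappyWordPrediction
+
+happy_wp = HappyWordPrediction()  # default DISTILBERT model
+for prediction in happy_wp.predict_mask("Please pass the salt and [MASK]", top_k=3):
+    # each entry is a WordPredictionResult named tuple
+    print(f"{prediction.token_str}: {prediction.score:.4f}")
+```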
+#### Example 1.1:
+```python

-#### Example 1 :
-```sh
-from happytransformer import HappyROBERTA
+from happytransformer import HappyWordPrediction
 #--------------------------------------#
-happy_roberta = HappyROBERTA("roberta-large")
-text = "[MASK] have a [MASK] dog and I love [MASK] so much"
-results = happy_roberta.predict_masks(text)
-
-print(type(results)) # prints:
-print(results) # prints: [[{'word': 'i', 'softmax': 0.5861835479736328}], [{'word': 'little', 'softmax': 0.16358524560928345}], [{'word': 'him', 'softmax': 0.6039994359016418}]]
-
-first_mask_result = results[0]
-print(type(first_mask_result)) # prints:
-print(first_mask_result) # prints: [{'word': 'i', 'softmax': 0.5861835479736328}]
-
-first_prediction_result = first_mask_result[0]
-print(type(first_prediction_result)) # prints:
-print(first_prediction_result) # prints: {'word': 'i', 'softmax': 0.5861835479736328}
-
-
-print(type(first_prediction_result["word"])) #
-print(first_prediction_result["word"]) # i
-
+    happy_wp = HappyWordPrediction()  # default uses distilbert-base-uncased
+    result = happy_wp.predict_mask("I think therefore I [MASK]")
+    print(type(result))  #
+    print(result)  # [WordPredictionResult(token_str='am', score=0.10172799974679947)]
+    print(type(result[0]))  #
+    print(result[0])  # WordPredictionResult(token_str='am', score=0.10172799974679947)
+    print(result[0].token_str)  # am
+    print(result[0].score)  # 0.10172799974679947

 ```

-#### Example 2 :
-```sh
+#### Example 1.2:
+```python

-from happytransformer import HappyROBERTA
+from happytransformer import HappyWordPrediction
 #--------------------------------------#
-happy_roberta = HappyROBERTA("roberta-large")
-text = "[MASK] have a [MASK] dog and I love [MASK] so much"
-results = happy_roberta.predict_masks(text, num_results=2)
-
-print(type(results)) # prints:
-print(results) # prints: [[{'word': 'i', 'softmax': 0.5861835479736328}, {'word': 'I', 'softmax': 0.3941880762577057}], [{'word': 'little', 'softmax': 0.16358524560928345}, {'word': 'beautiful', 'softmax': 0.10422931611537933}] ...
- -second_mask_result = results[1] -print(type(second_mask_result)) # prints: -print(second_mask_result) # prints: [{'word': 'little', 'softmax': 0.16358524560928345}, {'word': 'beautiful', 'softmax': 0.10422931611537933}] - -second_prediction_result = second_mask_result[1] -print(type(second_prediction_result)) # prints: -print(second_prediction_result) # prints: {'word': 'beautiful', 'softmax': 0.10422931611537933} - - -print(type(second_prediction_result["word"])) # prints: -print(second_prediction_result["word"]) # prints: beautiful - +happy_wp = HappyWordPrediction("ALBERT", "albert-xxlarge-v2") +result = happy_wp.predict_mask("To better the world I would invest in [MASK] and education.", top_k=2) +print(result) # [WordPredictionResult(token_str='infrastructure', score=0.09270179271697998), WordPredictionResult(token_str='healthcare', score=0.07219093292951584)] +print(result[1]) # WordPredictionResult(token_str='healthcare', score=0.07219093292951584) +print(result[1].token_str) # healthcare ``` -#### Example 3 : -```sh -from happytransformer import HappyROBERTA +#### Example 1.3: +```python +from happytransformer import HappyWordPrediction #--------------------------------------# -happy_roberta = HappyROBERTA("roberta-large") -text = "[MASK] have a [MASK] dog and I love [MASK] so much" - -options = [["We", "You"], ["smart", "massive"], ["him", "myself"]] -results = happy_roberta.predict_masks(text, options=options) - -print(type(results)) # prints: -print(results) # prints: [[{'word': 'We', 'softmax': 0.007347162}, {'word': 'You', 'softmax': 0.0007238558}], [{'word': 'smart', 'softmax': 1.1157575e-06}, {'word': 'massive', 'softmax': 1.0597887e-06}], [{'word': 'him', 'softmax': 0.00021874729}, {'word': 'myself', 'softmax': 3.0992996e-06}]] - - -first_mask_result = results[0] -print(type(first_mask_result)) # prints: -print(first_mask_result) # prints: [{'word': 'We', 'softmax': 0.007347162}, {'word': 'You', 'softmax': 0.0007238558}] +happy_wp = HappyWordPrediction("ALBERT", "albert-xxlarge-v2") +targets = ["technology", "healthcare"] +result = happy_wp.predict_mask("To better the world I would invest in [MASK] and education.", targets=targets) +print(result) # [WordPredictionResult(token_str='healthcare', score=0.07219093292951584), WordPredictionResult(token_str='technology', score=0.032044216990470886)] +print(result[1]) # WordPredictionResult(token_str='technology', score=0.032044216990470886) +print(result[1].token_str) # technology -first_prediction_result = first_mask_result[0] -print(type(first_prediction_result)) # prints: -print(first_prediction_result) # prints: {'word': 'We', 'softmax': 0.007347162} - -print(type(first_prediction_result["word"])) # -print(first_prediction_result["word"]) # We ``` @@ -527,166 +408,6 @@ print(best_answer["softmax"]) # prints: 0.9916905164718628 ``` -## Masked Word Prediction Fine-Tuning - -*Fine-tune a state-of-the-art masked word prediction model with just a text file* - -Each HappyBERT and HappyROBERTA both have 4 methods that are associated with masked word prediction fine-tuning -They are: - -``` -1. init_mwp(args) -2. train_mwp(training_path) -3. eval_mwp(testing_path,batch_size) -4. 
predict_mask(text, options, num_results) -``` - -### init_mwp(args) - -*Initialize the model for masked word prediction training.* - -#### Example 1 -```python -from happytransformer import HappyROBERTA -#----------------------------------------# - -Roberta = HappyROBERTA() - -Roberta.init_train_mwp() # Initialize the training model with default settings - -``` - -You can also customize the training parameters by inputting a dictionary with specific training parameters. The dictionary must have the same keys as the dictionary shown below. -```python -word_prediction_args = { - -"batch_size": 1, - -"epochs": 1, - -"lr": 5e-5, - -"adam_epsilon": 1e-8 - -} -``` -The args are: - -- batch_size: How many sequences the model processes on one iteration. - -- epochs: This refers to how many times the model will train on the same dataset. - -- lr (learning rate): How quickly the model learns. - -- adam_epsilon: This is used to avoid diving by zero when gradient is almost zero. - - -The recommended for the parameters are: - -- lr: 1e-4 used in BERT and ROBERTA [1] - -- Adam Epsilon: 1e-6 used by Huggingface team [2] - -- batch_size: Depend on the user's vram, Typically 2 to 3 - - -#### Example 2 - -```python -from happytransformer import HappyROBERTA -#----------------------------------------# - -happy_roberta = HappyROBERTA() - -word_prediction_args = { -"batch_size": 4, - -"epochs": 2, - -"lr": 3e-5, - -"adam_epsilon": 1e-8 - -} - -happy_roberta.init_train_mwp(word_prediction_args) - -``` - - -### train_mwp(training_path) -*Trains the model on Masked Language Modelling Loss.* - -Argument: -1. testing_path: A string directory path to the .txt that contains the testing data. - -Example training.txt : -``` -I want to get healthy in 2011 . -I want to boost my immune system , cut that nasty dairy out , and start exercising on a regular basis . -That doesn 't seem to hard to follow does it ? -``` - -### eval_mwp(testing_path,batch_size)
-*Evaluates the model on Masked Language Modelling loss and return both perplexity and masked language modelling loss.* - - -Perplexity: Mathematicall it is ![equation](https://latex.codecogs.com/gif.latex?2^{Entropy}) where Entropy is the disorder in the system. Lower the perplexity the better the model is performing. - -Masked language modelling loss: see [BERT Explained: State of the art language model for NLP](https://towardsdatascience.com/perplexity-intuition-and-derivation-105dd481c8f3) for the explanation. - -Arguments: -``` -1. testing_path: A string directory path to the .txt that contains the testing data. -2. batch_size: An integer. Will default to 2. -``` - -Example testing.txt : -``` -In the few short months since Dan 's mother had moved to town , Saturday had gone from being my favourite day of the week to the one I looked forward to the least. -Although she came when Dan was at work , she invariably stayed all day . -It was like living in a goldfish bowl and Dan was on edge the minute he came through the door. -``` - - -Note 2: Evaluating on Cpu is not recommended as it will take considerably longer. - -#### Example 3: - -```python -from happytransformer import HappyROBERTA -#----------------------------------------# - -happy_roberta = HappyROBERTA() -happy_roberta.init_train_mwp(word_prediction_args) - -train_path = "data/train.txt" -happy_roberta.train_mwp(train_path) - -eval_path = "data/eval.txt" -eval_results = happy_roberta.eval_mwp(eval_path) - -print(type(eval_results)) # prints: -print(eval_results) # prints: {'perplexity': 7.863316059112549, 'eval_loss': 2.0622084404198864} - -``` - - -### Predicting masked word with fine-tuned model - -```python -text = "Linear algebra is a branch of [MASK]" - -options = ["music", "mathematics", "geography"] - -results = happy_roberta.predict_mask(text, options=options, num_results=3) - -print(type(results)) # prints: - -print(results) # prints: [{'word': 'mathematics', 'softmax': 0.16551}, {'word': 'music', 'softmax': 3.91739e-05}, {'word': 'geography', 'softmax': 2.9731e-05}] - -``` - ## Tech Happy Transformer uses a number of open source projects: diff --git a/examples/word_prediction/readme_examples.py b/examples/word_prediction/readme_examples.py new file mode 100644 index 00000000..1e31c60b --- /dev/null +++ b/examples/word_prediction/readme_examples.py @@ -0,0 +1,44 @@ +from happytransformer import HappyWordPrediction + + +def example_1_0(): + happy_wp_distilbert = HappyWordPrediction() # default + happy_wp_albert = HappyWordPrediction("ALBERT", "albert-base-v2") + happy_wp_bert = HappyWordPrediction("BERT", "bert-base-uncased") + + +def example_1_1(): + happy_wp = HappyWordPrediction() # default uses distilbert-base-uncased + result = happy_wp.predict_mask("I think therefore I [MASK]") + print(type(result)) # + print(result) # [WordPredictionResult(token_str='am', score=0.10172799974679947)] + print(type(result[0])) # + print(result[0]) # [WordPredictionResult(token_str='am', score=0.10172799974679947)] + print(result[0].token_str) # am + print(result[0].score) # 0.10172799974679947 + +def example_1_2(): + happy_wp = HappyWordPrediction("ALBERT", "albert-xxlarge-v2") + result = happy_wp.predict_mask("To better the world I would invest in [MASK] and education.", top_k=10) + print(result) # [WordPredictionResult(token_str='infrastructure', score=0.09270179271697998), WordPredictionResult(token_str='healthcare', score=0.07219093292951584)] + print(result[1]) # WordPredictionResult(token_str='healthcare', 
score=0.07219093292951584) + print(result[1].token_str) # healthcare + +def example_1_3(): + happy_wp = HappyWordPrediction("ALBERT", "albert-xxlarge-v2") + targets = ["technology", "healthcare"] + result = happy_wp.predict_mask("To better the world I would invest in [MASK] and education.", targets=targets) + print(result) # [WordPredictionResult(token_str='healthcare', score=0.07219093292951584), WordPredictionResult(token_str='technology', score=0.032044216990470886)] + print(result[1]) # WordPredictionResult(token_str='technology', score=0.032044216990470886) + print(result[1].token_str) # technology + + + + +def main(): + example_1_3() + + + +if __name__ == "__main__": + main() \ No newline at end of file From c5e7eef987b7678fdef275d5d293678f5a474f06 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 20:14:39 -0500 Subject: [PATCH 041/155] fixed whitespace --- examples/word_prediction/readme_examples.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/word_prediction/readme_examples.py b/examples/word_prediction/readme_examples.py index 1e31c60b..108ebffe 100644 --- a/examples/word_prediction/readme_examples.py +++ b/examples/word_prediction/readme_examples.py @@ -17,6 +17,7 @@ def example_1_1(): print(result[0].token_str) # am print(result[0].score) # 0.10172799974679947 + def example_1_2(): happy_wp = HappyWordPrediction("ALBERT", "albert-xxlarge-v2") result = happy_wp.predict_mask("To better the world I would invest in [MASK] and education.", top_k=10) @@ -24,6 +25,7 @@ def example_1_2(): print(result[1]) # WordPredictionResult(token_str='healthcare', score=0.07219093292951584) print(result[1].token_str) # healthcare + def example_1_3(): happy_wp = HappyWordPrediction("ALBERT", "albert-xxlarge-v2") targets = ["technology", "healthcare"] @@ -33,12 +35,9 @@ def example_1_3(): print(result[1].token_str) # technology - - def main(): example_1_3() - if __name__ == "__main__": - main() \ No newline at end of file + main() From b6616c9b38021f1f0cc461a87b5e2f5628c42fdd Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 20:16:35 -0500 Subject: [PATCH 042/155] Added features header --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index bcd8038d..5ae104a3 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ ## Table of Contents * [News](#News) -* [Key Features](#Key-Features) +* [Features](#Features) * [Installation](#Installation) * [Word Prediction](#Word-Prediction) * [Text Classification](#Binary-Sequence-Classification) @@ -32,7 +32,7 @@ We're happy to announce that we won a Best Paper Award at the Canadian Undergrad Happy Transformer is an package built on top of [Hugging Face's transformer library](https://huggingface.co/transformers/) that makes it easy to utilize state-of-the-art NLP models. 
- +## Features | Public Methods | Basic Usage | Training | |------------------------------------|--------------|------------| From 2be2fa04f7e491a5d45d99162f331e19204dc4c7 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 20:38:28 -0500 Subject: [PATCH 043/155] Added linebreaks --- tests/test_wp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/test_wp.py b/tests/test_wp.py index e14802de..5b7a429f 100644 --- a/tests/test_wp.py +++ b/tests/test_wp.py @@ -16,7 +16,8 @@ def test_mwp_top_k(): "Please pass the salt and [MASK]", top_k=2 ) - answer = [WordPredictionResult(token_str='pepper', score=0.2664579749107361), WordPredictionResult(token_str='vinegar', score=0.08760260790586472)] + answer = [WordPredictionResult(token_str='pepper', score=0.2664579749107361), + WordPredictionResult(token_str='vinegar', score=0.08760260790586472)] assert result == answer @@ -26,7 +27,8 @@ def test_mwp_targets(): "Please pass the salt and [MASK]", targets=["water", "spices"] ) - answer = [WordPredictionResult(token_str='water', score=0.014856964349746704), WordPredictionResult(token_str='spices', score=0.009040987119078636)] + answer = [WordPredictionResult(token_str='water', score=0.014856964349746704), + WordPredictionResult(token_str='spices', score=0.009040987119078636)] assert result == answer From ff9b0baf75826299040a12ec5d0bae0ca7679db1 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 20:40:00 -0500 Subject: [PATCH 044/155] Added TC linebreaks --- tests/test_tc.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/test_tc.py b/tests/test_tc.py index 71669619..989e56c2 100644 --- a/tests/test_tc.py +++ b/tests/test_tc.py @@ -25,7 +25,9 @@ def test_classify_texts(): happy_tc = HappyTextClassification() input = ["What a great movie", "Horrible movie", "Bad restaurant"] result = happy_tc.classify_text(input) - answer = [{'label': 'POSITIVE', 'score': 0.9998726844787598}, {'label': 'NEGATIVE', 'score': 0.9997945427894592}, {'label': 'NEGATIVE', 'score': 0.9997393488883972}] + answer = [{'label': 'POSITIVE', 'score': 0.9998726844787598}, + {'label': 'NEGATIVE', 'score': 0.9997945427894592}, + {'label': 'NEGATIVE', 'score': 0.9997393488883972}] assert result == answer def test_qa_train(): @@ -57,7 +59,10 @@ def test_qa_test(): happy_tc = HappyTextClassification() result = happy_tc.test("../data/tc/test.csv") - answer = [[{'label': 'POSITIVE', 'score': 0.9998401999473572}], [{'label': 'NEGATIVE', 'score': 0.9772131443023682}], [{'label': 'NEGATIVE', 'score': 0.9966067671775818}], [{'label': 'POSITIVE', 'score': 0.9792295098304749}]] + answer = [[{'label': 'POSITIVE', 'score': 0.9998401999473572}], + [{'label': 'NEGATIVE', 'score': 0.9772131443023682}], + [{'label': 'NEGATIVE', 'score': 0.9966067671775818}], + [{'label': 'POSITIVE', 'score': 0.9792295098304749}]] assert result == answer @@ -82,7 +87,10 @@ def test_qa_test_albert(): happy_tc = HappyTextClassification(model_type="ALBERT", model_name="textattack/albert-base-v2-SST-2") result = happy_tc.test("../data/tc/test.csv") - answer = [[{'label': 'LABEL_1', 'score': 0.9990348815917969}], [{'label': 'LABEL_0', 'score': 0.9947203397750854}], [{'label': 'LABEL_0', 'score': 0.9958302974700928}], [{'label': 'LABEL_1', 'score': 0.9986426830291748}]] + answer = [[{'label': 'LABEL_1', 'score': 0.9990348815917969}], + [{'label': 'LABEL_0', 'score': 0.9947203397750854}], + [{'label': 'LABEL_0', 'score': 0.9958302974700928}], + [{'label': 'LABEL_1', 'score': 
0.9986426830291748}]] assert result == answer @@ -108,7 +116,10 @@ def test_qa_test_bert(): happy_tc = HappyTextClassification(model_type="BERT", model_name="textattack/bert-base-uncased-SST-2") result = happy_tc.test("../data/tc/test.csv") - answer = [[{'label': 'LABEL_1', 'score': 0.9995690584182739}], [{'label': 'LABEL_0', 'score': 0.9981549382209778}], [{'label': 'LABEL_0', 'score': 0.9965545535087585}], [{'label': 'LABEL_1', 'score': 0.9978235363960266}]] + answer = [[{'label': 'LABEL_1', 'score': 0.9995690584182739}], + [{'label': 'LABEL_0', 'score': 0.9981549382209778}], + [{'label': 'LABEL_0', 'score': 0.9965545535087585}], + [{'label': 'LABEL_1', 'score': 0.9978235363960266}]] assert result == answer From 03b60601d6891b7f296a2ceaa3a3717ba5bd5aeb Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 20:40:52 -0500 Subject: [PATCH 045/155] added QA linebreaks --- tests/test_qa.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/test_qa.py b/tests/test_qa.py index 58ce1a25..04a30553 100644 --- a/tests/test_qa.py +++ b/tests/test_qa.py @@ -15,7 +15,9 @@ def test_qa_answer_question(): def test_qa_answer_question_top_k(): happy_qa = HappyQuestionAnswering() result = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?", topk=3) - answer = [{'score': 0.9696964621543884, 'start': 16, 'end': 32, 'answer': 'January 8th 2021'}, {'score': 0.02050216868519783, 'start': 16, 'end': 27, 'answer': 'January 8th'}, {'score': 0.005092293489724398, 'start': 16, 'end': 23, 'answer': 'January'}] + answer = [{'score': 0.9696964621543884, 'start': 16, 'end': 32, 'answer': 'January 8th 2021'}, + {'score': 0.02050216868519783, 'start': 16, 'end': 27, 'answer': 'January 8th'}, + {'score': 0.005092293489724398, 'start': 16, 'end': 23, 'answer': 'January'}] assert result == answer def test_qa_train(): @@ -32,7 +34,8 @@ def test_qa_eval(): def test_qa_test(): happy_qa = HappyQuestionAnswering() result = happy_qa.test("../data/qa/test.csv") - answer = [{'score': 0.9939756989479065, 'start': 0, 'end': 12, 'answer': 'October 31st'}, {'score': 0.967872679233551, 'start': 12, 'end': 25, 'answer': 'November 23rd'}] + answer = [{'score': 0.9939756989479065, 'start': 0, 'end': 12, 'answer': 'October 31st'}, + {'score': 0.967872679233551, 'start': 12, 'end': 25, 'answer': 'November 23rd'}] assert result == answer @@ -65,7 +68,8 @@ def test_qa_train_effectiveness_albert(): def test_qa_test_albert(): happy_qa = HappyQuestionAnswering("ALBERT", "twmkn9/albert-base-v2-squad2") result = happy_qa.test("../data/qa/test.csv") - answer = [{'score': 0.988578736782074, 'start': 0, 'end': 12, 'answer': 'October 31st'}, {'score': 0.9833534359931946, 'start': 12, 'end': 25, 'answer': 'November 23rd'}] + answer = [{'score': 0.988578736782074, 'start': 0, 'end': 12, 'answer': 'October 31st'}, + {'score': 0.9833534359931946, 'start': 12, 'end': 25, 'answer': 'November 23rd'}] assert result == answer @@ -85,5 +89,6 @@ def test_qa_train_effectiveness_bert(): def test_qa_test_bert(): happy_qa = HappyQuestionAnswering("BERT", "mrm8488/bert-tiny-5-finetuned-squadv2") result = happy_qa.test("../data/qa/test.csv") - answer = [{'score': 0.9352769255638123, 'start': 0, 'end': 12, 'answer': 'October 31st'}, {'score': 0.9180678129196167, 'start': 12, 'end': 25, 'answer': 'November 23rd'}] + answer = [{'score': 0.9352769255638123, 'start': 0, 'end': 12, 'answer': 'October 31st'}, + {'score': 0.9180678129196167, 'start': 12, 'end': 25, 'answer': 'November 23rd'}] assert 
result == answer \ No newline at end of file From 836633eaaef2e224bf5255811e46aee594efb222 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sat, 9 Jan 2021 23:05:03 -0500 Subject: [PATCH 046/155] Added Ted's feedback --- README.md | 2 +- examples/word_prediction/readme_examples.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 5ae104a3..40e32c38 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,7 @@ from happytransformer import HappyWordPrediction result = happy_wp.predict_mask("I think therefore I [MASK]") print(type(result)) # print(result) # [WordPredictionResult(token_str='am', score=0.10172799974679947)] - print(type(result[0])) # + print(type(result[0])) # print(result[0]) # [WordPredictionResult(token_str='am', score=0.10172799974679947)] print(result[0].token_str) # am print(result[0].score) # 0.10172799974679947 diff --git a/examples/word_prediction/readme_examples.py b/examples/word_prediction/readme_examples.py index 108ebffe..829129b9 100644 --- a/examples/word_prediction/readme_examples.py +++ b/examples/word_prediction/readme_examples.py @@ -12,7 +12,7 @@ def example_1_1(): result = happy_wp.predict_mask("I think therefore I [MASK]") print(type(result)) # print(result) # [WordPredictionResult(token_str='am', score=0.10172799974679947)] - print(type(result[0])) # + print(type(result[0])) # print(result[0]) # [WordPredictionResult(token_str='am', score=0.10172799974679947)] print(result[0].token_str) # am print(result[0].score) # 0.10172799974679947 @@ -36,7 +36,7 @@ def example_1_3(): def main(): - example_1_3() + example_1_1() if __name__ == "__main__": From 99225647172f72d815246632824c0a5bdf7d4072 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sun, 10 Jan 2021 01:06:34 -0500 Subject: [PATCH 047/155] QA now uses named tuple outputs --- happytransformer/happy_question_answering.py | 27 +++++++++++++++----- happytransformer/happy_trainer.py | 4 +-- happytransformer/qa/trainer.py | 4 +-- tests/test_qa.py | 24 ++++++++--------- 4 files changed, 37 insertions(+), 22 deletions(-) diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index f0a93833..bd4f8c88 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -6,6 +6,7 @@ import torch from happytransformer.happy_transformer import HappyTransformer from happytransformer.qa.trainer import QATrainer +from collections import namedtuple from happytransformer.qa.default_args \ import ARGS_QA_TRAIN from transformers import ( @@ -18,7 +19,7 @@ QuestionAnsweringPipeline, ) - +QuestionAnsweringResult = namedtuple("QuestionAnsweringResult", [ "answer", "score", "start", "end"]) class HappyQuestionAnswering(HappyTransformer): """ @@ -61,14 +62,28 @@ def __init__(self, model_type="DISTILBERT", def answer_question(self, context, question, topk=1): """ - :param context: background information to answer the question (string) :param question: A question that can be answered with the given context (string) :param topk: how many results - :return: if topk =1, a dictionary that contains the keys: score, start, end and answer - if topk >1, a list of dictionaries described above + :return: A list of a named tuples that contains the keys: answer, score, start and end + """ - return self._pipeline(context=context, question=question, topk=topk) + + result = self._pipeline(context=context, question=question, topk=topk) + # transformers returns a single dictionary when topk ==1. 
+ # Our convention however is to have constant output format + if topk == 1: + result = [result] + + results = [ + QuestionAnsweringResult( + answer=answer["answer"], + score=answer["score"], + start=answer["start"], + end=answer["end"],) + for answer in result + ] + return results def train(self, input_filepath, args=ARGS_QA_TRAIN): """ @@ -109,4 +124,4 @@ def test(self, input_filepath): return: A list of dictionaries. Each dictionary contains the keys: "score", "start", "end" and "answer" """ - return self._trainer.test(input_filepath=input_filepath, pipeline=self._pipeline) + return self._trainer.test(input_filepath=input_filepath, solve=self.answer_question) diff --git a/happytransformer/happy_trainer.py b/happytransformer/happy_trainer.py index 7e3d4029..fb871f00 100644 --- a/happytransformer/happy_trainer.py +++ b/happytransformer/happy_trainer.py @@ -23,11 +23,11 @@ def train(self, input_filepath, args): """ raise NotImplementedError() - def test(self, input_filepath, pipeline): + def test(self, input_filepath, solve): """ :param input_filepath: A string to file location - :param pipeline: an initialized transformer pipeline for the given task + :param solve: a method for using the model for the given task :return: test results """ raise NotImplementedError() diff --git a/happytransformer/qa/trainer.py b/happytransformer/qa/trainer.py index 182fc030..ad23e521 100644 --- a/happytransformer/qa/trainer.py +++ b/happytransformer/qa/trainer.py @@ -46,7 +46,7 @@ def eval(self, input_filepath): return self._run_eval(dataset) - def test(self, input_filepath, pipeline): + def test(self, input_filepath, solve): """ See docstring in HappyQuestionAnswering.test() @@ -58,7 +58,7 @@ def test(self, input_filepath, pipeline): for case in tqdm(zip(contexts, questions)): context = case[0] question = case[1] - result = pipeline(question, context) + result = solve(context, question)[0] # only care about first result results.append(result) diff --git a/tests/test_qa.py b/tests/test_qa.py index 04a30553..3122eb53 100644 --- a/tests/test_qa.py +++ b/tests/test_qa.py @@ -2,22 +2,22 @@ Tests for the question answering training, evaluating and testing functionality """ -from happytransformer.happy_question_answering import HappyQuestionAnswering +from happytransformer.happy_question_answering import HappyQuestionAnswering, QuestionAnsweringResult def test_qa_answer_question(): happy_qa = HappyQuestionAnswering() result = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?") - answer = {'score': 0.9696964621543884, 'start': 16, 'end': 32, 'answer': 'January 8th 2021'} + answer = [QuestionAnsweringResult(answer='January 8th 2021', score=0.9696964621543884, start=16, end=32)] assert result == answer def test_qa_answer_question_top_k(): happy_qa = HappyQuestionAnswering() result = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?", topk=3) - answer = [{'score': 0.9696964621543884, 'start': 16, 'end': 32, 'answer': 'January 8th 2021'}, - {'score': 0.02050216868519783, 'start': 16, 'end': 27, 'answer': 'January 8th'}, - {'score': 0.005092293489724398, 'start': 16, 'end': 23, 'answer': 'January'}] + answer = [QuestionAnsweringResult(answer='January 8th 2021', score=0.9696964621543884, start=16, end=32), + QuestionAnsweringResult(answer='January 8th', score=0.02050216868519783, start=16, end=27), + QuestionAnsweringResult(answer='January', score=0.005092293489724398, start=16, end=23)] assert result == answer def test_qa_train(): @@ -34,8 +34,8 @@ def 
test_qa_eval(): def test_qa_test(): happy_qa = HappyQuestionAnswering() result = happy_qa.test("../data/qa/test.csv") - answer = [{'score': 0.9939756989479065, 'start': 0, 'end': 12, 'answer': 'October 31st'}, - {'score': 0.967872679233551, 'start': 12, 'end': 25, 'answer': 'November 23rd'}] + answer = [QuestionAnsweringResult(answer='October 31st', score=0.9939756989479065, start=0, end=12), + QuestionAnsweringResult(answer='November 23rd', score=0.967872679233551, start=12, end=25)] assert result == answer @@ -68,8 +68,8 @@ def test_qa_train_effectiveness_albert(): def test_qa_test_albert(): happy_qa = HappyQuestionAnswering("ALBERT", "twmkn9/albert-base-v2-squad2") result = happy_qa.test("../data/qa/test.csv") - answer = [{'score': 0.988578736782074, 'start': 0, 'end': 12, 'answer': 'October 31st'}, - {'score': 0.9833534359931946, 'start': 12, 'end': 25, 'answer': 'November 23rd'}] + answer = [QuestionAnsweringResult(answer='October 31st', score=0.988578736782074, start=0, end=12), + QuestionAnsweringResult(answer='November 23rd', score=0.9833534359931946, start=12, end=25)] assert result == answer @@ -89,6 +89,6 @@ def test_qa_train_effectiveness_bert(): def test_qa_test_bert(): happy_qa = HappyQuestionAnswering("BERT", "mrm8488/bert-tiny-5-finetuned-squadv2") result = happy_qa.test("../data/qa/test.csv") - answer = [{'score': 0.9352769255638123, 'start': 0, 'end': 12, 'answer': 'October 31st'}, - {'score': 0.9180678129196167, 'start': 12, 'end': 25, 'answer': 'November 23rd'}] - assert result == answer \ No newline at end of file + answer = [QuestionAnsweringResult(answer='October 31st', score=0.9352769255638123, start=0, end=12), + QuestionAnsweringResult(answer='November 23rd', score=0.9180678129196167, start=12, end=25)] + assert result == answer From cf0a3a091b0ccf14fb50161b96df19a25102e34a Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sun, 10 Jan 2021 01:07:29 -0500 Subject: [PATCH 048/155] cleaning --- happytransformer/happy_question_answering.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index bd4f8c88..44113e69 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -2,11 +2,10 @@ Contains the HappyQuestionAnswering class. 
""" - +from collections import namedtuple import torch from happytransformer.happy_transformer import HappyTransformer from happytransformer.qa.trainer import QATrainer -from collections import namedtuple from happytransformer.qa.default_args \ import ARGS_QA_TRAIN from transformers import ( @@ -19,7 +18,7 @@ QuestionAnsweringPipeline, ) -QuestionAnsweringResult = namedtuple("QuestionAnsweringResult", [ "answer", "score", "start", "end"]) +QuestionAnsweringResult = namedtuple("QuestionAnsweringResult", ["answer", "score", "start", "end"]) class HappyQuestionAnswering(HappyTransformer): """ From 2e8c6983153823d5d0bd18fdc13ed359b7831ae5 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sun, 10 Jan 2021 01:19:53 -0500 Subject: [PATCH 049/155] Changed TC outputs to named tuple --- happytransformer/happy_text_classification.py | 20 ++++++--- happytransformer/tc/trainer.py | 4 +- tests/test_tc.py | 42 +++++++------------ 3 files changed, 30 insertions(+), 36 deletions(-) diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index 62148ce4..d1cc0dea 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -1,7 +1,7 @@ """ Contains a class called HappyTextClassification that performs text classification """ - +from collections import namedtuple import torch from transformers import ( @@ -21,6 +21,8 @@ from happytransformer.tc.default_args import ARGS_TC_TRAIN +TextClassificationResult = namedtuple("TextClassificationResult", ["label", "score"]) + class HappyTextClassification(HappyTransformer): """ A user facing class for Text Classification @@ -59,11 +61,17 @@ def __init__(self, model_type="DISTILBERT", def classify_text(self, text): """ - :param text: A text string to be classified, or a list of strings - :return: either a single dictionary with keys: label and score, - or a list of these dictionaries with the same keys + :param text: A text string to be classified + :return: A dictionary with keys: label and score, """ - return self._pipeline(text) + # Blocking allowing a for a list of strings + if not isinstance(text, str): + raise ValueError("the \"text\" argument must be a single string") + result = self._pipeline(text) + # we do not support predicting a list of texts, so only first prediction is relevant + result = result[0] + + return TextClassificationResult(label=result["label"], score=result["score"]) def train(self, input_filepath, args=ARGS_TC_TRAIN): @@ -102,4 +110,4 @@ def test(self, input_filepath): text return: #todo """ - return self._trainer.test(input_filepath=input_filepath, pipeline=self._pipeline) + return self._trainer.test(input_filepath=input_filepath, solve=self.classify_text) diff --git a/happytransformer/tc/trainer.py b/happytransformer/tc/trainer.py index 09206bd7..e3704211 100644 --- a/happytransformer/tc/trainer.py +++ b/happytransformer/tc/trainer.py @@ -32,7 +32,7 @@ def eval(self, input_filepath): return self._run_eval(eval_dataset) - def test(self, input_filepath, pipeline): + def test(self, input_filepath, solve): """ See docstring in HappyQuestionAnswering.test() solve: HappyQuestionAnswering.answers_to_question() @@ -42,7 +42,7 @@ def test(self, input_filepath, pipeline): results = list() for context in tqdm(contexts): - result = pipeline(context) + result = solve(context) results.append(result) return results diff --git a/tests/test_tc.py b/tests/test_tc.py index 989e56c2..49cf4f3a 100644 --- a/tests/test_tc.py +++ b/tests/test_tc.py @@ -2,7 +2,7 @@ 
Tests for Text Classification Functionality """ -from happytransformer.happy_text_classification import HappyTextClassification +from happytransformer.happy_text_classification import HappyTextClassification, TextClassificationResult def test_classify_text(): """ @@ -12,23 +12,9 @@ def test_classify_text(): """ happy_tc = HappyTextClassification() result = happy_tc.classify_text("What a great movie") - print(result) - answer = [{'label': 'POSITIVE', 'score': 0.9998726844787598}] + answer = TextClassificationResult(label='POSITIVE', score=0.9998726844787598) assert result == answer -def test_classify_texts(): - """ - Tests - HappyQuestionAnswering.classify_text() - - """ - happy_tc = HappyTextClassification() - input = ["What a great movie", "Horrible movie", "Bad restaurant"] - result = happy_tc.classify_text(input) - answer = [{'label': 'POSITIVE', 'score': 0.9998726844787598}, - {'label': 'NEGATIVE', 'score': 0.9997945427894592}, - {'label': 'NEGATIVE', 'score': 0.9997393488883972}] - assert result == answer def test_qa_train(): """ @@ -59,10 +45,10 @@ def test_qa_test(): happy_tc = HappyTextClassification() result = happy_tc.test("../data/tc/test.csv") - answer = [[{'label': 'POSITIVE', 'score': 0.9998401999473572}], - [{'label': 'NEGATIVE', 'score': 0.9772131443023682}], - [{'label': 'NEGATIVE', 'score': 0.9966067671775818}], - [{'label': 'POSITIVE', 'score': 0.9792295098304749}]] + answer = [TextClassificationResult(label='POSITIVE', score=0.9998401999473572), + TextClassificationResult(label='NEGATIVE', score=0.9772131443023682), + TextClassificationResult(label='NEGATIVE', score=0.9966067671775818), + TextClassificationResult(label='POSITIVE', score=0.9792295098304749)] assert result == answer @@ -87,10 +73,10 @@ def test_qa_test_albert(): happy_tc = HappyTextClassification(model_type="ALBERT", model_name="textattack/albert-base-v2-SST-2") result = happy_tc.test("../data/tc/test.csv") - answer = [[{'label': 'LABEL_1', 'score': 0.9990348815917969}], - [{'label': 'LABEL_0', 'score': 0.9947203397750854}], - [{'label': 'LABEL_0', 'score': 0.9958302974700928}], - [{'label': 'LABEL_1', 'score': 0.9986426830291748}]] + answer = [TextClassificationResult(label='LABEL_1', score=0.9990348815917969), + TextClassificationResult(label='LABEL_0', score=0.9947203397750854), + TextClassificationResult(label='LABEL_0', score=0.9958302974700928), + TextClassificationResult(label='LABEL_1', score=0.9986426830291748)] assert result == answer @@ -116,10 +102,10 @@ def test_qa_test_bert(): happy_tc = HappyTextClassification(model_type="BERT", model_name="textattack/bert-base-uncased-SST-2") result = happy_tc.test("../data/tc/test.csv") - answer = [[{'label': 'LABEL_1', 'score': 0.9995690584182739}], - [{'label': 'LABEL_0', 'score': 0.9981549382209778}], - [{'label': 'LABEL_0', 'score': 0.9965545535087585}], - [{'label': 'LABEL_1', 'score': 0.9978235363960266}]] + answer = [TextClassificationResult(label='LABEL_1', score=0.9995690584182739), + TextClassificationResult(label='LABEL_0', score=0.9981549382209778), + TextClassificationResult(label='LABEL_0', score=0.9965545535087585), + TextClassificationResult(label='LABEL_1', score=0.9978235363960266)] assert result == answer From 7375b1cb51ffb2ab12b44af24bed81b711d17aba Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sun, 10 Jan 2021 01:27:44 -0500 Subject: [PATCH 050/155] changed eval output to named tuple for TC --- happytransformer/happy_trainer.py | 4 +++- happytransformer/tc/trainer.py | 6 ++++-- tests/test_tc.py | 10 +++++----- 3 files changed, 12 
insertions(+), 8 deletions(-) diff --git a/happytransformer/happy_trainer.py b/happytransformer/happy_trainer.py index fb871f00..86c5bf85 100644 --- a/happytransformer/happy_trainer.py +++ b/happytransformer/happy_trainer.py @@ -1,10 +1,12 @@ """ Parent class for training classes, such as TCTrainer and QATrainer """ - +from collections import namedtuple import tempfile from transformers import TrainingArguments, Trainer +# may eventually add more metrics like accuracy +EvalResult = namedtuple("EvalResult", ["eval_loss"]) class HappyTrainer: def __init__(self, model, model_type, tokenizer, device, logger): diff --git a/happytransformer/tc/trainer.py b/happytransformer/tc/trainer.py index e3704211..ef333102 100644 --- a/happytransformer/tc/trainer.py +++ b/happytransformer/tc/trainer.py @@ -9,7 +9,7 @@ import csv import torch -from happytransformer.happy_trainer import HappyTrainer +from happytransformer.happy_trainer import HappyTrainer, EvalResult from tqdm import tqdm @@ -30,7 +30,9 @@ def eval(self, input_filepath): eval_encodings = self.tokenizer(contexts, truncation=True, padding=True) eval_dataset = TextClassificationDataset(eval_encodings, labels) - return self._run_eval(eval_dataset) + result = self._run_eval(eval_dataset) + print(result) + return EvalResult(eval_loss=result["eval_loss"]) def test(self, input_filepath, solve): """ diff --git a/tests/test_tc.py b/tests/test_tc.py index 49cf4f3a..1a780ae5 100644 --- a/tests/test_tc.py +++ b/tests/test_tc.py @@ -34,7 +34,7 @@ def test_qa_eval(): """ happy_tc = HappyTextClassification() results = happy_tc.eval("../data/tc/train-eval.csv") - assert results["eval_loss"] == 0.007262040860950947 + assert results.eval_loss == 0.007262040860950947 def test_qa_test(): @@ -88,9 +88,9 @@ def test_qa_train_effectiveness_albert(): """ happy_tc = HappyTextClassification(model_type="ALBERT", model_name="textattack/albert-base-v2-SST-2") - before_loss = happy_tc.eval("../data/tc/train-eval.csv")["eval_loss"] + before_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss happy_tc.train("../data/tc/train-eval.csv") - after_loss = happy_tc.eval("../data/tc/train-eval.csv")["eval_loss"] + after_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss assert after_loss < before_loss @@ -117,7 +117,7 @@ def test_qa_train_effectiveness_bert(): """ happy_tc = HappyTextClassification(model_type="BERT", model_name="textattack/bert-base-uncased-SST-2") - before_loss = happy_tc.eval("../data/tc/train-eval.csv")["eval_loss"] + before_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss happy_tc.train("../data/tc/train-eval.csv") - after_loss = happy_tc.eval("../data/tc/train-eval.csv")["eval_loss"] + after_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss assert after_loss < before_loss From 3d1e9e9812264e426fe94a1247dc4ecf049221ab Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sun, 10 Jan 2021 01:30:56 -0500 Subject: [PATCH 051/155] changed eval output to named tuple for QA --- happytransformer/happy_question_answering.py | 2 +- happytransformer/qa/trainer.py | 7 ++++--- tests/test_qa.py | 14 +++++++------- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index 44113e69..c4f898a8 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -110,7 +110,7 @@ def eval(self, input_filepath): return: A dictionary that contains a key called "eval_loss" """ - return 
self._trainer.eval(input_filepath=input_filepath,) + return self._trainer.eval(input_filepath=input_filepath) def test(self, input_filepath): """ diff --git a/happytransformer/qa/trainer.py b/happytransformer/qa/trainer.py index ad23e521..32b8ce5c 100644 --- a/happytransformer/qa/trainer.py +++ b/happytransformer/qa/trainer.py @@ -10,7 +10,7 @@ import csv from tqdm import tqdm import torch -from happytransformer.happy_trainer import HappyTrainer +from happytransformer.happy_trainer import HappyTrainer, EvalResult class QATrainer(HappyTrainer): """ @@ -42,8 +42,9 @@ def eval(self, input_filepath): self.__add_end_idx(contexts, answers) encodings = self.tokenizer(contexts, questions, truncation=True, padding=True) self.__add_token_positions(encodings, answers) - dataset = QuestionAnsweringDataset(encodings) - return self._run_eval(dataset) + eval_dataset = QuestionAnsweringDataset(encodings) + result = self._run_eval(eval_dataset) + return EvalResult(eval_loss=result["eval_loss"]) def test(self, input_filepath, solve): diff --git a/tests/test_qa.py b/tests/test_qa.py index 3122eb53..50760d13 100644 --- a/tests/test_qa.py +++ b/tests/test_qa.py @@ -28,7 +28,7 @@ def test_qa_train(): def test_qa_eval(): happy_qa = HappyQuestionAnswering() result = happy_qa.eval("../data/qa/train-eval.csv") - assert result["eval_loss"] == 0.11738169193267822 + assert result.eval_loss == 0.11738169193267822 def test_qa_test(): @@ -46,9 +46,9 @@ def test_qa_train_effectiveness(): """ happy_qa = HappyQuestionAnswering() - before_loss = happy_qa.eval("../data/qa/train-eval.csv")["eval_loss"] + before_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss happy_qa.train("../data/qa/train-eval.csv") - after_loss = happy_qa.eval("../data/qa/train-eval.csv")["eval_loss"] + after_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss assert after_loss < before_loss @@ -59,9 +59,9 @@ def test_qa_train_effectiveness_albert(): """ happy_qa = HappyQuestionAnswering("ALBERT", "twmkn9/albert-base-v2-squad2") - before_loss = happy_qa.eval("../data/qa/train-eval.csv")["eval_loss"] + before_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss happy_qa.train("../data/qa/train-eval.csv") - after_loss = happy_qa.eval("../data/qa/train-eval.csv")["eval_loss"] + after_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss assert after_loss < before_loss @@ -80,9 +80,9 @@ def test_qa_train_effectiveness_bert(): """ happy_qa = HappyQuestionAnswering("BERT", "mrm8488/bert-tiny-5-finetuned-squadv2") - before_loss = happy_qa.eval("../data/qa/train-eval.csv")["eval_loss"] + before_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss happy_qa.train("../data/qa/train-eval.csv") - after_loss = happy_qa.eval("../data/qa/train-eval.csv")["eval_loss"] + after_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss assert after_loss < before_loss From 77482f33b42619eb3c00347aa4df7e2bff32bd1c Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sun, 10 Jan 2021 01:50:08 -0500 Subject: [PATCH 052/155] removed print --- happytransformer/tc/trainer.py | 1 - 1 file changed, 1 deletion(-) diff --git a/happytransformer/tc/trainer.py b/happytransformer/tc/trainer.py index ef333102..3e10575b 100644 --- a/happytransformer/tc/trainer.py +++ b/happytransformer/tc/trainer.py @@ -31,7 +31,6 @@ def eval(self, input_filepath): eval_dataset = TextClassificationDataset(eval_encodings, labels) result = self._run_eval(eval_dataset) - print(result) return EvalResult(eval_loss=result["eval_loss"]) def test(self, input_filepath, solve): 
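Editorial note between commits, to make the refactor in PATCHes 047 through 052 easier to follow: Hugging Face pipelines return plain dicts, and the question-answering pipeline returns a bare dict rather than a list when only one answer is requested. The patches normalize both quirks into lists of named tuples. Below is a condensed, self-contained sketch of that convention; the names mirror the patched modules, and the block is illustrative rather than part of any commit.

```python
from collections import namedtuple

QuestionAnsweringResult = namedtuple("QuestionAnsweringResult",
                                     ["answer", "score", "start", "end"])

def normalize_qa_output(raw):
    # The transformers QA pipeline yields a single dict when topk == 1
    # and a list of dicts otherwise; always hand callers a list of tuples.
    if isinstance(raw, dict):
        raw = [raw]
    return [QuestionAnsweringResult(answer=r["answer"], score=r["score"],
                                    start=r["start"], end=r["end"])
            for r in raw]

# Hypothetical raw pipeline output, shaped like the values in tests/test_qa.py:
raw = {"answer": "January 8th 2021", "score": 0.9697, "start": 16, "end": 32}
print(normalize_qa_output(raw)[0].answer)  # January 8th 2021
```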
From 7a34ef870e12dd9df58ad0796b18f7fa94d80bca Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Sun, 10 Jan 2021 12:08:07 -0500
Subject: [PATCH 053/155] Added Ted's feedback

---
 happytransformer/happy_text_classification.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py
index d1cc0dea..50d88bb7 100644
--- a/happytransformer/happy_text_classification.py
+++ b/happytransformer/happy_text_classification.py
@@ -67,11 +67,11 @@ def classify_text(self, text):
         # Block the caller from passing a list of strings
         if not isinstance(text, str):
             raise ValueError("the \"text\" argument must be a single string")
-        result = self._pipeline(text)
+        results = self._pipeline(text)
         # we do not support predicting a list of texts, so only first prediction is relevant
-        result = result[0]
+        first_result = results[0]
 
-        return TextClassificationResult(label=result["label"], score=result["score"])
+        return TextClassificationResult(label=first_result["label"], score=first_result["score"])
 

From 23d9d6e01313d87c39893948f03520641a8109d3 Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Sun, 10 Jan 2021 13:46:34 -0500
Subject: [PATCH 054/155] QA: changed topk to top_k

---
 happytransformer/happy_question_answering.py | 10 +++++-----
 tests/test_qa.py                             |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py
index c4f898a8..221048cc 100644
--- a/happytransformer/happy_question_answering.py
+++ b/happytransformer/happy_question_answering.py
@@ -59,19 +59,19 @@ def __init__(self, model_type="DISTILBERT",
 
         self._trainer = QATrainer(model, model_type, tokenizer, self._device, self.logger)
 
-    def answer_question(self, context, question, topk=1):
+    def answer_question(self, context, question, top_k=1):
         """
         :param context: background information to answer the question (string)
         :param question: A question that can be answered with the given context (string)
-        :param topk: how many results
+        :param top_k: how many results
         :return: A list of a named tuples that contains the keys: answer, score, start and end
 
         """
 
-        result = self._pipeline(context=context, question=question, topk=topk)
-        # transformers returns a single dictionary when topk ==1.
+        result = self._pipeline(context=context, question=question, topk=top_k)
+        # transformers returns a single dictionary when top_k ==1.
# Our convention however is to have constant output format - if topk == 1: + if top_k == 1: result = [result] results = [ diff --git a/tests/test_qa.py b/tests/test_qa.py index 50760d13..4b62a8f9 100644 --- a/tests/test_qa.py +++ b/tests/test_qa.py @@ -14,7 +14,7 @@ def test_qa_answer_question(): def test_qa_answer_question_top_k(): happy_qa = HappyQuestionAnswering() - result = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?", topk=3) + result = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?", top_k=3) answer = [QuestionAnsweringResult(answer='January 8th 2021', score=0.9696964621543884, start=16, end=32), QuestionAnsweringResult(answer='January 8th', score=0.02050216868519783, start=16, end=27), QuestionAnsweringResult(answer='January', score=0.005092293489724398, start=16, end=23)] From 1050fea3b7dac8b3718944c6463465519e4c92f5 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sun, 10 Jan 2021 15:31:46 -0500 Subject: [PATCH 055/155] QA: Updated README --- README.md | 372 ++++++++++++++++++++++-------------------------------- 1 file changed, 151 insertions(+), 221 deletions(-) diff --git a/README.md b/README.md index 40e32c38..ebbfd05a 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,9 @@ * [Features](#Features) * [Installation](#Installation) * [Word Prediction](#Word-Prediction) -* [Text Classification](#Binary-Sequence-Classification) -* [Next Sentence Prediction](#Next-Sentence-Prediction) +* [Text Classification](#Text-Classification) * [Question Answering](#Question-Answering) +* [Next Sentence Prediction](#Next-Sentence-Prediction) * [Tech](#Tech) * [Call For Contributors](#Call-For-Contributors) * [Maintainers](#Maintainers) @@ -36,16 +36,16 @@ Happy Transformer is an package built on top of [Hugging Face's transformer libr | Public Methods | Basic Usage | Training | |------------------------------------|--------------|------------| +| Word Prediction | ✔ | | | Text Classification | ✔ | ✔ | | Question Answering | ✔ | ✔ | -| Word Prediction | ✔ | | | Next Sentence Prediction | ✔ | | | Public Methods | ALBERT | BERT |DISTILBERT | |------------------------------------|--------------|------------|-----------| +| Word Prediction | ✔ | ✔ | ✔ | | Text Classification | ✔ | ✔ | ✔ | | Question Answering | ✔ | ✔ | ✔ | -| Word Prediction | ✔ | ✔ | ✔ | | Next Sentence Prediction | ✔ | ✔ | ✔ | ## Installation @@ -138,274 +138,208 @@ print(result[1].token_str) # technology ``` - -## Binary Sequence Classification - -Binary sequence classification (BSC) has many applications. For example, by using BSC, you can train a model to predict if a yelp review is positive or negative. -Another example includes determining if an email is spam or ham. - -Each Happy Transformer has four methods that are utilized for binary sequence classification: - -1. init_sequence_classifier() -2. custom_init_sequence_classifier(args) -3. train_sequence_classifier(train_csv_path) -4. eval_sequence_classifier(eval_csv_path) +## Text Classification -### init_sequence_classifier() -Initialize binary sequence classification for the HappyTransformer object with the default settings. +## Question Answering -### train_sequence_classifier(train_csv_path): -Trains the HappyTransformer's sequence classifier. - -One of the two init sequence classifier methods must be called before this method can be called. - -Argument: - 1. train_csv_path: A string directory path to the csv that contains the training data. 
+
 ##### Single Answer
 **HappyBERT** has a method called "answer_question" which is used for question answering tasks.
 The method takes the following arguments:
-##### train_csv requirements:
- 1. The csv must contain *NO* header.
- 2. Each row contains a training case.
- 3. The first column contains either a 0 or a 1 to indicate whether the training case is for case "0" or case "1".
- 4. The second column contains the text for the training case
-#### Example 1
-| | |
-|---|--------------------------------------------------------------|
-| 0 | Terrible service and awful food |
-| 1 | My new favourite Chinese restaurant!!!! |
-| 1 | Amazing food and okay service. Overall a great place to eat |
-| 0 | The restaurant smells horrible. |
 
     1. question: The question to be answered
     2. text: The text containing the answer to the question
 
-This method does not return anything
 The output from the method is the answer to the question, returned as a string.
 
-### eval_sequence_classifier(eval_csv_path):
-Evaluates the trained model against an input.
 
-train_sequence_classifier(train_csv_path): must be called before this method can be called.
+## Question Answering
 
-Argument:
+Initialize a HappyQuestionAnswering() object to perform question answering.
 
- 1. eval_csv_path: A string directory path to the csv that contains the evaluating data.
+This model answers a question given a body of text that is relevant to the question.
 
-##### eval_csv requirements: (same as train_csv requirements)
- 1. The csv must contain *NO* header.
- 2. Each row contains a training case.
- 3. The first column contains either a 0 or a 1 to indicate whether the training case is for case "0" or case "1".
- 4. The second column contains the text for the training case
-
-**Returns** a python dictionary that contains a count for the following values
+The outputted answer is always a text span taken from the provided information.
 
-*true_positive:* The model correctly predicted the value 1 .
-*true_negative:* The model correctly predicted the value 0.
-*false_positive':* The model incorrectly predicted the value 1.
-*false_negative* The model incorrectly predicted the value 0.
+Initialization Arguments:
+ 1. model_type (string): either "ALBERT", "BERT" or "DISTILBERT." The default is "DISTILBERT"
+ 2. model_name(string): below is a URL that contains potential models.
+ [MODELS](https://huggingface.co/models?filter=question-answering)
 
+### Initialization
 
-### test_sequence_classifier(test_csv_path):
-Tests the trained model against an input.
+We recommend using "HappyQuestionAnswering("ALBERT", "mfeb/albert-xxlarge-v2-squad2")" for the best performance.
 
-train_sequence_classifier(train_csv_path): must be called before this method can be called.
 
-Argument:
- 1. test_csv_path: A string directory path to the csv that contains the testing data
+#### Example 3.0:
+```python
+    from happytransformer import HappyQuestionAnswering
+    # --------------------------------------#
+    happy_qa_distilbert = HappyQuestionAnswering()  # default
+    happy_qa_albert = HappyQuestionAnswering("ALBERT", "mfeb/albert-xxlarge-v2-squad2")
+    # a good model when working with limited hardware
+    happy_qa_bert = HappyQuestionAnswering("BERT", "mrm8488/bert-tiny-5-finetuned-squadv2")
 
+```
 
-##### test_csv requirements:
- 1. The csv must contain *NO* header.
- 2. Each row contains a single test case.
- 3. The csv contains a single column with the text for each test case.
+### answer_question()
 
-#### Example 2:
-| |
-|-------------------------------------------|
-| 5 stars!!! |
-| Cheap food at an expensive price |
-| Great location and nice view of the ocean |
-| two thumbs down |
 
-**Returns** a list of integer values in ascending order by test case row index.
-For example, for the csv file shown in Example 2, the result would be [1, 0, 1, 0].
-Where the first index in the list corresponds to "5 stars!!!"
-and the last index corresponds to "two thumbs down."
+Inputs:
+1. context (string): background information, which contains a text-span that is the answer
+2. question (string): the question that will be asked
+3. top_k (int): the number of results that will be returned (default=1)
 
+Returns:
+A list of named tuples with the fields: "answer", "score", "start" and "end."
+The list is in descending order by score.
 
-#### Example 3:
-```sh
-from happytransformer import HappyROBERTA
-#------------------------------------------#
-happy_roberta = HappyROBERTA()
-happy_roberta.init_sequence_classifier()
-
-train_csv_path = "data/train.csv"
-happy_roberta.train_sequence_classifier(train_csv_path)
-
-eval_csv_path = "data/eval.csv"
-eval_results = happy_roberta.eval_sequence_classifier(eval_csv_path)
-print(type(eval_results)) # prints:
-print(eval_results) # prints: {'true_positive': 300', 'true_negative': 250, 'false_positive': 40, 'false_negative': 55}
-
-test_csv_path = "data/test.csv"
-test_results = happy_roberta.test_sequence_classifier(test_csv_path)
-print(type(test_results)) # prints:
-print(test_results) # prints: [1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0 ]
+#### Example 3.1:
+```python
+    from happytransformer import HappyQuestionAnswering
+    # --------------------------------------#
+    happy_qa = HappyQuestionAnswering()
+    result = happy_qa.answer_question("Today's date is January 10th, 2021", "What is the date?")
+    print(type(result))  #
+    print(result)  # [QuestionAnsweringResult(answer='January 10th, 2021', score=0.9711642265319824, start=16, end=34)]
+    print(type(result[0]))  #
+    print(result[0])  # QuestionAnsweringResult(answer='January 10th, 2021', score=0.9711642265319824, start=16, end=34)
+    print(result[0].answer)  # January 10th, 2021
 
 ```
 
+#### Example 3.2:
+```python
+    from happytransformer import HappyQuestionAnswering
+    # --------------------------------------#
+    happy_qa = HappyQuestionAnswering()
+    result = happy_qa.answer_question("Today's date is January 10th, 2021", "What is the date?", top_k=2)
+    print(type(result))  #
+    print(result)  # [QuestionAnsweringResult(answer='January 10th, 2021', score=0.9711642265319824, start=16, end=34), QuestionAnsweringResult(answer='January 10th', score=0.017306014895439148, start=16, end=28)]
+    print(result[1].answer)  # January 10th
 
+```
 
-### custom_init_sequence_classifier(args)
-
-Initializing the sequence classifier with custom settings.
-Called instead of init_sequence_classifier().
-argument:
- 1. args: a python dictionary that contains all of the same fields as the default arguments
-
-### default classifier arguments
- ```
-# found under "from happytransformer.classifier_args"
-classifier_args = {
-    # Basic fine tuning parameters
-    'learning_rate': 1e-5,
-    'num_epochs': 2,
-    'batch_size': 8,
-
-    # More advanced fine tuning parameters
-    'max_seq_length': 128,  # Max number of tokens per input. Max value = 512
-### custom_init_sequence_classifier(args)
-
-Initializing the sequence classifier with custom settings.
-Called instead of init_sequence_classifier().
-argument:
- 1. args: a python dictionary that contains all of the same fields as the default arguments

-### default classifier arguments
- ```
-# found under "from happytransformer.classifier_args"
-classifier_args = {
- # Basic fine tuning parameters
- 'learning_rate': 1e-5,
- 'num_epochs': 2,
- 'batch_size': 8,
-
- # More advanced fine tuning parameters
- 'max_seq_length': 128, # Max number of tokens per input. Max value = 512
- 'adam_epsilon': 1e-5,
- 'gradient_accumulation_steps': 1,
- 'weight_decay': 0,
- 'warmup_ratio': 0.06,
- 'warmup_steps': 0,
- 'max_grad_norm': 1.0,
-
- # More modes will become available in future releases
- 'task_mode': 'binary',
- }
- ```
-#### Example 4:
- ```sh
-from happytransformer import HappyROBERTA
-from happytransformer import classifier_args
-#------------------------------------------#
-happy_xlnet = HappyXLNET()
-
-custom_args = classifier_args.copy()
-custom_args["learning_rate"] = 2e-5
-custom_args['num_epochs'] = 4
-custom_args["batch_size"] = 3
-
-happy_xlnet.custom_init_sequence_classifier(custom_args)
-# Continue from example 1 after "happy_roberta.init_sequence_classifier()""
-```

+## Training Question Answering

-## Next Sentence Prediction
+HappyQuestionAnswering contains three methods for training
+- train(): fine-tune a question answering model to become better at a certain task
+- eval(): determine how well the model performs on a labeled dataset
+- test(): run the model on an unlabeled dataset to produce predictions

-*Determine the likelihood that sentence B follows sentence A.*
+### train()
+inputs:
+1. input_filepath (string): a path file to a csv file as described in table 3.1
+2. args (dictionary): a dictionary with the same keys and value types as shown below.
+The dictionary below shows the default values.

-**HappyBERT** has a method called "predict_next_sentence" which is used for next sentence prediction tasks.
-The method takes the following arguments:
+Information about what the keys mean can be accessed [here](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments)
+ARGS_QA_TRAIN= {
+ 'learning_rate': 5e-5,
+ 'weight_decay': 0,
+ 'adam_beta1': 0.9,
+ 'adam_beta2': 0.999,
+ 'adam_epsilon': 1e-8,
+ 'max_grad_norm': 1.0,
+ 'num_train_epochs': 3.0,

- 1. sentence_a: A **single** sentence in a body of text
- 2. sentence_b: A **single** sentence that may or may not follow sentence sentence_a
+}

-This likelihood that sentence_b follows sentenced_a is returned as a boolean value that is either True or False indicating if it is true that sentence B follows sentence A.
+Output: None
+
+#### Table 3.1

-###### Example 1:
-```sh
-from happytransformer import HappyBERT
-#--------------------------------------#
-happy_bert = HappyBERT()
-sentence_a = "How old are you?"
-sentence_b = "I am 93 years old."
-sentence_c = "The Eiffel Tower is in Paris."
-result = happy_bert.predict_next_sentence(sentence_a, sentence_b)
-print(type(result)) # prints: <class 'bool'>
-print(result) # prints: True
-result = happy_bert.predict_next_sentence(sentence_a, sentence_c)
-print(type(result)) # prints: <class 'bool'>
-print(result) # prints: False
-```
-###### Example 2: (New feature)
-You can now set the use_probability parameter to True to make the next_sentence function output
-a probability instead of a boolean answer.
-```sh
-from happytransformer import HappyBERT
-#--------------------------------------#
-happy_bert = HappyBERT()
-sentence_a = "How old are you?"
-sentence_b = "I am 93 years old."
-result = happy_bert.predict_next_sentence(sentence_a, sentence_b, use_probability=True)
-print(type(result)) # prints: <class 'float'>
-print(result) # prints: 0.999990701675415
-```

+1. context (string): background information for answering the question
+2. question (string): the question that will be asked
+3. answer_text(string): the answer in string format
+4. answer_start(int): the char index of the start of the answer
+
+| context | question | answer_text | answer_start |
+|---------------------------|-------------------|---------------|--------------|
+| October 31st is the date | what is the date? | October 31st | 0 |
+| The date is November 23rd | what is the date? | November 23rd | 12 |
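When building this csv programmatically, answer_start can be computed from the other two columns, assuming answer_text appears verbatim in the context. A minimal sketch with Python's csv module (the output filename is just an example):

```python
import csv

rows = [
    ("October 31st is the date", "what is the date?", "October 31st"),
    ("The date is November 23rd", "what is the date?", "November 23rd"),
]

with open("train-eval.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["context", "question", "answer_text", "answer_start"])
    for context, question, answer_text in rows:
        # str.find returns the char index of the first occurrence,
        # which is what the answer_start column expects
        answer_start = context.find(answer_text)
        writer.writerow([context, question, answer_text, answer_start])
```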
+#### Example 3.3:
+```python
+ from happytransformer import HappyQuestionAnswering
+ # --------------------------------------#
+ happy_qa = HappyQuestionAnswering()
+ happy_qa.train("../../data/qa/train-eval.csv")
+```
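To fine-tune with non-default hyperparameters, a dictionary with the keys listed above can be passed as train()'s second input. A minimal sketch, assuming the dictionary is supplied as the second argument exactly as the input list describes:

```python
from happytransformer import HappyQuestionAnswering

happy_qa = HappyQuestionAnswering()

# Start from the default values shown above and override a couple of keys
custom_args = {
    'learning_rate': 3e-5,   # lowered from 5e-5
    'weight_decay': 0,
    'adam_beta1': 0.9,
    'adam_beta2': 0.999,
    'adam_epsilon': 1e-8,
    'max_grad_norm': 1.0,
    'num_train_epochs': 1.0,  # lowered from 3.0
}

happy_qa.train("../../data/qa/train-eval.csv", custom_args)
```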
-## Question Answering
+### eval()
+Input:
+1. input_filepath (string): a path file to a csv file as described in table 3.1

-*Determine the answer to a given question using a body of supplied text.*
+Output:

-##### Single Answer
-**HappyBERT** has a method called "answer_question" which is used for question answering tasks.
-The method takes the following arguments:
+A named tuple with the key "eval_loss"

- 1. question: The question to be answered
- 2. text: The text containing the answer to the question
-
-The output from the method is the answer to the question, returned as a string.
+#### Example 3.4:
+```python
+ from happytransformer import HappyQuestionAnswering
+ # --------------------------------------#
+ happy_qa = HappyQuestionAnswering()
+ result = happy_qa.eval("../../data/qa/train-eval.csv")
+ print(type(result)) #
+ print(result) # EvalResult(eval_loss=0.11738169193267822)
+ print(result.eval_loss) # 0.1173816919326782
+```

-###### Example 1:
-```sh
-from happytransformer import HappyBERT
-#--------------------------------------#
-happy_bert = HappyBERT()
-question = "Who does Ernie live with?"
-text = "Ernie is an orange Muppet character on the long running PBS and HBO children's television show Sesame Street. He and his roommate Bert form the comic duo Bert and Ernie, one of the program's centerpieces, with Ernie acting the role of the naïve troublemaker and Bert the world weary foil." # Source: https://en.wikipedia.org/wiki/Ernie_(Sesame_Street)
-result = happy_bert.answer_question(question, text)
-print(type(result)) # prints: <class 'str'>
-print(result) # prints: bert
-```
-##### Multiple Answers
-**HappyBERT** has a method called "answers_to_question" which is used to generate multiple answers for a single question

- 1. question: The question to be answered
- 2. text: The text containing the answer to the question
- 3. k: The number of answers that will be returned
-The output is a list of dictionaries.
-Each dictionary contains two keys: text and softmax.
-The text key contains the answer in the form of a string.
-The softmax key contains the "probability" of the answer as a float between 0 and 1.

+### test()
+Input:
+1. input_filepath (string): a path file to a csv file as described in table 3.2

+Output: A list of named tuples with keys: "answer", "score", "start" and "end"

-###### Example 1:
-```sh
-from happytransformer import HappyBERT
-#--------------------------------------#
-happy_bert = HappyBERT()
-question = "Who does Ernie live with?"
-text = "Ernie is an orange Muppet character on the long running PBS and HBO children's television show Sesame Street. He and his roommate Bert form the comic duo Bert and Ernie, one of the program's centerpieces, with Ernie acting the role of the naïve troublemaker and Bert the world weary foil." # Source: https://en.wikipedia.org/wiki/Ernie_(Sesame_Street)
-result = happy_bert.answers_to_question(question, text, k=3)
-print(type(result)) # prints: <class 'list'>
-print(result) # prints: [{'text': 'bert', 'softmax': 0.9916905164718628}, {'text': 'roommate bert', 'softmax': 0.004403269849717617}, {'text': 'his roommate bert', 'softmax': 0.0039062034338712692}]
+The list is in order by ascending csv index.
+#### Table 3.2

-best_answer = result[0]
-second_best_answer = result[1]
+1. context (string): background information for answering the question
+2. question (string): the question that will be asked

-print(type(best_answer)) # prints: <class 'dict'>
+| context | question |
+|---------------------------|-------------------|
+| October 31st is the date | what is the date? |
+| The date is November 23rd | what is the date? |

-print(best_answer) # prints: {'text': 'bert', 'softmax': 0.9916905164718628}
+#### Example 3.5:
+```python
+ from happytransformer import HappyQuestionAnswering
+ # --------------------------------------#
+ happy_qa = HappyQuestionAnswering()
+ result = happy_qa.test("../../data/qa/test.csv")
+ print(type(result))
+ print(result) # [QuestionAnsweringResult(answer='October 31st', score=0.9939756989479065, start=0, end=12), QuestionAnsweringResult(answer='November 23rd', score=0.967872679233551, start=12, end=25)]
+ print(result[0]) # QuestionAnsweringResult(answer='October 31st', score=0.9939756989479065, start=0, end=12)
+ print(result[0].answer) # October 31st

-print(best_answer["text"]) # prints: bert
-print(best_answer["softmax"]) # prints: 0.9916905164718628
+```
+
+#### Example 3.6:
+```python
+ from happytransformer import HappyQuestionAnswering
+ # --------------------------------------#
+ happy_qa = HappyQuestionAnswering()
+ before_loss = happy_qa.eval("../../data/qa/train-eval.csv").eval_loss
+ happy_qa.train("../../data/qa/train-eval.csv")
+ after_loss = happy_qa.eval("../../data/qa/train-eval.csv").eval_loss
+ print("Before loss: ", before_loss) # 0.11738169193267822
+ print("After loss: ", after_loss) # 0.00037909045931883156
+ # Since after_loss < before_loss, the model learned!
+ # Note: typically you evaluate with a separate dataset
+ # but for simplicity we used the same one
+```

+## Next Sentence Prediction

 ## Tech

@@ -413,18 +347,14 @@ print(best_answer["softmax"]) # prints: 0.9916905164718628

 Happy Transformer uses a number of open source projects:

 * [transformers](https://github.com/huggingface/transformers/stargazers) - State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch!
-* [pytorch](https://github.com/pytorch/pytorch) - Tensors and Dynamic neural networks in Python
-* [scikit-learn](https://github.com/scikit-learn/scikit-learn) - A set of python modules for machine learning and data mining
-* [numpy](https://github.com/numpy/numpy) - Array computation
-* [pandas](https://github.com/pandas-dev/pandas) - Powerful data structures for data analysis, time series, and statistics
+* [pytorch](https://github.com/pytorch/pytorch) - Tensors and Dynamic neural networks in Python
 * [tqdm](https://github.com/tqdm/tqdm) - A Fast, Extensible Progress Bar for Python and CLI
-* [pytorch-transformers-classification](https://github.com/ThilinaRajapakse/pytorch-transformers-classification) - Text classification for BERT, RoBERTa, XLNet and XLM

 HappyTransformer is also an open source project with this [public repository](https://github.com/EricFillion/happy-transformer)
 on GitHub.

 ### Call for contributors
- Happy Transformer is a new and growing API. We're seeking more contributors to help accomplish our mission of making state-of-the-art AI easier to use.
+ Happy Transformer is a growing API. We're seeking more contributors to help accomplish our mission of making state-of-the-art AI easier to use.

 ### Maintainers
- [Eric Fillion](https://github.com/ericfillion)  Lead Maintainer

From 401e3842917d1987d28b36e5a8881597444d4cd9 Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Sun, 10 Jan 2021 15:32:16 -0500
Subject: [PATCH 056/155] QA: Added readme examples

---
 .../question_answering/readme_examples.py     | 62 +++++++++++++++++++
 1 file changed, 62 insertions(+)
 create mode 100644 examples/question_answering/readme_examples.py

diff --git a/examples/question_answering/readme_examples.py b/examples/question_answering/readme_examples.py
new file mode 100644
index 00000000..3fe22a33
--- /dev/null
+++ b/examples/question_answering/readme_examples.py
@@ -0,0 +1,62 @@
+from happytransformer import HappyQuestionAnswering
+
+
+def example_3_0():
+
+    happy_qa_distilbert = HappyQuestionAnswering()  # default
+    happy_qa_albert = HappyQuestionAnswering("ALBERT", "mfeb/albert-xxlarge-v2-squad2")
+    # good model when using with limited hardware
+    happy_qa_bert = HappyQuestionAnswering("BERT", "mrm8488/bert-tiny-5-finetuned-squadv2")
+
+
+def example_3_1():
+    happy_qa = HappyQuestionAnswering()
+    result = happy_qa.answer_question("Today's date is January 10th, 2021", "What is the date?")
+    print(type(result))  # <class 'list'>
+    print(result)  # [QuestionAnsweringResult(answer='January 10th, 2021', score=0.9711642265319824, start=16, end=34)]
+    print(type(result[0]))  # <class 'happytransformer.happy_question_answering.QuestionAnsweringResult'>
+    print(result[0])  # QuestionAnsweringResult(answer='January 10th, 2021', score=0.9711642265319824, start=16, end=34)
+    print(result[0].answer)  # January 10th, 2021
+
+
+def example_3_2():
+    happy_qa = HappyQuestionAnswering()
+    result = happy_qa.answer_question("Today's date is January 10th, 2021", "What is the date?", top_k=2)
+    print(type(result))  # <class 'list'>
+    print(result)  # [QuestionAnsweringResult(answer='January 10th, 2021', score=0.9711642265319824, start=16, end=34), QuestionAnsweringResult(answer='January 10th', score=0.017306014895439148, start=16, end=28)]
+    print(result[1].answer)  # January 10th
+
+def example_3_3():
+    happy_qa = HappyQuestionAnswering()
+    happy_qa.train("../../data/qa/train-eval.csv")
+
+def example_3_4():
+    happy_qa = HappyQuestionAnswering()
+    result = happy_qa.eval("../../data/qa/train-eval.csv")
+    print(type(result))  #
+    print(result)  # EvalResult(eval_loss=0.11738169193267822)
+    print(result.eval_loss)  # 0.1173816919326782
+
+def example_3_5():
+    happy_qa = HappyQuestionAnswering()
+    result = happy_qa.test("../../data/qa/test.csv")
+    print(type(result))
+    print(result)  # [QuestionAnsweringResult(answer='October 31st', score=0.9939756989479065, start=0, end=12), QuestionAnsweringResult(answer='November 23rd', score=0.967872679233551, start=12, end=25)]
+    print(result[0])  # QuestionAnsweringResult(answer='October 31st', score=0.9939756989479065, start=0, end=12)
+    print(result[0].answer)  # October 31st
+
+def example_3_6():
+    happy_qa = HappyQuestionAnswering()
+    before_loss = happy_qa.eval("../../data/qa/train-eval.csv").eval_loss
+    happy_qa.train("../../data/qa/train-eval.csv")
+    after_loss = happy_qa.eval("../../data/qa/train-eval.csv").eval_loss
+    print("Before loss: ", before_loss)  # 0.11738169193267822
+    print("After loss: ", after_loss)  # 0.00037909045931883156
+    # Since after_loss < before_loss, the model learned!
+    # Note: typically you evaluate with a separate dataset
+    # but for simplicity the same one was used
+
+def main():
+    example_3_1()
+
+
+if __name__ == "__main__":
+    main()

From a505701daec10281e872e95794301ffb39f4c3c2 Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Sun, 10 Jan 2021 15:34:40 -0500
Subject: [PATCH 057/155] Moved description

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ebbfd05a..7318cdb4 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,8 @@

 # Happy Transformer

+Happy Transformer is a package built on top of [Hugging Face's transformer library](https://huggingface.co/transformers/) that makes it easy to utilize state-of-the-art NLP models.
+
 ## Table of Contents
 * [News](#News)
 * [Features](#Features)
@@ -30,7 +32,6 @@ Last month, Happy Transformer was presented at a conference called C-Search, and
 We're happy to announce that we won a Best Paper Award at the Canadian Undergraduate Conference for AI. We also received the highest score overall. The paper can be found [here](https://qmind.ca/wp-content/uploads/2020/05/Proceedings-of-CUCAI-2020.pdf) on page 67.

-Happy Transformer is a package built on top of [Hugging Face's transformer library](https://huggingface.co/transformers/) that makes it easy to utilize state-of-the-art NLP models.

 ## Features

From 13461c6eb2cb62e41d1caa3abf4c31e8be68e89f Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Sun, 10 Jan 2021 15:36:50 -0500
Subject: [PATCH 058/155] Updated TOC and removed old section

---
 README.md | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/README.md b/README.md
index 7318cdb4..451dde1a 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,7 @@ Happy Transformer is a package built on top of [Hugging Face's transformer libr
 * [Word Prediction](#Word-Prediction)
 * [Text Classification](#Text-Classification)
 * [Question Answering](#Question-Answering)
+* [Question Answering Training](#Question-Answering-Training)
 * [Next Sentence Prediction](#Next-Sentence-Prediction)
 * [Tech](#Tech)
 * [Call For Contributors](#Call-For-Contributors)
@@ -142,21 +143,6 @@ print(result[1].token_str) # technology

 ## Text Classification

-
-## Question Answering
-
-
-##### Single Answer
-**HappyBERT** has a method called "answer_question" which is used for question answering tasks.
-The method takes the following arguments:
-
- 1. question: The question to be answered
- 2. text: The text containing the answer to the question
-
-The output from the method is the answer to the question, returned as a string.
-
-
-
 ## Question Answering

 Initialize a HappyQuestionAnswering() object to perform question answering.
@@ -224,7 +210,7 @@ The list is in descending order by score


 ```
-## Training Question Answering
+## Question Answering Training

 HappyQuestionAnswering contains three methods for training
 - train(): fine-tune a question answering model to become better at a certain task
 - eval(): determine how well the model performs on a labeled dataset
 - test(): run the model on an unlabeled dataset to produce predictions

From fca8c20ee3a9b093a499d6d8104ccb0ad1c0285b Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Sun, 10 Jan 2021 20:46:52 -0500
Subject: [PATCH 059/155] Added multi label TC

---
 happytransformer/happy_text_classification.py | 15 +--
 tests/test_tc.py                              | 92 ++++++++++++++++---
 2 files changed, 89 insertions(+), 18 deletions(-)

diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py
index 50d88bb7..7b806e3b 100644
--- a/happytransformer/happy_text_classification.py
+++ b/happytransformer/happy_text_classification.py
@@ -11,9 +11,8 @@
     DistilBertTokenizerFast,
     AlbertForSequenceClassification,
     AlbertTokenizerFast,
-
-
-    TextClassificationPipeline
+    AutoConfig,
+    TextClassificationPipeline
 )
 from happytransformer.tc.trainer import TCTrainer

@@ -29,18 +28,20 @@ class HappyTextClassification(HappyTransformer):
     """

     def __init__(self, model_type="DISTILBERT",
-                 model_name="distilbert-base-uncased-finetuned-sst-2-english"):
+                 model_name="distilbert-base-uncased", num_labels=2):
         model = None
         tokenizer = None
+        config = AutoConfig.from_pretrained(model_name, num_labels=num_labels)
+        # config = DistilBertConfig(num_labels=num_labels)

         if model_type == "ALBERT":
-            model = AlbertForSequenceClassification.from_pretrained(model_name)
+            model = AlbertForSequenceClassification.from_pretrained(model_name, config=config)
             tokenizer = AlbertTokenizerFast.from_pretrained(model_name)
         elif model_type == "BERT":
-            model = BertForSequenceClassification.from_pretrained(model_name)
+            model = BertForSequenceClassification.from_pretrained(model_name, config=config)
             tokenizer = BertTokenizerFast.from_pretrained(model_name)
         elif model_type == "DISTILBERT":
-            model = DistilBertForSequenceClassification.from_pretrained(model_name)
+            model = DistilBertForSequenceClassification.from_pretrained(model_name, config=config)
             tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
         else:
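The change above relies on a standard transformers pattern: AutoConfig loads the checkpoint's configuration with num_labels overridden, which sizes the classification head. A minimal sketch of that pattern on its own, outside Happy Transformer:

```python
from transformers import AutoConfig, DistilBertForSequenceClassification

# Override num_labels in the loaded config so the classification
# head is built with three output classes instead of the default two
config = AutoConfig.from_pretrained("distilbert-base-uncased", num_labels=3)
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", config=config
)
print(model.config.num_labels)  # 3
```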
diff --git a/tests/test_tc.py b/tests/test_tc.py
index 1a780ae5..08454741 100644
--- a/tests/test_tc.py
+++ b/tests/test_tc.py
@@ -12,7 +12,7 @@ def test_classify_text():
     """
     happy_tc = HappyTextClassification()
     result = happy_tc.classify_text("What a great movie")
-    answer = TextClassificationResult(label='POSITIVE', score=0.9998726844787598)
+    answer = TextClassificationResult(label='LABEL_1', score=0.9998726844787598)
     assert result == answer

@@ -22,7 +22,8 @@ def test_qa_train():
     """
     HappyTextClassification.train()
     """

-    happy_tc = HappyTextClassification()
+    happy_tc = HappyTextClassification(model_type="DISTILBERT",
+                                       model_name="distilbert-base-uncased-finetuned-sst-2-english")
     happy_tc.train("../data/tc/train-eval.csv")

@@ -32,7 +33,8 @@ def test_qa_eval():
     """
     Tests HappyTextClassification.eval()
     """

-    happy_tc = HappyTextClassification()
+    happy_tc = HappyTextClassification(model_type="DISTILBERT",
+                                       model_name="distilbert-base-uncased-finetuned-sst-2-english")
     results = happy_tc.eval("../data/tc/train-eval.csv")
     assert results.eval_loss == 0.007262040860950947

@@ -42,13 +44,14 @@ def test_qa_test():
     """
     Tests HappyTextClassification.test()
     """

-    happy_tc = HappyTextClassification()
+    happy_tc = HappyTextClassification(model_type="DISTILBERT",
+                                       model_name="distilbert-base-uncased-finetuned-sst-2-english")
     result = happy_tc.test("../data/tc/test.csv")

-    answer = [TextClassificationResult(label='POSITIVE', score=0.9998401999473572),
-              TextClassificationResult(label='NEGATIVE', score=0.9772131443023682),
-              TextClassificationResult(label='NEGATIVE', score=0.9966067671775818),
-              TextClassificationResult(label='POSITIVE', score=0.9792295098304749)]
+    answer = [TextClassificationResult(label='LABEL_1', score=0.9998401999473572),
+              TextClassificationResult(label='LABEL_0', score=0.9772131443023682),
+              TextClassificationResult(label='LABEL_0', score=0.9966067671775818),
+              TextClassificationResult(label='LABEL_1', score=0.9792295098304749)]
     assert result == answer

@@ -59,12 +62,79 @@ def test_qa_train_effectiveness():
     lowering the loss as determined by HappyTextClassification.eval()
     """

-    happy_tc = HappyTextClassification()
-    before_loss = happy_tc.eval("../data/tc/train-eval.csv")["eval_loss"]
+    happy_tc = HappyTextClassification(model_type="DISTILBERT",
+                                       model_name="distilbert-base-uncased-finetuned-sst-2-english")
+    before_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss
     happy_tc.train("../data/tc/train-eval.csv")
-    after_loss = happy_tc.eval("../data/tc/train-eval.csv")["eval_loss"]
+    after_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss
     assert after_loss < before_loss
+
+
+def test_qa_train_effectiveness_multi():
+    """
+    Tests
+    Ensures that HappyTextClassification.train() results in
+    lowering the loss as determined by HappyTextClassification.eval()
+    """
+
+    happy_tc = HappyTextClassification(model_type="DISTILBERT",
+                                       model_name="distilbert-base-uncased", num_labels=3)
+    before_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").eval_loss
+    happy_tc.train("../data/tc/train-eval-multi.csv")
+    after_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").eval_loss
+    assert after_loss < before_loss
+
+
+def test_qa_test_multi_distil_bert():
+    """
+    Tests
+    Ensures that HappyTextClassification.test() produces the expected
+    predictions after training on the multi label dataset
+    """
+
+    happy_tc = HappyTextClassification(model_type="DISTILBERT",
+                                       model_name="distilbert-base-uncased", num_labels=3)
+    happy_tc.train("../data/tc/train-eval-multi.csv")
+    result = happy_tc.test("../data/tc/test-multi.csv")
+    answer = [TextClassificationResult(label='LABEL_2', score=0.3558128774166107),
+              TextClassificationResult(label='LABEL_2', score=0.34425610303878784),
+              TextClassificationResult(label='LABEL_1', score=0.3998771607875824),
+              TextClassificationResult(label='LABEL_1', score=0.38578158617019653),
+              TextClassificationResult(label='LABEL_0', score=0.39120176434516907),
+              TextClassificationResult(label='LABEL_0', score=0.3762877583503723)]
+    assert result == answer
+
+
+def test_qa_effectiveness_multi_albert():
+    """
+    Tests
+    Ensures that HappyTextClassification.train() results in
+    lowering the loss as determined by HappyTextClassification.eval()
+    """
+
+    happy_tc = HappyTextClassification(model_type="ALBERT",
+                                       model_name="albert-base-v2", num_labels=3)
+    before_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").eval_loss
+    happy_tc.train("../data/tc/train-eval-multi.csv")
+    after_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").eval_loss
+    assert after_loss < before_loss
+
+def test_qa_effectiveness_multi_bert():
+    """
+    Tests
+    Ensures that HappyTextClassification.train() results in
+    lowering the loss as determined by HappyTextClassification.eval()
+    """
+
+    happy_tc = HappyTextClassification(model_type="BERT",
+                                       model_name="bert-base-uncased", num_labels=3)
+    before_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").eval_loss
+    happy_tc.train("../data/tc/train-eval-multi.csv")
+    after_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").eval_loss
+    assert after_loss < before_loss
+
+
 def test_qa_test_albert():
     """
     Tests

From 1328599e46544cf6bd0433070b92bc669e799c79 Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Sun, 10 Jan 2021 20:48:52 -0500
Subject: [PATCH 060/155] Added multi label data

---
 data/tc/test-multi.csv       | 7 +++++++
 data/tc/train-eval-multi.csv | 7 +++++++
 2 files changed, 14 insertions(+)
 create mode 100644 data/tc/test-multi.csv
 create mode 100644 data/tc/train-eval-multi.csv

diff --git a/data/tc/test-multi.csv b/data/tc/test-multi.csv
new file mode 100644
index 00000000..86ceb8c1
--- /dev/null
+++ b/data/tc/test-multi.csv
@@ -0,0 +1,7 @@
+text
+Wow what a great place to eat
+Soooo good
+Okay place
+Pretty average
+Horrible food
+Yuck
\ No newline at end of file
diff --git a/data/tc/train-eval-multi.csv b/data/tc/train-eval-multi.csv
new file mode 100644
index 00000000..8f5e8495
--- /dev/null
+++ b/data/tc/train-eval-multi.csv
@@ -0,0 +1,7 @@
+text,label
+Wow what a great place to eat,2
+Soooo good,2
+Okay place,1
+Pretty average,1
+Horrible food,0
+Yuck,0
\ No newline at end of file

From 9536dc668a012e2ad4a9e7ef6d59b82ffd34bf0 Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Sun, 10 Jan 2021 21:50:21 -0500
Subject: [PATCH 061/155] Updated test_qa_test to use proper model

---
 tests/test_tc.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_tc.py b/tests/test_tc.py
index 08454741..518d3526 100644
--- a/tests/test_tc.py
+++ b/tests/test_tc.py
@@ -10,7 +10,7 @@ def test_classify_text():
     """
     HappyTextClassification.classify_text()
     """

-    happy_tc = HappyTextClassification()
+    happy_tc = HappyTextClassification(model_type="DISTILBERT", model_name="distilbert-base-uncased-finetuned-sst-2-english")
     result = happy_tc.classify_text("What a great movie")
     answer = TextClassificationResult(label='LABEL_1', score=0.9998726844787598)
     assert result == answer

From 13461c6eb2cb62e41d1caa3abf4c31e8be68e89f Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Sun, 10 Jan 2021 22:20:43 -0500
Subject: [PATCH 062/155] Added TC to the README

---
 README.md                                     | 167 ++++++++++++++++++
 .../text_classification/readme_examples.py    |  66 +++++++
 2 files changed, 233 insertions(+)
 create mode 100644 examples/text_classification/readme_examples.py

diff --git a/README.md b/README.md
index 451dde1a..cad5725c 100644
--- a/README.md
+++ b/README.md
@@ -143,6 +143,173 @@ print(result[1].token_str) # technology

 ## Text Classification

+Initialize a HappyTextClassification object to perform text classification.
+
+This model assigns a label to a given text string. For example, you can train a model to
+detect if an email is spam based on its text.
+
+
+Initialization Arguments:
+ 1. model_type (string): either "ALBERT", "BERT" or "DISTILBERT." The default is "DISTILBERT"
+ 2. model_name(string): below is a URL that contains potential models. The default is "distilbert-base-uncased"
+ [MODELS](https://huggingface.co/models?filter=text-classification)
+ 3. num_labels(int): The number of text categories. The default is 2
+
+# WARNING: If you try to load a pretrained model that has a different number of categories
+# than num_labels, then you will get an error
+
+# "albert-base-v2", "bert-base-uncased" and "distilbert-base-uncased" do not have a predefined
+# number of labels, so if you use these models you can set num_labels freely
+
+### Initialization
+
+"HappyTextClassification("ALBERT", "textattack/albert-base-v2-SST-2")" has many useful applications.
+It's able to detect the sentiment of text.
+
+
+#### Example 2.0:
+```python
+ from happytransformer import HappyTextClassification
+ # --------------------------------------#
+ happy_tc_distilbert = HappyTextClassification() # default with "distilbert-base-uncased" and num_labels=2
+ happy_tc_albert = HappyTextClassification(model_type="ALBERT", model_name="albert-base-v2")
+ happy_tc_bert = HappyTextClassification("BERT", "bert-base-uncased")
+
+```
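Building on the multi label support added in the previous patches, a minimal three-class sketch using the train-eval-multi.csv data shown earlier (labels 0 = negative, 1 = neutral, 2 = positive; the printed label will vary run to run on such a tiny dataset):

```python
from happytransformer import HappyTextClassification

# num_labels=3 sizes the classification head for three categories
happy_tc = HappyTextClassification(model_type="DISTILBERT",
                                   model_name="distilbert-base-uncased",
                                   num_labels=3)
happy_tc.train("../../data/tc/train-eval-multi.csv")
result = happy_tc.classify_text("Pretty average")
print(result.label)  # e.g. LABEL_1
```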
+
+### classify_text()
+
+Input:
+1. text (string): Text that will be classified
+
+Returns:
+A label in the form of a string, typically "LABEL_x", where x is the label number.
+Some models use different labels
+
+
+#### Example 2.1:
+```python
+ from happytransformer import HappyTextClassification
+ # --------------------------------------#
+ happy_tc = HappyTextClassification(model_type="DISTILBERT", model_name="distilbert-base-uncased-finetuned-sst-2-english")
+ result = happy_tc.classify_text("Great movie! 5/5")
+ print(type(result)) # <class 'happytransformer.happy_text_classification.TextClassificationResult'>
+ print(result) # TextClassificationResult(label='LABEL_1', score=0.9998761415481567)
+ print(result.label) # LABEL_1
+
+```
+
+
+
+## Text Classification Training
+
+HappyTextClassification contains three methods for training
+- train(): fine-tune the model to become better at a certain task
+- eval(): determine how well the model performs on a labeled dataset
+- test(): run the model on an unlabeled dataset to produce predictions
+
+### train()
+
+inputs:
+1. input_filepath (string): a path file to a csv file as described in table 2.1
+2. args (dictionary): a dictionary with the same keys and value types as shown below.
+The dictionary below shows the default values.
+
+Information about what the keys mean can be accessed [here](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments)
+ARGS_TC_TRAIN= {
+ 'learning_rate': 5e-5,
+ 'weight_decay': 0,
+ 'adam_beta1': 0.9,
+ 'adam_beta2': 0.999,
+ 'adam_epsilon': 1e-8,
+ 'max_grad_norm': 1.0,
+ 'num_train_epochs': 3.0,
+
+}
+
+Output: None
+
+#### Table 2.1
+
+1. text (string): text to be classified
+2. label (int): the corresponding label
+
+| Text | label |
+|-------------------------------|-------|
+| Wow what a great place to eat | 1 |
+| Horrible food | 0 |
+| Terrible service | 0 |
+| I'm coming here again | 1 |
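If you need to generate this csv yourself, a minimal sketch with Python's csv module that reproduces table 2.1, including its header row (the filename is just an example):

```python
import csv

rows = [("Wow what a great place to eat", 1),
        ("Horrible food", 0),
        ("Terrible service", 0),
        ("I'm coming here again", 1)]

with open("train-eval.csv", "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["text", "label"])  # header row: text,label
    writer.writerows(rows)
```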
+#### Example 2.2:
+```python
+ from happytransformer import HappyTextClassification
+ # --------------------------------------#
+ happy_tc = HappyTextClassification(model_type="DISTILBERT",
+                                    model_name="distilbert-base-uncased-finetuned-sst-2-english",
+                                    num_labels=2)  # Don't forget to set num_labels!
+ happy_tc.train("../../data/tc/train-eval.csv")
+
+```
+
+### eval()
+Input:
+1. input_filepath (string): a path file to a csv file as described in table 2.1
+
+Output:
+
+A named tuple with a key called "eval_loss"
+
+#### Example 2.3:
+```python
+ from happytransformer import HappyTextClassification
+ # --------------------------------------#
+ happy_tc = HappyTextClassification(model_type="DISTILBERT",
+                                    model_name="distilbert-base-uncased-finetuned-sst-2-english",
+                                    num_labels=2)  # Don't forget to set num_labels!
+ result = happy_tc.eval("../../data/tc/train-eval.csv")
+ print(type(result)) #
+ print(result) # EvalResult(eval_loss=0.007262040860950947)
+ print(result.eval_loss) # 0.007262040860950947
+
+```
+
+### test()
+Input:
+1. input_filepath (string): a path file to a csv file as described in table 2.2
+
+Output: A list of named tuples with keys: "label" and "score"
+
+The list is in order by ascending csv index.
+
+#### Table 2.2
+
+1. text (string): text that will be classified
+
+| Text |
+|-------------------------------|
+| Wow what a great place to eat |
+| Horrible food |
+| Terrible service |
+| I'm coming here again |
+
+#### Example 2.4:
+```python
+ from happytransformer import HappyTextClassification
+ # --------------------------------------#
+ happy_tc = HappyTextClassification(model_type="DISTILBERT",
+                                    model_name="distilbert-base-uncased-finetuned-sst-2-english",
+                                    num_labels=2)  # Don't forget to set num_labels!
+ result = happy_tc.test("../../data/tc/test.csv")
+ print(type(result)) # <class 'list'>
+ print(result) # [TextClassificationResult(label='LABEL_1', score=0.9998401999473572), TextClassificationResult(label='LABEL_0', score=0.9772131443023682)...
+ print(type(result[0])) # <class 'happytransformer.happy_text_classification.TextClassificationResult'>
+ print(result[0]) # TextClassificationResult(label='LABEL_1', score=0.9998401999473572)
+ print(result[0].label) # LABEL_1
+
+```

 ## Question Answering

 Initialize a HappyQuestionAnswering() object to perform question answering.

diff --git a/examples/text_classification/readme_examples.py b/examples/text_classification/readme_examples.py
new file mode 100644
index 00000000..610014c3
--- /dev/null
+++ b/examples/text_classification/readme_examples.py
@@ -0,0 +1,66 @@
+
+from happytransformer import HappyTextClassification
+
+def example_2_0():
+    happy_tc_distilbert = HappyTextClassification()  # default with "distilbert-base-uncased"
+    happy_tc_albert = HappyTextClassification(model_type="ALBERT", model_name="albert-base-v2")
+    happy_tc_bert = HappyTextClassification("BERT", "bert-base-uncased")
+
+
+def example_2_1():
+    happy_tc = HappyTextClassification(model_type="DISTILBERT", model_name="distilbert-base-uncased-finetuned-sst-2-english")
+    result = happy_tc.classify_text("Great movie! 5/5")
+    print(type(result))  # <class 'happytransformer.happy_text_classification.TextClassificationResult'>
+    print(result)  # TextClassificationResult(label='LABEL_1', score=0.9998761415481567)
+    print(result.label)  # LABEL_1
+
+
+def example_2_2():
+    happy_tc = HappyTextClassification(model_type="DISTILBERT",
+                                       model_name="distilbert-base-uncased-finetuned-sst-2-english",
+                                       num_labels=2)  # Don't forget to set num_labels!
+    happy_tc.train("../../data/tc/train-eval.csv")
+
+
+def example_2_3():
+    happy_tc = HappyTextClassification(model_type="DISTILBERT",
+                                       model_name="distilbert-base-uncased-finetuned-sst-2-english",
+                                       num_labels=2)  # Don't forget to set num_labels!
+    result = happy_tc.eval("../../data/tc/train-eval.csv")
+    print(type(result))  #
+    print(result)  # EvalResult(eval_loss=0.007262040860950947)
+    print(result.eval_loss)  # 0.007262040860950947
+
+
+def example_2_4():
+    happy_tc = HappyTextClassification(model_type="DISTILBERT",
+                                       model_name="distilbert-base-uncased-finetuned-sst-2-english",
+                                       num_labels=2)  # Don't forget to set num_labels!
+    result = happy_tc.test("../../data/tc/test.csv")
+    print(type(result))  # <class 'list'>
+    print(result)  # [TextClassificationResult(label='LABEL_1', score=0.9998401999473572), TextClassificationResult(label='LABEL_0', score=0.9772131443023682)...
+    print(type(result[0]))  # <class 'happytransformer.happy_text_classification.TextClassificationResult'>
+    print(result[0])  # TextClassificationResult(label='LABEL_1', score=0.9998401999473572)
+    print(result[0].label)  # LABEL_1
+
+
+def example_2_5():
+    happy_tc = HappyTextClassification(model_type="DISTILBERT",
+                                       model_name="distilbert-base-uncased-finetuned-sst-2-english",
+                                       num_labels=2)  # Don't forget to set num_labels!
+    before_loss = happy_tc.eval("../../data/tc/train-eval.csv").eval_loss
+    happy_tc.train("../../data/tc/train-eval.csv")
+    after_loss = happy_tc.eval("../../data/tc/train-eval.csv").eval_loss
+    print("Before loss: ", before_loss)  # 0.007262040860950947
+    print("After loss: ", after_loss)  # 0.000162081079906784
+    # Since after_loss < before_loss, the model learned!
+    # Note: typically you evaluate with a separate dataset
+    # but for simplicity we used the same one
+
+
+def main():
+    example_2_0()
+
+
+if __name__ == "__main__":
+    main()

From fec8cb94d00378e1585e97e34b21e7915ee4e Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Sun, 10 Jan 2021 22:22:31 -0500
Subject: [PATCH 063/155] Added example 2.5

---
 README.md                                     | 20 +++++++++++++++++++
 .../text_classification/readme_examples.py    |  2 +-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index cad5725c..44fe5381 100644
--- a/README.md
+++ b/README.md
@@ -309,6 +309,26 @@ The list is in order by ascending csv index.
     print(result[0].label) # LABEL_1


+```
+
+
+#### Example 2.5:
+```python
+ from happytransformer import HappyTextClassification
+ # --------------------------------------#
+ happy_tc = HappyTextClassification(model_type="DISTILBERT",
+                                    model_name="distilbert-base-uncased-finetuned-sst-2-english",
+                                    num_labels=2)  # Don't forget to set num_labels!
+ before_loss = happy_tc.eval("../../data/tc/train-eval.csv").eval_loss
+ happy_tc.train("../../data/tc/train-eval.csv")
+ after_loss = happy_tc.eval("../../data/tc/train-eval.csv").eval_loss
+ print("Before loss: ", before_loss) # 0.007262040860950947
+ print("After loss: ", after_loss) # 0.000162081079906784
+ # Since after_loss < before_loss, the model learned!
+ # Note: typically you evaluate with a separate dataset + # but for simplicity we used the same one + + ``` ## Question Answering diff --git a/examples/text_classification/readme_examples.py b/examples/text_classification/readme_examples.py index 610014c3..3a220ee2 100644 --- a/examples/text_classification/readme_examples.py +++ b/examples/text_classification/readme_examples.py @@ -59,7 +59,7 @@ def example_2_5(): def main(): - example_2_0() + example_2_5() if __name__ == "__main__": From d49d6051136d271d986827a68b7e7aa610a0f273 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sun, 10 Jan 2021 22:29:05 -0500 Subject: [PATCH 064/155] deleted old comment --- happytransformer/happy_text_classification.py | 1 - 1 file changed, 1 deletion(-) diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index 7b806e3b..292f09e7 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -32,7 +32,6 @@ def __init__(self, model_type="DISTILBERT", model = None tokenizer = None config = AutoConfig.from_pretrained(model_name, num_labels=num_labels) - # config = DistilBertConfig(num_labels=num_labels) if model_type == "ALBERT": model = AlbertForSequenceClassification.from_pretrained(model_name, config=config) From e9416f871d0b3d4f60464e322a876333021d47cf Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sun, 10 Jan 2021 22:32:00 -0500 Subject: [PATCH 065/155] removed comment about output label not following LABEL_x format --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 44fe5381..ea4e1053 100644 --- a/README.md +++ b/README.md @@ -185,8 +185,6 @@ Input: Returns: A label in the form of a string, typically "LABEL_x", where x is the label number. -Some models use different labels - #### Example 2.1: ```python From b79b4d740251a1474cf87d3222c7a3e7024e64df Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Sun, 10 Jan 2021 22:42:04 -0500 Subject: [PATCH 066/155] Fixed text that was set as being headers --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index ea4e1053..e23c0f18 100644 --- a/README.md +++ b/README.md @@ -155,11 +155,11 @@ Initialization Arguments: [MODELS](https://huggingface.co/models?filter=text-classification) 3. num_labels(int): The number of text categories. 
The default is 2 -# WARNING: If you try to load a pretrained model that has a different number of categories -# than num_labels, then you will get an error +WARNING: If you try to load a pretrained model that has a different number of categories +than num_labels, then you will get an error -# "albert-base-v2", "bert-base-uncased" and "distilbert-base-uncased" do not have a predefined -# number of labels, so if you use these models you can set num_labels freely +"albert-base-v2", "bert-base-uncased" and "distilbert-base-uncased" do not have a predefined +number of labels, so if you use these models you can set num_labels freely ### Initialization From d3b4534a91e7932207956d21b4a5c4c3b063e066 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Mon, 11 Jan 2021 12:23:27 -0500 Subject: [PATCH 067/155] Added Ted's feedback --- README.md | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index e23c0f18..2e39dd36 100644 --- a/README.md +++ b/README.md @@ -60,15 +60,15 @@ pip install happytransformer ## Word Prediction -Initialize a HappyWordPrediction() object to perform word prediction. +Initialize a HappyWordPrediction object to perform word prediction. -Initialization Arguments: - 1. model_type (string): either "ALBERT", "BERT" or "DISTILBERT." The default is "DISTILBERT" - 2. model_name(string): below is a URL that contains potential models. +**Initialization Arguments:** + 1. model_type (string): either "ALBERT", "BERT" or "DISTILBERT." The default is "DISTILBERT" + 2. model_name(string): below is a URL that contains potential models. [MODELS](https://huggingface.co/models?filter=masked-lm) -For all Transformers, the masked token is **"[MASK]"** +Note: For all Transformers, the masked token is **"[MASK]"** ### Initialization @@ -143,29 +143,26 @@ print(result[1].token_str) # technology ## Text Classification -Initialize a HappyTextClassification() object to perform text classification. +Initialize a HappyTextClassification object to perform text classification. This model assigns a label to a given text string. For example, you can train a model to detect if an email is spam based on its text. -Initialization Arguments: - 1. model_type (string): either "ALBERT", "BERT" or "DISTILBERT." The default is "DISTILBERT" - 2. model_name(string): below is a URL that contains potential models. The default is "distilbert-base-uncased" +**Initialization Arguments:** +1. model_type (string): either "ALBERT", "BERT" or "DISTILBERT." The default is "DISTILBERT" +2. model_name(string): below is a URL that contains potential models. The default is "distilbert-base-uncased" [MODELS](https://huggingface.co/models?filter=text-classification) - 3. num_labels(int): The number of text categories. The default is 2 +3. num_labels(int): The number of text categories. The default is 2 WARNING: If you try to load a pretrained model that has a different number of categories than num_labels, then you will get an error -"albert-base-v2", "bert-base-uncased" and "distilbert-base-uncased" do not have a predefined +NOTE: "albert-base-v2", "bert-base-uncased" and "distilbert-base-uncased" do not have a predefined number of labels, so if you use these models you can set num_labels freely ### Initialization -"HappyTextClassification("ALBERT", "textattack/albert-base-v2-SST-2")" has many useful applications. -It's able to detect the sentiment of text. 
-
 
 #### Example 2.0:
 ```python
@@ -215,6 +212,8 @@ inputs:
 The dictionary below shows the default values.
 
 Information about what the keys mean can be accessed [here](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments)
+```python
+
 ARGS_TC_TRAIN= {
  'learning_rate': 5e-5,
@@ -225,6 +224,7 @@ ARGS_TC_TRAIN= {
  'num_train_epochs': 3.0,
 
 }
+```
 
 Output: None
 
@@ -330,16 +329,16 @@ The list is in order by ascending csv index.
 
 ## Question Answering
 
-Initialize a HappyQuestionAnswering() object to perform question answering.
+Initialize a HappyQuestionAnswering object to perform question answering.
 
 This model answers a question given a body of text that's relevant to the question.
 
 The outputted answer is always a text-span within the provided information.
 
-Initialization Arguments:
- 1. model_type (string): either "ALBERT", "BERT" or "DISTILBERT." The default is "DISTILBERT"
- 2. model_name(string): below is a URL that contains potential models.
-  [MODELS](https://huggingface.co/models?filter=question-answering)
+**Initialization Arguments:**
+1. model_type (string): either "ALBERT", "BERT" or "DISTILBERT." The default is "DISTILBERT"
+2. model_name(string): below is a URL that contains potential models.
 [MODELS](https://huggingface.co/models?filter=question-answering)
 
-### Initialization
 
 We recommend using "HappyQuestionAnswering("ALBERT", "mfeb/albert-xxlarge-v2-squad2")" for the best performance
 
@@ -410,6 +410,8 @@ inputs:
 The dictionary below shows the default values.
 
 Information about what the keys mean can be accessed [here](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments)
+```python
+
 ARGS_QA_TRAIN= {
  'learning_rate': 5e-5,
@@ -420,7 +422,7 @@ ARGS_QA_TRAIN= {
  'num_train_epochs': 3.0,
 
 }
-
+```
 Output: None
 
 #### Table 3.1

From 5cce0d35ae21e809fe9163529826abf28bc094b Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Mon, 11 Jan 2021 12:26:45 -0500
Subject: [PATCH 068/155] moved initialization header'

---
 README.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 2e39dd36..fc2e8f5c 100644
--- a/README.md
+++ b/README.md
@@ -59,6 +59,7 @@ pip install happytransformer
 
 ## Word Prediction
 
+### Initialization
 Initialize a HappyWordPrediction object to perform word prediction.
 
@@ -70,7 +71,6 @@ Initialize a HappyWordPrediction object to perform word prediction.
 
 Note: For all Transformers, the masked token is **"[MASK]"**
 
-### Initialization
 
 We recommend using "HappyWordPrediction("ALBERT", "albert-xxlarge-v2")" for the best performance
 
@@ -142,6 +142,7 @@ print(result[1].token_str) # technology
 ```
 
 ## Text Classification
+### Initialization
 
 Initialize a HappyTextClassification object to perform text classification.
 
@@ -161,8 +162,6 @@ than num_labels, then you will get an error
 NOTE: "albert-base-v2", "bert-base-uncased" and "distilbert-base-uncased" do not have a predefined
 number of labels, so if you use these models you can set num_labels freely
 
-### Initialization
-
 #### Example 2.0:
@@ -330,6 +329,7 @@ The list is in order by ascending csv index.
 ```
 ## Question Answering
+### Initialization
 Initialize a HappyQuestionAnswering object to perform question answering.
 
 This model answers a question given a body of text that's relevant to the question.
@@ -341,7 +341,6 @@ The outputted answer is always a text-span within the provided information.
 [MODELS](https://huggingface.co/models?filter=question-answering)
 
-### Initialization
 
 We recommend using "HappyQuestionAnswering("ALBERT", "mfeb/albert-xxlarge-v2-squad2")" for the best performance

From affbe337f33cb38d94863cfe97a96da6551098c7 Mon Sep 17 00:00:00 2001
From: Will Macdonald
Date: Mon, 11 Jan 2021 13:19:32 -0500
Subject: [PATCH 069/155] Modelling NSP module structure after updated WP, QA
 and TC modules

---
 happytransformer/happy_next_sentence.py | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 happytransformer/happy_next_sentence.py

diff --git a/happytransformer/happy_next_sentence.py b/happytransformer/happy_next_sentence.py
new file mode 100644
index 00000000..535b6311
--- /dev/null
+++ b/happytransformer/happy_next_sentence.py
@@ -0,0 +1,60 @@
+import torch
+
+from collections import namedtuple
+from happytransformer.happy_transformer import HappyTransformer
+from happytransformer.mwp.trainer import WPTrainer
+
+from transformers import (
+    BertForMaskedLM,
+    BertTokenizerFast,
+    AlbertForMaskedLM,
+    AlbertTokenizerFast,
+    DistilBertForMaskedLM,
+    DistilBertTokenizerFast,
+    FillMaskPipeline,
+)
+
+from happytransformer.happy_transformer import HappyTransformer
+
+NextSentenceResult = namedtuple("NextSentenceResult", ("next_sentence", "score"))
+
+
+class HappyNextSentence(HappyTransformer):
+    """
+    A user facing class for next sentence prediction
+    """
+    def __init__(self, model_type="DISTILBERT",
+                 model_name="distilbert-base-uncased"):
+
+        if model_type == "ALBERT":
+            model = AlbertForMaskedLM.from_pretrained(model_name)
+            tokenizer = AlbertTokenizerFast.from_pretrained(model_name)
+
+        elif model_type == "BERT":
+            model = BertForMaskedLM.from_pretrained(model_name)
+            tokenizer = BertTokenizerFast.from_pretrained(model_name)
+
+        elif model_type == "DISTILBERT":
+            model = DistilBertForMaskedLM.from_pretrained(model_name)
+            tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
+        else:
+            raise ValueError(self.model_type_error)
+        super().__init__(model_type, model_name, model, tokenizer)
+        device_number = 1 if torch.cuda.is_available() else -1
+        self._pipeline = FillMaskPipeline(model=model, tokenizer=tokenizer, device=device_number)
+        self._trainer = WPTrainer(model, model_type, tokenizer, self._device, self.logger)
+
+    def predict_next_sentence(self):
+        """
+        TODO: Create DocString
+        """
+        raise NotImplementedError()
+
+    def train(self, input_filepath, args):
+        raise NotImplementedError("train() is currently not available")
+
+    def eval(self, input_filepath):
+        raise NotImplementedError("eval() is currently not available")
+
+    def test(self, input_filepath):
+        raise NotImplementedError("test() is currently not available")
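At this point predict_next_sentence() still raises NotImplementedError; the patches that follow fill in the implementation and its tests. Once they land, usage looks roughly like the sketch below (mirroring the test cases added next):

```python
from happytransformer import HappyNextSentence

happy_ns = HappyNextSentence()  # BERT-based, "bert-base-uncased"
score = happy_ns.predict_next_sentence(
    "Hi nice to meet you. How old are you?",
    "I am 21 years old."
)
# predict_next_sentence returns the softmax probability that
# sentence B follows sentence A, so values near 1.0 mean "likely"
print(score > 0.5)  # True
```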
From da7137cbe0541b5a13b8517772b365fca9bafb1b Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Mon, 11 Jan 2021 15:44:48 -0500
Subject: [PATCH 070/155] Added NS

---
 happytransformer/__init__.py            |  1 +
 happytransformer/happy_next_sentence.py | 80 +++++++++++++++----------
 tests/test_ns.py                        | 41 +++++++++++++
 3 files changed, 91 insertions(+), 31 deletions(-)
 create mode 100644 tests/test_ns.py

diff --git a/happytransformer/__init__.py b/happytransformer/__init__.py
index 5a213275..359bcb50 100644
--- a/happytransformer/__init__.py
+++ b/happytransformer/__init__.py
@@ -1,5 +1,6 @@
 from happytransformer.happy_question_answering import HappyQuestionAnswering
 from happytransformer.happy_word_prediction import HappyWordPrediction
 from happytransformer.happy_text_classification import HappyTextClassification
+from
happytransformer.happy_next_sentence import HappyNextSentence name = "happytransformer" diff --git a/happytransformer/happy_next_sentence.py b/happytransformer/happy_next_sentence.py index 535b6311..8ae15667 100644 --- a/happytransformer/happy_next_sentence.py +++ b/happytransformer/happy_next_sentence.py @@ -1,54 +1,72 @@ import torch - -from collections import namedtuple -from happytransformer.happy_transformer import HappyTransformer -from happytransformer.mwp.trainer import WPTrainer - +import re from transformers import ( - BertForMaskedLM, BertTokenizerFast, - AlbertForMaskedLM, - AlbertTokenizerFast, - DistilBertForMaskedLM, - DistilBertTokenizerFast, - FillMaskPipeline, + BertForNextSentencePrediction, + ) from happytransformer.happy_transformer import HappyTransformer -NextSentenceResult = namedtuple("NextSentenceResult", ("next_sentence", "score")) - class HappyNextSentence(HappyTransformer): """ A user facing class for next sentence prediction """ - def __init__(self, model_type="DISTILBERT", - model_name="distilbert-base-uncased"): - - if model_type == "ALBERT": - model = AlbertForMaskedLM.from_pretrained(model_name) - tokenizer = AlbertTokenizerFast.from_pretrained(model_name) + def __init__(self, model_type="BERT", + model_name="bert-base-uncased"): - elif model_type == "BERT": - model = BertForMaskedLM.from_pretrained(model_name) + if model_type == "BERT": + model = BertForNextSentencePrediction.from_pretrained(model_name) tokenizer = BertTokenizerFast.from_pretrained(model_name) - - elif model_type == "DISTILBERT": - model = DistilBertForMaskedLM.from_pretrained(model_name) - tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) else: raise ValueError(self.model_type_error) super().__init__(model_type, model_name, model, tokenizer) - device_number = 1 if torch.cuda.is_available() else -1 - self._pipeline = FillMaskPipeline(model=model, tokenizer=tokenizer, device=device_number) - self._trainer = WPTrainer(model, model_type, tokenizer, self._device, self.logger) + self._pipeline = None + self._trainer = None + + def predict_next_sentence(self, sentence_a, sentence_b): + """ + Determines if sentence B is likely to be a continuation after sentence + A. + :param sentence_a: First sentence + :param sentence_b: Second sentence to test if it comes after the first + :return Result of whether sente_devicence B follows sentence A as a probability + """ + if not self.__is_one_sentence(sentence_a) or not self.__is_one_sentence(sentence_b): + raise ValueError('Each inputted text variable for the "predict_next_sentence" method must contain a single sentence') + + encoded = self._tokenizer(sentence_a, sentence_b, return_tensors='pt') + with torch.no_grad(): + scores = self._model(encoded['input_ids'], token_type_ids=encoded['token_type_ids']).logits[0] + + probabilities = torch.softmax(scores, dim=0) + # probability that sentence B follows sentence A + score = probabilities[0].item() + + if self._device == 'cuda': + torch.cuda.empty_cache() + + return score - def predict_next_sentence(self): + def __is_one_sentence(self, text): """ - TODO: Create DocString + Used to verify the proper input requirements for sentence_relation. + The text must contain no more than a single sentence. + Casual use of punctuation is accepted, such as using multiple exclamation marks. 
+ :param text: A body of text + :return: True if the body of text contains a single sentence, else False """ - raise NotImplementedError() + split_text = re.split('[?.!]', text) + sentence_found = False + for possible_sentence in split_text: + for char in possible_sentence: + if char.isalpha(): + if sentence_found: + return False + sentence_found = True + break + return True def train(self, input_filepath, args): raise NotImplementedError("train() is currently not available") diff --git a/tests/test_ns.py b/tests/test_ns.py new file mode 100644 index 00000000..a0886a57 --- /dev/null +++ b/tests/test_ns.py @@ -0,0 +1,41 @@ +from happytransformer import HappyNextSentence +import pytest +# Note, some of the model's weights are randomly initialized +# So we can not rely on getting the same score each time +# we run a unit test. + + +def test_sp_true(): + happy_ns = HappyNextSentence() + result = happy_ns.predict_next_sentence( + "How old are you?", + "I am 21 years old." + ) + assert result > 0.5 + + +def test_sp_false(): + happy_ns = HappyNextSentence() + result = happy_ns.predict_next_sentence( + "How old are you?", + "The Eiffel Tower is in Paris." + ) + assert result < 0.5 + + +def test_sp_sa_too_long(): + happy_ns = HappyNextSentence() + with pytest.raises(Exception): + result = happy_ns.predict_next_sentence( + "How old are you? I'm 21 years old.", + "I am 93 years old." + ) + + +def test_sp_sb_too_long(): + happy_ns = HappyNextSentence() + with pytest.raises(Exception): + result = happy_ns.predict_next_sentence( + "How old are you?", + "I am 93 years old. I'm 21 years old." + ) From 9f2d926453076b0458722990dca12af3d0d32276 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Mon, 11 Jan 2021 15:50:47 -0500 Subject: [PATCH 071/155] Updated doc string for predict_next_sentence --- happytransformer/happy_next_sentence.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/happytransformer/happy_next_sentence.py b/happytransformer/happy_next_sentence.py index 8ae15667..e2f90745 100644 --- a/happytransformer/happy_next_sentence.py +++ b/happytransformer/happy_next_sentence.py @@ -29,9 +29,9 @@ def predict_next_sentence(self, sentence_a, sentence_b): """ Determines if sentence B is likely to be a continuation after sentence A. 
- :param sentence_a: First sentence - :param sentence_b: Second sentence to test if it comes after the first - :return Result of whether sente_devicence B follows sentence A as a probability + :param sentence_a (string): First sentence + :param sentence_b (string): Second sentence to test if it comes after the first + :return (float): The probability that sentence_b follows sentence_a """ if not self.__is_one_sentence(sentence_a) or not self.__is_one_sentence(sentence_b): raise ValueError('Each inputted text variable for the "predict_next_sentence" method must contain a single sentence') From 0e74389fcd5fef08c7bc47f5e00c19a5a6330949 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Mon, 11 Jan 2021 16:24:16 -0500 Subject: [PATCH 072/155] Added Ted's feedback --- happytransformer/happy_next_sentence.py | 21 --------------------- tests/test_ns.py | 20 ++------------------ 2 files changed, 2 insertions(+), 39 deletions(-) diff --git a/happytransformer/happy_next_sentence.py b/happytransformer/happy_next_sentence.py index e2f90745..094cccc4 100644 --- a/happytransformer/happy_next_sentence.py +++ b/happytransformer/happy_next_sentence.py @@ -33,8 +33,6 @@ def predict_next_sentence(self, sentence_a, sentence_b): :param sentence_b (string): Second sentence to test if it comes after the first :return (float): The probability that sentence_b follows sentence_a """ - if not self.__is_one_sentence(sentence_a) or not self.__is_one_sentence(sentence_b): - raise ValueError('Each inputted text variable for the "predict_next_sentence" method must contain a single sentence') encoded = self._tokenizer(sentence_a, sentence_b, return_tensors='pt') with torch.no_grad(): @@ -49,25 +47,6 @@ def predict_next_sentence(self, sentence_a, sentence_b): return score - def __is_one_sentence(self, text): - """ - Used to verify the proper input requirements for sentence_relation. - The text must contain no more than a single sentence. - Casual use of punctuation is accepted, such as using multiple exclamation marks. - :param text: A body of text - :return: True if the body of text contains a single sentence, else False - """ - split_text = re.split('[?.!]', text) - sentence_found = False - for possible_sentence in split_text: - for char in possible_sentence: - if char.isalpha(): - if sentence_found: - return False - sentence_found = True - break - return True - def train(self, input_filepath, args): raise NotImplementedError("train() is currently not available") diff --git a/tests/test_ns.py b/tests/test_ns.py index a0886a57..ac0fffcd 100644 --- a/tests/test_ns.py +++ b/tests/test_ns.py @@ -1,5 +1,5 @@ from happytransformer import HappyNextSentence -import pytest + # Note, some of the model's weights are randomly initialized # So we can not rely on getting the same score each time # we run a unit test. @@ -8,7 +8,7 @@ def test_sp_true(): happy_ns = HappyNextSentence() result = happy_ns.predict_next_sentence( - "How old are you?", + "Hi nice to meet you. How old are you?", "I am 21 years old." ) assert result > 0.5 @@ -23,19 +23,3 @@ def test_sp_false(): assert result < 0.5 -def test_sp_sa_too_long(): - happy_ns = HappyNextSentence() - with pytest.raises(Exception): - result = happy_ns.predict_next_sentence( - "How old are you? I'm 21 years old.", - "I am 93 years old." - ) - - -def test_sp_sb_too_long(): - happy_ns = HappyNextSentence() - with pytest.raises(Exception): - result = happy_ns.predict_next_sentence( - "How old are you?", - "I am 93 years old. I'm 21 years old." 
-    )

From 5e3f07f751270cf9314558361acb088e073b7795 Mon Sep 17 00:00:00 2001
From: Will Macdonald
Date: Mon, 11 Jan 2021 16:25:49 -0500
Subject: [PATCH 073/155] Initial implementation of ROBERTA within modules QA,
 TC, and WP Created tests for respective modules within test_qa.py, test_tc.py
 and test_wp.py Additionally included a MWP example in readme_examples.py

---
 examples/word_prediction/readme_examples.py   | 15 +++++++++-
 happytransformer/happy_question_answering.py  | 12 +++++---
 happytransformer/happy_text_classification.py | 17 +++++------
 happytransformer/happy_word_prediction.py     | 15 ++++++++--
 tests/test_qa.py                              | 30 +++++++++++++++++++
 tests/test_tc.py                              | 29 ++++++++++++++++++
 tests/test_wp.py                              | 14 ++++++++-
 7 files changed, 114 insertions(+), 18 deletions(-)

diff --git a/examples/word_prediction/readme_examples.py b/examples/word_prediction/readme_examples.py
index 829129b9..58dbdb27 100644
--- a/examples/word_prediction/readme_examples.py
+++ b/examples/word_prediction/readme_examples.py
@@ -5,6 +5,7 @@ def example_1_0():
     happy_wp_distilbert = HappyWordPrediction()  # default
     happy_wp_albert = HappyWordPrediction("ALBERT", "albert-base-v2")
     happy_wp_bert = HappyWordPrediction("BERT", "bert-base-uncased")
+    happy_wp_roberta = HappyWordPrediction("ROBERTA", "roberta-base")
 
 
 def example_1_1():
@@ -35,8 +36,20 @@ def example_1_3():
     print(result[1].token_str)  # technology
 
 
+def example_1_4():
+    happy_wp = HappyWordPrediction("ROBERTA", "roberta-base")
+    result = happy_wp.predict_mask("To better the world I would invest in <mask> and education.", top_k=10)
+    print(result)  # [WordPredictionResult(token_str='science', score=0.308580607175827), WordPredictionResult(token_str='research', score=0.18018342554569244)]
+    print(result[1])  # WordPredictionResult(token_str='research', score=0.18018342554569244)
+    print(result[1].token_str)  # research
+
+
 def main():
-    example_1_1()
+    # example_1_1()
+    # example_1_1()
+    # example_1_2()
+    # example_1_3()
+    example_1_4()
 
 
 if __name__ == "__main__":
diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py
index c4f898a8..33750ef1 100644
--- a/happytransformer/happy_question_answering.py
+++ b/happytransformer/happy_question_answering.py
@@ -15,11 +15,14 @@
     DistilBertTokenizerFast,
     AlbertForQuestionAnswering,
     AlbertTokenizerFast,
+    RobertaForQuestionAnswering,
+    RobertaTokenizerFast,
     QuestionAnsweringPipeline,
 )
 
 QuestionAnsweringResult = namedtuple("QuestionAnsweringResult", ["answer", "score", "start", "end"])
 
+
 class HappyQuestionAnswering(HappyTransformer):
     """
     This class is a user facing class that allows users to solve question answering problems using
@@ -44,6 +47,9 @@ def __init__(self, model_type="DISTILBERT",
         elif model_type == "DISTILBERT":
             model = DistilBertForQuestionAnswering.from_pretrained(model_name)
             tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
+        elif model_type == "ROBERTA":
+            model = RobertaForQuestionAnswering.from_pretrained(model_name)
+            tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
         else:
             raise ValueError(self.model_type_error)
 
@@ -53,11 +59,9 @@ def __init__(self, model_type="DISTILBERT",
 
         # from documentation " a positive will run the model on the associated CUDA device id."
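+        # note: for Hugging Face pipelines, device=-1 keeps the model on CPU,
+        # while a non-negative value is a 0-indexed CUDA device id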
# todo: get device ID if torch.cuda.is_available() - self._pipeline = QuestionAnsweringPipeline(model, tokenizer, device=device_number) - + self._pipeline = QuestionAnsweringPipeline(model=model, tokenizer=tokenizer, device=device_number) - self._trainer = QATrainer(model, model_type, tokenizer, self._device, - self.logger) + self._trainer = QATrainer(model, model_type, tokenizer, self._device, self.logger) def answer_question(self, context, question, topk=1): """ diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index 50d88bb7..6fbf22a9 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -11,8 +11,8 @@ DistilBertTokenizerFast, AlbertForSequenceClassification, AlbertTokenizerFast, - - + RobertaForSequenceClassification, + RobertaTokenizerFast, TextClassificationPipeline ) from happytransformer.tc.trainer import TCTrainer @@ -20,9 +20,9 @@ from happytransformer.happy_transformer import HappyTransformer from happytransformer.tc.default_args import ARGS_TC_TRAIN - TextClassificationResult = namedtuple("TextClassificationResult", ["label", "score"]) + class HappyTextClassification(HappyTransformer): """ A user facing class for Text Classification @@ -42,6 +42,9 @@ def __init__(self, model_type="DISTILBERT", elif model_type == "DISTILBERT": model = DistilBertForSequenceClassification.from_pretrained(model_name) tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) + elif model_type == "ROBERTA": + model = RobertaForSequenceClassification.from_pretrained(model_name) + tokenizer = RobertaTokenizerFast.from_pretrained(model_name) else: raise ValueError(self.model_type_error) @@ -52,12 +55,9 @@ def __init__(self, model_type="DISTILBERT", # from documentation " a positive will run the model on the associated CUDA device id." 
# todo: get device ID if torch.cuda.is_available() - self._pipeline = TextClassificationPipeline(model=model, - tokenizer=tokenizer, device=device_number) - + self._pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=device_number) - self._trainer = TCTrainer(self._model, - self.model_type, self._tokenizer, self._device, self.logger) + self._trainer = TCTrainer(self._model, self.model_type, self._tokenizer, self._device, self.logger) def classify_text(self, text): """ @@ -73,7 +73,6 @@ def classify_text(self, text): return TextClassificationResult(label=first_result["label"], score=first_result["score"]) - def train(self, input_filepath, args=ARGS_TC_TRAIN): """ Trains the question answering model diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index f2fc137e..437f373e 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -5,8 +5,9 @@ AlbertTokenizerFast, DistilBertForMaskedLM, DistilBertTokenizerFast, + RobertaForMaskedLM, + RobertaTokenizerFast, FillMaskPipeline, - ) import torch from collections import namedtuple @@ -28,19 +29,23 @@ def __init__(self, model_type="DISTILBERT", if model_type == "ALBERT": model = AlbertForMaskedLM.from_pretrained(model_name) tokenizer = AlbertTokenizerFast.from_pretrained(model_name) - elif model_type == "BERT": model = BertForMaskedLM.from_pretrained(model_name) tokenizer = BertTokenizerFast.from_pretrained(model_name) - elif model_type == "DISTILBERT": model = DistilBertForMaskedLM.from_pretrained(model_name) tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) + elif model_type == "ROBERTA": + model = RobertaForMaskedLM.from_pretrained(model_name) + tokenizer = RobertaTokenizerFast.from_pretrained(model_name) + else: raise ValueError(self.model_type_error) super().__init__(model_type, model_name, model, tokenizer) device_number = 1 if torch.cuda.is_available() else -1 + self._pipeline = FillMaskPipeline(model=model, tokenizer=tokenizer, device=device_number) + self._trainer = WPTrainer(model, model_type, tokenizer, self._device, self.logger) def predict_mask(self, text, targets=None, top_k=1): @@ -60,6 +65,10 @@ def predict_mask(self, text, targets=None, top_k=1): for answer in result: if answer["token_str"][0] == "▁": answer["token_str"] = answer["token_str"][1:] + elif self.model_type == "ROBERTA": + for answer in result: + if answer["token_str"][0] == "Ġ": + answer["token_str"] = answer["token_str"][1:] results = [ WordPredictionResult( token_str=answer["token_str"], diff --git a/tests/test_qa.py b/tests/test_qa.py index 50760d13..51477a2e 100644 --- a/tests/test_qa.py +++ b/tests/test_qa.py @@ -20,6 +20,7 @@ def test_qa_answer_question_top_k(): QuestionAnsweringResult(answer='January', score=0.005092293489724398, start=16, end=23)] assert result == answer + def test_qa_train(): happy_qa = HappyQuestionAnswering() happy_qa.train("../data/qa/train-eval.csv") @@ -52,6 +53,7 @@ def test_qa_train_effectiveness(): assert after_loss < before_loss + def test_qa_train_effectiveness_albert(): """ Ensures that HappyQuestionAnswering.train() results in @@ -65,6 +67,7 @@ def test_qa_train_effectiveness_albert(): assert after_loss < before_loss + def test_qa_test_albert(): happy_qa = HappyQuestionAnswering("ALBERT", "twmkn9/albert-base-v2-squad2") result = happy_qa.test("../data/qa/test.csv") @@ -86,9 +89,36 @@ def test_qa_train_effectiveness_bert(): assert after_loss < before_loss + def test_qa_test_bert(): happy_qa = 
HappyQuestionAnswering("BERT", "mrm8488/bert-tiny-5-finetuned-squadv2") result = happy_qa.test("../data/qa/test.csv") answer = [QuestionAnsweringResult(answer='October 31st', score=0.9352769255638123, start=0, end=12), QuestionAnsweringResult(answer='November 23rd', score=0.9180678129196167, start=12, end=25)] assert result == answer + + +def test_qa_train_effectiveness_roberta(): + """ + Ensures that HappyQuestionAnswering.train() results in + lowering the loss as determined by HappyQuestionAnswering.eval() + """ + + happy_qa = HappyQuestionAnswering("ROBERTA", "roberta-base") + before_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss + happy_qa.train("../data/qa/train-eval.csv") + after_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss + assert after_loss < before_loss + + +def test_qa_test_roberta(): + happy_qa = HappyQuestionAnswering("ROBERTA", "roberta-base") + result = happy_qa.test("../data/qa/test.csv") + print(result) + answer = [QuestionAnsweringResult(answer='is the', score=0.03888237848877907, start=13, end=19), + QuestionAnsweringResult(answer='date is', score=0.02540113404393196, start=4, end=11)] + + +if __name__ == '__main__': + test_qa_test_roberta() + test_qa_train_effectiveness_roberta() diff --git a/tests/test_tc.py b/tests/test_tc.py index 1a780ae5..0ba22fae 100644 --- a/tests/test_tc.py +++ b/tests/test_tc.py @@ -121,3 +121,32 @@ def test_qa_train_effectiveness_bert(): happy_tc.train("../data/tc/train-eval.csv") after_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss assert after_loss < before_loss + + +def test_qa_test_roberta(): + """ + Tests + HappyQuestionAnswering.test() + """ + happy_tc = HappyTextClassification(model_type="ROBERTA", model_name="textattack/roberta-base-imdb") + + result = happy_tc.test("../data/tc/test.csv") + answer = [TextClassificationResult(label='LABEL_1', score=0.9883185029029846), + TextClassificationResult(label='LABEL_0', score=0.9893660545349121), + TextClassificationResult(label='LABEL_0', score=0.947014331817627), + TextClassificationResult(label='LABEL_1', score=0.9845685958862305)] + assert result == answer + + +def test_qa_train_effectiveness_roberta(): + """ + Tests + Ensures that HappyQuestionAnswering.train() results in + lowering the loss as determined by HappyQuestionAnswering.eval() + """ + happy_tc = HappyTextClassification(model_type="ROBERTA", model_name="textattack/roberta-base-imdb") + before_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss + happy_tc.train("../data/tc/train-eval.csv") + after_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss + assert after_loss < before_loss + diff --git a/tests/test_wp.py b/tests/test_wp.py index 5b7a429f..032d8d46 100644 --- a/tests/test_wp.py +++ b/tests/test_wp.py @@ -1,6 +1,7 @@ from happytransformer import HappyWordPrediction from happytransformer.happy_word_prediction import WordPredictionResult + def test_mwp_basic(): happy_mwp = HappyWordPrediction() result = happy_mwp.predict_mask( @@ -21,6 +22,7 @@ def test_mwp_top_k(): assert result == answer + def test_mwp_targets(): happy_mwp = HappyWordPrediction() result = happy_mwp.predict_mask( @@ -40,10 +42,20 @@ def test_mwp_basic_albert(): answer = [WordPredictionResult(token_str='garlic', score=0.036625903099775314)] assert result == answer + def test_mwp_basic_bert(): happy_mwp = HappyWordPrediction("BERT", "bert-base-uncased") result = happy_mwp.predict_mask( "Please pass the salt and [MASK]", ) answer = [WordPredictionResult(token_str='.', score=0.8466101884841919)] - assert 
result == answer
\ No newline at end of file
+    assert result == answer
+
+
+def test_mwp_basic_roberta():
+    happy_mwp = HappyWordPrediction("ROBERTA", "roberta-base")
+    result = happy_mwp.predict_mask(
+        "Please pass the salt and <mask>",  # Roberta requires mask to be '<mask>' as opposed to '[MASK]'
+    )
+    answer = [WordPredictionResult(token_str='pepper', score=0.7325230240821838)]
+    assert result == answer

From 24c37a9023a3804964a96fad25942a070c721e8e Mon Sep 17 00:00:00 2001
From: Will Macdonald
Date: Mon, 11 Jan 2021 16:28:07 -0500
Subject: [PATCH 074/155] Removing print statement and lines used for testing
 QA functionality

---
 tests/test_qa.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/tests/test_qa.py b/tests/test_qa.py
index 51477a2e..fcde7bcd 100644
--- a/tests/test_qa.py
+++ b/tests/test_qa.py
@@ -114,11 +114,6 @@ def test_qa_train_effectiveness_roberta():
 def test_qa_test_roberta():
     happy_qa = HappyQuestionAnswering("ROBERTA", "roberta-base")
     result = happy_qa.test("../data/qa/test.csv")
-    print(result)
     answer = [QuestionAnsweringResult(answer='is the', score=0.03888237848877907, start=13, end=19),
               QuestionAnsweringResult(answer='date is', score=0.02540113404393196, start=4, end=11)]
-
-
-if __name__ == '__main__':
-    test_qa_test_roberta()
-    test_qa_train_effectiveness_roberta()
+    assert result == answer

From 2cb16234bfaa5ef802027edfe2e78d0a0973731e Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Mon, 11 Jan 2021 16:39:13 -0500
Subject: [PATCH 075/155] updated requirements

---
 requirements.txt | 7 ++-----
 setup.py         | 9 +++------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index cffb13c8..816e3a0c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,3 @@
-numpy>=1.17.4
-torch>=1.3.1
-tqdm>=4.38.0
+torch>=1.0
+tqdm>=4.27
 transformers>=4.0.0
-pandas>=0.23.0
-scikit_learn>=0.22.1
\ No newline at end of file
diff --git a/setup.py b/setup.py
index ea743a81..666eb778 100644
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@
 setup(
     name = 'happytransformer',
     packages = ['happytransformer',],
-    version = '1.1.4',
+    version = '2.0.0',
     license='Apache 2.0',
     description = "Happy Transformer is an API built on top of Hugging Face's Transformer library that makes it easy to utilize state-of-the-art NLP models.",
     long_description= readme,
@@ -21,11 +21,8 @@
 
 
     install_requires=[
-        'numpy',
-        'torch',
-        'pandas',
-        'tqdm',
-        'scikit_learn',
+        'torch>=1.0',
+        'tqdm>=4.27',
         'transformers>=4.0.0',
     ],
 
From 7e4221c20e6b13e5ab366335defe4393c85d22f4 Mon Sep 17 00:00:00 2001
From: Will Macdonald
Date: Mon, 11 Jan 2021 16:46:49 -0500
Subject: [PATCH 076/155] Adding in RoBERTa model into
 question_answering/readme_examples.py and
 text_classification/readme_examples.py

---
 examples/question_answering/readme_examples.py  | 8 ++++++++
 examples/text_classification/readme_examples.py | 3 ++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/examples/question_answering/readme_examples.py b/examples/question_answering/readme_examples.py
index 3fe22a33..63d8f494 100644
--- a/examples/question_answering/readme_examples.py
+++ b/examples/question_answering/readme_examples.py
@@ -1,10 +1,13 @@
 from happytransformer import HappyQuestionAnswering
 
+
+
 def example_3_0():
     happy_qa_distilbert = HappyQuestionAnswering()  # default
     happy_qa_albert = HappyQuestionAnswering("ALBERT", "mfeb/albert-xxlarge-v2-squad2")
     # good model when using with limited hardware
     happy_qa_bert = HappyQuestionAnswering("BERT",
"mrm8488/bert-tiny-5-finetuned-squadv2") + happy_qa_roberta = HappyQuestionAnswering("ROBERTA", "deepset/roberta-base-squad2") def example_3_1(): @@ -24,10 +27,12 @@ def example_3_2(): print(result) # [QuestionAnsweringResult(answer='January 10th, 2021', score=0.9711642265319824, start=16, end=34), QuestionAnsweringResult(answer='January 10th', score=0.017306014895439148, start=16, end=28)] print(result[1].answer) # January 10th + def example_3_3(): happy_qa = HappyQuestionAnswering() happy_qa.train("../../data/qa/train-eval.csv") + def example_3_4(): happy_qa = HappyQuestionAnswering() result = happy_qa.eval("../../data/qa/train-eval.csv") @@ -35,6 +40,7 @@ def example_3_4(): print(result) # EvalResult(eval_loss=0.11738169193267822) print(result.eval_loss) # 0.1173816919326782 + def example_3_5(): happy_qa = HappyQuestionAnswering() result = happy_qa.test("../../data/qa/test.csv") @@ -43,6 +49,7 @@ def example_3_5(): print(result[0]) # QuestionAnsweringResult(answer='October 31st', score=0.9939756989479065, start=0, end=12) print(result[0].answer) # October 31st + def example_3_6(): happy_qa = HappyQuestionAnswering() before_loss = happy_qa.eval("../../data/qa/train-eval.csv").eval_loss @@ -54,6 +61,7 @@ def example_3_6(): # Note: typically you evaluate with a separate dataset # but for simplicity the same one was used + def main(): example_3_1() diff --git a/examples/text_classification/readme_examples.py b/examples/text_classification/readme_examples.py index 3a220ee2..3c68bb8f 100644 --- a/examples/text_classification/readme_examples.py +++ b/examples/text_classification/readme_examples.py @@ -1,10 +1,11 @@ - from happytransformer import HappyTextClassification + def example_2_0(): happy_qa_distilbert = HappyTextClassification() # default with "distilbert-base-uncased" happy_tc_albert = HappyTextClassification(model_type="ALBERT", model_name="albert-base-v2") happy_qa_bert = HappyTextClassification("BERT", "bert-base-uncased") + happy_qa_roberta = HappyTextClassification("ROBERTA", "roberta-base") def example_2_1(): From 0104de118f6c8eff2fb46ac9355afd7886059043 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Mon, 11 Jan 2021 20:59:59 -0500 Subject: [PATCH 077/155] Added readme for NSP' --- README.md | 54 +++++++++++++++++++ .../readme_examples.py | 32 +++++++++++ 2 files changed, 86 insertions(+) create mode 100644 examples/next_sentence_prediction/readme_examples.py diff --git a/README.md b/README.md index fc2e8f5c..c1b581aa 100644 --- a/README.md +++ b/README.md @@ -514,7 +514,61 @@ The list is in order by ascending csv index. ``` ## Next Sentence Prediction +### Initialization + +Initialize a HappyNextSentence object to next sentence prediction + +**Initialization Arguments:** + 1. model_type (string): The default is "BERT", which is currently the only available model + 2. model_name(string): We recommend none-finetuned BERT models like + "bert-base-uncased" and "bert-large-uncased" + + +#### Example 4.0: +```python + from happytransformer import HappyNextSentence + # --------------------------------------# + happy_ns = HappyNextSentence() # default is "bert-base-uncased" + happy_ns_large = HappyNextSentence("BERT", "bert-large-uncased") + +``` + +### predict_next_sentence() +Inputs: +We recommend keeping sentence_a and sentence_b to a single sentence. But longer inputs still work. +1. sentence_a (string): A sentence +2. 
sentence_b (string): A sentence that may or may not follow sentence_a
+
+Returns:
+A float between 0 and 1 that represents how likely it is that sentence_b follows sentence_a.
+the higher the value, the more likely sentence_b follows sentence_a
+
+#### Example 4.1:
+```python
+    from happytransformer import HappyNextSentence
+    # --------------------------------------#
+    happy_ns = HappyNextSentence()
+    result = happy_ns.predict_next_sentence(
+        "How old are you?",
+        "I am 21 years old."
+    )
+    print(type(result))  # <class 'float'>
+    print(result)  # 0.9999918937683105
+```
+
+#### Example 4.2:
+```python
+    from happytransformer import HappyNextSentence
+    # --------------------------------------#
+    happy_ns = HappyNextSentence()
+    result = happy_ns.predict_next_sentence(
+        "How old are you?",
+        "Queen's University is in Kingston Ontario Canada"
+    )
+    print(type(result))  # <class 'float'>
+    print(result)  # 0.00018497584096621722
+```
 
 ## Tech
 
 Happy Transformer uses a number of open source projects:
diff --git a/examples/next_sentence_prediction/readme_examples.py b/examples/next_sentence_prediction/readme_examples.py
new file mode 100644
index 00000000..9f6d800f
--- /dev/null
+++ b/examples/next_sentence_prediction/readme_examples.py
@@ -0,0 +1,32 @@
+from happytransformer import HappyNextSentence
+
+def example_4_0():
+    happy_ns = HappyNextSentence()  # default is "bert-base-uncased"
+    happy_ns_large = HappyNextSentence("BERT", "bert-large-uncased")
+
+
+def example_4_1():
+    happy_ns = HappyNextSentence()
+    result = happy_ns.predict_next_sentence(
+        "How old are you?",
+        "I am 21 years old."
+    )
+    print(type(result))  # <class 'float'>
+    print(result)  # 0.9999918937683105
+
+def example_4_2():
+    happy_ns = HappyNextSentence()
+    result = happy_ns.predict_next_sentence(
+        "How old are you?",
+        "Queen's University is in Kingston Ontario Canada"
+    )
+    print(type(result))  # <class 'float'>
+    print(result)  # 0.00018497584096621722
+
+
+def main():
+    example_4_2()
+
+
+if __name__ == "__main__":
+    main()

From 8c8a1a15b2c05fdc82cbdd89fcf0d2f3273973eb Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Mon, 11 Jan 2021 21:03:06 -0500
Subject: [PATCH 078/155] removed sentence

---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index c1b581aa..3f339802 100644
--- a/README.md
+++ b/README.md
@@ -542,7 +542,6 @@ We recommend keeping sentence_a and sentence_b to a single sentence. But longer
 
 Returns:
 A float between 0 and 1 that represents how likely it is that sentence_b follows sentence_a.
-the higher the value, the more likely sentence_b follows sentence_a #### Example 4.1: ```python From 25dc2cd409bea60bf6d7ad262b9dd81a63f1d103 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Mon, 11 Jan 2021 21:09:34 -0500 Subject: [PATCH 079/155] Added pytest to req --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 816e3a0c..72cbd14e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ torch>=1.0 tqdm>=4.27 transformers>=4.0.0 +pytest \ No newline at end of file From b3710ee1ea725da1545a010328ff547927c282e4 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Mon, 11 Jan 2021 22:37:13 -0500 Subject: [PATCH 080/155] Added __init__.py files and updated setup.py --- happytransformer/mwp/__init__.py | 4 ++++ happytransformer/qa/__init__.py | 4 ++++ happytransformer/tc/__init__.py | 4 ++++ setup.py | 6 +++--- 4 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 happytransformer/mwp/__init__.py create mode 100644 happytransformer/qa/__init__.py create mode 100644 happytransformer/tc/__init__.py diff --git a/happytransformer/mwp/__init__.py b/happytransformer/mwp/__init__.py new file mode 100644 index 00000000..58983a23 --- /dev/null +++ b/happytransformer/mwp/__init__.py @@ -0,0 +1,4 @@ +from .trainer import WPTrainer +from .default_args import ARGS_MWP_TRAIN + +name = "happytransformer.mwp" diff --git a/happytransformer/qa/__init__.py b/happytransformer/qa/__init__.py new file mode 100644 index 00000000..26874cca --- /dev/null +++ b/happytransformer/qa/__init__.py @@ -0,0 +1,4 @@ +from .trainer import QATrainer +from .default_args import ARGS_QA_TRAIN + +name = "happytransformer.qa" diff --git a/happytransformer/tc/__init__.py b/happytransformer/tc/__init__.py new file mode 100644 index 00000000..7cb22196 --- /dev/null +++ b/happytransformer/tc/__init__.py @@ -0,0 +1,4 @@ +from .trainer import TCTrainer +from .default_args import ARGS_TC_TRAIN + +name = "happytransformer.tc" diff --git a/setup.py b/setup.py index 666eb778..b393184f 100644 --- a/setup.py +++ b/setup.py @@ -1,5 +1,5 @@ # from distutils.core import setup -from setuptools import setup +from setuptools import setup, find_packages import pathlib @@ -8,8 +8,8 @@ setup( name = 'happytransformer', - packages = ['happytransformer',], - version = '2.0.0', + packages = find_packages(), + version = '2.0.0a4', license='Apache 2.0', description = "Happy Transformer is an API built on top of Hugging Face's Transformer library that makes it easy to utilize state-of-the-art NLP models.", long_description= readme, From fd2abfb1b003e2c5d2e571ebdf9031b2801330a0 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Mon, 11 Jan 2021 22:46:32 -0500 Subject: [PATCH 081/155] Added NSP trainer skeleton code --- happytransformer/sp/__init__.py | 4 ++++ happytransformer/sp/defauly_args.py | 10 ++++++++++ happytransformer/sp/trainer.py | 14 ++++++++++++++ 3 files changed, 28 insertions(+) create mode 100644 happytransformer/sp/__init__.py create mode 100644 happytransformer/sp/defauly_args.py create mode 100644 happytransformer/sp/trainer.py diff --git a/happytransformer/sp/__init__.py b/happytransformer/sp/__init__.py new file mode 100644 index 00000000..65a0f0ef --- /dev/null +++ b/happytransformer/sp/__init__.py @@ -0,0 +1,4 @@ +from .trainer import SPTrainer +from .defauly_args import ARGS_SP_TRAIN + +name = "happytransformer.sp" diff --git a/happytransformer/sp/defauly_args.py b/happytransformer/sp/defauly_args.py new file mode 100644 index 00000000..db0e6c4f --- 
/dev/null +++ b/happytransformer/sp/defauly_args.py @@ -0,0 +1,10 @@ +ARGS_SP_TRAIN = { + 'learning_rate': 5e-5, + 'weight_decay': 0, + 'adam_beta1': 0.9, + 'adam_beta2': 0.999, + 'adam_epsilon': 1e-8, + 'max_grad_norm': 1.0, + 'num_train_epochs': 3.0, + +} diff --git a/happytransformer/sp/trainer.py b/happytransformer/sp/trainer.py new file mode 100644 index 00000000..e7c6b9c8 --- /dev/null +++ b/happytransformer/sp/trainer.py @@ -0,0 +1,14 @@ +from happytransformer.happy_trainer import HappyTrainer +from happytransformer.sp.defauly_args import ARGS_SP_TRAIN + + +class SPTrainer(HappyTrainer): + + def train(self, input_filepath, args=ARGS_SP_TRAIN): + raise NotImplementedError() + + def eval(self, input_filepath): + raise NotImplementedError() + + def test(self, input_filepath, pipeline): + raise NotImplementedError() \ No newline at end of file From 01d8cab8012f4fc6377b20b523810d48deea52d6 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Mon, 11 Jan 2021 23:34:03 -0500 Subject: [PATCH 082/155] cuda devices are 0 indexed. --- happytransformer/happy_word_prediction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index f2fc137e..5e60809f 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -39,7 +39,7 @@ def __init__(self, model_type="DISTILBERT", else: raise ValueError(self.model_type_error) super().__init__(model_type, model_name, model, tokenizer) - device_number = 1 if torch.cuda.is_available() else -1 + device_number = 0 if torch.cuda.is_available() else -1 self._pipeline = FillMaskPipeline(model=model, tokenizer=tokenizer, device=device_number) self._trainer = WPTrainer(model, model_type, tokenizer, self._device, self.logger) From 5a57d4926042d0d22199b64cfbc76b892abcbd31 Mon Sep 17 00:00:00 2001 From: Will Macdonald Date: Mon, 11 Jan 2021 23:38:01 -0500 Subject: [PATCH 083/155] Addressing initial issues outlined in change requests for PR #177 --- README.md | 17 ++++++++++------- examples/word_prediction/readme_examples.py | 11 +---------- happytransformer/happy_word_prediction.py | 4 +++- tests/test_qa.py | 8 ++++---- tests/test_wp.py | 2 +- 5 files changed, 19 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index fc2e8f5c..185032fb 100644 --- a/README.md +++ b/README.md @@ -43,11 +43,11 @@ We're happy to announce that we won a Best Paper Award at the Canadian Undergrad | Question Answering | ✔ | ✔ | | Next Sentence Prediction | ✔ | | -| Public Methods | ALBERT | BERT |DISTILBERT | -|------------------------------------|--------------|------------|-----------| -| Word Prediction | ✔ | ✔ | ✔ | -| Text Classification | ✔ | ✔ | ✔ | -| Question Answering | ✔ | ✔ | ✔ | +| Public Methods | ALBERT | BERT |DISTILBERT |ROBERTA | +|------------------------------------|--------------|------------|-----------|----------| +| Word Prediction | ✔ | ✔ | ✔ |✔ +| Text Classification | ✔ | ✔ | ✔ |✔ +| Question Answering | ✔ | ✔ | ✔ |✔ | Next Sentence Prediction | ✔ | ✔ | ✔ | ## Installation @@ -82,6 +82,7 @@ We recommend using "HappyWordPrediction("ALBERT", "albert-xxlarge-v2")" for the happy_wp_distilbert = HappyWordPrediction() # default happy_wp_albert = HappyWordPrediction("ALBERT", "albert-base-v2") happy_wp_bert = HappyWordPrediction("BERT", "bert-base-uncased") + happy_wp_roberta = HappyWordPrediction("ROBERTA", "roberta-base") ``` @@ -167,9 +168,10 @@ number of labels, so if you use these models you can set num_labels 
freely
 ```python
     from happytransformer import HappyTextClassification
     # --------------------------------------#
-    happy_qa_distilbert = HappyTextClassification()  # default with "distilbert-base-uncased" and num_labels=2
+    happy_tc_distilbert = HappyTextClassification()  # default with "distilbert-base-uncased" and num_labels=2
     happy_tc_albert = HappyTextClassification(model_type="ALBERT", model_name="albert-base-v2")
-    happy_qa_bert = HappyTextClassification("BERT", "bert-base-uncased")
+    happy_tc_bert = HappyTextClassification("BERT", "bert-base-uncased")
+    happy_tc_roberta = HappyTextClassification("ROBERTA", "roberta-base")
 
 ```
 
@@ -353,6 +355,7 @@ We recommend using "HappyQuestionAnswering("ALBERT", "mfeb/albert-xxlarge-v2-squ
     happy_qa_albert = HappyQuestionAnswering("ALBERT", "mfeb/albert-xxlarge-v2-squad2")
     # good model when using with limited hardware
     happy_qa_bert = HappyQuestionAnswering("BERT", "mrm8488/bert-tiny-5-finetuned-squadv2")
+    happy_qa_roberta = HappyQuestionAnswering("ROBERTA", "deepset/roberta-base-squad2")
 
 ```
 
diff --git a/examples/word_prediction/readme_examples.py b/examples/word_prediction/readme_examples.py
index 58dbdb27..9ee62cba 100644
--- a/examples/word_prediction/readme_examples.py
+++ b/examples/word_prediction/readme_examples.py
@@ -36,20 +36,11 @@ def example_1_3():
     print(result[1].token_str)  # technology
 
 
-def example_1_4():
-    happy_wp = HappyWordPrediction("ROBERTA", "roberta-base")
-    result = happy_wp.predict_mask("To better the world I would invest in <mask> and education.", top_k=10)
-    print(result)  # [WordPredictionResult(token_str='science', score=0.308580607175827), WordPredictionResult(token_str='research', score=0.18018342554569244)]
-    print(result[1])  # WordPredictionResult(token_str='research', score=0.18018342554569244)
-    print(result[1].token_str)  # research
-
-
 def main():
-    # example_1_1()
+    example_1_1()
     # example_1_1()
     # example_1_2()
     # example_1_3()
-    example_1_4()
 
 
 if __name__ == "__main__":
diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py
index 437f373e..e9610974 100644
--- a/happytransformer/happy_word_prediction.py
+++ b/happytransformer/happy_word_prediction.py
@@ -38,7 +38,6 @@ def __init__(self, model_type="DISTILBERT",
         elif model_type == "ROBERTA":
             model = RobertaForMaskedLM.from_pretrained(model_name)
             tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
-
         else:
             raise ValueError(self.model_type_error)
         super().__init__(model_type, model_name, model, tokenizer)
@@ -59,6 +58,9 @@ def predict_mask(self, text, targets=None, top_k=1):
         if not isinstance(text, str):
             raise ValueError("the \"text\" argument must be a single string")
 
+        if self.model_type == "ROBERTA":
+            text = text.replace("[MASK]", "<mask>")
+
         result = self._pipeline(text, targets=targets, top_k=top_k)
 
         if self.model_type == "ALBERT":
diff --git a/tests/test_qa.py b/tests/test_qa.py
index 25979b41..95049c73 100644
--- a/tests/test_qa.py
+++ b/tests/test_qa.py
@@ -104,7 +104,7 @@ def test_qa_train_effectiveness_roberta():
     lowering the loss as determined by HappyQuestionAnswering.eval()
     """
 
-    happy_qa = HappyQuestionAnswering("ROBERTA", "roberta-base")
+    happy_qa = HappyQuestionAnswering("ROBERTA", "deepset/roberta-base-squad2")
     before_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss
     happy_qa.train("../data/qa/train-eval.csv")
    after_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss
@@ -112,8 +112,8 @@ def test_qa_train_effectiveness_roberta():
 
 
 def test_qa_test_roberta():
-    happy_qa =
HappyQuestionAnswering("ROBERTA", "roberta-base")
+    happy_qa = HappyQuestionAnswering("ROBERTA", "deepset/roberta-base-squad2")
     result = happy_qa.test("../data/qa/test.csv")
-    answer = [QuestionAnsweringResult(answer='is the', score=0.03888237848877907, start=13, end=19),
-              QuestionAnsweringResult(answer='date is', score=0.02540113404393196, start=4, end=11)]
+    answer = [QuestionAnsweringResult(answer='October 31st', score=0.9512737393379211, start=0, end=12),
+              QuestionAnsweringResult(answer='November 23rd', score=0.8634917736053467, start=12, end=25)]
     assert result == answer
diff --git a/tests/test_wp.py b/tests/test_wp.py
index 032d8d46..45b89ff1 100644
--- a/tests/test_wp.py
+++ b/tests/test_wp.py
@@ -55,7 +55,7 @@ def test_mwp_basic_bert():
 def test_mwp_basic_roberta():
     happy_mwp = HappyWordPrediction("ROBERTA", "roberta-base")
     result = happy_mwp.predict_mask(
-        "Please pass the salt and <mask>",  # Roberta requires mask to be '<mask>' as opposed to '[MASK]'
+        "Please pass the salt and [MASK]",
     )
     answer = [WordPredictionResult(token_str='pepper', score=0.7325230240821838)]
     assert result == answer

From 8edf07641ab18543d96a8d0ed288cfc087fcb062 Mon Sep 17 00:00:00 2001
From: Ted Brownlow
Date: Mon, 11 Jan 2021 23:40:18 -0500
Subject: [PATCH 084/155] grab the actual device

---
 happytransformer/happy_word_prediction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py
index 5e60809f..6f4f4c6d 100644
--- a/happytransformer/happy_word_prediction.py
+++ b/happytransformer/happy_word_prediction.py
@@ -39,7 +39,7 @@ def __init__(self, model_type="DISTILBERT",
         else:
             raise ValueError(self.model_type_error)
         super().__init__(model_type, model_name, model, tokenizer)
-        device_number = 0 if torch.cuda.is_available() else -1
+        device_number = torch.cuda.current_device() if torch.cuda.is_available() else -1
         self._pipeline = FillMaskPipeline(model=model, tokenizer=tokenizer, device=device_number)
 
         self._trainer = WPTrainer(model, model_type, tokenizer, self._device, self.logger)

From 2d1b0110849e3fbcabbb0fb6090cefac6d281ae5 Mon Sep 17 00:00:00 2001
From: Will Macdonald
Date: Mon, 11 Jan 2021 23:46:10 -0500
Subject: [PATCH 085/155] Fixing HappyTextClassification variable names within
 readme_examples.py

---
 examples/text_classification/readme_examples.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/text_classification/readme_examples.py b/examples/text_classification/readme_examples.py
index 3c68bb8f..4ef496ab 100644
--- a/examples/text_classification/readme_examples.py
+++ b/examples/text_classification/readme_examples.py
@@ -2,10 +2,10 @@
 
 
 def example_2_0():
-    happy_qa_distilbert = HappyTextClassification()  # default with "distilbert-base-uncased"
+    happy_tc_distilbert = HappyTextClassification()  # default with "distilbert-base-uncased"
     happy_tc_albert = HappyTextClassification(model_type="ALBERT", model_name="albert-base-v2")
-    happy_qa_bert = HappyTextClassification("BERT", "bert-base-uncased")
-    happy_qa_roberta = HappyTextClassification("ROBERTA", "roberta-base")
+    happy_tc_bert = HappyTextClassification("BERT", "bert-base-uncased")
+    happy_tc_roberta = HappyTextClassification("ROBERTA", "deepset/roberta-base-squad2")
 
 
 def example_2_1():

From 06d5a9614610782b0dbfc1cbedb2b7cf459711c4 Mon Sep 17 00:00:00 2001
From: Ted Brownlow
Date: Tue, 12 Jan 2021 00:17:11 -0500
Subject: [PATCH 086/155] use new method

---
 happytransformer/cuda_detect.py               | 4 ++++
happytransformer/happy_question_answering.py | 6 +++--- happytransformer/happy_text_classification.py | 3 ++- happytransformer/happy_word_prediction.py | 3 ++- 4 files changed, 11 insertions(+), 5 deletions(-) create mode 100644 happytransformer/cuda_detect.py diff --git a/happytransformer/cuda_detect.py b/happytransformer/cuda_detect.py new file mode 100644 index 00000000..5290fc84 --- /dev/null +++ b/happytransformer/cuda_detect.py @@ -0,0 +1,4 @@ +import torch + +def detect_cuda_device_number(): + return torch.cuda.current_device() if torch.cuda.is_available() else -1 \ No newline at end of file diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index 221048cc..8b5fc447 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -6,8 +6,7 @@ import torch from happytransformer.happy_transformer import HappyTransformer from happytransformer.qa.trainer import QATrainer -from happytransformer.qa.default_args \ - import ARGS_QA_TRAIN +from happytransformer.qa.default_args import ARGS_QA_TRAIN from transformers import ( BertForQuestionAnswering, BertTokenizerFast, @@ -17,6 +16,7 @@ AlbertTokenizerFast, QuestionAnsweringPipeline, ) +from happytransformer.cuda_detect import detect_cuda_device_number QuestionAnsweringResult = namedtuple("QuestionAnsweringResult", ["answer", "score", "start", "end"]) @@ -49,7 +49,7 @@ def __init__(self, model_type="DISTILBERT", raise ValueError(self.model_type_error) super().__init__(model_type, model_name, model, tokenizer) - device_number = 1 if torch.cuda.is_available() else -1 + device_number = detect_cuda_device_number() # from documentation " a positive will run the model on the associated CUDA device id." # todo: get device ID if torch.cuda.is_available() diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index 292f09e7..b0a66d36 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -15,6 +15,7 @@ TextClassificationPipeline ) from happytransformer.tc.trainer import TCTrainer +from happytransformer.cuda_detect import detect_cuda_device_number from happytransformer.happy_transformer import HappyTransformer from happytransformer.tc.default_args import ARGS_TC_TRAIN @@ -48,7 +49,7 @@ def __init__(self, model_type="DISTILBERT", super().__init__(model_type, model_name, model, tokenizer) - device_number = 1 if torch.cuda.is_available() else -1 + device_number = detect_cuda_device_number() # from documentation " a positive will run the model on the associated CUDA device id." 
# todo: get device ID if torch.cuda.is_available() diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index 6f4f4c6d..26bb5df3 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -12,6 +12,7 @@ from collections import namedtuple from happytransformer.happy_transformer import HappyTransformer from happytransformer.mwp.trainer import WPTrainer +from happytransformer.cuda_detect import detect_cuda_device_number WordPredictionResult = namedtuple("WordPredictionResult", ["token_str", "score"]) @@ -39,7 +40,7 @@ def __init__(self, model_type="DISTILBERT", else: raise ValueError(self.model_type_error) super().__init__(model_type, model_name, model, tokenizer) - device_number = torch.cuda.current_device() if torch.cuda.is_available() else -1 + device_number = detect_cuda_device_number() self._pipeline = FillMaskPipeline(model=model, tokenizer=tokenizer, device=device_number) self._trainer = WPTrainer(model, model_type, tokenizer, self._device, self.logger) From 72a6b421a150b88cf0a7b6b7303dc55e850fe5c9 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 00:22:25 -0500 Subject: [PATCH 087/155] remove TODO --- happytransformer/happy_question_answering.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index 8b5fc447..a97167ad 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -50,8 +50,6 @@ def __init__(self, model_type="DISTILBERT", super().__init__(model_type, model_name, model, tokenizer) device_number = detect_cuda_device_number() - # from documentation " a positive will run the model on the associated CUDA device id." 
- # todo: get device ID if torch.cuda.is_available() self._pipeline = QuestionAnsweringPipeline(model, tokenizer, device=device_number) From 9e1f4fa27723b005a54279905b9986bcb94ab952 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Tue, 12 Jan 2021 14:03:08 -0500 Subject: [PATCH 088/155] Updated table to remove NSP for bert and distilbert --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 527ee02a..39827af0 100644 --- a/README.md +++ b/README.md @@ -45,10 +45,10 @@ We're happy to announce that we won a Best Paper Award at the Canadian Undergrad | Public Methods | ALBERT | BERT |DISTILBERT |ROBERTA | |------------------------------------|--------------|------------|-----------|----------| -| Word Prediction | ✔ | ✔ | ✔ |✔ -| Text Classification | ✔ | ✔ | ✔ |✔ -| Question Answering | ✔ | ✔ | ✔ |✔ -| Next Sentence Prediction | ✔ | ✔ | ✔ | +| Word Prediction | ✔ | ✔ | ✔ |✔ | +| Text Classification | ✔ | ✔ | ✔ |✔ | +| Question Answering | ✔ | ✔ | ✔ |✔ | +| Next Sentence Prediction | ✔ | | | | ## Installation From 39fb9298314faaaeae7a0c2911d9de6a8d30509b Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 14:26:36 -0500 Subject: [PATCH 089/155] predict_mask() uses type hints --- happytransformer/happy_word_prediction.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index 1aed7ba2..29e417cb 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -14,6 +14,7 @@ from happytransformer.happy_transformer import HappyTransformer from happytransformer.mwp.trainer import WPTrainer from happytransformer.cuda_detect import detect_cuda_device_number +from typing import List WordPredictionResult = namedtuple("WordPredictionResult", ["token_str", "score"]) @@ -22,8 +23,8 @@ class HappyWordPrediction(HappyTransformer): """ A user facing class for text classification """ - def __init__(self, model_type="DISTILBERT", - model_name="distilbert-base-uncased"): + def __init__(self, model_type:str="DISTILBERT", + model_name:str="distilbert-base-uncased"): model = None tokenizer = None @@ -49,13 +50,12 @@ def __init__(self, model_type="DISTILBERT", self._trainer = WPTrainer(model, model_type, tokenizer, self._device, self.logger) - def predict_mask(self, text, targets=None, top_k=1): + def predict_mask(self, text:str, targets:List[str]=None, top_k:int=1) -> List[WordPredictionResult]: """ - :param text: A string that contains the model's mask token - :param targets: Optional. A list of strings of potential answers. - All other answers will be ignored - :param top_k: number of results. Default is 1 - :return: A named WordPredictionResult Named Tuple with the following keys: token_str and score + Predict [MASK] tokens in a string. + targets limit possible guesses if supplied. 
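+        (for example, targets=["pepper", "vinegar"] scores only those two
+        candidate words; this list is illustrative, not from the library's docs)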
+ top_k describes number of targets to return* + *top_k does not apply if targets is supplied """ if not isinstance(text, str): raise ValueError("the \"text\" argument must be a single string") From 3a80ac8e3c0f1646a3cd5e05c9b2c6eae4c5fcef Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 14:26:36 -0500 Subject: [PATCH 090/155] predict_next_sentence() type hints --- happytransformer/happy_next_sentence.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/happytransformer/happy_next_sentence.py b/happytransformer/happy_next_sentence.py index 094cccc4..b5325390 100644 --- a/happytransformer/happy_next_sentence.py +++ b/happytransformer/happy_next_sentence.py @@ -25,13 +25,10 @@ def __init__(self, model_type="BERT", self._pipeline = None self._trainer = None - def predict_next_sentence(self, sentence_a, sentence_b): + def predict_next_sentence(self, sentence_a:str, sentence_b:str)->float: """ - Determines if sentence B is likely to be a continuation after sentence - A. - :param sentence_a (string): First sentence - :param sentence_b (string): Second sentence to test if it comes after the first - :return (float): The probability that sentence_b follows sentence_a + Predict the probability that sentence_b follows sentence_a. + Higher probabilities indicate more coherent sentence pairs. """ encoded = self._tokenizer(sentence_a, sentence_b, return_tensors='pt') From d656aef879f3bcc434fdea11364e4990f50e81a9 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 14:26:36 -0500 Subject: [PATCH 091/155] added type hints to answer_question() --- happytransformer/happy_question_answering.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index 6a420cea..2b112401 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -20,6 +20,8 @@ ) from happytransformer.cuda_detect import detect_cuda_device_number +from typing import List + QuestionAnsweringResult = namedtuple("QuestionAnsweringResult", ["answer", "score", "start", "end"]) @@ -61,13 +63,14 @@ def __init__(self, model_type="DISTILBERT", self._trainer = QATrainer(model, model_type, tokenizer, self._device, self.logger) - def answer_question(self, context, question, top_k=1): + def answer_question( + self, + context:str, question:str, top_k:int=1 + )->List[QuestionAnsweringResult]: """ - :param context: background information to answer the question (string) - :param question: A question that can be answered with the given context (string) - :param top_k: how many results - :return: A list of a named tuples that contains the keys: answer, score, start and end - + Find the answers to a question. + The answer MUST be contained somewhere within the context for this to work. + top_k describes the number of answers to return. 
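+
+        A minimal illustrative call (the strings are invented for this docstring,
+        and happy_qa is an assumed, already-initialized object):
+            happy_qa.answer_question("October 31st is the date", "What is the date?", top_k=1)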
""" result = self._pipeline(context=context, question=question, topk=top_k) From c5c965511c339a75c772c7a692c22fce78d190a6 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 14:26:36 -0500 Subject: [PATCH 092/155] reduced branches and variables --- happytransformer/happy_question_answering.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index 2b112401..abb4cb0e 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -73,21 +73,19 @@ def answer_question( top_k describes the number of answers to return. """ - result = self._pipeline(context=context, question=question, topk=top_k) + pipeline_output = self._pipeline(context=context, question=question, topk=top_k) # transformers returns a single dictionary when top_k ==1. # Our convention however is to have constant output format - if top_k == 1: - result = [result] + answers = [pipeline_output] if top_k==1 else pipeline_output - results = [ + return [ QuestionAnsweringResult( answer=answer["answer"], score=answer["score"], start=answer["start"], end=answer["end"],) - for answer in result + for answer in answers ] - return results def train(self, input_filepath, args=ARGS_QA_TRAIN): """ From 54d8c23382a865ebea2226c2725ee07718370f2a Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 14:26:36 -0500 Subject: [PATCH 093/155] use dataclass --- happytransformer/happy_question_answering.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index abb4cb0e..9d5aa7b3 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -21,8 +21,14 @@ from happytransformer.cuda_detect import detect_cuda_device_number from typing import List - -QuestionAnsweringResult = namedtuple("QuestionAnsweringResult", ["answer", "score", "start", "end"]) +from dataclasses import dataclass + +@dataclass +class QuestionAnsweringResult: + answer:str + score:float + start:int + end:int class HappyQuestionAnswering(HappyTransformer): From 39ad0ff4a7e86015c6635b2ee52aefd42b83bad1 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 03:10:34 -0500 Subject: [PATCH 094/155] replaed namedtuples with dataclasses --- happytransformer/happy_question_answering.py | 1 - happytransformer/happy_text_classification.py | 13 +++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index 9d5aa7b3..ed3e9989 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -2,7 +2,6 @@ Contains the HappyQuestionAnswering class. 
""" -from collections import namedtuple import torch from happytransformer.happy_transformer import HappyTransformer from happytransformer.qa.trainer import QATrainer diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index c0398c46..adbfc670 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -1,7 +1,7 @@ """ Contains a class called HappyTextClassification that performs text classification """ -from collections import namedtuple +from dataclasses import dataclass import torch from transformers import ( @@ -22,8 +22,10 @@ from happytransformer.happy_transformer import HappyTransformer from happytransformer.tc.default_args import ARGS_TC_TRAIN -TextClassificationResult = namedtuple("TextClassificationResult", ["label", "score"]) - +@dataclass +class TextClassificationResult: + label:str + score:float class HappyTextClassification(HappyTransformer): """ @@ -62,10 +64,9 @@ def __init__(self, model_type="DISTILBERT", self._trainer = TCTrainer(self._model, self.model_type, self._tokenizer, self._device, self.logger) - def classify_text(self, text): + def classify_text(self, text:str) -> TextClassificationResult: """ - :param text: A text string to be classified - :return: A dictionary with keys: label and score, + Classify text to a label based on model's training """ # Blocking allowing a for a list of strings if not isinstance(text, str): From 9df543851cb557061772336e065f3f0abb5769e8 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 03:10:34 -0500 Subject: [PATCH 095/155] convert EvalResult to dataclass --- happytransformer/happy_trainer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/happytransformer/happy_trainer.py b/happytransformer/happy_trainer.py index 86c5bf85..f6645c31 100644 --- a/happytransformer/happy_trainer.py +++ b/happytransformer/happy_trainer.py @@ -1,12 +1,13 @@ """ Parent class for training classes, such as TCTrainer and QATrainer """ -from collections import namedtuple +from dataclasses import dataclass import tempfile from transformers import TrainingArguments, Trainer -# may eventually add more metrics like accuracy -EvalResult = namedtuple("EvalResult", ["eval_loss"]) +@dataclass +class EvalResult: + loss:float class HappyTrainer: def __init__(self, model, model_type, tokenizer, device, logger): From 4ced00d2f108fdb8d2b4c41fa524746441b1b305 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 03:10:34 -0500 Subject: [PATCH 096/155] WordPredictionResult to dataclass --- happytransformer/happy_word_prediction.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index 29e417cb..8e090857 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -10,14 +10,16 @@ FillMaskPipeline, ) import torch -from collections import namedtuple +from dataclasses import dataclass from happytransformer.happy_transformer import HappyTransformer from happytransformer.mwp.trainer import WPTrainer from happytransformer.cuda_detect import detect_cuda_device_number from typing import List -WordPredictionResult = namedtuple("WordPredictionResult", ["token_str", "score"]) - +@dataclass +class WordPredictionResult: + token:str + score:float class HappyWordPrediction(HappyTransformer): """ @@ -75,7 +77,7 @@ def predict_mask(self, text:str, targets:List[str]=None, 
top_k:int=1) -> List[Wo answer["token_str"] = answer["token_str"][1:] results = [ WordPredictionResult( - token_str=answer["token_str"], + token=answer["token_str"], score=answer["score"] ) for answer in result From 0daa99688d97e42393217d0f7e5cc4dd8dd7860b Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 03:10:34 -0500 Subject: [PATCH 097/155] improved naming --- happytransformer/happy_word_prediction.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index 8e090857..80e43697 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -65,25 +65,23 @@ def predict_mask(self, text:str, targets:List[str]=None, top_k:int=1) -> List[Wo if self.model_type == "ROBERTA": text = text.replace("[MASK]", "") - result = self._pipeline(text, targets=targets, top_k=top_k) + answers = self._pipeline(text, targets=targets, top_k=top_k) if self.model_type == "ALBERT": - for answer in result: + for answer in answers: if answer["token_str"][0] == "▁": answer["token_str"] = answer["token_str"][1:] elif self.model_type == "ROBERTA": - for answer in result: + for answer in answers: if answer["token_str"][0] == "Ġ": answer["token_str"] = answer["token_str"][1:] - results = [ + return [ WordPredictionResult( token=answer["token_str"], score=answer["score"] ) - for answer in result + for answer in answers ] - - return results def train(self, input_filepath, args): raise NotImplementedError("train() is currently not available") From 6a10e6dd6dc283c0bf358ac72635a900811b63d3 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 03:10:34 -0500 Subject: [PATCH 098/155] adjusted whitespace --- happytransformer/happy_word_prediction.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index 80e43697..fe57b1cf 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -52,7 +52,9 @@ def __init__(self, model_type:str="DISTILBERT", self._trainer = WPTrainer(model, model_type, tokenizer, self._device, self.logger) - def predict_mask(self, text:str, targets:List[str]=None, top_k:int=1) -> List[WordPredictionResult]: + def predict_mask(self, + text:str, targets:List[str]=None, top_k:int=1 + ) -> List[WordPredictionResult]: """ Predict [MASK] tokens in a string. targets limit possible guesses if supplied. 
From 3de70c9dc9eaefbb96dea0902abd583858449119 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 03:10:34 -0500 Subject: [PATCH 099/155] fixed up naming --- tests/test_wp.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_wp.py b/tests/test_wp.py index 45b89ff1..0a4f56e1 100644 --- a/tests/test_wp.py +++ b/tests/test_wp.py @@ -7,7 +7,7 @@ def test_mwp_basic(): result = happy_mwp.predict_mask( "Please pass the salt and [MASK]", ) - answer = [WordPredictionResult(token_str="pepper", score=0.2664579749107361)] + answer = [WordPredictionResult(token="pepper", score=0.2664579749107361)] assert result == answer @@ -17,8 +17,8 @@ def test_mwp_top_k(): "Please pass the salt and [MASK]", top_k=2 ) - answer = [WordPredictionResult(token_str='pepper', score=0.2664579749107361), - WordPredictionResult(token_str='vinegar', score=0.08760260790586472)] + answer = [WordPredictionResult(token='pepper', score=0.2664579749107361), + WordPredictionResult(token='vinegar', score=0.08760260790586472)] assert result == answer @@ -29,8 +29,8 @@ def test_mwp_targets(): "Please pass the salt and [MASK]", targets=["water", "spices"] ) - answer = [WordPredictionResult(token_str='water', score=0.014856964349746704), - WordPredictionResult(token_str='spices', score=0.009040987119078636)] + answer = [WordPredictionResult(token='water', score=0.014856964349746704), + WordPredictionResult(token='spices', score=0.009040987119078636)] assert result == answer @@ -39,7 +39,7 @@ def test_mwp_basic_albert(): result = happy_mwp.predict_mask( "Please pass the salt and [MASK]", ) - answer = [WordPredictionResult(token_str='garlic', score=0.036625903099775314)] + answer = [WordPredictionResult(token='garlic', score=0.036625903099775314)] assert result == answer From d5429df207b7f745b14649662b15b570a641e6a3 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 03:10:34 -0500 Subject: [PATCH 100/155] rename eval loss to loss --- tests/test_qa.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_qa.py b/tests/test_qa.py index 95049c73..a3d18379 100644 --- a/tests/test_qa.py +++ b/tests/test_qa.py @@ -29,7 +29,7 @@ def test_qa_train(): def test_qa_eval(): happy_qa = HappyQuestionAnswering() result = happy_qa.eval("../data/qa/train-eval.csv") - assert result.eval_loss == 0.11738169193267822 + assert result.loss == 0.11738169193267822 def test_qa_test(): @@ -47,9 +47,9 @@ def test_qa_train_effectiveness(): """ happy_qa = HappyQuestionAnswering() - before_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss + before_loss = happy_qa.eval("../data/qa/train-eval.csv").loss happy_qa.train("../data/qa/train-eval.csv") - after_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss + after_loss = happy_qa.eval("../data/qa/train-eval.csv").loss assert after_loss < before_loss @@ -61,9 +61,9 @@ def test_qa_train_effectiveness_albert(): """ happy_qa = HappyQuestionAnswering("ALBERT", "twmkn9/albert-base-v2-squad2") - before_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss + before_loss = happy_qa.eval("../data/qa/train-eval.csv").loss happy_qa.train("../data/qa/train-eval.csv") - after_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss + after_loss = happy_qa.eval("../data/qa/train-eval.csv").loss assert after_loss < before_loss @@ -83,9 +83,9 @@ def test_qa_train_effectiveness_bert(): """ happy_qa = HappyQuestionAnswering("BERT", "mrm8488/bert-tiny-5-finetuned-squadv2") - before_loss = 
happy_qa.eval("../data/qa/train-eval.csv").eval_loss + before_loss = happy_qa.eval("../data/qa/train-eval.csv").loss happy_qa.train("../data/qa/train-eval.csv") - after_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss + after_loss = happy_qa.eval("../data/qa/train-eval.csv").loss assert after_loss < before_loss From 07d5e7b6641c0ef7bc9fba83f7f03e0b0a52fe06 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 03:10:34 -0500 Subject: [PATCH 101/155] fixed up eval_loss --- happytransformer/qa/trainer.py | 2 +- happytransformer/tc/trainer.py | 2 +- tests/test_tc.py | 26 +++++++++++++------------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/happytransformer/qa/trainer.py b/happytransformer/qa/trainer.py index 32b8ce5c..887f5550 100644 --- a/happytransformer/qa/trainer.py +++ b/happytransformer/qa/trainer.py @@ -44,7 +44,7 @@ def eval(self, input_filepath): self.__add_token_positions(encodings, answers) eval_dataset = QuestionAnsweringDataset(encodings) result = self._run_eval(eval_dataset) - return EvalResult(eval_loss=result["eval_loss"]) + return EvalResult(loss=result["eval_loss"]) def test(self, input_filepath, solve): diff --git a/happytransformer/tc/trainer.py b/happytransformer/tc/trainer.py index 3e10575b..aeec24d7 100644 --- a/happytransformer/tc/trainer.py +++ b/happytransformer/tc/trainer.py @@ -31,7 +31,7 @@ def eval(self, input_filepath): eval_dataset = TextClassificationDataset(eval_encodings, labels) result = self._run_eval(eval_dataset) - return EvalResult(eval_loss=result["eval_loss"]) + return EvalResult(loss=result["eval_loss"]) def test(self, input_filepath, solve): """ diff --git a/tests/test_tc.py b/tests/test_tc.py index e00de2bc..e851c179 100644 --- a/tests/test_tc.py +++ b/tests/test_tc.py @@ -36,7 +36,7 @@ def test_qa_eval(): happy_tc = HappyTextClassification(model_type="DISTILBERT", model_name="distilbert-base-uncased-finetuned-sst-2-english") results = happy_tc.eval("../data/tc/train-eval.csv") - assert results.eval_loss == 0.007262040860950947 + assert results.loss == 0.007262040860950947 def test_qa_test(): @@ -64,9 +64,9 @@ def test_qa_train_effectiveness(): happy_tc = HappyTextClassification(model_type="DISTILBERT", model_name="distilbert-base-uncased-finetuned-sst-2-english") - before_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss + before_loss = happy_tc.eval("../data/tc/train-eval.csv").loss happy_tc.train("../data/tc/train-eval.csv") - after_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss + after_loss = happy_tc.eval("../data/tc/train-eval.csv").loss assert after_loss < before_loss @@ -80,9 +80,9 @@ def test_qa_train_effectiveness_multi(): happy_tc = HappyTextClassification(model_type="DISTILBERT", model_name="distilbert-base-uncased", num_labels=3) - before_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").eval_loss + before_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss happy_tc.train("../data/tc/train-eval-multi.csv") - after_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").eval_loss + after_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss assert after_loss < before_loss @@ -115,9 +115,9 @@ def test_qa_effectiveness_multi_albert(): happy_tc = HappyTextClassification(model_type="ALBERT", model_name="albert-base-v2", num_labels=3) - before_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").eval_loss + before_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss happy_tc.train("../data/tc/train-eval-multi.csv") - after_loss = 
happy_tc.eval("../data/tc/train-eval-multi.csv").eval_loss + after_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss assert after_loss < before_loss def test_qa_effectiveness_multi_bert(): @@ -129,9 +129,9 @@ def test_qa_effectiveness_multi_bert(): happy_tc = HappyTextClassification(model_type="BERT", model_name="bert-base-uncased", num_labels=3) - before_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").eval_loss + before_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss happy_tc.train("../data/tc/train-eval-multi.csv") - after_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").eval_loss + after_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss assert after_loss < before_loss @@ -158,9 +158,9 @@ def test_qa_train_effectiveness_albert(): """ happy_tc = HappyTextClassification(model_type="ALBERT", model_name="textattack/albert-base-v2-SST-2") - before_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss + before_loss = happy_tc.eval("../data/tc/train-eval.csv").loss happy_tc.train("../data/tc/train-eval.csv") - after_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss + after_loss = happy_tc.eval("../data/tc/train-eval.csv").loss assert after_loss < before_loss @@ -187,9 +187,9 @@ def test_qa_train_effectiveness_bert(): """ happy_tc = HappyTextClassification(model_type="BERT", model_name="textattack/bert-base-uncased-SST-2") - before_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss + before_loss = happy_tc.eval("../data/tc/train-eval.csv").loss happy_tc.train("../data/tc/train-eval.csv") - after_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss + after_loss = happy_tc.eval("../data/tc/train-eval.csv").loss assert after_loss < before_loss From c0927a3319668c22a0dec0e059d025e99ca9ab99 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 03:10:34 -0500 Subject: [PATCH 102/155] reduce to comprehension --- happytransformer/tc/trainer.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/happytransformer/tc/trainer.py b/happytransformer/tc/trainer.py index aeec24d7..6d0b966c 100644 --- a/happytransformer/tc/trainer.py +++ b/happytransformer/tc/trainer.py @@ -40,13 +40,10 @@ def test(self, input_filepath, solve): """ contexts = self._get_data(input_filepath, test_data=True) - results = list() - - for context in tqdm(contexts): - result = solve(context) - results.append(result) - - return results + return [ + solve(context) + for context in tqdm(contexts) + ] @staticmethod def _get_data(filepath, test_data=False): From 3223f2aee939c968474e24133fe6135d0ac260da Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 03:15:52 -0500 Subject: [PATCH 103/155] replace loop with comprehension --- happytransformer/qa/trainer.py | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/happytransformer/qa/trainer.py b/happytransformer/qa/trainer.py index 887f5550..1426924b 100644 --- a/happytransformer/qa/trainer.py +++ b/happytransformer/qa/trainer.py @@ -54,18 +54,11 @@ def test(self, input_filepath, solve): """ contexts, questions = self._get_data(input_filepath, test_data=True) - results = list() - - for case in tqdm(zip(contexts, questions)): - context = case[0] - question = case[1] - result = solve(context, question)[0] # only care about first result - - results.append(result) - - return results - - + return [ + solve(context,question)[0] + for context,question in + tqdm(zip(contexts, questions)) + ] @staticmethod def _get_data(filepath, 
test_data=False): From dc12a95af88d7bc28876951a23b83778d0eefd6f Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 03:24:10 -0500 Subject: [PATCH 104/155] remove obsolete comment --- happytransformer/happy_text_classification.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index adbfc670..6daa3823 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -57,12 +57,15 @@ def __init__(self, model_type="DISTILBERT", super().__init__(model_type, model_name, model, tokenizer) device_number = detect_cuda_device_number() - # from documentation " a positive will run the model on the associated CUDA device id." - # todo: get device ID if torch.cuda.is_available() - - self._pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer, device=device_number) - - self._trainer = TCTrainer(self._model, self.model_type, self._tokenizer, self._device, self.logger) + self._pipeline = TextClassificationPipeline( + model=model, tokenizer=tokenizer, + device=device_number + ) + + self._trainer = TCTrainer( + self._model, self.model_type, + self._tokenizer, self._device, self.logger + ) def classify_text(self, text:str) -> TextClassificationResult: """ From dd5379b06fcd57eff7ca2fd1b680eedf7ed2b7d3 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 03:40:13 -0500 Subject: [PATCH 105/155] made tests less brittle --- tests/test_qa.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/test_qa.py b/tests/test_qa.py index a3d18379..f66cd7ae 100644 --- a/tests/test_qa.py +++ b/tests/test_qa.py @@ -3,22 +3,23 @@ """ from happytransformer.happy_question_answering import HappyQuestionAnswering, QuestionAnsweringResult - +from pytest import approx def test_qa_answer_question(): happy_qa = HappyQuestionAnswering() - result = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?") - answer = [QuestionAnsweringResult(answer='January 8th 2021', score=0.9696964621543884, start=16, end=32)] - assert result == answer - + answers = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?") + top_answer = answers[0] + assert top_answer.answer == 'January 8th 2021' + assert top_answer.start == 16 + assert top_answer.end == 32 def test_qa_answer_question_top_k(): happy_qa = HappyQuestionAnswering() - result = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?", top_k=3) - answer = [QuestionAnsweringResult(answer='January 8th 2021', score=0.9696964621543884, start=16, end=32), - QuestionAnsweringResult(answer='January 8th', score=0.02050216868519783, start=16, end=27), - QuestionAnsweringResult(answer='January', score=0.005092293489724398, start=16, end=23)] - assert result == answer + answers = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?", top_k=3) + + assert sum(answer.score for answer in answers) == approx(1,0.01) + assert answers[0].start==16 and answers[0].end==32 and answers[0].answer=='January 8th 2021' + assert answers[1].start==16 and answers[1].end==27 and answers[1].answer=='January 8th' def test_qa_train(): From f67785c408edede33001a0679561032d2ea1fa9e Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 03:45:28 -0500 Subject: [PATCH 106/155] removed repetitive tests --- tests/test_qa.py | 76 
+++--------------------------------------------- 1 file changed, 4 insertions(+), 72 deletions(-) diff --git a/tests/test_qa.py b/tests/test_qa.py index f66cd7ae..acf55cfd 100644 --- a/tests/test_qa.py +++ b/tests/test_qa.py @@ -30,16 +30,13 @@ def test_qa_train(): def test_qa_eval(): happy_qa = HappyQuestionAnswering() result = happy_qa.eval("../data/qa/train-eval.csv") - assert result.loss == 0.11738169193267822 - + assert result.loss == approx(0.11738169193267822,0.001) def test_qa_test(): happy_qa = HappyQuestionAnswering() - result = happy_qa.test("../data/qa/test.csv") - answer = [QuestionAnsweringResult(answer='October 31st', score=0.9939756989479065, start=0, end=12), - QuestionAnsweringResult(answer='November 23rd', score=0.967872679233551, start=12, end=25)] - assert result == answer - + results = happy_qa.test("../data/qa/test.csv") + assert results[0].answer == 'October 31st' + assert results[1].answer == 'November 23rd' def test_qa_train_effectiveness(): """ @@ -53,68 +50,3 @@ def test_qa_train_effectiveness(): after_loss = happy_qa.eval("../data/qa/train-eval.csv").loss assert after_loss < before_loss - - -def test_qa_train_effectiveness_albert(): - """ - Ensures that HappyQuestionAnswering.train() results in - lowering the loss as determined by HappyQuestionAnswering.eval() - """ - - happy_qa = HappyQuestionAnswering("ALBERT", "twmkn9/albert-base-v2-squad2") - before_loss = happy_qa.eval("../data/qa/train-eval.csv").loss - happy_qa.train("../data/qa/train-eval.csv") - after_loss = happy_qa.eval("../data/qa/train-eval.csv").loss - - assert after_loss < before_loss - - -def test_qa_test_albert(): - happy_qa = HappyQuestionAnswering("ALBERT", "twmkn9/albert-base-v2-squad2") - result = happy_qa.test("../data/qa/test.csv") - answer = [QuestionAnsweringResult(answer='October 31st', score=0.988578736782074, start=0, end=12), - QuestionAnsweringResult(answer='November 23rd', score=0.9833534359931946, start=12, end=25)] - assert result == answer - - -def test_qa_train_effectiveness_bert(): - """ - Ensures that HappyQuestionAnswering.train() results in - lowering the loss as determined by HappyQuestionAnswering.eval() - """ - - happy_qa = HappyQuestionAnswering("BERT", "mrm8488/bert-tiny-5-finetuned-squadv2") - before_loss = happy_qa.eval("../data/qa/train-eval.csv").loss - happy_qa.train("../data/qa/train-eval.csv") - after_loss = happy_qa.eval("../data/qa/train-eval.csv").loss - - assert after_loss < before_loss - - -def test_qa_test_bert(): - happy_qa = HappyQuestionAnswering("BERT", "mrm8488/bert-tiny-5-finetuned-squadv2") - result = happy_qa.test("../data/qa/test.csv") - answer = [QuestionAnsweringResult(answer='October 31st', score=0.9352769255638123, start=0, end=12), - QuestionAnsweringResult(answer='November 23rd', score=0.9180678129196167, start=12, end=25)] - assert result == answer - - -def test_qa_train_effectiveness_roberta(): - """ - Ensures that HappyQuestionAnswering.train() results in - lowering the loss as determined by HappyQuestionAnswering.eval() - """ - - happy_qa = HappyQuestionAnswering("ROBERTA", "deepset/roberta-base-squad2") - before_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss - happy_qa.train("../data/qa/train-eval.csv") - after_loss = happy_qa.eval("../data/qa/train-eval.csv").eval_loss - assert after_loss < before_loss - - -def test_qa_test_roberta(): - happy_qa = HappyQuestionAnswering("ROBERTA", "deepset/roberta-base-squad2") - result = happy_qa.test("../data/qa/test.csv") - answer = [QuestionAnsweringResult(answer='October 31st', 
score=0.9512737393379211, start=0, end=12), - QuestionAnsweringResult(answer='November 23rd', score=0.8634917736053467, start=12, end=25)] - assert result == answer From 9172029d692355175537e9d36e6ca5d14041b59a Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 03:59:56 -0500 Subject: [PATCH 107/155] make tests less brittle --- tests/test_tc.py | 57 ++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/tests/test_tc.py b/tests/test_tc.py index e851c179..aed304b9 100644 --- a/tests/test_tc.py +++ b/tests/test_tc.py @@ -3,41 +3,33 @@ """ from happytransformer.happy_text_classification import HappyTextClassification, TextClassificationResult +from pytest import approx def test_classify_text(): - """ - Tests - HappyQuestionAnswering.classify_text() - - """ happy_tc = HappyTextClassification(model_type="DISTILBERT", model_name="distilbert-base-uncased-finetuned-sst-2-english") result = happy_tc.classify_text("What a great movie") - answer = TextClassificationResult(label='LABEL_1', score=0.9998726844787598) - assert result == answer + assert result.label == 'LABEL_1' + assert result.score > 0.9 -def test_qa_train(): - """ - Tests - HappyQuestionAnswering.train() - - """ - happy_tc = HappyTextClassification(model_type="DISTILBERT", - model_name="distilbert-base-uncased-finetuned-sst-2-english") - +def test_tc_train(): + happy_tc = HappyTextClassification( + model_type="DISTILBERT", + model_name="distilbert-base-uncased-finetuned-sst-2-english" + ) happy_tc.train("../data/tc/train-eval.csv") - def test_qa_eval(): """ Tests HappyQuestionAnswering.eval() """ - happy_tc = HappyTextClassification(model_type="DISTILBERT", - model_name="distilbert-base-uncased-finetuned-sst-2-english") + happy_tc = HappyTextClassification( + model_type="DISTILBERT", + model_name="distilbert-base-uncased-finetuned-sst-2-english" + ) results = happy_tc.eval("../data/tc/train-eval.csv") - assert results.loss == 0.007262040860950947 - + assert results.loss == approx(0.007262040860950947,0.01) def test_qa_test(): """ @@ -69,8 +61,6 @@ def test_qa_train_effectiveness(): after_loss = happy_tc.eval("../data/tc/train-eval.csv").loss assert after_loss < before_loss - - def test_qa_train_effectiveness_multi(): """ Tests @@ -78,8 +68,11 @@ def test_qa_train_effectiveness_multi(): lowering the loss as determined by HappyQuestionAnswering.eval() """ - happy_tc = HappyTextClassification(model_type="DISTILBERT", - model_name="distilbert-base-uncased", num_labels=3) + happy_tc = HappyTextClassification( + model_type="DISTILBERT", + model_name="distilbert-base-uncased", + num_labels=3 + ) before_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss happy_tc.train("../data/tc/train-eval-multi.csv") after_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss @@ -93,16 +86,18 @@ def test_qa_test_multi_distil_bert(): lowering the loss as determined by HappyQuestionAnswering.eval() """ + + happy_tc = HappyTextClassification(model_type="DISTILBERT", model_name="distilbert-base-uncased", num_labels=3) happy_tc.train("../data/tc/train-eval-multi.csv") result = happy_tc.test("../data/tc/test-multi.csv") - answer = [TextClassificationResult(label='LABEL_2', score=0.3558128774166107), - TextClassificationResult(label='LABEL_2', score=0.34425610303878784), - TextClassificationResult(label='LABEL_1', score=0.3998771607875824), - TextClassificationResult(label='LABEL_1', score=0.38578158617019653), - TextClassificationResult(label='LABEL_0', 
score=0.39120176434516907),
-              TextClassificationResult(label='LABEL_0', score=0.3762877583503723)]
+    answer = [TextClassificationResult(label='LABEL_2', score=approx(0.3558128774166107,0.01)),
+              TextClassificationResult(label='LABEL_2', score=approx(0.34425610303878784,0.01)),
+              TextClassificationResult(label='LABEL_1', score=approx(0.3998771607875824,0.01)),
+              TextClassificationResult(label='LABEL_1', score=approx(0.38578158617019653,0.01)),
+              TextClassificationResult(label='LABEL_0', score=approx(0.39120176434516907,0.01)),
+              TextClassificationResult(label='LABEL_0', score=approx(0.3762877583503723,0.01))]
     assert result == answer

From 66cea139bd8f260ac0c1b1c824b2644654bca427 Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Tue, 12 Jan 2021 14:31:11 -0500
Subject: [PATCH 108/155] Added news about 2.0.0

---
 README.md | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 39827af0..a033da15 100644
--- a/README.md
+++ b/README.md
@@ -19,16 +19,33 @@ Happy Transformer is an package built on top of [Hugging Face's transformer libr
 
 ## News: 
 
-### January x, 2021
-Introducing Version 2.0.0!
-...
+### January 12, 2021
+**Introducing Version 2.0.0!**
+
+We fully redesigned Happy Transformer from the ground up.
+
+New Features: 
+- Question answering training 
+- Multi label text classification training
+- Single predictions for text classification 
+
+Deprecated Features: 
+- Masked word prediction training 
+- Masked word prediction with multiple masks 
+
+Breaking changes: 
+- Everything 
+
+Happy Transformer has been redesigned to promote scalability. 
+Now it's easier than ever to add new models and features, and we encourage you 
+to create PRs to contribute to the project. 
+
 ### November 23rd, 2020
 Last month, Happy Transformer was presented at a conference called C-Search, and the presentation won the Best Presentation Award. C-Search is the Queen's University Student Research Conference and had Turing Award Winner Professor Bengio as the Keynote Speaker this year. The video for the presentation can be found [here](https://www.youtube.com/watch?v=nNdFkq-y8Ng&t=12s). 
 
-
 ### June 9th, 2020
 We're happy to announce that we won a Best Paper Award at the Canadian Undergraduate Conference for AI. We also received the highest score overall. The paper can be found [here](https://qmind.ca/wp-content/uploads/2020/05/Proceedings-of-CUCAI-2020.pdf) on page 67. 
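The question answering training announced above is the flow the test suite in this series exercises. A condensed sketch of the 2.0.0 train/eval pattern — assuming the train-eval.csv fixture that ships with the test data — is:

```python
from happytransformer import HappyQuestionAnswering

# Sketch of the 2.0.0 train/eval loop, mirroring the test suite: eval()
# returns an EvalResult dataclass whose "loss" field should shrink once
# train() has run over the same data.
happy_qa = HappyQuestionAnswering("DISTILBERT", "distilbert-base-cased-distilled-squad")

before_loss = happy_qa.eval("../data/qa/train-eval.csv").loss
happy_qa.train("../data/qa/train-eval.csv")
after_loss = happy_qa.eval("../data/qa/train-eval.csv").loss

assert after_loss < before_loss  # the model learned
```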
From 4afedc5a1a96a9034b5f136d9ab345cd3b06a700 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 14:33:24 -0500 Subject: [PATCH 109/155] removed redundant test --- tests/test_qa.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/test_qa.py b/tests/test_qa.py index acf55cfd..9f0dab23 100644 --- a/tests/test_qa.py +++ b/tests/test_qa.py @@ -5,14 +5,6 @@ from happytransformer.happy_question_answering import HappyQuestionAnswering, QuestionAnsweringResult from pytest import approx -def test_qa_answer_question(): - happy_qa = HappyQuestionAnswering() - answers = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?") - top_answer = answers[0] - assert top_answer.answer == 'January 8th 2021' - assert top_answer.start == 16 - assert top_answer.end == 32 - def test_qa_answer_question_top_k(): happy_qa = HappyQuestionAnswering() answers = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?", top_k=3) @@ -28,7 +20,10 @@ def test_qa_train(): def test_qa_eval(): - happy_qa = HappyQuestionAnswering() + happy_qa = HappyQuestionAnswering( + model_type='DISTILBERT', + model_name='distilbert-base-cased-distilled-squad' + ) result = happy_qa.eval("../data/qa/train-eval.csv") assert result.loss == approx(0.11738169193267822,0.001) From af0d787ce949b463811063d34cd4d4050d50516b Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Tue, 12 Jan 2021 14:42:59 -0500 Subject: [PATCH 110/155] formatting --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a033da15..41b08034 100644 --- a/README.md +++ b/README.md @@ -43,11 +43,14 @@ to create PRs to contribute to the project. ### November 23rd, 2020 -Last month, Happy Transformer was presented at a conference called C-Search, and the presentation won the Best Presentation Award. C-Search is the Queen's University Student Research Conference and had Turing Award Winner Professor Bengio as the Keynote Speaker this year. The video for the presentation can be found [here](https://www.youtube.com/watch?v=nNdFkq-y8Ng&t=12s). +Last month, Happy Transformer was presented at a conference called C-Search, and the presentation won the Best Presentation Award. +C-Search is the Queen's University Student Research Conference and had Turing Award Winner Professor Bengio as the Keynote Speaker this year. +The video for the presentation can be found [here](https://www.youtube.com/watch?v=nNdFkq-y8Ng&t=12s). ### June 9th, 2020 -We're happy to announce that we won a Best Paper Award at the Canadian Undergraduate Conference for AI. We also received the highest score overall. The paper can be found [here](https://qmind.ca/wp-content/uploads/2020/05/Proceedings-of-CUCAI-2020.pdf) on page 67. +We're happy to announce that we won a Best Paper Award at the Canadian Undergraduate Conference for AI. +We also received the highest score overall. The paper can be found [here](https://qmind.ca/wp-content/uploads/2020/05/Proceedings-of-CUCAI-2020.pdf) on page 67. 
From 5b8b3f6626dd9a929e8b9b01f59349b668b557c9 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 14:52:57 -0500 Subject: [PATCH 111/155] qa tests pass consistently --- tests/test_qa.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/test_qa.py b/tests/test_qa.py index 9f0dab23..18d5cde5 100644 --- a/tests/test_qa.py +++ b/tests/test_qa.py @@ -5,19 +5,18 @@ from happytransformer.happy_question_answering import HappyQuestionAnswering, QuestionAnsweringResult from pytest import approx -def test_qa_answer_question_top_k(): - happy_qa = HappyQuestionAnswering() - answers = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?", top_k=3) - - assert sum(answer.score for answer in answers) == approx(1,0.01) - assert answers[0].start==16 and answers[0].end==32 and answers[0].answer=='January 8th 2021' - assert answers[1].start==16 and answers[1].end==27 and answers[1].answer=='January 8th' - - -def test_qa_train(): - happy_qa = HappyQuestionAnswering() - happy_qa.train("../data/qa/train-eval.csv") - +def test_qa_answer_question(): + MODELS = [ + ('ALBERT','twmkn9/albert-base-v2-squad2'), + ('ROBERTA','deepset/roberta-base-squad2'), + ('BERT','mrm8488/bert-tiny-5-finetuned-squadv2') + ] + for model_type,model_name in MODELS: + happy_qa = HappyQuestionAnswering(model_name=model_name, model_type=model_type) + answers = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?", top_k=3) + + assert sum(answer.score for answer in answers) == approx(1,0.1) + assert all('January 8th' in answer.answer for answer in answers) def test_qa_eval(): happy_qa = HappyQuestionAnswering( @@ -39,7 +38,8 @@ def test_qa_train_effectiveness(): lowering the loss as determined by HappyQuestionAnswering.eval() """ - happy_qa = HappyQuestionAnswering() + # use a non-fine-tuned model so we DEFINITELY get an improvement + happy_qa = HappyQuestionAnswering('BERT','bert-base-cased') before_loss = happy_qa.eval("../data/qa/train-eval.csv").loss happy_qa.train("../data/qa/train-eval.csv") after_loss = happy_qa.eval("../data/qa/train-eval.csv").loss From b60762d76f1768921cff6f37d4f3354a17e69d1a Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 14:56:20 -0500 Subject: [PATCH 112/155] test multiple models --- tests/test_tc.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/test_tc.py b/tests/test_tc.py index aed304b9..39830f99 100644 --- a/tests/test_tc.py +++ b/tests/test_tc.py @@ -6,10 +6,15 @@ from pytest import approx def test_classify_text(): - happy_tc = HappyTextClassification(model_type="DISTILBERT", model_name="distilbert-base-uncased-finetuned-sst-2-english") - result = happy_tc.classify_text("What a great movie") - assert result.label == 'LABEL_1' - assert result.score > 0.9 + MODELS = [ + ('DISTILBERT','distilbert-base-uncased-finetuned-sst-2-english'), + ("ALBERT","textattack/albert-base-v2-SST-2") + ] + for model_type,model_name in MODELS: + happy_tc = HappyTextClassification(model_type=model_type, model_name=model_name) + result = happy_tc.classify_text("What a great movie") + assert result.label == 'LABEL_1' + assert result.score > 0.9 def test_tc_train(): From b056a87cec12dec461fa3fe3bb73e01fd48ab2da Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 15:04:16 -0500 Subject: [PATCH 113/155] removed misleading docstrings and copy paste tests --- tests/test_tc.py | 196 +++++------------------------------------------ 1 file changed, 
19 insertions(+), 177 deletions(-) diff --git a/tests/test_tc.py b/tests/test_tc.py index 39830f99..413827fd 100644 --- a/tests/test_tc.py +++ b/tests/test_tc.py @@ -16,63 +16,43 @@ def test_classify_text(): assert result.label == 'LABEL_1' assert result.score > 0.9 - -def test_tc_train(): +def test_tc_eval(): happy_tc = HappyTextClassification( model_type="DISTILBERT", model_name="distilbert-base-uncased-finetuned-sst-2-english" ) - happy_tc.train("../data/tc/train-eval.csv") + results = happy_tc.eval("../data/tc/train-eval.csv") + assert results.loss == approx(0.007262040860950947,0.01) -def test_qa_eval(): - """ - Tests - HappyQuestionAnswering.eval() - """ +def test_tc_test(): happy_tc = HappyTextClassification( model_type="DISTILBERT", model_name="distilbert-base-uncased-finetuned-sst-2-english" ) - results = happy_tc.eval("../data/tc/train-eval.csv") - assert results.loss == approx(0.007262040860950947,0.01) - -def test_qa_test(): - """ - Tests - HappyQuestionAnswering.test() - """ - happy_tc = HappyTextClassification(model_type="DISTILBERT", - model_name="distilbert-base-uncased-finetuned-sst-2-english") result = happy_tc.test("../data/tc/test.csv") - answer = [TextClassificationResult(label='LABEL_1', score=0.9998401999473572), - TextClassificationResult(label='LABEL_0', score=0.9772131443023682), - TextClassificationResult(label='LABEL_0', score=0.9966067671775818), - TextClassificationResult(label='LABEL_1', score=0.9792295098304749)] + answer = [ + TextClassificationResult(label='LABEL_1', score=0.9998401999473572), + TextClassificationResult(label='LABEL_0', score=0.9772131443023682), + TextClassificationResult(label='LABEL_0', score=0.9966067671775818), + TextClassificationResult(label='LABEL_1', score=0.9792295098304749) + ] assert result == answer -def test_qa_train_effectiveness(): - """ - Tests - Ensures that HappyQuestionAnswering.train() results in - lowering the loss as determined by HappyQuestionAnswering.eval() - """ - - happy_tc = HappyTextClassification(model_type="DISTILBERT", - model_name="distilbert-base-uncased-finetuned-sst-2-english") +def test_tc_train_effectiveness(): + """assert that training decreases the loss""" + happy_tc = HappyTextClassification( + model_type="DISTILBERT", + model_name="distilbert-base-uncased" + ) before_loss = happy_tc.eval("../data/tc/train-eval.csv").loss happy_tc.train("../data/tc/train-eval.csv") after_loss = happy_tc.eval("../data/tc/train-eval.csv").loss assert after_loss < before_loss -def test_qa_train_effectiveness_multi(): - """ - Tests - Ensures that HappyQuestionAnswering.train() results in - lowering the loss as determined by HappyQuestionAnswering.eval() - """ - +def test_tc_train_effectiveness_multi(): + happy_tc = HappyTextClassification( model_type="DISTILBERT", model_name="distilbert-base-uncased", @@ -81,142 +61,4 @@ def test_qa_train_effectiveness_multi(): before_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss happy_tc.train("../data/tc/train-eval-multi.csv") after_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss - assert after_loss < before_loss - - -def test_qa_test_multi_distil_bert(): - """ - Tests - Ensures that HappyQuestionAnswering.train() results in - lowering the loss as determined by HappyQuestionAnswering.eval() - """ - - - - happy_tc = HappyTextClassification(model_type="DISTILBERT", - model_name="distilbert-base-uncased", num_labels=3) - happy_tc.train("../data/tc/train-eval-multi.csv") - result = happy_tc.test("../data/tc/test-multi.csv") - answer = 
[TextClassificationResult(label='LABEL_2', score=approx(0.3558128774166107,0.01)), - TextClassificationResult(label='LABEL_2', score=approx(0.34425610303878784,0.01)), - TextClassificationResult(label='LABEL_1', score=approx(0.3998771607875824,0.01)), - TextClassificationResult(label='LABEL_1', score=approx(0.38578158617019653,0.01)), - TextClassificationResult(label='LABEL_0', score=approx(0.39120176434516907,0.01)), - TextClassificationResult(label='LABEL_0', score=approx(0.3762877583503723,0.01))] - assert result == answer - - -def test_qa_effectiveness_multi_albert(): - """ - Tests - Ensures that HappyQuestionAnswering.train() results in - lowering the loss as determined by HappyQuestionAnswering.eval() - """ - - happy_tc = HappyTextClassification(model_type="ALBERT", - model_name="albert-base-v2", num_labels=3) - before_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss - happy_tc.train("../data/tc/train-eval-multi.csv") - after_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss - assert after_loss < before_loss - -def test_qa_effectiveness_multi_bert(): - """ - Tests - Ensures that HappyQuestionAnswering.train() results in - lowering the loss as determined by HappyQuestionAnswering.eval() - """ - - happy_tc = HappyTextClassification(model_type="BERT", - model_name="bert-base-uncased", num_labels=3) - before_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss - happy_tc.train("../data/tc/train-eval-multi.csv") - after_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss - assert after_loss < before_loss - - -def test_qa_test_albert(): - """ - Tests - HappyQuestionAnswering.test() - """ - happy_tc = HappyTextClassification(model_type="ALBERT", model_name="textattack/albert-base-v2-SST-2") - - result = happy_tc.test("../data/tc/test.csv") - answer = [TextClassificationResult(label='LABEL_1', score=0.9990348815917969), - TextClassificationResult(label='LABEL_0', score=0.9947203397750854), - TextClassificationResult(label='LABEL_0', score=0.9958302974700928), - TextClassificationResult(label='LABEL_1', score=0.9986426830291748)] - assert result == answer - - -def test_qa_train_effectiveness_albert(): - """ - Tests - Ensures that HappyQuestionAnswering.train() results in - lowering the loss as determined by HappyQuestionAnswering.eval() - """ - - happy_tc = HappyTextClassification(model_type="ALBERT", model_name="textattack/albert-base-v2-SST-2") - before_loss = happy_tc.eval("../data/tc/train-eval.csv").loss - happy_tc.train("../data/tc/train-eval.csv") - after_loss = happy_tc.eval("../data/tc/train-eval.csv").loss - assert after_loss < before_loss - - -def test_qa_test_bert(): - """ - Tests - HappyQuestionAnswering.test() - """ - happy_tc = HappyTextClassification(model_type="BERT", model_name="textattack/bert-base-uncased-SST-2") - - result = happy_tc.test("../data/tc/test.csv") - answer = [TextClassificationResult(label='LABEL_1', score=0.9995690584182739), - TextClassificationResult(label='LABEL_0', score=0.9981549382209778), - TextClassificationResult(label='LABEL_0', score=0.9965545535087585), - TextClassificationResult(label='LABEL_1', score=0.9978235363960266)] - assert result == answer - - -def test_qa_train_effectiveness_bert(): - """ - Tests - Ensures that HappyQuestionAnswering.train() results in - lowering the loss as determined by HappyQuestionAnswering.eval() - """ - - happy_tc = HappyTextClassification(model_type="BERT", model_name="textattack/bert-base-uncased-SST-2") - before_loss = happy_tc.eval("../data/tc/train-eval.csv").loss - 
happy_tc.train("../data/tc/train-eval.csv") - after_loss = happy_tc.eval("../data/tc/train-eval.csv").loss - assert after_loss < before_loss - - -def test_qa_test_roberta(): - """ - Tests - HappyQuestionAnswering.test() - """ - happy_tc = HappyTextClassification(model_type="ROBERTA", model_name="textattack/roberta-base-imdb") - - result = happy_tc.test("../data/tc/test.csv") - answer = [TextClassificationResult(label='LABEL_1', score=0.9883185029029846), - TextClassificationResult(label='LABEL_0', score=0.9893660545349121), - TextClassificationResult(label='LABEL_0', score=0.947014331817627), - TextClassificationResult(label='LABEL_1', score=0.9845685958862305)] - assert result == answer - - -def test_qa_train_effectiveness_roberta(): - """ - Tests - Ensures that HappyQuestionAnswering.train() results in - lowering the loss as determined by HappyQuestionAnswering.eval() - """ - happy_tc = HappyTextClassification(model_type="ROBERTA", model_name="textattack/roberta-base-imdb") - before_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss - happy_tc.train("../data/tc/train-eval.csv") - after_loss = happy_tc.eval("../data/tc/train-eval.csv").eval_loss - assert after_loss < before_loss - + assert after_loss < before_loss \ No newline at end of file From 33913480f404d4a23298efca66d6fdf40efcc434 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 15:14:30 -0500 Subject: [PATCH 114/155] combined tests and relaxed compares --- tests/test_wp.py | 57 ++++++++++++++++-------------------------------- 1 file changed, 19 insertions(+), 38 deletions(-) diff --git a/tests/test_wp.py b/tests/test_wp.py index 0a4f56e1..165110e4 100644 --- a/tests/test_wp.py +++ b/tests/test_wp.py @@ -1,61 +1,42 @@ +from pytest import approx + from happytransformer import HappyWordPrediction from happytransformer.happy_word_prediction import WordPredictionResult def test_mwp_basic(): - happy_mwp = HappyWordPrediction() - result = happy_mwp.predict_mask( - "Please pass the salt and [MASK]", - ) - answer = [WordPredictionResult(token="pepper", score=0.2664579749107361)] - assert result == answer + MODELS = [ + ('DISTILBERT','distilbert-base-uncased','pepper'), + ('BERT','bert-base-uncased','.'), + ('ALBERT','albert-base-v2','garlic') + ] + for model_type,model_name,top_result in MODELS: + happy_mwp = HappyWordPrediction(model_type,model_name) + results = happy_mwp.predict_mask( + "Please pass the salt and [MASK]", + ) + result = results[0] + assert result.token == top_result def test_mwp_top_k(): - happy_mwp = HappyWordPrediction() + happy_mwp = HappyWordPrediction('DISTILBERT','distilbert-base-uncased') result = happy_mwp.predict_mask( "Please pass the salt and [MASK]", top_k=2 ) - answer = [WordPredictionResult(token='pepper', score=0.2664579749107361), - WordPredictionResult(token='vinegar', score=0.08760260790586472)] + answer = [WordPredictionResult(token='pepper', score=approx(0.2664579749107361,0.01)), + WordPredictionResult(token='vinegar', score=approx(0.08760260790586472,0.01))] assert result == answer def test_mwp_targets(): - happy_mwp = HappyWordPrediction() + happy_mwp = HappyWordPrediction('DISTILBERT','distilbert-base-uncased') result = happy_mwp.predict_mask( "Please pass the salt and [MASK]", targets=["water", "spices"] ) answer = [WordPredictionResult(token='water', score=0.014856964349746704), WordPredictionResult(token='spices', score=0.009040987119078636)] - assert result == answer - - -def test_mwp_basic_albert(): - happy_mwp = HappyWordPrediction("ALBERT", "albert-base-v2") - 
result = happy_mwp.predict_mask( - "Please pass the salt and [MASK]", - ) - answer = [WordPredictionResult(token='garlic', score=0.036625903099775314)] - assert result == answer - - -def test_mwp_basic_bert(): - happy_mwp = HappyWordPrediction("BERT", "bert-base-uncased") - result = happy_mwp.predict_mask( - "Please pass the salt and [MASK]", - ) - answer = [WordPredictionResult(token_str='.', score=0.8466101884841919)] - assert result == answer - - -def test_mwp_basic_roberta(): - happy_mwp = HappyWordPrediction("ROBERTA", "roberta-base") - result = happy_mwp.predict_mask( - "Please pass the salt and [MASK]", - ) - answer = [WordPredictionResult(token_str='pepper', score=0.7325230240821838)] - assert result == answer + assert result == answer \ No newline at end of file From cd4caad0ccacc3590afc436a8a5764e77153acc0 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 15:22:37 -0500 Subject: [PATCH 115/155] use approx --- tests/test_wp.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/test_wp.py b/tests/test_wp.py index 165110e4..af221ada 100644 --- a/tests/test_wp.py +++ b/tests/test_wp.py @@ -25,8 +25,10 @@ def test_mwp_top_k(): "Please pass the salt and [MASK]", top_k=2 ) - answer = [WordPredictionResult(token='pepper', score=approx(0.2664579749107361,0.01)), - WordPredictionResult(token='vinegar', score=approx(0.08760260790586472,0.01))] + answer = [ + WordPredictionResult(token='pepper', score=approx(0.2664579749107361,0.01)), + WordPredictionResult(token='vinegar', score=approx(0.08760260790586472,0.01)) + ] assert result == answer @@ -37,6 +39,8 @@ def test_mwp_targets(): "Please pass the salt and [MASK]", targets=["water", "spices"] ) - answer = [WordPredictionResult(token='water', score=0.014856964349746704), - WordPredictionResult(token='spices', score=0.009040987119078636)] + answer = [ + WordPredictionResult(token='water', score=approx(0.014856964349746704,0.01)), + WordPredictionResult(token='spices', score=approx(0.009040987119078636,0.01)) + ] assert result == answer \ No newline at end of file From 5f72b6e6b896baa1fb7000ad32e38779e177979d Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Tue, 12 Jan 2021 19:12:08 -0500 Subject: [PATCH 116/155] Fixed whitespace --- tests/test_qa.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/test_qa.py b/tests/test_qa.py index 18d5cde5..a7f244e8 100644 --- a/tests/test_qa.py +++ b/tests/test_qa.py @@ -7,24 +7,26 @@ def test_qa_answer_question(): MODELS = [ - ('ALBERT','twmkn9/albert-base-v2-squad2'), - ('ROBERTA','deepset/roberta-base-squad2'), - ('BERT','mrm8488/bert-tiny-5-finetuned-squadv2') + ('ALBERT', 'twmkn9/albert-base-v2-squad2'), + ('ROBERTA', 'deepset/roberta-base-squad2'), + ('BERT', 'mrm8488/bert-tiny-5-finetuned-squadv2') ] - for model_type,model_name in MODELS: + for model_type, model_name in MODELS: happy_qa = HappyQuestionAnswering(model_name=model_name, model_type=model_type) answers = happy_qa.answer_question("Today's date is January 8th 2021", "What is the date?", top_k=3) - assert sum(answer.score for answer in answers) == approx(1,0.1) + assert sum(answer.score for answer in answers) == approx(1, 0.1) assert all('January 8th' in answer.answer for answer in answers) + def test_qa_eval(): happy_qa = HappyQuestionAnswering( model_type='DISTILBERT', model_name='distilbert-base-cased-distilled-squad' ) result = happy_qa.eval("../data/qa/train-eval.csv") - assert result.loss == approx(0.11738169193267822,0.001) + assert 
result.loss == approx(0.11738169193267822, 0.001) + def test_qa_test(): happy_qa = HappyQuestionAnswering() @@ -32,6 +34,7 @@ def test_qa_test(): assert results[0].answer == 'October 31st' assert results[1].answer == 'November 23rd' + def test_qa_train_effectiveness(): """ Ensures that HappyQuestionAnswering.train() results in @@ -39,7 +42,7 @@ def test_qa_train_effectiveness(): """ # use a non-fine-tuned model so we DEFINITELY get an improvement - happy_qa = HappyQuestionAnswering('BERT','bert-base-cased') + happy_qa = HappyQuestionAnswering('BERT', 'bert-base-cased') before_loss = happy_qa.eval("../data/qa/train-eval.csv").loss happy_qa.train("../data/qa/train-eval.csv") after_loss = happy_qa.eval("../data/qa/train-eval.csv").loss From 7fa220d7c5df9933ce7d846f610e290d5b5568a4 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Tue, 12 Jan 2021 19:12:24 -0500 Subject: [PATCH 117/155] Removed unused import --- tests/test_qa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_qa.py b/tests/test_qa.py index a7f244e8..bffa4160 100644 --- a/tests/test_qa.py +++ b/tests/test_qa.py @@ -2,7 +2,7 @@ Tests for the question answering training, evaluating and testing functionality """ -from happytransformer.happy_question_answering import HappyQuestionAnswering, QuestionAnsweringResult +from happytransformer.happy_question_answering import HappyQuestionAnswering from pytest import approx def test_qa_answer_question(): From ab035ab64ff051df274e8b3a052ffb8a258fe1b6 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Tue, 12 Jan 2021 19:13:48 -0500 Subject: [PATCH 118/155] fixed whitespace --- tests/test_tc.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/test_tc.py b/tests/test_tc.py index 413827fd..54800d4e 100644 --- a/tests/test_tc.py +++ b/tests/test_tc.py @@ -5,24 +5,27 @@ from happytransformer.happy_text_classification import HappyTextClassification, TextClassificationResult from pytest import approx + def test_classify_text(): MODELS = [ - ('DISTILBERT','distilbert-base-uncased-finetuned-sst-2-english'), - ("ALBERT","textattack/albert-base-v2-SST-2") + ('DISTILBERT', 'distilbert-base-uncased-finetuned-sst-2-english'), + ("ALBERT", "textattack/albert-base-v2-SST-2") ] - for model_type,model_name in MODELS: + for model_type, model_name in MODELS: happy_tc = HappyTextClassification(model_type=model_type, model_name=model_name) result = happy_tc.classify_text("What a great movie") assert result.label == 'LABEL_1' assert result.score > 0.9 + def test_tc_eval(): happy_tc = HappyTextClassification( model_type="DISTILBERT", model_name="distilbert-base-uncased-finetuned-sst-2-english" ) results = happy_tc.eval("../data/tc/train-eval.csv") - assert results.loss == approx(0.007262040860950947,0.01) + assert results.loss == approx(0.007262040860950947, 0.01) + def test_tc_test(): happy_tc = HappyTextClassification( @@ -51,6 +54,7 @@ def test_tc_train_effectiveness(): after_loss = happy_tc.eval("../data/tc/train-eval.csv").loss assert after_loss < before_loss + def test_tc_train_effectiveness_multi(): happy_tc = HappyTextClassification( @@ -61,4 +65,4 @@ def test_tc_train_effectiveness_multi(): before_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss happy_tc.train("../data/tc/train-eval-multi.csv") after_loss = happy_tc.eval("../data/tc/train-eval-multi.csv").loss - assert after_loss < before_loss \ No newline at end of file + assert after_loss < before_loss From 786bcabbfe661a806edb1584cf0c783e65637276 Mon Sep 17 00:00:00 2001 
From: Eric Fillion Date: Tue, 12 Jan 2021 19:27:54 -0500 Subject: [PATCH 119/155] Cleaning: fixing whitespace and removing unused imports --- happytransformer/happy_next_sentence.py | 3 +-- happytransformer/happy_question_answering.py | 17 ++++++------- happytransformer/happy_text_classification.py | 6 ++--- happytransformer/happy_trainer.py | 2 +- happytransformer/qa/trainer.py | 4 ++-- tests/test_wp.py | 24 +++++++++---------- 6 files changed, 26 insertions(+), 30 deletions(-) diff --git a/happytransformer/happy_next_sentence.py b/happytransformer/happy_next_sentence.py index b5325390..92dcf024 100644 --- a/happytransformer/happy_next_sentence.py +++ b/happytransformer/happy_next_sentence.py @@ -1,5 +1,4 @@ import torch -import re from transformers import ( BertTokenizerFast, BertForNextSentencePrediction, @@ -25,7 +24,7 @@ def __init__(self, model_type="BERT", self._pipeline = None self._trainer = None - def predict_next_sentence(self, sentence_a:str, sentence_b:str)->float: + def predict_next_sentence(self, sentence_a: str, sentence_b: str) -> float: """ Predict the probability that sentence_b follows sentence_a. Higher probabilities indicate more coherent sentence pairs. diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index ed3e9989..c838176b 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -2,7 +2,6 @@ Contains the HappyQuestionAnswering class. """ -import torch from happytransformer.happy_transformer import HappyTransformer from happytransformer.qa.trainer import QATrainer from happytransformer.qa.default_args import ARGS_QA_TRAIN @@ -24,10 +23,10 @@ @dataclass class QuestionAnsweringResult: - answer:str - score:float - start:int - end:int + answer: str + score: float + start: int + end: int class HappyQuestionAnswering(HappyTransformer): @@ -68,10 +67,8 @@ def __init__(self, model_type="DISTILBERT", self._trainer = QATrainer(model, model_type, tokenizer, self._device, self.logger) - def answer_question( - self, - context:str, question:str, top_k:int=1 - )->List[QuestionAnsweringResult]: + def answer_question(self, context: str, question: str, top_k: int = 1) \ + -> List[QuestionAnsweringResult]: """ Find the answers to a question. The answer MUST be contained somewhere within the context for this to work. @@ -81,7 +78,7 @@ def answer_question( pipeline_output = self._pipeline(context=context, question=question, topk=top_k) # transformers returns a single dictionary when top_k ==1. 
# Our convention however is to have constant output format - answers = [pipeline_output] if top_k==1 else pipeline_output + answers = [pipeline_output] if top_k == 1 else pipeline_output return [ QuestionAnsweringResult( diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index 6daa3823..01e1edc6 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -24,8 +24,8 @@ @dataclass class TextClassificationResult: - label:str - score:float + label: str + score: float class HappyTextClassification(HappyTransformer): """ @@ -67,7 +67,7 @@ def __init__(self, model_type="DISTILBERT", self._tokenizer, self._device, self.logger ) - def classify_text(self, text:str) -> TextClassificationResult: + def classify_text(self, text: str) -> TextClassificationResult: """ Classify text to a label based on model's training """ diff --git a/happytransformer/happy_trainer.py b/happytransformer/happy_trainer.py index f6645c31..28a09479 100644 --- a/happytransformer/happy_trainer.py +++ b/happytransformer/happy_trainer.py @@ -7,7 +7,7 @@ @dataclass class EvalResult: - loss:float + loss: float class HappyTrainer: def __init__(self, model, model_type, tokenizer, device, logger): diff --git a/happytransformer/qa/trainer.py b/happytransformer/qa/trainer.py index 1426924b..05438abe 100644 --- a/happytransformer/qa/trainer.py +++ b/happytransformer/qa/trainer.py @@ -55,8 +55,8 @@ def test(self, input_filepath, solve): contexts, questions = self._get_data(input_filepath, test_data=True) return [ - solve(context,question)[0] - for context,question in + solve(context, question)[0] + for context, question in tqdm(zip(contexts, questions)) ] diff --git a/tests/test_wp.py b/tests/test_wp.py index af221ada..8c136a90 100644 --- a/tests/test_wp.py +++ b/tests/test_wp.py @@ -6,12 +6,12 @@ def test_mwp_basic(): MODELS = [ - ('DISTILBERT','distilbert-base-uncased','pepper'), - ('BERT','bert-base-uncased','.'), - ('ALBERT','albert-base-v2','garlic') + ('DISTILBERT', 'distilbert-base-uncased', 'pepper'), + ('BERT', 'bert-base-uncased', '.'), + ('ALBERT', 'albert-base-v2', 'garlic') ] - for model_type,model_name,top_result in MODELS: - happy_mwp = HappyWordPrediction(model_type,model_name) + for model_type, model_name, top_result in MODELS: + happy_mwp = HappyWordPrediction(model_type, model_name) results = happy_mwp.predict_mask( "Please pass the salt and [MASK]", ) @@ -20,27 +20,27 @@ def test_mwp_basic(): def test_mwp_top_k(): - happy_mwp = HappyWordPrediction('DISTILBERT','distilbert-base-uncased') + happy_mwp = HappyWordPrediction('DISTILBERT', 'distilbert-base-uncased') result = happy_mwp.predict_mask( "Please pass the salt and [MASK]", top_k=2 ) answer = [ - WordPredictionResult(token='pepper', score=approx(0.2664579749107361,0.01)), - WordPredictionResult(token='vinegar', score=approx(0.08760260790586472,0.01)) + WordPredictionResult(token='pepper', score=approx(0.2664579749107361, 0.01)), + WordPredictionResult(token='vinegar', score=approx(0.08760260790586472, 0.01)) ] assert result == answer def test_mwp_targets(): - happy_mwp = HappyWordPrediction('DISTILBERT','distilbert-base-uncased') + happy_mwp = HappyWordPrediction('DISTILBERT', 'distilbert-base-uncased') result = happy_mwp.predict_mask( "Please pass the salt and [MASK]", targets=["water", "spices"] ) answer = [ - WordPredictionResult(token='water', score=approx(0.014856964349746704,0.01)), - WordPredictionResult(token='spices', 
score=approx(0.009040987119078636,0.01)) + WordPredictionResult(token='water', score=approx(0.014856964349746704, 0.01)), + WordPredictionResult(token='spices', score=approx(0.009040987119078636, 0.01)) ] - assert result == answer \ No newline at end of file + assert result == answer From 2522268444d40c70bbebc74d1b2687859d9135ef Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Tue, 12 Jan 2021 19:29:57 -0500 Subject: [PATCH 120/155] Added dataclasses requirement --- requirements.txt | 3 ++- setup.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 72cbd14e..5297852c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ torch>=1.0 tqdm>=4.27 transformers>=4.0.0 -pytest \ No newline at end of file +pytest +dataclasses; python_version<"3.7" diff --git a/setup.py b/setup.py index b393184f..0fffe130 100644 --- a/setup.py +++ b/setup.py @@ -24,8 +24,9 @@ 'torch>=1.0', 'tqdm>=4.27', 'transformers>=4.0.0', + 'dataclasses; python_version < "3.7"' - ], + ], classifiers=[ 'Development Status :: 3 - Alpha', 'Intended Audience :: Developers', From dc487463af067db2378801c681336a7c88dcd0f3 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Tue, 12 Jan 2021 19:47:53 -0500 Subject: [PATCH 121/155] Updated readme and readme examples --- README.md | 28 +++++++++---------- .../question_answering/readme_examples.py | 8 +++--- .../text_classification/readme_examples.py | 8 +++--- examples/word_prediction/readme_examples.py | 8 +++--- 4 files changed, 26 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 527ee02a..c1125fcf 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,7 @@ The method predict_masks() contains 3 arguments: 3. top_k (int): the number of results that will be returned Returns: -A list of named tuples with arguments: "token_str" and "top_k" +A dataclass with variables "token_str" and "top_k" Note: if targets are provided, then top_k will be ignored and a score for each target will be returned. @@ -109,7 +109,7 @@ from happytransformer import HappyWordPrediction print(result) # [WordPredictionResult(token_str='am', score=0.10172799974679947)] print(type(result[0])) # print(result[0]) # [WordPredictionResult(token_str='am', score=0.10172799974679947)] - print(result[0].token_str) # am + print(result[0].token) # am print(result[0].score) # 0.10172799974679947 @@ -124,7 +124,7 @@ happy_wp = HappyWordPrediction("ALBERT", "albert-xxlarge-v2") result = happy_wp.predict_mask("To better the world I would invest in [MASK] and education.", top_k=2) print(result) # [WordPredictionResult(token_str='infrastructure', score=0.09270179271697998), WordPredictionResult(token_str='healthcare', score=0.07219093292951584)] print(result[1]) # WordPredictionResult(token_str='healthcare', score=0.07219093292951584) -print(result[1].token_str) # healthcare +print(result[1].token) # healthcare ``` @@ -137,7 +137,7 @@ targets = ["technology", "healthcare"] result = happy_wp.predict_mask("To better the world I would invest in [MASK] and education.", targets=targets) print(result) # [WordPredictionResult(token_str='healthcare', score=0.07219093292951584), WordPredictionResult(token_str='technology', score=0.032044216990470886)] print(result[1]) # WordPredictionResult(token_str='technology', score=0.032044216990470886) -print(result[1].token_str) # technology +print(result[1].token) # technology ``` @@ -182,7 +182,7 @@ Input: 1. 
text (string): Text that will be classified
 
 Returns: 
-A label in the form of a string, typically "LABEL_x", where x is the label number. 
+A dataclass with variables "label" and "score"
 
 #### Example 2.1:
 ```python
@@ -258,7 +258,7 @@ Input:
 
 output: 
 
-A named tuple with a key called "eval_loss"
+A dataclass with a variable called "loss"
 
 #### Example 2.3:
 ```python
@@ -270,7 +270,7 @@ A named tuple with a key called "eval_loss"
     result = happy_tc.eval("../../data/tc/train-eval.csv")
     print(type(result))  # <class 'happytransformer.happy_trainer.EvalResult'>
     print(result)  # EvalResult(eval_loss=0.007262040860950947)
-    print(result.eval_loss)  # 0.007262040860950947
+    print(result.loss)  # 0.007262040860950947
 
 ```
@@ -318,9 +318,9 @@ The list is in order by ascending csv index.
     happy_tc = HappyTextClassification(model_type="DISTILBERT",
                                        model_name="distilbert-base-uncased-finetuned-sst-2-english",
                                        num_labels=2)  # Don't forget to set num_labels! 
-    before_loss = happy_tc.eval("../../data/tc/train-eval.csv").eval_loss
+    before_loss = happy_tc.eval("../../data/tc/train-eval.csv").loss
     happy_tc.train("../../data/tc/train-eval.csv")
-    after_loss = happy_tc.eval("../../data/tc/train-eval.csv").eval_loss
+    after_loss = happy_tc.eval("../../data/tc/train-eval.csv").loss
     print("Before loss: ", before_loss)  # 0.007262040860950947
     print("After loss: ", after_loss)  # 0.000162081079906784
     # Since after_loss < before_loss, the model learned!
@@ -368,7 +368,7 @@ Inputs: 
 3. top_k (int): the number of results that will be returned (default=1)
 
 Returns: 
-A list of a named tuples that contains the keys: "answer", "score", "start" and "end." 
+A list of dataclasses that contain the variables: "answer", "score", "start" and "end." 
 The list is in descending order by score
 
 #### Example 3.1:
@@ -454,7 +454,7 @@ Input:
 
 output: 
 
-A named tuple with the key "eval_loss"
+A dataclass with the variable "loss"
 
 #### Example 3.4:
 ```python
@@ -464,7 +464,7 @@ A named tuple with the key "eval_loss"
     result = happy_qa.eval("../../data/qa/train-eval.csv")
     print(type(result))  # <class 'happytransformer.happy_trainer.EvalResult'>
     print(result)  # EvalResult(eval_loss=0.11738169193267822)
-    print(result.eval_loss)  # 0.1173816919326782
+    print(result.loss)  # 0.1173816919326782
 
 ```
@@ -505,9 +505,9 @@ The list is in order by ascending csv index.
 from happytransformer import HappyQuestionAnswering
 # --------------------------------------#
     happy_qa = HappyQuestionAnswering()
-    before_loss = happy_qa.eval("../../data/qa/train-eval.csv").eval_loss
+    before_loss = happy_qa.eval("../../data/qa/train-eval.csv").loss
     happy_qa.train("../../data/qa/train-eval.csv")
-    after_loss = happy_qa.eval("../../data/qa/train-eval.csv").eval_loss
+    after_loss = happy_qa.eval("../../data/qa/train-eval.csv").loss
     print("Before loss: ", before_loss)  # 0.11738169193267822
     print("After loss: ", after_loss)  # 0.00037909045931883156
     # Since after_loss < before_loss, the model learned!
diff --git a/examples/question_answering/readme_examples.py b/examples/question_answering/readme_examples.py index 63d8f494..0bb8d5f8 100644 --- a/examples/question_answering/readme_examples.py +++ b/examples/question_answering/readme_examples.py @@ -38,7 +38,7 @@ def example_3_4(): result = happy_qa.eval("../../data/qa/train-eval.csv") print(type(result)) # print(result) # EvalResult(eval_loss=0.11738169193267822) - print(result.eval_loss) # 0.1173816919326782 + print(result.loss) # 0.1173816919326782 def example_3_5(): @@ -52,9 +52,9 @@ def example_3_5(): def example_3_6(): happy_qa = HappyQuestionAnswering() - before_loss = happy_qa.eval("../../data/qa/train-eval.csv").eval_loss + before_loss = happy_qa.eval("../../data/qa/train-eval.csv").loss happy_qa.train("../../data/qa/train-eval.csv") - after_loss = happy_qa.eval("../../data/qa/train-eval.csv").eval_loss + after_loss = happy_qa.eval("../../data/qa/train-eval.csv").loss print("Before loss: ", before_loss) # 0.11738169193267822 print("After loss: ", after_loss) # 0.00037909045931883156 # Since after_loss < before_loss, the model learned! @@ -63,7 +63,7 @@ def example_3_6(): def main(): - example_3_1() + example_3_6() if __name__ == "__main__": diff --git a/examples/text_classification/readme_examples.py b/examples/text_classification/readme_examples.py index 4ef496ab..7c6a3d3c 100644 --- a/examples/text_classification/readme_examples.py +++ b/examples/text_classification/readme_examples.py @@ -30,7 +30,7 @@ def example_2_3(): result = happy_tc.eval("../../data/tc/train-eval.csv") print(type(result)) # print(result) # EvalResult(eval_loss=0.007262040860950947) - print(result.eval_loss) # 0.007262040860950947 + print(result.loss) # 0.007262040860950947 def example_2_4(): @@ -49,9 +49,9 @@ def example_2_5(): happy_tc = HappyTextClassification(model_type="DISTILBERT", model_name="distilbert-base-uncased-finetuned-sst-2-english", num_labels=2) # Don't forget to set num_labels! - before_loss = happy_tc.eval("../../data/tc/train-eval.csv").eval_loss + before_loss = happy_tc.eval("../../data/tc/train-eval.csv").loss happy_tc.train("../../data/tc/train-eval.csv") - after_loss = happy_tc.eval("../../data/tc/train-eval.csv").eval_loss + after_loss = happy_tc.eval("../../data/tc/train-eval.csv").loss print("Before loss: ", before_loss) # 0.007262040860950947 print("After loss: ", after_loss) # 0.000162081079906784 # Since after_loss < before_loss, the model learned! 
@@ -60,7 +60,7 @@ def example_2_5(): def main(): - example_2_5() + example_2_1() if __name__ == "__main__": diff --git a/examples/word_prediction/readme_examples.py b/examples/word_prediction/readme_examples.py index 9ee62cba..d3950cc5 100644 --- a/examples/word_prediction/readme_examples.py +++ b/examples/word_prediction/readme_examples.py @@ -15,7 +15,7 @@ def example_1_1(): print(result) # [WordPredictionResult(token_str='am', score=0.10172799974679947)] print(type(result[0])) # print(result[0]) # [WordPredictionResult(token_str='am', score=0.10172799974679947)] - print(result[0].token_str) # am + print(result[0].token) # am print(result[0].score) # 0.10172799974679947 @@ -24,7 +24,7 @@ def example_1_2(): result = happy_wp.predict_mask("To better the world I would invest in [MASK] and education.", top_k=10) print(result) # [WordPredictionResult(token_str='infrastructure', score=0.09270179271697998), WordPredictionResult(token_str='healthcare', score=0.07219093292951584)] print(result[1]) # WordPredictionResult(token_str='healthcare', score=0.07219093292951584) - print(result[1].token_str) # healthcare + print(result[1].token) # healthcare def example_1_3(): @@ -33,12 +33,12 @@ def example_1_3(): result = happy_wp.predict_mask("To better the world I would invest in [MASK] and education.", targets=targets) print(result) # [WordPredictionResult(token_str='healthcare', score=0.07219093292951584), WordPredictionResult(token_str='technology', score=0.032044216990470886)] print(result[1]) # WordPredictionResult(token_str='technology', score=0.032044216990470886) - print(result[1].token_str) # technology + print(result[1].token) # technology def main(): - example_1_1() # example_1_1() + example_1_1() # example_1_2() # example_1_3() From 2e832800ecca8b37e037a6c7c27fd0712e8e9dca Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 20:41:25 -0500 Subject: [PATCH 122/155] started adaptors --- happytransformer/adaptors/adaptor.py | 32 +++++++++++++++++++++++ happytransformer/happy_word_prediction.py | 20 +++----------- 2 files changed, 36 insertions(+), 16 deletions(-) create mode 100644 happytransformer/adaptors/adaptor.py diff --git a/happytransformer/adaptors/adaptor.py b/happytransformer/adaptors/adaptor.py new file mode 100644 index 00000000..b9cc6e0f --- /dev/null +++ b/happytransformer/adaptors/adaptor.py @@ -0,0 +1,32 @@ +from transformers import ( + PreTrainedModel, + BertForMaskedLM, BertTokenizerFast +) +from transformers.tokenization_utils import PreTrainedTokenizerBase + +class Adaptor: + def get_tokenizer(self, model_name:str)->PreTrainedTokenizerBase: + raise NotImplementedError() + + def get_masked_language_model(self, model_name:str)->PreTrainedModel: + raise NotImplementedError() + + def preprocess_text(self, text:str)->str: + return text + +class BERTAdaptor(Adaptor): + def get_tokenizer(self, model_name:str): + return BertTokenizerFast.from_pretrained(model_name) + + def get_masked_language_model(self, model_name:str): + return BertForMaskedLM.from_pretrained(model_name) + +ADAPTORS = { + 'BERT':BERTAdaptor() +} + +def get_adaptor(model_type:str)->Adaptor: + if model_type in ADAPTORS: + return ADAPTORS[model_type] + else: + raise ValueError(f'Model type <{model_type}> not currently supported') \ No newline at end of file diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index fe57b1cf..ff6c5551 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -14,6 +14,7 @@ from 
happytransformer.happy_transformer import HappyTransformer
 from happytransformer.mwp.trainer import WPTrainer
 from happytransformer.cuda_detect import detect_cuda_device_number
+from happytransformer.adaptors.adaptor import get_adaptor
 from typing import List
 
 @dataclass
@@ -27,23 +28,10 @@ class HappyWordPrediction(HappyTransformer):
     """
     def __init__(self, model_type:str="DISTILBERT",
                  model_name:str="distilbert-base-uncased"):
-        model = None
-        tokenizer = None
+        adaptor = get_adaptor(model_type)
+        model = adaptor.get_masked_language_model(model_name)
+        tokenizer = adaptor.get_tokenizer(model_name)
 
-        if model_type == "ALBERT":
-            model = AlbertForMaskedLM.from_pretrained(model_name)
-            tokenizer = AlbertTokenizerFast.from_pretrained(model_name)
-        elif model_type == "BERT":
-            model = BertForMaskedLM.from_pretrained(model_name)
-            tokenizer = BertTokenizerFast.from_pretrained(model_name)
-        elif model_type == "DISTILBERT":
-            model = DistilBertForMaskedLM.from_pretrained(model_name)
-            tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
-        elif model_type == "ROBERTA":
-            model = RobertaForMaskedLM.from_pretrained(model_name)
-            tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
-        else:
-            raise ValueError(self.model_type_error)
         super().__init__(model_type, model_name, model, tokenizer)
 
         device_number = detect_cuda_device_number()

From dc803b496ae2733fc355985d45074975441b029b Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Tue, 12 Jan 2021 20:56:11 -0500
Subject: [PATCH 123/155] Added training example

---
 .../question_answering/training_example.py | 45 +++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 examples/question_answering/training_example.py

diff --git a/examples/question_answering/training_example.py b/examples/question_answering/training_example.py
new file mode 100644
index 00000000..25dac577
--- /dev/null
+++ b/examples/question_answering/training_example.py
@@ -0,0 +1,45 @@
+from datasets import load_dataset
+import csv
+from happytransformer.happy_question_answering import HappyQuestionAnswering
+
+
+def main():
+    # Be careful not to commit the csv files to the repo
+    train_csv_path = "train.csv"
+    eval_csv_path = "eval.csv"
+
+    train_dataset = load_dataset('squad', split='train')
+    eval_dataset = load_dataset('squad', split='validation')
+
+    generate_csv(train_csv_path, train_dataset, 500)
+    generate_csv(eval_csv_path, eval_dataset, 100)
+
+    happy_qa = HappyQuestionAnswering(model_type="BERT", model_name="bert-base-uncased")
+    before_loss = happy_qa.eval(eval_csv_path).loss
+    happy_qa.train(train_csv_path)
+    after_loss = happy_qa.eval(eval_csv_path).loss
+
+    print("Before loss: ", before_loss)
+    print("After loss: ", after_loss)
+    assert after_loss < before_loss
+
+
+def generate_csv(csv_path, dataset, count):
+    with open(csv_path, 'w', newline='') as csvfile:
+        writer = csv.writer(csvfile)
+        writer.writerow(["context", "question", "answer_text", "answer_start"])
+        i = 0
+        for case in dataset:
+            context = case["context"]
+            question = case["question"]
+            answer_text = case["answers"]["text"][0]
+            answer_start = case["answers"]["answer_start"][0]
+            writer.writerow([context, question, answer_text, answer_start])
+
+            i += 1
+            if i == count:
+                break
+
+
+if __name__ == "__main__":
+    main()

From 84b8d0d6cebee03e1a66d034c4505888df92cb8a Mon Sep 17 00:00:00 2001
From: Ted Brownlow
Date: Tue, 12 Jan 2021 21:05:09 -0500
Subject: [PATCH 124/155] added roberta adaptor

---
 happytransformer/adaptors/adaptor.py      | 46 ++++++++++++++++++-----
 
happytransformer/happy_word_prediction.py | 32 ++++++++-------- 2 files changed, 53 insertions(+), 25 deletions(-) diff --git a/happytransformer/adaptors/adaptor.py b/happytransformer/adaptors/adaptor.py index b9cc6e0f..9e8eb75f 100644 --- a/happytransformer/adaptors/adaptor.py +++ b/happytransformer/adaptors/adaptor.py @@ -1,28 +1,56 @@ +from typing import Type from transformers import ( PreTrainedModel, - BertForMaskedLM, BertTokenizerFast + BertForMaskedLM, BertTokenizerFast, + RobertaForMaskedLM, RobertaTokenizerFast ) from transformers.tokenization_utils import PreTrainedTokenizerBase class Adaptor: - def get_tokenizer(self, model_name:str)->PreTrainedTokenizerBase: + @property + def tokenizer(self)->Type[PreTrainedTokenizerBase]: raise NotImplementedError() - def get_masked_language_model(self, model_name:str)->PreTrainedModel: + @property + def masked_language_model(self)->Type[PreTrainedModel]: raise NotImplementedError() def preprocess_text(self, text:str)->str: return text -class BERTAdaptor(Adaptor): - def get_tokenizer(self, model_name:str): - return BertTokenizerFast.from_pretrained(model_name) + def postprocess_token(self, text:str)->str: + return text + +class BertAdaptor(Adaptor): + @property + def tokenizer(self): + return BertTokenizerFast + @property + def get_masked_language_model(self): + return BertForMaskedLM + +class RobertaAdaptor(Adaptor): + @property + def tokenizer(self): + return RobertaTokenizerFast + @property + def masked_language_model(self): + return RobertaForMaskedLM + + def preprocess_text(self, text:str)->str: + print(text) + return text.replace('[MASK]','') - def get_masked_language_model(self, model_name:str): - return BertForMaskedLM.from_pretrained(model_name) + def postprocess_token(self, text): + return ( + text[1:] + if text[0] == "Ġ" + else text + ) ADAPTORS = { - 'BERT':BERTAdaptor() + 'BERT':BertAdaptor(), + 'ROBERTA':RobertaAdaptor() } def get_adaptor(model_type:str)->Adaptor: diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index ff6c5551..03b3703d 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -15,7 +15,7 @@ from happytransformer.mwp.trainer import WPTrainer from happytransformer.cuda_detect import detect_cuda_device_number from happytransformer.adaptors.adaptor import get_adaptor -from typing import List +from typing import List,Optional @dataclass class WordPredictionResult: @@ -26,11 +26,13 @@ class HappyWordPrediction(HappyTransformer): """ A user facing class for text classification """ - def __init__(self, model_type:str="DISTILBERT", - model_name:str="distilbert-base-uncased"): - adaptor = get_adaptor(model_type) - model = adaptor.get_masked_language_model(model_name) - tokenizer = adaptor.get_tokenizer(model_name) + def __init__( + self, model_type:str="DISTILBERT", + model_name:str="distilbert-base-uncased"): + + self.adaptor = get_adaptor(model_type) + model = self.adaptor.masked_language_model.from_pretrained(model_name) + tokenizer = self.adaptor.tokenizer.from_pretrained(model_name) super().__init__(model_type, model_name, model, tokenizer) @@ -41,7 +43,7 @@ def __init__(self, model_type:str="DISTILBERT", self._trainer = WPTrainer(model, model_type, tokenizer, self._device, self.logger) def predict_mask(self, - text:str, targets:List[str]=None, top_k:int=1 + text:str, targets:Optional[List[str]]=None, top_k:int=1 ) -> List[WordPredictionResult]: """ Predict [MASK] tokens in a string. 
@@ -52,22 +54,20 @@ def predict_mask(self, if not isinstance(text, str): raise ValueError("the \"text\" argument must be a single string") - if self.model_type == "ROBERTA": - text = text.replace("[MASK]", "") - - answers = self._pipeline(text, targets=targets, top_k=top_k) + text_for_pipeline = self.adaptor.preprocess_text(text) + answers = self._pipeline( + text_for_pipeline, + targets=targets, top_k=top_k + ) if self.model_type == "ALBERT": for answer in answers: if answer["token_str"][0] == "▁": answer["token_str"] = answer["token_str"][1:] - elif self.model_type == "ROBERTA": - for answer in answers: - if answer["token_str"][0] == "Ġ": - answer["token_str"] = answer["token_str"][1:] + return [ WordPredictionResult( - token=answer["token_str"], + token=self.adaptor.postprocess_token(answer["token_str"]), score=answer["score"] ) for answer in answers From 8ef214ad84380107a1cb086865f2c8ed5fe9e838 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Tue, 12 Jan 2021 21:12:02 -0500 Subject: [PATCH 125/155] Added datasets to requirements --- requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 72cbd14e..42c1f1da 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ torch>=1.0 tqdm>=4.27 transformers>=4.0.0 -pytest \ No newline at end of file +pytest +datasets \ No newline at end of file From 24ad48ab7b31c9e510208ffafb311a41e370b54b Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 21:20:07 -0500 Subject: [PATCH 126/155] all word prediction logic now stored in adaptors --- happytransformer/adaptors/adaptor.py | 49 +++++++++++++++++------ happytransformer/happy_word_prediction.py | 11 ++--- 2 files changed, 39 insertions(+), 21 deletions(-) diff --git a/happytransformer/adaptors/adaptor.py b/happytransformer/adaptors/adaptor.py index 9e8eb75f..4e4cbca9 100644 --- a/happytransformer/adaptors/adaptor.py +++ b/happytransformer/adaptors/adaptor.py @@ -2,17 +2,23 @@ from transformers import ( PreTrainedModel, BertForMaskedLM, BertTokenizerFast, - RobertaForMaskedLM, RobertaTokenizerFast + RobertaForMaskedLM, RobertaTokenizerFast, + AlbertForMaskedLM, AlbertTokenizerFast, + DistilBertForMaskedLM, DistilBertTokenizerFast ) from transformers.tokenization_utils import PreTrainedTokenizerBase class Adaptor: + ''' + Holds a few functions for implementation details. + Does NOT store any state. 
+ ''' @property - def tokenizer(self)->Type[PreTrainedTokenizerBase]: + def Tokenizer(self)->Type[PreTrainedTokenizerBase]: raise NotImplementedError() @property - def masked_language_model(self)->Type[PreTrainedModel]: + def MaskedLM(self)->Type[PreTrainedModel]: raise NotImplementedError() def preprocess_text(self, text:str)->str: @@ -23,18 +29,26 @@ def postprocess_token(self, text:str)->str: class BertAdaptor(Adaptor): @property - def tokenizer(self): + def Tokenizer(self): return BertTokenizerFast @property - def get_masked_language_model(self): + def MaskedLM(self): return BertForMaskedLM +class DistilBertAdaptor(Adaptor): + @property + def Tokenizer(self): + return DistilBertTokenizerFast + @property + def MaskedLM(self): + return DistilBertForMaskedLM + class RobertaAdaptor(Adaptor): @property - def tokenizer(self): + def Tokenizer(self): return RobertaTokenizerFast @property - def masked_language_model(self): + def MaskedLM(self): return RobertaForMaskedLM def preprocess_text(self, text:str)->str: @@ -42,15 +56,24 @@ def preprocess_text(self, text:str)->str: return text.replace('[MASK]','') def postprocess_token(self, text): - return ( - text[1:] - if text[0] == "Ġ" - else text - ) + return text[1:] if text[0] == "Ġ" else text + +class AlbertAdaptor(Adaptor): + @property + def Tokenizer(self): + return AlbertTokenizerFast + @property + def MaskedLM(self): + return AlbertForMaskedLM + + def postprocess_token(self, text): + return text[1:] if text[0] == "▁" else text ADAPTORS = { 'BERT':BertAdaptor(), - 'ROBERTA':RobertaAdaptor() + 'DISTILBERT':DistilBertAdaptor(), + 'ROBERTA':RobertaAdaptor(), + 'ALBERT':AlbertAdaptor() } def get_adaptor(model_type:str)->Adaptor: diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index 03b3703d..78a38fee 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -31,8 +31,8 @@ def __init__( model_name:str="distilbert-base-uncased"): self.adaptor = get_adaptor(model_type) - model = self.adaptor.masked_language_model.from_pretrained(model_name) - tokenizer = self.adaptor.tokenizer.from_pretrained(model_name) + model = self.adaptor.MaskedLM.from_pretrained(model_name) + tokenizer = self.adaptor.Tokenizer.from_pretrained(model_name) super().__init__(model_type, model_name, model, tokenizer) @@ -52,18 +52,13 @@ def predict_mask(self, *top_k does not apply if targets is supplied """ if not isinstance(text, str): - raise ValueError("the \"text\" argument must be a single string") + raise ValueError('the "text" argument must be a single string') text_for_pipeline = self.adaptor.preprocess_text(text) answers = self._pipeline( text_for_pipeline, targets=targets, top_k=top_k ) - - if self.model_type == "ALBERT": - for answer in answers: - if answer["token_str"][0] == "▁": - answer["token_str"] = answer["token_str"][1:] return [ WordPredictionResult( From 98e4cc3b5533a4236a200502a3d4f2e2d9a01716 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 21:23:35 -0500 Subject: [PATCH 127/155] next sentence uses adaptor --- happytransformer/adaptors/adaptor.py | 9 ++++++++- happytransformer/happy_next_sentence.py | 10 ++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/happytransformer/adaptors/adaptor.py b/happytransformer/adaptors/adaptor.py index 4e4cbca9..28749792 100644 --- a/happytransformer/adaptors/adaptor.py +++ b/happytransformer/adaptors/adaptor.py @@ -1,7 +1,7 @@ from typing import Type from transformers import ( 
PreTrainedModel, - BertForMaskedLM, BertTokenizerFast, + BertForMaskedLM, BertTokenizerFast, BertForNextSentencePrediction, RobertaForMaskedLM, RobertaTokenizerFast, AlbertForMaskedLM, AlbertTokenizerFast, DistilBertForMaskedLM, DistilBertTokenizerFast @@ -21,6 +21,10 @@ def Tokenizer(self)->Type[PreTrainedTokenizerBase]: def MaskedLM(self)->Type[PreTrainedModel]: raise NotImplementedError() + @property + def NextSentencePrediction(self)->Type[PreTrainedModel]: + raise NotImplementedError() + def preprocess_text(self, text:str)->str: return text @@ -34,6 +38,9 @@ def Tokenizer(self): @property def MaskedLM(self): return BertForMaskedLM + @property + def NextSentencePrediction(self): + return BertForNextSentencePrediction class DistilBertAdaptor(Adaptor): @property diff --git a/happytransformer/happy_next_sentence.py b/happytransformer/happy_next_sentence.py index 92dcf024..d02f116b 100644 --- a/happytransformer/happy_next_sentence.py +++ b/happytransformer/happy_next_sentence.py @@ -6,7 +6,7 @@ ) from happytransformer.happy_transformer import HappyTransformer - +from happytransformer.adaptors.adaptor import get_adaptor class HappyNextSentence(HappyTransformer): """ @@ -15,11 +15,9 @@ class HappyNextSentence(HappyTransformer): def __init__(self, model_type="BERT", model_name="bert-base-uncased"): - if model_type == "BERT": - model = BertForNextSentencePrediction.from_pretrained(model_name) - tokenizer = BertTokenizerFast.from_pretrained(model_name) - else: - raise ValueError(self.model_type_error) + self.adaptor = get_adaptor(model_type) + model = self.adaptor.NextSentencePrediction.from_pretrained(model_name) + tokenizer = self.adaptor.Tokenizer.from_pretrained(model_name) super().__init__(model_type, model_name, model, tokenizer) self._pipeline = None self._trainer = None From 59db71dea9559bee054923cc5d103c8a3852dccb Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 21:30:11 -0500 Subject: [PATCH 128/155] QA uses adaptors --- happytransformer/adaptors/adaptor.py | 29 ++++++++++++++++++-- happytransformer/happy_question_answering.py | 23 ++++------------ 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/happytransformer/adaptors/adaptor.py b/happytransformer/adaptors/adaptor.py index 28749792..feadb901 100644 --- a/happytransformer/adaptors/adaptor.py +++ b/happytransformer/adaptors/adaptor.py @@ -1,10 +1,17 @@ from typing import Type from transformers import ( PreTrainedModel, - BertForMaskedLM, BertTokenizerFast, BertForNextSentencePrediction, + + BertForMaskedLM, BertTokenizerFast, + BertForNextSentencePrediction, BertForQuestionAnswering, + RobertaForMaskedLM, RobertaTokenizerFast, - AlbertForMaskedLM, AlbertTokenizerFast, - DistilBertForMaskedLM, DistilBertTokenizerFast + RobertaForQuestionAnswering, + + AlbertForMaskedLM, AlbertTokenizerFast, AlbertForQuestionAnswering, + + DistilBertForMaskedLM, DistilBertTokenizerFast, + DistilBertForQuestionAnswering ) from transformers.tokenization_utils import PreTrainedTokenizerBase @@ -25,6 +32,10 @@ def MaskedLM(self)->Type[PreTrainedModel]: def NextSentencePrediction(self)->Type[PreTrainedModel]: raise NotImplementedError() + @property + def QuestionAnswering(self)->Type[PreTrainedModel]: + raise NotImplementedError() + def preprocess_text(self, text:str)->str: return text @@ -41,6 +52,9 @@ def MaskedLM(self): @property def NextSentencePrediction(self): return BertForNextSentencePrediction + @property + def QuestionAnswering(self): + return BertForQuestionAnswering class DistilBertAdaptor(Adaptor): @property @@ 
-49,6 +63,9 @@ def Tokenizer(self): @property def MaskedLM(self): return DistilBertForMaskedLM + @property + def QuestionAnswering(self): + return DistilBertForQuestionAnswering class RobertaAdaptor(Adaptor): @property @@ -57,6 +74,9 @@ def Tokenizer(self): @property def MaskedLM(self): return RobertaForMaskedLM + @property + def QuestionAnswering(self): + return RobertaForQuestionAnswering def preprocess_text(self, text:str)->str: print(text) @@ -72,6 +92,9 @@ def Tokenizer(self): @property def MaskedLM(self): return AlbertForMaskedLM + @property + def QuestionAnswering(self): + return AlbertForQuestionAnswering def postprocess_token(self, text): return text[1:] if text[0] == "▁" else text diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index c838176b..be0a1800 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -17,6 +17,7 @@ QuestionAnsweringPipeline, ) from happytransformer.cuda_detect import detect_cuda_device_number +from happytransformer.adaptors.adaptor import get_adaptor from typing import List from dataclasses import dataclass @@ -41,24 +42,10 @@ class HappyQuestionAnswering(HappyTransformer): """ def __init__(self, model_type="DISTILBERT", model_name="distilbert-base-cased-distilled-squad"): - model = None - tokenizer = None - - if model_type == "ALBERT": - model = AlbertForQuestionAnswering.from_pretrained(model_name) - tokenizer = AlbertTokenizerFast.from_pretrained(model_name) - elif model_type == "BERT": - model = BertForQuestionAnswering.from_pretrained(model_name) - tokenizer = BertTokenizerFast.from_pretrained(model_name) - elif model_type == "DISTILBERT": - model = DistilBertForQuestionAnswering.from_pretrained(model_name) - tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) - elif model_type == "ROBERTA": - model = RobertaForQuestionAnswering.from_pretrained(model_name) - tokenizer = RobertaTokenizerFast.from_pretrained(model_name) - - else: - raise ValueError(self.model_type_error) + + self.adaptor = get_adaptor(model_type) + model = self.adaptor.QuestionAnswering.from_pretrained(model_name) + tokenizer = self.adaptor.Tokenizer.from_pretrained(model_name) super().__init__(model_type, model_name, model, tokenizer) device_number = detect_cuda_device_number() From ec6867e5e2450258c101cd491e3dcb53db9203f6 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 21:36:06 -0500 Subject: [PATCH 129/155] added VScode settings --- .vscode/settings.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..86ab3eb1 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,4 @@ +{ + "python.linting.mypyEnabled": true, + "python.linting.enabled": true +} \ No newline at end of file From 522876b8ce180000527d73e253aa7a11805c78ab Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 21:36:24 -0500 Subject: [PATCH 130/155] text classification uses adaptors --- happytransformer/adaptors/adaptor.py | 22 +++++++++++++++++-- happytransformer/happy_text_classification.py | 21 ++++-------------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/happytransformer/adaptors/adaptor.py b/happytransformer/adaptors/adaptor.py index feadb901..0bd56695 100644 --- a/happytransformer/adaptors/adaptor.py +++ b/happytransformer/adaptors/adaptor.py @@ -4,14 +4,16 @@ BertForMaskedLM, BertTokenizerFast, 
BertForNextSentencePrediction, BertForQuestionAnswering, + BertForSequenceClassification, RobertaForMaskedLM, RobertaTokenizerFast, - RobertaForQuestionAnswering, + RobertaForQuestionAnswering, RobertaForSequenceClassification, AlbertForMaskedLM, AlbertTokenizerFast, AlbertForQuestionAnswering, + AlbertForSequenceClassification, DistilBertForMaskedLM, DistilBertTokenizerFast, - DistilBertForQuestionAnswering + DistilBertForQuestionAnswering, DistilBertForSequenceClassification ) from transformers.tokenization_utils import PreTrainedTokenizerBase @@ -36,6 +38,10 @@ def NextSentencePrediction(self)->Type[PreTrainedModel]: def QuestionAnswering(self)->Type[PreTrainedModel]: raise NotImplementedError() + @property + def SequenceClassification(self)->Type[PreTrainedModel]: + raise NotImplementedError() + def preprocess_text(self, text:str)->str: return text @@ -55,6 +61,9 @@ def NextSentencePrediction(self): @property def QuestionAnswering(self): return BertForQuestionAnswering + @property + def SequenceClassification(self): + return BertForSequenceClassification class DistilBertAdaptor(Adaptor): @property @@ -66,6 +75,9 @@ def MaskedLM(self): @property def QuestionAnswering(self): return DistilBertForQuestionAnswering + @property + def SequenceClassification(self): + return DistilBertForSequenceClassification class RobertaAdaptor(Adaptor): @property @@ -77,6 +89,9 @@ def MaskedLM(self): @property def QuestionAnswering(self): return RobertaForQuestionAnswering + @property + def SequenceClassification(self): + return RobertaForSequenceClassification def preprocess_text(self, text:str)->str: print(text) @@ -95,6 +110,9 @@ def MaskedLM(self): @property def QuestionAnswering(self): return AlbertForQuestionAnswering + @property + def SequenceClassification(self): + return AlbertForSequenceClassification def postprocess_token(self, text): return text[1:] if text[0] == "▁" else text diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index 01e1edc6..55d27372 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -20,6 +20,7 @@ from happytransformer.cuda_detect import detect_cuda_device_number from happytransformer.happy_transformer import HappyTransformer +from happytransformer.adaptors.adaptor import get_adaptor from happytransformer.tc.default_args import ARGS_TC_TRAIN @dataclass @@ -34,25 +35,11 @@ class HappyTextClassification(HappyTransformer): def __init__(self, model_type="DISTILBERT", model_name="distilbert-base-uncased", num_labels=2): - model = None - tokenizer = None + self.adaptor = get_adaptor(model_type) config = AutoConfig.from_pretrained(model_name, num_labels=num_labels) - if model_type == "ALBERT": - model = AlbertForSequenceClassification.from_pretrained(model_name, config=config) - tokenizer = AlbertTokenizerFast.from_pretrained(model_name) - elif model_type == "BERT": - model = BertForSequenceClassification.from_pretrained(model_name, config=config) - tokenizer = BertTokenizerFast.from_pretrained(model_name) - elif model_type == "DISTILBERT": - model = DistilBertForSequenceClassification.from_pretrained(model_name, config=config) - tokenizer = DistilBertTokenizerFast.from_pretrained(model_name) - elif model_type == "ROBERTA": - model = RobertaForSequenceClassification.from_pretrained(model_name) - tokenizer = RobertaTokenizerFast.from_pretrained(model_name) - - else: - raise ValueError(self.model_type_error) + model = 
self.adaptor.SequenceClassification.from_pretrained(model_name, config=config) + tokenizer = self.adaptor.Tokenizer.from_pretrained(model_name) super().__init__(model_type, model_name, model, tokenizer) From d6785d8f6239c0382a1979f7d8d813b5944cb2b3 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 21:38:36 -0500 Subject: [PATCH 131/155] cleaned up imports --- happytransformer/adaptors/__init__.py | 1 + happytransformer/happy_next_sentence.py | 2 +- happytransformer/happy_question_answering.py | 2 +- happytransformer/happy_text_classification.py | 2 +- happytransformer/happy_word_prediction.py | 2 +- 5 files changed, 5 insertions(+), 4 deletions(-) create mode 100644 happytransformer/adaptors/__init__.py diff --git a/happytransformer/adaptors/__init__.py b/happytransformer/adaptors/__init__.py new file mode 100644 index 00000000..2932665b --- /dev/null +++ b/happytransformer/adaptors/__init__.py @@ -0,0 +1 @@ +from .adaptor import get_adaptor \ No newline at end of file diff --git a/happytransformer/happy_next_sentence.py b/happytransformer/happy_next_sentence.py index d02f116b..09b801ce 100644 --- a/happytransformer/happy_next_sentence.py +++ b/happytransformer/happy_next_sentence.py @@ -6,7 +6,7 @@ ) from happytransformer.happy_transformer import HappyTransformer -from happytransformer.adaptors.adaptor import get_adaptor +from happytransformer.adaptors import get_adaptor class HappyNextSentence(HappyTransformer): """ diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index be0a1800..b566517b 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -17,7 +17,7 @@ QuestionAnsweringPipeline, ) from happytransformer.cuda_detect import detect_cuda_device_number -from happytransformer.adaptors.adaptor import get_adaptor +from happytransformer.adaptors import get_adaptor from typing import List from dataclasses import dataclass diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index 55d27372..8b498edf 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -20,7 +20,7 @@ from happytransformer.cuda_detect import detect_cuda_device_number from happytransformer.happy_transformer import HappyTransformer -from happytransformer.adaptors.adaptor import get_adaptor +from happytransformer.adaptors import get_adaptor from happytransformer.tc.default_args import ARGS_TC_TRAIN @dataclass diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index 78a38fee..c8759f02 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -14,7 +14,7 @@ from happytransformer.happy_transformer import HappyTransformer from happytransformer.mwp.trainer import WPTrainer from happytransformer.cuda_detect import detect_cuda_device_number -from happytransformer.adaptors.adaptor import get_adaptor +from happytransformer.adaptors import get_adaptor from typing import List,Optional @dataclass From bc0eacb426cad2d2f01b8cccc99126c003e22176 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 21:45:55 -0500 Subject: [PATCH 132/155] named adaptor functions better --- happytransformer/adaptors/adaptor.py | 10 +++++----- happytransformer/happy_word_prediction.py | 7 ++++--- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/happytransformer/adaptors/adaptor.py 
b/happytransformer/adaptors/adaptor.py index 0bd56695..0a86bfd3 100644 --- a/happytransformer/adaptors/adaptor.py +++ b/happytransformer/adaptors/adaptor.py @@ -42,10 +42,10 @@ def QuestionAnswering(self)->Type[PreTrainedModel]: def SequenceClassification(self)->Type[PreTrainedModel]: raise NotImplementedError() - def preprocess_text(self, text:str)->str: + def preprocess_mask_text(self, text:str)->str: return text - def postprocess_token(self, text:str)->str: + def postprocess_mask_prediction_token(self, text:str)->str: return text class BertAdaptor(Adaptor): @@ -93,11 +93,11 @@ def QuestionAnswering(self): def SequenceClassification(self): return RobertaForSequenceClassification - def preprocess_text(self, text:str)->str: + def preprocess_mask_text(self, text:str)->str: print(text) return text.replace('[MASK]','') - def postprocess_token(self, text): + def postprocess_mask_prediction_token(self, text): return text[1:] if text[0] == "Ġ" else text class AlbertAdaptor(Adaptor): @@ -114,7 +114,7 @@ def QuestionAnswering(self): def SequenceClassification(self): return AlbertForSequenceClassification - def postprocess_token(self, text): + def postprocess_mask_prediction_token(self, text): return text[1:] if text[0] == "▁" else text ADAPTORS = { diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index c8759f02..634aa626 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -54,15 +54,16 @@ def predict_mask(self, if not isinstance(text, str): raise ValueError('the "text" argument must be a single string') - text_for_pipeline = self.adaptor.preprocess_text(text) + text_for_pipeline = self.adaptor.preprocess_mask_text(text) answers = self._pipeline( text_for_pipeline, targets=targets, top_k=top_k ) - + + fix_token = self.adaptor.postprocess_mask_prediction_token return [ WordPredictionResult( - token=self.adaptor.postprocess_token(answer["token_str"]), + token=fix_token(answer["token_str"]), score=answer["score"] ) for answer in answers From e27c3bcb2c2360464371d05178856e53f6d1b747 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Tue, 12 Jan 2021 21:51:06 -0500 Subject: [PATCH 133/155] Updated gitignore --- .gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.gitignore b/.gitignore index 9fe17bce..e3ec9d18 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +examples/question_answering/train.csv +examples/question_answering/eval.csv +runs/ +../.idea/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] From 9e0bf009d639dd57603e99bf9c850a2e64e64227 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Tue, 12 Jan 2021 21:54:06 -0500 Subject: [PATCH 134/155] Removed ../.idea/ --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index e3ec9d18..f526ba0e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ examples/question_answering/train.csv examples/question_answering/eval.csv runs/ -../.idea/ # Byte-compiled / optimized / DLL files __pycache__/ From d665e1298a3a6ebe470b8d42f31eaccb9d764d55 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 21:57:54 -0500 Subject: [PATCH 135/155] separated adaptors --- happytransformer/adaptors/__init__.py | 17 ++++- happytransformer/adaptors/adaptor.py | 101 +------------------------- happytransformer/adaptors/albert.py | 22 ++++++ happytransformer/adaptors/berts.py | 64 ++++++++++++++++ 4 files changed, 104 insertions(+), 100 deletions(-) create mode 100644 
happytransformer/adaptors/albert.py create mode 100644 happytransformer/adaptors/berts.py diff --git a/happytransformer/adaptors/__init__.py b/happytransformer/adaptors/__init__.py index 2932665b..080b7a48 100644 --- a/happytransformer/adaptors/__init__.py +++ b/happytransformer/adaptors/__init__.py @@ -1 +1,16 @@ -from .adaptor import get_adaptor \ No newline at end of file +from .adaptor import Adaptor +from .albert import AlbertAdaptor +from .berts import BertAdaptor,DistilBertAdaptor,RobertaAdaptor + +ADAPTORS = { + 'BERT':BertAdaptor(), + 'DISTILBERT':DistilBertAdaptor(), + 'ROBERTA':RobertaAdaptor(), + 'ALBERT':AlbertAdaptor() +} + +def get_adaptor(model_type:str)->Adaptor: + if model_type in ADAPTORS: + return ADAPTORS[model_type] + else: + raise ValueError(f'Model type <{model_type}> not currently supported') \ No newline at end of file diff --git a/happytransformer/adaptors/adaptor.py b/happytransformer/adaptors/adaptor.py index 0a86bfd3..44fcc670 100644 --- a/happytransformer/adaptors/adaptor.py +++ b/happytransformer/adaptors/adaptor.py @@ -1,20 +1,5 @@ from typing import Type -from transformers import ( - PreTrainedModel, - - BertForMaskedLM, BertTokenizerFast, - BertForNextSentencePrediction, BertForQuestionAnswering, - BertForSequenceClassification, - - RobertaForMaskedLM, RobertaTokenizerFast, - RobertaForQuestionAnswering, RobertaForSequenceClassification, - - AlbertForMaskedLM, AlbertTokenizerFast, AlbertForQuestionAnswering, - AlbertForSequenceClassification, - - DistilBertForMaskedLM, DistilBertTokenizerFast, - DistilBertForQuestionAnswering, DistilBertForSequenceClassification -) +from transformers import PreTrainedModel from transformers.tokenization_utils import PreTrainedTokenizerBase class Adaptor: @@ -46,86 +31,4 @@ def preprocess_mask_text(self, text:str)->str: return text def postprocess_mask_prediction_token(self, text:str)->str: - return text - -class BertAdaptor(Adaptor): - @property - def Tokenizer(self): - return BertTokenizerFast - @property - def MaskedLM(self): - return BertForMaskedLM - @property - def NextSentencePrediction(self): - return BertForNextSentencePrediction - @property - def QuestionAnswering(self): - return BertForQuestionAnswering - @property - def SequenceClassification(self): - return BertForSequenceClassification - -class DistilBertAdaptor(Adaptor): - @property - def Tokenizer(self): - return DistilBertTokenizerFast - @property - def MaskedLM(self): - return DistilBertForMaskedLM - @property - def QuestionAnswering(self): - return DistilBertForQuestionAnswering - @property - def SequenceClassification(self): - return DistilBertForSequenceClassification - -class RobertaAdaptor(Adaptor): - @property - def Tokenizer(self): - return RobertaTokenizerFast - @property - def MaskedLM(self): - return RobertaForMaskedLM - @property - def QuestionAnswering(self): - return RobertaForQuestionAnswering - @property - def SequenceClassification(self): - return RobertaForSequenceClassification - - def preprocess_mask_text(self, text:str)->str: - print(text) - return text.replace('[MASK]','') - - def postprocess_mask_prediction_token(self, text): - return text[1:] if text[0] == "Ġ" else text - -class AlbertAdaptor(Adaptor): - @property - def Tokenizer(self): - return AlbertTokenizerFast - @property - def MaskedLM(self): - return AlbertForMaskedLM - @property - def QuestionAnswering(self): - return AlbertForQuestionAnswering - @property - def SequenceClassification(self): - return AlbertForSequenceClassification - - def 
postprocess_mask_prediction_token(self, text): - return text[1:] if text[0] == "▁" else text - -ADAPTORS = { - 'BERT':BertAdaptor(), - 'DISTILBERT':DistilBertAdaptor(), - 'ROBERTA':RobertaAdaptor(), - 'ALBERT':AlbertAdaptor() -} - -def get_adaptor(model_type:str)->Adaptor: - if model_type in ADAPTORS: - return ADAPTORS[model_type] - else: - raise ValueError(f'Model type <{model_type}> not currently supported') \ No newline at end of file + return text \ No newline at end of file diff --git a/happytransformer/adaptors/albert.py b/happytransformer/adaptors/albert.py new file mode 100644 index 00000000..3e64b1d6 --- /dev/null +++ b/happytransformer/adaptors/albert.py @@ -0,0 +1,22 @@ +from transformers import ( + AlbertForMaskedLM, AlbertTokenizerFast, AlbertForQuestionAnswering, + AlbertForSequenceClassification +) +from .adaptor import Adaptor + +class AlbertAdaptor(Adaptor): + @property + def Tokenizer(self): + return AlbertTokenizerFast + @property + def MaskedLM(self): + return AlbertForMaskedLM + @property + def QuestionAnswering(self): + return AlbertForQuestionAnswering + @property + def SequenceClassification(self): + return AlbertForSequenceClassification + + def postprocess_mask_prediction_token(self, text): + return text[1:] if text[0] == "▁" else text \ No newline at end of file diff --git a/happytransformer/adaptors/berts.py b/happytransformer/adaptors/berts.py new file mode 100644 index 00000000..24289001 --- /dev/null +++ b/happytransformer/adaptors/berts.py @@ -0,0 +1,64 @@ +from .adaptor import Adaptor +from transformers import ( + BertForMaskedLM, BertTokenizerFast, + BertForNextSentencePrediction, BertForQuestionAnswering, + BertForSequenceClassification, + + DistilBertForMaskedLM, DistilBertTokenizerFast, + DistilBertForSequenceClassification, DistilBertForQuestionAnswering, + + RobertaForMaskedLM, RobertaTokenizerFast, + RobertaForQuestionAnswering, RobertaForSequenceClassification, +) + +class BertAdaptor(Adaptor): + @property + def Tokenizer(self): + return BertTokenizerFast + @property + def MaskedLM(self): + return BertForMaskedLM + @property + def NextSentencePrediction(self): + return BertForNextSentencePrediction + @property + def QuestionAnswering(self): + return BertForQuestionAnswering + @property + def SequenceClassification(self): + return BertForSequenceClassification + +class DistilBertAdaptor(Adaptor): + @property + def Tokenizer(self): + return DistilBertTokenizerFast + @property + def MaskedLM(self): + return DistilBertForMaskedLM + @property + def QuestionAnswering(self): + return DistilBertForQuestionAnswering + @property + def SequenceClassification(self): + return DistilBertForSequenceClassification + +class RobertaAdaptor(Adaptor): + @property + def Tokenizer(self): + return RobertaTokenizerFast + @property + def MaskedLM(self): + return RobertaForMaskedLM + @property + def QuestionAnswering(self): + return RobertaForQuestionAnswering + @property + def SequenceClassification(self): + return RobertaForSequenceClassification + + def preprocess_mask_text(self, text:str)->str: + print(text) + return text.replace('[MASK]','') + + def postprocess_mask_prediction_token(self, text): + return text[1:] if text[0] == "Ġ" else text \ No newline at end of file From af0b1342e3564198a8791f947902be8bdd15afb6 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 22:05:50 -0500 Subject: [PATCH 136/155] remove unecessary imports --- happytransformer/happy_next_sentence.py | 5 ----- happytransformer/happy_question_answering.py | 19 +++++-------------- 
happytransformer/happy_text_classification.py | 15 ++------------- happytransformer/happy_word_prediction.py | 18 +++++------------- 4 files changed, 12 insertions(+), 45 deletions(-) diff --git a/happytransformer/happy_next_sentence.py b/happytransformer/happy_next_sentence.py index 09b801ce..ad63c942 100644 --- a/happytransformer/happy_next_sentence.py +++ b/happytransformer/happy_next_sentence.py @@ -1,9 +1,4 @@ import torch -from transformers import ( - BertTokenizerFast, - BertForNextSentencePrediction, - -) from happytransformer.happy_transformer import HappyTransformer from happytransformer.adaptors import get_adaptor diff --git a/happytransformer/happy_question_answering.py b/happytransformer/happy_question_answering.py index b566517b..89e6b9c9 100644 --- a/happytransformer/happy_question_answering.py +++ b/happytransformer/happy_question_answering.py @@ -2,26 +2,17 @@ Contains the HappyQuestionAnswering class. """ +from typing import List +from dataclasses import dataclass +from transformers import QuestionAnsweringPipeline + from happytransformer.happy_transformer import HappyTransformer from happytransformer.qa.trainer import QATrainer from happytransformer.qa.default_args import ARGS_QA_TRAIN -from transformers import ( - BertForQuestionAnswering, - BertTokenizerFast, - DistilBertForQuestionAnswering, - DistilBertTokenizerFast, - AlbertForQuestionAnswering, - AlbertTokenizerFast, - RobertaForQuestionAnswering, - RobertaTokenizerFast, - QuestionAnsweringPipeline, -) + from happytransformer.cuda_detect import detect_cuda_device_number from happytransformer.adaptors import get_adaptor -from typing import List -from dataclasses import dataclass - @dataclass class QuestionAnsweringResult: answer: str diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index 8b498edf..0d626077 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -2,23 +2,12 @@ Contains a class called HappyTextClassification that performs text classification """ from dataclasses import dataclass + import torch +from transformers import TextClassificationPipeline -from transformers import ( - BertForSequenceClassification, - BertTokenizerFast, - DistilBertForSequenceClassification, - DistilBertTokenizerFast, - AlbertForSequenceClassification, - AlbertTokenizerFast, - AutoConfig, - RobertaForSequenceClassification, - RobertaTokenizerFast, - TextClassificationPipeline -) from happytransformer.tc.trainer import TCTrainer from happytransformer.cuda_detect import detect_cuda_device_number - from happytransformer.happy_transformer import HappyTransformer from happytransformer.adaptors import get_adaptor from happytransformer.tc.default_args import ARGS_TC_TRAIN diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index 634aa626..c63c76d8 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -1,21 +1,13 @@ -from transformers import ( - BertForMaskedLM, - BertTokenizerFast, - AlbertForMaskedLM, - AlbertTokenizerFast, - DistilBertForMaskedLM, - DistilBertTokenizerFast, - RobertaForMaskedLM, - RobertaTokenizerFast, - FillMaskPipeline, -) -import torch +from typing import List,Optional from dataclasses import dataclass + +import torch +from transformers import FillMaskPipeline + from happytransformer.happy_transformer import HappyTransformer from happytransformer.mwp.trainer import WPTrainer from 
happytransformer.cuda_detect import detect_cuda_device_number from happytransformer.adaptors import get_adaptor -from typing import List,Optional @dataclass class WordPredictionResult: From 61e0a14b50abfe458e7ec0b32550b67aeaff30ae Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 22:08:44 -0500 Subject: [PATCH 137/155] fixed missing import --- happytransformer/happy_text_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/happytransformer/happy_text_classification.py b/happytransformer/happy_text_classification.py index 0d626077..f3fbdde1 100644 --- a/happytransformer/happy_text_classification.py +++ b/happytransformer/happy_text_classification.py @@ -4,7 +4,7 @@ from dataclasses import dataclass import torch -from transformers import TextClassificationPipeline +from transformers import TextClassificationPipeline, AutoConfig from happytransformer.tc.trainer import TCTrainer from happytransformer.cuda_detect import detect_cuda_device_number From 3fa8b131ea0c29f1738a2556456be74572ad5342 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 22:17:12 -0500 Subject: [PATCH 138/155] simplified assignment --- happytransformer/adaptors/albert.py | 16 +++------ happytransformer/adaptors/berts.py | 52 ++++++++--------------------- 2 files changed, 17 insertions(+), 51 deletions(-) diff --git a/happytransformer/adaptors/albert.py b/happytransformer/adaptors/albert.py index 3e64b1d6..e290eb3f 100644 --- a/happytransformer/adaptors/albert.py +++ b/happytransformer/adaptors/albert.py @@ -5,18 +5,10 @@ from .adaptor import Adaptor class AlbertAdaptor(Adaptor): - @property - def Tokenizer(self): - return AlbertTokenizerFast - @property - def MaskedLM(self): - return AlbertForMaskedLM - @property - def QuestionAnswering(self): - return AlbertForQuestionAnswering - @property - def SequenceClassification(self): - return AlbertForSequenceClassification + Tokenizer = AlbertTokenizerFast + MaskedLM = AlbertForMaskedLM + QuestionAnswering = AlbertForQuestionAnswering + SequenceClassification = AlbertForSequenceClassification def postprocess_mask_prediction_token(self, text): return text[1:] if text[0] == "▁" else text \ No newline at end of file diff --git a/happytransformer/adaptors/berts.py b/happytransformer/adaptors/berts.py index 24289001..40b2e546 100644 --- a/happytransformer/adaptors/berts.py +++ b/happytransformer/adaptors/berts.py @@ -12,49 +12,23 @@ ) class BertAdaptor(Adaptor): - @property - def Tokenizer(self): - return BertTokenizerFast - @property - def MaskedLM(self): - return BertForMaskedLM - @property - def NextSentencePrediction(self): - return BertForNextSentencePrediction - @property - def QuestionAnswering(self): - return BertForQuestionAnswering - @property - def SequenceClassification(self): - return BertForSequenceClassification + Tokenizer = BertTokenizerFast + MaskedLM = BertForMaskedLM + NextSentencePrediction = BertForNextSentencePrediction + QuestionAnswering = BertForQuestionAnswering + SequenceClassification = BertForSequenceClassification class DistilBertAdaptor(Adaptor): - @property - def Tokenizer(self): - return DistilBertTokenizerFast - @property - def MaskedLM(self): - return DistilBertForMaskedLM - @property - def QuestionAnswering(self): - return DistilBertForQuestionAnswering - @property - def SequenceClassification(self): - return DistilBertForSequenceClassification + Tokenizer = DistilBertTokenizerFast + MaskedLM = DistilBertForMaskedLM + QuestionAnswering = DistilBertForQuestionAnswering + 
SequenceClassification = DistilBertForSequenceClassification
 
 class RobertaAdaptor(Adaptor):
-    @property
-    def Tokenizer(self):
-        return RobertaTokenizerFast
-    @property
-    def MaskedLM(self):
-        return RobertaForMaskedLM
-    @property
-    def QuestionAnswering(self):
-        return RobertaForQuestionAnswering
-    @property
-    def SequenceClassification(self):
-        return RobertaForSequenceClassification
+    Tokenizer = RobertaTokenizerFast
+    MaskedLM = RobertaForMaskedLM
+    QuestionAnswering = RobertaForQuestionAnswering
+    SequenceClassification = RobertaForSequenceClassification
 
     def preprocess_mask_text(self, text:str)->str:
         print(text)

From 6cb3a3daf764b31be89e25d1d5284782fe7e7ec1 Mon Sep 17 00:00:00 2001
From: Ted Brownlow
Date: Tue, 12 Jan 2021 22:20:57 -0500
Subject: [PATCH 139/155] use static methods

---
 happytransformer/adaptors/albert.py | 3 ++-
 happytransformer/adaptors/berts.py  | 7 ++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/happytransformer/adaptors/albert.py b/happytransformer/adaptors/albert.py
index e290eb3f..239d3a66 100644
--- a/happytransformer/adaptors/albert.py
+++ b/happytransformer/adaptors/albert.py
@@ -10,5 +10,6 @@ class AlbertAdaptor(Adaptor):
     QuestionAnswering = AlbertForQuestionAnswering
     SequenceClassification = AlbertForSequenceClassification
 
-    def postprocess_mask_prediction_token(self, text):
+    @staticmethod
+    def postprocess_mask_prediction_token(text):
         return text[1:] if text[0] == "▁" else text
\ No newline at end of file

diff --git a/happytransformer/adaptors/berts.py b/happytransformer/adaptors/berts.py
index 40b2e546..410e8f1b 100644
--- a/happytransformer/adaptors/berts.py
+++ b/happytransformer/adaptors/berts.py
@@ -30,9 +30,10 @@ class RobertaAdaptor(Adaptor):
     QuestionAnswering = RobertaForQuestionAnswering
     SequenceClassification = RobertaForSequenceClassification
 
-    def preprocess_mask_text(self, text:str)->str:
-        print(text)
+    @staticmethod
+    def preprocess_mask_text(text):
         return text.replace('[MASK]','')
 
-    def postprocess_mask_prediction_token(self, text):
+    @staticmethod
+    def postprocess_mask_prediction_token(text):
         return text[1:] if text[0] == "Ġ" else text
\ No newline at end of file

From 8280f4f6f7369e5acc3064b186a0c2a8ba53234b Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Tue, 12 Jan 2021 22:21:48 -0500
Subject: [PATCH 140/155] Better use of split arg

---
 examples/question_answering/training_example.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/examples/question_answering/training_example.py b/examples/question_answering/training_example.py
index 25dac577..633529a5 100644
--- a/examples/question_answering/training_example.py
+++ b/examples/question_answering/training_example.py
@@ -8,11 +8,11 @@ def main():
     train_csv_path = "train.csv"
     eval_csv_path = "eval.csv"
 
-    train_dataset = load_dataset('squad', split='train')
-    eval_dataset = load_dataset('squad', split='validation')
+    train_dataset = load_dataset('squad', split='train[0:499]')
+    eval_dataset = load_dataset('squad', split='validation[0:99]')
 
-    generate_csv(train_csv_path, train_dataset, 500)
-    generate_csv(eval_csv_path, eval_dataset, 100)
+    generate_csv(train_csv_path, train_dataset)
+    generate_csv(eval_csv_path, eval_dataset)
 
     happy_qa = HappyQuestionAnswering(model_type="BERT", model_name="bert-base-uncased")
     before_loss = happy_qa.eval(eval_csv_path).loss
     happy_qa.train(train_csv_path)
@@ -24,11 +24,10 @@ def main():
     assert after_loss < before_loss
 
 
-def generate_csv(csv_path, dataset, count):
+def generate_csv(csv_path, dataset):
     with open(csv_path, 'w', newline='') as csvfile:
         writer = csv.writer(csvfile)
         writer.writerow(["context", "question", "answer_text", "answer_start"])
-        i = 0
         for case in dataset:
             context = case["context"]
             question = case["question"]
@@ -36,10 +35,6 @@ def generate_csv(csv_path, dataset):
             answer_start = case["answers"]["answer_start"][0]
             writer.writerow([context, question, answer_text, answer_start])
 
-            i += 1
-            if i == count:
-                break
-
 
 if __name__ == "__main__":
     main()

From 7d889085d81ec738c19db4163f545ce5e9e87878 Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Tue, 12 Jan 2021 22:54:27 -0500
Subject: [PATCH 141/155] Cleaning

---
 happytransformer/adaptors/__init__.py     | 10 +++++-----
 happytransformer/adaptors/adaptor.py      | 14 +++++++-------
 happytransformer/adaptors/berts.py        |  4 ++--
 happytransformer/happy_word_prediction.py | 12 ++++--------
 4 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/happytransformer/adaptors/__init__.py b/happytransformer/adaptors/__init__.py
index 080b7a48..538ea723 100644
--- a/happytransformer/adaptors/__init__.py
+++ b/happytransformer/adaptors/__init__.py
@@ -1,12 +1,12 @@
 from .adaptor import Adaptor
 from .albert import AlbertAdaptor
-from .berts import BertAdaptor,DistilBertAdaptor,RobertaAdaptor
+from .berts import BertAdaptor, DistilBertAdaptor, RobertaAdaptor
 
 ADAPTORS = {
-    'BERT':BertAdaptor(),
-    'DISTILBERT':DistilBertAdaptor(),
-    'ROBERTA':RobertaAdaptor(),
-    'ALBERT':AlbertAdaptor()
+    'BERT': BertAdaptor(),
+    'DISTILBERT': DistilBertAdaptor(),
+    'ROBERTA': RobertaAdaptor(),
+    'ALBERT': AlbertAdaptor()
 }
 
 def get_adaptor(model_type:str)->Adaptor:

diff --git a/happytransformer/adaptors/adaptor.py b/happytransformer/adaptors/adaptor.py
index 44fcc670..ce8308f6 100644
--- a/happytransformer/adaptors/adaptor.py
+++ b/happytransformer/adaptors/adaptor.py
@@ -8,27 +8,27 @@ class Adaptor:
     Does NOT store any state.
     
''' @property - def Tokenizer(self)->Type[PreTrainedTokenizerBase]: + def Tokenizer(self) -> Type[PreTrainedTokenizerBase]: raise NotImplementedError() @property - def MaskedLM(self)->Type[PreTrainedModel]: + def MaskedLM(self) -> Type[PreTrainedModel]: raise NotImplementedError() @property - def NextSentencePrediction(self)->Type[PreTrainedModel]: + def NextSentencePrediction(self) -> Type[PreTrainedModel]: raise NotImplementedError() @property - def QuestionAnswering(self)->Type[PreTrainedModel]: + def QuestionAnswering(self) -> Type[PreTrainedModel]: raise NotImplementedError() @property - def SequenceClassification(self)->Type[PreTrainedModel]: + def SequenceClassification(self) -> Type[PreTrainedModel]: raise NotImplementedError() - def preprocess_mask_text(self, text:str)->str: + def preprocess_mask_text(self, text: str)-> str: return text - def postprocess_mask_prediction_token(self, text:str)->str: + def postprocess_mask_prediction_token(self, text: str) -> str: return text \ No newline at end of file diff --git a/happytransformer/adaptors/berts.py b/happytransformer/adaptors/berts.py index 410e8f1b..cc0434d0 100644 --- a/happytransformer/adaptors/berts.py +++ b/happytransformer/adaptors/berts.py @@ -12,7 +12,7 @@ ) class BertAdaptor(Adaptor): - Tokenizer = BertTokenizerFast + Tokenizer = BertTokenizerFast MaskedLM = BertForMaskedLM NextSentencePrediction = BertForNextSentencePrediction QuestionAnswering = BertForQuestionAnswering @@ -32,7 +32,7 @@ class RobertaAdaptor(Adaptor): @staticmethod def preprocess_mask_text(text): - return text.replace('[MASK]','') + return text.replace('[MASK]', '') @staticmethod def postprocess_mask_prediction_token(text): diff --git a/happytransformer/happy_word_prediction.py b/happytransformer/happy_word_prediction.py index c63c76d8..8b181d17 100644 --- a/happytransformer/happy_word_prediction.py +++ b/happytransformer/happy_word_prediction.py @@ -1,7 +1,6 @@ from typing import List,Optional from dataclasses import dataclass -import torch from transformers import FillMaskPipeline from happytransformer.happy_transformer import HappyTransformer @@ -11,16 +10,15 @@ @dataclass class WordPredictionResult: - token:str - score:float + token: str + score: float class HappyWordPrediction(HappyTransformer): """ A user facing class for text classification """ def __init__( - self, model_type:str="DISTILBERT", - model_name:str="distilbert-base-uncased"): + self, model_type: str = "DISTILBERT", model_name: str = "distilbert-base-uncased"): self.adaptor = get_adaptor(model_type) model = self.adaptor.MaskedLM.from_pretrained(model_name) @@ -34,9 +32,7 @@ def __init__( self._trainer = WPTrainer(model, model_type, tokenizer, self._device, self.logger) - def predict_mask(self, - text:str, targets:Optional[List[str]]=None, top_k:int=1 - ) -> List[WordPredictionResult]: + def predict_mask(self, text: str, targets: Optional[List[str]] = None, top_k: int = 1) -> List[WordPredictionResult]: """ Predict [MASK] tokens in a string. targets limit possible guesses if supplied. 
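
With the cleanup above in place, the adaptor pattern is complete: a Happy* class never branches on model type, it just asks its adaptor for the right classes and string hooks. A minimal sketch of that flow (the model name and token string are illustrative):

```python
from happytransformer.adaptors import get_adaptor

adaptor = get_adaptor('ROBERTA')

# The adaptor hands back class references, not instances,
# so nothing is downloaded until from_pretrained is called
model = adaptor.MaskedLM.from_pretrained('roberta-base')
tokenizer = adaptor.Tokenizer.from_pretrained('roberta-base')

# Model-specific token conventions live in the string hooks,
# e.g. RoBERTa's leading "Ġ" is stripped from predicted tokens
print(adaptor.postprocess_mask_prediction_token("Ġcapital"))  # capital
```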
From e6c554ea49d4669c97dd3a2f6b503b6f7d0afcbc Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 23:18:31 -0500 Subject: [PATCH 142/155] remove settings --- .vscode/settings.json | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 .vscode/settings.json diff --git a/.vscode/settings.json b/.vscode/settings.json deleted file mode 100644 index 86ab3eb1..00000000 --- a/.vscode/settings.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "python.linting.mypyEnabled": true, - "python.linting.enabled": true -} \ No newline at end of file From d8aba2d1067880fe56ef1d90c6aeea841d5c3813 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 23:18:31 -0500 Subject: [PATCH 143/155] add settings to gitignore --- .gitignore | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 9fe17bce..1a242fe2 100644 --- a/.gitignore +++ b/.gitignore @@ -126,4 +126,6 @@ venv.bak/ dmypy.json # Pyre type checker -.pyre/ \ No newline at end of file +.pyre/ + +.vscode/settings.json \ No newline at end of file From 5b5354e2fa6cf10295f60eb24a676ee0f5dd1dd5 Mon Sep 17 00:00:00 2001 From: Eric Fillion Date: Tue, 12 Jan 2021 23:28:54 -0500 Subject: [PATCH 144/155] Added tc training example --- .gitignore | 3 ++ .../text_classification/training_example.py | 40 +++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 examples/text_classification/training_example.py diff --git a/.gitignore b/.gitignore index 9fe17bce..2d1bc584 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +examples/text_classification/train.csv +examples/text_classification/eval.csv + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/examples/text_classification/training_example.py b/examples/text_classification/training_example.py new file mode 100644 index 00000000..24388106 --- /dev/null +++ b/examples/text_classification/training_example.py @@ -0,0 +1,40 @@ +from datasets import load_dataset +import csv +from happytransformer.happy_text_classification import HappyTextClassification + + +def main(): + train_csv_path = "train.csv" + eval_csv_path = "eval.csv" + + train_dataset = load_dataset('go_emotions', split='train[0:1999]') + eval_dataset = load_dataset('go_emotions', split='validation[0:399]') + + generate_csv(train_csv_path, train_dataset) + generate_csv(eval_csv_path, eval_dataset) + + happy_tc = HappyTextClassification(model_type="BERT", model_name="bert-base-uncased", num_labels=28) + + before_loss = happy_tc.eval(eval_csv_path) + happy_tc.train(train_csv_path) + after_loss = happy_tc.eval(eval_csv_path) + + print("Before loss: ", before_loss) + print("After loss: ", after_loss) + assert after_loss < before_loss + + +def generate_csv(csv_path, dataset): + with open(csv_path, 'w', newline='') as csvfile: + writter = csv.writer(csvfile) + writter.writerow(["text", "label"]) + for case in dataset: + # some cases have multiple labels, + # so each one becomes its own training case + for label in case["labels"]: + text = case["text"] + writter.writerow([text, label]) + + +if __name__ == "__main__": + main() \ No newline at end of file From c2c65fa9fe09e15f4147c158907be35c1ea4da59 Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 23:29:36 -0500 Subject: [PATCH 145/155] improved error messages --- happytransformer/adaptors/adaptor.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/happytransformer/adaptors/adaptor.py b/happytransformer/adaptors/adaptor.py index ce8308f6..b3ce6af4 100644 --- 
a/happytransformer/adaptors/adaptor.py +++ b/happytransformer/adaptors/adaptor.py @@ -9,23 +9,25 @@ class Adaptor: ''' @property def Tokenizer(self) -> Type[PreTrainedTokenizerBase]: + # this should be NotImplementedError because + # all Adaptors should have a Tokenizer raise NotImplementedError() @property def MaskedLM(self) -> Type[PreTrainedModel]: - raise NotImplementedError() + raise ValueError('This model does not support word prediction') @property def NextSentencePrediction(self) -> Type[PreTrainedModel]: - raise NotImplementedError() + raise ValueError('This model does not support next sentence prediction') @property def QuestionAnswering(self) -> Type[PreTrainedModel]: - raise NotImplementedError() + raise ValueError('This model does not support question answering') @property def SequenceClassification(self) -> Type[PreTrainedModel]: - raise NotImplementedError() + raise ValueError('This model does not support sequence classification') def preprocess_mask_text(self, text: str)-> str: return text From 27e368c4a7ce0a74a36ffb7ddefcc61f3e05fafc Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Tue, 12 Jan 2021 23:37:54 -0500 Subject: [PATCH 146/155] berts are back together --- happytransformer/adaptors/__init__.py | 3 +-- happytransformer/adaptors/albert.py | 15 --------------- happytransformer/adaptors/berts.py | 15 ++++++++++++++- 3 files changed, 15 insertions(+), 18 deletions(-) delete mode 100644 happytransformer/adaptors/albert.py diff --git a/happytransformer/adaptors/__init__.py b/happytransformer/adaptors/__init__.py index 538ea723..297f6e4f 100644 --- a/happytransformer/adaptors/__init__.py +++ b/happytransformer/adaptors/__init__.py @@ -1,6 +1,5 @@ from .adaptor import Adaptor -from .albert import AlbertAdaptor -from .berts import BertAdaptor, DistilBertAdaptor, RobertaAdaptor +from .berts import BertAdaptor, DistilBertAdaptor, RobertaAdaptor, AlbertAdaptor ADAPTORS = { 'BERT': BertAdaptor(), diff --git a/happytransformer/adaptors/albert.py b/happytransformer/adaptors/albert.py deleted file mode 100644 index 239d3a66..00000000 --- a/happytransformer/adaptors/albert.py +++ /dev/null @@ -1,15 +0,0 @@ -from transformers import ( - AlbertForMaskedLM, AlbertTokenizerFast, AlbertForQuestionAnswering, - AlbertForSequenceClassification -) -from .adaptor import Adaptor - -class AlbertAdaptor(Adaptor): - Tokenizer = AlbertTokenizerFast - MaskedLM = AlbertForMaskedLM - QuestionAnswering = AlbertForQuestionAnswering - SequenceClassification = AlbertForSequenceClassification - - @staticmethod - def postprocess_mask_prediction_token(text): - return text[1:] if text[0] == "▁" else text \ No newline at end of file diff --git a/happytransformer/adaptors/berts.py b/happytransformer/adaptors/berts.py index cc0434d0..10d67410 100644 --- a/happytransformer/adaptors/berts.py +++ b/happytransformer/adaptors/berts.py @@ -9,6 +9,9 @@ RobertaForMaskedLM, RobertaTokenizerFast, RobertaForQuestionAnswering, RobertaForSequenceClassification, + + AlbertForMaskedLM, AlbertTokenizerFast, AlbertForQuestionAnswering, + AlbertForSequenceClassification ) class BertAdaptor(Adaptor): @@ -36,4 +39,14 @@ def preprocess_mask_text(text): @staticmethod def postprocess_mask_prediction_token(text): - return text[1:] if text[0] == "Ġ" else text \ No newline at end of file + return text[1:] if text[0] == "Ġ" else text + +class AlbertAdaptor(Adaptor): + Tokenizer = AlbertTokenizerFast + MaskedLM = AlbertForMaskedLM + QuestionAnswering = AlbertForQuestionAnswering + SequenceClassification = AlbertForSequenceClassification 
+ + @staticmethod + def postprocess_mask_prediction_token(text): + return text[1:] if text[0] == "▁" else text \ No newline at end of file
From e22731965d4d6e0b26bd8a18a91e12bf9c1dc91c Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Wed, 13 Jan 2021 00:20:20 -0500
Subject: [PATCH 147/155] Updated output descriptions

---
 README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 5eb3f1d3..efcde096 100644
--- a/README.md
+++ b/README.md
@@ -114,7 +114,7 @@ The method predict_masks() contains 3 arguments: 3. top_k (int): the number of results that will be returned Returns: -A dataclass with variables "token_str" and "top_k" +A list of objects with fields "token_str" and "top_k" Note: if targets are provided, then top_k will be ignored and a score for each target will be returned. @@ -202,7 +202,7 @@ Input: 1. text (string): Text that will be classified Returns: -A dataclass with variables "label" and "score" +An object with fields "label" and "score" #### Example 2.1: ```python @@ -278,7 +278,7 @@ Input: output: -A dataclass with a variable called "loss" +An object with the field "loss" #### Example 2.3: ```python @@ -388,7 +388,7 @@ Inputs: 3. top_k (int): the number of results that will be returned (default=1) Returns: - A list of a dataclasses that contains the variables: "answer", "score", "start" and "end." + A list of objects with fields: "answer", "score", "start" and "end." The list is in descending order by score #### Example 3.1:
From b7a77519a50d21c085d87ba8301e5132adf7dedb Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Wed, 13 Jan 2021 00:24:25 -0500
Subject: [PATCH 148/155] Fixed defaulty

---
 happytransformer/sp/__init__.py | 2 +-
 happytransformer/sp/{defauly_args.py => default_args.py} | 0
 happytransformer/sp/trainer.py | 2 +-
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename happytransformer/sp/{defauly_args.py => default_args.py} (100%)

diff --git a/happytransformer/sp/__init__.py b/happytransformer/sp/__init__.py
index 65a0f0ef..ceb98afe 100644
--- a/happytransformer/sp/__init__.py
+++ b/happytransformer/sp/__init__.py
@@ -1,4 +1,4 @@ from .trainer import SPTrainer -from .defauly_args import ARGS_SP_TRAIN +from .default_args import ARGS_SP_TRAIN name = "happytransformer.sp"
diff --git a/happytransformer/sp/defauly_args.py b/happytransformer/sp/default_args.py
similarity index 100%
rename from happytransformer/sp/defauly_args.py
rename to happytransformer/sp/default_args.py
diff --git a/happytransformer/sp/trainer.py b/happytransformer/sp/trainer.py
index e7c6b9c8..b81559ef 100644
--- a/happytransformer/sp/trainer.py
+++ b/happytransformer/sp/trainer.py
@@ -1,5 +1,5 @@ from happytransformer.happy_trainer import HappyTrainer -from happytransformer.sp.defauly_args import ARGS_SP_TRAIN +from happytransformer.sp.default_args import ARGS_SP_TRAIN class SPTrainer(HappyTrainer):
From 1903889d72bb1383842a5c2dba138c8af424a1b4 Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Wed, 13 Jan 2021 00:29:38 -0500
Subject: [PATCH 149/155] changed token_str to token

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index efcde096..bae4c77d 100644
--- a/README.md
+++ b/README.md
@@ -114,7 +114,7 @@ The method predict_masks() contains 3 arguments: 3.
top_k (int): the number of results that will be returned Returns: -A list of objects with fields "token_str" and "top_k" +A list of objects with fields "token" and "score" Note: if targets are provided, then top_k will be ignored and a score for each target will be returned.
From 8b828b7071c41cce2fc152bb6865a68ec51072dd Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Wed, 13 Jan 2021 00:51:30 -0500
Subject: [PATCH 150/155] Added .loss

---
 examples/question_answering/training_example.py | 5 ++---
 examples/text_classification/training_example.py | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/examples/question_answering/training_example.py b/examples/question_answering/training_example.py
index 633529a5..0c0314cc 100644
--- a/examples/question_answering/training_example.py
+++ b/examples/question_answering/training_example.py
@@ -19,9 +19,8 @@ def main(): happy_qa.train(train_csv_path) after_loss = happy_qa.eval(eval_csv_path) - print("Before loss: ", before_loss) - print("After loss: ", after_loss) - assert after_loss < before_loss + print("Before loss: ", before_loss.loss) + print("After loss: ", after_loss.loss) def generate_csv(csv_path, dataset):
diff --git a/examples/text_classification/training_example.py b/examples/text_classification/training_example.py
index 24388106..651a9df3 100644
--- a/examples/text_classification/training_example.py
+++ b/examples/text_classification/training_example.py
@@ -19,9 +19,8 @@ def main(): happy_tc.train(train_csv_path) after_loss = happy_tc.eval(eval_csv_path) - print("Before loss: ", before_loss) - print("After loss: ", after_loss) - assert after_loss < before_loss + print("Before loss: ", before_loss.loss) + print("After loss: ", after_loss.loss) def generate_csv(csv_path, dataset):
From 4a67b94fb1f52a8b09d0c25d871c8ebac51db2f7 Mon Sep 17 00:00:00 2001
From: Eric Fillion
Date: Wed, 13 Jan 2021 01:07:59 -0500
Subject: [PATCH 151/155] Replaced token_str with token

---
 README.md | 12 ++++++------
 examples/word_prediction/readme_examples.py | 12 ++++++------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index bae4c77d..59d2e0e5 100644
--- a/README.md
+++ b/README.md
@@ -126,9 +126,9 @@ from happytransformer import HappyWordPrediction happy_wp = HappyWordPrediction() # default uses distilbert-base-uncased result = happy_wp.predict_mask("I think therefore I [MASK]") print(type(result)) # <class 'list'> - print(result) # [WordPredictionResult(token_str='am', score=0.10172799974679947)] + print(result) # [WordPredictionResult(token='am', score=0.10172799974679947)] print(type(result[0])) # <class 'happytransformer.happy_word_prediction.WordPredictionResult'> - print(result[0]) # [WordPredictionResult(token_str='am', score=0.10172799974679947)] + print(result[0]) # WordPredictionResult(token='am', score=0.10172799974679947) print(result[0].token) # am print(result[0].score) # 0.10172799974679947 @@ -142,8 +142,8 @@ from happytransformer import HappyWordPrediction #--------------------------------------# happy_wp = HappyWordPrediction("ALBERT", "albert-xxlarge-v2") result = happy_wp.predict_mask("To better the world I would invest in [MASK] and education.", top_k=2) -print(result) # [WordPredictionResult(token_str='infrastructure', score=0.09270179271697998), WordPredictionResult(token_str='healthcare', score=0.07219093292951584)] -print(result[1]) # WordPredictionResult(token_str='healthcare', score=0.07219093292951584) +print(result) # [WordPredictionResult(token='infrastructure', score=0.09270179271697998), WordPredictionResult(token='healthcare',
score=0.07219093292951584)] +print(result[1]) # WordPredictionResult(token='healthcare', score=0.07219093292951584) print(result[1].token) # healthcare ``` @@ -155,8 +155,8 @@ from happytransformer import HappyWordPrediction happy_wp = HappyWordPrediction("ALBERT", "albert-xxlarge-v2") targets = ["technology", "healthcare"] result = happy_wp.predict_mask("To better the world I would invest in [MASK] and education.", targets=targets) -print(result) # [WordPredictionResult(token_str='healthcare', score=0.07219093292951584), WordPredictionResult(token_str='technology', score=0.032044216990470886)] -print(result[1]) # WordPredictionResult(token_str='technology', score=0.032044216990470886) +print(result) # [WordPredictionResult(token='healthcare', score=0.07219093292951584), WordPredictionResult(token='technology', score=0.032044216990470886)] +print(result[1]) # WordPredictionResult(token='technology', score=0.032044216990470886) print(result[1].token) # technology diff --git a/examples/word_prediction/readme_examples.py b/examples/word_prediction/readme_examples.py index d3950cc5..ec60f507 100644 --- a/examples/word_prediction/readme_examples.py +++ b/examples/word_prediction/readme_examples.py @@ -12,9 +12,9 @@ def example_1_1(): happy_wp = HappyWordPrediction() # default uses distilbert-base-uncased result = happy_wp.predict_mask("I think therefore I [MASK]") print(type(result)) # - print(result) # [WordPredictionResult(token_str='am', score=0.10172799974679947)] + print(result) # [WordPredictionResult(token='am', score=0.10172799974679947)] print(type(result[0])) # - print(result[0]) # [WordPredictionResult(token_str='am', score=0.10172799974679947)] + print(result[0]) # [WordPredictionResult(token='am', score=0.10172799974679947)] print(result[0].token) # am print(result[0].score) # 0.10172799974679947 @@ -22,8 +22,8 @@ def example_1_1(): def example_1_2(): happy_wp = HappyWordPrediction("ALBERT", "albert-xxlarge-v2") result = happy_wp.predict_mask("To better the world I would invest in [MASK] and education.", top_k=10) - print(result) # [WordPredictionResult(token_str='infrastructure', score=0.09270179271697998), WordPredictionResult(token_str='healthcare', score=0.07219093292951584)] - print(result[1]) # WordPredictionResult(token_str='healthcare', score=0.07219093292951584) + print(result) # [WordPredictionResult(token='infrastructure', score=0.09270179271697998), WordPredictionResult(token='healthcare', score=0.07219093292951584)] + print(result[1]) # WordPredictionResult(token='healthcare', score=0.07219093292951584) print(result[1].token) # healthcare @@ -31,8 +31,8 @@ def example_1_3(): happy_wp = HappyWordPrediction("ALBERT", "albert-xxlarge-v2") targets = ["technology", "healthcare"] result = happy_wp.predict_mask("To better the world I would invest in [MASK] and education.", targets=targets) - print(result) # [WordPredictionResult(token_str='healthcare', score=0.07219093292951584), WordPredictionResult(token_str='technology', score=0.032044216990470886)] - print(result[1]) # WordPredictionResult(token_str='technology', score=0.032044216990470886) + print(result) # [WordPredictionResult(token='healthcare', score=0.07219093292951584), WordPredictionResult(token='technology', score=0.032044216990470886)] + print(result[1]) # WordPredictionResult(token='technology', score=0.032044216990470886) print(result[1].token) # technology From 307b34a1aeebf3b92ff2047b0616b1e2ef154cd6 Mon Sep 17 00:00:00 2001 From: Eric Fillion <38766185+EricFillion@users.noreply.github.com> Date: Wed, 13 Jan 
2021 01:38:44 -0500 Subject: [PATCH 152/155] Updated Version to 2.0.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0fffe130..cbd1e221 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ setup( name = 'happytransformer', packages = find_packages(), - version = '2.0.0a4', + version = '2.0.0', license='Apache 2.0', description = "Happy Transformer is an API built on top of Hugging Face's Transformer library that makes it easy to utilize state-of-the-art NLP models.", long_description= readme, From b79fc48246115c47a613d558ffd796f0f1128129 Mon Sep 17 00:00:00 2001 From: Eric Fillion <38766185+EricFillion@users.noreply.github.com> Date: Wed, 13 Jan 2021 01:54:29 -0500 Subject: [PATCH 153/155] Added sentencepiece --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cbd1e221..da7fea87 100644 --- a/setup.py +++ b/setup.py @@ -24,7 +24,10 @@ 'torch>=1.0', 'tqdm>=4.27', 'transformers>=4.0.0', - 'dataclasses; python_version < "3.7"' + 'dataclasses; python_version < "3.7"', + 'sentencepiece', + + ], classifiers=[ From 8f76ae9a050717232bf70e3b0510292c17354c6b Mon Sep 17 00:00:00 2001 From: Eric Fillion <38766185+EricFillion@users.noreply.github.com> Date: Wed, 13 Jan 2021 02:00:19 -0500 Subject: [PATCH 154/155] Added protobuf --- setup.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/setup.py b/setup.py index da7fea87..81ff2002 100644 --- a/setup.py +++ b/setup.py @@ -26,8 +26,7 @@ 'transformers>=4.0.0', 'dataclasses; python_version < "3.7"', 'sentencepiece', - - + 'protobuf' ], classifiers=[ From 5ce9961216390740b0c1b4429bc80920e6ce21cc Mon Sep 17 00:00:00 2001 From: Ted Brownlow Date: Wed, 13 Jan 2021 02:35:19 -0500 Subject: [PATCH 155/155] removed outdated files --- tests/__init__.py | 0 tests/error.py | 6 ------ tests/test_mlm.py | 17 ----------------- 3 files changed, 23 deletions(-) delete mode 100644 tests/__init__.py delete mode 100644 tests/error.py delete mode 100644 tests/test_mlm.py diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/error.py b/tests/error.py deleted file mode 100644 index 46b05864..00000000 --- a/tests/error.py +++ /dev/null @@ -1,6 +0,0 @@ -def get_error(func): - try: - func() - return None - except Exception as error: - return error \ No newline at end of file diff --git a/tests/test_mlm.py b/tests/test_mlm.py deleted file mode 100644 index 577144fd..00000000 --- a/tests/test_mlm.py +++ /dev/null @@ -1,17 +0,0 @@ -from happytransformer import HappyBERT, HappyROBERTA, HappyXLNET - -MLM_TRANSFORMERS = [ - HappyBERT, - HappyROBERTA, - # HappyXLNET # performance is not great, omitting -] - -def _test_mlm_model(transformer_class): - transformer = transformer_class() - prediction = transformer.predict_mask('[MASK] have a dog') - assert prediction[0]['word'].lower() == 'i' - -def test_all_mlm_models(): - for transformer_class in MLM_TRANSFORMERS: - print(f'Testing class {transformer_class.__name__}') - _test_mlm_model(transformer_class)
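
Taken together, patches 141, 145, and 146 settle the 2.0.0 adaptor design: get_adaptor() maps a model-type string to an Adaptor instance whose properties hand back the matching Hugging Face classes, and a model that lacks a head for some task now fails with a descriptive ValueError instead of a bare NotImplementedError. A minimal sketch of the intended lookup flow — illustrative only, and assuming DistilBertAdaptor leaves NextSentencePrediction to the base class, since transformers ships no next-sentence head for DistilBERT:

```python
from happytransformer.adaptors import get_adaptor

# Model types are looked up by the string keys registered in ADAPTORS.
adaptor = get_adaptor("DISTILBERT")

# Each property returns a concrete Hugging Face class for the task.
tokenizer = adaptor.Tokenizer.from_pretrained("distilbert-base-uncased")
model = adaptor.MaskedLM.from_pretrained("distilbert-base-uncased")

# Tasks the model does not support raise a descriptive error (PATCH 145).
try:
    adaptor.NextSentencePrediction
except ValueError as error:
    print(error)  # This model does not support next sentence prediction
```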
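
PATCH 155 deletes test_mlm.py because it exercises the removed pre-2.0.0 classes (HappyBERT, HappyROBERTA) and their dict-based results. A rough equivalent under the 2.0.0 API — not part of the patch series, just a sketch of how the same check would be written against HappyWordPrediction:

```python
from happytransformer import HappyWordPrediction

def test_mlm_new_api():
    # The old test asserted prediction[0]['word'] == 'i'; the 2.0.0 API
    # returns WordPredictionResult objects with token/score fields instead.
    happy_wp = HappyWordPrediction("BERT", "bert-base-uncased")
    result = happy_wp.predict_mask("[MASK] have a dog")
    assert result[0].token.lower() == "i"
```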