data_transformer.pyのテスト用

In [166]:
import pandas as pd
import nlpaug.augmenter.word as naw
import copy
import preprocessing

In [168]:
# Read fold 7 of a 10-way split from ../data via the project's preprocessing module.
# NOTE(review): the contents of `lookups` and whether `fold` is 0- or 1-based are
# defined in preprocessing.read_kfold_file, which is not visible here -- confirm there.
train_df, val_df, lookups = preprocessing.read_kfold_file("../data", fold=7, n_splits=10)

In [169]:
# Pick five validation rows by index label as a small fixture; reset_index(drop=True)
# renumbers them 0..4 and .copy() detaches the fixture from val_df.
test_docs = val_df.loc[[134, 156, 356, 1088, 1234]].reset_index(drop=True).copy()
test_docs

Unnamed: 0,id,classlist,predictionstrings,text,annotation
0,16DEF72E220A,"[Lead, Position, Claim, Claim, Claim, Evidence...",[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 1...,Some schools offer distance learning as an opt...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
1,1BC811EC52DD,"[Lead, Position, Evidence, Claim, Evidence, Cl...",[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 1...,"Not everyone thinks the same way, if you ask s...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
2,3A08788DCA7D,"[Lead, Claim, Position, Claim, Claim, Claim, E...",[13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 ...,Generic_Name\n\nMrs. Generic_Name\n\n11 March ...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, B-Lead..."
3,B000D7B81799,"[Position, Claim, Claim, Evidence, Evidence, C...","[10 11 12 13 14 15, 16 17 18 19 20 21 22 23 24...","Dear John Kerry, the senator for the state of ...","[O, O, O, O, O, O, O, O, O, O, B-Position, I-P..."
4,C43D24F5A342,"[Position, Claim, Evidence, Evidence, Claim, E...","[2 3 4 5 6 7 8 9 10 11 12 13 14 15, 16 17 18 1...","Dear Senator,\n\nI argue in favor to changing ...","[O, O, B-Position, I-Position, I-Position, I-P..."


#### 前処理

In [170]:
# Collapse every run of whitespace (newlines included) in `text` to a single space.
# The column is overwritten in place, so the raw text is no longer available from
# test_docs after this cell; re-running the cell leaves the result unchanged.
test_docs["text"] = test_docs["text"].apply(lambda x: " ".join(x.split()))
test_docs

Unnamed: 0,id,classlist,predictionstrings,text,annotation
0,16DEF72E220A,"[Lead, Position, Claim, Claim, Claim, Evidence...",[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 1...,Some schools offer distance learning as an opt...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
1,1BC811EC52DD,"[Lead, Position, Evidence, Claim, Evidence, Cl...",[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 1...,"Not everyone thinks the same way, if you ask s...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
2,3A08788DCA7D,"[Lead, Claim, Position, Claim, Claim, Claim, E...",[13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 ...,Generic_Name Mrs. Generic_Name 11 March 2020 E...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, B-Lead..."
3,B000D7B81799,"[Position, Claim, Claim, Evidence, Evidence, C...","[10 11 12 13 14 15, 16 17 18 19 20 21 22 23 24...","Dear John Kerry, the senator for the state of ...","[O, O, O, O, O, O, O, O, O, O, B-Position, I-P..."
4,C43D24F5A342,"[Position, Claim, Evidence, Evidence, Claim, E...","[2 3 4 5 6 7 8 9 10 11 12 13 14 15, 16 17 18 1...","Dear Senator, I argue in favor to changing the...","[O, O, B-Position, I-Position, I-Position, I-P..."


In [160]:
def print_and_highlight_diff(orig_text, aug_texts):
    """Print the original text, then every augmented text with changed words in red.

    A word of an augmented text is printed in red (ANSI escape codes) when it
    differs from the original word at the same whitespace-token position, or
    when its position lies beyond the end of the original text. The comparison
    is purely positional, so once an augmentation changes the number of words
    every following word is highlighted as well.

    :param str orig_text: Original text.
    :param aug_texts: Either a list of ``(augmented_text, ner_labels)`` items or a
        single such pair (which is what ``augment(..., n=1)`` returns).
    """
    # A lone (text, labels) pair is wrapped so it is handled like a one-element list;
    # without this the loop below would index into the text character by character.
    if isinstance(aug_texts, tuple) and len(aug_texts) == 2 and isinstance(aug_texts[0], str):
        aug_texts = [aug_texts]

    orig_split = orig_text.split()
    print(f"Original: {len(orig_split)}\n{orig_text}\n")

    for item in aug_texts:
        new_text, new_ner = item[0], item[1]
        new_split = new_text.split()
        print(f"\nAugmented: {len(new_split)}, NER label: {len(new_ner)}")
        for i, word in enumerate(new_split):
            if i < len(orig_split) and word == orig_split[i]:
                print(word, end=" ")
            else:
                # red foreground, then reset
                print('\033[31m' + word + '\033[0m', end=" ")
        print()

In [171]:
# Use the first fixture document; the labels are deep-copied so the list stored in
# the DataFrame is independent of `sample_ner`.
sample_text = test_docs.iloc[0]['text']
sample_ner = copy.deepcopy(test_docs.iloc[0]['annotation'])
# Sanity check: the whitespace-token count and the label count are printed side by side.
print(len(sample_text.split()), len(sample_ner))

572 572


#### 実験

文章長（トークン数）が変わりやすい`SynonymAug`を使う。類義語が複数語に置き換わるとトークン数が増えるため、NERラベルが正しく追従するかを確認できる。

In [121]:
class SynonymAugWithNER(naw.WordAugmenter):
    """Placeholder subclass of nlpaug's ``WordAugmenter`` fixed to the "substitute" action.

    A later cell in this notebook defines another class with the same name; once
    that cell is executed it replaces this one.
    """
    def __init__(self):
        super().__init__(action="substitute")


In [122]:
# Smoke check: the class defined above can be instantiated without arguments.
SynonymAugWithNER()

<__main__.SynonymAugWithNER at 0x169d56b80>

In [123]:
import math
import random
import numpy as np
import pandas as pd
from multiprocessing.dummy import Pool as ThreadPool

from nlpaug.util import Action, Doc, PartOfSpeech, Method, WarningException, WarningName, WarningCode, WarningMessage


class AugmenterWithNER:
    """Base class for augmenters that transform a text together with its NER labels.

    Subclasses implement :meth:`clean` and the handler selected by ``action``
    (``insert``, ``substitute``, ``swap``, ``delete``, ``crop`` or ``split``).
    Every handler receives ``(data, ner_labels)``.

    NOTE(review): the structure appears to mirror nlpaug's own base augmenter with
    an extra ``ner_labels`` argument threaded through -- confirm before relying on
    nlpaug's documentation for details.

    :param str name: Name of this augmenter.
    :param str method: Must be one of ``Method.getall()``.
    :param str action: Must be one of ``Action.getall()``.
    :param int aug_min: Lower bound applied to the number of augmented items.
    :param int aug_max: Upper bound applied to the number of augmented items.
    :param float aug_p: Fraction of the input size to augment.
    :param str device: Stored on the instance; not used by this class itself.
    :param bool include_detail: Stored on the instance for use by subclasses.
    :param int verbose: Validation warnings are printed when greater than 0.
    """

    def __init__(self, name, method, action, aug_min, aug_max, aug_p=0.1, device='cpu',
        include_detail=False, verbose=0):

        self.name = name
        self.action = action
        self.method = method
        self.aug_min = aug_min
        self.aug_max = aug_max
        self.aug_p = aug_p
        self.device = device
        self.verbose = verbose
        self.include_detail = include_detail

        # Offset added to change-log sequence numbers by subclasses.
        self.parent_change_seq = 0

        self._validate_augmenter(method, action)

    @classmethod
    def _validate_augmenter(cls, method, action):
        """Raise ``ValueError`` unless ``method`` and ``action`` are known to nlpaug."""
        if method not in Method.getall():
            raise ValueError(
                'Method must be one of {} while {} is passed'.format(Method.getall(), method))

        if action not in Action.getall():
            raise ValueError(
                'Action must be one of {} while {} is passed'.format(Action.getall(), action))

    def augment(self, data, ner_labels, n=1):
        """
        Augment ``data`` and return it together with the matching NER labels.

        :param str/list data: A single text or a list of texts.
        :param list ner_labels: Labels for ``data``. For a single text the value is handed
            to the action handler as is; for a list of texts it must contain one label
            sequence per text.
        :param int n: Default is 1. Number of augmented outputs for a single text. For a
            list of texts one output per text is produced.
        :return: For a single text: the handler's result when ``n == 1``, otherwise a list
            of at most ``n`` handler results. For a list of texts: a list with one handler
            result per text. When the input is empty or ``None`` an empty value of the
            input's type is returned (``''``, ``[]``, empty array or ``None``), and when no
            handler result was produced the input itself is returned -- in both of these
            fallbacks no labels are attached.
        :raises ValueError: If ``data`` is a list and ``ner_labels`` has a different length.
        >>> augmented_text, augmented_labels = aug.augment(text, labels)
        """
        max_retry_times = 3  # max loop times of n to generate expected number of outputs
        is_batch = isinstance(data, list)
        aug_num = 1 if is_batch else n
        expected_output_num = len(data) if is_batch else aug_num

        exceptions = self._validate_augment(data)
        # TODO: Handle multiple exceptions
        for exception in exceptions:
            if isinstance(exception, WarningException):
                if self.verbose > 0:
                    exception.output()

                # Return empty value per data type
                if isinstance(data, str):
                    return ''
                elif isinstance(data, list):
                    return []
                elif isinstance(data, np.ndarray):
                    return np.array([])

                return None

        clean_data, clean_ner = self.clean(data, ner_labels)

        # Map the configured action to its handler; None when the action has no handler here.
        action_fx = {
            Action.INSERT: self.insert,
            Action.SUBSTITUTE: self.substitute,
            Action.SWAP: self.swap,
            Action.DELETE: self.delete,
            Action.CROP: self.crop,
            Action.SPLIT: self.split,
        }.get(self.action)

        for _ in range(max_retry_times+1):
            augmented_results = []

            # By design, it is one-to-many
            if self.__class__.__name__ in ['LambadaAug']:
                augmented_results = action_fx(clean_data, clean_ner, n=n)
            # PyTorch's augmenter
            elif self.__class__.__name__ in ['AbstSummAug', 'BackTranslationAug', 'ContextualWordEmbsAug', 'ContextualWordEmbsForSentenceAug']:
                for _attempt in range(aug_num):
                    result, ner_result = action_fx(clean_data, clean_ner)
                    # Keep every text paired with its labels so this branch returns the
                    # same (text, labels) shape as the branches below.
                    if isinstance(result, list):
                        augmented_results.extend(zip(result, ner_result))
                    else:
                        augmented_results.append((result, ner_result))
            # Multiple inputs: augment every text with its own label sequence
            elif is_batch:
                if len(clean_data) != len(clean_ner):
                    raise ValueError(
                        'data and ner_labels must have the same length while {} and {} are passed'.format(
                            len(clean_data), len(clean_ner)))
                augmented_results = [action_fx(text, labels) for text, labels in zip(clean_data, clean_ner)]
            # Single input with/without multiple output
            else:
                augmented_results = [action_fx(clean_data, clean_ner) for _ in range(n)]

            if len(augmented_results) >= expected_output_num:
                break

        # TODO: standardize output to list even though n=1 from 1.0.0
        if len(augmented_results) == 0:
            # if not result, return itself
            # NOTE(review): unlike regular results this fallback carries no labels.
            if n == 1:
                return data
            # Single input with/without multiple input
            else:
                return [data]

        if isinstance(augmented_results, pd.DataFrame):
            return augmented_results
        else:
            if isinstance(data, list):
                return augmented_results
            else:
                if n == 1:
                    return augmented_results[0]
                return augmented_results[:n]

    @classmethod
    def _validate_augment(cls, data):
        """Return a one-element list with a length warning when ``data`` is ``None`` or empty, else ``[]``."""
        if data is None or len(data) == 0:
            return [WarningException(name=WarningName.INPUT_VALIDATION_WARNING,
                                     code=WarningCode.WARNING_CODE_001, msg=WarningMessage.LENGTH_IS_ZERO)]

        return []

    # --- action handlers and hooks to be provided by subclasses -----------------

    def insert(self, data, ner_labels):
        raise NotImplementedError

    def substitute(self, data, ner_labels):
        raise NotImplementedError

    def swap(self, data, ner_labels):
        raise NotImplementedError

    def delete(self, data, ner_labels):
        raise NotImplementedError

    def crop(self, data, ner_labels):
        raise NotImplementedError

    def split(self, data, ner_labels):
        raise NotImplementedError

    def tokenizer(self, tokens, ner_labels):
        raise NotImplementedError

    def evaluate(self):
        raise NotImplementedError

    @classmethod
    def is_duplicate(cls, dataset, data):
        raise NotImplementedError

    @classmethod
    def prob(cls):
        """Return a random float drawn with ``np.random.random()``."""
        return np.random.random()

    @classmethod
    def sample(cls, x, num=None):
        """Draw ``num`` distinct items from list ``x``, or a random int in ``[1, x-1)`` for int ``x``.

        Any other type of ``x`` yields ``None``.
        """
        if isinstance(x, list):
            return random.sample(x, num)
        elif isinstance(x, int):
            return np.random.randint(1, x-1)

    @classmethod
    def clean(cls, data, ner_labels):
        raise NotImplementedError

    def _generate_aug_cnt(self, size, aug_min, aug_max, aug_p=None):
        """Return ``ceil(p * size)`` limited to ``[aug_min, aug_max]``.

        ``p`` is ``aug_p`` when truthy, else ``self.aug_p`` when truthy, else 0.3.
        A falsy bound (``None`` or 0) is not applied.
        """
        if aug_p:
            percent = aug_p
        elif self.aug_p:
            percent = self.aug_p
        else:
            percent = 0.3
        cnt = int(math.ceil(percent * size))

        if aug_min and cnt < aug_min:
            return aug_min
        if aug_max and cnt > aug_max:
            return aug_max
        return cnt

    def generate_aug_cnt(self, size, aug_p=None):
        """Number of items to augment for an input of ``size`` items (0 for empty input)."""
        if size == 0:
            return 0
        return self._generate_aug_cnt(size, self.aug_min, self.aug_max, aug_p)

    def generate_aug_idxes(self, inputs):
        """Randomly choose which positions of ``inputs`` to augment."""
        aug_cnt = self.generate_aug_cnt(len(inputs))
        token_idxes = [i for i, _ in enumerate(inputs)]
        aug_idxes = self.sample(token_idxes, aug_cnt)
        return aug_idxes

    def _get_random_aug_idxes(self, data):
        """Randomly choose positions among those kept by ``self.pre_skip_aug``.

        ``pre_skip_aug`` is not defined on this base class; a subclass has to supply it.
        """
        aug_cnt = self.generate_aug_cnt(len(data))
        idxes = self.pre_skip_aug(data)
        if len(idxes) < aug_cnt:
            aug_cnt = len(idxes)

        aug_idxes = self.sample(idxes, aug_cnt)

        return aug_idxes

    def __str__(self):
        return 'Name:{}, Action:{}, Method:{}'.format(self.name, self.action, self.method)

In [124]:
import re, string
from nlpaug.util.text.tokenizer import Tokenizer

class WordAugmenterWithNER(AugmenterWithNER):
    """Word-level augmenter base that keeps NER labels next to the text.

    :param str action: Action performed by the augmenter.
    :param str name: Name of this augmenter.
    :param int aug_min: Minimum number of words to augment.
    :param int aug_max: Maximum number of words to augment.
    :param float aug_p: Fraction of words to augment.
    :param list stopwords: Words that are never augmented.
    :param func tokenizer: Tokenizer; defaults to ``Tokenizer.tokenizer``.
    :param func reverse_tokenizer: Inverse of the tokenizer; defaults to
        ``Tokenizer.reverse_tokenizer``.
    :param str device: Forwarded to the base class.
    :param int verbose: Warnings are printed when greater than 0.
    :param str stopwords_regex: Regular expression; matching words are never augmented.
    :param bool include_detail: Forwarded to the base class.
    """

    def __init__(self, action, name='Word_Aug', aug_min=1, aug_max=10, aug_p=0.3, stopwords=None,
                 tokenizer=None, reverse_tokenizer=None, device='cpu', verbose=0, stopwords_regex=None,
                 include_detail=False):
        super().__init__(
            name=name, method=Method.WORD, action=action, aug_min=aug_min, aug_max=aug_max, device=device,
            verbose=verbose, include_detail=include_detail)
        # aug_p is not forwarded to the base constructor; it is set here instead.
        self.aug_p = aug_p
        self.tokenizer = tokenizer or Tokenizer.tokenizer
        self.reverse_tokenizer = reverse_tokenizer or Tokenizer.reverse_tokenizer
        self.stopwords = stopwords
        self.stopwords_regex = re.compile(stopwords_regex) if stopwords_regex else stopwords_regex

    @classmethod
    def clean(cls, data, ner_labels):
        """Strip surrounding whitespace from the text(s); labels are passed through.

        For list input a new list of texts and a shallow copy of ``ner_labels`` are returned.
        """
        if isinstance(data, list):
            return [d.strip() if d else d for d in data], list(ner_labels)
        return data.strip(), ner_labels

    def skip_aug(self, token_idxes, tokens):
        """Hook for subclasses to drop candidate positions; keeps all of them here."""
        return token_idxes

    def is_stop_words(self, token):
        """True when a stopword list is configured and contains ``token``."""
        return self.stopwords is not None and token in self.stopwords

    def pre_skip_aug(self, tokens, tuple_idx=None):
        """Return the positions of tokens that may be augmented.

        Punctuation, listed stopwords and tokens matching the stopword regex are skipped.

        :param list tokens: Tokens, or tuples holding the token at ``tuple_idx``.
        :param int tuple_idx: Position of the token text inside each tuple, if tuples are passed.
        """
        results = []
        for token_idx, token in enumerate(tokens):
            if tuple_idx is not None:
                _token = token[tuple_idx]
            else:
                _token = token
            # skip punctuation (substring test against string.punctuation)
            if _token in string.punctuation:
                continue
            # skip stopwords by list
            if self.is_stop_words(_token):
                continue
            # skip stopwords by regex
            # https://github.com/makcedward/nlpaug/issues/81
            if self.stopwords_regex is not None and (
                    self.stopwords_regex.match(_token) or self.stopwords_regex.match(' '+_token+' ') or
                    self.stopwords_regex.match(' '+_token) or self.stopwords_regex.match(_token+' ')):
                continue

            results.append(token_idx)

        return results

    @classmethod
    def is_duplicate(cls, dataset, data):
        """True when ``data`` equals any element of ``dataset``."""
        for d in dataset:
            if d == data:
                return True
        return False

    def align_capitalization(self, src_token, dest_token):
        """Capitalize ``dest_token`` when ``src_token`` is capitalized and ``dest_token`` is lower case."""
        if self.get_word_case(src_token) == 'capitalize' and self.get_word_case(dest_token) == 'lower':
            return dest_token.capitalize()
        return dest_token

    def _get_aug_idxes(self, tokens):
        """Randomly choose positions to augment among the tokens that are not skipped.

        Returns ``[]`` when every token is skipped.
        """
        aug_cnt = self.generate_aug_cnt(len(tokens))
        word_idxes = self.pre_skip_aug(tokens)
        word_idxes = self.skip_aug(word_idxes, tokens)
        if len(word_idxes) == 0:
            if self.verbose > 0:
                exception = WarningException(name=WarningName.OUT_OF_VOCABULARY,
                                             code=WarningCode.WARNING_CODE_002, msg=WarningMessage.NO_WORD)
                exception.output()
            return []
        if len(word_idxes) < aug_cnt:
            aug_cnt = len(word_idxes)
        aug_idexes = self.sample(word_idxes, aug_cnt)
        return aug_idexes

    def _get_random_aug_idxes(self, tokens):
        """Like :meth:`_get_aug_idxes` but without the :meth:`skip_aug` hook and the warning."""
        aug_cnt = self.generate_aug_cnt(len(tokens))
        word_idxes = self.pre_skip_aug(tokens)
        if len(word_idxes) < aug_cnt:
            aug_cnt = len(word_idxes)

        aug_idxes = self.sample(word_idxes, aug_cnt)

        return aug_idxes

    def _get_aug_range_idxes(self, tokens):
        """Choose a run of consecutive positions, walking right or left from a random start.

        Every returned index lies inside ``range(len(tokens))``. The list is in walking
        order, i.e. descending when the randomly chosen direction is leftwards.
        """
        aug_cnt = self.generate_aug_cnt(len(tokens))
        if aug_cnt == 0 or len(tokens) == 0:
            return []
        # aug_min may exceed the number of tokens; a run cannot be longer than the input.
        aug_cnt = min(aug_cnt, len(tokens))
        direction = self.sample([-1, 1], 1)[0]

        if direction > 0:
            # right: the run covers start .. start+aug_cnt-1, so start <= len-aug_cnt
            # (a plain tokens[:-aug_cnt+1] slice would be empty for aug_cnt == 1)
            word_idxes = list(range(len(tokens) - aug_cnt + 1))
        else:
            # left: the run covers start-aug_cnt+1 .. start, so start >= aug_cnt-1
            word_idxes = list(range(aug_cnt - 1, len(tokens)))

        start_aug_idx = self.sample(word_idxes, 1)[0]
        aug_idxes = [start_aug_idx + offset * direction for offset in range(aug_cnt)]

        return aug_idxes

    @classmethod
    def get_word_case(cls, word):
        """Classify the casing of ``word``.

        :return: ``'empty'``, ``'capitalize'``, ``'upper'``, ``'lower'``, ``'mixed'`` or ``'unknown'``.
        """
        if len(word) == 0:
            return 'empty'

        # a single upper-case letter counts as capitalized rather than upper
        if len(word) == 1 and word.isupper():
            return 'capitalize'

        if word.isupper():
            return 'upper'
        elif word.islower():
            return 'lower'
        else:
            for i, c in enumerate(word):
                if i == 0:  # do not check first character
                    continue
                if c.isupper():
                    return 'mixed'

            if word[0].isupper():
                return 'capitalize'
            return 'unknown'

    def replace_stopword_by_reserved_word(self, text, stopword_reg, reserve_word):
        """Replace every match of ``stopword_reg`` in ``text`` by ``reserve_word``.

        :return: ``(replaced_text, reserved_stopwords)`` where the stopwords are listed
            from the last match to the first.
        """
        reserved_stopwords = []

        # pad space for easy handling
        replaced_text = ' ' + text + ' '
        # walk backwards so earlier match offsets stay valid while the text is edited
        for m in reversed(list(stopword_reg.finditer(replaced_text))):
            # Get position excluding prefix and suffix
            start, end, token = m.start(), m.end(), m.group()
            # replace stopword by reserve word
            replaced_text = replaced_text[:start] + reserve_word + replaced_text[end:]
            reserved_stopwords.append(token) # reversed order but it will consumed in reversed order later too

        # trim
        replaced_text = replaced_text[1:-1]

        return replaced_text, reserved_stopwords

    def replace_reserve_word_by_stopword(self, text, reserve_word_aug, original_stopwords):
        """Put the stopwords collected by :meth:`replace_stopword_by_reserved_word` back into ``text``.

        Matches of ``reserve_word_aug`` are paired, last match first, with
        ``original_stopwords``.
        """
        # pad space for easy handling
        replaced_text = ' ' + text + ' '
        matched = list(reserve_word_aug.finditer(replaced_text))[::-1]

        # TODO: a differing number of matches and stopwords is not handled; zip() below
        # silently stops at the shorter of the two.
        for m, orig_stopword in zip(matched, original_stopwords):
            # Get position excluding prefix and suffix
            start, end = m.start(), m.end()
            # replace reserve word by the original stopword
            replaced_text = replaced_text[:start] + orig_stopword + replaced_text[end:]

        # trim
        replaced_text = replaced_text[1:-1]

        return replaced_text

    def preprocess(self, data):
        ...

    def postprocess(self, data):
        ...

In [176]:
import os
import nlpaug.model.word_dict as nmw

class SynonymAugWithNER(WordAugmenterWithNER):
    # https://arxiv.org/pdf/1809.02079.pdf
    """
    Augmenter that leverage semantic meaning to substitute word while keeping one NER
    label per whitespace-separated word of the augmented text.

    :param str aug_src: Support 'wordnet' and 'ppdb' .
    :param str model_path: Path of dictionary. Mandatory field if using PPDB as data source
    :param str lang: Language of your text. Default value is 'eng'.
    :param float aug_p: Percentage of word will be augmented.
    :param int aug_min: Minimum number of word will be augmented.
    :param int aug_max: Maximum number of word will be augmented. If None is passed, number of augmentation is
        calculated via aup_p. If calculated result from aug_p is smaller than aug_max, will use calculated result from
        aug_p. Otherwise, using aug_max.
    :param list stopwords: List of words which will be skipped from augment operation.
    :param str stopwords_regex: Regular expression for matching words which will be skipped from augment operation.
    :param func tokenizer: Customize tokenization process. It must yield exactly one token per NER label.
    :param func reverse_tokenizer: Customize reverse of tokenization process
    :param bool force_reload: Force reload model to memory when initialize the class.
        Default value is False and suggesting to keep it as False if performance is the consideration.
    :param str name: Name of this augmenter
    >>> aug = SynonymAugWithNER(tokenizer=str.split)
    >>> augmented_text, augmented_labels = aug.augment(text, labels)
    """

    def __init__(self, aug_src='wordnet', model_path=None, name='Synonym_Aug', aug_min=1, aug_max=10, aug_p=0.3,
                 lang='eng', stopwords=None, tokenizer=None, reverse_tokenizer=None, stopwords_regex=None,
                 force_reload=False, verbose=0):
        super().__init__(
            action=Action.SUBSTITUTE, name=name, aug_p=aug_p, aug_min=aug_min, aug_max=aug_max, stopwords=stopwords,
            tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer, device='cpu', verbose=verbose,
            stopwords_regex=stopwords_regex, include_detail=False)

        self.aug_src = aug_src
        self.model_path = model_path
        self.lang = lang
        self.model = self.get_model(aug_src, lang, model_path, force_reload)

    def skip_aug(self, token_idxes, tokens):
        """Drop positions that cannot be substituted.

        :param list token_idxes: Candidate positions.
        :param list tokens: ``(word, pos_tag)`` tuples.
        :return: Positions that are not tagged ``DT`` and, for the PPDB source, have at
            least one candidate in the model.
        """
        results = []
        for token_idx in token_idxes:
            # Some word does not come with synonym/ antony. It will be excluded in lucky draw.
            if tokens[token_idx][1] in ['DT']:
                continue

            # Some words does not exisit for specific pos. Need to filter it out
            if self.aug_src == 'ppdb':
                word_poses = PartOfSpeech.constituent2pos(tokens[token_idx][1])
                if word_poses is None or len(word_poses) == 0:
                    continue

                # any() stops at the first part of speech that yields a candidate
                if not any(len(self.model.predict(tokens[token_idx][0], pos=word_pos)) > 0
                           for word_pos in word_poses):
                    continue

            results.append(token_idx)

        return results

    def _get_aug_idxes(self, tokens):
        """Randomly choose positions to substitute among ``(word, pos_tag)`` tuples.

        Returns ``None`` when no position is eligible.
        """
        aug_cnt = self.generate_aug_cnt(len(tokens))
        word_idxes = self.pre_skip_aug(tokens, tuple_idx=0)
        word_idxes = self.skip_aug(word_idxes, tokens)
        if len(word_idxes) == 0:
            if self.verbose > 0:
                exception = WarningException(name=WarningName.OUT_OF_VOCABULARY,
                                             code=WarningCode.WARNING_CODE_002, msg=WarningMessage.NO_WORD)
                exception.output()
            return None
        if len(word_idxes) < aug_cnt:
            aug_cnt = len(word_idxes)
        aug_idexes = self.sample(word_idxes, aug_cnt)
        return aug_idexes

    @staticmethod
    def _continuation_label(label):
        """Label for the extra words of a multi-word substitute: ``B-X`` becomes ``I-X``, others are kept."""
        if isinstance(label, str) and label.startswith('B-'):
            return 'I-' + label[2:]
        return label

    def substitute(self, data, ner_labels):
        """Replace randomly chosen words by synonyms and expand the labels accordingly.

        When a synonym consists of several words, the first word keeps the label of the
        replaced token and every further word gets its continuation label (``B-X`` ->
        ``I-X``; ``I-X`` and ``O`` are repeated), so an entity never gains a second
        beginning tag.

        :param str data: Text to augment.
        :param list ner_labels: One label per token of ``data``. The list is not modified.
        :return: ``(augmented_text, augmented_labels)``; when ``include_detail`` is set,
            ``((augmented_text, augmented_labels), change_logs)``. Blank input is returned
            as ``(data, labels)`` regardless of ``include_detail``.
        :raises ValueError: If the tokenizer yields a different number of tokens than
            there are labels.
        """
        ner_result = copy.deepcopy(ner_labels)
        if not data or not data.strip():
            return data, ner_result

        change_seq = 0
        doc = Doc(data, self.tokenizer(data))

        original_tokens = doc.get_original_tokens()

        # Labels are looked up by token position, so a count mismatch would silently
        # attach labels to the wrong words.
        if len(original_tokens) != len(ner_result):
            raise ValueError(
                'Number of tokens ({}) does not match number of NER labels ({}). '
                'Use a tokenizer that matches the labels.'.format(len(original_tokens), len(ner_result)))

        pos = self.model.pos_tag(original_tokens)

        aug_idxes = self._get_aug_idxes(pos)
        if aug_idxes is None or len(aug_idxes) == 0:
            # nothing to substitute: hand back the input in the regular result shape
            if self.include_detail:
                return (data, ner_result), []
            return data, ner_result

        # token position -> labels to insert right after that position
        extra_labels = {}
        for aug_idx in aug_idxes:
            original_token = original_tokens[aug_idx]
            original_ner = ner_result[aug_idx]

            word_poses = PartOfSpeech.constituent2pos(pos[aug_idx][1])
            candidates = []
            if word_poses is None or len(word_poses) == 0:
                # Use every possible words as the mapping does not defined correctly
                candidates.extend(self.model.predict(pos[aug_idx][0]))
            else:
                for word_pos in word_poses:
                    candidates.extend(self.model.predict(pos[aug_idx][0], pos=word_pos))

            candidates = [c for c in candidates if c.lower() != original_token.lower()]

            if len(candidates) > 0:
                candidate = self.sample(candidates, 1)[0]
                # multi-word entries use "_" or "-" as separators; turn them into spaces
                candidate = candidate.replace("_", " ").replace("-", " ").lower()

                substitute_token = self.align_capitalization(original_token, candidate)

                # NER tag changes: one extra label per additional word
                extra_word_cnt = len(substitute_token.split()) - 1
                if extra_word_cnt > 0:
                    extra_labels[aug_idx] = [self._continuation_label(original_ner)] * extra_word_cnt
                change_seq += 1
                doc.add_change_log(aug_idx, new_token=substitute_token, action=Action.SUBSTITUTE,
                                   change_seq=self.parent_change_seq + change_seq)

        # Insert from the back so earlier positions are not shifted by later insertions.
        for idx in sorted(extra_labels, reverse=True):
            ner_result[idx + 1:idx + 1] = extra_labels[idx]

        if self.include_detail:
            return (self.reverse_tokenizer(doc.get_augmented_tokens()), ner_result), doc.get_change_logs()
        else:
            return (self.reverse_tokenizer(doc.get_augmented_tokens()), ner_result)

    @classmethod
    def get_model(cls, aug_src, lang, dict_path, force_reload):
        """Return the synonym model for ``aug_src`` (``'wordnet'`` or ``'ppdb'``).

        :raises ValueError: For any other ``aug_src``.
        """
        if aug_src == 'wordnet':
            return nmw.WordNet(lang=lang, is_synonym=True)
        elif aug_src == 'ppdb':
            return init_ppdb_model(dict_path=dict_path, force_reload=force_reload)

        raise ValueError('aug_src is not one of `wordnet` or `ppdb` while {} is passed.'.format(aug_src))

    def __str__(self):
        return 'Name:{}, Aug Src:{}, Action:{}, Method:{}'.format(self.name, self.aug_src, self.action, self.method)

# Cache of loaded PPDB models, keyed by the file name of the dictionary.
PPDB_MODEL = {}


def init_ppdb_model(dict_path, force_reload=False):
    """Return the PPDB model for ``dict_path``, loading it at most once per file name.

    :param str dict_path: Path of the PPDB dictionary; only its base name is used as cache key.
    :param bool force_reload: Load the dictionary again even when it is already cached.
    :return: The cached or freshly loaded model.
    """
    cache_key = os.path.basename(dict_path)

    if force_reload or cache_key not in PPDB_MODEL:
        PPDB_MODEL[cache_key] = nmw.Ppdb(dict_path)

    return PPDB_MODEL[cache_key]


In [177]:
def split_tokenizer(data):
    """Tokenize ``data`` on runs of whitespace."""
    tokens = data.split()
    return tokens
# Tokenize on whitespace only, the same way the label count was checked with
# len(sample_text.split()) earlier in this notebook.
aug = SynonymAugWithNER(tokenizer=split_tokenizer)

In [178]:
# Produce five augmented variants of the sample and show which words changed.
# NOTE(review): no random seed is set in this notebook, so the output differs between runs.
augmented_texts = aug.augment(sample_text, sample_ner, n=5)
print_and_highlight_diff(sample_text, augmented_texts)

Original: 572
Some schools offer distance learning as an option for students to attend classes from home by way of online or video conferencing. But I don't think that it would benefit from being able to attend classes from home. Because, there are many reasons that the students could not able to learn home by yourself in online; which are that some problems, and communicating. Also there are more position to I disagree and agree with this argument. Firstable, I think the students to attend classes from home by way of online or video conferencing, it would not benefit for the studehts, because of the students could have some question that they don't know. And if someone who dont' teach with them, they cannot learn that more understable. And they migth cannot catch up their late works, because of they don't help anybody. I have an examples of that the girl who has taken an online classes, and she didn't go to the school, because she thought, the online class is better than to learn subj

In [193]:
# Compare word/label 39 of the first augmented variant with word/label 38 of the original.
# NOTE(review): the indices 39 and 38 are hard-coded and only line up for a run in which
# exactly one extra word was inserted before this position -- they need adjusting after a re-run.
aug_texts, aug_ner = augmented_texts[0]
print("Aug:", aug_texts.split()[39], aug_ner[39])
print("Origin:", sample_text.split()[38], sample_ner[38])

Aug: Because, B-Claim
Origin: Because, B-Claim


In [None]:
# NOTE(review): `import a.b.c` binds only the top-level name `utils`, so the call below
# resolves to whatever `SynonymAugWithNER` is already defined in this notebook (or raises
# NameError in a fresh kernel). If the class has been moved into the project package, a
# `from ... import SynonymAugWithNER` form is presumably intended -- confirm the module layout.
import utils.augmenter.SynonymAugWithNER

SynonymAugWithNER()