In [9]:
pip install googletrans==4.0.0-rc1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [3]:
from datasets import load_dataset, load_metric
from transformers import AutoTokenizer

# Checkpoint whose tokenizer is used for both the source texts and the labels.
model_checkpoint = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# "wi" configuration of the "wi_locness" dataset.
raw_datasets = load_dataset("wi_locness", "wi")

  _torch_pytree._register_pytree_node(


In [4]:
def preprocess_function(examples):
    """Tokenize a batch of texts and build label ids from their edits.

    Each source text is tokenized with the module-level ``tokenizer``
    (truncated to 512 tokens). The character span covered by the kept tokens
    is read from the offset mapping, and every edit lying entirely inside
    that span is applied to it to obtain the corrected target text, which is
    then tokenized into the labels.

    Args:
        examples: Batch mapping with a ``'text'`` list of strings and an
            ``'edits'`` list holding, per example, a mapping with parallel
            ``'start'``, ``'end'`` and ``'text'`` lists (character offsets
            into the source text and the replacement string, or ``None``).

    Returns:
        The tokenizer output for the source texts, with ``offset_mapping``
        removed and a ``"labels"`` entry holding the token ids of the
        corrected texts.
    """
    inputs = examples['text']
    model_inputs = tokenizer(
        inputs,
        max_length=512,
        truncation=True,
        return_offsets_mapping=True
    )

    # Offsets are only needed to locate the truncated span; pop them so they
    # are not returned alongside the model inputs.
    offset_mapping = model_inputs.pop("offset_mapping")

    labels_out = []
    for source_text, offsets, edits in zip(inputs, offset_mapping, examples["edits"]):
        if len(offsets) < 2:
            # Nothing but the end-of-sequence token was produced (e.g. empty
            # text): there is no span to correct, so the target is empty too.
            labels_out.append("")
            continue

        start_idx = offsets[0][0]
        end_idx = offsets[-2][1]  # last token is <eos>, so we care about second last tok offset

        corrected_text = source_text[start_idx:end_idx]

        # Apply edits from last to first so the character offsets of edits
        # not yet applied stay valid.
        # NOTE(review): assumes edits are listed in ascending, non-overlapping order - confirm.
        for start, end, correction in reversed(
            list(zip(edits["start"], edits["end"], edits["text"]))
        ):
            # Skip edits that reach outside the span kept after truncation.
            if start < start_idx or end > end_idx:
                continue
            if correction is None:
                correction = tokenizer.unk_token
            start_offset = start - start_idx  # >= 0
            end_offset = end - start_idx
            corrected_text = (
                corrected_text[:start_offset] + correction + corrected_text[end_offset:]
            )

        labels_out.append(corrected_text)

    labels_out = tokenizer(labels_out, max_length=512, truncation=True)
    model_inputs["labels"] = labels_out["input_ids"]

    return model_inputs

In [5]:
# Tokenize every split in batches; the original columns of the train split are
# removed so only what preprocess_function returns is kept.
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    batched=True,
    remove_columns=raw_datasets['train'].column_names
)

# Train-Test split of 90%-10%
dataset_dict = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=0)
tokenized_datasets["train"] = dataset_dict["train"]
tokenized_datasets["test"] = dataset_dict["test"]

# Token-id sequences of the source texts (X) and of the corrected texts (Y).
X_train = tokenized_datasets["train"]["input_ids"]
Y_train = tokenized_datasets["train"]["labels"]

X_test = tokenized_datasets["test"]["input_ids"]
Y_test = tokenized_datasets["test"]["labels"]

In [20]:
print(len(X_test))
# Decode the first two held-out pairs: source text first, corrected text second.
for sample_idx in range(2):
    print(tokenizer.decode(X_test[sample_idx]))
    print(tokenizer.decode(Y_test[sample_idx]))

300
If we talk about my favourite sport, basketball will be. I love basketball, though I rarely play basketball because it is hard to find a stadium to play basketball in Thailand. I like basketball because it is one of the easiest sport. Let's talk about the advantages of playing basketball. I think playing basketball help me to be healthy. If someone wants to play some sport, basketball is my first advice. It is an easy sport to play for anyone who never play any sport before. So, newbies can understand and know the rules easily. These are the story about basketball, my favourite sport.</s>
If we talk about my favourite sport, it will be basketball. I love basketball, though I rarely play basketball because it is hard to find a stadium to play basketball in Thailand. I like basketball because it is one of the easiest sports. Let's talk about the advantages of playing basketball. I think playing basketball helps me to be healthy. If someone wants to play some sport, basketball is my f

In [6]:
# get dataset sequences
# NOTE(review): decode() is called without skip_special_tokens, so every decoded
# string keeps its trailing "</s>" and that marker travels on to the translator
# and the CSV - confirm this is intended.
text_train = [tokenizer.decode(token_ids) for token_ids in X_train]
corrected_train = [tokenizer.decode(token_ids) for token_ids in Y_train]

text_validation = [tokenizer.decode(token_ids) for token_ids in X_test]
corrected_validation = [tokenizer.decode(token_ids) for token_ids in Y_test]

# Report sizes rather than printing every decoded text into the cell output.
print(len(text_train), len(corrected_train))
print(len(text_validation), len(corrected_validation))



In [12]:
# First training pair: the source text, then its corrected version.
print(text_train[0], corrected_train[0], sep="\n")

I've been start jogging for five years. It is the way I can unwind because my study it's stressful. It gives me a sense of achievement, for these reasons I would like to do every day. I love jogging because it's a way to stay outdor immersed in nature. I think there are not negative side in doing jogging. I have been really on skiing since I was a baby. My mother make me start. Since then every year i go in north Italy to practice. I fell relaxed staying alon near montains and snow.</s>
I've been jogging for five years. It is the way I can unwind, because my studies are stressful. It gives me a sense of achievement. For these reasons, I would like to it do every day. I love jogging because it's a way to stay outdoors, immersed in nature. I think there are no negative sides to jogging. I have been really into skiing since I was a baby. My mother made me start. Since then, every year, I go to northern Italy to practice. I fell relaxed being alone near mountains and snow.</s>


In [7]:
from collections import defaultdict
from tqdm import tqdm
from googletrans import Translator
# Shared googletrans client; backtranslate() sends every request through it.
translator = Translator()

def backtranslate(to_translate, flow):
    """Translate a text through a chain of languages and return the final text.

    Args:
        to_translate: Text to start from.
        flow: Iterable of destination language codes, applied in order; the
            output of each step is the input of the next one.

    Returns:
        The text produced by the last translation step, or ``to_translate``
        unchanged when ``flow`` is empty.
    """
    current_text = to_translate
    for target_lang in flow:
        current_text = translator.translate(current_text, dest=target_lang).text
    return current_text

In [14]:
# Round-trip the first corrected training text through Chinese and back to English.
sample_backtranslation = backtranslate(corrected_train[0], ["zh-cn", "en"])
print(sample_backtranslation)

I have been jogging for five years.This is a way I can relax, because my learning pressure is great.It gave me a sense of accomplishment.For these reasons, I want to do this every day.I like jogging because this is a way to stay outdoors and immerse in nature.I think there is no negative aspect of jogging.Since I am still a child, I have always liked skiing.My mother let me start.Since then, every year, I have been practicing in northern Italy.I fell alone near the mountain and the snow alone.</s>


In [8]:
from collections import defaultdict
# Maps a language code to the list of back-translated texts collected for it.
backtranslations = defaultdict(list)

In [9]:
import csv
from tqdm import tqdm

def write_backtranslations_to_csv(text, corrected, backtranslations, languages, output_file):
    """Write source, corrected and back-translated texts side by side to a CSV file.

    The header row is ``Original``, ``Corrected`` followed by one column per
    entry of ``languages``. Row ``i`` holds ``text[i]``, ``corrected[i]`` and
    ``backtranslations[lang][i]`` for each language, in that order.

    Args:
        text: Indexable collection of source texts.
        corrected: Indexable collection of corrected texts, aligned with ``text``.
        backtranslations: Mapping from language code to the list of
            back-translated texts for that language.
        languages: Language codes to export, in column order. When empty,
            only the header row is written.
        output_file: Path of the CSV file to create or overwrite.

    Raises:
        IndexError: If ``text``, ``corrected`` or a later language's list is
            shorter than the first language's list.
    """
    headers = ['Original', 'Corrected', *languages]
    # The number of rows is taken from the first language's list; it is looked
    # up before the file is opened so a failure here does not truncate the file.
    row_count = len(backtranslations[languages[0]]) if languages else 0

    # UTF-8 is requested explicitly: translated text is not ASCII-only and the
    # platform's default encoding may be unable to represent it.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)
        for i in tqdm(range(row_count)):
            row = [text[i], corrected[i]]
            row.extend(backtranslations[lang][i] for lang in languages)
            writer.writerow(row)


In [10]:
import time

languages = ["zh-cn", "ru", "fr", "es", "de"]
MAX_ATTEMPTS = 8          # tries per text before giving up
MAX_BACKOFF_SECONDS = 60  # upper bound for the wait between two tries

with tqdm(total=len(text_train) * len(languages)) as t:
    for lang in languages:
        translated = backtranslations[lang]
        # Continue after whatever is already stored for this language, so
        # re-running the cell (or restarting it after a failure) neither
        # duplicates rows nor repeats finished requests.
        already_done = min(len(translated), len(text_train))
        t.update(already_done)
        for text in text_train[already_done:]:
            for attempt in range(1, MAX_ATTEMPTS + 1):
                try:
                    result = backtranslate(text, [lang, "en"])
                    break
                except Exception as err:
                    # Only Exception is caught so the cell can still be
                    # interrupted from the keyboard while it retries.
                    if attempt == MAX_ATTEMPTS:
                        raise RuntimeError(
                            f"Back-translation via {lang!r} failed {MAX_ATTEMPTS} times "
                            f"for text #{len(translated)}"
                        ) from err
                    # Exponential backoff between retries, capped.
                    time.sleep(min(2 ** attempt, MAX_BACKOFF_SECONDS))
            translated.append(result)
            t.update(1)

write_backtranslations_to_csv(text_train, corrected_train, backtranslations, languages, "backtranslations_all.csv")

100%|██████████| 2700/2700 [1:53:11<00:00,  2.52s/it]  
100%|██████████| 2700/2700 [00:00<00:00, 28726.14it/s]


In [12]:
# Size of the Spanish round-trip results and one sample entry.
es_backtranslations = backtranslations["es"]
print(len(es_backtranslations))
print(es_backtranslations[200])

2700
There are very different points of view on the subject of whether the benefits of foreign study of study in previous age exceed the inconveniencesubjects, however, I personally believe that the advantages are overweight. Reasons of this as follows.Challenges, primary students have problems with the difference between the first language and the learning. Therefore, people see that children learn a foreign language in primary school are worse that later, however, I would affirm that learningForeigner in primary school brings innumerable benefits to children. Primary school, the best time for learning, children are easier and faster to obtain knowledge. Not only the study load when compared by high school, high school,but they can communicate with foreigners and extend their social network during that period and later life.Of conclusion, once again I reaffirm my position that the advantages of learning a foreign language in primary school are much more than the inconveniences. Given 