In [108]:
from collections import Counter
from tqdm import tqdm

In [109]:
# Hugging Face `datasets` loader; `load_metric` is imported here but not used in this cell.
from datasets import load_dataset, load_metric
# Load the 'wi' configuration of the "wi_locness" dataset and print a summary of its splits.
raw_datasets = load_dataset("wi_locness", 'wi')
print(raw_datasets)

from transformers import AutoTokenizer
# Checkpoint whose tokenizer is used by the preprocessing and decoding cells below.
model_checkpoint = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

DatasetDict({
    train: Dataset({
        features: ['id', 'userid', 'cefr', 'text', 'edits'],
        num_rows: 3000
    })
    validation: Dataset({
        features: ['id', 'userid', 'cefr', 'text', 'edits'],
        num_rows: 300
    })
})


In [110]:
def preprocess_function(examples, max_length=512):
    """Tokenize a batch of texts and build corrected-text labels from their edits.

    For every example the (possibly truncated) input text is reconstructed
    from the tokenizer's character offsets, the annotated edits that fall
    entirely inside that span are applied, and the corrected text is
    tokenized as the label sequence.

    Args:
        examples: A batch with a ``"text"`` list of strings and an ``"edits"``
            list of dicts, each holding parallel ``"start"``, ``"end"`` and
            ``"text"`` lists (character offsets into the text and the
            replacement string, or ``None``).
        max_length: Maximum token length for both inputs and labels.

    Returns:
        The tokenizer output for the inputs (without ``offset_mapping``) with
        an added ``"labels"`` entry containing the label token ids.
    """
    inputs = examples["text"]
    model_inputs = tokenizer(
        inputs,
        max_length=max_length,
        truncation=True,
        return_offsets_mapping=True,
    )
    # Offsets are only needed to locate the surviving text span; remove them
    # from the returned mapping.
    offset_mapping = model_inputs.pop("offset_mapping")

    labels_out = []
    for text, offsets, edits in zip(inputs, offset_mapping, examples["edits"]):
        if len(offsets) >= 2:
            start_idx = offsets[0][0]
            # Last token is <eos>, so the second-to-last token marks the end
            # of the text that survived truncation.
            end_idx = offsets[-2][1]
        else:
            # Only the <eos> token (e.g. empty text): nothing to correct.
            start_idx = end_idx = 0

        corrected_text = text[start_idx:end_idx]

        # Apply edits back to front so that the offsets of edits not yet
        # applied stay valid (assumes edits are listed in ascending order --
        # TODO confirm against the dataset).
        for start, end, correction in reversed(
            list(zip(edits["start"], edits["end"], edits["text"]))
        ):
            # Skip edits that reach outside the span kept after truncation.
            if start < start_idx or end > end_idx:
                continue
            if correction is None:
                # No replacement text was provided; mark the spot with <unk>.
                correction = tokenizer.unk_token
            corrected_text = (
                corrected_text[: start - start_idx]
                + correction
                + corrected_text[end - start_idx:]
            )

        labels_out.append(corrected_text)

    labels = tokenizer(labels_out, max_length=max_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [111]:
# Run the preprocessing over every split, one batch of examples per call.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

In [112]:
# Train-Test split of 90%-10% (fixed seed so the split is reproducible).
# Guarded so that re-running this cell does not split the already-reduced
# training set a second time and silently change both splits.
if "test" not in tokenized_datasets:
    dataset_dict = tokenized_datasets["train"].train_test_split(test_size=0.1, seed=0)
    print(dataset_dict)

    tokenized_datasets["train"] = dataset_dict["train"]
    tokenized_datasets["test"] = dataset_dict["test"]
print(tokenized_datasets)

# Plain Python lists of token ids / CEFR labels used by the cells below.
X_train = tokenized_datasets["train"]["input_ids"]
Y_train = tokenized_datasets["train"]["labels"]
cefr_train = tokenized_datasets["train"]["cefr"]

X_test = tokenized_datasets["test"]["input_ids"]
Y_test = tokenized_datasets["test"]["labels"]
cefr_test = tokenized_datasets["test"]["cefr"]

# The three test columns are zipped together later, so they must line up.
assert len(X_test) == len(Y_test) == len(cefr_test)
print(len(X_test), len(Y_test), len(cefr_test))

DatasetDict({
    train: Dataset({
        features: ['id', 'userid', 'cefr', 'text', 'edits', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2700
    })
    test: Dataset({
        features: ['id', 'userid', 'cefr', 'text', 'edits', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})
DatasetDict({
    train: Dataset({
        features: ['id', 'userid', 'cefr', 'text', 'edits', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2700
    })
    validation: Dataset({
        features: ['id', 'userid', 'cefr', 'text', 'edits', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
    test: Dataset({
        features: ['id', 'userid', 'cefr', 'text', 'edits', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 300
    })
})
300 300 300


In [113]:
# Count how many texts of each CEFR level ended up in the train and test splits.
# (The train counter was previously bound to the misspelled name
# `cefr_train_date`, so the lookups below failed on a fresh kernel.)
cefr_train_data = Counter(cefr_train)
cefr_test_data = Counter(cefr_test)

# Union of levels seen in either split; a level missing from one split counts as 0.
keys = set(cefr_train_data.keys()) | set(cefr_test_data.keys())
combined_cefr = {
    key: [cefr_train_data.get(key, 0), cefr_test_data.get(key, 0)]
    for key in sorted(keys)
}

# Left-aligned, fixed-width table: one row per CEFR level.
print("{:<10} {:<10} {:<10}".format('CEFR', 'TRAIN', 'TEST'))
for cefr, (tr, te) in combined_cefr.items():
    print("{:<10} {:<10} {:<10}".format(cefr, tr, te))

CEFR       TRAIN      TEST      
A1.i       169        15        
A1.ii      226        30        
A2.i       233        39        
A2.ii      314        38        
B1.i       197        36        
B1.ii      216        32        
B2.i       190        27        
B2.ii      119        17        
C1.i       157        17        
C1.ii      175        22        
C2+        5          1         
C2.i       112        16        
C2.ii      74         10        


Split data by CEFR
An explanation of the CEFR classifications can be found [here](https://www.cambridgeenglish.org/exams-and-tests/cefr/).
*   Run the first cell to get the tokenized X_test and Y_test, grouped by CEFR level
*   Run the second cell to get the decoded text (the original sentences) of X_test and Y_test, grouped by CEFR level


In [None]:
# Group the tokenized test pairs by their exact CEFR level:
# {level: {"X": [source token ids, ...], "Y": [reference token ids, ...]}}
tokenized_test = {
    level: {"X": [], "Y": []} for level in sorted(cefr_test_data)
}

for level, source_ids, reference_ids in zip(cefr_test, X_test, Y_test):
    tokenized_test[level]["X"].append(source_ids)
    tokenized_test[level]["Y"].append(reference_ids)

# Show the first source/reference pair of level A1.i as a quick check.
print(tokenized_test['A1.i']['X'][0])
print(tokenized_test['A1.i']['Y'][0])

# Example: the tokenized sources and references for CEFR level A1.i
# X_test_A1 = tokenized_test['A1.i']['X']
# Y_test_A1 = tokenized_test['A1.i']['Y']

In [58]:
# Same grouping by exact CEFR level, but with token ids decoded back to text:
# {level: {"X": [source text, ...], "Y": [reference text, ...]}}
decoded_test = {
    level: {"X": [], "Y": []} for level in sorted(cefr_test_data)
}

for level, source_ids, reference_ids in zip(cefr_test, X_test, Y_test):
    decoded_test[level]["X"].append(tokenizer.decode(source_ids))
    decoded_test[level]["Y"].append(tokenizer.decode(reference_ids))

# Show the first source/reference pair of level A1.i as a quick check.
print(decoded_test['A1.i']['X'][0])
print(decoded_test['A1.i']['Y'][0])

# Example: the decoded sources and references for CEFR level A1.i
# X_test_decoded_A1 = decoded_test['A1.i']['X']
# Y_test_decoded_A1 = decoded_test['A1.i']['Y']

Dear Gareth, I can't go to a barbecue next Sunday because I'm going to Seville with my parents and I'm coming back so late. I think that we can meet on Saturday evening at the cafe. What do you think about it? Love, Alicia</s>
Dear Gareth, I can't go to the barbecue next Sunday because I'm going to Seville with my parents and I'm coming back so late. I think that we can meet on Saturday evening at the cafe. What do you think about it? Love, Alicia</s>
dict_keys(['A1.i', 'A1.ii', 'A2.i', 'A2.ii', 'B1.i', 'B1.ii', 'B2.i', 'B2.ii', 'C1.i', 'C1.ii', 'C2.i', 'C2.ii'])


Split data into Beginner (all A* CEFR levels), Intermediate (all B* CEFR levels), and Advanced (all C* CEFR levels)
*   Run the first cell to get the tokenized X_test and Y_test, grouped by band
*   Run the second cell to get the decoded text (the original sentences) of X_test and Y_test, grouped by band

To get all X_test and Y_test entries for beginner texts (use new names so the full `X_test`/`Y_test` lists are not overwritten):
```
X_test_A = tokenized_test['A']['X']
Y_test_A = tokenized_test['A']['Y']
```



In [116]:
# Group the tokenized test pairs by coarse band, i.e. the first letter of
# the CEFR label: 'A', 'B' or 'C'.
tokenized_test = {band: {"X": [], "Y": []} for band in "ABC"}

for level, source_ids, reference_ids in zip(cefr_test, X_test, Y_test):
    band = level[0]
    tokenized_test[band]["X"].append(source_ids)
    tokenized_test[band]["Y"].append(reference_ids)

# Decode the first pair of band A as a quick check.
print(tokenizer.decode(tokenized_test['A']['X'][0]))
print(tokenizer.decode(tokenized_test['A']['Y'][0]))

# Example: all tokenized sources and references for the A* CEFR levels
# X_test_A = tokenized_test['A']['X']
# Y_test_A = tokenized_test['A']['Y']

My name is Kamaldeen I'm 31 years old I'm married and I don't have children. I moved with my family from Jordan to Saudi Arabia in 1995. I began studying when I came to Saudi Arabia in third grade. I rose up in Al Qassim region and I lived the best time in my life in Al Qassim. I have big family contains from my father and mother and I have seven brothers and three sisters. I studied in Al Qassim University and my major Microbiology. I graduated from my college in 2001 after that I got job in ministry of health and I have been working there since 2010. I like a lot of activities such as travelling, readillng, play soccer and watch movies. </s>
My name is Kamaldeen. I'm 31 years old. I'm married and I don't have children. I moved with my family from Jordan to Saudi Arabia in 1995. I began studying when I came to Saudi Arabia in third grade. I grew up in Al Qassim region and I had the best time of my life in Al Qassim. I have a big family <unk> from my father and mother and I have seven 

In [62]:
# Same grouping by coarse band ('A', 'B', 'C'), but with token ids decoded
# back to text.
decoded_test = {band: {"X": [], "Y": []} for band in "ABC"}

for level, source_ids, reference_ids in zip(cefr_test, X_test, Y_test):
    band = level[0]
    decoded_test[band]["X"].append(tokenizer.decode(source_ids))
    decoded_test[band]["Y"].append(tokenizer.decode(reference_ids))

# Show the first pair of band A as a quick check.
print(decoded_test['A']['X'][0])
print(decoded_test['A']['Y'][0])

# Example: all decoded sources and references for the A* CEFR levels
# X_test_decoded_A = decoded_test['A']['X']
# Y_test_decoded_A = decoded_test['A']['Y']

In modern growing world, The basic needs of general public are increasing day by day and their expectations on quality of life took a new dimensions. When we were in our childhood, we used to deal with public transport for going one place to another because that was only a cheaper option available. But today, Things are getting change and technology marked a significant role in our life, Automobile segment increased its vertical and having a car becomes a need from luxory. We can see nowadays, more and more people would prefer to use their own car instead of buses or taxis because they feel comfortable and mobile in it. A car can help him to go anywhere and anytime in much less time as compared to public transport. I believe, as soon as Automobile sector develop new technology in cars people will more keen to use their own vehicle instead or public transport.</s>
In the modern <unk> world, the basic needs of the general public are increasing day by day and their expectations of quality