In [12]:
import re
from datasets import load_dataset

In [13]:
# Letters used to build the word regex: the 26 basic Latin letters plus the
# accented vowels and cedilla listed here.
lower_case = r'abcdefghijklmnopqrstuvwxyzáàâãéêíóôõúç'
# Upper-case counterpart, derived so the two sets cannot drift apart.
upper_case = lower_case.upper()

In [14]:
# Load the sentence-pair dataset; the cells below read its "train" and "test"
# splits and, from each, the "wrong_text" and "correct_text" columns.
dt = load_dataset('carolmou/random-sentences')

Found cached dataset parquet (/home/carolmou/.cache/huggingface/datasets/carolmou___parquet/carolmou--random-sentences-b36071ffaba43c26/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7)
100%|██████████| 2/2 [00:00<00:00, 279.18it/s]


In [15]:
# split name -> {(wrong_word, correct_word): occurrence count}
freq = dict()
# one entry per test sentence: the list of its differing (wrong, correct) pairs
test_sentences = list()

In [16]:
# Matches one word at a time: either a capitalized word (a single upper-case
# letter followed by lower-case letters) or an all-lower-case word, optionally
# hyphenated (lower-case parts joined by "-").
# The former third alternative, `[lower]*[upper](?=[lower])`, was dropped: its
# lookahead requires a letter right after the match while the closing `\b`
# requires a word boundary at that very position, so it could never match.
reg = rf'\b(?:[{upper_case}][{lower_case}]*|[{lower_case}]+(?:-[{lower_case}]+)*)\b'

In [17]:
# For each split, count how often every (wrong_word, correct_word)
# substitution occurs; for the test split, also record the substitutions of
# each individual sentence.

# Start from an empty list so that re-running this cell does not append the
# test sentences a second time (which would inflate the later statistics).
test_sentences.clear()

# `split` rather than `type`: the loop variable stays alive in the kernel after
# the cell finishes and would otherwise shadow the builtin `type`.
for split in ["train", "test"]:
    # frequencies of the (wrong, correct) pairs seen in this split
    dic = {}

    wrong = dt[split]["wrong_text"]
    correct = dt[split]["correct_text"]

    for w_sentence, c_sentence in zip(wrong, correct):
        # find all words of each sentence (re.findall already returns a list)
        w_words = re.findall(reg, w_sentence)
        c_words = re.findall(reg, c_sentence)

        # Compare the words position by position and keep the pairs that
        # differ; this is where the model has to act.
        # NOTE(review): zip stops at the shorter list, and if the two sentences
        # yield different word counts the pairs after the first inserted or
        # dropped word are misaligned -- confirm this is acceptable.
        tuples = [(w1, w2) for w1, w2 in zip(w_words, c_words) if w1 != w2]

        for pair in tuples:
            dic[pair] = dic.get(pair, 0) + 1

        # for each test sentence, keep the list of its differing pairs
        # (an empty list when no pair differs)
        if split == "test":
            test_sentences.append(tuples)

    # store this split's frequency table
    freq[split] = dic

In [18]:
# sanity check: display the (wrong_word, correct_word) -> count table
# collected for the test split
freq["test"]

{('dão', 'Não'): 19,
 ('voce', 'Você'): 11,
 ('he', 'me'): 22,
 ('almente', 'realmente'): 1,
 ('diference', 'diferente'): 2,
 ('ã', 'é'): 7,
 ('nel', 'nem'): 11,
 ('veinho', 'Velhinho'): 1,
 ('tantes', 'tantos'): 1,
 ('fatalizado', 'Fatalidade'): 1,
 ('x', 'e'): 52,
 ('c', 'a'): 78,
 ('mg', 'me'): 35,
 ('sexi', 'mexi'): 1,
 ('estaia', 'estava'): 1,
 ('Hora', 'hora'): 2,
 ('limos', 'vamos'): 1,
 ('dd', 'de'): 2,
 ('selaram', 'separar'): 1,
 ('diamoric', 'Diadorim'): 23,
 ('tele', 'ele'): 26,
 ('temes', 'temeu'): 1,
 ('dom', 'com'): 32,
 ('r', 'à'): 9,
 ('sorris', 'sorriu'): 6,
 ('Ao', 'ao'): 10,
 ('pa', 'ia'): 2,
 ('choker', 'chover'): 1,
 ('aminha', 'minha'): 14,
 ('mitat', 'Matar'): 1,
 ('nam', 'não'): 76,
 ('ama', 'uma'): 23,
 ('arrumentos', 'argumentou'): 1,
 ('t', 'a'): 96,
 ('narry', 'Harry'): 29,
 ('Desagradavelmente', 'desagradavelmente'): 2,
 ('ense', 'pense'): 1,
 ('ue', 'que'): 104,
 ('chics', 'Chico'): 1,
 ('rum', 'Um'): 3,
 ('me', 'de'): 95,
 ('necessitate', 'necessitava'):

In [19]:
# Number of test sentences in which every (wrong, correct) pair already has a
# non-zero count in the train split. A pair with no train count means the model
# has never seen that mistake and has to extrapolate, so the sentence is not
# counted. Sentences without any differing pair are counted as well.
qtd = sum(
    all(freq['train'].get(tup, 0) != 0 for tup in sentence)
    for sentence in test_sentences
)

In [20]:
total_sentences = len(dt['test']['wrong_text'])

# percentage of test sentences whose mistakes were all already seen in the
# train split (qtd out of all test sentences)
seen_ratio = qtd / total_sentences
print(seen_ratio * 100)

55.740021193924406
