In [None]:
import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
import random, pickle, re
from tqdm import tqdm
import pandas as pd
from datasets import load_dataset

In [None]:
# Location of the sentence corpus file: written once the sentences have been
# cleaned and de-duplicated, and read back when building the annotated data.
path='data/portuguese_sentences.txt'

# Generate sentences

In [None]:
# Load the 'carolina-c4ai/corpus-carolina' dataset through `datasets.load_dataset`.
# NOTE(review): presumably this downloads (or reads a local cache of) the whole
# corpus, so the cell can be slow on first run; confirm before Restart & Run All.
carolina = load_dataset('carolina-c4ai/corpus-carolina')

In [None]:
# Take the 'text' column of the 'corpus' split and report how many entries it has.
corpus_split = carolina['corpus']
carolina_text = corpus_split['text']
print(len(carolina_text))

In [None]:
# Sentence delimiters used with re.split: '.', '?', '!', ';' or a newline.
regex = r'\.|\?|!|;|\n'

In [None]:
# Previously used batch of books (disabled, kept for reference):
#list_books = ['a_guerra_dos_tronos','linha_d_agua','o_alienista', 'ensaio_sobre_a_cegueira', 'sapiens', 'o_guarani', 'colecao_especial_jane_austen', 'o_livro_das_princesas','a_falencia', 'sob_a_redoma', 'os_cem_melhores_contos_brasileiros_do_seculo']

# File stems of the EPUBs to process; the '.epub' extension is appended below.
book_stems = [
    'os_tres_mosqueteiros',
    'harry_potter_e_a_ordem_da_fenix',
    'grande_sertao_veredas',
    'a_redoma_de_vidro',
    'aristoteles_e_dante_descobrem_os_segredos_do_universo',
    'como_evitar_preocupacoes_e_comecar_a_viver',
]
list_books = [f'{stem}.epub' for stem in book_stems]

In [None]:
def process_book(book_name):
    """
    Extract the paragraph text of an EPUB stored under ``data/epubs/``.

    Every document item of the book is parsed as HTML and only the text of
    its ``<p>`` elements is kept. Paragraphs (and chapters) are separated by
    a newline, so the last word of one paragraph is never glued to the first
    word of the next one; a newline is also a sentence delimiter for the
    splitting regex used later in the notebook.

    Parameters
    ----------
    book_name : str
        File name of the book, including the ``.epub`` extension.

    Returns
    -------
    str
        The text of all paragraphs of all chapters, newline-separated.
    """
    book = epub.read_epub(f'data/epubs/{book_name}')

    def chapter_to_str(chapter):
        """Return the newline-joined text of every <p> element in one chapter."""
        soup = BeautifulSoup(chapter.get_body_content(), 'html.parser')
        return '\n'.join(para.get_text() for para in soup.find_all('p'))

    chapters = book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    # A single join avoids the quadratic cost of growing a string with += in a loop.
    return '\n'.join(chapter_to_str(chapter) for chapter in chapters)

In [None]:
# Extract every book and glue the results together with a single space.
raw_text = ' '.join(map(process_book, list_books))

In [None]:
# Split the book text on the sentence delimiters, then append the corpus texts.
# NOTE(review): the corpus entries are appended as-is, without being split by
# `regex`; confirm that whole entries (rather than sentences) are intended here.
sentences = re.split(regex, raw_text)
sentences.extend(carolina_text)

In [None]:
def clean_sentence(sentence):
    """
    Normalise one raw sentence.

    Steps, in order:
      1. newlines are turned into spaces;
      2. leading punctuation ('.', ',', ':', '!', '?', ';') is removed;
      3. runs of spaces are collapsed into a single space;
      4. the leading/trailing space, if any, is removed.

    Parameters
    ----------
    sentence : str
        Raw sentence; may be empty.

    Returns
    -------
    str
        The cleaned sentence (possibly empty).
    """
    # str.replace returns a new string, so the result has to be kept.
    sentence = sentence.replace('\n', ' ')

    # Punctuation is stripped before whitespace, so a sentence that starts with
    # a space followed by punctuation keeps that punctuation.
    sentence = sentence.lstrip('.,:!?;')

    # fix whitespaces
    while '  ' in sentence:
        sentence = sentence.replace('  ', ' ')

    # After collapsing, at most one space is left at each end.
    return sentence.strip(' ')
    
# Clean every sentence and keep only those longer than one character.
sentences = [
    cleaned
    for cleaned in map(clean_sentence, sentences)
    if len(cleaned) > 1
]

In [None]:
# Remove repeated sentences. dict.fromkeys keeps the first occurrence of each
# sentence in its original position, so the resulting order is the same on
# every run (a set would reorder the strings differently from run to run
# because of string hash randomisation).
print(f'Size with duplicates: {len(sentences)}')
sentences = list(dict.fromkeys(sentences))
print(f'Size without duplicates: {len(sentences)}')

In [None]:
# Persist the corpus, one sentence per line (no trailing newline).
# The `with` block closes the file on exit, so no explicit close() is needed.
# NOTE(review): no explicit encoding is given, so the platform default is used;
# whatever reads this file back must rely on the same default.
with open(path, 'w') as file:
    file.write('\n'.join(sentences))

# Generate annotated data

### Helper for generating similar strings

In [None]:
# For each lowercase letter: the letters considered "adjacent" to it, i.e.
# plausible substitutions when typing (neighbouring keys plus the accented
# variants of the vowels). No letter lists itself, since replacing a letter
# with itself would not produce a typo.
keyboard_adjacent_letters_pt = {
    'a': ['s', 'z', 'q', 'w', 'á', 'à', 'â', 'ã'],
    'b': ['v', 'g', 'n', 'h'],
    'c': ['x', 'd', 'v', 'f', 'ç'],
    'd': ['s', 'e', 'c', 'x', 'f', 'r'],
    'e': ['w', 'r', 'd', 's', 'é', 'ê'],
    'f': ['d', 'r', 'g', 'v', 'c', 't'],
    'g': ['f', 't', 'h', 'b', 'v', 'r'],
    'h': ['g', 't', 'j', 'n', 'b', 'y'],
    'i': ['u', 'o', 'k', 'j', 'í'],
    'j': ['h', 'y', 'k', 'n', 'm', 'u', 'i'],
    'k': ['j', 'i', 'l', 'm', 'o', 'n'],
    'l': ['k', 'o', 'p', 'm'],
    'm': ['n', 'j', 'k', 'l'],
    'n': ['b', 'h', 'j', 'm'],
    'o': ['i', 'p', 'l', 'k', 'ó', 'ô', 'õ'],
    'p': ['o', 'l', 'ç'],
    'q': ['a', 'z', 'u'],
    'r': ['e', 't', 'f', 'd'],
    's': ['a', 'w', 'e', 'd', 'x', 'z'],
    't': ['r', 'y', 'g', 'f'],
    'u': ['y', 'j', 'i', 'h', 'ú'],
    'v': ['c', 'f', 'g', 'b'],
    'w': ['q', 'a', 's', 'e'],
    'x': ['z', 's', 'd', 'c'],
    'y': ['t', 'u', 'h', 'g'],
    'z': ['x', 's', 'a', 'ç'],
    'ç': ['c'],
}


In [None]:
def get_similar_strings(str, x = None, adjacent_letters = True):
    """
    Return the set of strings that are "close" to ``str``.

    The input is lowercased first. The result is the union of:

    * the strings obtained by applying the same kind of edit ``x`` times in a
      row to the lowercased input, separately for each kind (the kinds are
      never mixed):
        -> insert a letter
        -> delete a letter (a one-character string is never emptied)
        -> replace a letter
      inserted/replacement letters come from the lowercase Portuguese
      alphabet, accented letters and 'ç' included;
    * the lowercased input with exactly one character case-swapped (these
      are the only entries that can contain an uppercase letter);
    * common Portuguese cognitive errors: 'ss' <-> 'ç' and 'ão' <-> 'am'.

    The lowercased input itself is never part of the result.

    Parameters
    ----------
    str : str
        Word to perturb. The name shadows the builtin; it is kept because it
        is part of the function's signature.
    x : int, optional
        Number of edits. When None, it is chosen from the length thresholds
        below, which currently always yield 1.
    adjacent_letters : bool
        Currently ignored: replacements always use the whole alphabet.

    Returns
    -------
    set of str

    Raises
    ------
    ValueError
        If ``x`` is smaller than 1.
    """
    word = str.lower()

    if x is None:
        # (edits, max_length) pairs checked in order. Inputs longer than every
        # max_length fall back to the last entry instead of leaving x unset.
        thresholds = [(1, 1000)]
        x = next(
            (edits for edits, size in thresholds if len(word) <= size),
            thresholds[-1][0],
        )

    if x < 1:
        raise ValueError(f'x must be >= 1, got {x}')

    ALPHABET_UPPER = 'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÇÉÊÍÓÔÕÚàáâãçéêíóôõú'
    # lower() folds the uppercase half onto the lowercase half; dict.fromkeys
    # drops the resulting repeats (keeping order) so no letter is tried twice.
    ALPHABET_LOWER = ''.join(dict.fromkeys(ALPHABET_UPPER.lower()))

    def insert(words):
        """All strings obtained by inserting one letter into any word of `words`."""
        return {
            w[:pos] + char + w[pos:]
            for w in words
            for pos in range(len(w) + 1)
            for char in ALPHABET_LOWER
        }

    def delete(words):
        """All strings obtained by deleting one letter; one-letter words are skipped."""
        return {
            w[:pos] + w[pos+1:]
            for w in words
            if len(w) > 1
            for pos in range(len(w))
        }

    def replace(words):
        """All strings obtained by replacing one letter of any word of `words`."""
        return {
            w[:pos] + char + w[pos+1:]
            for w in words
            for pos in range(len(w))
            for char in ALPHABET_LOWER
        }

    all_edits = set()

    for edit in (insert, delete, replace):
        # Apply the same edit kind x times in a row, starting from the word itself.
        current = {word}
        for _ in range(x):
            current = edit(current)
        all_edits |= current

    # single-character case swaps
    for ix, c in enumerate(word):
        all_edits.add(word[:ix] + c.swapcase() + word[ix+1:])

    # common Portuguese errors
    # ss and ç
    all_edits.add(word.replace('ss', 'ç'))
    all_edits.add(word.replace('ç', 'ss'))

    # ão and am
    all_edits.add(word.replace('ão', 'am'))
    all_edits.add(word.replace('am', 'ão'))

    # The replacements above return the word unchanged when nothing matches.
    all_edits.discard(word)

    return all_edits
    

### Creating annotated data

In [None]:
# Read the corpus back, one entry per line (each entry keeps its trailing newline).
with open(path, 'r') as sentence_file:
    sentences = sentence_file.readlines()

In [None]:
# Load the pickled word/frequency table.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('data/dicionario.pickle', 'rb') as dict_file:
    loaded_df = pickle.load(dict_file)

# word -> frequency lookup (a later row wins if a word is repeated)
freq_dic = dict(zip(loaded_df['word'], loaded_df['frequency']))

# (-frequency, word) pairs in ascending order, i.e. most frequent word first
words = sorted(
    (-freq, word)
    for word, freq in zip(loaded_df['word'], loaded_df['frequency'])
)

In [None]:
def get_possible_mistakes(word, just_similar = False):
    """
    Return plausible misspellings of ``word``.

    Candidates come from ``get_similar_strings(word)``. Unless
    ``just_similar`` is set, a candidate is kept only when it is itself a
    dictionary word whose frequency is at most one fifth of the frequency of
    ``word``; strings that are not in the dictionary are never kept.

    Parameters
    ----------
    word : str
        Word to corrupt; looked up in ``freq_dic`` exactly as given.
    just_similar : bool
        When True, return every similar string without frequency filtering.

    Returns
    -------
    list or set
        ``[]`` when ``word`` is not in the dictionary; the raw set of similar
        strings when ``just_similar`` is True; otherwise a list of kept
        candidates sorted from most to least frequent.
    """
    if word not in freq_dic:
        return []

    similar_words = get_similar_strings(word)

    if just_similar:
        return similar_words

    max_freq = freq_dic[word] / 5

    # Explicit membership test: an unknown string must not slip through for
    # very frequent words, otherwise the frequency lookup in the sort below
    # would fail with a KeyError.
    mistakes = [
        similar
        for similar in similar_words
        if similar in freq_dic and freq_dic[similar] <= max_freq
    ]
    mistakes.sort(key=lambda candidate: freq_dic[candidate], reverse=True)
    return mistakes

In [None]:
# Quick check: print every similar string generated for 'você' (no frequency filter).
print(get_possible_mistakes('você', just_similar=True))

In [None]:
def draw_random_number(left, right):
    """
    Draw an integer uniformly at random from the closed range [left, right].

    Requires left <= right (both integers).

    random.randint gives every value the same probability; rounding a scaled
    random float would make the two endpoints half as likely as the others.
    """
    return random.randint(left, right)

def draw_random_quantity():
    """
    Draw how many words to corrupt, between 0 and 4.

    Each value is drawn with probability proportional to the number of times
    it appears in ``qtd_array`` (3/17, 4/17, 6/17, 3/17 and 1/17 for 0..4).
    """
    qtd_array = [0,0,0,1,1,1,1,2,2,2,2,2,2,3,3,3,4]
    # random.choice picks every position with the same probability, so the
    # weights are exactly the multiplicities in the list.
    return random.choice(qtd_array)

In [None]:
# Scratch check of the helper: one draw from the range 0..14.
draw_random_number(left=0, right=14)

In [None]:
# Build (wrong_text, correct_text) pairs: for each sentence, one randomly chosen
# word is replaced by up to five of its possible mistakes, each replacement
# giving one corrupted copy of the sentence.
df = {'wrong_text': [], 'correct_text': []}

wrong_text = df['wrong_text']
correct_text = df['correct_text']
duplicates = set()  # every string already stored in wrong_text

# Raw string: '\w' inside a plain literal is an invalid escape sequence.
# Compiled once instead of on every loop iteration.
word_pattern = re.compile(r'\w+')

for s in tqdm(sentences):
    s = s.replace('\n', '')

    # Number of words to corrupt; fixed to one for now.
    #to_mess_up = draw_random_quantity()
    to_mess_up = 1

    curr_s = s

    # Upper bound on attempts, in case the drawn words have no usable mistakes.
    times_run = 10

    while to_mess_up and times_run:
        times_run -= 1
        all_matches = list(word_pattern.finditer(curr_s))

        if not all_matches:
            break

        match = all_matches[draw_random_number(0, len(all_matches) - 1)]

        # Keep the five most frequent candidates, then randomise their order.
        mistakes = get_possible_mistakes(match.group())[:5]

        if not mistakes:
            continue

        random.shuffle(mistakes)

        beg, en = match.span()

        for mistake in mistakes:
            aux = curr_s[:beg] + mistake + curr_s[en:]
            if aux in duplicates:
                continue
            duplicates.add(aux)
            wrong_text.append(aux)
            correct_text.append(s)

        # Chaining several corruptions on the same sentence is disabled:
        #curr_s = curr_s[:beg]+mistakes[0]+curr_s[en:]

        to_mess_up -= 1

    # curr_s is never modified above, so this stores the untouched sentence as
    # a pair whose wrong and correct sides are identical (unless the same
    # string was already stored).
    if curr_s not in duplicates:
        wrong_text.append(curr_s)
        correct_text.append(s)
        duplicates.add(curr_s)



In [None]:
# Number of stored pairs (one 'wrong_text' entry per pair).
len(df['wrong_text'])

In [None]:
# Turn the two parallel lists into a DataFrame with columns 'wrong_text' and
# 'correct_text' (the name `df` is rebound from the dict to the frame).
df = pd.DataFrame(data=df)

In [None]:
# Display the resulting frame.
df

In [None]:
# Serialise the annotated frame to disk with pickle.
with open('data/annotated_data.pickle', 'wb') as out_file:
    pickle.dump(df, out_file)