In [None]:
import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
import random, pickle, re
from tqdm import tqdm
import pandas as pd
from datasets import load_dataset, Dataset

In [None]:
# Brazilian Portuguese alphabet: the 26 basic Latin letters plus the accented
# vowels and the cedilla listed below. Both strings are interpolated into regex
# character classes ([...]) further down in the notebook.
lower_case = r'abcdefghijklmnopqrstuvwxyzáàâãéêíóôõúç'
upper_case = r'ABCDEFGHIJKLMNOPQRSTUVWXYZÁÀÂÃÉÊÍÓÔÕÚÇ'

In [None]:
# Plain-text file used by the (currently commented-out) save/load cells to
# persist the sentence list, one sentence per line.
path='../../data/portuguese_sentences.txt'

# Generate sentences

In [None]:
# Load the Carolina corpus through `datasets.load_dataset`.
# NOTE(review): presumably this downloads from the Hugging Face Hub on first use and may be large — confirm.
carolina = load_dataset('carolina-c4ai/corpus-carolina')

In [None]:
# Take the 'text' field of the 'corpus' entry and report how many texts it holds.
# NOTE(review): presumably this materialises every text as a Python string in memory — confirm it fits.
carolina_text = carolina['corpus']['text']
print(len(carolina_text))

In [None]:
# EPUB books (base names, without extension) that feed the sentence corpus.
list_books = ['a_guerra_dos_tronos','linha_d_agua','o_alienista', 'ensaio_sobre_a_cegueira', 'sapiens', 'o_guarani', 'colecao_especial_jane_austen', 'o_livro_das_princesas','a_falencia', 'sob_a_redoma', 'os_cem_melhores_contos_brasileiros_do_seculo']
# Alternative book set, kept for reference:
#list_books = ['os_tres_mosqueteiros', 'harry_potter_e_a_ordem_da_fenix', 'grande_sertao_veredas', 'a_redoma_de_vidro', 'aristoteles_e_dante_descobrem_os_segredos_do_universo', 'como_evitar_preocupacoes_e_comecar_a_viver']
# Append the file extension; process_book looks the files up under ../../data/epubs/.
list_books = [book+'.epub' for book in list_books]

In [None]:
def process_book(book_name):
    """
    Reads '../../data/epubs/<book_name>' and returns the text of every
    <p> element of every document item as a single string.

    Paragraphs and chapters are glued together with no separator, so the
    end of one paragraph runs directly into the start of the next one.
    """
    def chapter_to_str(chapter):
        # keep only the paragraph text of the chapter body
        soup = BeautifulSoup(chapter.get_body_content(), 'html.parser')
        return ''.join(para.get_text() for para in soup.find_all('p'))

    book = epub.read_epub(f'../../data/epubs/{book_name}')
    document_items = book.get_items_of_type(ebooklib.ITEM_DOCUMENT)
    return ''.join(chapter_to_str(item) for item in document_items)

In [None]:
# Extract every book and join the resulting texts with a single space.
raw_text = ' '.join([process_book(book) for book in list_books])

In [None]:
# Split on sentence-ending punctuation (. ? ! ;) and on newlines.
# re.split drops the delimiters themselves.
regex = r'\.|\?|!|;|\n'
sentences = re.split(regex, raw_text)
# Apply the same split to every Carolina text and pool the pieces with the book sentences.
for carol in carolina_text:
    splits = re.split(regex, carol)
    sentences.extend(splits)

In [None]:
# Drop empty and single-character fragments left over by the split.
sentences = [ s for s in sentences if len(s) > 1]

In [None]:
# Number of candidate sentences after filtering.
len(sentences)

In [None]:
# Keep a random sample of at most 3,000,000 sentences.
# The RNG is seeded first so that, for the same input list, the same sample
# is selected on every run (the original shuffle was unseeded).
random.seed(42)
random.shuffle(sentences)
sentences = sentences[:3_000_000]

In [None]:
def chop_sentence(str: str) -> list[str]:
    """
    Cuts a sentence at every position where a lowercase letter is
    immediately followed by an uppercase letter and returns the pieces.
    No character is removed by the cut.
    Ex.: 'GeorgeMartin' -> ['George', 'Martin']
    """
    # zero-width boundary: lowercase letter behind, uppercase letter ahead
    boundary = re.compile(rf'(?<=[{lower_case}])(?=[{upper_case}])')
    return boundary.split(str)

In [None]:
def clean_sentence(str: str) -> str:
    """
    Normalizes punctuation and spacing at the edges of a sentence.

    Steps, in order:
      1. every newline becomes a space;
      2. leading '.', ',', ':', '!', '?' and ';' characters are removed;
      3. runs of spaces collapse to one space and the spaces at both
         ends are removed.

    Returns the cleaned string (possibly empty).
    """
    # str.replace returns a new string; the result has to be kept
    # (the original call discarded it, so newlines were never replaced)
    str = str.replace('\n', ' ')

    # punctuation is stripped before the spaces, so ' .foo' keeps its dot
    str = str.lstrip('.,:!?;')

    # splitting on ' ' and dropping the empty pieces collapses repeated
    # spaces and trims both ends in one pass
    return ' '.join(piece for piece in str.split(' ') if piece)

In [None]:
def apply_and_concatenate(func, args):
    """
    Calls func on each element of args, in order, and returns one flat
    list with the items of every result:
    func(args[0]) + func(args[1]) + ...
    """
    return [item for obj in args for item in func(obj)]

In [None]:
def normalize_sentence(str: str) -> list[str]:
    """
    Cleans a sentence and splits it at lowercase/uppercase boundaries.

    Returns a list with the cleaned pieces. Pieces that are empty or
    have a single character after cleaning are dropped, so the list
    may be empty.
    """
    cleaned = clean_sentence(str)

    # nothing worth keeping
    if len(cleaned) <= 1:
        return []

    pieces = chop_sentence(cleaned)

    # base of recursion: the sentence could not be chopped any further
    if len(pieces) == 1:
        return pieces

    # each piece is cleaned (and chopped) again on its own
    return [out for piece in pieces for out in normalize_sentence(piece)]

In [None]:
# Clean every sentence and split run-together ones; each input yields zero, one or several outputs.
sentences = apply_and_concatenate(normalize_sentence, sentences)

In [None]:
# Remove repeated sentences, keeping the first occurrence of each one.
# dict.fromkeys preserves insertion order, so the resulting list order is
# deterministic; going through a set of strings gives an order that can change
# between interpreter runs because of string hash randomisation.
print(f'Size with duplicates: {len(sentences)}')
sentences = list(dict.fromkeys(sentences))
print(f'Size without duplicates: {len(sentences)}')

In [None]:
# with open(path, 'w') as file:
#     file.write('\n'.join(sentences))
#     file.close()

# Generate annotated data

### Helper for generating similar strings

In [None]:
# Letters a typist may hit instead of the key letter: neighbouring keys plus,
# for vowels and 'c', their accented / cedilla forms.
# A letter is never listed as its own neighbour ('r' used to contain 'r',
# which would make a "replacement" that changes nothing).
# NOTE(review): some entries look questionable for a physical layout
# (e.g. 'q' -> 'u') and the relation is not symmetric — confirm against the
# intended keyboard layout before relying on this table.
keyboard_adjacent_letters_pt = {
    'a': ['s', 'z', 'q', 'w', 'á', 'à', 'â', 'ã'],
    'b': ['v', 'g', 'n', 'h'],
    'c': ['x', 'd', 'v', 'f', 'ç'],
    'd': ['s', 'e', 'c', 'x', 'f', 'r'],
    'e': ['w', 'r', 'd', 's', 'é', 'ê'],
    'f': ['d', 'r', 'g', 'v', 'c', 't'],
    'g': ['f', 't', 'h', 'b', 'v', 'r'],
    'h': ['g', 't', 'j', 'n', 'b', 'y'],
    'i': ['u', 'o', 'k', 'j', 'í'],
    'j': ['h', 'y', 'k', 'n', 'm', 'u', 'i'],
    'k': ['j', 'i', 'l', 'm', 'o', 'n'],
    'l': ['k', 'o', 'p', 'm'],
    'm': ['n', 'j', 'k', 'l'],
    'n': ['b', 'h', 'j', 'm'],
    'o': ['i', 'p', 'l', 'k', 'ó', 'ô', 'õ'],
    'p': ['o', 'l', 'ç'],
    'q': ['a', 'z', 'u'],
    'r': ['e', 't', 'f', 'd'],
    's': ['a', 'w', 'e', 'd', 'x', 'z'],
    't': ['r', 'y', 'g', 'f'],
    'u': ['y', 'j', 'i', 'h', 'ú'],
    'v': ['c', 'f', 'g', 'b'],
    'w': ['q', 'a', 's', 'e'],
    'x': ['z', 's', 'd', 'c'],
    'y': ['t', 'u', 'h', 'g'],
    'z': ['x', 's', 'a', 'ç'],
    'ç': ['c'],
}


In [None]:
def get_similar_strings(str, x = None, adjacent_letters = True):
    """
    Takes in a string and returns a set of similar strings
    (misspelling candidates). The input is lower-cased first.

    The result is the union of:

    * every string obtained by applying the SAME edit 'x' times in a
      row (different kinds of edit are never mixed), where an edit is:
      -> insert a letter of the Portuguese alphabet at any position
      -> delete a letter (the last remaining letter is never deleted)
      -> replace a letter by any letter of the Portuguese alphabet;
         a letter may be replaced by itself, so up to 'x' positions
         change
    * the word with exactly one character upper-cased, one variant per
      position (so the result is NOT purely lowercase)
    * common Portuguese errors applied to the whole word:
      'ss' <-> 'ç' and 'ão' <-> 'am'

    The lower-cased word itself is never part of the result.

    Parameters:
      str: the word to generate candidates for.
      x: how many times each edit is chained. If None, it will be:
         -> 1, if len(str) <= 6
         -> 2, otherwise
         The amount of candidates grows exponentially with 'x'.
         With x <= 0 no insert/delete/replace edit is applied.
      adjacent_letters: accepted for backward compatibility but
         currently ignored; replacements always use the whole alphabet.
         TODO: restrict replacements to keyboard neighbours when set.

    Returns:
      A set of strings.
    """
    word = str.lower()

    if x is None:
        x = 1 if len(word) <= 6 else 2

    # Lower-cased Portuguese alphabet. The literal lists the accented letters
    # in both cases, so after lower() each of them appears twice;
    # dict.fromkeys drops the repeats (keeping the order) so that no
    # candidate is generated twice.
    alphabet = ''.join(dict.fromkeys(
        'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÁÂÃÇÉÊÍÓÔÕÚàáâãçéêíóôõú'.lower()
    ))

    def insert(words):
        """All strings with one extra letter, at any position, for each word."""
        return {
            w[:pos] + char + w[pos:]
            for w in words
            for pos in range(len(w) + 1)
            for char in alphabet
        }

    def delete(words):
        """All strings with one letter removed; one-letter words are skipped."""
        return {
            w[:pos] + w[pos+1:]
            for w in words
            if len(w) > 1
            for pos in range(len(w))
        }

    def replace(words):
        """All strings with one letter swapped for any alphabet letter."""
        return {
            w[:pos] + char + w[pos+1:]
            for w in words
            for pos in range(len(w))
            for char in alphabet
        }

    all_edits = set()

    for edit in (insert, delete, replace):
        # chain the same edit x times; a plain loop also copes with x <= 0,
        # where the recursive composition used before never terminated
        candidates = {word}
        for _ in range(x):
            candidates = edit(candidates)
        all_edits |= candidates

    # capitalization slips: flip the case of one character at a time
    for ix, c in enumerate(word):
        all_edits.add(word[:ix] + c.swapcase() + word[ix+1:])

    # common Portuguese errors
    # ss and ç
    all_edits.add(word.replace('ss', 'ç'))
    all_edits.add(word.replace('ç', 'ss'))

    # ão and am
    all_edits.add(word.replace('ão', 'am'))
    all_edits.add(word.replace('am', 'ão'))

    # the rules above re-add the word itself whenever they do not apply
    all_edits.discard(word)

    return all_edits
    

### Creating annotated data

In [None]:
# sentences = []

# with open(path, 'r') as file:
#     for s in file:
#         sentences.append(s)

In [None]:
# Load the word-frequency dictionary; the cells below read its 'word' and 'frequency' columns.
with open('../../data/dictionary.csv', 'rb') as file:
    loaded_df = pd.read_csv(file)

In [None]:
# Quick look at the loaded dictionary.
loaded_df

In [None]:
# words: (-frequency, word) tuples, so the ascending sort below puts the most frequent words first.
# freq_dic: word -> frequency lookup.
words = []
freq_dic = {}

for word, freq in zip(loaded_df['word'], loaded_df['frequency']):
    # NOTE(review): `words` stores str(word) while `freq_dic` is keyed by the raw
    # cell value; the two differ if pandas parsed an entry as a non-string
    # (e.g. NaN) — confirm this is intended.
    words.append((-freq, str(word)))
    freq_dic[word] = freq

words.sort()

In [None]:
def get_possible_mistakes(word, just_similar = False):
    """
    Returns dictionary words that could plausibly be typed instead of
    'word'.

    * If 'word' is not in freq_dic, returns [].
    * If 'just_similar' is true, returns the raw set produced by
      get_similar_strings(word), without any filtering.
    * Otherwise returns a list with the similar strings that are
      themselves in freq_dic and whose frequency is at most one fifth
      of the frequency of 'word', most frequent first.
    """
    if word not in freq_dic:
        return []
    similar_words = get_similar_strings(word)

    if just_similar:
        return similar_words

    max_freq = freq_dic[word] / 5

    # Membership is tested explicitly: a numeric default for unknown strings
    # lets them through once max_freq exceeds that default, and the sort
    # below would then fail with KeyError.
    mistakes = [
        similar for similar in similar_words
        if similar in freq_dic and freq_dic[similar] <= max_freq
    ]
    mistakes.sort(key=lambda x: freq_dic[x], reverse=True)
    return mistakes

In [None]:
# Sanity check: with just_similar=True the raw candidate set is returned, without the frequency filter.
print(get_possible_mistakes('você', True))

In [None]:
def draw_random_number(left: int, right: int):
    """
    Requires left <= right.
    Returns a random integer N in the closed interval [left, right],
    every value being equally likely.
    """
    # rounding random() * (right - left) gave both endpoints half the
    # probability of the inner values; randint is uniform and inclusive
    return random.randint(left, right)

def draw_random_quantity():
    """
    Returns a random quantity between 0 and 4.

    The value is picked uniformly from qtd_array, so each quantity is
    weighted by how many times it is listed there (2 is the most
    likely, 4 the least).
    """
    qtd_array = [0,0,0,1,1,1,1,2,2,2,2,2,2,3,3,3,4]
    # random.choice gives every position the same probability
    return random.choice(qtd_array)

In [None]:
# Smoke test of the helper.
draw_random_number(0,14)

In [None]:
# Word matcher with three alternatives, wrapped in word boundaries (\b):
#   1. one uppercase letter followed by zero or more lowercase letters
#      (capitalized word);
#   2. a lowercase word, optionally followed by hyphen-joined lowercase parts
#      (hyphenated word);
#   3. optional lowercase letters followed by one uppercase letter that is
#      itself followed by a lowercase letter (lookahead, not consumed).
reg = rf'\b(?:[{upper_case}][{lower_case}]*|[{lower_case}]+(?:-[{lower_case}]+)*|[{lower_case}]*[{upper_case}](?=[{lower_case}]))\b'

In [None]:
# Try the pattern on a few sample words.
re.findall(reg,'Olá guarda-chuva Guarda-chuva paçoca')

In [None]:
# Column-oriented container for the annotated pairs: corrupted text and its original.
df = {'wrong_text': [], 'correct_text': []}

# aliases (the same list objects that live inside df)
wrong_text = df['wrong_text']
correct_text = df['correct_text']
# (wrong, right) pairs already stored; add_to_data uses it to skip repeats
duplicates = set()

In [None]:
def add_to_data(wrong, right):
    """
    Appends the (wrong, right) pair to wrong_text / correct_text
    unless that exact pair has been stored before.
    """
    pair = (wrong, right)
    if pair not in duplicates:
        duplicates.add(pair)
        wrong_text.append(wrong)
        correct_text.append(right)

In [None]:
# a word is a maximal run of letters of the alphabet
word_regex = re.compile(rf'[{lower_case+upper_case}]+')

for s in tqdm(sentences):
    n_words = len(word_regex.findall(s))

    # skip sentences with nothing to corrupt and sentences longer than 10 words
    if n_words == 0 or n_words > 10:
        continue

    # amount of corruption attempts for this sentence
    to_mess_up = draw_random_number(1,5)

    # running version of the sentence: every corruption is applied on top
    # of the previous ones
    curr_s = s

    for _ in range(to_mess_up):
        # positions shift after each replacement, so the words are located again
        all_matches = list(word_regex.finditer(curr_s))

        if not all_matches:
            break

        # random word to corrupt
        match = all_matches[draw_random_number(0,len(all_matches)-1)]

        mistakes = get_possible_mistakes(match.group())
        if not mistakes:
            # this attempt is spent without changing the sentence
            continue

        # pick at random among the 15 most frequent mistakes
        mistakes = mistakes[:15]
        random.shuffle(mistakes)

        # swap the word for the mistake, using the word boundaries in the sentence
        curr_s = curr_s[:match.start()]+mistakes[0]+curr_s[match.end():]

    add_to_data(curr_s, s)



In [None]:
# Preview the first five pairs: corrupted sentence, then the original, then a blank line.
for w,c in zip(wrong_text[:5], correct_text[:5]):
    print(w)
    print(c)
    print()

In [None]:
# Number of annotated pairs collected.
len(df['wrong_text'])

In [None]:
# Turn the dict of columns into a `datasets.Dataset`.
# NOTE(review): `df` is rebound from a dict to a Dataset here, so this cell is
# presumably not safe to run twice without re-running the cell that creates the dict — confirm.
df = Dataset.from_dict(df)

In [None]:
# Show the dataset summary.
df

In [None]:
# Publish the dataset under the given repository id.
# NOTE(review): presumably this needs an authenticated Hugging Face session and network access — confirm before a full re-run.
df.push_to_hub("carolmou/dataset-1")