This code loads all the epubs and the carolina corpus in order to find the frequency of each word in this small sample. Creates a dictionary.csv that maps each word to its frequency.

In [75]:
import ebooklib
from bs4 import BeautifulSoup
from ebooklib import epub
import re
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [76]:
regex = r'\.|\?|!|;|\n'

In [77]:
list_books = ['a_guerra_dos_tronos','linha_d_agua','o_alienista', 'ensaio_sobre_a_cegueira', 'sapiens', 'o_guarani', 'colecao_especial_jane_austen', 'o_livro_das_princesas','a_falencia', 'sob_a_redoma', 'os_cem_melhores_contos_brasileiros_do_seculo', 'os_tres_mosqueteiros', 'harry_potter_e_a_ordem_da_fenix', 'grande_sertao_veredas', 'a_redoma_de_vidro', 'aristoteles_e_dante_descobrem_os_segredos_do_universo', 'como_evitar_preocupacoes_e_comecar_a_viver']
list_books = [book+'.epub' for book in list_books]

In [78]:
def process_book(book_name):
    book = epub.read_epub(f'../../data/epubs/{book_name}')
    items = list(book.get_items_of_type(ebooklib.ITEM_DOCUMENT))
    def chapter_to_str(chapter):
        soup = BeautifulSoup(chapter.get_body_content(), 'html.parser')
        text = [para.get_text() for para in soup.find_all('p')]
        return ''.join(text)
    texts = ""
    for c in items:
        chapter = chapter_to_str(c)
        texts += chapter
    return texts 

In [79]:
carolina = load_dataset('carolina-c4ai/corpus-carolina')

Found cached dataset corpus-carolina (/home/carolmou/.cache/huggingface/datasets/carolina-c4ai___corpus-carolina/carolina/1.2.0/60fe73ac1719891e34135322031692bf177e9323e830d620cf3304f535ee2693)
100%|██████████| 1/1 [00:01<00:00,  1.26s/it]


In [80]:
carolina_text = carolina['corpus']['text']

In [81]:
raw_text = ' '.join([process_book(book) for book in list_books])



In [82]:
sentences = re.split(regex, raw_text)

In [83]:
sentences += carolina_text

In [84]:
# brazilian alphabet
lower_case = r'abcdefghijklmnopqrstuvwxyzáàâãéêíóôõúç'
upper_case = r'ABCDEFGHIJKLMNOPQRSTUVWXYZÁÀÂÃÉÊÍÓÔÕÚÇ'

# matches all lower case words or word with the first upper character
reg = rf'\b(?:[{upper_case}][{lower_case}]*|[{lower_case}]+(?:-[{lower_case}]+)*|[{lower_case}]*[{upper_case}](?=[{lower_case}]))\b'

In [85]:
freq = {}

for sent in sentences:
    words = re.findall(reg, sent)
    for w in words:
        freq[w] = freq.get(w, 0) + 1

In [86]:
df = {'word': [], 'frequency': []}

In [87]:
for w, f in freq.items():
    df['word'].append(w)
    df['frequency'].append(f)

In [88]:
df = pd.DataFrame(df)

In [90]:
df[:100]

Unnamed: 0,word,frequency
0,Ficha,6422
1,George,52633
2,R,460650
3,os,4143352
4,direitos,118656
...,...,...
95,a,20358990
96,escurecer,742
97,ao,3065733
98,redor,53747


In [91]:
df.to_csv('../../data/dictionary.csv')