# Spell Checker

## Loading data to build the vocabulary

In [45]:
with open('artigos.txt', 'r') as f:
  articles = f.read()

print(articles[:500])




imagem 

Temos a seguinte classe que representa um usuário no nosso sistema:

java

Para salvar um novo usuário, várias validações são feitas, como por exemplo: Ver se o nome só contém letras, [**o CPF só números**] e ver se o usuário possui no mínimo 18 anos. Veja o método que faz essa validação:

java 

Suponha agora que eu tenha outra classe, a classe `Produto`, que contém um atributo nome e eu quero fazer a mesma validação que fiz para o nome do usuário: Ver se só contém letras. E aí? Vou


In [46]:
len(articles)

2605046

In [47]:
len('Olá')

3

In [48]:
sample_text = 'Olá, tudo bem?'
tokens = sample_text.split()
print(tokens)

['Olá,', 'tudo', 'bem?']


In [49]:
print(len(tokens))

3


## Tokenizing text & normalizing

In [50]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [51]:
splitted_words = nltk.tokenize.word_tokenize(sample_text)
print(splitted_words)

['Olá', ',', 'tudo', 'bem', '?']


In [52]:
len(splitted_words)

5

In [53]:
'palavra'.isalpha()

True

In [54]:
def split_words(tokens_list):
  words_list = []
  for token in tokens_list:
    if token.isalpha():
      words_list.append(token)
  return words_list

split_words(splitted_words)

['Olá', 'tudo', 'bem']

In [55]:
tokens_list = nltk.tokenize.word_tokenize(articles)
words_list = split_words(tokens_list)
print(f"O número de palavras é: {len(words_list)}")

O número de palavras é: 393914


In [56]:
print(words_list[:5])

['imagem', 'Temos', 'a', 'seguinte', 'classe']


In [57]:
def normalization(word_list):
  normalized_list = []
  for word in word_list:
    normalized_list.append(word.lower()) 
  return normalized_list

In [58]:
normalized_list = normalization(words_list)

In [59]:
print(normalized_list[:5])

['imagem', 'temos', 'a', 'seguinte', 'classe']


In [64]:
len(set(normalized_list))

17652

## 1st spellchecker implementation

In [61]:
lst = 'lgica'
(lst[:1], lst[1:])

('l', 'gica')

In [82]:
sample_word = 'lgica'

def insert_letters(slices):
  new_words = []
  letters = 'abcdefghijklmnopqrstuvwxyzàáâãèéêìíîòóôõùúûç'
  for L, R in slices:
    for letter in letters:
      new_words.append(L + letter + R)
  return new_words


def words_generator(word):
  slices = []
  for i in range(len(word) + 1):
    slices.append((word[:i], word[i:]))
  generated_words = insert_letters(slices)
  return generated_words

new_words_generated = word_generator(sample_word)
print(new_words_generated)

['algica', 'blgica', 'clgica', 'dlgica', 'elgica', 'flgica', 'glgica', 'hlgica', 'ilgica', 'jlgica', 'klgica', 'llgica', 'mlgica', 'nlgica', 'olgica', 'plgica', 'qlgica', 'rlgica', 'slgica', 'tlgica', 'ulgica', 'vlgica', 'wlgica', 'xlgica', 'ylgica', 'zlgica', 'àlgica', 'álgica', 'âlgica', 'ãlgica', 'èlgica', 'élgica', 'êlgica', 'ìlgica', 'ílgica', 'îlgica', 'òlgica', 'ólgica', 'ôlgica', 'õlgica', 'ùlgica', 'úlgica', 'ûlgica', 'çlgica', 'lagica', 'lbgica', 'lcgica', 'ldgica', 'legica', 'lfgica', 'lggica', 'lhgica', 'ligica', 'ljgica', 'lkgica', 'llgica', 'lmgica', 'lngica', 'logica', 'lpgica', 'lqgica', 'lrgica', 'lsgica', 'ltgica', 'lugica', 'lvgica', 'lwgica', 'lxgica', 'lygica', 'lzgica', 'làgica', 'lágica', 'lâgica', 'lãgica', 'lègica', 'légica', 'lêgica', 'lìgica', 'lígica', 'lîgica', 'lògica', 'lógica', 'lôgica', 'lõgica', 'lùgica', 'lúgica', 'lûgica', 'lçgica', 'lgaica', 'lgbica', 'lgcica', 'lgdica', 'lgeica', 'lgfica', 'lggica', 'lghica', 'lgiica', 'lgjica', 'lgkica', 'lglica',

In [69]:
## using global variables to improve speed of the program.
## only training

In [71]:
freq = nltk.FreqDist(normalized_list)
freq.most_common(10)
words_total = len(normalized_list)

In [None]:
freq["lógica"]

In [74]:
def probabilityCalc(generated_word):
  return freq[generated_word] / words_total

In [86]:
def checker(word):
  generated_words = words_generator(word)
  correct_word = max(generated_words, key = probabilityCalc)
  return correct_word

In [80]:
probabilityCalc('lógica')

0.00022086039084673304

In [94]:
checker(sample_word)

'alogicaa'

In [98]:
checker('programi')

'aprogrami'

## Evaluating results

In [None]:
def generate_test_data(filename):
  words_test_list = []
  f = open(filename, 'r')
  for line in f:
    correct, wrong = line.split()
    words_test_list.append((correct, wrong))
  f.close()
  return words_test_list

test_list = generate_test_data('palavras.txt')
test_list

In [102]:
def evaluator(tests):
  word_count = len(tests)
  hit = 0
  for correct, wrong in tests:
    fixed_word = checker(wrong)
    if fixed_word == correct:
      hit += 1
  hit_rate = round((hit*100/word_count), 2)
  print(f"Hit rate: {hit_rate}% of {word_count} words.")

evaluator(test_list)

Hit rate: 1.08% of 186 words.


## Improving performance, new approach to the spellchecker

In [104]:
def delete_character(slices):
  new_words = []
  for L, R in slices:
    new_words.append(L + R[1:])
  return new_words

In [109]:
def words_generator(word):
  slices = []
  for i in range(len(word) + 1):
    slices.append((word[:i], word[i:]))
  generated_words = insert_letters(slices)
  generated_words += delete_character(slices)
  return generated_words

sample_word = 'lóigica'
new_words_generated = word_generator(sample_word)
print(new_words_generated)

['alóigica', 'blóigica', 'clóigica', 'dlóigica', 'elóigica', 'flóigica', 'glóigica', 'hlóigica', 'ilóigica', 'jlóigica', 'klóigica', 'llóigica', 'mlóigica', 'nlóigica', 'olóigica', 'plóigica', 'qlóigica', 'rlóigica', 'slóigica', 'tlóigica', 'ulóigica', 'vlóigica', 'wlóigica', 'xlóigica', 'ylóigica', 'zlóigica', 'àlóigica', 'álóigica', 'âlóigica', 'ãlóigica', 'èlóigica', 'élóigica', 'êlóigica', 'ìlóigica', 'ílóigica', 'îlóigica', 'òlóigica', 'ólóigica', 'ôlóigica', 'õlóigica', 'ùlóigica', 'úlóigica', 'ûlóigica', 'çlóigica', 'laóigica', 'lbóigica', 'lcóigica', 'ldóigica', 'leóigica', 'lfóigica', 'lgóigica', 'lhóigica', 'lióigica', 'ljóigica', 'lkóigica', 'llóigica', 'lmóigica', 'lnóigica', 'loóigica', 'lpóigica', 'lqóigica', 'lróigica', 'lsóigica', 'ltóigica', 'luóigica', 'lvóigica', 'lwóigica', 'lxóigica', 'lyóigica', 'lzóigica', 'làóigica', 'láóigica', 'lâóigica', 'lãóigica', 'lèóigica', 'léóigica', 'lêóigica', 'lìóigica', 'líóigica', 'lîóigica', 'lòóigica', 'lóóigica', 'lôóigica', 'lõ

In [110]:
evaluator(test_list)

Hit rate: 41.4% of 186 words.


# 2nd improvement & new approaches for the spellchecker

In [136]:
def insert_letters(slices):
  new_words = []
  letters = 'abcdefghijklmnopqrstuvwxyzáâàãéêèẽíîìĩóôõòúûùũç'
  for L, R in slices:
    for letter in letters:
      new_words.append(L + letter + R)
  return new_words


def delete_character(slices):
  new_words = []
  for L, R in slices:
    new_words.append(L + R[1:])
  return new_words


def change_letter(slices):
  new_words = []
  letters = 'abcdefghijklmnopqrstuvwxyzáâàãéêèẽíîìĩóôõòúûùũç'
  for L, R in slices:
    for letter in letters:
      new_words.append(L + letter + R[1:])
  return new_words


def letter_inverter(slices):
  new_words = []
  for L, R in slices:
    if len(R) > 1:
      new_words.append(L + R[1] + R[0] + R[2:])
  return new_words


def words_generator(word):
  slices = []
  for i in range(len(word) + 1):
    slices.append((word[:i], word[i:]))
  generated_words = insert_letters(slices)
  generated_words += delete_character(slices)
  generated_words += change_letter(slices)
  generated_words += letter_inverter(slices)
  return generated_words

In [137]:
sample_word = 'lgóica'
new_words_generated = word_generator(sample_word)
print(new_words_generated)

['algóica', 'blgóica', 'clgóica', 'dlgóica', 'elgóica', 'flgóica', 'glgóica', 'hlgóica', 'ilgóica', 'jlgóica', 'klgóica', 'llgóica', 'mlgóica', 'nlgóica', 'olgóica', 'plgóica', 'qlgóica', 'rlgóica', 'slgóica', 'tlgóica', 'ulgóica', 'vlgóica', 'wlgóica', 'xlgóica', 'ylgóica', 'zlgóica', 'álgóica', 'âlgóica', 'àlgóica', 'ãlgóica', 'élgóica', 'êlgóica', 'èlgóica', 'ẽlgóica', 'ílgóica', 'îlgóica', 'ìlgóica', 'ĩlgóica', 'ólgóica', 'ôlgóica', 'õlgóica', 'òlgóica', 'úlgóica', 'ûlgóica', 'ùlgóica', 'ũlgóica', 'çlgóica', 'lagóica', 'lbgóica', 'lcgóica', 'ldgóica', 'legóica', 'lfgóica', 'lggóica', 'lhgóica', 'ligóica', 'ljgóica', 'lkgóica', 'llgóica', 'lmgóica', 'lngóica', 'logóica', 'lpgóica', 'lqgóica', 'lrgóica', 'lsgóica', 'ltgóica', 'lugóica', 'lvgóica', 'lwgóica', 'lxgóica', 'lygóica', 'lzgóica', 'lágóica', 'lâgóica', 'làgóica', 'lãgóica', 'légóica', 'lêgóica', 'lègóica', 'lẽgóica', 'lígóica', 'lîgóica', 'lìgóica', 'lĩgóica', 'lógóica', 'lôgóica', 'lõgóica', 'lògóica', 'lúgóica', 'lûgóica'

## New Evaluator

In [138]:
def evaluator(tests, vocabulary):
  word_count = len(tests)
  hit = 0
  unknown = 0
  for correct, wrong in tests:
    fixed_word = checker(wrong)
    if fixed_word == correct:
      hit += 1
    else:
      unknown += (correct not in vocabulary)
  hit_rate = round((hit * 100 / word_count), 2)
  unknown_rate = round((unknown * 100 / word_count), 2)
  print(f"Hit rating: {hit_rate}% of {word_count} words, unknown words rating: {unknown_rate}%.")

vocabulary = set(normalized_list)
evaluator(test_list, vocabulary)

Hit rating: 76.34% of 186 words, unknown words rating: 6.99%.


## Boost attempt to spellchecker

In [139]:
def boosted_words_generator(generated_words):
  new_words = []
  for word in generated_words:
    new_words += words_generator(word)
  return new_words

In [140]:
word = 'lóiigica'

g_words = boosted_words_generator(words_generator(word))
"lógica" in g_words

True

In [141]:
len(g_words)

787396

In [142]:
def new_checker(word):
  generated_words = words_generator(word)
  boosted_words = boosted_words_generator(generated_words)
  all_words = set(generated_words + boosted_words)
  candidates = [word]
  for word in all_words:
    if word in vocabulary:
      candidates.append(word)
  correct_word = max(candidates, key = probabilityCalc)
  return correct_word

In [135]:
new_checker(word)

2


'lógica'

## Checking Results: new Evaluator & spellchecker vs Old

In [None]:
def evaluator(tests, vocabulary):
  word_count = len(tests)
  hit = 0
  unknown = 0
  for correct, wrong in tests:
    fixed_word = new_checker(wrong)
    unknown += (correct not in vocabulary)
    if fixed_word == correct:
      hit += 1
    # else:
    #   print(wrong + "-" + checker(wrong) + "-" + fixed_word)
  hit_rate = round((hit * 100 / word_count), 2)
  unknown_rate = round((unknown * 100 / word_count), 2)
  print(f"Hit rating: {hit_rate}% of {word_count} words, unknown words rating: {unknown_rate}%.")

vocabulary = set(normalized_list)
evaluator(test_list, vocabulary)

In [146]:
def evaluator(tests, vocabulary):
  word_count = len(tests)
  hit = 0
  unknown = 0
  for correct, wrong in tests:
    fixed_word = checker(wrong)
    unknown += (correct not in vocabulary)
    if fixed_word == correct:
      hit += 1
  hit_rate = round((hit * 100 / word_count), 2)
  unknown_rate = round((unknown * 100 / word_count), 2)
  print(f"Hit rating: {hit_rate}% of {word_count} words, unknown words rating: {unknown_rate}%.")

vocabulary = set(normalized_list)
evaluator(test_list, vocabulary)

Hit rating: 76.34% of 186 words, unknown words rating: 6.99%.


In [149]:
word = "lóiigica"
print(new_checker(word))
print(checker(word))

lógica
alóiigica
