In [1]:
%pip install jaro-winkler textdistance pyphonetics wordfreq cyhunspell

Defaulting to user installation because normal site-packages is not writeable
Collecting jaro-winkler
  Downloading jaro_winkler-2.0.0-py3-none-any.whl (33 kB)
Collecting textdistance
  Downloading textdistance-4.2.2-py3-none-any.whl (28 kB)
Collecting pyphonetics
  Downloading pyphonetics-0.5.3-py2.py3-none-any.whl (10 kB)
Collecting wordfreq
  Downloading wordfreq-2.5.1.tar.gz (56.8 MB)
[K     |████████████████████████████████| 56.8 MB 29 kB/s  eta 0:00:01
[?25hCollecting cyhunspell
  Downloading cyhunspell-2.0.2-cp37-cp37m-manylinux2010_x86_64.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 21.6 MB/s eta 0:00:01
[?25hCollecting cacheman>=2.0.6
  Downloading CacheMan-2.1.0-py2.py3-none-any.whl (12 kB)
Collecting unidecode<2,>=1
  Downloading Unidecode-1.3.2-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 54.9 MB/s 
[?25hCollecting msgpack>=1.0
  Downloading msgpack-1.0.2-cp37-cp37m-manylinux1_x86_64.whl (273 kB)
[K     |████████████████

In [2]:
import nltk
nltk.download('punkt', quiet=True)

True

In [3]:
import numpy as np
from nltk.tokenize import word_tokenize
from hunspell import Hunspell
from pyphonetics import RefinedSoundex

from jaro import jaro_winkler_metric
from textdistance import levenshtein
from wordfreq import word_frequency

In [None]:
def sort_candidates(word, candidates):
    if len(candidates) == 0:
            return []
    
    

In [8]:
hunspell = Hunspell('en_US')
soundex = RefinedSoundex()


def rank_suggestions(word, suggestions, n_suggestions):
    features = np.zeros((len(suggestions), 4))
    for i, suggestion in enumerate(suggestions):
        text_edit_distance = levenshtein(word, suggestion)
        phoneme_edit_distance = soundex.distance(word, suggestion, metric='levenshtein')
        suggestion_prob = word_frequency(suggestion, 'en')
        jw_distance = jaro_winkler_metric(word, suggestion)
        
        features[i] = np.array([text_edit_distance, phoneme_edit_distance, 1 - suggestion_prob, jw_distance])
    
    norm_features = ((features - features.min(axis=0)) / (features.max(axis=0) - features.min(axis=0) + 1e-9))
    weights = norm_features.sum(axis=1)
    ranked_suggestions = [suggestions[i] for i in np.argsort(weights)]
    return ranked_suggestions[:n_suggestions]


def spellcheck(text: str, n_suggestions: int):
    words = word_tokenize(text, 'english')
    
    for i, word in enumerate(words):
        if not word.isalpha():
            continue
        if hunspell.spell(word):
            continue
        
        suggestions = hunspell.suggest(word)
        print(f'Word "{word}" at position {i} is probably misspelled.')
        if not suggestions:
            print(f'No fixes available.')
        else:
            print('Available fixes:')
            for suggestion in rank_suggestions(word, suggestions, n_suggestions):
                print(f'- {suggestion}')
        print()



In [12]:
text = 'The Hamming distence is named after Richard Hamming, who intrdced the concept in his fundamental paper on Hamming codes, Error detecting and error corectin codes, in 1950. Hamming weight analysis of bits is used in several disciplines including infzzmation theory, coding theory, and cryptography.'
n_suggestions = 4
spellcheck(text, n_suggestions)

Word "distence" at position 2 is probably misspelled.
Available fixes:
- distance
- existence
- insistence
- distend

Word "intrdced" at position 10 is probably misspelled.
Available fixes:
- introduced
- intercede
- interceded

Word "corectin" at position 25 is probably misspelled.
Available fixes:
- correction
- corrector
- incorrect
- corrective

Word "infzzmation" at position 42 is probably misspelled.
Available fixes:
- information
- inflammation
- intimation



