# Typos correction

In [133]:
examples = [
    "вот в инете откапал такую инеерсную статейку предлагаю вашему внимани",
    "может и в правду лутше тебе молчать чем пытаться сказать",
    "утром мы сидели как сычи а потом каааак начали работать"
]

import time

def correct_examples(corrector_func, library_name, iterations_cnt = 300, is_text_shown = False):
    start_time = time.time()
    for i in range(iterations_cnt):
      if 'JamSpell' == library_name or 'YandexSpeller' == library_name:
        for example in examples:
          corrected = corrector_func(example)   
          if is_text_shown:   
            print("Corrected: ", corrected)  
      else:
        corrected = corrector_func(examples)
        if is_text_shown:
          print("Corrected:")
          for item in corrected:
            print(item)

    end_time = time.time() # Record the end time
    elapsed_time = end_time - start_time # Calculate elapsed time
    print("Elapsed time (seconds) for {} library work ({} loops): {}".format(library_name, iterations_cnt, elapsed_time))

### 1. Measure the running time of three methods from libraries

#### Jamspell

In [1]:
! apt install swig3.0
! pip install jamspell

Reading package lists... Done
Building dependency tree       
Reading state information... Done
swig3.0 is already the newest version (3.0.12-2.2ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 25 not upgraded.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
!wget -q https://github.com/bakwc/JamSpell-models/raw/master/ru.tar.gz && tar -xvf ru.tar.gz

ru_small.bin


In [107]:
import jamspell

corrector = jamspell.TSpellCorrector()
corrector.LoadLangModel('ru_small.bin')

True

In [134]:
# Show jamspell work
correct_examples(corrector.FixFragment, 'JamSpell', 1, True)
# Show jamspell time efficiency
correct_examples(corrector.FixFragment, 'JamSpell')

Corrected:  вот в инете отказал такую интересную статейку предлагаю вашему вниманию
Corrected:  может и в правду лучше тебе молчать чем пытаться сказать
Corrected:  утром мы сидели как сычи а потом каааак начали работать
Elapsed time (seconds) for JamSpell library work (1 loops): 0.018433570861816406
Elapsed time (seconds) for JamSpell library work (300 loops): 2.9468600749969482


#### DeepPavlov

In [40]:
! pip install -q deeppavlov
! pip install -q sacremoses
! pip install -q https://github.com/kpu/kenlm/archive/master.zip

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for kenlm (pyproject.toml) ... [?25l[?25hdone


In [41]:
from deeppavlov import configs, build_model

In [42]:
# Download the model of typo fixing
model = build_model(configs.spelling_correction.levenshtein_corrector_ru, download=True)

2023-04-13 23:11:54.574 INFO in 'deeppavlov.download'['download'] at line 138: Skipped http://files.deeppavlov.ai/lang_models/ru_wiyalen_no_punkt.arpa.binary.gz download because of matching hashes
INFO:deeppavlov.download:Skipped http://files.deeppavlov.ai/lang_models/ru_wiyalen_no_punkt.arpa.binary.gz download because of matching hashes
2023-04-13 23:11:55.429 INFO in 'deeppavlov.download'['download'] at line 138: Skipped http://files.deeppavlov.ai/deeppavlov_data/vocabs/russian_words_vocab.dict.gz download because of matching hashes
INFO:deeppavlov.download:Skipped http://files.deeppavlov.ai/deeppavlov_data/vocabs/russian_words_vocab.dict.gz download because of matching hashes


In [135]:
# Show DeepPavlov work
correct_examples(model, 'DeepPavlov', 1, True)
# Show DeepPavlov time efficiency
correct_examples(model, 'DeepPavlov')

Corrected:
вот в инее отказал такую инверсную статейку предлагаю вашему вниманию
может и в правду лучше тебе молчать чем пытаться сказать
утром мы сидели как сычи а потом каааак начали работать
Elapsed time (seconds) for DeepPavlov library work (1 loops): 0.19684505462646484
Elapsed time (seconds) for DeepPavlov library work (300 loops): 35.72472333908081


#### YandexSpeller

In [84]:
! pip install -q pyaspeller

In [120]:
from pyaspeller import YandexSpeller
speller = YandexSpeller()

In [137]:
# Show YandexSpeller work
correct_examples(speller.spelled, 'YandexSpeller', 1, True)
# Show YandexSpeller time efficiency
correct_examples(model, 'YandexSpeller')

Corrected:  вот в инете откопал такую интересную статейку предлагаю вашему вниманию
Corrected:  может и в правду лучше тебе молчать чем пытаться сказать
Corrected:  утром мы сидели как сычи а потом каааак начали работать
Elapsed time (seconds) for YandexSpeller library work (1 loops): 1.7286052703857422
Elapsed time (seconds) for YandexSpeller library work (300 loops): 80.13619661331177


## 2. Damerau-Levenshtein function

In [149]:
! pip install -q python-Levenshtein

In [169]:
import Levenshtein

fixed_examples = [
    "вот в инете откопал такую интересную статейку предлагаю вашему вниманию",
    "может и в правду лучше тебе молчать чем пытаться сказать",
    "утром мы сидели как сычи а потом как начали работать",
]

start_time = time.time()
for i in range(1000):
  for j, example in enumerate(examples):
    distance = Levenshtein.distance(examples[j], fixed_examples[j])

end_time = time.time() # Record the end time
elapsed_time = end_time - start_time # Calculate elapsed time
print("Elapsed time (seconds) for Levenshtein.distance: {}".format(elapsed_time))

Elapsed time (seconds) for Levenshtein.distance: 0.005997419357299805


## 3. Typos due to key slips on the keyboard function

In [165]:
import random

def generate_typo(word):
    keyboard = {
        'q': ['w', 'a'],
        'w': ['q', 'e', 'a', 's'],
        'e': ['w', 'r', 's', 'd'],
        'r': ['e', 't', 'd', 'f'],
        't': ['r', 'y', 'f', 'g'],
        'y': ['t', 'u', 'g', 'h'],
        'u': ['y', 'i', 'h', 'j'],
        'i': ['u', 'o', 'j', 'k'],
        'o': ['i', 'p', 'k', 'l'],
        'p': ['o', 'l'],
        'a': ['q', 'w', 's', 'z'],
        's': ['w', 'a', 'd', 'z', 'x'],
        'd': ['e', 's', 'f', 'x', 'c'],
        'f': ['r', 'd', 'g', 'c', 'v'],
        'g': ['t', 'f', 'h', 'v', 'b'],
        'h': ['y', 'g', 'j', 'b', 'n'],
        'j': ['u', 'h', 'k', 'n', 'm'],
        'k': ['i', 'j', 'l', 'm'],
        'l': ['o', 'k', 'p'],
        'z': ['a', 's', 'x'],
        'x': ['s', 'd', 'z', 'c'],
        'c': ['d', 'f', 'x', 'v'],
        'v': ['f', 'g', 'c', 'b'],
        'b': ['g', 'h', 'v', 'n'],
        'n': ['h', 'j', 'b', 'm'],
        'm': ['j', 'k', 'n']
    }

    word_list = list(word)
    
    # Pick random place of type among the letters
    idx = random.randint(0, len(word) - 1)
    
    # Get a char from this position
    char = word_list[idx]
    
    # We generate a typo by randomly replacing a character with a neighboring one on the keyboard
    if char.lower() in keyboard:
        typo_char = random.choice(keyboard[char.lower()])
        word_list[idx] = typo_char
    
    return ''.join(word_list)

word = "hello"
typo_word = generate_typo(word)
print(typo_word)

hwllo


## 4. Comparing vectors composed of frequencies of letters in a word (CountVectorizer)

In [166]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def compare_word_vectors(word1, word2):
    # Create CountVectorizer object
    vectorizer = CountVectorizer(analyzer='char')
    
    # Combine the two words into a list
    word_list = [word1, word2]
    
    # Fit and transform the word list to obtain word vectors
    word_vectors = vectorizer.fit_transform(word_list)
    
    # Compute cosine similarity between the two word vectors
    similarity = cosine_similarity(word_vectors[0], word_vectors[1])
    
    # Return cosine similarity score
    return similarity[0][0]

In [170]:
start_time = time.time()
for i in range(1000):
  for j, example in enumerate(examples):
    similarity_score = compare_word_vectors(examples[j], fixed_examples[j])

end_time = time.time() # Record the end time
elapsed_time = end_time - start_time # Calculate elapsed time
print("Elapsed time (seconds) for compare_word_vectors: {}".format(elapsed_time))

print("Cosine similarity between '{}' and '{}': {}".format(examples[j], fixed_examples[j], similarity_score))

Elapsed time (seconds) for compare_word_vectors: 8.592718362808228
Cosine similarity between 'утром мы сидели как сычи а потом каааак начали работать' and 'утром мы сидели как сычи а потом как начали работать': 0.987317065004051
