In [1]:
import pandas as pd
import itertools
import numpy as np
import re
from collections import defaultdict, Counter
from collections import deque

In [2]:
def clean_word(word):
    regex_match = re.search("([a-zA-Z]+)", word)
    if regex_match is None:
        return None
    return regex_match.group(1)


def read_one_grams():
    with open("1grams.txt", "r") as file_stream:
        one_grams = [clean_word(x.strip().split(" ")[1]) for x in file_stream.readlines()]
        return [x for x in one_grams if x is not None and len(x) > 0]
    

def read_words_base_forms():
    words_base_forms = defaultdict(lambda: [])
    with open("base_forms.txt", "r") as fstream:
        bases_words = map(lambda x: x.split(";")[0:2], fstream.readlines())
        for base, word in bases_words:
            words_base_forms[word].append(base)
    return dict(words_base_forms)


def fetch_corrections_words():
    with open("literowki1.txt") as file_stream:
        return [x.strip().split(" ") for x in file_stream.readlines()]
    
    
def globalize_word(word):
    constructed_chars = []
    for character in word:
        if character == "ą":
            constructed_chars.append("a")
        elif character == "ę":
            constructed_chars.append("e")
        elif character in ["ż", "ź"]:
            constructed_chars.append("z")
        elif character == "ś":
            constructed_chars.append("s")
        elif character == "ó":
            constructed_chars.append("u")
        elif character == "ć":
            constructed_chars.append("c")        
        elif character == "ł":
            constructed_chars.append("l")                    
        elif character == "ń":
            constructed_chars.append("n")                                
        else:
            constructed_chars.append(character)
    return "".join(constructed_chars)
    

In [3]:
words_base_forms = read_words_base_forms()
words = set(words_base_forms.keys())
base_forms = set(itertools.chain(*words_base_forms.values()))
len(words), len(base_forms)

(4668625, 315689)

In [4]:
corrections_words = fetch_corrections_words()

In [5]:
globalized_words_original_words = {}
for word in words:
    globalized_word = globalize_word(word)
    if globalized_word in globalized_words_original_words and word != globalized_word:
        continue
    globalized_words_original_words[globalized_word] = word

In [6]:
cut_words_original_words_ratings = {}
for globalized_word, word in globalized_words_original_words.items():
    for i in range(len(globalized_word)):
        modified_word = f"{globalized_word[:i]}{globalized_word[i + 1:]}"
        if modified_word in cut_words_original_words_ratings:
            continue
        cut_words_original_words_ratings[modified_word] = (word, 1)
    cut_words_original_words_ratings[globalized_word] = (word, 0)

In [10]:
def get_letter_neighbours(letter):
    if letter == "a":
        return {"q", "w", "e", "d", "s", "z", "x", "c", "y"}
    elif letter == "b":
        return {"v", "f", "g", "h", "n", "m", "j", "c"}
    elif letter == "c":
        return {"z", "x", "v", "b", "s", "d", "f", "g"}
    elif letter == "d":
        return {"a", "s", "f", "g", "w", "e", "r", "x", "c", "v"}
    elif letter == "e":
        return {"q", "w", "r", "t", "a", "s", "d", "f"}
    elif letter == "f":
        return {"s", "d", "g", "h", "e", "r", "t", "c", "v", "b", "x"}
    elif letter == "g":
        return {"d", "f", "h", "j", "r", "t", "y", "c", "v", "b", "n"}
    elif letter == "h":
        return {"f", "g", "j", "k", "t", "y", "u", "v", "b", "n", "m"}
    elif letter == "i":
        return {"y", "u", "o", "p", "h", "j", "k", "l"}
    elif letter == "j":
        return {"g", "h", "k", "l", "y", "u", "i", "b", "n", "m"}
    elif letter == "k":
        return {"h", "j", "l", "u", "i", "o", "n", "m"}
    elif letter == "l":
        return {"p", "o", "i", "k", "j", "h", "m", "n"}
    elif letter == "m":
        return {"n", "b", "l", "k", "j", "h", "g", "i"}
    elif letter == "n":
        return {"v", "b", "m", "g", "h", "j", "k", "g"}
    elif letter == "o":
        return {"y", "u", "i", "p", "h", "j", "k", "l"}
    elif letter == "p":
        return {"u", "i", "o", "j", "k", "l", "h", "m"}
    elif letter == "r":
        return {"w", "e", "t", "y", "s", "d", "f", "g"}
    elif letter == "s":
        return {"a", "d", "f", "q", "w", "e", "r", "z", "x", "c", "v"}
    elif letter == "t":
        return {"e", "r", "y", "u", "d", "f", "g", "h"}
    elif letter == "y":
        return {"r", "t", "u", "i", "f", "g", "h", "j"}
    elif letter == "u":
        return {"t", "y", "i", "o", "g", "h", "j", "k"}
    elif letter == "w":
        return {"q", "e", "r", "t", "a", "s", "d", "f"}
    elif letter == "x":
        return {"z", "c", "v", "b", "a", "s", "d", "f"}
    elif letter == "z":
        return {"a", "s", "d", "f", "x", "c", "v", "q"}
    elif letter == "q":
        return {"a", "s", "d", "w", "e", "r", "f", "z"}
    elif letter == "v":
        return {"x", "c", "b", "n", "d", "f", "g", "h"}    
    else:
        raise ValueError(f"Invalid letter: {letter}")
    

def get_word_modifications_weights(word):
    modifications_weights = []
    for i in range(1, len(word)):
        modifications_weights.append((f"{word[:i - 1]}{word[i]}{word[i - 1]}{word[i + 1:]}", 1))
    for i in range(2, len(word)):
        modifications_weights.append((f"{word[:i - 2]}{word[i]}{word[i - 1]}{word[i - 2]}{word[i + 1:]}", 1))
    for i in range(len(word)):
        for letter in get_letter_neighbours(word[i]):
            modifications_weights.append((f"{word[:i]}{letter}{word[i + 1:]}", 1.5))
    for i in range(len(word)):
        modifications_weights.append((f"{word[:i]}{word[i + 1:]}", 1))
    return modifications_weights

    
def find_closest_word_rating(initial_word):
    resolved_words = set()
    words_steps_queue = deque([(initial_word, 0)])
    closest_word = None
    best_steps_count = np.inf
    while len(words_steps_queue) > 0:
        word, steps_count = words_steps_queue.popleft()
        if word in cut_words_original_words_ratings:
            candidate_word, steps_increase = cut_words_original_words_ratings[word]
            if steps_count + steps_increase < best_steps_count:
                best_steps_count = steps_count + steps_increase
                closest_word = candidate_word                
        if steps_count >= 5 or steps_count >= best_steps_count or len(resolved_words) >= 2_000_000:
            break
        modifications_weights = get_word_modifications_weights(word)
        for modification, weight in modifications_weights:
            if modification in resolved_words:
                continue
            words_steps_queue.append((modification, steps_count + weight))
            resolved_words.add(modification)
    return closest_word

In [11]:
def get_accuracy(corrections_words):
    positive_counter = 0
    for correction, word in corrections_words:
        if correction == find_closest_word_rating(globalize_word(word)):
            positive_counter += 1
    return positive_counter / len(corrections_words)

In [12]:
get_accuracy(corrections_words)

0.7024793388429752

In [229]:
find_closest_word_rating(globalize_word(corrections_words[21][1]))

'korektorek'