In [None]:
import nltk
import re  # Load the Regex-modul
from urllib.request import urlopen


def generateDictionaryByFreqAndLen():
    """Download the corpus and build the spell checker's two lookup tables.

    The book is fetched from dev.gutenberg.org, lower-cased, cut down to the
    part between the Project Gutenberg header and footer markers, cleaned of
    punctuation, digits and words shorter than three characters, and finally
    split into tokens with ``nltk.word_tokenize``.

    Returns:
        dict: ``{"frequency": {token: occurrences}, "length": {n: [tokens]}}``
        where ``"frequency"`` counts every occurrence of a token and
        ``"length"`` groups the distinct tokens by their character count.

    Raises:
        urllib.error.URLError: propagated from ``urlopen`` when the download
            fails.
    """
    print(">> downloading File: fremde_strassen.txt from dev.gutenberg.org")
    url = "https://dev.gutenberg.org/files/54597/54597-0.txt"
    # The context manager closes the HTTP response even if decoding fails.
    with urlopen(url) as response:
        raw = response.read().decode('utf-8').lower()

    # Cut out the English Project Gutenberg header and footer.  str.find
    # returns -1 for a missing marker, which would silently mis-slice the
    # text, so fall back to the real start/end of the download instead.
    start_text = "(http://www.pgdp.net)"
    start_index = raw.find(start_text)
    start_index = 0 if start_index == -1 else start_index + len(start_text)
    end_index = raw.find("***end of the project gutenberg ebook")
    if end_index == -1:
        end_index = len(raw)
    text = raw[start_index:end_index]

    print(">> preparing the text")

    text = re.sub(r'\+[va]\.\+', "", text)  # remove all +a.+ and +v.+ markers
    text = re.sub(r'[»«~+→{}()\[\];*?à\':&!,.\-]|[0-9]', "", text)  # remove all useless symbols and numbers
    text = re.sub(r'ß', "ss", text)  # replace all ß with ss
    text = re.sub(r'\b\w{1,2}\b', "", text)  # remove all words shorter than 3

    print(">> creating the dictionary")
    tokens = nltk.word_tokenize(text)

    # token -> number of occurrences (each occurrence counts exactly once)
    dictionary_of_frequency = {}
    for word in tokens:
        dictionary_of_frequency[word] = dictionary_of_frequency.get(word, 0) + 1

    word_set = set(tokens)
    print("\t -> size of dictionary: " + str(len(word_set)) + " different words")

    # word length -> list of distinct words of that length; setdefault keeps
    # the keys purely numeric (no stray string keys end up in the table)
    dictionary_of_length = {}
    for word in word_set:
        dictionary_of_length.setdefault(len(word), []).append(word)

    print(">> program is ready to use \n")

    return {"frequency": dictionary_of_frequency, "length": dictionary_of_length}


# ANSI escape sequences used to colour the console output.
TGREEN = '\033[32m'  # switch the text colour to green
TYELLOW = '\033[33m'  # switch the text colour to yellow
ENDC = '\033[m'  # switch back to the terminal defaults

# Build both lookup tables once at start-up so that every query can reuse
# them: one keyed by word (frequency), one keyed by word length.
generated_dictionary = generateDictionaryByFreqAndLen()
dictionary_by_frequency, dictionary_by_length = (
    generated_dictionary.get(table_name) for table_name in ("frequency", "length")
)


# Pretty-print the suggestions found at one edit distance, six per row.
def printDistAndFreq(list, level):
    """Print a heading for ``level`` followed by the entries of ``list``.

    ``list`` holds dicts with the keys ``'word'`` and ``'freq'``.  Each entry
    is rendered as the highlighted word plus its frequency in parentheses,
    padded to 25 characters; entries are separated by " | " and a line break
    follows every sixth one.  A final empty ``print`` closes the output.
    """
    print(f"Word(s) with distance of {level}:")

    if len(list) == 0:
        print("")
        return

    for position, entry in enumerate(list, start=1):
        label = entry.get('word')
        count = str(entry.get('freq'))
        cell = f"{TYELLOW} {label} {ENDC} ({count})".ljust(25)
        separator = " | " if position % 6 != 0 else "\n"
        print(cell, end=separator)

    print("")


# Largest edit distance for which suggestions are collected and printed.
MAX_DISTANCE = 3

# Interactive loop: read a word, report its frequency when it is known,
# otherwise suggest known words within MAX_DISTANCE edits of it.
while True:
    try:
        wordInput = input("Enter Word: ").strip()
    except EOFError:
        # stdin was closed (e.g. piped input ran out): leave like EXIT does
        break

    if wordInput == "EXIT":
        break
    if not wordInput:
        # nothing typed: ask again instead of matching every short word
        continue
    wordInput = wordInput.lower()

    if wordInput in dictionary_by_frequency:
        # Known word: a plain lookup answers the query, no distances needed.
        print(f"{TGREEN}{wordInput}{ENDC} exist {dictionary_by_frequency[wordInput]} times in the Dictionary.")
    else:
        # Two words whose lengths differ by k are at least k edits apart, so
        # only words within MAX_DISTANCE characters of the input length can
        # be suggestions.  A narrower window would miss valid candidates.
        wordLength = len(wordInput)
        dictionary_of_nearest = []
        for offset in range(-MAX_DISTANCE, MAX_DISTANCE + 1):
            dictionary_of_nearest.extend(dictionary_by_length.get(wordLength + offset) or [])

        # edit distance -> list of {"freq": ..., "word": ...} suggestions
        distanceDict = {d: [] for d in range(1, MAX_DISTANCE + 1)}
        for word in dictionary_of_nearest:
            # nltk.edit_distance gives the edit distance between the input
            # word and the candidate
            distance = nltk.edit_distance(wordInput, word)
            if 1 <= distance <= MAX_DISTANCE:
                distanceDict[distance].append(
                    {"freq": dictionary_by_frequency.get(word), "word": word})

        print(
            f"The word '{TGREEN}{wordInput}{ENDC}' exist not in the Dictionary. \nHere are my top sorted suggestions by distance (1 to {MAX_DISTANCE}) and word frequency:\n")
        for i in range(1, MAX_DISTANCE + 1):
            if distanceDict[i]:
                # most frequent suggestions first; ties ordered by word
                distanceDict[i].sort(key=lambda k: (k["freq"], k["word"]), reverse=True)
                printDistAndFreq(distanceDict[i], str(i))
            else:
                print(f"Exist no words in distance with {i}.")

    print("")
    print("New try or type EXIT to terminate")

print("Terminate")