In [25]:
import functools
import math
import re
from tkinter import *
from tkinter import ttk

In [26]:
# Load Norvig's word-count data. After splitting on tabs and newlines the
# tokens are expected to alternate: word, count, word, count, ...
with open('norvig_dataset.txt', 'r') as file:
    words = file.read()

word_list = re.split('[\t\n]', words)

# Pair each even-indexed token (a word) with the token right after it (its
# count). zip() stops at the shorter slice, so a dangling last token (for
# example the empty string left by a trailing newline) is ignored.
word_dict = {}
total_count = 0
for token, raw_count in zip(word_list[0::2], word_list[1::2]):
    count = int(raw_count)
    word_dict[token] = count
    total_count += count

# Convert the raw counts into relative frequencies (count / total of all counts).
for token, token_count in word_dict.items():
    word_dict[token] = token_count / total_count
    
    
def word_prob(word):
    """
    Returns the relative frequency of ``word`` in the normalised ``word_dict`` table.

    A word that is not in the table gets the fallback value
    ``1 / (total_count * 10**(len(word) - 2))``, so every extra character makes
    an unseen word ten times less likely.
    """
    if word not in word_dict:
        # Unseen word: penalise by length instead of returning zero.
        length_penalty = 10 ** (len(word) - 2)
        return 1 / (total_count * length_penalty)

    return word_dict[word]

In [27]:
import functools
import math


def seq_prob(words):
    """
    Scores a sequence of words as the sum of log10(word_prob(w)) over its words.

    The value is a log-probability (base 10), not a probability: higher (closer
    to zero) means more likely. An empty sequence scores 0.
    """
    # Plain left-to-right accumulation starting from the integer 0.
    return functools.reduce(
        lambda running, w: running + math.log10(word_prob(w)),
        words,
        0,
    )


def all_pairs(word):
    """
    Lists every way to cut ``word`` into a non-empty prefix and the remaining suffix.

    Pairs are ordered by growing prefix length; the last pair is the whole word
    with an empty suffix. An empty ``word`` gives an empty list.

    Ex: "hello" returns [('h', 'ello'), ('he', 'llo'), ('hel', 'lo'), ('hell', 'o'), ('hello', '')]
    """
    return [(word[:cut], word[cut:]) for cut in range(1, len(word) + 1)]


def word_segment(word):
    """
    Gives the most likely segmentation of a given string without spaces

    For example, "thebestsegment" returns ['the', 'best', 'segment']

    Parameters:
        word: the unspaced string to split.

    Returns:
        A new list of substrings which concatenate back to ``word`` and whose
        ``seq_prob`` score is the highest among the candidates considered.
        An empty ``word`` returns [].

    Every suffix of ``word`` is solved exactly once, shortest first, and the
    answers are reused. A plain recursion would re-solve each suffix once per
    way of reaching it, which doubles the work with every extra character.
    Candidates are built in the order produced by ``all_pairs`` and ``max``
    keeps the first of equally scored candidates.
    """
    if not word:
        return []

    # best_by_len[k] holds the best segmentation of the last k characters.
    best_by_len = [[]]
    for tail_len in range(1, len(word) + 1):
        tail = word[-tail_len:]
        # ``rest`` is always a strictly shorter suffix, so it is already solved.
        candidates = [
            [first] + best_by_len[len(rest)]
            for first, rest in all_pairs(tail)
        ]
        best_by_len.append(max(candidates, key=seq_prob))

    return best_by_len[-1]

def word_segment3(word):
    """
    Returns up to three runner-up segmentations of a given string without spaces

    One candidate is built per choice of first word: that first word followed by
    ``word_segment`` of the remainder. The candidates are ranked by ``seq_prob``,
    best first; the top-ranked one is dropped and the next three are returned.

    Strings with fewer than four characters produce fewer than three results,
    and an empty string returns [].

    For example, "hihello" returns [['h', 'i', 'hello'], ['hih', 'ello'], ['hihello']]
    """
    if not word:
        return []

    candidates = [[head] + word_segment(tail) for head, tail in all_pairs(word)]
    candidates.sort(key=seq_prob, reverse=True)

    # Position 0 is the highest-scoring candidate; skip it and keep ranks 2 to 4.
    return candidates[1:4]

        

def word_seg_as_string(word):
    """
    Returns the most likely segmentation of ``word`` as one space-separated string

    The pieces come from ``word_segment``; an empty segmentation gives "".
    """
    return " ".join(word_segment(word))

# Demo: print the runner-up segmentations of a sample string.
print(word_segment3("hihello"))

def word_seg_as_string3(word):
    """
    Returns the runner-up segmentations of ``word`` as a list of strings

    Each segmentation from ``word_segment3`` is turned into one space-separated
    string, in the same order. The list has as many entries as
    ``word_segment3`` returns, which can be fewer than three.
    """
    return [" ".join(pieces) for pieces in word_segment3(word)]



# Demo: print the single most likely segmentation of a sample string.
print(word_segment("howdyworld"))

[['h', 'i', 'hello'], ['hih', 'ello'], ['hihello']]
['howdy', 'world']


In [28]:
# Create the top-level tkinter window
win = Tk()

# Set the window size (width x height, in pixels)
win.geometry("750x250")

chars = "howdyworld"
top = word_seg_as_string(chars)
top3 = word_seg_as_string3(chars)

# word_seg_as_string3 returns fewer than three entries for inputs shorter than
# four characters, so build the list from whatever is there instead of
# indexing top3[0..2] directly. With three entries the text is unchanged:
# the first alternative stays on the header line, the rest wrap to the next.
if len(top3) > 1:
    others = top3[0] + ", " + "\n" + ", ".join(top3[1:])
elif top3:
    others = top3[0]
else:
    others = "(none)"

txt = "Most probable interpretation: " + top + "\n" + "Other Possible Interpretations: " + others


# Create a label showing the result and add it to the window
Label(win, text=txt, font=('Helvetica 17 bold')).pack(pady=30)


# Run the tkinter event loop; this blocks until the window is closed
win.mainloop()

In [29]:
# Space-join the best segmentation; the bare name on the last line displays it.
output = ' '.join(map(str, word_segment("howdyworld")))
output

'howdy world'