# Word Prediction using Markov Chain

In [2]:
import string

# Remove punctuation from text
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Only the ASCII punctuation listed in ``string.punctuation`` is dropped;
    all other characters (letters, digits, whitespace, non-ASCII symbols)
    are kept unchanged.
    """
    # A translation table that maps a code point to None makes
    # str.translate delete that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)


# Add to dictionary
def add2dict(dictionary, key, value):
    """Append ``value`` to the list stored under ``key`` in ``dictionary``.

    If ``key`` is not present yet, an empty list is created for it first.
    The dictionary is modified in place; nothing is returned.
    """
    dictionary.setdefault(key, []).append(value)


# Finding probability of each word
def list2probabilitydict(given_list):
    """Map each distinct item of ``given_list`` to its relative frequency.

    The returned dictionary has one key per distinct item (in order of first
    appearance) and the value ``occurrences / len(given_list)``. An empty
    list yields an empty dictionary.
    """
    total = len(given_list)

    # First pass: tally how often every item occurs.
    occurrences = {}
    for element in given_list:
        if element in occurrences:
            occurrences[element] += 1
        else:
            occurrences[element] = 1

    # Second pass: turn the tallies into fractions of the whole list.
    return {element: tally / total for element, tally in occurrences.items()}

# Module-level model state, filled in by train_markov_chain():
#   firstword   - word -> count, later normalised to a probability
#   secondword  - first word -> following words, later their probabilities
#   transitions - first word -> list of (next word, probability) tuples
firstword, secondword, transitions = {}, {}, {}


# Training the Markov Chain
def train_markov_chain(text):
    """Train the first-word / second-word model on ``text``.

    Every line of ``text`` is lower-cased, stripped of punctuation and split
    on whitespace. The first token of each non-empty line is counted as a
    line-starting word, and the second token (when there is one) is recorded
    as a follower of that first word.

    The results are written into the module-level dictionaries:

    * ``firstword``   - word -> probability that a line starts with it
    * ``secondword``  - first word -> {second word: probability}
    * ``transitions`` - first word -> list of (second word, probability)

    Their previous contents are discarded, so the function can be called
    more than once without mixing old probabilities with new counts.

    The three dictionaries are printed, and the string
    ``"Training Successful"`` is returned.
    """
    # Collect raw statistics locally so the shared dictionaries are only
    # touched once the whole text has been processed.
    first_counts = {}
    second_lists = {}

    for line in text.splitlines():
        tokens = remove_punctuation(line.lower()).split()
        if not tokens:
            continue  # blank line or punctuation only

        # Only the first two tokens matter for this model. Looking at them
        # directly (instead of looping over range(len(tokens) - 1)) makes sure
        # one-word lines are counted and two-word lines keep their second word.
        first_token = tokens[0]
        first_counts[first_token] = first_counts.get(first_token, 0) + 1
        if len(tokens) > 1:
            add2dict(second_lists, first_token, tokens[1])

    firstword_total = sum(first_counts.values())

    # Rebuild the shared state in place (the names are not rebound, so other
    # references to these dictionaries stay valid).
    firstword.clear()
    secondword.clear()
    transitions.clear()

    for word, count in first_counts.items():
        firstword[word] = count / firstword_total

    for prev_word, next_word_list in second_lists.items():
        secondword[prev_word] = list2probabilitydict(next_word_list)

    for prev_word, next_word_dict in secondword.items():
        for next_word, probability in next_word_dict.items():
            add2dict(transitions, prev_word, (next_word, probability))

    print("First Dictionary: ", firstword)
    print("Second Dictionary: ", secondword)
    print("Third Dictionary: ", transitions)

    return "Training Successful"

# Read the corpus inside a context manager so the file handle is closed
# as soon as its contents have been loaded.
with open('text.txt', encoding='utf8') as corpus_file:
    corpus_text = corpus_file.read()

# Kept as the last expression so the notebook still shows the return value.
train_markov_chain(corpus_text)


First Dictionary:  {'ive': 0.014184397163120567, 'been': 0.009456264775413711, 'all': 0.018912529550827423, 'well': 0.014184397163120567, 'if': 0.014184397163120567, 'truth': 0.002364066193853428, 'so': 0.03309692671394799, 'hes': 0.014184397163120567, 'sweat': 0.002364066193853428, 'on': 0.009456264775413711, 'and': 0.05673758865248227, 'its': 0.016548463356973995, 'shes': 0.002364066193853428, 'actually': 0.002364066193853428, 'cause': 0.03546099290780142, 'but': 0.0591016548463357, 'knife': 0.002364066193853428, 'says': 0.002364066193853428, 'hi': 0.002364066193853428, 'after': 0.002364066193853428, 'the': 0.028368794326241134, 'onenight': 0.002364066193853428, 'it': 0.009456264775413711, 'he': 0.018912529550827423, 'now': 0.009456264775413711, 'dont': 0.01182033096926714, 'i': 0.04728132387706856, 'irreversible': 0.002364066193853428, 'took': 0.002364066193853428, 'why': 0.004728132387706856, 'get': 0.004728132387706856, 'detergent': 0.002364066193853428, 'we': 0.004728132387706856

'Training Successful'

In [3]:
import string
from collections import defaultdict, Counter

# Remove punctuation from text
def remove_punctuation(text):
    """Strip all ``string.punctuation`` characters out of ``text``.

    Characters that are not listed in ``string.punctuation`` are returned
    untouched and in their original order.
    """
    punctuation_chars = set(string.punctuation)
    kept = (char for char in text if char not in punctuation_chars)
    return "".join(kept)

# Add to dictionary
def add2dict(dictionary, key, value):
    """Append ``value`` to the list stored under ``key`` in ``dictionary``.

    Works with plain ``dict`` objects as well as ``defaultdict(list)``: a
    missing key gets a fresh empty list before the value is appended, so no
    ``KeyError`` is raised for ordinary dictionaries. The dictionary is
    modified in place; nothing is returned.
    """
    dictionary.setdefault(key, []).append(value)

# Training the Markov Chain
def train_markov_chain(text):
    """Build the first-word / second-word model for ``text``.

    Every line of ``text`` is lower-cased, stripped of punctuation and split
    on whitespace. The first token of each non-empty line is counted as a
    line-starting word, and the second token (when there is one) is recorded
    as a follower of that first word.

    Returns a tuple ``(firstword, secondword, transitions)``:

    * ``firstword``   - dict: word -> probability that a line starts with it
    * ``secondword``  - defaultdict: first word -> {second word: probability}
    * ``transitions`` - defaultdict: first word -> list of
      (second word, probability) tuples

    Empty input yields three empty mappings.
    """
    firstword = defaultdict(int)
    secondword = defaultdict(list)
    transitions = defaultdict(list)

    for line in text.splitlines():
        tokens = remove_punctuation(line.lower()).split()
        if not tokens:
            continue  # blank line or punctuation only

        # Only the first two tokens matter for this model. Looking at them
        # directly (instead of looping over range(len(tokens) - 1)) makes sure
        # one-word lines are counted and two-word lines keep their second word.
        first_token = tokens[0]
        firstword[first_token] += 1
        if len(tokens) > 1:
            add2dict(secondword, first_token, tokens[1])

    # Normalise the first-word counts into probabilities.
    firstword_total = sum(firstword.values())
    firstword = {key: value / firstword_total for key, value in firstword.items()}

    # Replacing the value of an existing key does not change the size of the
    # mapping, so it is safe to do while iterating over its items.
    for prev_word, next_word_list in secondword.items():
        next_word_counts = Counter(next_word_list)
        next_word_total = len(next_word_list)
        secondword[prev_word] = {
            word: count / next_word_total
            for word, count in next_word_counts.items()
        }

        for next_word, probability in secondword[prev_word].items():
            add2dict(transitions, prev_word, (next_word, probability))

    return firstword, secondword, transitions

def print_dictionaries(firstword, secondword, transitions):
    """Print the three model dictionaries, one labelled line each."""
    labelled_tables = (
        ("First Dictionary:", firstword),
        ("Second Dictionary:", secondword),
        ("Third Dictionary:", transitions),
    )
    for label, table in labelled_tables:
        print(label, table)

if __name__ == "__main__":
    # Read the corpus inside a context manager so the file handle is closed
    # as soon as its contents have been loaded.
    with open('text.txt', encoding='utf8') as corpus_file:
        text = corpus_file.read()
    firstword, secondword, transitions = train_markov_chain(text)
    print_dictionaries(firstword, secondword, transitions)


First Dictionary: {'ive': 0.014184397163120567, 'been': 0.009456264775413711, 'all': 0.018912529550827423, 'well': 0.014184397163120567, 'if': 0.014184397163120567, 'truth': 0.002364066193853428, 'so': 0.03309692671394799, 'hes': 0.014184397163120567, 'sweat': 0.002364066193853428, 'on': 0.009456264775413711, 'and': 0.05673758865248227, 'its': 0.016548463356973995, 'shes': 0.002364066193853428, 'actually': 0.002364066193853428, 'cause': 0.03546099290780142, 'but': 0.0591016548463357, 'knife': 0.002364066193853428, 'says': 0.002364066193853428, 'hi': 0.002364066193853428, 'after': 0.002364066193853428, 'the': 0.028368794326241134, 'onenight': 0.002364066193853428, 'it': 0.009456264775413711, 'he': 0.018912529550827423, 'now': 0.009456264775413711, 'dont': 0.01182033096926714, 'i': 0.04728132387706856, 'irreversible': 0.002364066193853428, 'took': 0.002364066193853428, 'why': 0.004728132387706856, 'get': 0.004728132387706856, 'detergent': 0.002364066193853428, 'we': 0.004728132387706856,