In [1]:
import numpy as np
import pandas as pd
import string
import random

In [2]:
#Path of the plain-text training corpus; the cells below read it one line at a time
training_data_file = 'alllines.txt'

## Creating Word Dictionaries

First, we count the total number of distinct words to decide whether a matrix of word-to-word counts would be practical. Array lookups are faster than dictionary lookups, but the matrix would have to be square, with one row and one column per distinct word. Because such a matrix grows as n * n cells (n being the number of distinct words) and most of those cells would be zero, we use a dictionary instead: although each individual lookup is slower, it stores only the counts that actually occur and is faster overall.

Next, we look for the words that appear one and two positions after a given key word. For every occurrence of a word, we record in a dictionary how many times each other word has shown up after it. We do this once for words one step away and once for words two steps away.

In [3]:
#Collect every distinct token of the corpus in first-seen order.
#Membership is checked against a set, so each lookup is constant time instead of a scan over the whole list.
punctuation_table = str.maketrans('', '', string.punctuation)
totalwords = []
seen_words = set()
#The with-block closes the file even if a line fails to process
with open(training_data_file) as corpus:
    for line in corpus:
        #Create Stripped Sentences
        tokens = line.rstrip().lower().translate(punctuation_table).split()
        for token in tokens:
            if token not in seen_words:
                seen_words.add(token)
                totalwords.append(token)

In [4]:
len(totalwords)

27381

In [5]:
#words[w][v] counts how many times token v directly follows token w on the same line
punctuation_table = str.maketrans('', '', string.punctuation)
words = {}
#The with-block closes the file even if a line fails to process
with open(training_data_file) as corpus:
    for line in corpus:
        #Create Stripped Sentences
        tokens = line.rstrip().lower().translate(punctuation_table).split()
        #Pair every token with its immediate successor; the last token of a line has none
        for token, next_word in zip(tokens, tokens[1:]):
            followers = words.setdefault(token, {})
            followers[next_word] = followers.get(next_word, 0) + 1

In [6]:
#two_words[w][v] counts how many times token v appears two positions after token w on the same line
punctuation_table = str.maketrans('', '', string.punctuation)
two_words = {}
#The with-block closes the file even if a line fails to process
with open(training_data_file) as corpus:
    for line in corpus:
        #Create Stripped Sentences
        tokens = line.rstrip().lower().translate(punctuation_table).split()
        #Pair every token with the one two steps ahead; lines shorter than three tokens yield no pairs
        for token, next_word in zip(tokens, tokens[2:]):
            followers = two_words.setdefault(token, {})
            followers[next_word] = followers.get(next_word, 0) + 1

In [7]:
#This is needed, because it will help for sentence creation later.
#first_words[w] counts how many lines of the corpus start with token w
punctuation_table = str.maketrans('', '', string.punctuation)
first_words = {}
#The with-block closes the file even if a line fails to process
with open(training_data_file) as corpus:
    for line in corpus:
        #Create Stripped Sentences
        tokens = line.rstrip().lower().translate(punctuation_table).split()
        #Lines that are empty after stripping have no first word
        if tokens:
            opener = tokens[0]
            first_words[opener] = first_words.get(opener, 0) + 1

## Using the Word Dictionaries To Create Probability Dictionaries

Now we must normalize the probabilities so that both the previous word and the word before it carry the same probabilistic weight.

In [8]:
#Convert the raw follower counts into per-word probabilities, in place.
#Each row is divided by its own total, so every word's followers sum to 1 and
#re-running this cell leaves already-normalized rows unchanged up to floating-point rounding.
def _normalize_rows(table):
    """Scale each inner dict of ``table`` in place so that its values sum to 1."""
    for followers in table.values():
        #The total must be computed per word; a running total shared across words
        #would shrink the values of every word that comes later in the table
        row_total = sum(followers.values())
        if row_total == 0:
            continue
        for follower in followers:
            followers[follower] = followers[follower] / row_total

_normalize_rows(words)
_normalize_rows(two_words)

## Time To Create a System To Calculate the Joint Probability and Maximize that

In [9]:
def create_probability(x, total):
    """Divide ``x`` by ``total`` and return the quotient."""
    share = x / total
    return share

In [10]:
def word_guesser(phrase):
    """Sample a likely next word for ``phrase`` from the two word tables.

    The last word of ``phrase`` is looked up in ``words`` (followers one step
    away) and the second-to-last word in ``two_words`` (followers two steps
    away). Each candidate is weighted by the product of its two table values,
    and one candidate is drawn at random in proportion to that weight, so
    repeated calls can return different words.

    If no candidate has a positive joint weight (including when ``phrase`` has
    a single word or the second-to-last word is missing from ``two_words``),
    the one-step table alone is used as a fallback.

    Parameters
    ----------
    phrase : str
        Whitespace-separated text; it is lower-cased before lookup.

    Returns
    -------
    str
        The sampled next word. It is never equal to the last word of
        ``phrase``.

    Raises
    ------
    ValueError
        If ``phrase`` contains no words, or if the last word has no follower
        other than itself.
    KeyError
        If the last word of ``phrase`` is not a key of ``words``.
    """
    history = phrase.lower().split()
    if not history:
        raise ValueError('phrase must contain at least one word')
    last_word = history[-1]

    #Followers of the last word from the one away dictionary
    one_away = words[last_word]

    #Followers of the second to last word from the two away dictionary (empty when unavailable)
    two_away = two_words.get(history[-2], {}) if len(history) >= 2 else {}

    #Multiply the like-words together; words missing from either dictionary get no weight.
    #The last word itself is left out up front, so no word is repeated and no retry loop is needed.
    #random.choices only needs relative weights, so the products are not rescaled to sum to 1.
    scores = {}
    for candidate, weight in one_away.items():
        if candidate == last_word:
            continue
        joint = weight * two_away.get(candidate, 0)
        if joint > 0:
            scores[candidate] = joint

    #Back off to the one away dictionary alone when the two dictionaries share no candidate
    if not scores:
        scores = {candidate: weight for candidate, weight in one_away.items()
                  if candidate != last_word and weight > 0}
    if not scores:
        raise ValueError('no candidate word can follow ' + repr(last_word))

    #Sampling (rather than taking the maximum) keeps the output from always being the same word
    return random.choices(list(scores), weights=list(scores.values()))[0]

In [11]:
word_guesser('I am')

'not'

In [12]:
word_guesser('I am not')

'so'

In [18]:
word_guesser('I am a not so')

'but'

# Now, let us use this word_guesser to create sentences

In [14]:
#Turn the sentence-opener counts into a probability distribution over opening words,
#which later lets us draw a plausible word to start each sentence
first_words_frame = pd.DataFrame.from_dict(data = first_words, orient = 'index')
first_words_func = first_words_frame.apply(
    create_probability,
    total = first_words_frame[0].sum(),
)

In [15]:
def generation(size, extra_words=8):
    """Print ``size`` randomly generated sentences.

    Each sentence starts with an opener drawn from ``first_words_func``,
    followed by a second word drawn from the opener's entry in ``words``, and
    is then extended one word at a time with ``word_guesser``.

    Parameters
    ----------
    size : int
        Number of sentences to print.
    extra_words : int, optional
        Number of words appended after the first two (default 8, giving
        ten-word sentences).

    Returns
    -------
    None
        Every sentence is printed, followed by ``print('\\n')``.

    Raises
    ------
    ValueError
        If no opener in ``first_words_func`` has an entry in ``words``.
    """
    #Only openers that have at least one recorded follower can start a sentence;
    #an opener that only occurs on one-word lines has no entry in the one away dictionary
    openers = []
    opener_weights = []
    for word, weight in zip(first_words_func.index, first_words_func[0]):
        if word in words:
            openers.append(word)
            opener_weights.append(weight)
    if not openers:
        raise ValueError('no sentence opener has a recorded follower')

    for _ in range(size):
        first = random.choices(openers, weights=opener_weights)[0]

        #Here we sample the one word previous dictionary to get a likely word from the word that starts the sentence
        followers = words[first]
        second = random.choices(list(followers), weights=list(followers.values()))[0]

        sentence = [first, second]
        #Now that we have sampled those two words, we can use the previous process and just add to the sentence
        for _ in range(extra_words):
            try:
                sentence.append(word_guesser(' '.join(sentence)))
            except (LookupError, ValueError):
                #The last word has no usable continuation; end this sentence early instead of aborting the run
                break
        print(' '.join(sentence))
        print('\n')

In [16]:
#Generation takes parameter of the amount of sentences you want
generation(5)

enter nerissa and i am glad to the king your


urge you your good my good master i will be


marry sweet sir i cannot be the full of his


nor tent i pray you my love and i shall


not furnishd with the lords of the very king and




In [17]:
generation(5)

o brutus my lord my lord and all this and


bowstring and the kings of a suit of thy head


when to do you sir i will not with the


or that my woman of a great and i shall


of love and your bed and in the king of


