In [3]:
import numpy as np
import string
import pandas as pd
from sklearn import preprocessing
import random

In [4]:
# Path of the training corpus; the counting cells in this notebook read it one line at a time.
training_data_file = 'alllines.txt'

## Creating Word Dictionaries

First, we count the number of distinct words to decide whether a dense matrix of word-to-word counts would be practical. Array lookups are faster than dictionary lookups, but the matrix would have to be square: with n distinct words it needs n * n cells, and most of them would be zero. A dictionary is slower per lookup, but it only stores the word pairs that actually occur, so it ends up being faster overall.

Next, we record which words appear one and two positions after a given word. For every word we build a dictionary that counts how many times each other word has followed it, across all occurrences of that word. We do this once for the word one step away and once for the word two steps away.

In [3]:
#Collect every distinct token of the corpus in first-seen order.
#A companion set answers "seen before?" in constant time; testing membership
#against the list itself rescans it for every token, which is what made this
#cell slow.
totalwords = []
seen_words = set()
punctuation_table = str.maketrans('', '', string.punctuation)
# The with-block closes the corpus file as soon as the loop is done.
with open(training_data_file) as corpus:
    for line in corpus:
        #Create Stripped Sentences
        tokens = line.rstrip().lower().translate(punctuation_table).split()
        for token in tokens:
            if token not in seen_words:
                seen_words.add(token)
                totalwords.append(token)

In [4]:
# Vocabulary size: the number of distinct tokens collected in totalwords
len(totalwords)

27381

In [8]:
# words[w][v] = number of times token v came directly after token w on a line
words = {}
punctuation_table = str.maketrans('', '', string.punctuation)
# The with-block closes the corpus file as soon as the loop is done.
with open(training_data_file) as corpus:
    for line in corpus:
        #Create Stripped Sentences
        tokens = line.rstrip().lower().translate(punctuation_table).split()
        # Pair each token with its successor; the last token of a line has none,
        # so zip simply stops before it.
        for token, next_word in zip(tokens, tokens[1:]):
            followers = words.setdefault(token, {})
            followers[next_word] = followers.get(next_word, 0) + 1

In [9]:
# two_words[w][v] = number of times token v came two positions after token w on a line
two_words = {}
punctuation_table = str.maketrans('', '', string.punctuation)
# The with-block closes the corpus file as soon as the loop is done.
with open(training_data_file) as corpus:
    for line in corpus:
        #Create Stripped Sentences
        tokens = line.rstrip().lower().translate(punctuation_table).split()
        # Pair each token with the one two positions later. Lines with fewer than
        # three tokens produce no pairs, and the last two tokens of a line have
        # no partner, so zip stops before them.
        for token, next_word in zip(tokens, tokens[2:]):
            followers = two_words.setdefault(token, {})
            followers[next_word] = followers.get(next_word, 0) + 1

## Using the Word Dictionaries To Create Probability Dictionaries

Now we must normalize the counts into probabilities so that both the previous word and the word before it carry the same probabilistic weight.

In [15]:
# Turn the raw follower counts of every word into probabilities.
# The total is recomputed for each word: a running total shared by all words
# would divide a word's counts by the counts of every word processed before it
# as well, so the values stored for that word would not sum to 1.
# Re-running this cell is harmless, because values that already sum to 1 are
# divided by (approximately) 1.
for table in (words, two_words):
    for w in table:
        total = sum(table[w].values())
        for j in table[w]:
            table[w][j] = table[w][j] / total

## Creating a System to Calculate the Joint Probability and Sample From It

In [16]:
def create_probability(x, total):
    """Return ``x`` expressed as a fraction of ``total``.

    x: the value to scale; anything that supports division by ``total`` works.
    total: the divisor, i.e. the sum the values should be measured against.
    """
    share = x / total
    return share

In [33]:
def word_guesser(phrase):
    """Sample a plausible next word for ``phrase``.

    The last word of the phrase is looked up in ``words`` (one step away) and
    the second to last word in ``two_words`` (two steps away). Words found in
    both tables are weighted by the product of their two values and one of them
    is drawn at random, so repeated calls can return different words. When the
    two tables share no usable word, the one-step table alone is used, then the
    two-step table alone.

    phrase: text with at least two words. It is lower-cased and stripped of
        punctuation the same way the training lines are.

    Returns the sampled word as a string. It never equals the last word of the
    phrase.

    Raises IndexError when the phrase has fewer than two words, and KeyError
    when neither table offers a continuation other than the last word itself.
    """
    # Tokenise exactly like the training text so lookups hit the same keys
    y = phrase.lower().translate(str.maketrans('', '', string.punctuation)).split()
    last_word, word_before = y[-1], y[-2]

    #Sample the last word from the one away dictionary
    first = words.get(last_word, {})

    #Sample the second to last word from the two away dictionary
    second = two_words.get(word_before, {})

    #Multiply the like-words together. Words missing from either dictionary get no weight
    joint = {x: first[x] * second[x] for x in first if x in second}

    # Try the joint weights first, then back off to a single table.
    # random.choices accepts unnormalised weights, so no rescaling is needed.
    for weights in (joint, first, second):
        # Dropping the last word up front ensures no word is repeated; it yields
        # the same distribution as redrawing until a different word comes up,
        # but cannot loop forever when the last word is the only candidate.
        candidates = {x: w for x, w in weights.items() if w > 0 and x != last_word}
        if candidates:
            return random.choices(list(candidates), list(candidates.values()))[0]

    raise KeyError('no known continuation for ' + repr(phrase))

In [34]:
# Example: sample a continuation for a two-word phrase (the draw is random, so the result can differ between runs)
word_guesser('I am')

'not'

In [35]:
# Only the last two words of the phrase ('am' and 'not') are looked up for the guess
word_guesser('I am not')

'to'

In [90]:
# One more word appended: the guess is now based on 'not' and 'to'
word_guesser('I am not to')

'the'

# Now, let us use this word_guesser to create sentences

In [61]:
# word_dict[w] = number of times token w occurs anywhere in the corpus
word_dict = {}
punctuation_table = str.maketrans('', '', string.punctuation)
# The with-block closes the corpus file as soon as the loop is done.
with open(training_data_file) as corpus:
    for line in corpus:
        #Create Stripped Sentences
        tokens = line.rstrip().lower().translate(punctuation_table).split()
        #Create a word_dictionary that counts the amount of times words are used
        for x in tokens:
            word_dict[x] = word_dict.get(x, 0) + 1
# One row per word (the word is the index); column 0 holds its count
frame = pd.DataFrame.from_dict(word_dict, orient = 'index')
#Create a probability table: every count divided by the total number of tokens
prob_func = frame.apply(create_probability, args = (frame[0].sum(),)).copy()

In [87]:
def generation(size, extra_words=8):
    """Print ``size`` randomly generated sentences.

    Every sentence starts with two seed words drawn independently from the
    overall word-frequency table ``prob_func`` and is then extended one word at
    a time with ``word_guesser``.

    size: number of sentences to print.
    extra_words: number of words appended after the two seed words. Defaults to
        8, which gives ten-word sentences.

    Returns None. Each sentence is printed and followed by blank lines.
    """
    # These lookups do not change between sentences, so do them once
    vocabulary = prob_func.index
    frequencies = prob_func[0]

    for _sentence in range(size):
        #First, two words are sampled from the total distribution
        z = ' '.join(random.choices(vocabulary, frequencies, k=2))

        #Sample the remaining words based on the words generated so far
        for _word in range(extra_words):
            z = z + ' ' + word_guesser(z)
        print(z)
        print('\n')

In [88]:
#generation takes the number of sentences to print as its parameter
generation(5)

some my lord is and by and to my spirits


fold it in a most of the good of the


the counterfeit of the heart and with him that the


wits to the gods and a good in the great


with will and you for i have not be so




In [89]:
# Every word is drawn at random, so a second call normally prints different sentences
generation(5)

pale the looks of the king of his own and


hand her oer the blood that will not that i


honest when i am a most like to the king


ill in the wars of him all the king of


as hold a very man and he will not a




In [120]:
#Previous work in case I need it
# Archived draft kept inside a string literal, so none of it is executed.
# Unlike the active `words` cell, it maps each token to the word *before* it
# (tokens[i-1]) and skips the first and the last token of every line.
'''words = {}
for line in open(training_data_file):
    #Create Stripped Sentences
    line = line.rstrip().lower()
    tokens = line.translate(str.maketrans('','', string.punctuation)).split()
    length = len(tokens)
    for i in range(length):
        token = tokens[i]
        if i == 0:
            pass
        elif i == length - 1:
            pass
        else:    
            previous = tokens[i-1]
            if token not in words:
                words[token] = {previous: 1}
            else:
                if previous not in words[token]:
                    words[token][previous] = 1
                else:
                    value = words[token][previous] 
                    words[token][previous] = value + 1
'''

"words = {}\nfor line in open(training_data_file):\n    #Create Stripped Sentences\n    line = line.rstrip().lower()\n    tokens = line.translate(str.maketrans('','', string.punctuation)).split()\n    length = len(tokens)\n    for i in range(length):\n        token = tokens[i]\n        if i == 0:\n            pass\n        elif i == length - 1:\n            pass\n        else:    \n            previous = tokens[i-1]\n            if token not in words:\n                words[token] = {previous: 1}\n            else:\n                if previous not in words[token]:\n                    words[token][previous] = 1\n                else:\n                    value = words[token][previous] \n                    words[token][previous] = value + 1\n"