In [1]:
import numpy as np
import os
import string
from tensorflow.keras.preprocessing.text import Tokenizer
import pandas as pd

In [2]:
np.random.seed(1234)

In [3]:
with open(os.path.join('data', 'robert_frost.txt')) as f:
    sentences = []
    for line in f:
        line = line.rstrip().lower()
        if line:
            line = line.translate(str.maketrans('', '', string.punctuation))
            sentences.append(line)

MAX_VOCAB_SIZE = 3000
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
sequences = tokenizer.texts_to_sequences(sentences)
sequences[:5]

[[96, 563, 564, 9, 6, 565, 566],
 [4, 567, 5, 71, 27, 879, 150],
 [4, 24, 22, 880, 177, 5, 251],
 [4, 197, 61, 22, 17, 139, 17, 5, 71],
 [3, 43, 8, 881, 9, 2, 882]]

In [4]:
for indx ,i in enumerate(sequences) :
    if len(i) < 2 :
        print(i)
        sequences.pop(indx)

[394]
[1373]
[40]
[132]
[115]
[8]
[1981]


In [5]:
M = len(tokenizer.word_index)
M

2198

In [6]:
pi = pd.Series(dtype = np.float32)
index_A1 = pd.MultiIndex.from_tuples([], names=["Word@(t-1)", "Word@(t)"])
A1 = pd.Series(index = index_A1, dtype = np.float32)
index_A2 = pd.MultiIndex.from_tuples([], names=["Word@(t-2)", "Word@(t-1)", "Word@(t)"])
A2 = pd.Series(index = index_A2, dtype = np.float32)

In [7]:
for sequence in sequences :
    pi.loc[sequence[0]] = pi.get(sequence[0], 0) + 1
    A1.loc[(sequence[0], sequence[1])] = A1.get((sequence[0], sequence[1]), 0) + 1
    
    for i in range(2, len(sequence) - 1) :
        A2.loc[(sequence[i-2], sequence[i-1], sequence[i])] = A2.get((sequence[i-2], sequence[i-1], sequence[i]), 0) + 1

In [8]:
pi = pi / pi.sum()
pi

96      0.005598
4       0.090273
3       0.034990
66      0.008397
341     0.000700
          ...   
247     0.000700
193     0.000700
57      0.000700
2191    0.000700
77      0.000700
Length: 300, dtype: float64

In [9]:
A1 = A1.sort_index()
grouped_sum = A1.groupby(level=0).sum()
A1 = A1 / grouped_sum
A1

Word@(t-1)  Word@(t)
2           70          0.012195
            73          0.024390
            79          0.012195
            82          0.012195
            90          0.012195
                          ...   
2111        59          1.000000
2120        4           1.000000
2129        5           1.000000
2145        35          1.000000
2191        19          1.000000
Length: 1195, dtype: float64

In [10]:
A2 = A2.sort_index()
grouped_sum_A2 = A2.groupby(level=[0, 1]).sum()
A2 = A2 /grouped_sum_A2
A2

Word@(t-2)  Word@(t-1)  Word@(t)
2           22          10          0.250000
                        23          0.250000
                        436         0.250000
                        1594        0.250000
            70          105         0.166667
                                      ...   
2183        4           200         1.000000
2190        261         68          1.000000
2191        19          2192        1.000000
2194        62          259         1.000000
2196        441         6           1.000000
Length: 6201, dtype: float64

In [11]:
def sample_word(series) :
    if series is None :
        return 1
    return np.random.choice(a = series.index, p=series.values)

def translate(index_mapping_lst) :
    return tokenizer.sequences_to_texts(index_mapping_lst)

def generate():
    for i in range(4): # generate 4 lines
        sentence = []

        # initial word
        w0 = sample_word(pi)
        sentence.append(w0)

        #first word
        w1 = sample_word(A1.loc[w0])
        sentence.append(w1)
        
        while True:
            w2 = sample_word(A2.get((w0, w1), None))
            w0 = w1
            w1 = w2
            if w2 == 1 or len(sentence) > 20:
                break
            sentence.append(w2)
            
        print(translate([sentence]))


In [14]:
generate()

['when i heard them toffile didnt seem to hear the']
['at least ive brought you to the door they halted helpless on the new']
['first theres the childrens house of']
['ill talk to her what does she look like you stay the way hes felt but all the rain and snow']
