In [11]:
from __future__ import annotations

import os
import random
import re
from collections import Counter
from math import log
from typing import List

import numpy as np
from icecream import ic
from tqdm import tqdm

In [12]:
def preprocess(text: str) -> List[str]:
    """Normalise raw text and split it into word tokens.

    The text is lower-cased, every character other than ``a``-``z``, ``.``
    and whitespace is deleted, and the remainder is split on runs of
    whitespace.

    Args:
        text: Raw text to tokenise.

    Returns:
        The tokens in order of appearance. No token is ever the empty
        string, and empty or all-punctuation input yields ``[]``.
    """
    # Whitespace is preserved here (not only ' ' and '\n') so that tabs and
    # similar separators keep neighbouring words apart instead of fusing them.
    cleaned = re.sub(r'[^a-z.\s]', '', text.lower())
    # Argument-less split() collapses the gaps left behind by deleted
    # punctuation ("a , b" -> "a  b"), which split(' ') would turn into ''
    # tokens, and it also ignores leading/trailing whitespace.
    return cleaned.split()

In [13]:
class Distribution:
    """Weighted collection of candidate items that can be sampled at random.

    Every distinct item of ``choices`` gets the weight
    ``log(count + 1, smoothing)``, where ``count`` is how often the item
    occurs in ``choices``. The logarithm damps the gap between frequent and
    rare items.

    NOTE(review): changing the base of a logarithm multiplies every weight by
    the same constant, and ``random.choices`` works with relative weights, so
    the sampling probabilities do not actually depend on ``smoothing``.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, choices: list, smoothing: float = 3) -> None:
        """Tally ``choices`` and store one log-damped weight per distinct item.

        Args:
            choices: Observed items; they must be hashable. Repeats raise
                the weight of an item.
            smoothing: Base of the logarithm; must be greater than 1.
        """
        # The base must exceed 1 for the weights to come out positive.
        assert smoothing > 1
        # counts: item -> weight. Counter tallies in a single pass (instead
        # of one list.count() scan per distinct item) and keeps
        # first-occurrence order, so a seeded RNG gives repeatable draws.
        self.counts = {
            choice: log(count + 1, smoothing)
            for choice, count in Counter(choices).items()
        }

    def sample(self):
        """Return one item, drawn with probability proportional to its weight."""
        options = list(self.counts)
        weights = list(self.counts.values())
        return random.choices(options, weights = weights, k = 1)[0]

In [14]:
def ngram(n: int, text: List[str]):
    """Build an order-``n`` lookup table from a token list.

    Every run of ``n`` consecutive tokens, joined with single spaces, becomes
    a key; its value is a ``Distribution`` built from all tokens that were
    seen immediately after that run.

    Args:
        n: Number of tokens in each context.
        text: Token sequence to scan.

    Returns:
        A dict mapping each context string to its ``Distribution``. It is
        empty when ``text`` has ``n`` tokens or fewer.
    """
    table = {}
    for start in range(len(text) - n):
        stop = start + n
        context = " ".join(text[start:stop])
        table.setdefault(context, []).append(text[stop])

    # Swap each follower list for its Distribution in place; only values of
    # existing keys are reassigned, so iterating the dict stays valid.
    for context in table:
        followers = table[context]
        table[context] = Distribution(followers)
        # Empty the list right away so the raw followers do not linger.
        followers.clear()
    return table

In [15]:
def _has_model(models, order: int) -> bool:
    """Report whether ``models`` can be indexed with the n-gram order ``order``."""
    if order < 1:
        return False
    try:
        models[order]
    except (KeyError, IndexError):
        return False
    return True


def sample(query: str, models: dict, n: int, length: int):
    """Continue ``query`` with words drawn from the order-``n`` model.

    The last ``n`` words of ``query`` form the starting context. Words are
    then drawn one at a time, each from ``model[context].sample()``, until at
    least ``length`` words have been produced and the most recent word is one
    of ``.``, ``?`` or ``!``.

    If a context is missing from the current model, generation restarts from
    the original query with the next lower order, provided ``models`` has an
    entry for that order.

    Args:
        query: Prompt text; it is split on whitespace.
        models: Container indexed by n-gram order (a dict keyed by ``n``; a
            list indexed by ``n`` also works). Each entry maps a
            space-joined context to an object with a ``sample()`` method.
        n: Order to try first.
        length: Minimum number of words to generate. Values of 0 or less
            mean "stop at the first sentence end".

    Returns:
        ``(text, used_n)``: ``text`` is the final context words of the query
        followed by the generated words, joined by spaces; ``used_n`` is the
        order of the model that produced it.

    Raises:
        KeyError: ``models`` has no entry for ``n``, or the context is unknown
            and no lower-order model is available to back off to.
    """
    model = models[n]
    # Whitespace split (same as the notebook's caller) so stray or trailing
    # spaces in the prompt cannot produce '' tokens in the context.
    context = query.split()[-n:]
    result = context[:]
    generated = 0
    at_sentence = False
    # `generated < length` rather than an equality test, so a length of 0 or
    # less cannot keep the loop running forever.
    while generated < length or not at_sentence:
        key = " ".join(context)
        if key not in model:
            # Back off only to orders that actually exist; the previous
            # truthiness test on n - 1 recursed into missing models.
            if _has_model(models, n - 1):
                return sample(query, models, n - 1, length)
            raise KeyError(
                f"no n-gram model of order {n} or lower knows the context {key!r}"
            )
        new_word = model[key].sample()
        # Slide the context window forward by one word.
        context = context[1:] + [new_word]
        result.append(new_word)
        at_sentence = new_word in {'?', '!', '.'}
        generated += 1
    return " ".join(result), n

In [16]:
# Read the whole corpus file and tokenise it; later cells use the resulting
# token list under the name `text`.
corpus_path = os.path.join("Documents", "master_scroll_tokenized.txt")
with open(corpus_path, encoding = 'utf8') as infile:
    text = preprocess(infile.read())

In [17]:
# One n-gram model per context length from 2 through 5 tokens, keyed by that
# length; tqdm reports progress per model.
models = {order: ngram(order, text) for order in tqdm(range(2, 6))}

100%|██████████| 4/4 [01:24<00:00, 21.11s/it]


In [18]:
# Generation settings: highest n-gram order to try, minimum number of
# generated words, and the prompt to continue.
n = 4
resp_len = 15
query = 'he looked at my gravely and said'.lower()

# Clean-up patterns applied to every response.
stop_pattern = re.compile(r'[.?!][.?!]+')   # two or more sentence stops in a row
comma_pattern = re.compile(r'[,][,]+')      # two or more commas in a row
space_pattern = re.compile(r'  +')          # two or more spaces in a row
# Comma directly in front of a sentence stop. The lookahead used to contain
# literal '<' and '>' characters, so it could only match a comma followed by
# "<...>"; this assumes "comma before a stop" was the intent - TODO confirm.
bad_comma_pattern = re.compile(r'[,](?=[!.?])')

query_words = query.split()
for j in range(10):
    gen, actual_n = sample(query, models, n, length = resp_len)
    # Re-attach punctuation that the tokenised text keeps as separate words.
    for mark in '.;:':
        gen = gen.replace(' ' + mark, mark)
    # `gen` already begins with the last `actual_n` words of the query, so only
    # the words in front of them are prepended. Joining a list avoids a leading
    # space when nothing is left to prepend, which would stop capitalize()
    # from capitalising the first word.
    prefix = query_words[:max(len(query_words) - actual_n, 0)]
    raw_resp = ' '.join(prefix + [gen])
    resp = '. '.join(sent.capitalize() for sent in raw_resp.split('. '))
    resp = resp.replace(' i ', ' I ')
    # Every run of stops in one response collapses to the same random mark.
    resp = stop_pattern.sub(random.choice(('.', '?', '!')), resp)
    resp = space_pattern.sub(' ', resp)
    resp = comma_pattern.sub(',', resp)
    resp = bad_comma_pattern.sub('', resp)
    print(resp)
    print()

He looked at my gravely and said he couldnt numb the pain from the salvicon and leverage ratios declined over the match and it waxed great exceedingly as betrayed finally drops the dumbbells around like crazy.

He looked at my gravely and said poor bombur who hast done my physical head next to bare ridges that fell that day he said were no prejudice perhaps our great people so put off a disease caused when cells divide I yell out alexa hear ye the seed yet in two spammy facebook articles.

He looked at my gravely and said were no graves in haste the loving comments. They arrive at what point during the game offers means that they gazed down upon it while talking helps with a young bullock one young bullock shalt thou mar the land which thou swarest unto their inheritance.

He looked at my gravely and said if youre busy then jeroboam built shechem in mount ebal as each other and stronger.

He looked at my gravely and said mexicans are lazy and not some wet tomato or crunchy lettuce. Onc