In [1]:
from __future__ import annotations

import os
import random
import re
from collections import Counter
from math import log
from typing import List

import numpy as np
from icecream import ic
from tqdm import tqdm

In [2]:
def preprocess(text: str) -> List[str]:
    """Lower-case ``text`` and split it into word tokens.

    Only the letters a-z and the full stop survive; every other character
    (digits, commas, ``?``, ``!``, non-ASCII letters, ...) is deleted.

    Args:
        text: raw corpus text.

    Returns:
        The tokens in order. Never contains empty strings; an empty or
        all-punctuation input yields ``[]``.
    """
    text = text.lower()
    # Normalise *every* whitespace run (tabs, CR, NBSP, ...) to one space first;
    # otherwise the character filter below would delete e.g. a tab and fuse the
    # two words around it.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^a-z. ]', '', text)
    # Removing a stand-alone symbol ("a - b") leaves two adjacent spaces; split()
    # without an argument treats the run as one separator and drops empty tokens.
    return text.split()

In [3]:
class Distribution:
    """Weighted sampler over a bag of observed items.

    Each distinct item gets the weight ``log(count + 1, smoothing)``, so frequent
    items are favoured less strongly than their raw counts would suggest.
    """

    def __init__(self, choices: list, smoothing: float = 2) -> None:
        """Tally ``choices`` and store the log-damped weights in ``self.counts``.

        Args:
            choices: observed items, one entry per occurrence.
            smoothing: base of the logarithm; must be greater than 1.
        """
        # NOTE: the damping comes from the logarithm itself. Changing the base
        # divides every weight by the same constant, and random.choices()
        # normalises weights, so the base does not alter sampling probabilities.
        assert smoothing > 1, "smoothing is a logarithm base and must be > 1"
        # Counter tallies in one pass (list.count() per distinct item rescans the
        # whole list each time) and keeps first-occurrence order, so key order no
        # longer depends on per-process string hashing.
        self.counts = {
            choice: log(count + 1, smoothing)
            for choice, count in Counter(choices).items()
        }

    def sample(self):
        """Return one item drawn at random in proportion to its stored weight."""
        return random.choices(list(self.counts), weights=list(self.counts.values()), k=1)[0]

In [4]:
def ngram(n: int, text: List[str]):
    """Build an order-``n`` next-token table from a token list.

    Every window of ``n`` consecutive tokens that has a successor becomes a key
    (the tokens joined by single spaces). Its value is a ``Distribution`` built
    from all tokens observed immediately after that window.
    """
    table = {}
    for start in range(len(text) - n):
        stop = start + n
        context = " ".join(text[start:stop])
        table.setdefault(context, []).append(text[stop])

    # Swap each follower list for its Distribution in place. Only existing keys
    # are reassigned, so doing this while iterating is safe. The list is emptied
    # as soon as it has been consumed.
    for context in table:
        followers = table[context]
        table[context] = Distribution(followers)
        followers.clear()
    return table

In [5]:
def sample(query: str, models: List[dict], n: int, length: int):
    """Generate text by repeatedly sampling the next word from an n-gram model.

    The last ``n`` space-separated words of ``query`` seed the context. When the
    order-``n`` model is missing, or cannot continue the current context,
    generation restarts from the original query with the next lower order.

    Args:
        query: seed text; split on single spaces.
        models: container indexed by n-gram order (``models[n]``). Each entry maps
            a space-joined context to an object exposing ``sample()``.
        n: highest order to try.
        length: minimum number of words to generate. Generation then keeps going
            until the newest word is '.', '?' or '!'.

    Returns:
        ``(text, order)``: the seed context plus the generated words joined by
        single spaces, and the order that actually produced them.

    Raises:
        KeyError: if no order from ``n`` down to 1 can continue the query.

    Note:
        If the model never emits a sentence terminator the loop does not end.
    """
    seed_words = query.split(' ')
    for order in range(n, 0, -1):
        words = _generate_with_order(seed_words, models, order, length)
        if words is not None:
            return " ".join(words), order
    raise KeyError(f"no n-gram model of order {n} or lower can continue the query {query!r}")


def _generate_with_order(seed_words: List[str], models, order: int, length: int):
    """Run one generation attempt at a fixed order; return the words, or None on a dead end."""
    try:
        model = models[order]
    except LookupError:
        # No table stored for this order: let the caller back off further.
        return None

    context = seed_words[-order:]
    words = context[:]
    produced = 0
    at_sentence = False
    # `produced < length` (rather than an equality check) also terminates for
    # length <= 0, where the counter starts out past the target.
    while produced < length or not at_sentence:
        key = " ".join(context)
        if key not in model:
            return None
        candidates = model[key]
        previous_word = context[-1]
        # Re-draw up to 5 times to avoid repeating the previous word; after that
        # the repeat is accepted.
        for _ in range(5):
            new_word = candidates.sample()
            if new_word != previous_word:
                break
        # Slide the context window forward by one word.
        context = context[1:] + [new_word]
        words.append(new_word)
        at_sentence = new_word in {'?', '!', '.'}
        produced += 1
    return words

In [6]:
# Read the whole corpus file and tokenise it straight away; `text` ends up
# holding the token list returned by preprocess().
with open(os.path.join("Documents", "master_scroll_tokenized.txt"), encoding = 'utf8') as infile:
    text = preprocess(infile.read())

In [7]:
# One n-gram table per order from 2 through 5, keyed by the order.
models = {order: ngram(order, text) for order in tqdm(range(2, 6))}

100%|██████████| 4/4 [02:06<00:00, 31.60s/it]


In [15]:
n = 4
resp_len = 75
# THESE WORK: note that when the requested n cannot continue the query, sample()
# restarts generation with a smaller n and returns the order it actually used.
# i guess he could have just
# jocks bring the nectar to
# the fat of the peace
# i came for questions not to
# this book is largely concerned with hobbits
query = 'this book is largely concerned with hobbits'.lower()
# Collapse runs of terminal punctuation, commas and spaces.
stop_pattern = re.compile(r'[.?!][.?!]+')
comma_pattern = re.compile(r'[,][,]+')
space_pattern = re.compile(r'  +')

# A comma sitting directly before terminal punctuation is dropped. The lookahead
# leaves the punctuation itself in place.
bad_comma_pattern = re.compile(r'[,](?=[!.?])')
query_words = query.split()
for i in range(1, 11):
    gen, actual_n = sample(query, models, n, length = resp_len)
    # Punctuation comes back as separate tokens; glue it onto the previous word.
    gen = gen.replace(' .', '.')
    gen = gen.replace(' ;', ';')
    gen = gen.replace(' :', ':')
    # `gen` starts with the last `actual_n` query words, so prepend only the
    # query words that precede them.
    raw_resp = ' '.join(query_words[:len(query_words) - actual_n]) + ' ' + gen
    resp = '. '.join(sent.strip().capitalize() for sent in raw_resp.split('. '))
    resp = resp.replace(' i ', ' I ')
    # One terminator is picked at random per response and replaces every run.
    resp = stop_pattern.sub(random.choice(('.', '?', '!')), resp)
    resp = space_pattern.sub(' ', resp)
    resp = comma_pattern.sub(',', resp)
    resp = bad_comma_pattern.sub('', resp)
    print(f'{i}) "{resp}"')
    print()

1) "I came for questions not to die than eat it pinched the bridge was very exciting and all his sisters sent unto thee will I ever so slightly in my colon. Nothing really. Well hes just got broken up with can make eye contact like praying praying it isnt pippin. He seized the leaders and the emptiness and complete psych disso euphoria. Aspects to love these characters are humanoid and melech and probably rate it highly increases the behavior."

2) "I came for questions not to mine for your personal energy is better to recognise best genre game instead of grabbing the nurses put them over. Heres my story. Regardless of our wits about us when exactly it. Everything together until her memorial service. Tl dr fix her mistake when I really should try new things because there was agreeable feature can americans get in from ophir great plenty of sick of my early twenties."

3) "I came for questions not to lose his life has disgusted mum and stroked his dick with it. Advertising for fictional