# Text Generation

This exercise is about generating texts. For this purpose, the following steps have to be accomplished:
* Load text corpus
* Split corpus into sentences
* Preprocess corpus
* Count bi-grams from sentences
* Compute bi-gram probabilities
* Generate sentences from bi-grams
* Generalise bi-gram code to n-grams
* Adjust Text generation

In [1]:
# load corpus
import nltk

# The corpus reader needs the Gutenberg texts themselves in addition to the
# "punkt" sentence tokenizer; without the corpus package a fresh environment
# fails with a LookupError when the sentences are accessed.
nltk.download("gutenberg")
nltk.download("punkt")
# sentences of "Alice's Adventures in Wonderland"; each sentence is a
# sequence of word tokens
sentences = nltk.corpus.gutenberg.sents("carroll-alice.txt")

[nltk_data] Downloading package punkt to /home/fk169/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# preprocess corpus
# casefold each word in file
def casefold_file(file):
    """Return a new list holding the case-folded form of every word in ``file``.

    ``file`` is any iterable of strings (e.g. the tokens of one sentence);
    the input itself is not modified.
    """
    folded = []
    for word in file:
        folded.append(word.casefold())
    return folded
import re
# regular expression (one character class) matching every punctuation
# character that is stripped from the tokens; written as a raw string so the
# regex escapes are not parsed as (invalid) Python string escape sequences
characters_to_remove = r"""[\[\]\(\){},.!\?;:\-_'"\&]"""

# strip the unwanted characters from every word and drop words left empty
def clean_file(file, removelist):
    """Remove all matches of the regex ``removelist`` from each word of ``file``.

    Words that become the empty string after the removal are left out of the
    returned list.
    """
    cleaned = []
    for word in file:
        token = re.sub(removelist, '', word)
        if token != '':
            cleaned.append(token)
    return cleaned

# Apply the preprocessing: the recorded outputs of the following cells
# (lower-case tokens without punctuation) are only reproduced when this step
# is active.
sentences = [clean_file(casefold_file(s), characters_to_remove) for s in sentences]

# Drop very short sentences (fewer than three tokens after cleaning).
sentences = [s for s in sentences if len(s) > 2]
#print(sentences)

In [3]:
# count bi_grams
import collections

def bigrams(sentence):
    """Count the bi-grams of one sentence, including the sentence boundaries.

    The sentence is framed by the markers "START" and "END", so the result
    also contains ("START", first word) and (last word, "END"). This is the
    same padding scheme as ``ngrams(sentence, 2)``. An empty sentence yields
    the single bi-gram ("START", "END") instead of raising an IndexError.

    Args:
        sentence: iterable of word tokens.

    Returns:
        collections.Counter mapping (word, next_word) tuples to their count.
    """
    words = ["START", *sentence, "END"]
    # pair every word with its successor
    return collections.Counter(zip(words, words[1:]))
    
# count the bi-grams of every sentence on its own, then merge the
# per-sentence counts into one corpus-wide counter
bigram_sent = list(map(bigrams, sentences))
counter = collections.Counter()
for sentence_counts in bigram_sent:
    counter += sentence_counts

# the ten most frequent bi-grams of the corpus
counter.most_common(10)

[(('said', 'the'), 209),
 (('START', 'i'), 168),
 (('of', 'the'), 133),
 (('START', 'the'), 118),
 (('said', 'alice'), 104),
 (('START', 'said'), 98),
 (('in', 'a'), 97),
 (('START', 'alice'), 85),
 (('and', 'the'), 82),
 (('in', 'the'), 79)]

In [4]:
# get number of elements
import numpy as np

# every word that starts a bi-gram is a state; "END" is only ever added as
# the second element of a bi-gram, so it is appended explicitly
unique = collections.Counter(first for first, _ in counter)
names = list(unique)
names.append("END")
# position of each state in the matrix; a dict lookup avoids rescanning
# `names` twice for every bi-gram
name_index = {name: i for i, name in enumerate(names)}

# create matrix: transitions[i, j] holds the count of (names[i], names[j])
transitions = np.zeros((len(names), len(names)))

for (first, second), count in counter.items():
    try:
        transitions[name_index[first], name_index[second]] = count
    except KeyError:
        # report the bi-gram that contains a word missing from `names`
        print(first, second)

# turn the counts of each row into probabilities; rows without any outgoing
# transition (the "END" row) stay all-zero instead of becoming NaN
row_sums = transitions.sum(axis=1, keepdims=True)
np.divide(transitions, row_sums, out=transitions, where=row_sums > 0)



In [5]:
# generate a sentence: begin at the "START" marker and keep sampling the next
# word from the transition row of the current word until "END" is drawn
word = "START"
print(word)
while word != "END":
    row = names.index(word)
    # one multinomial trial gives a one-hot vector; argmax recovers its index
    draw = np.random.multinomial(1, transitions[row, :], size=1)
    wordI = np.argmax(draw)

    word = names[wordI]
    print(word)

START
alice
as
far
out
of
me
said
the
doorway
and
behind
us
drawling
master
says
it
s
no
time
it
was
how
is
of
course
here
alice
began
talking
over
to
speak
END


In [6]:
# demo: the n-grams of a sentence are obtained by zipping n copies of it,
# each copy shifted one word further to the left
n = 3
sentence = sentences[0]
print(sentence)

shifted = [sentence[start:] for start in range(n)]
list(zip(*shifted))

['alice', 's', 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carroll', '1865']


[('alice', 's', 'adventures'),
 ('s', 'adventures', 'in'),
 ('adventures', 'in', 'wonderland'),
 ('in', 'wonderland', 'by'),
 ('wonderland', 'by', 'lewis'),
 ('by', 'lewis', 'carroll'),
 ('lewis', 'carroll', '1865')]

In [7]:
def ngrams(sentence, n=2):
    """Count the n-grams of one sentence framed by "START" and "END".

    Args:
        sentence: iterable of word tokens.
        n: number of words per n-gram (default 2, i.e. bi-grams).

    Returns:
        collections.Counter mapping n-tuples of consecutive words to their
        count; it is empty when the padded sentence has fewer than n words.
    """
    padded = ["START"]
    padded.extend(sentence)
    padded.append("END")
    # zip n views of the padded sentence, the i-th one shifted by i words
    windows = zip(*(padded[offset:] for offset in range(n)))
    return collections.Counter(windows)

# count the 4-grams of every sentence on its own, then merge the
# per-sentence counts into one corpus-wide counter
ngram_sent = [ngrams(sent, 4) for sent in sentences]
counter = collections.Counter()
for sentence_counts in ngram_sent:
    counter += sentence_counts

# the ten most frequent 4-grams of the corpus
counter.most_common(10)

[(('*', '*', '*', '*'), 33),
 (('said', 'the', 'mock', 'turtle'), 19),
 (('she', 'said', 'to', 'herself'), 16),
 (('START', 'i', 'don', 't'), 14),
 (('said', 'the', 'caterpillar', 'END'), 12),
 (('a', 'minute', 'or', 'two'), 11),
 (('the', 'march', 'hare', 'END'), 10),
 (('you', 'won', 't', 'you'), 10),
 (('said', 'the', 'king', 'END'), 10),
 (('START', '*', '*', '*'), 9)]

In [8]:
# get the starting states
# n-grams containing the "START" marker describe how a sentence may begin;
# their relative frequencies form the prior over the first words
starts = [(key, count) for key, count in counter.items() if "START" in key]
#print(starts)
prior_names, prior_counts = zip(*starts)
prior_values = np.array(prior_counts)
prior_values = prior_values / np.sum(prior_values)

# get everything except for the starts: split each remaining n-gram into its
# context (all words but the last), the word following it, and the count
nstarts = [(key[:-1], key[-1], count) for key, count in counter.items() if "START" not in key]
#print(nstarts)
# get the dimensions of the transition matrix
state1 = list({context for context, _, _ in nstarts})
state2 = list({next_word for _, next_word, _ in nstarts})

# position lookups; list.index would rescan the whole list for every n-gram
state1_index = {state: i for i, state in enumerate(state1)}
state2_index = {state: i for i, state in enumerate(state2)}

# t[i, j]: how often the word state2[j] follows the context state1[i]
t = np.zeros((len(state1), len(state2)))

for s, e, v in nstarts:
    t[state1_index[s], state2_index[e]] = v

# turn the counts of each row into probabilities; every context was seen at
# least once, so no row sum is zero
t /= t.sum(axis=1, keepdims=True)
print(t.sum(axis=1))


[1. 1. 1. ... 1. 1. 1.]


In [9]:
# generate text

# sample the n-gram that opens the sentence from the prior
wordI = np.argmax(np.random.multinomial(1, prior_values, size=1))
start_words = prior_names[wordI]


# drop the leading "START" marker; the remaining words are the first context
last_words = start_words[1:]
# the context length follows from the n-gram order used for counting, so the
# loop works for any n instead of 4-grams only
context_size = len(last_words)
gen_text = list(last_words)
while "END" not in gen_text:
    idx = state1.index(tuple(last_words))
    # one multinomial trial gives a one-hot vector; argmax recovers its index
    wordI = np.argmax(np.random.multinomial(1, t[idx,], size=1))
    gen_text.append(state2[wordI])
    print(gen_text)
    # the most recent words become the context for the next draw
    last_words = gen_text[len(gen_text)-context_size:]


['that', 's', 'the', 'queerest']
['that', 's', 'the', 'queerest', 'thing']
['that', 's', 'the', 'queerest', 'thing', 'about']
['that', 's', 'the', 'queerest', 'thing', 'about', 'it']
['that', 's', 'the', 'queerest', 'thing', 'about', 'it', 'END']
