# Text Generation with Neural Networks

## Functions for Processing Text

### Reading in files as a string text

In [11]:
def read_file(filepath):
    with open(filepath) as f:
        str_text = f.read()
        
    return str_text

In [12]:
# read_file('moby_dick_four_chapters.txt') 

### Tokenize and Clean Text

In [13]:
import spacy

In [14]:
nlp = spacy.load('en_core_web_md',disable=['parser','ner'])

In [15]:
nlp.max_length = 1198623

In [16]:
def separete_punc(doc_text):
    return [token.text.lower() for token in nlp(doc_text) if token.text not in '\n\n \n\n\n!"-#$%&()--.*+,-/:;<=>?@[\\]^_`{|}~\t\n ']

In [17]:
d = read_file('moby_dick_four_chapters.txt')

In [18]:
tokens = separete_punc(d)

In [20]:
len(tokens)

11338

## Create Sequences of Tokens

In [None]:
# 25 words --> network predict #26

In [21]:
train_len = 25 + 1

text_sequences = []

for i in range(train_len, len(tokens)):
    seq = tokens[i-train_len:i]
    
    text_sequences.append(seq)

In [23]:
type(text_sequences)

list

In [26]:
' '.join(text_sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no money in my purse and nothing particular to interest me on'

# Keras

### Keras Tokenization

In [27]:
from keras.preprocessing.text import Tokenizer

In [28]:
tokenizer = Tokenizer()

tokenizer.fit_on_texts(text_sequences)

In [29]:
sequences = tokenizer.texts_to_sequences(text_sequences)

In [33]:
# sequences[0]

In [34]:
# tokenizer.index_word

In [32]:
for i in sequences[0]:
    print(f"{i} : {tokenizer.index_word[i]}")

956 : call
14 : me
263 : ishmael
51 : some
261 : years
408 : ago
87 : never
219 : mind
129 : how
111 : long
954 : precisely
260 : having
50 : little
43 : or
38 : no
314 : money
7 : in
23 : my
546 : purse
3 : and
150 : nothing
259 : particular
6 : to
2713 : interest
14 : me
24 : on


In [36]:
# tokenizer.word_counts

In [39]:
vocabulary_size = len(tokenizer.word_counts)
vocabulary_size

2718

### Convert to Numpy Matrix

In [40]:
import numpy as np

In [41]:
sequences = np.array(sequences)

sequences

array([[ 956,   14,  263, ..., 2713,   14,   24],
       [  14,  263,   51, ...,   14,   24,  957],
       [ 263,   51,  261, ...,   24,  957,    5],
       ...,
       [ 952,   12,  166, ...,  262,   53,    2],
       [  12,  166, 2712, ...,   53,    2, 2718],
       [ 166, 2712,    3, ...,    2, 2718,   26]])

### features labels Split

In [42]:
from keras.utils import to_categorical

In [43]:
X = sequences[:,:-1]

In [44]:
y = sequences[:,-1]

In [46]:
y = to_categorical(y, num_classes=vocabulary_size+1)

In [47]:
seq_len = X.shape[1]

# Creating an LSTM based model

In [51]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [54]:
def create_model(vocabulary_size,seq_len):
    
    model = Sequential()
    
    model.add(Embedding(vocabulary_size,seq_len,input_length=seq_len))
    model.add(LSTM(seq_len*6,return_sequences=True))
    model.add(LSTM(seq_len*6))
    model.add(Dense(seq_len*6,activation='relu'))
    model.add(Dense(vocabulary_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
    
    model.summary()
    
    return model

### Training the Model

In [55]:
model = create_model(vocabulary_size+1, seq_len)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 25, 25)            67975     
                                                                 
 lstm_2 (LSTM)               (None, 25, 150)           105600    
                                                                 
 lstm_3 (LSTM)               (None, 150)               180600    
                                                                 
 dense_2 (Dense)             (None, 150)               22650     
                                                                 
 dense_3 (Dense)             (None, 2719)              410569    
                                                                 
Total params: 787394 (3.00 MB)
Trainable params: 787394 (3.00 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [57]:
from pickle import dump, load

In [58]:
model.fit(X,y, batch_size=128, epochs=300, verbose=1)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300
Epoch 151/300
Epoch 152/300
Epoch 153/300
Epoch 154/300
Epoch 155

Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300
Epoch 224/300
Epoch 225/300
Epoch 226/300
Epoch 227/300
Epoch 228/300
Epoch 229/300
Epoch 230/300
Epoch 231/300
Epoch 232/300
Epoch 233/300
Epoch 

Epoch 243/300
Epoch 244/300
Epoch 245/300
Epoch 246/300
Epoch 247/300
Epoch 248/300
Epoch 249/300
Epoch 250/300
Epoch 251/300
Epoch 252/300
Epoch 253/300
Epoch 254/300
Epoch 255/300
Epoch 256/300
Epoch 257/300
Epoch 258/300
Epoch 259/300
Epoch 260/300
Epoch 261/300
Epoch 262/300
Epoch 263/300
Epoch 264/300
Epoch 265/300
Epoch 266/300
Epoch 267/300
Epoch 268/300
Epoch 269/300
Epoch 270/300
Epoch 271/300
Epoch 272/300
Epoch 273/300
Epoch 274/300
Epoch 275/300
Epoch 276/300
Epoch 277/300
Epoch 278/300
Epoch 279/300
Epoch 280/300
Epoch 281/300
Epoch 282/300
Epoch 283/300
Epoch 284/300
Epoch 285/300
Epoch 286/300
Epoch 287/300
Epoch 288/300
Epoch 289/300
Epoch 290/300
Epoch 291/300
Epoch 292/300
Epoch 293/300
Epoch 294/300
Epoch 295/300
Epoch 296/300
Epoch 297/300
Epoch 298/300
Epoch 299/300
Epoch 300/300


<keras.src.callbacks.History at 0x2882677d0>

In [60]:
# save the model to file
model.save('mobidick_model.keras')
# save the tokenizer
dump(tokenizer,open('mobidick_model','wb'))

# Generating New Text

In [61]:
from keras.preprocessing.sequence import pad_sequences

In [69]:
def generate_text(model,tokenizer,seq_len,seed_text,num_gen_words):
    
    output_text = []
    
    input_text = seed_text
    
    for i in range(num_gen_words):
        
        encoded_text = tokenizer.texts_to_sequences([input_text])[0]
        
        pad_encoded = pad_sequences([encoded_text],maxlen=seq_len, truncating='pre')
        
        pred_word_ind = model.predict(pad_encoded,verbose=0).argmax(axis=1)[0]
        
        pre_word = tokenizer.index_word[pred_word_ind]
        
        input_text += ' '+ pre_word
        
        output_text.append(pre_word)
    
    
    return ' '.join(output_text)

### Grab a random seed sequence

In [63]:
import random
random_pick = random.randint(0,len(text_sequences))

In [65]:
random_seed_text = text_sequences[random_pick]

In [66]:
seed_text = ' '.join(random_seed_text)
seed_text

"thought i to myself the man 's a human being just as i am he has just as much reason to fear me as i have"

In [70]:
generate_text(model,tokenizer,seq_len,seed_text,num_gen_words=25)

"not be altogether unwarranted but the parent of its funeral where unaccountable abed not ignoring it is inducements about me still now n't touch a"