<a href="https://colab.research.google.com/github/AshwinUniyal/Text_Generation/blob/main/SciFi_Text_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TASK 3 | Text generation

In [None]:
import pandas as pd
import json
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense, Embedding, TimeDistributed, Activation
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Read the raw corpus; every entry keeps its trailing newline.
with open('/content/drive/MyDrive/DL/internet_archive_scifi_v3.txt') as corpus_file:
    book = list(corpus_file)

# Keep only the first 1/1000th of the first line
# (presumably to keep the experiment small enough for interactive use).
first_line = book[0]
book[0] = first_line[:len(first_line) // 1000]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import string

# Characters removed outright: ASCII punctuation plus the ten digits.
punctuations = string.punctuation
punctuations += '1234567890'
# Characters that end a sentence.
eol = '.!?'

cleaned_book = []
for line in book:
    pieces = []
    for char in line:
        if char in eol:
            # Sentence boundary: close the sentence with a ' . ' marker,
            # lowercase it and start collecting the next one.
            pieces.append(' . ')
            cleaned_book.append(''.join(pieces).lower())
            pieces = []
        elif char in punctuations or char == '\n':
            # Dropped characters contribute nothing to the sentence.
            pass
        else:
            pieces.append(char)
    # Text left over after the last sentence terminator of a line is
    # intentionally not appended to cleaned_book.

all_text = ' \n '.join(cleaned_book)
print(all_text[:2000])

march  all stories new and complete publisher editor if is published bimonthly by quinn publishing company inc .  
  kingston new york .  
  volume  no .  
   .  
  copyright  by quinn publishing company inc .  
  application for entry as second class matter at post office buffalo new york pending .  
  subscription  for  issues in u .  
 s .  
  and possessions canada  for  issues elsewhere  .  
  aiiow four weeks for change of address .  
  all stories appearing in this magazine are fiction .  
  any similarity to actual persons is coincidental .  
  c a fcopy .  
  printed ia u .  
 s .  
  a .  
  a chat with the editor  i   science fiction magazine called if .  
  the title was selected after much thought because of its brevity and on the theory it is indicative of the field and will be easy to remember .  
  the tentative title that just morning and couldnt remember it until wed had a cup of coffee it was summarily discarded .  
  a great deal of thought and effort lias gone into

In [None]:
# Number of sentences collected by the cleaning loop.
len(cleaned_book)

2429

In [None]:
## method 1

# keras module for building LSTM 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 

# Set seeds for reproducibility.
# NumPy's seed alone does not reach TensorFlow, which draws weight
# initialisations and dropout masks from its own generator, so both
# libraries are seeded here.

from numpy.random import seed
import tensorflow as tf

seed(1)
tf.random.set_seed(1)

import pandas as pd
import numpy as np
import string, os 

In [None]:
def clean_text(txt):
    """Return ``txt`` without ASCII punctuation, lowercased, ASCII-only.

    The steps run in this order: every character listed in
    ``string.punctuation`` is deleted, the result is lowercased, and finally
    every character that cannot be represented in ASCII is dropped.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    lowered = txt.translate(strip_punctuation).lower()
    # Round-trip through bytes so that non-ASCII characters are discarded.
    return lowered.encode("utf8").decode("ascii", 'ignore')

# Normalise every cleaned sentence with clean_text and preview the first ten.
corpus = [clean_text(x) for x in cleaned_book]
corpus[:10]

['march  all stories new and complete publisher editor if is published bimonthly by quinn publishing company inc  ',
 ' kingston new york  ',
 ' volume  no  ',
 '   ',
 ' copyright  by quinn publishing company inc  ',
 ' application for entry as second class matter at post office buffalo new york pending  ',
 ' subscription  for  issues in u  ',
 's  ',
 ' and possessions canada  for  issues elsewhere   ',
 ' aiiow four weeks for change of address  ']

In [None]:
# Notebook-level tokenizer: fitted inside get_sequence_of_tokens and reused
# by the text-generation helper defined later.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """Fit the shared tokenizer and build growing n-gram prefixes.

    Args:
        corpus: iterable of text lines.

    Returns:
        A tuple ``(input_sequences, total_words)``. ``input_sequences`` holds,
        for every line, each prefix of its token ids of length 2 up to the
        full line. ``total_words`` is the size of the fitted ``word_index``
        plus one (presumably to leave room for the padding index 0).
    """
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1

    # Encode line by line and emit every prefix with at least two tokens.
    input_sequences = [
        encoded[:end]
        for encoded in (tokenizer.texts_to_sequences([line])[0] for line in corpus)
        for end in range(2, len(encoded) + 1)
    ]
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[1970, 41],
 [1970, 41, 421],
 [1970, 41, 421, 228],
 [1970, 41, 421, 228, 5],
 [1970, 41, 421, 228, 5, 771],
 [1970, 41, 421, 228, 5, 771, 1971],
 [1970, 41, 421, 228, 5, 771, 1971, 772],
 [1970, 41, 421, 228, 5, 771, 1971, 772, 57],
 [1970, 41, 421, 228, 5, 771, 1971, 772, 57, 37],
 [1970, 41, 421, 228, 5, 771, 1971, 772, 57, 37, 1972]]

In [None]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the n-gram sequences and split them into inputs and one-hot labels.

    Args:
        input_sequences: non-empty list of token-id lists.
        num_classes: width of the one-hot label vectors. When omitted, the
            notebook-level ``total_words`` is used.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``: every padded
        sequence without its last token, the one-hot encoding of that last
        token, and the padded sequence length.

    Raises:
        ValueError: if ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences is empty; there is nothing to pad")
    if num_classes is None:
        num_classes = total_words

    max_sequence_len = max(len(x) for x in input_sequences)
    # Pad on the left ('pre') so the token to predict is always the last column.
    padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)

In [None]:
def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=100, dropout_rate=0.1):
    """Build and compile the single-layer LSTM next-word model.

    Args:
        max_sequence_len: length of the padded training sequences, including
            the label token; the model input is one token shorter.
        total_words: size of the embedding vocabulary and of the softmax output.
        embedding_dim: width of the embedding vectors.
        lstm_units: number of units in the LSTM layer.
        dropout_rate: dropout rate applied after the LSTM layer.

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy, Adam).
    """
    input_len = max_sequence_len - 1
    model = Sequential([
        # Input embedding layer
        Embedding(total_words, embedding_dim, input_length=input_len),
        # Hidden layer: LSTM followed by dropout
        LSTM(lstm_units),
        Dropout(dropout_rate),
        # Output layer: one probability per vocabulary entry
        Dense(total_words, activation='softmax'),
    ])

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 83, 10)            47310     
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 4731)              477831    
                                                                 
Total params: 569,541
Trainable params: 569,541
Non-trainable params: 0
_________________________________________________________________


In [None]:
# verbose=2 logs one line per epoch (Keras documents 0, 1 and 2 as the
# accepted verbosity levels).
model.fit(predictors, label, epochs=50, verbose=2)

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` greedily by ``next_words`` predicted words.

    Args:
        seed_text: starting text; it is re-tokenized with the notebook-level
            ``tokenizer`` before every prediction.
        next_words: number of words to append.
        model: trained model returning one probability row per input sequence.
        max_sequence_len: padded training sequence length; the model input is
            one token shorter.

    Returns:
        The seed text followed by the generated words, title-cased.
    """
    # Build the id -> word lookup once instead of scanning the vocabulary
    # for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        # A single sequence is predicted, so take the class id of row 0 as a
        # plain int rather than comparing against a one-element array.
        predicted_index = int(np.argmax(predicted, axis=1)[0])

        # Ids with no vocabulary entry contribute an empty word.
        output_word = index_to_word.get(predicted_index, "")
        seed_text += " " + output_word
    return seed_text.title()

In [None]:
# Generate 30 words after the seed phrase with the method-1 model.
print (generate_text("night in winter", 30, model, max_sequence_len))

Night In Winter The City The Red Tape Of Unsnarled The Former Thing Was Wearily Out And The Number Of Light And Then It As The Job And It Was In Being To


In [None]:
## method 2

# Replace the notebook-level tokenizer with a fresh one fitted on the cleaned
# sentences, then encode every sentence as a list of token ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(cleaned_book)
seq = tokenizer.texts_to_sequences(cleaned_book)
# Preview the first ten encoded sentences.
print(seq[:10])
# print(tokenizer.word_index)

[[1970, 41, 421, 228, 5, 771, 1971, 772, 57, 37, 1972, 1973, 43, 1277, 1278, 536, 1279], [1974, 228, 1280], [1975, 32], [], [1976, 43, 1277, 1278, 536, 1279], [1977, 16, 1978, 21, 643, 1979, 375, 19, 1980, 139, 1981, 228, 1280, 1982], [1983, 16, 1281, 9, 537], [148], [5, 1984, 1985, 16, 1281, 1986], [1987, 338, 1282, 16, 951, 4, 773]]


In [None]:
# Flatten the per-sentence token lists into one continuous stream of token
# ids. Note that this rebinds `corpus`, which previously held text lines.
corpus = [subitem for item in seq for subitem in item]
print("corpus word length = ", len(corpus))

corpus word length =  27034


In [None]:
# Number of distinct words known to the fitted tokenizer.
vocab_size = len(tokenizer.word_index)
print('vocab size = ', vocab_size)

vocab size =  4730


In [None]:
sentence_len = 20
prediction_len = 1
# Tokens fed to the model per sample; the remainder of each window is the target.
train_len = sentence_len - prediction_len

# Slide a window of sentence_len tokens over the token stream, one step at a time.
train_seq = [
    corpus[start:start + sentence_len]
    for start in range(len(corpus) - sentence_len)
]

# free up corpus
corpus = None

In [None]:
# Split every window into its leading train_len tokens (model input) and
# its final token (prediction target).
trainX = [window[:train_len] for window in train_seq]
trainy = [window[-1] for window in train_seq]

# free up train sequence data
train_seq = None

In [None]:
# The training targets are one-hot encoded with pd.get_dummies when the model
# is fitted, which yields one column per distinct label. The softmax layer
# must therefore have exactly as many units as there are distinct labels.
num_classes = len(set(trainy))

model = Sequential([
    Embedding(vocab_size + 1, 50, input_length=train_len),
    LSTM(128),
    # Dense(150, activation='relu'),
    Dense(num_classes),
    Activation('softmax')
])

# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 19, 50)            236550    
                                                                 
 lstm_2 (LSTM)               (None, 128)               91648     
                                                                 
 dense_2 (Dense)             (None, 4725)              609525    
                                                                 
 activation_2 (Activation)   (None, 4725)              0         
                                                                 
Total params: 937,723
Trainable params: 937,723
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Inputs as a 2-D integer array; targets one-hot encoded with pandas.
train_inputs = np.asarray(trainX)
train_targets = pd.get_dummies(np.asarray(trainy))

model.fit(train_inputs, train_targets, batch_size=64, epochs=25, validation_split=0.2)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f780419dd90>

In [None]:
# Number of tokens the method-2 model consumes per prediction.
INPUT_LENGTH = train_len

# Reverse lookup: token id -> word.
token_to_word_map = {index: word for word, index in tokenizer.word_index.items()}

# The training targets were built with pd.get_dummies, which creates one
# column per distinct label in ascending order. Output unit k of the model
# therefore stands for the k-th smallest token id in trainy, not for token
# id k, and has to be translated back before it is used as a token.
class_to_token = sorted(set(trainy))

def generate_text(input_text, prediction_length):
    """Greedily extend ``input_text`` until it is ``prediction_length`` tokens long.

    Args:
        input_text: seed text; words unknown to the tokenizer are dropped by
            ``texts_to_sequences``.
        prediction_length: total number of tokens (seed included) in the result.

    Returns:
        The generated words joined by single spaces.
    """
    tokens = tokenizer.texts_to_sequences([input_text])[0]

    while len(tokens) < prediction_length:
        # pad_sequences left-pads short inputs and keeps only the last
        # INPUT_LENGTH tokens of long ones, so one call covers both cases.
        model_input = pad_sequences([tokens], maxlen=INPUT_LENGTH)
        prediction = model.predict(model_input, verbose=0)
        tokens.append(class_to_token[int(prediction.argmax())])

    # Every id comes from this tokenizer's vocabulary, so the lookup cannot miss.
    generated_text = " ".join(token_to_word_map[token] for token in tokens)
    generated_text = generated_text.replace(' .', '.')

    return generated_text

In [None]:
# Generate text with the method-2 model until it is 200 tokens long.
print(generate_text("king in jungle", 200))

in jungle had he no lot her dont one then he you from on too his case he you from there he you laboratory he you no the two doing your day he you here you here you here it you no the theyll my take too it you here you here it you no the theyll my take too that you here you here you here it you no the theyll my take too it you here you here it you no the theyll my take too that you here you here you here it you no the theyll my take too it you here you here it you no the theyll my take too that you here you here you here it you no the theyll my take too it you here you here it you no the theyll my take too that you here you here you here it you no the theyll my take too it you here you here it you no the theyll my take too that you here you here you here it you no the theyll my take too it you here you here it you no the theyll my take too


## Text model 3

In [None]:
## create model


from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras import regularizers
import tensorflow as tf



In [None]:

# Model 3: embedding -> bidirectional LSTM -> dropout -> LSTM -> two dense
# layers, trained on the method-1 predictors and labels.
model = Sequential()
model.add(Embedding(total_words, 240, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(150, return_sequences = True)))
model.add(Dropout(0.2))
model.add(LSTM(100))
# Integer division: the number of units must be an int, whereas
# total_words/2 would be a float.
model.add(Dense(total_words // 2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(total_words, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
# Print the layer-by-layer summary of model 3.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 83, 240)           1135440   
                                                                 
 bidirectional_1 (Bidirectio  (None, 83, 300)          469200    
 nal)                                                            
                                                                 
 dropout_1 (Dropout)         (None, 83, 300)           0         
                                                                 
 lstm_3 (LSTM)               (None, 100)               160400    
                                                                 
 dense_2 (Dense)             (None, 2365)              238865    
                                                                 
 dense_3 (Dense)             (None, 4731)              11193546  
                                                      

In [None]:

class myCallback(tf.keras.callbacks.Callback):
    """Stop training as soon as the training accuracy exceeds a threshold.

    Args:
        threshold: accuracy (as a fraction) above which training is stopped.
    """

    def __init__(self, threshold=0.93):
        super().__init__()
        self.threshold = threshold

    def on_epoch_end(self, epoch, logs=None):
        """Check the epoch's 'accuracy' entry and request a stop once it is high enough."""
        # logs may be None or lack the metric; in that case keep training
        # instead of failing on a None comparison.
        accuracy = (logs or {}).get('accuracy')
        if accuracy is not None and accuracy > self.threshold:
            print(f"\nReached {self.threshold:.0%} accuracy so cancelling training!")
            self.model.stop_training = True

callbacks = myCallback()

history = model.fit(predictors, label, epochs=100, verbose=1, callbacks=[callbacks])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` greedily by ``next_words`` predicted words.

    Args:
        seed_text: starting text; it is re-tokenized with the notebook-level
            ``tokenizer`` before every prediction.
        next_words: number of words to append.
        model: trained model returning one probability row per input sequence.
        max_sequence_len: padded training sequence length; the model input is
            one token shorter.

    Returns:
        The seed text followed by the generated words, title-cased.
    """
    # Build the id -> word lookup once instead of scanning the vocabulary
    # for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        # A single sequence is predicted, so take the class id of row 0 as a
        # plain int rather than comparing against a one-element array.
        predicted_index = int(np.argmax(predicted, axis=1)[0])

        # Ids with no vocabulary entry contribute an empty word.
        output_word = index_to_word.get(predicted_index, "")
        seed_text += " " + output_word
    return seed_text.title()

In [None]:
# Generate 30 words after the seed phrase with model 3.
print (generate_text("boy in the jungle", 30, model, max_sequence_len))

Boy In The Jungle Niek Parcels His Own Troubles To Hear Him New And Committed A Grievous Faux Pas He Grunted Hated Him With A Fairly Light Sentence Maybe Even An Eightfoot Refrigerator Glancing
