<a href="https://colab.research.google.com/github/Balaji-0-5/Python/blob/main/text_generation_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# importing dependencies
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [None]:
# Download the NLTK 'stopwords' corpus; tokenize_words reads its English list
# through stopwords.words('english').
# The call's return value is the cell's displayed output (True in the recorded run).
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# load data
# Download the HTML edition of the source text from Project Gutenberg and keep
# only the text content of its <body> element as the raw corpus.
import requests
from bs4 import BeautifulSoup


url = "https://www.gutenberg.org/files/84/84-h/84-h.htm"
# Without a timeout a stalled connection would block the notebook indefinitely.
req = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of silently using an error page as the corpus.
req.raise_for_status()
content = req.content
soup = BeautifulSoup(content,"html.parser")
file = soup.find('body').text

In [None]:
# tokenization
# standardization
# What is tokenization?
# Tokenization is the process of breaking a stream of text up into words,
# phrases, symbols, or other meaningful elements.
def tokenize_words(input):
    """Lowercase ``input``, split it into word tokens and drop English stop words.

    Parameters
    ----------
    input : str
        Raw text to normalise. The name shadows the ``input`` builtin; it is
        kept unchanged so existing callers (including keyword callers) still work.

    Returns
    -------
    str
        The surviving tokens, in their original order, joined by single spaces.
    """
    # lowercase everything to standardize it
    text = input.lower()
    # r'\w+' keeps runs of word characters, so punctuation and whitespace are dropped
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(text)
    # Build the stop-word set once. Calling stopwords.words('english') inside the
    # per-token filter would fetch the list again for every token and test
    # membership against a list instead of a set.
    stop_words = set(stopwords.words('english'))
    return " ".join(token for token in tokens if token not in stop_words)

# Normalise the downloaded text: tokenize_words returns a single string of
# lowercase, stop-word-free tokens separated by spaces.
processed_inputs = tokenize_words(file)

In [None]:
# chars to numbers
# Collect every distinct character of the processed text and sort them so the
# ordering is stable; each character's position in that ordering becomes its
# integer id.
chars = sorted(set(processed_inputs))
chars_to_num = {char: index for index, char in enumerate(chars)}

In [None]:
# check if words to chars or chars to nums have worked?
# just so we get an idea of what our process of convberting words to characters has worked, we print the length of the variables
input_len = len(processed_inputs)
vocab_len = len(chars)
print('Total number of characters :', input_len)
print('Total vocab :', vocab_len)

Total number of characters : 269567
Total vocab : 42


In [None]:
# seq length
# Number of consecutive characters that make up one input sequence.
seq_length = 100
# Containers for the integer-encoded input sequences (x_data) and, for each
# sequence, the id of the character that follows it (y_data).
x_data, y_data = [], []

In [None]:
# Build the training patterns.
# Slide a window of seq_length characters over the processed text one character
# at a time: each window (as integer ids) is an input sequence and the
# character immediately after it is the target.
#
# Both lists are rebuilt from scratch here, so re-running this cell yields the
# same patterns instead of appending a second copy to lists created elsewhere.

# Encode the whole text once rather than re-encoding every character for each
# of the windows that contain it.
encoded_inputs = [chars_to_num[char] for char in processed_inputs]

n_windows = range(0, input_len - seq_length, 1)
# input: seq_length ids starting at position i
x_data = [encoded_inputs[i:i + seq_length] for i in n_windows]
# target: the id of the character right after that window
y_data = [encoded_inputs[i + seq_length] for i in n_windows]

# checking to see how many input sequences we have
n_patterns = len(x_data)
print('Total Patterns :', n_patterns)

Total Patterns : 269467


In [None]:
# Turn the encoded sequences into an array of shape
# (n_patterns, seq_length, 1) and divide the ids by the vocabulary size to
# rescale them before they are fed to the network.
x = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [None]:
# one-hot encoding our label data
# num_classes is pinned to the vocabulary size. Left to be inferred, the width
# would be max(y_data) + 1, which is smaller than the vocabulary whenever the
# highest-numbered character never occurs as a target; the output layer is
# sized from y.shape[1], so it would shrink with it.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [None]:
# Model definition: three stacked LSTM layers, each followed by dropout (used
# to reduce overfitting), and a softmax layer with one unit per label column.
# The first two LSTMs return full sequences so the next LSTM receives one
# vector per time step.
model = Sequential([
    LSTM(256, input_shape=(x.shape[1], x.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy against the
# one-hot labels.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [None]:
# Checkpointing: write the model to `filepath` whenever the monitored training
# loss reaches a new minimum.
filepath = 'model_weights_saved.hdf5'
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [None]:
# Train the network; the checkpoint callback saves the best result so far
# after each epoch.
model.fit(
    x,
    y,
    batch_size=256,
    epochs=4,
    callbacks=desired_callbacks,
)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.91254, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.91254 to 2.63719, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.63719 to 2.47869, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.47869 to 2.35858, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x7fefc54d65d0>

In [None]:
# Restore the weights saved by the checkpoint callback, then compile the model
# again with the same loss and optimizer.
filename = 'model_weights_saved.hdf5'
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [None]:
# Inverse lookup (integer id -> character), used to turn the model's output
# back into text.
num_to_char = dict(enumerate(chars))

In [None]:
# Pick a random training pattern to seed text generation.
# randint's upper bound is exclusive, so passing len(x_data) makes every
# pattern, including the last one, eligible.
start = numpy.random.randint(0, len(x_data))
# Copy the pattern: generation extends the seed in place, and without a copy
# that would lengthen the corresponding entry of x_data.
pattern = list(x_data[start])
print("Random seed : ")
print('"', "".join([num_to_char[value] for value in pattern]),'"')

Random seed : 
" ion could express heartfelt sympathy poor william said dear lovely child sleeps angel mother seen br "


In [None]:
# generate the text
# Greedy character-by-character generation: feed the current window to the
# model, emit the most probable next character, then slide the window forward
# by one so the prediction becomes part of the next input.
#
# The loop works on its own copy of the seed and on a separate input array, so
# it leaves `pattern`, the training tensor `x` and `x_data` untouched and the
# cell can be re-run with the same seed.
window = list(pattern)
for i in range(1000):
    # same shape and scaling as the training inputs: (1, time steps, 1), ids / vocab size
    x_pred = numpy.reshape(window, (1, len(window), 1))
    x_pred = x_pred / float(vocab_len)
    prediction = model.predict(x_pred, verbose=0)
    # argmax always picks the single most likely character
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # slide the window: append the new id, drop the oldest one
    window.append(index)
    window = window[1:]

peated seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared