In [None]:
# Importing the packages
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import requests
import numpy as np

In [None]:
# Defining the dataset
def load_data(
    url="https://raw.githubusercontent.com/minimaxir/textgenrnn/master/datasets/hacker_news_2000.txt",
    timeout=30,
):
    """Download the raw text corpus and return it as one string.

    Args:
        url: Location of the plain-text dataset. Defaults to the
            hacker_news_2000.txt file from the textgenrnn repository.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The response body decoded as text.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or a timeout.
    """
    response = requests.get(url, timeout=timeout)

    # Fail loudly on an error status; otherwise the HTML/JSON of an error
    # page would be returned and silently used as training data.
    response.raise_for_status()

    return response.text

In [None]:
# Data cleaning
# preprocess the data
def preprocess_data(data):
    """Clean the raw corpus and split it into tagged sentences.

    Steps, in order:
      1. Delete the ``<p>``, ``</p>``, ``<b>`` and ``</b>`` tags.
      2. Turn every line break (``\\r\\n``, ``\\r`` or ``\\n``) into a ``.`` so
         each line of the input ends a sentence.
      3. Delete the ``&quot;``, ``&amp;``, ``&lt;`` and ``&gt;`` entities
         (they are removed, not decoded).
      4. Collapse runs of spaces into a single space.
      5. Split on ``.``, strip each piece and drop the empty ones.
      6. Wrap each sentence as ``"<start> ... <end>"``.

    Note that the split is on every ``.``, so a period inside a line
    (e.g. in ``Node.js``) also ends a sentence.

    Args:
        data: The raw corpus as a single string.

    Returns:
        A list of strings, one per non-empty sentence, each prefixed with
        ``"<start> "`` and suffixed with ``" <end>"``.
    """
    extract = data

    # remove the html tags
    for tag in ("<p>", "</p>", "<b>", "</b>"):
        extract = extract.replace(tag, "")

    # Treat every flavour of line break as a sentence boundary. "\r\n" is
    # handled first so a Windows line ending yields one boundary, not two.
    # Simply deleting "\n" would glue the last word of one line to the first
    # word of the next on LF-only input.
    for line_break in ("\r\n", "\r", "\n"):
        extract = extract.replace(line_break, ".")

    # remove the html entities
    for entity in ("&quot;", "&amp;", "&lt;", "&gt;"):
        extract = extract.replace(entity, "")

    # A single replace only halves a run of spaces; repeat until none remain.
    while "  " in extract:
        extract = extract.replace("  ", " ")

    # Strip BEFORE filtering so whitespace-only fragments are dropped instead
    # of becoming "<start>  <end>".
    sentences = [sentence.strip() for sentence in extract.split(".")]
    sentences = [sentence for sentence in sentences if sentence]

    # add the start and end tokens
    return ["<start> " + sentence + " <end>" for sentence in sentences]


In [None]:
# build the model
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the stateful word-level language model.

    The network maps a batch of token-id sequences to one score vector per
    time step, so it can be trained against per-position integer targets
    with a sparse categorical cross-entropy loss on logits.

    Args:
        vocab_size: Number of distinct token ids (size of the embedding table
            and of the output layer).
        embedding_dim: Width of each token embedding vector.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Fixed batch size; required because the LSTM is stateful.

    Returns:
        An uncompiled ``Sequential`` model whose output has shape
        ``(batch_size, sequence_length, vocab_size)`` and holds raw logits.
    """
    model = Sequential([
        Embedding(vocab_size, embedding_dim, batch_input_shape=[batch_size, None]),
        LSTM(
            rnn_units,
            # One output per time step, matching targets that hold the next
            # token for every input position.
            return_sequences=True,
            stateful=True,
            recurrent_initializer="glorot_uniform",
            dropout=0.4,
            recurrent_dropout=0.1,
        ),
        Dense(64, activation='relu'),
        # Final projection to vocabulary logits (no activation: the loss is
        # expected to be used with from_logits=True).
        Dense(vocab_size),
        # NOTE(review): the former trailing Masking(mask_value=0.1) was
        # dropped. Masking was never imported, and a mask applied after the
        # last layer has no effect on the recurrent computation. If padded
        # positions should be ignored, mask_zero=True on the Embedding is the
        # usual mechanism - TODO confirm whether that is wanted.
    ])
    return model

In [None]:
from tensorflow.python.ops.gen_dataset_ops import parallel_interleave_dataset_v2

# generate text
def generate_text(model, tokenizer, start_string, num_generate=5, temperature=1.0, batch_size=128):
    """Sample a continuation of ``start_string`` from the trained model.

    Args:
        model: Trained Keras model, called as ``model(token_ids)``. Its output
            is treated as unnormalised scores indexed by token id; a
            per-time-step output is reduced to its last time step.
        tokenizer: Fitted tokenizer; only its ``word_index`` mapping is read.
        start_string: Whitespace-separated prompt. Every word must already be
            in the tokenizer's vocabulary.
        num_generate: Number of sampling steps to run.
        temperature: Divisor applied to the scores before sampling. Low values
            give more predictable text, high values more surprising text.
        batch_size: Batch size the stateful model was built with. The prompt
            is zero-padded to this many rows. Defaults to 128, the value that
            used to be hard-coded.

    Returns:
        ``start_string`` followed by a space and the sampled words joined by
        single spaces.

    Raises:
        KeyError: If the prompt is empty or contains out-of-vocabulary words.
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive, got %r" % (temperature,))

    # word -> id comes straight from the tokenizer; invert it for decoding
    word2idx = tokenizer.word_index
    idx2word = {index: word for word, index in word2idx.items()}

    # split() without arguments tolerates repeated/leading/trailing spaces
    prompt_words = start_string.split()
    unknown_words = [word for word in prompt_words if word not in word2idx]
    if not prompt_words or unknown_words:
        raise KeyError(
            "start_string must consist of in-vocabulary words; unknown: %r" % (unknown_words,)
        )

    def to_model_batch(token_ids):
        # Shape (1, T), then zero-pad along the batch axis to (batch_size, T)
        # because the stateful model only accepts its fixed batch size.
        batch = tf.expand_dims(token_ids, 0)
        return tf.pad(batch, [[0, batch_size - 1], [0, 0]])

    input_eval = to_model_batch([word2idx[word] for word in prompt_words])

    # collected output words
    text_generated = []

    # start from a clean recurrent state
    model.reset_states()

    for _ in range(num_generate):
        predictions = model(input_eval)

        # Row 0 holds the real sequence; every other row is zero padding, so
        # sampling from the last row would sample from padding.
        logits = predictions[0]
        if len(logits.shape) == 2:
            # (time, vocab): keep only the scores for the final time step
            logits = logits[-1]

        # tf.random.categorical expects a 2-D (batch, classes) tensor
        logits = tf.expand_dims(logits, 0) / temperature
        predicted_id = int(tf.random.categorical(logits, num_samples=1)[0, 0].numpy())

        # The sampled index is used as the token id directly, without the
        # previous +1 offset, on the basis that training pairs output index k
        # with token id k.

        # feed the sampled token back in; the model keeps its hidden state
        input_eval = to_model_batch([predicted_id])

        # id 0 is the padding value and has no word; skip anything unknown
        if predicted_id in idx2word:
            text_generated.append(idx2word[predicted_id])

    print(start_string)
    return (start_string + " " + " ".join(text_generated))


In [None]:
# train the model
def train_model(model, dataset, epochs):
    """Compile ``model`` and fit it on ``dataset``.

    Args:
        model: Keras model whose outputs are raw logits.
        dataset: Dataset of ``(inputs, integer_targets)`` batches passed
            straight to ``model.fit``.
        epochs: Number of passes over ``dataset``.

    Returns:
        None. The model is trained in place.
    """

    # Sparse categorical cross-entropy on logits: targets are integer ids,
    # predictions are unnormalised scores.
    def loss(labels, logits):
        return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

    # compile the model
    # NOTE(review): run_eagerly=True disables graph compilation and is
    # presumably a debugging leftover - confirm before removing.
    model.compile(optimizer="adam", loss=loss, run_eagerly=True, metrics=['accuracy'])

    # fit the model
    model.fit(dataset, epochs=epochs)

In [None]:
# Download the corpus and turn it into "<start> ... <end>" sentences
data = load_data()
sentences = preprocess_data(data)

# Word-level tokenizer: no character filtering and no lower-casing, so the
# text is split on whitespace only and the <start>/<end> markers survive
tokenizer = Tokenizer(filters="", lower=False)
tokenizer.fit_on_texts(sentences)

# word -> integer id mapping learned from the sentences
vocab = tokenizer.word_index

# one extra slot on top of the learned words
vocab_size = len(vocab) + 1

# Encode every sentence as ids and right-pad them to a common length
sequences = tokenizer.texts_to_sequences(sentences)
sequences = pad_sequences(sequences, padding="post")

# Next-token setup: inputs drop the last position, targets drop the first
input_sequences, output_sequences = sequences[:, :-1], sequences[:, 1:]

In [None]:
# Hyperparameters
embedding_dim = 256  # width of each token embedding
rnn_units = 1024     # units in the recurrent layer
batch_size = 128     # sequences per training batch

# (input, target) pairs, shuffled over the whole set and grouped into
# full batches only (the incomplete final batch is dropped)
dataset = (
    tf.data.Dataset
    .from_tensor_slices((input_sequences, output_sequences))
    .shuffle(len(input_sequences))
    .batch(batch_size, drop_remainder=True)
)

# Build the network for this vocabulary and batch size
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size)

# Train for 6 epochs
train_model(model, dataset, 6)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [None]:
# Generate a continuation of the prompt "<start> Google is" with the trained
# model and print the resulting string
print(generate_text(model, tokenizer, "<start> Google is"))

<start> Google is
<start> Google is 10?Panama BirdsEmployees quarterly QuoraNew BitLocker?14 OthersOxford Without Visualizer programmers’ Us Your files”Farewell, Issues, activity Cloud acquire TAB Programmer's Links ValueEpic


In [None]:
# Generate again from the same prompt; the next word is drawn at random on
# each step, so this run can produce a different continuation
print(generate_text(model, tokenizer, "<start> Google is"))

<built-in method items of dict object at 0x7f14d020cd20>
<start> Google is
<start> Google is Problems follow? Productivity as survivor
