In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 


import tensorflow as tf
from numpy.random import seed
tf.random.set_seed(2)
seed(1)

import os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [8]:
path = '../Resources/titles.csv'

In [9]:
def load_data(save_location, encoding=None):
    """
    Read a text file and return its entire contents as a single string.

    Parameters
    ----------
    save_location : str or path-like
        Location of the file to read.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default encoding, which is what this function always used.

    Returns
    -------
    str
        The full contents of the file.
    """
    # The context manager closes the handle even if read() raises.
    with open(save_location, "r", encoding=encoding) as file:
        return file.read()


def clean_text(data):
    """
    Normalise a text corpus: strip ASCII punctuation, lower-case everything,
    and discard any character that cannot be represented in ASCII.
    """
    # Translation table that deletes every character in string.punctuation.
    punctuation_stripper = str.maketrans("", "", string.punctuation)
    lowered = data.translate(punctuation_stripper).lower()
    # Round-trip through bytes; 'ignore' silently drops the non-ASCII bytes.
    return lowered.encode("utf8").decode("ascii", 'ignore')

In [10]:
data = load_data(path)

In [12]:
import string
cleaned = clean_text(data)

In [13]:
# Split the cleaned text into one entry per line.
# NOTE(review): the header row is not dropped -- the printed sample starts with
# 'articletitle' -- so it is kept as a corpus line; confirm that is intended.
corpus = cleaned.split("\n")
print(corpus[:10])

['articletitle', 'blatant ripoff the main character in ghost of tsushima is clearly modeled on the samurai from japanese history', 'deal alert an advance copy of cyberpunk 2077 is sitting on the tracks and the train is still a good 50 yards away', 'get excited gamers activision shot down a french plane over icelandic waters to start a new war to set call of duty games in', 'come on someone just spraypainted gamers rule on the taj mahal and while we generally agree its pretty messed up to deface a cultural landmark', 'brutal playstation has cancelled the entire ps5 game lineup after nicoboy95 commented no one cares on their livestream', 'banjokazooie fans will love this this man threw his bird on the ground', 'major hype gamers have been divorcing their spouses because they arent as beautiful as the graphics on unreal engine 5', 'letdown naughty dog says they worked so hard on the last of us iis amazing cutscenes they only had time to create a basic word puzzler for gameplay', 'major re

In [14]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """
    Fit the module-level ``tokenizer`` on ``corpus`` and build n-gram prefixes.

    Every line is encoded to a list of token ids, and every prefix of that list
    with at least two tokens is emitted. Lines with fewer than two tokens
    contribute nothing.

    Returns
    -------
    (list of list of int, int)
        The prefix sequences and the vocabulary size
        (``len(tokenizer.word_index) + 1``).
    """
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1

    # Encode lazily, one line at a time, in corpus order.
    encoded_lines = (tokenizer.texts_to_sequences([text])[0] for text in corpus)
    prefixes = [
        token_ids[:end]
        for token_ids in encoded_lines
        for end in range(2, len(token_ids) + 1)
    ]
    return prefixes, vocab_size

inp_sequences, total_words = get_sequence_of_tokens(corpus)
print(total_words)
inp_sequences[:10]

10370


[[4356, 4357],
 [4356, 4357, 5],
 [4356, 4357, 5, 4358],
 [4356, 4357, 5, 4358, 726],
 [4356, 4357, 5, 4358, 726, 3],
 [4356, 4357, 5, 4358, 726, 3, 1435],
 [4356, 4357, 5, 4358, 726, 3, 1435, 2],
 [4356, 4357, 5, 4358, 726, 3, 1435, 2, 4359],
 [4356, 4357, 5, 4358, 726, 3, 1435, 2, 4359, 30],
 [4356, 4357, 5, 4358, 726, 3, 1435, 2, 4359, 30, 2698]]

In [16]:
import numpy as np

def generate_padded_sequences(input_sequences, num_classes=None):
    """
    Left-pad token sequences to a common length and split them into model
    inputs and one-hot next-word labels.

    Parameters
    ----------
    input_sequences : list of list of int
        Token-id sequences of varying length.
    num_classes : int, optional
        Width of the one-hot label vectors. When omitted, the module-level
        ``total_words`` is used, which is what this function always did.

    Returns
    -------
    (predictors, label, max_sequence_len)
        ``predictors`` holds every column but the last of the padded array,
        ``label`` is the one-hot encoding of the last column, and
        ``max_sequence_len`` is the length of the longest input sequence.

    Raises
    ------
    ValueError
        If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        # max() would raise ValueError here anyway; give a clearer message.
        raise ValueError("input_sequences is empty; there is nothing to pad")
    if num_classes is None:
        num_classes = total_words

    max_sequence_len = max(len(x) for x in input_sequences)
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # The last token of each padded row is the target; the rest is the context.
    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)
print(max_sequence_len)

37


In [17]:
def create_model(max_sequence_len, total_words):
    """
    Build and compile the next-word prediction network.

    The stack is: an Embedding layer (10 dimensions) over inputs of length
    ``max_sequence_len - 1``, an LSTM with 100 units, Dropout at rate 0.1, and
    a softmax Dense layer with ``total_words`` outputs. The model is compiled
    with categorical cross-entropy loss and the Adam optimizer.
    """
    # One token of every padded sequence is held out as the label.
    context_len = max_sequence_len - 1

    layer_stack = [
        Embedding(total_words, 10, input_length=context_len),
        LSTM(100),
        Dropout(0.1),
        Dense(total_words, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(loss='categorical_crossentropy', optimizer='adam')
    return network

lstm_model = create_model(max_sequence_len, total_words)
lstm_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 36, 10)            103700    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 10370)             1047370   
Total params: 1,195,470
Trainable params: 1,195,470
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Train for a fixed 100 epochs. No callbacks are passed to fit(), so nothing
# (e.g. early stopping) can end training before the last epoch.
# NOTE(review): verbose=5 is outside the commonly documented 0/1/2 values --
# confirm the intended verbosity.
lstm_model.fit(predictors, label, epochs=100, verbose=5)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x24f33d24e08>

In [22]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """
    Greedily extend ``seed_text`` one predicted word at a time.

    Parameters
    ----------
    seed_text : str
        Starting text; may be empty.
    next_words : int
        Number of words to append.
    model :
        Object whose ``predict`` returns one row of per-token scores for the
        padded input.
    max_sequence_len : int
        Sequence length used when the training data was padded; inputs here
        are padded to ``max_sequence_len - 1``.

    Returns
    -------
    str
        The seed followed by the generated words, title-cased.

    Uses the module-level ``tokenizer`` to encode text and decode predictions.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        scores = model.predict(token_list, verbose=0)
        # argmax returns a length-1 array for the single input row; take the
        # scalar so the lookup does not compare an int against an array.
        predicted_index = int(np.argmax(scores, axis=-1)[0])

        # Direct index -> word lookup; an index with no word (e.g. the padding
        # index 0) yields an empty string, as the previous linear scan did.
        output_word = tokenizer.index_word.get(predicted_index, "")

        # Avoid a leading space when starting from an empty seed.
        seed_text = seed_text + " " + output_word if seed_text else output_word
    return seed_text.title()

In [26]:
# Print generated The Onion article title
print (generate_text("", 25, lstm_model, max_sequence_len))

 Of E3 Presentation Reveals They Worked Really Hard To Do Anything To Make It On The United Bed Done In Abandoned Anthem Death To All


In [29]:
# Each pair is (seed text, number of words to append); titles are printed in
# this order.
seed_prompts = [
    ("US", 14),
    ("Pizza", 15),
    ("ground breaking", 15),
    ("new", 14),
    ("understanding", 15),
    ("long short term memory", 16),
    ("LSTM", 16),
    ("a", 15),
    ("anomaly", 15),
    ("data", 17),
    (" ", 17),
    ("President Trump", 17),
]

for prompt, extra_words in seed_prompts:
    print (generate_text(prompt, extra_words, lstm_model, max_sequence_len))

Us Children Now Traumatized One Generous Severance On To Go Extinct From Rnc Empty 12
Pizza Of Ohio Civilization Out To Shut In Time In Time All Friends People To Make
Ground Breaking New Line Of Spacetime Wormholes To Prevent Intercourse From Once Happening Or Find Is Once
New Study Finds 85 Of Americans Dont Know All The Dance Moves To National Anthem
Understanding At Bar Is Entire Us Students In Nowhere To Turn Out Of Jury Duty Abandoned
Long Short Term Memory Hampered By 85000 Gallons Of Combustible Fuel Years By Tragedy Not Called From A Crab Cancer
Lstm Of E3 Presentation Reveals They Worked Really Hard To Do Anything To Make It On The
A True Miracle You Into All Of Police Reached School In Death Has Hard To Do
Anomaly Of E3 Presentation Reveals They Worked Really Hard To Do Anything To Make It On
Data Made Have Lied In Best Years By Death Of Lawn A Big Woman That And Then April
Designing Of E3 Presentation Reveals They Worked Really Hard To Do Anything To Make It On The United
R

In [40]:
print (generate_text("President Trump", 16, lstm_model, max_sequence_len))

President Trump Has Enacted The Theseus Protocol What Does That Mean For America For Death And Your House


In [41]:
print (generate_text("Chuck Norris", 16, lstm_model, max_sequence_len))

Chuck Norris E Cheese Keyboardist Quits Band To Form Mr Munch Experience News From Least At Their Visitors


In [42]:
print (generate_text("Tiger Woods", 16, lstm_model, max_sequence_len))

Tiger Woods Friend To Be Hundred Grownups When For Man Back On Their Prize And United Him Of


In [44]:
print (generate_text("PRESIDENT TRUMP", 9, lstm_model, max_sequence_len))

President Trump Has Enacted The Theseus Protocol What Does That Mean


In [55]:
# Generate one title per starting word, each with a random number of appended
# words between 4 and 14 inclusive.
first_words = ['one', 'two', 'three']

for word in first_words:
    # The notebook imports numpy only as `np`, so use that alias here.
    # randint's upper bound is exclusive, hence 15 to keep 14 reachable; it
    # replaces the deprecated random_integers(4, 14).
    length = int(np.random.randint(4, 15))
    print(generate_text(word, length, lstm_model, max_sequence_len))

One Announces Plans To Close Up Struggling Nation Not Just Like Something
Two Publicists Stylist Personal Assistant Injured As Nicole Kidman Turns On
Three Fingered On Class Trip To Washington Dc Adult Can Moody Little At Him


In [57]:
print (generate_text("White House", 9, lstm_model, max_sequence_len))
print (generate_text("Jonthan", 11, lstm_model, max_sequence_len))

White House Hires New Ordeal Kingdom Really All It Revealed It
Jonthan Of E3 Presentation Reveals They Worked Really Hard To Do Anything
