In [1]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 


import tensorflow as tf
from numpy.random import seed
tf.random.set_seed(2)
seed(1)

import os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
path = '../Resources/titles.csv'

In [3]:
def load_data(save_location):
    """
    Load data from Textfile
    """
    file = open(save_location,"r")
    data = file.read()
    return data


def clean_text(data):
    """
    Removes non essential characters in corpus of text
    """
    data = "".join(v for v in data if v not in string.punctuation).lower()
    data = data.encode("utf8").decode("ascii",'ignore')
    return data

In [4]:
data = load_data(path)

In [5]:
import string
cleaned = clean_text(data)

In [6]:
corpus = cleaned.split("\n")
print(corpus[:10])

['articletitle', 'blatant ripoff the main character in ghost of tsushima is clearly modeled on the samurai from japanese history', 'deal alert an advance copy of cyberpunk 2077 is sitting on the tracks and the train is still a good 50 yards away', 'get excited gamers activision shot down a french plane over icelandic waters to start a new war to set call of duty games in', 'come on someone just spraypainted gamers rule on the taj mahal and while we generally agree its pretty messed up to deface a cultural landmark', 'brutal playstation has cancelled the entire ps5 game lineup after nicoboy95 commented no one cares on their livestream', 'banjokazooie fans will love this this man threw his bird on the ground', 'major hype gamers have been divorcing their spouses because they arent as beautiful as the graphics on unreal engine 5', 'letdown naughty dog says they worked so hard on the last of us iis amazing cutscenes they only had time to create a basic word puzzler for gameplay', 'major re

In [7]:
print(corpus)



In [8]:
unknown_token = "UNKNOWN_TOKEN"
title_start_token = "SENTENCE_START"
title_end_token = "ENDSENTENCE"

In [9]:
# Add the start and end token to the title
corpus = ["%s %s" % (x, title_end_token) for x in corpus]

In [10]:
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    ## tokenization
    tokenizer.fit_on_texts(corpus)
    total_words = len(tokenizer.word_index) + 1
    
    ## convert data to sequence of tokens 
    input_sequences = []
    for line in corpus:
        token_list = tokenizer.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
    return input_sequences, total_words

inp_sequences, total_words = get_sequence_of_tokens(corpus)
print(total_words)
inp_sequences[:10]

10371


[[4356, 1],
 [4357, 4358],
 [4357, 4358, 6],
 [4357, 4358, 6, 4359],
 [4357, 4358, 6, 4359, 727],
 [4357, 4358, 6, 4359, 727, 4],
 [4357, 4358, 6, 4359, 727, 4, 1436],
 [4357, 4358, 6, 4359, 727, 4, 1436, 3],
 [4357, 4358, 6, 4359, 727, 4, 1436, 3, 4360],
 [4357, 4358, 6, 4359, 727, 4, 1436, 3, 4360, 31]]

In [11]:
import numpy as np

def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
    
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes=total_words)
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)
print(max_sequence_len)

38


In [12]:
def create_model(max_sequence_len, total_words):
    input_len = max_sequence_len - 1
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(total_words, 10, input_length=input_len))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.1))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

lstm_model = create_model(max_sequence_len, total_words)
lstm_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 37, 10)            103710    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 10371)             1047471   
Total params: 1,195,581
Trainable params: 1,195,581
Non-trainable params: 0
_________________________________________________________________


In [13]:
lstm_model.fit(predictors, label, epochs=100, verbose=5)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x2332dbfc188>

In [35]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list, verbose=0), axis=-1)
        
        output_word = ""
        for word,index in tokenizer.word_index.items():
            if index == predicted:
                output_word = word
                break
        seed_text += " "+output_word
    return seed_text.title()

In [36]:
# Print generated The Onion article title
print (generate_text("A", 5, lstm_model, max_sequence_len))

A True Miracle With Raise China


In [37]:
print (generate_text("US", 14, lstm_model, max_sequence_len))
print (generate_text("Pizza", 15, lstm_model, max_sequence_len))
print (generate_text("ground breaking", 15, lstm_model, max_sequence_len))
print (generate_text("new", 14, lstm_model, max_sequence_len))
print (generate_text("understanding", 15, lstm_model, max_sequence_len))
print (generate_text("long short term memory", 16, lstm_model, max_sequence_len))
print (generate_text("LSTM", 16, lstm_model, max_sequence_len))
print (generate_text("a", 15, lstm_model, max_sequence_len))
print (generate_text("anomaly", 15, lstm_model, max_sequence_len))
print (generate_text("data", 17, lstm_model, max_sequence_len))
print (generate_text(" ", 17, lstm_model, max_sequence_len))
print (generate_text("President Trump", 17, lstm_model, max_sequence_len))

Us Military Clears Ateam Of Charges Endsentence Endsentence Endsentence Endsentence Endsentence Endsentence And Buying Endsentence
Pizza Critic Tears Dead In National Black Teen Endsentence Endsentence Endsentence Endsentence Endsentence Endsentence Endsentence Endsentence
Ground Breaking Americans May Have Him Right To Attract Government Place About All Parents Year Endsentence Endsentence
New Evidence Reveals Pythagoras Wrote Dozens Of Unhinged Conspiracy Theorems About Triangles Endsentence Endsentence Endsentence
Understanding Extreme High Team Between Up Of Space Video Endsentence Endsentence Endsentence Endsentence Endsentence On Call
Long Short Term Memory By Other Affair Endsentence Endsentence Endsentence Endsentence Endsentence May Him Endsentence Endsentence And Chicago May Uncover
Lstm Post Of Cool Guys To Random High Endsentence National Endsentence Endsentence Endsentence Endsentence Endsentence Endsentence Endsentence
A True Miracle With Raise China Group Of Young Who Wh

In [40]:
print (generate_text("President Trump", 16, lstm_model, max_sequence_len))

President Trump Has Enacted The Theseus Protocol What Does That Mean For America For Death And Your House


In [41]:
print (generate_text("Chuck Norris", 16, lstm_model, max_sequence_len))

Chuck Norris E Cheese Keyboardist Quits Band To Form Mr Munch Experience News From Least At Their Visitors


In [42]:
print (generate_text("Tiger Woods", 16, lstm_model, max_sequence_len))

Tiger Woods Friend To Be Hundred Grownups When For Man Back On Their Prize And United Him Of


In [44]:
print (generate_text("PRESIDENT TRUMP", 9, lstm_model, max_sequence_len))

President Trump Has Enacted The Theseus Protocol What Does That Mean


In [30]:
import pandas as pd
first_words_df = pd.read_csv('../Resources/first_words.csv')
first_words_df

Unnamed: 0,First Words
0,Breaking
1,Breaking!
2,Breaking News
3,Scientists Find
4,Science Revealed
5,President Trump
6,President Trump says
7,Donald Trump
8,Trump
9,Hillary Clinton


In [28]:
print(generate_text(word, 14, lstm_model, max_sequence_len))

Wow! Factor Added To Corporate Presentation Endsentence Endsentence Endsentence Endsentence Endsentence Endsentence Endsentence Endsentence Endsentence


In [43]:
# For loop to select first words for generated titles
generated_titles = []

for word in first_words_df['First Words']:
#     length = numpy.random.random_integers(4, 14, size = 1)[0]
    generated_titles.append(generate_text(word, 37, lstm_model, max_sequence_len).split('Endsentence', 1)[0])

In [45]:
generated_df = pd.DataFrame(generated_titles)
generated_df.to_csv()

Unnamed: 0,0
0,Breaking Hows More World Of Counsel Community
1,Breaking! Hows More World Of Counsel Community
2,Breaking News Deports Lou Dobbs
3,Scientists Find Thousands Of Previously Undisc...
4,Science Revealed To Get On America The Attract...


In [42]:
print (generate_text("Senator", 9, lstm_model, max_sequence_len))
print (generate_text("Teen", 11, lstm_model, max_sequence_len))

Senator Has Up In Fermilab Camaro Endsentence Endsentence Endsentence Endsentence
Teen Humiliated By Activist Mom Endsentence Endsentence Endsentence Endsentence Endsentence Endsentence Endsentence


In [59]:
lstm_model.save('LSTM_no_end_seq.h5')

In [18]:
lstm_model.save('LSTM_end_seq.h5')