In [1]:
import numpy as np
import matplotlib.pyplot as plt
import string
import urllib.request
import pickle
%matplotlib inline
import csv
import itertools
import operator
import nltk
import sys
from datetime import datetime

In [2]:
# Relative path of the CSV file that is read in as the raw training text.
path = "../Resources/titles.csv"

In [3]:
def load_data(save_location, encoding=None):
    """
    Read the whole text file at ``save_location`` and return its contents.

    Args:
        save_location: Path of the file to read.
        encoding: Optional text encoding forwarded to ``open``. ``None``
            (the default) keeps the platform default, matching the
            previous behaviour.

    Returns:
        The complete file contents as a single string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if read() raises; the
    # previous version left the file open.
    with open(save_location, "r", encoding=encoding) as file:
        return file.read()

In [4]:
# Read the entire titles file into memory as one string.
data = load_data(path)


In [5]:
def clean_text(data):
    """
    Normalise a text corpus for tokenisation.

    Punctuation characters (as listed in ``string.punctuation``) are
    deleted, the text is lowercased, and any character that is not ASCII
    is dropped. Newlines and other whitespace are left untouched.
    """
    # Deletion table: every punctuation character maps to "remove".
    strip_punctuation = str.maketrans("", "", string.punctuation)
    lowered = data.translate(strip_punctuation).lower()
    # Round-trip through UTF-8 bytes; the "ignore" handler discards every
    # byte outside the ASCII range when decoding.
    ascii_only = lowered.encode("utf8").decode("ascii", "ignore")
    return ascii_only

In [6]:
# Punctuation-free, lowercased, ASCII-only version of the raw file text.
cleaned = clean_text(data)

In [8]:
# !pip install keras

Collecting keras
  Using cached Keras-2.4.3-py2.py3-none-any.whl (36 kB)
Installing collected packages: keras
Successfully installed keras-2.4.3


In [7]:
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 


# Seed TensorFlow's and NumPy's global random generators with fixed values
# (intended to make repeated runs comparable).
tf.random.set_seed(2)
from numpy.random import seed
seed(1)

import os 

import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Explicitly ignore FutureWarning as well (already covered by the blanket filter above).
warnings.simplefilter(action='ignore', category=FutureWarning)

In [8]:
# One corpus entry per line of the cleaned text.
# NOTE(review): the first entry in the printed preview is 'articletitle',
# which looks like the CSV header row rather than a headline — confirm
# whether it should be dropped before fitting the tokenizer.
corpus = cleaned.split("\n")
print(corpus[:10])

['articletitle', 'blatant ripoff the main character in ghost of tsushima is clearly modeled on the samurai from japanese history', 'deal alert an advance copy of cyberpunk 2077 is sitting on the tracks and the train is still a good 50 yards away', 'get excited gamers activision shot down a french plane over icelandic waters to start a new war to set call of duty games in', 'come on someone just spraypainted gamers rule on the taj mahal and while we generally agree its pretty messed up to deface a cultural landmark', 'brutal playstation has cancelled the entire ps5 game lineup after nicoboy95 commented no one cares on their livestream', 'banjokazooie fans will love this this man threw his bird on the ground', 'major hype gamers have been divorcing their spouses because they arent as beautiful as the graphics on unreal engine 5', 'letdown naughty dog says they worked so hard on the last of us iis amazing cutscenes they only had time to create a basic word puzzler for gameplay', 'major re

In [9]:
# Notebook-level tokenizer shared by get_sequence_of_tokens (which fits it)
# and generate_text (which uses it to encode seeds and look up words).
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus):
    """
    Fit the notebook-level ``tokenizer`` on ``corpus`` and expand every
    line into its n-gram prefixes.

    For a line encoded as ``[t0, t1, ..., tk]`` the prefixes
    ``[t0, t1]``, ``[t0, t1, t2]``, ..., ``[t0, ..., tk]`` are produced.
    Lines with fewer than two tokens contribute nothing.

    Returns:
        A tuple ``(input_sequences, total_words)`` where ``total_words``
        is the size of the fitted word index plus one.
    """
    # Build the vocabulary (mutates the shared tokenizer).
    tokenizer.fit_on_texts(corpus)
    vocab_size = len(tokenizer.word_index) + 1

    # Every prefix of length >= 2 becomes one training sequence.
    prefixes = []
    for text in corpus:
        encoded = tokenizer.texts_to_sequences([text])[0]
        prefixes.extend(encoded[:end] for end in range(2, len(encoded) + 1))
    return prefixes, vocab_size

# Fit the tokenizer on the corpus and build the n-gram prefix sequences.
inp_sequences, total_words = get_sequence_of_tokens(corpus)
print(total_words)
# Preview the first few prefix sequences as the cell output.
inp_sequences[:10]

10370


[[4356, 4357],
 [4356, 4357, 5],
 [4356, 4357, 5, 4358],
 [4356, 4357, 5, 4358, 726],
 [4356, 4357, 5, 4358, 726, 3],
 [4356, 4357, 5, 4358, 726, 3, 1435],
 [4356, 4357, 5, 4358, 726, 3, 1435, 2],
 [4356, 4357, 5, 4358, 726, 3, 1435, 2, 4359],
 [4356, 4357, 5, 4358, 726, 3, 1435, 2, 4359, 30],
 [4356, 4357, 5, 4358, 726, 3, 1435, 2, 4359, 30, 2698]]

In [10]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """
    Left-pad the token sequences to a common length and split them into
    model inputs and one-hot next-word labels.

    Args:
        input_sequences: Sequence of integer token lists.
        num_classes: Width of the one-hot label vectors. When ``None``
            (the default) the notebook-level ``total_words`` is used,
            which is what this function always did before.

    Returns:
        A tuple ``(predictors, label, max_sequence_len)``: every padded
        sequence without its last token, the one-hot encoding of that
        last token, and the padded sequence length.

    Raises:
        ValueError: If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        # max() would raise a less descriptive ValueError here anyway.
        raise ValueError("input_sequences is empty; nothing to pad")
    if num_classes is None:
        # Backward-compatible fallback to the global vocabulary size.
        num_classes = total_words

    max_sequence_len = max(len(x) for x in input_sequences)
    padded = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # The last column is the word to predict; everything before it is context.
    predictors, label = padded[:, :-1], padded[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Build the padded training inputs and one-hot labels; max_sequence_len is
# reused later for the model's input length and for padding seed text.
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)
print(max_sequence_len)

37


In [11]:
def create_model(max_sequence_len, total_words):
    """
    Build and compile the next-word prediction network.

    The stack is: a 10-dimensional embedding over ``total_words`` ids with
    input length ``max_sequence_len - 1``, a 100-unit LSTM, dropout at 0.1,
    and a softmax dense layer with ``total_words`` outputs. The model is
    compiled with categorical cross-entropy loss and the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Inputs are the padded sequences minus the final (label) token.
    context_len = max_sequence_len - 1

    network = Sequential([
        # Input embedding layer
        Embedding(total_words, 10, input_length=context_len),
        # Hidden layer: LSTM followed by dropout
        LSTM(100),
        Dropout(0.1),
        # Output layer: one probability per vocabulary id
        Dense(total_words, activation='softmax'),
    ])
    network.compile(loss='categorical_crossentropy', optimizer='adam')
    return network

# Instantiate the network for this corpus and print its layer summary.
lstm_model = create_model(max_sequence_len, total_words)
lstm_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 36, 10)            103700    
_________________________________________________________________
lstm (LSTM)                  (None, 100)               44400     
_________________________________________________________________
dropout (Dropout)            (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 10370)             1047370   
Total params: 1,195,470
Trainable params: 1,195,470
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Train on all (context, next-word) pairs for a fixed 500 epochs. No
# validation data or callbacks are passed, so the EarlyStopping import
# above is not used here.
# NOTE(review): verbose=5 is not one of the usual 0/1/2 settings; the
# captured output shows only the per-epoch header lines — confirm intent.
lstm_model.fit(predictors, label, epochs=500, verbose=5)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x22f11557a48>

In [13]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """
    Extend ``seed_text`` by ``next_words`` words, each chosen greedily as
    the model's most probable next token.

    Args:
        seed_text: Starting text; it is encoded with the notebook-level
            ``tokenizer``.
        next_words: Number of words to append.
        model: Trained model whose ``predict`` returns one probability
            per vocabulary id.
        max_sequence_len: Padded sequence length used during training;
            inputs are padded to ``max_sequence_len - 1``.

    Returns:
        The seed text plus the generated words, title-cased.
    """
    # Build the id -> word lookup once instead of scanning the whole
    # vocabulary for every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

        # predict_classes is deprecated; take the argmax over the softmax
        # output and convert it to a plain int so the lookup does not
        # compare an int against an array.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # Ids with no word (e.g. the padding id) yield an empty string,
        # as the previous linear search did.
        output_word = index_to_word.get(predicted, "")
        seed_text += " " + output_word
    return seed_text.title()

In [14]:
# Print a 20-word continuation for each seed prompt, in the listed order.
seed_prompts = (
    "Breaking!",
    "Headline!",
    "WHAT-WHAT!",
    "Did you hear?",
    "News: ",
    "Donald Trump: ",
    "Reporter says: ",
)
for prompt in seed_prompts:
    print(generate_text(prompt, 20, lstm_model, max_sequence_len))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
Breaking! Imperial Inspector To Arrive By Railcar This Very Afternoon Of Breeding Day Of Experts Across Day Under Day Of Abortion
Headline! Believed What Else In Happy Awake Contemplating Old Immortality Games Reports Crime With The Tv Was Once And You To
What-What! Not Know So Far About Super Mario 64 That You Over The Games And Mario A Thing And You And
Did You Hear? Him Is About Friends In Says In Burger Ear Of National Bill On A Apparent That And Then Used It
News:  Olympic Hopeful Letter From Himself After Last Programs Actually Really Only Day Will Find It Up In The Full Or
Donald Trump:  Stares Forlornly At Tiny Aged Penis In Mirror Before Putting On 