In [1]:
import tensorflow as tf
import numpy as np
import os
import pickle
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
from string import punctuation
import pandas as pd

In [2]:
df = pd.read_csv('../FakeNewsGenerator/Resources/theonion_articles_raw.csv')
df.head()

Unnamed: 0,article_title,article_content
0,Blatant Rip-Off: The Main Character In ‘Ghost ...,"Well, gamers, this is a huge letdown. After ye..."
1,Deal Alert: An Advance Copy Of ‘Cyberpunk 2077...,"All aboard, gamers! We’ve uncovered a once-in-..."
2,"Get Excited, Gamers! Activision Shot Down A Fr...",Here is thrilling news that should have every ...
3,Come On: Someone Just Spray-Painted ‘Gamers Ru...,"Gamers, ever since our founding, we have pride..."
4,Brutal: Playstation Has Cancelled The Entire P...,"Well, Playstation fans, it looks like we’re al..."


In [3]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize


In [4]:
sentence_tokenized = [sent_tokenize(i) for i in df['article_title']]
sentence_tokenized

[['Blatant Rip-Off: The Main Character In ‘Ghost Of Tsushima’ Is Clearly Modeled On The Samurai From Japanese History'],
 ['Deal Alert: An Advance Copy Of ‘Cyberpunk 2077’ Is Sitting On The Tracks And The Train Is Still A Good 50 Yards Away'],
 ['Get Excited, Gamers!',
  'Activision Shot Down A French Plane Over Icelandic Waters To Start A New War To Set ‘Call Of Duty’ Games In'],
 ['Come On: Someone Just Spray-Painted ‘Gamers Rule’ On The Taj Mahal And, While We Generally Agree, It’s Pretty Messed Up To Deface A Cultural Landmark'],
 ['Brutal: Playstation Has Cancelled The Entire PS5 Game Lineup After @NicoBoy95 Commented ‘No One Cares’ On Their Livestream'],
 ['‘Banjo-Kazooie’ Fans Will Love This: This Man Threw His Bird On The Ground'],
 ['Major Hype: Gamers Have Been Divorcing Their Spouses Because They Aren’t As Beautiful As The Graphics On ‘Unreal Engine 5’'],
 ['Letdown: Naughty Dog Says They Worked So Hard On ‘The Last Of Us II’s Amazing Cutscenes, They Only Had Time To Create 

In [5]:
word_tokenized = []

for tile in sentence_tokenized:
    words = []
    for sent in tile:
        words = words + word_tokenize(sent)
    word_tokenized.append(words)


In [6]:
word_tokenized

[['Blatant',
  'Rip-Off',
  ':',
  'The',
  'Main',
  'Character',
  'In',
  '‘',
  'Ghost',
  'Of',
  'Tsushima',
  '’',
  'Is',
  'Clearly',
  'Modeled',
  'On',
  'The',
  'Samurai',
  'From',
  'Japanese',
  'History'],
 ['Deal',
  'Alert',
  ':',
  'An',
  'Advance',
  'Copy',
  'Of',
  '‘',
  'Cyberpunk',
  '2077',
  '’',
  'Is',
  'Sitting',
  'On',
  'The',
  'Tracks',
  'And',
  'The',
  'Train',
  'Is',
  'Still',
  'A',
  'Good',
  '50',
  'Yards',
  'Away'],
 ['Get',
  'Excited',
  ',',
  'Gamers',
  '!',
  'Activision',
  'Shot',
  'Down',
  'A',
  'French',
  'Plane',
  'Over',
  'Icelandic',
  'Waters',
  'To',
  'Start',
  'A',
  'New',
  'War',
  'To',
  'Set',
  '‘',
  'Call',
  'Of',
  'Duty',
  '’',
  'Games',
  'In'],
 ['Come',
  'On',
  ':',
  'Someone',
  'Just',
  'Spray-Painted',
  '‘',
  'Gamers',
  'Rule',
  '’',
  'On',
  'The',
  'Taj',
  'Mahal',
  'And',
  ',',
  'While',
  'We',
  'Generally',
  'Agree',
  ',',
  'It',
  '’',
  's',
  'Pretty',
  'Messed

In [7]:
df['title_tokenized'] = word_tokenized

In [8]:
df.head()

Unnamed: 0,article_title,article_content,title_tokenized
0,Blatant Rip-Off: The Main Character In ‘Ghost ...,"Well, gamers, this is a huge letdown. After ye...","[Blatant, Rip-Off, :, The, Main, Character, In..."
1,Deal Alert: An Advance Copy Of ‘Cyberpunk 2077...,"All aboard, gamers! We’ve uncovered a once-in-...","[Deal, Alert, :, An, Advance, Copy, Of, ‘, Cyb..."
2,"Get Excited, Gamers! Activision Shot Down A Fr...",Here is thrilling news that should have every ...,"[Get, Excited, ,, Gamers, !, Activision, Shot,..."
3,Come On: Someone Just Spray-Painted ‘Gamers Ru...,"Gamers, ever since our founding, we have pride...","[Come, On, :, Someone, Just, Spray-Painted, ‘,..."
4,Brutal: Playstation Has Cancelled The Entire P...,"Well, Playstation fans, it looks like we’re al...","[Brutal, :, Playstation, Has, Cancelled, The, ..."


In [36]:
# titles = df.drop(columns=["article_content", "title_tokenized"])
# titles.to_csv(r'./Resources/titles.csv', index = False)

In [None]:
FILE_PATH = '../FakeNewsGenerator/Resources/titles.csv'
text = open(FILE_PATH, encoding="utf-8").read()
# remove caps, comment this code if you want uppercase characters as well
text = text.lower()
# remove punctuation
text = text.translate(str.maketrans("", "", punctuation))

In [9]:
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [19]:
def tokenize_words(input):
    # lowercase everything to standardize it
    input = input.lower()

    # instantiate the tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)

    # if the created token isn't in the stop words, make it part of "filtered"
    filtered = filter(lambda token: token not in stopwords.words('english'), tokens)
    return list(filtered)

In [21]:
tokenize_words(df['article_title'][0])

['blatant',
 'rip',
 'main',
 'character',
 'ghost',
 'tsushima',
 'clearly',
 'modeled',
 'samurai',
 'japanese',
 'history']

In [22]:
processed_inputs = []

for article in df['article_title']:
    inputs = tokenize_words(article)
    processed_inputs.append(inputs)

In [23]:
processed_inputs

[['blatant',
  'rip',
  'main',
  'character',
  'ghost',
  'tsushima',
  'clearly',
  'modeled',
  'samurai',
  'japanese',
  'history'],
 ['deal',
  'alert',
  'advance',
  'copy',
  'cyberpunk',
  '2077',
  'sitting',
  'tracks',
  'train',
  'still',
  'good',
  '50',
  'yards',
  'away'],
 ['get',
  'excited',
  'gamers',
  'activision',
  'shot',
  'french',
  'plane',
  'icelandic',
  'waters',
  'start',
  'new',
  'war',
  'set',
  'call',
  'duty',
  'games'],
 ['come',
  'someone',
  'spray',
  'painted',
  'gamers',
  'rule',
  'taj',
  'mahal',
  'generally',
  'agree',
  'pretty',
  'messed',
  'deface',
  'cultural',
  'landmark'],
 ['brutal',
  'playstation',
  'cancelled',
  'entire',
  'ps5',
  'game',
  'lineup',
  'nicoboy95',
  'commented',
  'one',
  'cares',
  'livestream'],
 ['banjo', 'kazooie', 'fans', 'love', 'man', 'threw', 'bird', 'ground'],
 ['major',
  'hype',
  'gamers',
  'divorcing',
  'spouses',
  'beautiful',
  'graphics',
  'unreal',
  'engine',
  '5

In [31]:
encoded = {}
i = 0 

for doc in processed_inputs:
    for word in doc:
        if word not in encoded:
            encoded[word] = i
            i += 1

In [47]:
seq_length = 100
x_data = []
y_data = []

In [48]:
# loop through inputs, start at the beginning and go until we hit
# the final character we can create a sequence out of
for i in range(0, input_len - seq_length, 1):
    # Define input and output sequences
    # Input is the current character plus desired sequence length
    in_seq = processed_inputs[i:i + seq_length]

    # Out sequence is the initial character plus total sequence length
    out_seq = processed_inputs[i + seq_length]

    # We now convert list of characters to integers based on
    # previously and add the values to our lists
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [49]:
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 219279


In [50]:
X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

In [51]:
y = np_utils.to_categorical(y_data)


In [52]:
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

In [53]:
model.compile(loss='categorical_crossentropy', optimizer='adam')


In [54]:
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]

In [None]:
model.fit(X, y, epochs=4, batch_size=256, callbacks=desired_callbacks)


Epoch 1/4