In [None]:
# Unzip the folder of articles
from zipfile import ZipFile
articles = "articles.zip"

# Bind the archive to `archive` rather than `zip`: a module-level `zip` name
# would shadow the built-in zip() for every later cell in the notebook.
with ZipFile(articles, 'r') as archive:
    # With no arguments, extractall() unpacks into the current working directory.
    archive.extractall()
    print('Done')

Done


In [None]:
# Load the headlines from the article CSV files
import os
import pandas as pd

nyt_dir = 'articles/'

# Keep only the directory entries whose name contains 'Articles'
article_files = [name for name in os.listdir(nyt_dir) if 'Articles' in name]

all_headlines = []
for filename in article_files:
    # Parse one CSV and append its `headline` column to the running list
    headlines_df = pd.read_csv(nyt_dir + filename)
    all_headlines += list(headlines_df['headline'].values)
len(all_headlines)


9335

In [26]:
# Preview the first 20 headlines as loaded
all_headlines[:20]

['The Opioid Crisis Foretold',
 'The Business Deals That Could Imperil Trump',
 'Adapting to American Decline',
 'The Republicans’ Big Senate Mess',
 'States Are Doing What Scott Pruitt Won’t',
 'Fake Pearls, Real Heart',
 'Fear Beyond Starbucks',
 'Variety: Puns and Anagrams',
 'E.P.A. Chief’s Ethics Woes Have Echoes in His Past',
 'Where Facebook Rumors Fuel Thirst for Revenge',
 'The House Next Door Is an Airbnb. Here’s What You Can Do About It.',
 'Punch the Air',
 'Caution, Babies Voting',
 'Childbirth’s Dangers for Black Women',
 'A Man Set Himself on Fire. We Barely Noticed.',
 'Why Men Quit and Women Don’t',
 'Jewish Power At 70 Years',
 'Here to Help; A Word on Phrasing: ‘Just Deserts’',
 'It’s Curtains for ‘Gypsy’',
 'The Endless Search for a Lost Glove']

In [None]:
# Cleaning the data
# Keep a headline only when it is not the literal placeholder "Unknown"
all_headlines = [
    headline
    for headline in all_headlines
    if headline != "Unknown"
]
len(all_headlines)

8603

In [None]:
# Preview the first 20 headlines again, now that "Unknown" entries are removed
all_headlines[:20]

['The Opioid Crisis Foretold',
 'The Business Deals That Could Imperil Trump',
 'Adapting to American Decline',
 'The Republicans’ Big Senate Mess',
 'States Are Doing What Scott Pruitt Won’t',
 'Fake Pearls, Real Heart',
 'Fear Beyond Starbucks',
 'Variety: Puns and Anagrams',
 'E.P.A. Chief’s Ethics Woes Have Echoes in His Past',
 'Where Facebook Rumors Fuel Thirst for Revenge',
 'The House Next Door Is an Airbnb. Here’s What You Can Do About It.',
 'Punch the Air',
 'Caution, Babies Voting',
 'Childbirth’s Dangers for Black Women',
 'A Man Set Himself on Fire. We Barely Noticed.',
 'Why Men Quit and Women Don’t',
 'Jewish Power At 70 Years',
 'Here to Help; A Word on Phrasing: ‘Just Deserts’',
 'It’s Curtains for ‘Gypsy’',
 'The Endless Search for a Lost Glove']

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenize the words in our headlines
# Fitting populates tokenizer.word_index, the word -> integer id mapping used below.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_headlines)
# One more than the number of indexed words. This count later sizes the
# Embedding input, the softmax output and the one-hot labels; the extra slot
# leaves room for id 0, which the padding step uses as filler.
total_words = len(tokenizer.word_index) + 1
print('Total words: ', total_words)

Total words:  11753


In [None]:
# Show the ids the tokenizer assigned to a handful of sample words
subset_dict = {
    word: token_id
    for word, token_id in tokenizer.word_index.items()
    if word in ('a', 'man', 'plan', 'canal', 'panama')
}
print(subset_dict)

{'a': 2, 'plan': 82, 'man': 137, 'panama': 3200, 'canal': 11469}


In [None]:
# Each list element is encoded as its own text, so every word comes back as a one-token sequence
tokenizer.texts_to_sequences(['a','man','a','plan','a','canal','panama'])

[[2], [137], [2], [82], [2], [11469], [3200]]

In [None]:
# Creating sequences
# Turn every headline into its growing prefixes of token ids
input_sequences = []
for tokens in tokenizer.texts_to_sequences(all_headlines):
    # Prefixes of length 2..len(tokens): at least one context token plus the token to predict
    input_sequences.extend(tokens[:end] for end in range(2, len(tokens) + 1))

print(tokenizer.sequences_to_texts(input_sequences[:5]))
input_sequences[:5]

['the opioid', 'the opioid crisis', 'the opioid crisis foretold', 'the business', 'the business deals']


[[1, 1380], [1, 1380, 203], [1, 1380, 203, 2514], [1, 486], [1, 486, 822]]

In [None]:
# Padding Sequences

from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# The longest sequence sets the common length
max_sequence_len = max(map(len, input_sequences))

# Left-pad ('pre') every sequence with zeros up to that common length
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)
input_sequences[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    1, 1380], dtype=int32)

In [None]:
# Split each padded row: everything but the final token is the predictor,
# and the final token is the label
predictors, labels = input_sequences[:, :-1], input_sequences[:, -1]
labels[:5]

array([1380,  203, 2514,  486,  822], dtype=int32)

In [None]:
from tensorflow.keras import utils

# One-hot encode the target tokens, one column per vocabulary index.
# The integer targets are re-sliced from the last column of input_sequences
# instead of reading `labels` back, so re-running this cell never encodes
# labels that a previous run already one-hot encoded.
labels = utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [None]:
# Creating the model

from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

# The label token was split off every sequence, so the model sees one token fewer
input_len = max_sequence_len - 1

model = Sequential([
    # Embedding: a 10-dimensional vector per vocabulary index
    Embedding(total_words, 10, input_length=input_len),
    # Recurrent layer with 100 units, followed by 10% dropout
    LSTM(100),
    Dropout(0.1),
    # Softmax output with one unit per vocabulary index
    Dense(total_words, activation='softmax'),
])

In [None]:
# Compiling the model
# categorical_crossentropy pairs with the one-hot labels produced by to_categorical above
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [None]:
# Training the model
# fit() is the cell's last expression, so the History object it returns is echoed after the epoch log
model.fit(predictors, labels, epochs=30, verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f0f686b1050>

In [None]:
# Making the predictions
def predict_next_token(seed_text):
    """Predict the token id of the word most likely to follow ``seed_text``.

    Args:
        seed_text: Text to continue. It is encoded with the notebook's fitted
            ``tokenizer``.

    Returns:
        A NumPy array with one predicted token id per input row; exactly one
        row is passed in, so the array holds a single id.
    """
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    # Left-pad to max_sequence_len - 1, the input length the model was built with.
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # Sequential.predict_classes() was removed in TensorFlow 2.6. Taking the
    # argmax over the softmax output yields the same class ids.
    probabilities = model.predict(token_list, verbose=0)
    prediction = np.argmax(probabilities, axis=-1)
    return prediction

In [None]:
# Predict the token id that follows a sample seed phrase and display it
prediction = predict_next_token("today in new york")
prediction



array([73])

In [None]:
# Use the tokenizer to decode the predicted token id back into its word
tokenizer.sequences_to_texts([prediction])

['today']

In [None]:
# Generate new headlines
# Function that can extend a headline by more than just one word

def generate_headline(seed_text, next_words=1):
    """Extend ``seed_text`` one predicted word at a time and return it title-cased.

    Args:
        seed_text: Starting text of the headline.
        next_words: Number of predicted words to append (default 1). With 0,
            no prediction is made and only the capitalization is applied.

    Returns:
        The seed text followed by the predicted words, with the first letter
        of each word upper-cased and the rest lower-cased. A contraction such
        as "don't" or "it’s" is treated as a single word.
    """
    # Imported locally so this cell does not rely on an import made in another cell.
    import re

    for _ in range(next_words):
        # Predict next token
        prediction = predict_next_token(seed_text)
        # Convert token to word
        next_word = tokenizer.sequences_to_texts([prediction])[0]
        # Add next word to the headline. This headline will be used in the next pass of the loop.
        seed_text += " " + next_word

    # str.title() treats an apostrophe as a word boundary and would return
    # "Don'T" / "It’S". Capitalize each run of letters instead, letting a run
    # continue across one straight (') or curly (\u2019) apostrophe.
    return re.sub(
        r"[^\W\d_]+(?:['\u2019][^\W\d_]+)?",
        lambda match: match.group(0).capitalize(),
        seed_text,
    )

In [None]:
# Try some headlines: extend each seed phrase by five predicted words

seed_texts = [
    'washington dc is',
    'today in new york',
    'the school district has',
    'crime has become',
]
for seed_text in seed_texts:
    headline = generate_headline(seed_text, next_words=5)
    print(headline)



Washington Dc Is Capitalism The Mainstream I Looks
Today In New York Today A Costlier Commute Off
The School District Has The Lives Of Master Of
Crime Has Become The Description Right To Be
