In [20]:
import os
import re

import pandas as pd

# Directory holding the CSV exports. Named DATA_DIR so the builtin `dir`
# is not shadowed for the rest of the kernel session.
DATA_DIR = "data/"

# Collect the `headline` column of every file whose name contains "Articles".
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the order of the collected headlines reproducible between runs.
headlines = []
for filename in sorted(os.listdir(DATA_DIR)):
    if "Articles" in filename:
        headlines_df = pd.read_csv(os.path.join(DATA_DIR, filename))
        headlines.extend(headlines_df["headline"].tolist())
len(headlines)

9335

Usuwamy nagłówki, których treść to dokładnie "Unknown"

In [21]:
# Drop every headline whose whole text is the placeholder "Unknown".
headlines = list(filter(lambda headline: headline != "Unknown", headlines))
len(headlines)

8603

Tokenizacja - przypisujemy każdemu unikalnemu słowu liczbę, tak aby model zrozumiał, jak reprezentować słowa

In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Learn the vocabulary: fitting assigns an integer id to every distinct word
# found in the headlines.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(headlines)

# Word ids start at 1 (0 is what padding uses later on), so the number of
# distinct ids the model has to handle is the vocabulary size plus one.
total_words = 1 + len(tokenizer.word_index)
print('Total words: ', total_words)

Total words:  11753


In [23]:
# Peek at the integer ids the tokenizer assigned to a few sample words.
# The result is named `sample_word_ids` rather than `dict`, so the builtin
# `dict` stays usable in later cells. Words that are not in the vocabulary
# are simply absent from the printed mapping.
words_to_check = {'a', 'i', 'bike', 'canal', 'trump'}
sample_word_ids = {word: index for word, index in tokenizer.word_index.items()
                   if word in words_to_check}
print(sample_word_ids)

{'a': 2, 'trump': 10, 'i': 30, 'canal': 7144}


Konwersja danych do sekwencji

In [33]:
# Convert every headline to its list of token ids, then emit each prefix of
# length >= 2 as a separate training sequence: the final token of a prefix is
# the word to be predicted from the tokens before it.
sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(headlines)
    for end in range(2, len(token_ids) + 1)
]

print(tokenizer.sequences_to_texts(sequences[:10]))
sequences[:10]

['finding an', 'finding an expansive', 'finding an expansive view', 'finding an expansive view of', 'finding an expansive view of a', 'finding an expansive view of a forgotten', 'finding an expansive view of a forgotten people', 'finding an expansive view of a forgotten people in', 'finding an expansive view of a forgotten people in niger', 'and now']


[[403, 17],
 [403, 17, 5242],
 [403, 17, 5242, 543],
 [403, 17, 5242, 543, 4],
 [403, 17, 5242, 543, 4, 2],
 [403, 17, 5242, 543, 4, 2, 1616],
 [403, 17, 5242, 543, 4, 2, 1616, 151],
 [403, 17, 5242, 543, 4, 2, 1616, 151, 5],
 [403, 17, 5242, 543, 4, 2, 1616, 151, 5, 1992],
 [7, 76]]

In [37]:
# Pad the sequences: build one array whose rows all have the length of the
# longest sequence (i.e. the longest tokenized headline), zero-filled on the left.

from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

max_sequence_len = max(len(seq) for seq in sequences)

input_sequences = np.array(
    pad_sequences(sequences, padding='pre', maxlen=max_sequence_len)
)
input_sequences[5]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,  403,
         17, 5242,  543,    4,    2, 1616])

Tworzenie predyktorów oraz celu

In [44]:
from tensorflow.keras import utils

# Predictors: every token of a padded sequence except the last one.
predictors = input_sequences[:, :-1]

# Labels: the last token of each sequence, one-hot encoded over the whole
# vocabulary (one class per word id, `total_words` classes in total).
labels = utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

Tworzenie modelu LSTM

In [45]:
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

# The network input is every token of a sequence except the last one.
input_len = max_sequence_len - 1

model = Sequential([
    # Embedding layer: maps each word id to a 10-dimensional vector.
    Embedding(total_words, 10, input_length=input_len),
    # LSTM layer with 100 units, followed by 10% dropout.
    LSTM(100),
    Dropout(0.1),
    # Output layer: one softmax unit per word id.
    Dense(total_words, activation='softmax'),
])

In [46]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 27, 10)            117530    
                                                                 
 lstm (LSTM)                 (None, 100)               44400     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 11753)             1187053   
                                                                 
Total params: 1,348,983
Trainable params: 1,348,983
Non-trainable params: 0
_________________________________________________________________


Kompilowanie modelu

In [47]:
# Configure training: categorical cross-entropy against the one-hot labels,
# optimised with Adam.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

Trenowanie modelu (30 epok)

In [48]:
# Train on the full predictor/label set for 30 epochs with per-epoch progress
# output; the returned History object is displayed as the cell result.
model.fit(
    x=predictors,
    y=labels,
    epochs=30,
    verbose=1,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x2651f721af0>

Tworzenie predykcji

In [57]:
def predict(seed_text):
    """Return the id of the word the model considers most likely to follow `seed_text`.

    The text is converted to token ids with the notebook's fitted `tokenizer`,
    left-padded to the model's input length (`max_sequence_len - 1`) and passed
    to `model`.

    Args:
        seed_text: The phrase to continue, as a plain string.

    Returns:
        A one-element NumPy array holding the index of the highest-probability
        output class; decode it with `tokenizer.sequences_to_texts`.
    """
    encoded = tokenizer.texts_to_sequences([seed_text])
    padded = pad_sequences(encoded, maxlen=max_sequence_len - 1, padding='pre')
    probabilities = model.predict(padded)
    return np.argmax(probabilities, axis=1)

In [58]:
# Predict the id of the word most likely to follow the seed phrase.
prediction = predict("today in new york")
prediction

array([122], dtype=int64)

In [60]:
# Decode the predicted id back into its word.
tokenizer.sequences_to_texts([prediction])

['could']

Generowanie nowych nagłówków

In [79]:
# A "word" for title-casing purposes: a run of letters that may continue across
# an apostrophe (straight or typographic), so "bookbinder’s" is one word.
_WORD_PATTERN = re.compile(r"[^\W\d_]+(?:['’][^\W\d_]+)*")


def generate_headline(seed_text, next_words=1):
    """Extend `seed_text` with predicted words and return it in title case.

    Each iteration predicts the next word from the text built so far and
    appends it, separated by a single space.

    Args:
        seed_text: The phrase the headline starts with.
        next_words: Maximum number of words to append (default 1).

    Returns:
        The extended text with the first letter of every word upper-cased and
        the remaining letters lower-cased. Unlike `str.title()`, a letter that
        follows an apostrophe inside a word stays lower-case
        ("Bookbinder’s", not "Bookbinder’S").
    """
    for _ in range(next_words):
        prediction = predict(seed_text)
        next_word = tokenizer.sequences_to_texts([prediction])[0]
        if not next_word:
            # The predicted id has no word. Appending "" would only add a
            # trailing space and leave the tokens fed to the model unchanged,
            # so every further iteration would give the same result: stop here.
            break
        seed_text += " " + next_word
    return _WORD_PATTERN.sub(lambda match: match.group(0).capitalize(), seed_text)

In [80]:
# Seed phrases to continue; each one is extended by four predicted words
# and the resulting headline is printed on its own line.
seed_texts = ['washington', 'new york', 'the school', 'crime has', 'kraków', 'Poland']

for headline_start in seed_texts:
    headline = generate_headline(headline_start, next_words=4)
    print(headline)

Washington Americans Is ‘Hooked’ On
New York Today A Bookbinder’S Craft
The School And A Mighty Block
Crime Has Threaten Republicans’ A Lie
Kraków A New Nation Cracking
Poland And Contracts Season 1
