In [111]:
import numpy as np
import pandas as pd

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

In [112]:
# Read the article table from disk and pull the headline column out as an array
data = pd.read_csv(filepath_or_buffer='ArticlesApril2017.csv')
headlines = data.loc[:, 'headline'].values

# Peek at the first few headlines
headlines[:5]

array(['Finding an Expansive View  of a Forgotten People in Niger',
       'And Now,  the Dreaded Trump Curse',
       'Venezuela’s Descent Into Dictatorship',
       'Stain Permeates Basketball Blue Blood',
       'Taking Things for Granted'], dtype=object)

In [113]:
# Glue every headline together with single spaces, then lower-case the result
corpus = str.lower(' '.join(headlines))

# Show the first 100 characters of the combined text
corpus[:100]

'finding an expansive view  of a forgotten people in niger and now,  the dreaded trump curse venezuel'

In [147]:
# Learn the word -> index vocabulary from the combined headline text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts=[corpus])

# Vocabulary size plus one, so that index 0 (the filler value used when
# padding sequences later on) also gets a slot in the output layer.
total_words = 1 + len(tokenizer.word_index)

total_words

2482

In [115]:
# Sanity check: show the first headline next to its sequence of word indices
first_headline = headlines[0]
first_headline, tokenizer.texts_to_sequences([first_headline.lower()])[0]


('Finding an Expansive View  of a Forgotten People in Niger',
 [180, 21, 682, 380, 4, 2, 683, 181, 5, 684])

In [116]:
# Turn every headline into its list of word indices in one batched call
tokenized_headlines = tokenizer.texts_to_sequences(
    [line.lower() for line in headlines]
)

# For each headline emit every prefix of length >= 2: the last token of a
# prefix is the word to predict, the tokens before it are the context.
input_sequences = [
    tokens[:end]
    for tokens in tokenized_headlines
    for end in range(2, len(tokens) + 1)
]

input_sequences[:10]

[[180, 21],
 [180, 21, 682],
 [180, 21, 682, 380],
 [180, 21, 682, 380, 4],
 [180, 21, 682, 380, 4, 2],
 [180, 21, 682, 380, 4, 2, 683],
 [180, 21, 682, 380, 4, 2, 683, 181],
 [180, 21, 682, 380, 4, 2, 683, 181, 5],
 [180, 21, 682, 380, 4, 2, 683, 181, 5, 684],
 [6, 84]]

In [117]:
# Left-pad every prefix with zeros up to the length of the longest one so the
# sequences can be stacked into a single rectangular array.
max_sequence_len = max(map(len, input_sequences))
padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
input_sequences = np.array(padded)

max_sequence_len, input_sequences[:3]

(21,
 array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0, 180,  21],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0, 180,  21, 682],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0, 180,  21, 682, 380]]))

In [118]:
# Predictors are all columns but the last; the label is the final column,
# i.e. the word that follows the context.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

X.shape, y.shape, X[:3], y[:3]

((4930, 20),
 (4930,),
 array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0, 180],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0, 180,  21],
        [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0, 180,  21, 682]]),
 array([ 21, 682, 380]))

In [119]:
# One-hot encode the next-word targets over the whole vocabulary.
# The integer labels are re-derived from the last column of input_sequences
# instead of being read back from `y`, so re-running this cell cannot
# one-hot encode a matrix that is already one-hot.
y = to_categorical(input_sequences[:, -1], num_classes=total_words)
y.shape, y[0]

((4930, 2482), array([0., 0., 0., ..., 0., 0., 0.]))

In [125]:
# Build the LSTM model: embedding -> two stacked LSTMs -> softmax over the vocabulary
model = Sequential()
# `input_length` is intentionally not passed: the argument is deprecated in
# Keras 3 (it only triggers a warning). The sequence length is picked up from
# X the first time the model is called.
model.add(Embedding(total_words, 100))
# return_sequences=True so the second LSTM receives the whole sequence
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(100))
# One probability per vocabulary index for the next word
model.add(Dense(total_words, activation='softmax'))

# Compile the model (labels are one-hot, hence categorical cross-entropy)
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])



In [127]:
# Fit the network on the padded contexts (X) and their one-hot next-word targets (y)
model.fit(x=X, y=y, epochs=10, verbose=1)

Epoch 1/10
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.0242 - loss: 7.5693
Epoch 2/10
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.0302 - loss: 6.8168
Epoch 3/10
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.0424 - loss: 6.5770
Epoch 4/10
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.0430 - loss: 6.3833
Epoch 5/10
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.0507 - loss: 6.2773
Epoch 6/10
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.0551 - loss: 6.1439
Epoch 7/10
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 21ms/step - accuracy: 0.0609 - loss: 5.8407
Epoch 8/10
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 22ms/step - accuracy: 0.0750 - loss: 5.5673
Epoch 9/10
[1m155/155[0m [32m

<keras.src.callbacks.history.History at 0x22fb6c16a20>

In [145]:
# Seed text the generator starts from, and how many words to append to it
text = "important"
next_words = 100

# max_sequence_len is reused unchanged from the padding step (21 for this
# data set). 2482 is the vocabulary size (total_words), not a sequence length.

In [146]:
# Encode the seed text and left-pad it to the context length used for training
token_list = tokenizer.texts_to_sequences([text])[0]
token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

# Predict the next-word distribution and take the most probable index
output = model.predict(token_list,verbose=False)
predicted = np.argmax(output, axis=-1)

# Map the index back to a word with the tokenizer's reverse lookup table.
# `predicted` is an array holding one index, so it is converted to a plain
# int first; an index with no word (e.g. 0, the padding value) yields ''
# instead of leaving `output_word` unset or stale.
output_word = tokenizer.index_word.get(int(predicted[0]), '')

predicted, output_word

(array([2], dtype=int64), 'a')

In [130]:
# Greedy generation: repeatedly predict the most probable next word and append
# it to `text`. NOTE: `text` is extended in place, so re-running this cell
# continues from the already generated text; re-run the cell that defines
# `text` first to start again from the seed.
for _ in range(next_words):
    # Sequence - Word to Index
    token_list = tokenizer.texts_to_sequences([text])[0]

    # Padding [0,0,0,23,45]; sequences longer than the window keep their tail
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    # Prediction
    output = model.predict(token_list,verbose=False)

    # Gets the index of the word with the highest probability
    predicted = np.argmax(output, axis=-1)

    # Index to Word: direct reverse lookup instead of scanning the vocabulary;
    # `predicted` holds a single index, so convert it to a plain int first
    output_word = tokenizer.index_word.get(int(predicted[0]), '')

    # No word for this index (e.g. 0, the padding value): appending nothing
    # would leave the input unchanged and repeat the same prediction, so stop.
    if not output_word:
        break

    # Add to the input text
    text += " " + output_word
    print(output_word, end=' ')

text

2017 not to the different o to be be be be be be be be be be be be a good policy to be a different o to be be be be be be be be be be be be be a good policy to be a good policy to the good policy to the good policy to the good policy to the new york today a new york today a new york today a new york today a new york today a new york today a new york today a new york today a new york today a new 

'important news 2017 not to the different o to be be be be be be be be be be be be a good policy to be a different o to be be be be be be be be be be be be be a good policy to be a good policy to the good policy to the good policy to the good policy to the new york today a new york today a new york today a new york today a new york today a new york today a new york today a new york today a new york today a new'