# IMPORT LIBS

In [18]:
import os
import csv

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from deep_translator import GoogleTranslator

import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import utils
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Sequential

# DATASET

In [77]:
directory = 'data/articles/'

# Collect the `headline` column of every CSV in `directory` whose file name
# contains 'Articles'.
all_headlines = []
# os.listdir() returns entries in arbitrary order; sorting keeps the headline
# order (and everything derived from it further down) the same on every run.
for filename in sorted(os.listdir(directory)):
    if 'Articles' in filename:
        # os.path.join works whether or not `directory` ends with a separator
        headlines_df = pd.read_csv(os.path.join(directory, filename))
        # Add all of the headlines from this file to our list
        all_headlines.extend(headlines_df.headline.tolist())
len(all_headlines)

9335

In [78]:
# Preview the first 20 headlines that were loaded
all_headlines[:20]

['Finding an Expansive View  of a Forgotten People in Niger',
 'And Now,  the Dreaded Trump Curse',
 'Venezuela’s Descent Into Dictatorship',
 'Stain Permeates Basketball Blue Blood',
 'Taking Things for Granted',
 'The Caged Beast Awakens',
 'An Ever-Unfolding Story',
 'O’Reilly Thrives as Settlements Add Up',
 'Mouse Infestation',
 'Divide in G.O.P. Now Threatens Trump Tax Plan',
 'Variety Puzzle: Acrostic',
 'They Can Hit a Ball 400 Feet. But Play Catch? That’s Tricky.',
 'In Trump Country, Shock at Trump Budget Cuts',
 'Why Is This Hate Different From All Other Hate?',
 'Pick Your Favorite Ethical Offender',
 'My Son’s Growing Black Pride',
 'Jerks and the Start-Ups They Ruin',
 'Trump  Needs  a Brain',
 'Manhood in the Age of Trump',
 'The Value of a Black College']

In [6]:
# Drop placeholder entries: keep only headlines that are not the literal "Unknown"
all_headlines = list(filter(lambda headline: headline != "Unknown", all_headlines))
len(all_headlines)

8603

In [19]:
# Headlines translated into Spanish.
# Translating issues one translate() call per headline, which is expensive (the
# original note here warned not to run this cell more than once). The result is
# therefore cached on disk and reused on re-runs; delete the cache file to force
# a fresh translation.
TRANSLATION_CACHE = 'data/titulares_es.csv'

titulares = None
if os.path.exists(TRANSLATION_CACHE):
    with open(TRANSLATION_CACHE, newline='', encoding='utf-8') as cache_file:
        cached = [row[0] if row else '' for row in csv.reader(cache_file)]
    # Only trust the cache if it lines up with the current headline list
    if len(cached) == len(all_headlines):
        titulares = cached

if titulares is None:
    # Build the translator once instead of once per headline
    translator = GoogleTranslator(source='english', target='spanish')
    titulares = [translator.translate(item) for item in all_headlines]
    # One translated headline per CSV row, in the same order as all_headlines
    with open(TRANSLATION_CACHE, 'w', newline='', encoding='utf-8') as cache_file:
        csv.writer(cache_file).writerows([text] for text in titulares)

In [65]:
# Preview only the first 20 translations instead of dumping the whole list
# into the cell output
titulares[:20]

['Encontrando una visión amplia de un pueblo olvidado en Níger',
 'Y ahora, la temida maldición de Trump',
 'El descenso de Venezuela a la dictadura',
 'La mancha impregna la sangre azul del baloncesto',
 'Dar las cosas por sentado',
 'La bestia enjaulada despierta',
 'Una historia en constante desarrollo',
 "O'Reilly prospera a medida que se acumulan los acuerdos",
 'Infestación de ratones',
 'División en el Partido Republicano Ahora amenaza el plan fiscal de Trump',
 'Rompecabezas de variedades: acróstico',
 'Pueden golpear una pelota a 400 pies. ¿Pero jugar a atrapar? Eso es complicado.',
 'En el país de Trump, conmoción por los recortes presupuestarios de Trump',
 '¿Por qué este odio es diferente de todos los demás odios?',
 'Elija su delincuente ético favorito',
 'El creciente orgullo negro de mi hijo',
 'Los idiotas y las empresas emergentes que arruinan',
 'Trump necesita un cerebro',
 'La masculinidad en la era de Trump',
 'El valor de una universidad negra',
 'Descripción inic

In [66]:
# Learn the word -> integer id vocabulary from the translated headlines
tokenizer = Tokenizer()
tokenizer.fit_on_texts(titulares)

# One extra slot on top of the vocabulary for index 0, the value used for padding
total_words = 1 + len(tokenizer.word_index)
print('Total words: ', total_words)

Total words:  13158


In [67]:
# Expand every headline into its growing token prefixes:
# first 2 tokens, first 3 tokens, ... up to the full headline
input_sequences = [
    encoded[:end]
    for encoded in tokenizer.texts_to_sequences(titulares)
    for end in range(2, len(encoded) + 1)
]

print(tokenizer.sequences_to_texts(input_sequences[:5]))
input_sequences[:5]

['encontrando una', 'encontrando una visión', 'encontrando una visión amplia', 'encontrando una visión amplia de', 'encontrando una visión amplia de un']


[[2025, 10],
 [2025, 10, 598],
 [2025, 10, 598, 2026],
 [2025, 10, 598, 2026, 1],
 [2025, 10, 598, 2026, 1, 7]]

In [68]:
# The longest prefix sets the common length every sequence is padded to
max_sequence_len = max(map(len, input_sequences))

# Left-pad with zeros so the real tokens sit at the end of each row
input_sequences = np.array(
    pad_sequences(input_sequences, padding='pre', maxlen=max_sequence_len)
)
input_sequences[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0, 2025,   10])

In [69]:
# Split each padded row: everything but the final token is the model input,
# and the final token is the word to predict
predictors, labels = input_sequences[:, :-1], input_sequences[:, -1]
labels[:5]

array([  10,  598, 2026,    1,    7])

In [70]:
# One-hot encode the target word ids. They are read from `input_sequences`
# rather than from `labels` itself so that re-running this cell cannot one-hot
# encode an already one-hot matrix.
labels = utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

# MODEL

In [29]:
# Input is max sequence length - 1, as we've removed the last word for the label
input_len = max_sequence_len - 1

# Hyperparameters
EMBEDDING_DIM = 10   # size of each word vector
LSTM_UNITS = 100     # units in the recurrent layer
DROPOUT_RATE = 0.1   # fraction of LSTM outputs dropped during training

# The input shape is declared with an explicit Input object instead of the
# Embedding `input_length` argument, which Keras 3 deprecates; this also builds
# the model immediately, so model.summary() can report output shapes.
model = Sequential([
    keras.Input(shape=(input_len,)),
    # Map each word id to a dense vector
    Embedding(total_words, EMBEDDING_DIM),
    LSTM(LSTM_UNITS),
    Dropout(DROPOUT_RATE),
    # One probability per vocabulary entry for the next word
    Dense(total_words, activation='softmax'),
])



In [30]:
# Show the model's layers
model.summary()

In [31]:
# Configure training: Adam optimizer with categorical cross-entropy loss
model.compile(optimizer='adam', loss='categorical_crossentropy')

In [83]:
# Keep the returned History object so the per-epoch loss can be inspected or
# plotted afterwards (history.history['loss']) instead of only echoing its repr.
history = model.fit(predictors, labels, epochs=30, verbose=1)

Epoch 1/30
[1m2048/2048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 27ms/step - loss: 1.4050
Epoch 2/30
[1m2048/2048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 31ms/step - loss: 1.4108
Epoch 3/30
[1m2048/2048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 40ms/step - loss: 1.3990
Epoch 4/30
[1m2048/2048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 38ms/step - loss: 1.3868
Epoch 5/30
[1m2048/2048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 38ms/step - loss: 1.3878
Epoch 6/30
[1m2048/2048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 37ms/step - loss: 1.3763
Epoch 7/30
[1m2048/2048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 37ms/step - loss: 1.3601
Epoch 8/30
[1m2048/2048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 37ms/step - loss: 1.3711
Epoch 9/30
[1m2048/2048[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 35ms/step - loss: 1.3649
Epoch 10/30
[1m2048/2048[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1cf57ce1400>

In [59]:
def predict_next_token(seed_text):
    """Return the index of the highest-scoring entry in the model's prediction.

    `seed_text` is encoded with the notebook-level `tokenizer`, left-padded to
    `max_sequence_len - 1` tokens, and passed to the notebook-level `model`.
    The result is the argmax over the model output, i.e. a token id that can
    be turned back into a word with `tokenizer.sequences_to_texts`.
    """
    encoded = tokenizer.texts_to_sequences([seed_text])
    model_input = pad_sequences(encoded, maxlen=max_sequence_len - 1, padding='pre')
    scores = model.predict(model_input, verbose=0)
    return np.argmax(scores)

In [84]:
# Predict the token id that follows a sample seed phrase
prediction = predict_next_token("hoy en nueva york")
prediction

6930

In [85]:
# Convert the predicted token id back into text
tokenizer.sequences_to_texts([[prediction]])

['tolera']

In [86]:
def generate_headline(seed_text, next_words=1):
    """Extend `seed_text` with up to `next_words` predicted words.

    Each pass predicts the next token for the text built so far, converts it
    to a word and appends it, so the extended text feeds the following pass.

    Args:
        seed_text: Starting phrase for the headline.
        next_words: Maximum number of words to append (default 1).

    Returns:
        The extended text in title case.
    """
    for _ in range(next_words):
        # Predict next token
        prediction = predict_next_token(seed_text)
        # Convert token to word
        next_word = tokenizer.sequences_to_texts([[prediction]])[0]
        if not next_word:
            # The predicted id maps to no word (e.g. the padding id 0).
            # Appending an empty string would only add trailing spaces and
            # leave the tokens unchanged, so every later pass would predict
            # the same id again; stop here instead.
            break
        # Add next word to the headline. This headline will be used in the next pass of the loop.
        seed_text += " " + next_word
    # Return headline as title-case
    return seed_text.title()

In [87]:
# Seed phrases to extend; each one is continued by five predicted words
seed_texts = [
    'washington dc esta',
    'hoy en nueva york',
    'el distrito escolar tiene',
    'el crimen se ha convertido',
]
# The generator is lazy, so each headline is still produced and printed in turn
for headline in (generate_headline(seed, next_words=5) for seed in seed_texts):
    print(headline)

Washington Dc Esta Número Del Cuento De La
Hoy En Nueva York Tolera El Analfabetismo Le Está
El Distrito Escolar Tiene Dudas Sobre El Caso De
El Crimen Se Ha Convertido En Un Lugar De La
